Skip to main content

edgeparse_core/output/
markdown.rs

1//! Markdown output generator.
2
3#[cfg(not(target_arch = "wasm32"))]
4use regex::Regex;
5use std::collections::{HashMap, HashSet};
6#[cfg(not(target_arch = "wasm32"))]
7use std::path::Path;
8#[cfg(not(target_arch = "wasm32"))]
9use std::process::Command;
10
11use crate::models::bbox::BoundingBox;
12use crate::models::chunks::TextChunk;
13use crate::models::content::ContentElement;
14use crate::models::document::PdfDocument;
15use crate::models::enums::SemanticType;
16use crate::models::semantic::SemanticTextNode;
17use crate::models::table::TableTokenRow;
18use crate::EdgePdfError;
19
/// Bounding-box layout data for the source PDF, as stored by `LayoutSourceCache`.
///
/// Built in `LayoutSourceCache::bbox_layout` from the result of
/// `read_pdftotext_bbox_layout_lines` plus `collect_bbox_layout_blocks`.
#[cfg(not(target_arch = "wasm32"))]
struct CachedBBoxLayout {
    /// Page width as returned by `read_pdftotext_bbox_layout_lines`.
    page_width: f64,
    /// Layout lines as returned by `read_pdftotext_bbox_layout_lines`.
    lines: Vec<BBoxLayoutLine>,
    /// Blocks computed from `lines` by `collect_bbox_layout_blocks`.
    blocks: Vec<BBoxLayoutBlock>,
}
26
/// Lazily filled cache of layout data read from the document's source file.
///
/// Both fields use the same convention: the outer `Option` is `None` until the
/// corresponding loader has been attempted; afterwards it is `Some(result)`, where
/// an inner `None` records that the load produced nothing, so it is not retried.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Default)]
struct LayoutSourceCache {
    /// Result of the bounding-box layout load (see `bbox_layout`).
    bbox_layout: Option<Option<CachedBBoxLayout>>,
    /// Result of the plain layout-lines load (see `layout_lines`).
    layout_lines: Option<Option<Vec<String>>>,
}
33
34#[cfg(not(target_arch = "wasm32"))]
35impl LayoutSourceCache {
36    fn bbox_layout(&mut self, doc: &PdfDocument) -> Option<&CachedBBoxLayout> {
37        if self.bbox_layout.is_none() {
38            let loaded = doc.source_path.as_deref().and_then(|source_path| {
39                let (page_width, lines) = read_pdftotext_bbox_layout_lines(Path::new(source_path))?;
40                let blocks = collect_bbox_layout_blocks(&lines);
41                Some(CachedBBoxLayout {
42                    page_width,
43                    lines,
44                    blocks,
45                })
46            });
47            self.bbox_layout = Some(loaded);
48        }
49        self.bbox_layout.as_ref().and_then(Option::as_ref)
50    }
51
52    fn layout_lines(&mut self, doc: &PdfDocument) -> Option<&[String]> {
53        if self.layout_lines.is_none() {
54            let loaded = doc
55                .source_path
56                .as_deref()
57                .and_then(|source_path| read_pdftotext_layout_lines(Path::new(source_path)));
58            self.layout_lines = Some(loaded);
59        }
60        self.layout_lines
61            .as_ref()
62            .and_then(Option::as_ref)
63            .map(Vec::as_slice)
64    }
65}
66
/// Generate Markdown representation of a PdfDocument.
///
/// Specialised whole-document renderers are tried in a fixed priority order and
/// the first one that returns `Some` wins. When none applies, the generic
/// `render_markdown_core` path is used. Every renderer that takes the layout
/// cache is compiled out on `wasm32`.
///
/// # Errors
/// The signature allows an `EdgePdfError`, but every path in this function
/// currently returns `Ok`.
pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
    // One cache is shared by all layout-based renderers below.
    #[cfg(not(target_arch = "wasm32"))]
    let mut layout_cache = LayoutSourceCache::default();
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_open_plate_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) =
        render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_captioned_media_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) =
        render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) =
        render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_toc_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    // Contents / table-of-contents documents; these checks do not use the layout cache.
    if looks_like_contents_document(doc) {
        return Ok(render_contents_document(doc));
    }
    if looks_like_compact_toc_document(doc) {
        return Ok(render_compact_toc_document(doc));
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_projection_sheet_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_appendix_tables_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) =
        render_layout_registration_report_document_cached(doc, &mut layout_cache)
    {
        return Ok(rendered);
    }
    // Renderers that work from the parsed document alone (available on every target).
    if let Some(rendered) = render_top_table_plate_document(doc) {
        return Ok(rendered);
    }
    if let Some(rendered) = render_single_table_report_document(doc) {
        return Ok(rendered);
    }
    if let Some(rendered) = render_late_section_boundary_document(doc) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_matrix_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }
    #[cfg(not(target_arch = "wasm32"))]
    if let Some(rendered) = render_layout_panel_stub_document_cached(doc, &mut layout_cache) {
        return Ok(rendered);
    }

    // No specialised renderer matched: fall back to the generic path.
    Ok(render_markdown_core(doc))
}
164
165fn render_markdown_core(doc: &PdfDocument) -> String {
166    let mut output = String::new();
167
168    // Title
169    if let Some(ref title) = doc.title {
170        let trimmed = title.trim();
171        if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
172            if should_render_document_title_as_plaintext(doc, trimmed) {
173                output.push_str(trimmed);
174                output.push_str("\n\n");
175            } else {
176                output.push_str(&format!("# {}\n\n", trimmed));
177            }
178        }
179    }
180
181    if doc.kids.is_empty() {
182        output.push_str("*No content extracted.*\n");
183        return output;
184    }
185
186    let geometric_table_regions = detect_geometric_table_regions(doc);
187    let mut geometric_table_cover = HashMap::new();
188    for region in geometric_table_regions {
189        for idx in region.start_idx..=region.end_idx {
190            geometric_table_cover.insert(idx, region.clone());
191        }
192    }
193
194    let mut i = 0usize;
195    while i < doc.kids.len() {
196        if let Some(region) = geometric_table_cover.get(&i) {
197            output.push_str(&region.rendered);
198            i = region.end_idx + 1;
199            continue;
200        }
201
202        match &doc.kids[i] {
203            ContentElement::Heading(h) => {
204                let text = h.base.base.value();
205                let trimmed = text.trim();
206                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
207                    i += 1;
208                    continue;
209                }
210
211                // Demote headings that sit in the bottom margin of the page
212                // (running footers misclassified as headings by the pipeline).
213                if looks_like_bottom_margin_heading(doc, i) {
214                    output.push_str(&escape_md_line_start(trimmed));
215                    output.push_str("\n\n");
216                    i += 1;
217                    continue;
218                }
219
220                // Demote pipeline headings that look like sentence fragments
221                // ending with a period but are not numbered section headings.
222                if should_demote_period_heading(trimmed) {
223                    output.push_str(&escape_md_line_start(trimmed));
224                    output.push_str("\n\n");
225                    i += 1;
226                    continue;
227                }
228
229                // Demote headings ending with comma (footnotes / data labels).
230                if should_demote_comma_heading(trimmed) {
231                    output.push_str(&escape_md_line_start(trimmed));
232                    output.push_str("\n\n");
233                    i += 1;
234                    continue;
235                }
236
237                // Demote headings containing math symbols.
238                if should_demote_math_heading(trimmed) {
239                    output.push_str(&escape_md_line_start(trimmed));
240                    output.push_str("\n\n");
241                    i += 1;
242                    continue;
243                }
244
245                // Demote headings containing percentage signs.
246                if should_demote_percentage_heading(trimmed) {
247                    output.push_str(&escape_md_line_start(trimmed));
248                    output.push_str("\n\n");
249                    i += 1;
250                    continue;
251                }
252
253                // Demote headings that start with a known caption prefix
254                // (e.g. "Source:", "Figure", "Table") — these are captions,
255                // not section headings, regardless of pipeline classification.
256                if starts_with_caption_prefix(trimmed) {
257                    output.push_str(&escape_md_line_start(trimmed));
258                    output.push_str("\n\n");
259                    i += 1;
260                    continue;
261                }
262
263                // Demote bibliography entries: lines starting with a 4-digit
264                // year followed by a period (e.g. "2020. Title of paper...").
265                if should_demote_bibliography_heading(trimmed) {
266                    output.push_str(&escape_md_line_start(trimmed));
267                    output.push_str("\n\n");
268                    i += 1;
269                    continue;
270                }
271
272                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
273                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
274                        let mut merged = trimmed.to_string();
275                        merge_paragraph_text(&mut merged, &next_text);
276                        output.push_str(&escape_md_line_start(merged.trim()));
277                        output.push_str("\n\n");
278                        i += 2;
279                        continue;
280                    }
281                }
282
283                // Merge consecutive heading fragments.
284                // When the PDF splits a title across multiple text elements,
285                // each becomes a separate heading; merge them into one.
286                let mut merged_heading = trimmed.to_string();
287                while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
288                    let next_text = next_h.base.base.value();
289                    let next_trimmed = next_text.trim();
290                    if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
291                        i += 1;
292                        continue;
293                    }
294                    // Only merge if the combined text stays under max heading length
295                    if merged_heading.len() + 1 + next_trimmed.len() > 200 {
296                        break;
297                    }
298                    merge_paragraph_text(&mut merged_heading, next_trimmed);
299                    i += 1;
300                }
301
302                let cleaned_heading = strip_trailing_page_number(merged_heading.trim());
303
304                // Check if this heading contains a merged subsection
305                if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
306                    let first = cleaned_heading[..split_pos].trim();
307                    let second = cleaned_heading[split_pos..].trim();
308                    output.push_str(&format!("# {}\n\n", first));
309                    output.push_str(&format!("# {}\n\n", second));
310                } else {
311                    output.push_str(&format!("# {}\n\n", cleaned_heading));
312                }
313            }
314            ContentElement::NumberHeading(nh) => {
315                let text = nh.base.base.base.value();
316                let trimmed = text.trim();
317                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
318                    i += 1;
319                    continue;
320                }
321
322                // Demote number headings ending with comma (footnotes).
323                if should_demote_comma_heading(trimmed) {
324                    output.push_str(&escape_md_line_start(trimmed));
325                    output.push_str("\n\n");
326                    i += 1;
327                    continue;
328                }
329
330                // Demote number headings containing math symbols.
331                if should_demote_math_heading(trimmed) {
332                    output.push_str(&escape_md_line_start(trimmed));
333                    output.push_str("\n\n");
334                    i += 1;
335                    continue;
336                }
337
338                // Demote number headings containing percentage signs.
339                if should_demote_percentage_heading(trimmed) {
340                    output.push_str(&escape_md_line_start(trimmed));
341                    output.push_str("\n\n");
342                    i += 1;
343                    continue;
344                }
345
346                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
347                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
348                        let mut merged = trimmed.to_string();
349                        merge_paragraph_text(&mut merged, &next_text);
350                        output.push_str(&escape_md_line_start(merged.trim()));
351                        output.push_str("\n\n");
352                        i += 2;
353                        continue;
354                    }
355                }
356
357                let cleaned = strip_trailing_page_number(trimmed);
358
359                // Check if this heading contains a merged subsection
360                if let Some(split_pos) = find_merged_subsection_split(cleaned) {
361                    let first = cleaned[..split_pos].trim();
362                    let second = cleaned[split_pos..].trim();
363                    output.push_str(&format!("# {}\n\n", first));
364                    output.push_str(&format!("# {}\n\n", second));
365                } else {
366                    output.push_str(&format!("# {}\n\n", cleaned));
367                }
368            }
369            ContentElement::Paragraph(_)
370            | ContentElement::TextBlock(_)
371            | ContentElement::TextLine(_) => {
372                let element = &doc.kids[i];
373                let text = match &doc.kids[i] {
374                    ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
375                    ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
376                    ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
377                    _ => unreachable!(),
378                };
379                let trimmed = text.trim();
380                if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
381                    i += 1;
382                    continue;
383                }
384                if should_skip_leading_figure_carryover(doc, i, trimmed) {
385                    i += 1;
386                    continue;
387                }
388
389                if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
390                    let cleaned = strip_trailing_page_number(trimmed);
391                    // Check if this heading contains a merged subsection
392                    if let Some(split_pos) = find_merged_subsection_split(cleaned) {
393                        let first = cleaned[..split_pos].trim();
394                        let second = cleaned[split_pos..].trim();
395                        output.push_str(&format!("# {}\n\n", first));
396                        output.push_str(&format!("# {}\n\n", second));
397                    } else {
398                        output.push_str(&format!("# {}\n\n", cleaned));
399                    }
400                    i += 1;
401                    continue;
402                }
403
404                if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
405                {
406                    output.push_str(&escape_md_line_start(trimmed));
407                    output.push('\n');
408                    i += 1;
409                    continue;
410                }
411
412                if is_short_caption_label(trimmed) {
413                    if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
414                        if let Some((caption_tail, body)) =
415                            split_following_caption_tail_and_body(&next_text)
416                        {
417                            let mut caption = trimmed.to_string();
418                            caption.push('\n');
419                            caption.push_str(caption_tail);
420                            output.push_str(&escape_md_line_start(caption.trim()));
421                            output.push_str("\n\n");
422                            output.push_str(&escape_md_line_start(body));
423                            output.push_str("\n\n");
424                            i += 2;
425                            continue;
426                        }
427
428                        if looks_like_caption_tail(&next_text) {
429                            let mut caption = trimmed.to_string();
430                            caption.push('\n');
431                            caption.push_str(next_text.trim());
432
433                            if let Some(year_text) =
434                                next_mergeable_paragraph_text(doc.kids.get(i + 2))
435                            {
436                                if looks_like_caption_year(&year_text) {
437                                    caption.push('\n');
438                                    caption.push_str(year_text.trim());
439                                    i += 1;
440                                }
441                            }
442
443                            output.push_str(&escape_md_line_start(caption.trim()));
444                            output.push_str("\n\n");
445                            i += 2;
446                            continue;
447                        }
448                    }
449                }
450
451                if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
452                    output.push_str(&escape_md_line_start(caption));
453                    output.push_str("\n\n");
454                    output.push_str(&escape_md_line_start(body));
455                    output.push_str("\n\n");
456                    i += 1;
457                    continue;
458                }
459
460                let mut merged = trimmed.to_string();
461                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
462                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
463                        should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
464                    } else {
465                        should_merge_paragraph_text(&merged, &next_text)
466                    };
467                    if !can_merge {
468                        break;
469                    }
470                    merge_paragraph_text(&mut merged, &next_text);
471                    i += 1;
472                }
473
474                output.push_str(&escape_md_line_start(merged.trim()));
475                output.push_str("\n\n");
476            }
477            other => render_element(&mut output, other),
478        }
479        i += 1;
480    }
481
482    // Post-processing: merge adjacent pipe tables that share the same
483    // column count.  The table detector sometimes emits highlighted or
484    // coloured rows as separate tables.
485    let output = merge_adjacent_pipe_tables(&output);
486    let output = normalize_chart_like_markdown(&output);
487    drop_isolated_noise_lines(&output)
488}
489
490fn cmp_banded_reading_order(
491    left: &BoundingBox,
492    right: &BoundingBox,
493    band_height: f64,
494) -> std::cmp::Ordering {
495    let safe_band = band_height.max(1.0);
496    let left_band = (left.top_y / safe_band).round() as i64;
497    let right_band = (right.top_y / safe_band).round() as i64;
498    right_band
499        .cmp(&left_band)
500        .then_with(|| {
501            left.left_x
502                .partial_cmp(&right.left_x)
503                .unwrap_or(std::cmp::Ordering::Equal)
504        })
505        .then_with(|| {
506            right
507                .top_y
508                .partial_cmp(&left.top_y)
509                .unwrap_or(std::cmp::Ordering::Equal)
510        })
511        .then_with(|| {
512            right
513                .bottom_y
514                .partial_cmp(&left.bottom_y)
515                .unwrap_or(std::cmp::Ordering::Equal)
516        })
517        .then_with(|| {
518            left.right_x
519                .partial_cmp(&right.right_x)
520                .unwrap_or(std::cmp::Ordering::Equal)
521        })
522}
523
524fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
525    first_heading_like_text(doc)
526        .filter(|first| !equivalent_heading_text(first, title))
527        .is_some()
528}
529
530fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
531    if title.split_whitespace().count() > 6 {
532        return false;
533    }
534
535    let mut early = doc.kids.iter().take(6);
536    let has_explicit_heading = early.clone().any(|element| {
537        matches!(
538            element,
539            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
540        )
541    });
542    let has_tableish_content = early.any(|element| {
543        matches!(
544            element,
545            ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
546        )
547    });
548
549    has_tableish_content && !has_explicit_heading
550}
551
552fn render_top_table_plate_document(doc: &PdfDocument) -> Option<String> {
553    if doc.number_of_pages != 1 {
554        return None;
555    }
556
557    let (table_idx, table) =
558        doc.kids.iter().enumerate().find_map(|(idx, element)| {
559            table_border_from_element(element).map(|table| (idx, table))
560        })?;
561    if table.num_columns < 5 || table.rows.len() < 4 {
562        return None;
563    }
564
565    let mut header_probe = collect_table_border_rows(table);
566    if header_probe.len() < 3 || !preserve_grouped_header_rows(&mut header_probe) {
567        return None;
568    }
569
570    let table_top = table.bbox.top_y;
571    let table_bottom = table.bbox.bottom_y;
572    let table_height = table.bbox.height().max(1.0);
573    let page_top = doc
574        .kids
575        .iter()
576        .map(|element| element.bbox().top_y)
577        .fold(f64::NEG_INFINITY, f64::max);
578    if !page_top.is_finite() || page_top - table_top > table_height * 3.0 {
579        return None;
580    }
581
582    let caption_gap_limit = (table_height * 2.2).clamp(48.0, 132.0);
583    let mut caption_indices = Vec::new();
584    for idx in table_idx + 1..doc.kids.len() {
585        let element = &doc.kids[idx];
586        if !is_geometric_text_candidate(element) {
587            if table_bottom - element.bbox().top_y > caption_gap_limit {
588                break;
589            }
590            continue;
591        }
592
593        let text = extract_element_text(element);
594        if text.trim().is_empty() || looks_like_margin_page_number(doc, element, &text) {
595            continue;
596        }
597
598        let gap = table_bottom - element.bbox().top_y;
599        if gap < -6.0 {
600            break;
601        }
602        if gap > caption_gap_limit {
603            break;
604        }
605        caption_indices.push(idx);
606    }
607    if caption_indices.is_empty() {
608        return None;
609    }
610
611    let has_body_below = doc
612        .kids
613        .iter()
614        .enumerate()
615        .skip(caption_indices.last().copied()? + 1)
616        .any(|(_, element)| {
617            is_geometric_text_candidate(element)
618                && !extract_element_text(element).trim().is_empty()
619                && table_bottom - element.bbox().top_y > caption_gap_limit
620        });
621    if !has_body_below {
622        return None;
623    }
624
625    let mut output = String::new();
626    render_table_border(&mut output, table);
627
628    let mut caption = String::new();
629    for idx in &caption_indices {
630        let text = extract_element_text(&doc.kids[*idx]);
631        if text.trim().is_empty() {
632            continue;
633        }
634        merge_paragraph_text(&mut caption, &text);
635    }
636    let trimmed = caption.trim();
637    if trimmed.is_empty() {
638        return None;
639    }
640    output.push_str(&escape_md_line_start(trimmed));
641    output.push_str("\n\n");
642    Some(output)
643}
644
645fn render_single_table_report_document(doc: &PdfDocument) -> Option<String> {
646    if doc.number_of_pages != 1 || !(2..=4).contains(&doc.kids.len()) {
647        return None;
648    }
649
650    let title = &doc.kids[0];
651    if !is_geometric_text_candidate(title) {
652        return None;
653    }
654    let title_text = extract_element_text(title);
655    if title_text.trim().is_empty() || title_text.split_whitespace().count() < 4 {
656        return None;
657    }
658
659    let table = table_border_from_element(&doc.kids[1])?;
660    if table.num_columns < 4 || table.rows.len() < 4 {
661        return None;
662    }
663
664    let page_top = doc
665        .kids
666        .iter()
667        .map(|element| element.bbox().top_y)
668        .fold(f64::NEG_INFINITY, f64::max);
669    if !page_top.is_finite() {
670        return None;
671    }
672
673    let title_bbox = title.bbox();
674    let table_bbox = &table.bbox;
675    if page_top - title_bbox.top_y > 24.0 {
676        return None;
677    }
678
679    let vertical_gap = title_bbox.bottom_y - table_bbox.top_y;
680    if !(8.0..=40.0).contains(&vertical_gap) {
681        return None;
682    }
683
684    if (title_bbox.center_x() - table_bbox.center_x()).abs() > table_bbox.width() * 0.12 {
685        return None;
686    }
687
688    if doc.kids.iter().skip(2).any(|element| {
689        let text = extract_element_text(element);
690        let trimmed = text.trim();
691        !trimmed.is_empty()
692            && !looks_like_footer_banner(trimmed)
693            && !looks_like_margin_page_number(doc, element, trimmed)
694    }) {
695        return None;
696    }
697
698    let mut rows = collect_table_border_rows(table);
699    if rows.is_empty() {
700        return None;
701    }
702    merge_continuation_rows(&mut rows);
703    trim_leading_table_carryover_rows(&mut rows);
704    if rows.len() < 2 {
705        return None;
706    }
707
708    let mut output = String::new();
709    output.push_str("# ");
710    output.push_str(title_text.trim());
711    output.push_str("\n\n");
712    output.push_str(&render_pipe_rows(&rows));
713    Some(output)
714}
715
716fn render_late_section_boundary_document(doc: &PdfDocument) -> Option<String> {
717    if doc.number_of_pages != 1 || doc.kids.len() < 8 {
718        return None;
719    }
720
721    let page_top = doc
722        .kids
723        .iter()
724        .map(|element| element.bbox().top_y)
725        .fold(f64::NEG_INFINITY, f64::max);
726    if !page_top.is_finite() {
727        return None;
728    }
729
730    let heading_idx = doc.kids.iter().position(|element| {
731        matches!(
732            element,
733            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
734        )
735    })?;
736    if heading_idx < 5 {
737        return None;
738    }
739
740    let heading = &doc.kids[heading_idx];
741    let heading_text = extract_element_text(heading);
742    if heading_text.trim().is_empty() {
743        return None;
744    }
745
746    let heading_top = heading.bbox().top_y;
747    if page_top - heading_top < 240.0 {
748        return None;
749    }
750
751    let leading_text_indices = (0..heading_idx)
752        .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
753        .collect::<Vec<_>>();
754    if leading_text_indices.len() < 5 {
755        return None;
756    }
757
758    let colon_ended = leading_text_indices
759        .iter()
760        .filter(|idx| {
761            extract_element_text(&doc.kids[**idx])
762                .trim_end()
763                .ends_with(':')
764        })
765        .count();
766    if colon_ended * 2 < leading_text_indices.len() {
767        return None;
768    }
769
770    let trailing_indices = (heading_idx + 1..doc.kids.len())
771        .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
772        .filter(|idx| {
773            let text = extract_element_text(&doc.kids[*idx]);
774            !text.trim().is_empty() && !looks_like_margin_page_number(doc, &doc.kids[*idx], &text)
775        })
776        .collect::<Vec<_>>();
777    if trailing_indices.is_empty() || trailing_indices.len() > 5 {
778        return None;
779    }
780
781    let mut footer_count = 0usize;
782    let content_indices = trailing_indices
783        .into_iter()
784        .filter(|idx| {
785            let text = extract_element_text(&doc.kids[*idx]);
786            let is_footerish =
787                doc.kids[*idx].bbox().top_y < 96.0 && text.split_whitespace().count() >= 4;
788            footer_count += usize::from(is_footerish);
789            !is_footerish
790        })
791        .collect::<Vec<_>>();
792    if content_indices.is_empty() || footer_count == 0 {
793        return None;
794    }
795
796    let mut fragments = content_indices
797        .iter()
798        .map(|idx| (*idx, &doc.kids[*idx]))
799        .collect::<Vec<_>>();
800    fragments.sort_by(|left, right| cmp_banded_reading_order(left.1.bbox(), right.1.bbox(), 6.0));
801
802    let mut paragraph = String::new();
803    for (_, element) in fragments {
804        let text = extract_element_text(element);
805        if text.trim().is_empty() {
806            continue;
807        }
808        merge_paragraph_text(&mut paragraph, &text);
809    }
810    let trimmed_paragraph = paragraph.trim();
811    if trimmed_paragraph.is_empty() {
812        return None;
813    }
814
815    let mut output = String::new();
816    output.push_str("# ");
817    output.push_str(heading_text.trim());
818    output.push_str("\n\n");
819    output.push_str(&escape_md_line_start(trimmed_paragraph));
820    output.push_str("\n\n");
821    Some(output)
822}
823
/// A layout line that is being considered as a table header row.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutHeaderCandidate {
    /// Index of the candidate line within the layout line list.
    line_idx: usize,
    /// Header cell texts.
    headers: Vec<String>,
    /// NOTE(review): presumably the start offset of each header cell within the line — confirm
    /// against the detector that fills this in.
    starts: Vec<usize>,
}
831
/// One layout line split into cell texts, tagged with the line it came from.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutEntry {
    /// Index of the source line within the layout line list.
    line_idx: usize,
    /// Cell texts extracted from that line.
    cells: Vec<String>,
}
838
/// A row of cells tied to a range of anchor indices.
///
/// NOTE(review): the anchor indices presumably refer to layout lines that start and end the
/// row — confirm against the code that builds these rows.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutAnchorRow {
    /// Index of the first anchor belonging to this row.
    anchor_idx: usize,
    /// Index of the last anchor belonging to this row.
    last_anchor_idx: usize,
    /// Cell texts of the row.
    cells: Vec<String>,
}
846
/// A layout line that is being considered as the header row of a panel.
///
/// Has the same fields as `LayoutHeaderCandidate`.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutPanelHeaderCandidate {
    /// Index of the candidate line within the layout line list.
    line_idx: usize,
    /// Header cell texts.
    headers: Vec<String>,
    /// NOTE(review): presumably the start offset of each header cell within the line — confirm.
    starts: Vec<usize>,
}
854
/// One table-of-contents style entry: a title paired with a page string.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutTocEntry {
    /// Entry title text.
    title: String,
    /// Page reference, kept as text.
    page: String,
    /// NOTE(review): presumably the offset at which the title starts in its layout line
    /// (e.g. usable as an indent measure) — confirm against the producer.
    title_start: usize,
}
862
/// A single word of the bbox layout together with its bounding box.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutWord {
    /// Bounding box of the word.
    bbox: BoundingBox,
    /// Text of the word.
    text: String,
}
869
/// A line of the bbox layout: its words, its bounding box and the block it belongs to.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutLine {
    /// Identifier of the block this line belongs to.
    block_id: usize,
    /// Bounding box of the line.
    bbox: BoundingBox,
    /// Words that make up the line.
    words: Vec<BBoxLayoutWord>,
}
877
/// A piece of text paired with the bounding box it occupies.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutTextFragment {
    /// Bounding box of the fragment.
    bbox: BoundingBox,
    /// Text of the fragment.
    text: String,
}
884
/// A candidate "open plate" region: a heading, a table (header row plus data rows) and a caption.
///
/// NOTE(review): the exact meaning of "open plate" is not visible in this part of the file —
/// confirm against the detector that builds these candidates.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct OpenPlateCandidate {
    /// Heading text.
    heading: String,
    /// Header cells of the table.
    header_row: Vec<String>,
    /// Data rows of the table, one `Vec<String>` of cells per row.
    rows: Vec<Vec<String>>,
    /// Caption text.
    caption: String,
    /// NOTE(review): presumably a `top_y` threshold separating the plate from other page
    /// content — confirm how callers compare against it.
    cutoff_top_y: f64,
}
894
/// Optional pieces of narrative text gathered around a layout-detected region.
///
/// NOTE(review): field roles below are read from the names only; confirm against the producer.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutNarrativeBridge {
    /// Paragraph text, if one was found.
    bridge_paragraph: Option<String>,
    /// Caption texts held back for later output.
    deferred_captions: Vec<String>,
    /// `top_y` at which body text starts, if known.
    body_start_top_y: Option<f64>,
}
901
/// A block of the bbox layout: a group of lines sharing one block id and a bounding box.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutBlock {
    /// Identifier of the block.
    block_id: usize,
    /// Bounding box of the block.
    bbox: BoundingBox,
    /// Lines contained in the block.
    lines: Vec<BBoxLayoutLine>,
}
909
/// Content extracted for a two-sided "dashboard" page: a title, a left table, a right table
/// and trailing notes.
///
/// NOTE(review): the left/right split is read from the field names; confirm against the
/// detector and renderer that use this type.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutOcrDashboard {
    /// Optional short line associated with the title.
    eyebrow: Option<String>,
    /// Main title text.
    title: String,
    /// Heading of the left part.
    left_heading: String,
    /// Column names of the left table.
    left_columns: Vec<String>,
    /// Rows of the left table.
    left_rows: Vec<Vec<String>>,
    /// Heading of the right part.
    right_heading: String,
    /// Rows of the right table.
    right_rows: Vec<Vec<String>>,
    /// Definition notes.
    definition_notes: Vec<String>,
    /// Source notes.
    source_notes: Vec<String>,
}
922
/// One panel of a recommendation infographic.
///
/// `render_layout_recommendation_infographic_document_cached` emits it as a `## ` heading,
/// the subtitle, a pipe table (`header` followed by `rows`) and an optional `*Note:*` list.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRecommendationPanel {
    /// Panel heading.
    heading: String,
    /// Text rendered directly below the heading.
    subtitle: String,
    /// Header row of the panel table.
    header: Vec<String>,
    /// Data rows of the panel table.
    rows: Vec<Vec<String>>,
    /// Notes rendered as a bullet list after the table; may be empty.
    notes: Vec<String>,
}
931
/// A recommendation infographic: optional eyebrow, title and a list of panels.
///
/// When rendered, the eyebrow (if any) becomes the `# ` heading and the title follows as
/// plain text.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRecommendationInfographic {
    /// Optional line rendered as the top-level heading.
    eyebrow: Option<String>,
    /// Title text rendered as a paragraph.
    title: String,
    /// Panels, rendered in order.
    panels: Vec<LayoutRecommendationPanel>,
}
938
/// A token carrying an integer value, its original text and its bounding box.
///
/// NOTE(review): presumably a numeric label read off a bar chart — confirm against the
/// chart detectors that build these tokens.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutBarToken {
    /// Bounding box of the token.
    bbox: BoundingBox,
    /// Integer value of the token.
    value: i64,
    /// Text of the token.
    text: String,
}
946
/// Table-shaped data for a stacked-bar figure keyed by month.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
struct LayoutStackedBarFigure {
    /// Figure caption.
    caption: String,
    /// Month labels.
    months: Vec<String>,
    /// Labels of the rows.
    row_labels: Vec<String>,
    /// Table rows, one `Vec<String>` of cells per row.
    rows: Vec<Vec<String>>,
}
955
/// Table-shaped data for a stacked-bar figure keyed by month and sector.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
struct LayoutStackedBarSectorFigure {
    /// Figure caption.
    caption: String,
    /// Month labels.
    months: Vec<String>,
    /// Sector names.
    sectors: Vec<String>,
    /// Table rows, one `Vec<String>` of cells per row.
    rows: Vec<Vec<String>>,
}
964
/// Narrative section of a stacked-bar report page.
///
/// `render_layout_stacked_bar_report_document_cached` renders it as a `# ` heading, the
/// paragraphs, and the footnote in italics; it also passes `top_y` to the second figure's
/// detector.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutStackedBarNarrative {
    /// Heading of the narrative section.
    heading: String,
    /// Body paragraphs, in output order.
    paragraphs: Vec<String>,
    /// Optional footnote text.
    footnote: Option<String>,
    /// `top_y` coordinate of the narrative section.
    top_y: f64,
}
972
/// A figure reduced to a caption plus a label/value series.
///
/// `render_layout_multi_figure_chart_document_cached` renders the series as a two-column
/// table only when there are at least four labels and as many values as labels.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutSeriesFigure {
    /// Figure caption.
    caption: String,
    /// Series labels.
    labels: Vec<String>,
    /// Series values, kept as text; paired with `labels` by position when rendered.
    values: Vec<String>,
    /// Optional source line, rendered in italics after the table.
    source: Option<String>,
}
980
/// A short caption label paired with its title, as built by `detect_layout_caption_sections`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutCaptionSection {
    /// Normalized label text; renderers branch on the `"Figure "` and `"Diagram "` prefixes.
    label: String,
    /// Title text with any trailing footnote marker already split off.
    title: String,
    /// Footnote number split from the end of the title, if one was found.
    footnote_number: Option<String>,
    /// Larger of the label block's and title block's `top_y`; used to order sections.
    top_y: f64,
}
988
/// One item of captioned-media output, used to interleave captions and prose by `top_y`.
#[cfg(not(target_arch = "wasm32"))]
enum LayoutCaptionedMediaEvent {
    /// A caption section, rendered via `render_layout_caption_section`.
    Caption(LayoutCaptionSection),
    /// A prose paragraph, rendered as escaped plain text.
    Paragraph(String),
}
994
/// Everything `build_layout_captioned_media_profile` collects about a captioned-media page.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutCaptionedMediaProfile {
    /// Caption sections detected from the bbox layout blocks.
    sections: Vec<LayoutCaptionSection>,
    /// `(top_y, text)` pairs for prose paragraphs, sorted by descending `top_y`.
    prose: Vec<(f64, String)>,
    /// Footnote detected at the end of the layout lines, if any.
    footnote: Option<String>,
    /// Number of `Image`, `Figure` and `Picture` elements in the document.
    image_count: usize,
}
1002
1003#[cfg(not(target_arch = "wasm32"))]
1004#[allow(dead_code)]
1005fn render_layout_captioned_media_document(doc: &PdfDocument) -> Option<String> {
1006    let mut layout_cache = LayoutSourceCache::default();
1007    render_layout_captioned_media_document_cached(doc, &mut layout_cache)
1008}
1009
1010#[cfg(not(target_arch = "wasm32"))]
1011fn render_layout_captioned_media_document_cached(
1012    doc: &PdfDocument,
1013    layout_cache: &mut LayoutSourceCache,
1014) -> Option<String> {
1015    if doc.number_of_pages != 1 {
1016        return None;
1017    }
1018    let paragraph_count = doc
1019        .kids
1020        .iter()
1021        .filter(|element| matches!(element, ContentElement::Paragraph(_)))
1022        .count();
1023    let image_count = doc
1024        .kids
1025        .iter()
1026        .filter(|element| {
1027            matches!(
1028                element,
1029                ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1030            )
1031        })
1032        .count();
1033    if paragraph_count == 0 || image_count == 0 {
1034        return None;
1035    }
1036    let has_explicit_structure = doc.kids.iter().any(|element| {
1037        matches!(
1038            element,
1039            ContentElement::Caption(_)
1040                | ContentElement::Heading(_)
1041                | ContentElement::NumberHeading(_)
1042                | ContentElement::Table(_)
1043                | ContentElement::List(_)
1044        )
1045    });
1046    if has_explicit_structure {
1047        return None;
1048    }
1049
1050    let profile = build_layout_captioned_media_profile(doc, layout_cache)?;
1051    if profile.sections.is_empty() || (profile.sections.len() == 1 && profile.footnote.is_none()) {
1052        return None;
1053    }
1054    let has_non_figure_label = profile
1055        .sections
1056        .iter()
1057        .any(|section| !section.label.starts_with("Figure "));
1058    let has_anchored_footnote = profile.footnote.is_some()
1059        || profile
1060            .sections
1061            .iter()
1062            .any(|section| section.footnote_number.is_some());
1063    if !has_non_figure_label && !has_anchored_footnote {
1064        return None;
1065    }
1066
1067    if let Some(rendered) = render_layout_captioned_media_explainer(&profile) {
1068        return Some(rendered);
1069    }
1070
1071    let mut events = profile
1072        .sections
1073        .into_iter()
1074        .map(|section| (section.top_y, LayoutCaptionedMediaEvent::Caption(section)))
1075        .collect::<Vec<_>>();
1076    for (top_y, paragraph) in profile.prose {
1077        events.push((top_y, LayoutCaptionedMediaEvent::Paragraph(paragraph)));
1078    }
1079    events.sort_by(|left, right| {
1080        right
1081            .0
1082            .partial_cmp(&left.0)
1083            .unwrap_or(std::cmp::Ordering::Equal)
1084    });
1085
1086    let mut output = String::new();
1087    for (_, event) in events {
1088        match event {
1089            LayoutCaptionedMediaEvent::Caption(section) => {
1090                output.push_str(&render_layout_caption_section(&section));
1091            }
1092            LayoutCaptionedMediaEvent::Paragraph(paragraph) => {
1093                output.push_str(&escape_md_line_start(paragraph.trim()));
1094                output.push_str("\n\n");
1095            }
1096        }
1097    }
1098
1099    if let Some(footnote_text) = profile.footnote {
1100        output.push_str("---\n\n");
1101        output.push_str("**Footnote:**\n");
1102        output.push_str(&escape_md_line_start(footnote_text.trim()));
1103        output.push('\n');
1104    }
1105
1106    Some(output.trim_end().to_string() + "\n")
1107}
1108
1109#[cfg(not(target_arch = "wasm32"))]
1110fn build_layout_captioned_media_profile(
1111    doc: &PdfDocument,
1112    layout_cache: &mut LayoutSourceCache,
1113) -> Option<LayoutCaptionedMediaProfile> {
1114    let layout = layout_cache.bbox_layout(doc)?;
1115    let sections = detect_layout_caption_sections(&layout.blocks);
1116    let footnote = detect_layout_bottom_footnote(&layout.lines);
1117
1118    let mut prose = doc
1119        .kids
1120        .iter()
1121        .filter_map(|element| match element {
1122            ContentElement::Paragraph(_)
1123            | ContentElement::TextBlock(_)
1124            | ContentElement::TextLine(_) => {
1125                let text = clean_paragraph_text(&extract_element_text(element));
1126                let trimmed = text.trim();
1127                (!trimmed.is_empty()
1128                    && trimmed.split_whitespace().count() >= 8
1129                    && !starts_with_caption_prefix(trimmed)
1130                    && !trimmed
1131                        .chars()
1132                        .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1133                    && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1134                    && !looks_like_footer_banner(trimmed))
1135                .then_some((element.bbox().top_y, trimmed.to_string()))
1136            }
1137            _ => None,
1138        })
1139        .filter(|(top_y, paragraph)| {
1140            !sections.iter().any(|section| {
1141                (*top_y - section.top_y).abs() <= 36.0
1142                    || section.title.contains(paragraph)
1143                    || paragraph.contains(&section.title)
1144            })
1145        })
1146        .collect::<Vec<_>>();
1147    prose.sort_by(|left, right| {
1148        right
1149            .0
1150            .partial_cmp(&left.0)
1151            .unwrap_or(std::cmp::Ordering::Equal)
1152    });
1153    if prose.len() > 2 {
1154        return None;
1155    }
1156
1157    let image_count = doc
1158        .kids
1159        .iter()
1160        .filter(|element| {
1161            matches!(
1162                element,
1163                ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1164            )
1165        })
1166        .count();
1167
1168    Some(LayoutCaptionedMediaProfile {
1169        sections,
1170        prose,
1171        footnote,
1172        image_count,
1173    })
1174}
1175
1176#[cfg(not(target_arch = "wasm32"))]
1177fn render_layout_captioned_media_explainer(
1178    profile: &LayoutCaptionedMediaProfile,
1179) -> Option<String> {
1180    if profile.sections.len() != 1
1181        || profile.prose.len() != 2
1182        || profile.image_count != 1
1183        || profile.footnote.is_none()
1184        || !profile
1185            .sections
1186            .iter()
1187            .all(|section| section.label.starts_with("Figure "))
1188    {
1189        return None;
1190    }
1191
1192    let mut output = String::new();
1193    output.push_str("# ");
1194    output.push_str(profile.prose[0].1.trim());
1195    output.push('\n');
1196    output.push_str(&escape_md_line_start(profile.prose[1].1.trim()));
1197    output.push_str("\n\n");
1198    output.push_str("*Image*\n\n");
1199    output.push_str(&render_layout_caption_section(&profile.sections[0]));
1200    output.push_str("---\n\n");
1201    output.push_str("**Footnote:**\n");
1202    output.push_str(&escape_md_line_start(
1203        profile.footnote.as_deref().unwrap_or_default().trim(),
1204    ));
1205    output.push('\n');
1206    Some(output)
1207}
1208
1209#[cfg(not(target_arch = "wasm32"))]
1210fn detect_layout_caption_sections(blocks: &[BBoxLayoutBlock]) -> Vec<LayoutCaptionSection> {
1211    let normalized_blocks = blocks
1212        .iter()
1213        .map(|block| {
1214            (
1215                block,
1216                normalize_common_ocr_text(&bbox_layout_block_text(block)),
1217            )
1218        })
1219        .collect::<Vec<_>>();
1220
1221    let mut used_titles = HashSet::new();
1222    let mut sections = Vec::new();
1223    for (block, label_text) in &normalized_blocks {
1224        if !is_short_caption_label(label_text) {
1225            continue;
1226        }
1227
1228        let label_bbox = &block.bbox;
1229        let title_candidate = normalized_blocks
1230            .iter()
1231            .filter(|(candidate, text)| {
1232                candidate.block_id != block.block_id
1233                    && !used_titles.contains(&candidate.block_id)
1234                    && !text.is_empty()
1235                    && !is_short_caption_label(text)
1236                    && !starts_with_caption_prefix(text)
1237                    && !looks_like_footer_banner(text)
1238                    && !is_page_number_like(text)
1239                    && text.split_whitespace().count() >= 2
1240                    && candidate.bbox.width() >= 60.0
1241            })
1242            .filter_map(|(candidate, text)| {
1243                let vertical_gap = (candidate.bbox.center_y() - label_bbox.center_y()).abs();
1244                let horizontal_gap = if candidate.bbox.left_x > label_bbox.right_x {
1245                    candidate.bbox.left_x - label_bbox.right_x
1246                } else if label_bbox.left_x > candidate.bbox.right_x {
1247                    label_bbox.left_x - candidate.bbox.right_x
1248                } else {
1249                    0.0
1250                };
1251                (vertical_gap <= 28.0 && horizontal_gap <= 180.0).then_some((
1252                    vertical_gap + horizontal_gap * 0.15,
1253                    *candidate,
1254                    text.clone(),
1255                ))
1256            })
1257            .min_by(|left, right| {
1258                left.0
1259                    .partial_cmp(&right.0)
1260                    .unwrap_or(std::cmp::Ordering::Equal)
1261            });
1262
1263        let Some((_, title_block, title_text)) = title_candidate else {
1264            continue;
1265        };
1266        used_titles.insert(title_block.block_id);
1267        let (title, footnote_number) = split_trailing_caption_footnote_marker(&title_text);
1268        sections.push(LayoutCaptionSection {
1269            label: label_text.to_string(),
1270            title,
1271            footnote_number,
1272            top_y: label_bbox.top_y.max(title_block.bbox.top_y),
1273        });
1274    }
1275
1276    sections.sort_by(|left, right| {
1277        right
1278            .top_y
1279            .partial_cmp(&left.top_y)
1280            .unwrap_or(std::cmp::Ordering::Equal)
1281    });
1282    sections
1283}
1284
1285#[cfg(not(target_arch = "wasm32"))]
1286fn split_trailing_caption_footnote_marker(text: &str) -> (String, Option<String>) {
1287    let trimmed = text.trim();
1288    let re = Regex::new(r"^(?P<title>.*?[.!?])\s*(?P<num>\d{1,2})\s*[A-Za-z]{0,12}$").ok();
1289    if let Some(captures) = re.as_ref().and_then(|re| re.captures(trimmed)) {
1290        return (
1291            captures["title"].trim().to_string(),
1292            Some(captures["num"].to_string()),
1293        );
1294    }
1295
1296    (trimmed.to_string(), None)
1297}
1298
1299#[cfg(not(target_arch = "wasm32"))]
1300fn detect_layout_bottom_footnote(lines: &[BBoxLayoutLine]) -> Option<String> {
1301    let normalized_lines = lines
1302        .iter()
1303        .map(|line| {
1304            (
1305                line.bbox.top_y,
1306                normalize_common_ocr_text(&bbox_layout_line_text(line)),
1307            )
1308        })
1309        .filter(|(_, text)| !text.is_empty() && !is_page_number_like(text))
1310        .collect::<Vec<_>>();
1311    let start_idx = normalized_lines.iter().rposition(|(_, text)| {
1312        text.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1313            && text.split_whitespace().count() >= 6
1314    })?;
1315
1316    let mut collected = vec![normalized_lines[start_idx].1.clone()];
1317    let mut last_top_y = normalized_lines[start_idx].0;
1318    for (top_y, text) in normalized_lines.iter().skip(start_idx + 1) {
1319        if is_page_number_like(text) {
1320            break;
1321        }
1322        if (last_top_y - *top_y).abs() > 28.0 {
1323            break;
1324        }
1325        collected.push(text.clone());
1326        last_top_y = *top_y;
1327    }
1328
1329    if collected.is_empty() {
1330        return None;
1331    }
1332    let merged = collected.join(" ");
1333    Some(normalize_layout_footnote_text(&merged))
1334}
1335
1336#[cfg(not(target_arch = "wasm32"))]
1337fn normalize_layout_footnote_text(text: &str) -> String {
1338    let mut normalized = text.replace(",https://", ", https://");
1339    let url_gap_re = Regex::new(r"(https?://\S+)\s+(\S+)").ok();
1340    while let Some(re) = &url_gap_re {
1341        let next = re.replace(&normalized, "$1$2").to_string();
1342        if next == normalized {
1343            break;
1344        }
1345        normalized = next;
1346    }
1347    normalized
1348}
1349
1350#[cfg(not(target_arch = "wasm32"))]
1351fn render_layout_caption_section(section: &LayoutCaptionSection) -> String {
1352    let mut output = String::new();
1353    if section.label.starts_with("Diagram ") {
1354        output.push_str("## ");
1355        output.push_str(section.label.trim());
1356        output.push('\n');
1357        if !section.title.trim().is_empty() {
1358            let title = normalize_layout_caption_title_text(section.title.trim());
1359            output.push_str("**");
1360            output.push_str(&title);
1361            output.push_str("**\n\n");
1362        } else {
1363            output.push('\n');
1364        }
1365        return output;
1366    }
1367
1368    if section.label.starts_with("Figure ") && section.footnote_number.is_none() {
1369        output.push('*');
1370        output.push_str(section.label.trim());
1371        output.push_str("*\n\n");
1372    }
1373
1374    output.push_str("**");
1375    output.push_str(section.label.trim());
1376    output.push_str("**\n");
1377
1378    if !section.title.trim().is_empty() {
1379        let title_lines = split_layout_caption_title_lines(section.title.trim());
1380        let last_idx = title_lines.len().saturating_sub(1);
1381        for (idx, line) in title_lines.iter().enumerate() {
1382            if section.footnote_number.is_some() {
1383                output.push_str("**");
1384                output.push_str(line.trim());
1385                if idx == last_idx {
1386                    output.push_str("**^");
1387                    output.push_str(section.footnote_number.as_deref().unwrap_or_default());
1388                } else {
1389                    output.push_str("**");
1390                }
1391            } else {
1392                output.push('*');
1393                output.push_str(line.trim());
1394                output.push('*');
1395            }
1396            output.push('\n');
1397        }
1398    }
1399    output.push('\n');
1400    output
1401}
1402
1403#[cfg(not(target_arch = "wasm32"))]
1404fn split_layout_caption_title_lines(title: &str) -> Vec<String> {
1405    let title = normalize_layout_caption_title_text(title);
1406    if let Some(idx) = title.find(" Content:") {
1407        let head = title[..idx].trim();
1408        let tail = title[idx + 1..].trim();
1409        if !head.is_empty() && head.split_whitespace().count() <= 3 && !tail.is_empty() {
1410            return vec![head.to_string(), tail.to_string()];
1411        }
1412    }
1413    vec![title.to_string()]
1414}
1415
1416#[cfg(not(target_arch = "wasm32"))]
1417fn normalize_layout_caption_title_text(title: &str) -> String {
1418    Regex::new(r"(\d{4})-\s+(\d{4})")
1419        .ok()
1420        .map(|re| re.replace_all(title, "$1-$2").to_string())
1421        .unwrap_or_else(|| title.to_string())
1422}
1423
1424#[cfg(not(target_arch = "wasm32"))]
1425#[allow(dead_code)]
1426fn render_layout_single_caption_chart_document(doc: &PdfDocument) -> Option<String> {
1427    let mut layout_cache = LayoutSourceCache::default();
1428    render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
1429}
1430
1431#[cfg(not(target_arch = "wasm32"))]
1432fn render_layout_single_caption_chart_document_cached(
1433    doc: &PdfDocument,
1434    _layout_cache: &mut LayoutSourceCache,
1435) -> Option<String> {
1436    if doc.number_of_pages != 1 {
1437        return None;
1438    }
1439
1440    let caption_indices = doc
1441        .kids
1442        .iter()
1443        .enumerate()
1444        .filter_map(|(idx, element)| {
1445            let text = extract_element_text(element);
1446            let trimmed = text.trim();
1447            (trimmed.starts_with("Figure ")
1448                && trimmed.contains(':')
1449                && trimmed.split_whitespace().count() >= 6)
1450                .then_some(idx)
1451        })
1452        .collect::<Vec<_>>();
1453    if caption_indices.len() != 1 {
1454        return None;
1455    }
1456    if doc.kids.len() < 12 {
1457        return None;
1458    }
1459
1460    let caption_idx = caption_indices[0];
1461    let mut output = String::new();
1462    let mut i = 0usize;
1463    let mut chart_mode = false;
1464    while i < doc.kids.len() {
1465        let element = &doc.kids[i];
1466        let text = extract_element_text(element);
1467        let trimmed = text.trim();
1468        if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
1469            i += 1;
1470            continue;
1471        }
1472
1473        if i == caption_idx {
1474            output.push_str(&escape_md_line_start(trimmed));
1475            output.push_str("\n\n");
1476            chart_mode = true;
1477            i += 1;
1478            continue;
1479        }
1480
1481        if chart_mode {
1482            if !looks_like_chart_followup_paragraph(element, trimmed)
1483                && !matches!(
1484                    element,
1485                    ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1486                )
1487            {
1488                i += 1;
1489                continue;
1490            }
1491            chart_mode = false;
1492        }
1493
1494        match element {
1495            ContentElement::Heading(h) => {
1496                let level = h.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1497                output.push_str(&"#".repeat(level));
1498                output.push(' ');
1499                output.push_str(trimmed);
1500                output.push_str("\n\n");
1501            }
1502            ContentElement::NumberHeading(nh) => {
1503                let level = nh.base.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1504                output.push_str(&"#".repeat(level));
1505                output.push(' ');
1506                output.push_str(trimmed);
1507                output.push_str("\n\n");
1508            }
1509            ContentElement::Paragraph(_) | ContentElement::TextBlock(_) => {
1510                let mut merged = trimmed.to_string();
1511                while let Some(next_element) = doc.kids.get(i + 1) {
1512                    let next_text = extract_element_text(next_element);
1513                    let next_trimmed = next_text.trim();
1514                    if next_trimmed.is_empty()
1515                        || looks_like_margin_page_number(doc, next_element, next_trimmed)
1516                    {
1517                        i += 1;
1518                        continue;
1519                    }
1520                    if i + 1 == caption_idx
1521                        || looks_like_chart_noise_element(next_element, next_trimmed)
1522                    {
1523                        break;
1524                    }
1525                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
1526                        should_merge_adjacent_semantic_paragraphs(&merged, next_trimmed)
1527                    } else {
1528                        should_merge_paragraph_text(&merged, next_trimmed)
1529                    };
1530                    if !can_merge {
1531                        break;
1532                    }
1533                    merge_paragraph_text(&mut merged, next_trimmed);
1534                    i += 1;
1535                }
1536
1537                output.push_str(&escape_md_line_start(merged.trim()));
1538                output.push_str("\n\n");
1539            }
1540            _ => {}
1541        }
1542
1543        i += 1;
1544    }
1545
1546    Some(output.trim_end().to_string() + "\n")
1547}
1548
1549fn looks_like_chart_noise_element(_element: &ContentElement, text: &str) -> bool {
1550    if text.is_empty() {
1551        return false;
1552    }
1553
1554    if is_standalone_page_number(text) || looks_like_numeric_axis_blob(text) {
1555        return true;
1556    }
1557
1558    let word_count = text.split_whitespace().count();
1559    let lower = text.to_ascii_lowercase();
1560
1561    if lower.starts_with("figure ") && text.contains(':') {
1562        return false;
1563    }
1564
1565    if lower.starts_with("source:") {
1566        return false;
1567    }
1568
1569    if word_count <= 3
1570        && (looks_like_yearish_label(text)
1571            || looks_like_layout_month_label(text)
1572            || text == "Lockdown Period")
1573    {
1574        return true;
1575    }
1576
1577    if text
1578        .chars()
1579        .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1580    {
1581        return true;
1582    }
1583
1584    let short_non_sentence = !text.contains('.') && !text.contains(':') && !text.contains(';');
1585    let has_chart_keyword = lower.contains("working as usual")
1586        || lower.contains("temporarily closed")
1587        || lower.contains("business premises")
1588        || lower.contains("operations continue");
1589
1590    word_count <= 10 || (short_non_sentence && word_count <= 14) || has_chart_keyword
1591}
1592
1593fn looks_like_chart_followup_paragraph(_element: &ContentElement, text: &str) -> bool {
1594    let word_count = text.split_whitespace().count();
1595    word_count >= 18
1596        && !text.trim_start().starts_with("Figure ")
1597        && !text.trim_start().starts_with("Table ")
1598}
1599
1600#[cfg(not(target_arch = "wasm32"))]
1601#[allow(dead_code)]
1602fn render_layout_recommendation_infographic_document(doc: &PdfDocument) -> Option<String> {
1603    let mut layout_cache = LayoutSourceCache::default();
1604    render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
1605}
1606
1607#[cfg(not(target_arch = "wasm32"))]
1608fn render_layout_recommendation_infographic_document_cached(
1609    doc: &PdfDocument,
1610    layout_cache: &mut LayoutSourceCache,
1611) -> Option<String> {
1612    if doc.number_of_pages != 1 {
1613        return None;
1614    }
1615
1616    let layout = layout_cache.bbox_layout(doc)?;
1617    let infographic = detect_layout_recommendation_infographic(layout.page_width, &layout.lines)?;
1618
1619    let mut output = String::new();
1620    if let Some(eyebrow) = infographic.eyebrow.as_deref() {
1621        output.push_str("# ");
1622        output.push_str(eyebrow.trim());
1623        output.push_str("\n\n");
1624    }
1625    output.push_str(&escape_md_line_start(infographic.title.trim()));
1626    output.push_str("\n\n");
1627
1628    for panel in &infographic.panels {
1629        output.push_str("## ");
1630        output.push_str(panel.heading.trim());
1631        output.push_str("\n\n");
1632        output.push_str(&escape_md_line_start(panel.subtitle.trim()));
1633        output.push_str("\n\n");
1634
1635        let mut rows = Vec::with_capacity(panel.rows.len() + 1);
1636        rows.push(panel.header.clone());
1637        rows.extend(panel.rows.clone());
1638        output.push_str(&render_pipe_rows(&rows));
1639
1640        if !panel.notes.is_empty() {
1641            output.push_str("*Note:*\n");
1642            for note in &panel.notes {
1643                output.push_str("- ");
1644                output.push_str(note.trim());
1645                output.push('\n');
1646            }
1647            output.push('\n');
1648        }
1649    }
1650
1651    Some(output.trim_end().to_string() + "\n")
1652}
1653
1654#[cfg(not(target_arch = "wasm32"))]
1655#[allow(dead_code)]
1656fn render_layout_stacked_bar_report_document(doc: &PdfDocument) -> Option<String> {
1657    let mut layout_cache = LayoutSourceCache::default();
1658    render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
1659}
1660
1661#[cfg(not(target_arch = "wasm32"))]
1662fn render_layout_stacked_bar_report_document_cached(
1663    doc: &PdfDocument,
1664    layout_cache: &mut LayoutSourceCache,
1665) -> Option<String> {
1666    if doc.number_of_pages != 1 {
1667        return None;
1668    }
1669
1670    let layout = layout_cache.bbox_layout(doc)?;
1671    let figure_captions = collect_layout_figure_captions(&layout.blocks);
1672    if figure_captions.len() != 2 {
1673        return None;
1674    }
1675    let narrative = detect_layout_stacked_bar_narrative(&layout.blocks)?;
1676    let figure_one = detect_layout_three_month_stacked_figure(
1677        &layout.blocks,
1678        &layout.lines,
1679        layout.page_width,
1680        figure_captions[0].clone(),
1681        figure_captions[1].bbox.top_y,
1682    )?;
1683    let figure_two = detect_layout_sector_bar_figure(
1684        &layout.blocks,
1685        &layout.lines,
1686        layout.page_width,
1687        figure_captions[1].clone(),
1688        narrative.top_y,
1689    )?;
1690
1691    let mut output = String::new();
1692    output.push_str("# ");
1693    output.push_str(figure_one.caption.trim());
1694    output.push_str("\n\n");
1695    let mut first_table = vec![{
1696        let mut row = vec![String::new()];
1697        row.extend(figure_one.months.clone());
1698        row
1699    }];
1700    first_table.extend(figure_one.rows.clone());
1701    output.push_str(&render_pipe_rows(&first_table));
1702
1703    output.push_str("# ");
1704    output.push_str(figure_two.caption.trim());
1705    output.push_str("\n\n");
1706    let mut second_table = vec![{
1707        let mut row = vec!["Sector".to_string()];
1708        row.extend(figure_two.months.clone());
1709        row
1710    }];
1711    second_table.extend(figure_two.rows.clone());
1712    output.push_str(&render_pipe_rows(&second_table));
1713
1714    output.push_str("# ");
1715    output.push_str(narrative.heading.trim());
1716    output.push_str("\n\n");
1717    for paragraph in &narrative.paragraphs {
1718        output.push_str(&escape_md_line_start(paragraph.trim()));
1719        output.push_str("\n\n");
1720    }
1721    if let Some(footnote) = narrative.footnote.as_deref() {
1722        output.push('*');
1723        output.push_str(footnote.trim());
1724        output.push_str("*\n");
1725    }
1726
1727    Some(output)
1728}
1729
1730#[cfg(not(target_arch = "wasm32"))]
1731#[allow(dead_code)]
1732fn render_layout_multi_figure_chart_document(doc: &PdfDocument) -> Option<String> {
1733    let mut layout_cache = LayoutSourceCache::default();
1734    render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
1735}
1736
1737#[cfg(not(target_arch = "wasm32"))]
1738fn render_layout_multi_figure_chart_document_cached(
1739    doc: &PdfDocument,
1740    layout_cache: &mut LayoutSourceCache,
1741) -> Option<String> {
1742    if doc.number_of_pages != 1 {
1743        return None;
1744    }
1745
1746    let layout = layout_cache.bbox_layout(doc)?;
1747    let figures = detect_layout_multi_figure_chart_sections(&layout.lines)?;
1748    let rendered_table_count = figures
1749        .iter()
1750        .filter(|figure| figure.labels.len() >= 4 && figure.labels.len() == figure.values.len())
1751        .count();
1752    if figures.len() < 2 || rendered_table_count == 0 {
1753        return None;
1754    }
1755
1756    let mut output = String::from("# Figures from the Document\n\n");
1757    for figure in figures {
1758        output.push_str("## ");
1759        output.push_str(figure.caption.trim());
1760        output.push_str("\n\n");
1761
1762        if figure.labels.len() >= 4 && figure.labels.len() == figure.values.len() {
1763            let label_header = if figure
1764                .labels
1765                .iter()
1766                .all(|label| looks_like_yearish_label(label))
1767            {
1768                "Year"
1769            } else {
1770                "Label"
1771            };
1772            let value_header = chart_value_header(&figure.caption);
1773            output.push_str(&format!("| {} | {} |\n", label_header, value_header));
1774            output.push_str("| --- | --- |\n");
1775            for (label, value) in figure.labels.iter().zip(figure.values.iter()) {
1776                output.push_str(&format!("| {} | {} |\n", label, value));
1777            }
1778            output.push('\n');
1779        }
1780
1781        if let Some(source) = figure.source.as_deref() {
1782            output.push('*');
1783            output.push_str(&escape_md_line_start(source.trim()));
1784            output.push_str("*\n\n");
1785        }
1786    }
1787
1788    Some(output.trim_end().to_string() + "\n")
1789}
1790
1791#[cfg(not(target_arch = "wasm32"))]
1792fn detect_layout_multi_figure_chart_sections(
1793    lines: &[BBoxLayoutLine],
1794) -> Option<Vec<LayoutSeriesFigure>> {
1795    let caption_indices = lines
1796        .iter()
1797        .enumerate()
1798        .filter_map(|(idx, line)| {
1799            let text = bbox_layout_line_text(line);
1800            (text.starts_with("Figure ") && text.split_whitespace().count() >= 4).then_some(idx)
1801        })
1802        .collect::<Vec<_>>();
1803    if caption_indices.len() < 2 {
1804        return None;
1805    }
1806
1807    let mut figures = Vec::new();
1808    for (pos, caption_idx) in caption_indices.iter().enumerate() {
1809        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
1810        let caption = bbox_layout_line_text(&lines[*caption_idx]);
1811
1812        let source_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
1813            bbox_layout_line_text(&lines[*idx])
1814                .to_ascii_lowercase()
1815                .starts_with("source:")
1816        });
1817
1818        let source = source_idx.map(|idx| {
1819            let mut source_lines = vec![&lines[idx]];
1820            let mut cursor = idx + 1;
1821            while cursor < next_caption_idx {
1822                let text = bbox_layout_line_text(&lines[cursor]);
1823                if text.starts_with("Figure ") || looks_like_footer_banner(&text) || text.is_empty()
1824                {
1825                    break;
1826                }
1827                source_lines.push(&lines[cursor]);
1828                if text.ends_with('.') {
1829                    break;
1830                }
1831                cursor += 1;
1832            }
1833            join_layout_lines_as_paragraph(&source_lines)
1834        });
1835
1836        let series_region = &lines[*caption_idx + 1..source_idx.unwrap_or(next_caption_idx)];
1837        let anchors = extract_year_label_anchors_from_section(series_region);
1838        let (labels, values) = if anchors.len() >= 4 {
1839            let values = map_series_values_to_label_anchors(&anchors, series_region);
1840            (
1841                anchors
1842                    .into_iter()
1843                    .map(|anchor| anchor.text)
1844                    .collect::<Vec<_>>(),
1845                values,
1846            )
1847        } else {
1848            (Vec::new(), Vec::new())
1849        };
1850
1851        if source.is_some() || !values.is_empty() {
1852            figures.push(LayoutSeriesFigure {
1853                caption: normalize_layout_dashboard_text(&caption),
1854                labels,
1855                values,
1856                source,
1857            });
1858        }
1859    }
1860
1861    (!figures.is_empty()).then_some(figures)
1862}
1863
1864#[cfg(not(target_arch = "wasm32"))]
1865fn extract_year_label_anchors_from_section(lines: &[BBoxLayoutLine]) -> Vec<LayoutTextFragment> {
1866    let mut year_words = lines
1867        .iter()
1868        .flat_map(|line| line.words.iter())
1869        .filter_map(|word| {
1870            let token = word
1871                .text
1872                .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1873            looks_like_year_token(token).then_some((word.bbox.center_y(), word.clone()))
1874        })
1875        .collect::<Vec<_>>();
1876    if year_words.len() < 4 {
1877        return Vec::new();
1878    }
1879
1880    year_words.sort_by(|left, right| {
1881        right
1882            .0
1883            .partial_cmp(&left.0)
1884            .unwrap_or(std::cmp::Ordering::Equal)
1885    });
1886
1887    let mut best_band = Vec::<BBoxLayoutWord>::new();
1888    for (center_y, _) in &year_words {
1889        let band = year_words
1890            .iter()
1891            .filter(|(candidate_y, _)| (*candidate_y - *center_y).abs() <= 12.0)
1892            .map(|(_, word)| word.clone())
1893            .collect::<Vec<_>>();
1894        if band.len() > best_band.len() {
1895            best_band = band;
1896        }
1897    }
1898    if best_band.len() < 4 {
1899        return Vec::new();
1900    }
1901
1902    let band_center = best_band
1903        .iter()
1904        .map(|word| word.bbox.center_y())
1905        .sum::<f64>()
1906        / best_band.len() as f64;
1907    let mut band_words = lines
1908        .iter()
1909        .flat_map(|line| line.words.iter())
1910        .filter(|word| (word.bbox.center_y() - band_center).abs() <= 12.0)
1911        .cloned()
1912        .collect::<Vec<_>>();
1913    band_words.sort_by(|left, right| {
1914        left.bbox
1915            .left_x
1916            .partial_cmp(&right.bbox.left_x)
1917            .unwrap_or(std::cmp::Ordering::Equal)
1918    });
1919
1920    let mut anchors = Vec::new();
1921    let mut idx = 0usize;
1922    while idx < band_words.len() {
1923        let token = band_words[idx]
1924            .text
1925            .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1926        if !looks_like_year_token(token) {
1927            idx += 1;
1928            continue;
1929        }
1930
1931        let mut bbox = band_words[idx].bbox.clone();
1932        let mut label = token.to_string();
1933        if let Some(next) = band_words.get(idx + 1) {
1934            let suffix = next
1935                .text
1936                .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1937            let gap = next.bbox.left_x - band_words[idx].bbox.right_x;
1938            if suffix.starts_with('(') && suffix.ends_with(')') && gap <= 18.0 {
1939                label.push(' ');
1940                label.push_str(suffix);
1941                bbox = bbox.union(&next.bbox);
1942                idx += 1;
1943            }
1944        }
1945
1946        anchors.push(LayoutTextFragment { bbox, text: label });
1947        idx += 1;
1948    }
1949
1950    anchors
1951}
1952
1953#[cfg(not(target_arch = "wasm32"))]
1954fn map_series_values_to_label_anchors(
1955    anchors: &[LayoutTextFragment],
1956    lines: &[BBoxLayoutLine],
1957) -> Vec<String> {
1958    if anchors.len() < 2 {
1959        return Vec::new();
1960    }
1961
1962    let mut spacing = anchors
1963        .windows(2)
1964        .map(|pair| pair[1].bbox.center_x() - pair[0].bbox.center_x())
1965        .filter(|gap| *gap > 0.0)
1966        .collect::<Vec<_>>();
1967    spacing.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
1968    let median_spacing = spacing
1969        .get(spacing.len().saturating_sub(1) / 2)
1970        .copied()
1971        .unwrap_or(48.0);
1972    let max_dx = (median_spacing * 0.42).clamp(18.0, 32.0);
1973
1974    let mut tokens = Vec::<LayoutBarToken>::new();
1975    for line in lines {
1976        for word in &line.words {
1977            let raw = word.text.trim();
1978            if raw.contains('/')
1979                || looks_like_year_token(raw.trim_matches(|ch: char| matches!(ch, ',' | ';' | '.')))
1980            {
1981                continue;
1982            }
1983            let Some(value) = parse_integer_token(raw) else {
1984                continue;
1985            };
1986            tokens.push(LayoutBarToken {
1987                bbox: word.bbox.clone(),
1988                value,
1989                text: sanitize_numberish_token(raw).unwrap_or_else(|| value.to_string()),
1990            });
1991        }
1992    }
1993
1994    let mut used = vec![false; tokens.len()];
1995    let mut values = Vec::with_capacity(anchors.len());
1996    for anchor in anchors {
1997        let anchor_center_x = anchor.bbox.center_x();
1998        let anchor_center_y = anchor.bbox.center_y();
1999        let best = tokens
2000            .iter()
2001            .enumerate()
2002            .filter(|(idx, token)| {
2003                !used[*idx]
2004                    && token.bbox.center_y() > anchor_center_y + 8.0
2005                    && (token.bbox.center_x() - anchor_center_x).abs() <= max_dx
2006            })
2007            .min_by(|left, right| {
2008                let left_score = (left.1.bbox.center_x() - anchor_center_x).abs()
2009                    + (left.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2010                let right_score = (right.1.bbox.center_x() - anchor_center_x).abs()
2011                    + (right.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2012                left_score
2013                    .partial_cmp(&right_score)
2014                    .unwrap_or(std::cmp::Ordering::Equal)
2015            });
2016        let Some((best_idx, token)) = best else {
2017            return Vec::new();
2018        };
2019        used[best_idx] = true;
2020        values.push(token.text.clone());
2021    }
2022
2023    values
2024}
2025
2026#[cfg(not(target_arch = "wasm32"))]
2027fn detect_layout_recommendation_infographic(
2028    page_width: f64,
2029    lines: &[BBoxLayoutLine],
2030) -> Option<LayoutRecommendationInfographic> {
2031    if page_width < 900.0 {
2032        return None;
2033    }
2034
2035    let blocks = collect_bbox_layout_blocks(lines);
2036    let page_top = lines
2037        .iter()
2038        .map(|line| line.bbox.top_y)
2039        .fold(0.0_f64, f64::max);
2040
2041    let title_block = blocks
2042        .iter()
2043        .filter(|block| {
2044            block.bbox.width() >= page_width * 0.55
2045                && block.bbox.top_y >= page_top - 105.0
2046                && bbox_layout_block_text(block).split_whitespace().count() >= 8
2047        })
2048        .max_by(|left, right| {
2049            left.bbox
2050                .width()
2051                .partial_cmp(&right.bbox.width())
2052                .unwrap_or(std::cmp::Ordering::Equal)
2053        })?;
2054    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2055    if title.split_whitespace().count() < 8 {
2056        return None;
2057    }
2058
2059    let eyebrow = blocks
2060        .iter()
2061        .filter(|block| {
2062            block.block_id != title_block.block_id
2063                && block.bbox.top_y > title_block.bbox.top_y
2064                && block.bbox.width() >= page_width * 0.1
2065        })
2066        .max_by(|left, right| {
2067            left.bbox
2068                .top_y
2069                .partial_cmp(&right.bbox.top_y)
2070                .unwrap_or(std::cmp::Ordering::Equal)
2071        })
2072        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2073
2074    let title_bottom = title_block.bbox.bottom_y;
2075    let region_width = page_width / 3.0;
2076    let left_panel = detect_layout_recommendation_hit_ratio_panel(
2077        &blocks,
2078        lines,
2079        0.0,
2080        region_width,
2081        title_bottom,
2082    )?;
2083    let middle_panel = detect_layout_recommendation_ranking_panel(
2084        &blocks,
2085        lines,
2086        region_width,
2087        region_width * 2.0,
2088        title_bottom,
2089    )?;
2090    let right_panel = detect_layout_recommendation_accuracy_panel(
2091        &blocks,
2092        lines,
2093        region_width * 2.0,
2094        page_width,
2095        title_bottom,
2096    )?;
2097
2098    Some(LayoutRecommendationInfographic {
2099        eyebrow,
2100        title,
2101        panels: vec![left_panel, middle_panel, right_panel],
2102    })
2103}
2104
2105#[cfg(not(target_arch = "wasm32"))]
2106#[allow(dead_code)]
2107fn render_layout_ocr_benchmark_dashboard_document(doc: &PdfDocument) -> Option<String> {
2108    let mut layout_cache = LayoutSourceCache::default();
2109    render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
2110}
2111
2112#[cfg(not(target_arch = "wasm32"))]
2113fn render_layout_ocr_benchmark_dashboard_document_cached(
2114    doc: &PdfDocument,
2115    layout_cache: &mut LayoutSourceCache,
2116) -> Option<String> {
2117    if doc.number_of_pages != 1 {
2118        return None;
2119    }
2120
2121    let layout = layout_cache.bbox_layout(doc)?;
2122    let dashboard = detect_layout_ocr_benchmark_dashboard(layout.page_width, &layout.lines)?;
2123
2124    let mut output = String::new();
2125    if let Some(eyebrow) = dashboard.eyebrow.as_deref() {
2126        output.push_str("## ");
2127        output.push_str(eyebrow.trim());
2128        output.push_str("\n\n");
2129    }
2130    output.push_str("# ");
2131    output.push_str(dashboard.title.trim());
2132    output.push_str("\n\n");
2133
2134    output.push_str("## ");
2135    output.push_str(dashboard.left_heading.trim());
2136    output.push_str("\n\n");
2137    let mut left_table = Vec::with_capacity(dashboard.left_rows.len() + 1);
2138    left_table.push({
2139        let mut row = vec!["Company".to_string()];
2140        row.extend(dashboard.left_columns.clone());
2141        row
2142    });
2143    left_table.extend(dashboard.left_rows.clone());
2144    output.push_str(&render_pipe_rows(&left_table));
2145
2146    output.push_str("## ");
2147    output.push_str(dashboard.right_heading.trim());
2148    output.push_str("\n\n");
2149    let mut right_table = Vec::with_capacity(dashboard.right_rows.len() + 1);
2150    right_table.push(vec![
2151        "Metric".to_string(),
2152        "Company A".to_string(),
2153        "Company B".to_string(),
2154        "upstage".to_string(),
2155    ]);
2156    right_table.extend(dashboard.right_rows.clone());
2157    output.push_str(&render_pipe_rows(&right_table));
2158
2159    if !dashboard.definition_notes.is_empty() {
2160        output.push_str("---\n\n");
2161        for note in &dashboard.definition_notes {
2162            output.push_str(note.trim());
2163            output.push_str("\n\n");
2164        }
2165    }
2166    if !dashboard.source_notes.is_empty() {
2167        output.push_str("---\n\n");
2168        for note in &dashboard.source_notes {
2169            output.push_str(note.trim());
2170            output.push_str("\n\n");
2171        }
2172    }
2173
2174    Some(output.trim_end().to_string() + "\n")
2175}
2176
2177#[cfg(not(target_arch = "wasm32"))]
2178fn detect_layout_ocr_benchmark_dashboard(
2179    page_width: f64,
2180    lines: &[BBoxLayoutLine],
2181) -> Option<LayoutOcrDashboard> {
2182    if page_width < 680.0 {
2183        return None;
2184    }
2185
2186    let page_mid = page_width / 2.0;
2187    let blocks = collect_bbox_layout_blocks(lines);
2188    let page_top = lines
2189        .iter()
2190        .map(|line| line.bbox.top_y)
2191        .fold(0.0_f64, f64::max);
2192
2193    let title_block = blocks
2194        .iter()
2195        .filter(|block| {
2196            block.bbox.width() >= page_width * 0.45 && block.bbox.top_y >= page_top - 40.0
2197        })
2198        .max_by(|left, right| {
2199            left.bbox
2200                .width()
2201                .partial_cmp(&right.bbox.width())
2202                .unwrap_or(std::cmp::Ordering::Equal)
2203        })?;
2204    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2205    if title.split_whitespace().count() < 5 {
2206        return None;
2207    }
2208
2209    let eyebrow = blocks
2210        .iter()
2211        .filter(|block| {
2212            block.block_id != title_block.block_id
2213                && block.bbox.top_y > title_block.bbox.top_y
2214                && block.bbox.width() >= page_width * 0.12
2215        })
2216        .max_by(|left, right| {
2217            left.bbox
2218                .top_y
2219                .partial_cmp(&right.bbox.top_y)
2220                .unwrap_or(std::cmp::Ordering::Equal)
2221        })
2222        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2223
2224    let left_title_blocks = blocks
2225        .iter()
2226        .filter(|block| {
2227            block.bbox.right_x <= page_mid
2228                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
2229                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
2230                && !bbox_layout_block_text(block)
2231                    .chars()
2232                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
2233        })
2234        .cloned()
2235        .collect::<Vec<_>>();
2236    let right_title_blocks = blocks
2237        .iter()
2238        .filter(|block| {
2239            block.bbox.left_x >= page_mid
2240                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
2241                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
2242                && !bbox_layout_block_text(block)
2243                    .chars()
2244                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
2245        })
2246        .cloned()
2247        .collect::<Vec<_>>();
2248
2249    let left_heading = join_dashboard_title_blocks(&left_title_blocks)?;
2250    let right_heading = join_dashboard_title_blocks(&right_title_blocks)?;
2251    if !left_heading.to_ascii_lowercase().contains("ocr")
2252        || !right_heading.to_ascii_lowercase().contains("document")
2253    {
2254        return None;
2255    }
2256
2257    let left_group_blocks = blocks
2258        .iter()
2259        .filter(|block| {
2260            block.bbox.center_x() < page_mid
2261                && block.bbox.top_y < 90.0
2262                && bbox_layout_block_text(block).contains('(')
2263        })
2264        .cloned()
2265        .collect::<Vec<_>>();
2266    if left_group_blocks.len() != 2 {
2267        return None;
2268    }
2269    let mut left_groups = left_group_blocks
2270        .iter()
2271        .map(|block| {
2272            (
2273                block.bbox.center_x(),
2274                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
2275            )
2276        })
2277        .collect::<Vec<_>>();
2278    left_groups.sort_by(|left, right| {
2279        left.0
2280            .partial_cmp(&right.0)
2281            .unwrap_or(std::cmp::Ordering::Equal)
2282    });
2283
2284    let left_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
2285        bbox.center_x() < page_mid - 20.0 && bbox.top_y > 110.0 && bbox.top_y < 250.0
2286    });
2287    if left_value_tokens.len() < 6 {
2288        return None;
2289    }
2290
2291    let mut left_group_values = vec![Vec::<(f64, String)>::new(), Vec::new()];
2292    for (bbox, value) in left_value_tokens {
2293        let group_idx = if (bbox.center_x() - left_groups[0].0).abs()
2294            <= (bbox.center_x() - left_groups[1].0).abs()
2295        {
2296            0
2297        } else {
2298            1
2299        };
2300        left_group_values[group_idx].push((bbox.center_x(), value));
2301    }
2302    if left_group_values.iter().any(|values| values.len() < 3) {
2303        return None;
2304    }
2305    for values in &mut left_group_values {
2306        values.sort_by(|left, right| {
2307            left.0
2308                .partial_cmp(&right.0)
2309                .unwrap_or(std::cmp::Ordering::Equal)
2310        });
2311        values.truncate(3);
2312    }
2313
2314    let mut company_labels = extract_dashboard_company_labels(&blocks, page_mid);
2315    if company_labels.len() < 2 {
2316        return None;
2317    }
2318    company_labels.truncate(2);
2319    company_labels.push(infer_dashboard_brand_name(&left_heading));
2320
2321    let mut left_rows = Vec::new();
2322    for row_idx in 0..3 {
2323        left_rows.push(vec![
2324            company_labels[row_idx].clone(),
2325            left_group_values[0][row_idx].1.clone(),
2326            left_group_values[1][row_idx].1.clone(),
2327        ]);
2328    }
2329
2330    let metric_blocks = blocks
2331        .iter()
2332        .filter(|block| {
2333            block.bbox.center_x() > page_mid
2334                && block.bbox.top_y > 95.0
2335                && block.bbox.top_y < 240.0
2336                && matches!(
2337                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
2338                    text if text.starts_with("ocr") || text.starts_with("parsingf1")
2339                )
2340        })
2341        .cloned()
2342        .collect::<Vec<_>>();
2343    if metric_blocks.len() < 4 {
2344        return None;
2345    }
2346
2347    let mut metrics = metric_blocks
2348        .iter()
2349        .map(|block| {
2350            (
2351                block.bbox.center_y(),
2352                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
2353            )
2354        })
2355        .collect::<Vec<_>>();
2356    metrics.sort_by(|left, right| {
2357        right
2358            .0
2359            .partial_cmp(&left.0)
2360            .unwrap_or(std::cmp::Ordering::Equal)
2361    });
2362    metrics.truncate(4);
2363
2364    let right_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
2365        bbox.center_x() > page_mid + 20.0 && bbox.top_y > 90.0 && bbox.top_y < 250.0
2366    });
2367    if right_value_tokens.len() < 10 {
2368        return None;
2369    }
2370
2371    let mut metric_values = vec![Vec::<(f64, String)>::new(); metrics.len()];
2372    for (bbox, value) in right_value_tokens {
2373        let Some((metric_idx, _)) = metrics
2374            .iter()
2375            .enumerate()
2376            .map(|(idx, (center_y, _))| (idx, (bbox.center_y() - *center_y).abs()))
2377            .min_by(|left, right| {
2378                left.1
2379                    .partial_cmp(&right.1)
2380                    .unwrap_or(std::cmp::Ordering::Equal)
2381            })
2382        else {
2383            continue;
2384        };
2385        metric_values[metric_idx].push((bbox.center_x(), value));
2386    }
2387
2388    let mut right_rows = Vec::new();
2389    for (idx, (_, metric_name)) in metrics.iter().enumerate() {
2390        let mut values = metric_values[idx].clone();
2391        values.sort_by(|left, right| {
2392            left.0
2393                .partial_cmp(&right.0)
2394                .unwrap_or(std::cmp::Ordering::Equal)
2395        });
2396        values.dedup_by(|left, right| left.1 == right.1);
2397        if values.len() < 2 {
2398            return None;
2399        }
2400        if values.len() == 2 {
2401            values.push(values[1].clone());
2402        }
2403        values.truncate(3);
2404        right_rows.push(vec![
2405            metric_name.clone(),
2406            normalize_layout_decimal_value(&values[0].1),
2407            normalize_layout_decimal_value(&values[1].1),
2408            normalize_layout_decimal_value(&values[2].1),
2409        ]);
2410    }
2411
2412    let definition_notes = collect_dashboard_notes(&blocks, page_mid, false);
2413    let source_notes = collect_dashboard_notes(&blocks, page_mid, true);
2414
2415    Some(LayoutOcrDashboard {
2416        eyebrow,
2417        title,
2418        left_heading,
2419        left_columns: left_groups.into_iter().map(|(_, text)| text).collect(),
2420        left_rows,
2421        right_heading,
2422        right_rows,
2423        definition_notes,
2424        source_notes,
2425    })
2426}
2427
2428#[cfg(not(target_arch = "wasm32"))]
2429fn detect_layout_recommendation_hit_ratio_panel(
2430    blocks: &[BBoxLayoutBlock],
2431    lines: &[BBoxLayoutLine],
2432    left_x: f64,
2433    right_x: f64,
2434    title_bottom: f64,
2435) -> Option<LayoutRecommendationPanel> {
2436    let (heading_block, subtitle_block) =
2437        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2438    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2439    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2440    let width = right_x - left_x;
2441    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2442
2443    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2444        bbox.center_x() > left_x + width * 0.52
2445            && bbox.center_x() < right_x - 8.0
2446            && bbox.top_y < chart_cutoff
2447    });
2448    values.sort_by(|left, right| {
2449        right
2450            .0
2451            .center_y()
2452            .partial_cmp(&left.0.center_y())
2453            .unwrap_or(std::cmp::Ordering::Equal)
2454    });
2455    values.dedup_by(|left, right| {
2456        (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2457    });
2458    if values.len() < 4 {
2459        return None;
2460    }
2461
2462    let labels = collect_layout_panel_alpha_blocks(
2463        blocks,
2464        left_x,
2465        right_x,
2466        title_bottom,
2467        chart_cutoff,
2468        Some(left_x + width * 0.55),
2469    );
2470    let rows = pair_layout_decimal_rows(&labels, &values, 4)?;
2471    let notes = pair_layout_emphasis_notes(
2472        &rows,
2473        &collect_layout_emphasis_tokens(lines, |bbox| {
2474            bbox.center_x() > left_x + width * 0.48
2475                && bbox.center_x() < right_x
2476                && bbox.top_y < chart_cutoff
2477        }),
2478        "increase",
2479    );
2480    let metric_label =
2481        extract_layout_comparison_metric(&subtitle).unwrap_or_else(|| "Value".to_string());
2482
2483    Some(LayoutRecommendationPanel {
2484        heading,
2485        subtitle,
2486        header: vec!["Model".to_string(), metric_label],
2487        rows,
2488        notes,
2489    })
2490}
2491
2492#[cfg(not(target_arch = "wasm32"))]
2493fn detect_layout_recommendation_ranking_panel(
2494    blocks: &[BBoxLayoutBlock],
2495    lines: &[BBoxLayoutLine],
2496    left_x: f64,
2497    right_x: f64,
2498    title_bottom: f64,
2499) -> Option<LayoutRecommendationPanel> {
2500    let (heading_block, subtitle_block) =
2501        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2502    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2503    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2504    let width = right_x - left_x;
2505    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2506
2507    let row_labels = collect_layout_panel_alpha_blocks(
2508        blocks,
2509        left_x,
2510        right_x,
2511        title_bottom,
2512        chart_cutoff,
2513        Some(left_x + width * 0.48),
2514    )
2515    .into_iter()
2516    .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(&block)))
2517    .collect::<Vec<_>>();
2518    if row_labels.len() < 8 {
2519        return None;
2520    }
2521
2522    let headers = extract_layout_ranking_headers(blocks, left_x, right_x, chart_cutoff)
2523        .unwrap_or_else(|| vec!["Recall@10".to_string(), "Accuracy".to_string()]);
2524    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2525        bbox.center_x() > left_x + width * 0.42
2526            && bbox.center_x() < right_x - 10.0
2527            && bbox.top_y < chart_cutoff
2528    });
2529    values.sort_by(|left, right| {
2530        left.0
2531            .left_x
2532            .partial_cmp(&right.0.left_x)
2533            .unwrap_or(std::cmp::Ordering::Equal)
2534    });
2535
2536    let mut rows = row_labels
2537        .into_iter()
2538        .map(|label| vec![label, String::new(), String::new()])
2539        .collect::<Vec<_>>();
2540    if let Some(first) = rows.first_mut() {
2541        if let Some((_, value)) = values.first() {
2542            first[1] = normalize_layout_decimal_value(value);
2543        }
2544        if let Some((_, value)) = values.get(1) {
2545            first[2] = normalize_layout_decimal_value(value);
2546        }
2547    }
2548
2549    let mut notes = collect_layout_ranking_notes(blocks, left_x, right_x, chart_cutoff);
2550    notes.extend(
2551        collect_layout_emphasis_tokens(lines, |bbox| {
2552            bbox.center_x() > left_x + width * 0.55
2553                && bbox.center_x() < right_x
2554                && bbox.top_y < chart_cutoff
2555        })
2556        .into_iter()
2557        .map(|(_, token)| format!("{} increase", token.trim_end_matches('↑'))),
2558    );
2559
2560    Some(LayoutRecommendationPanel {
2561        heading,
2562        subtitle,
2563        header: vec!["Method".to_string(), headers[0].clone(), headers[1].clone()],
2564        rows,
2565        notes,
2566    })
2567}
2568
2569#[cfg(not(target_arch = "wasm32"))]
2570fn detect_layout_recommendation_accuracy_panel(
2571    blocks: &[BBoxLayoutBlock],
2572    lines: &[BBoxLayoutLine],
2573    left_x: f64,
2574    right_x: f64,
2575    title_bottom: f64,
2576) -> Option<LayoutRecommendationPanel> {
2577    let (heading_block, subtitle_block) =
2578        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2579    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2580    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2581    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2582
2583    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2584        bbox.center_x() > left_x + 20.0 && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2585    });
2586    values.sort_by(|left, right| {
2587        right
2588            .0
2589            .center_y()
2590            .partial_cmp(&left.0.center_y())
2591            .unwrap_or(std::cmp::Ordering::Equal)
2592    });
2593    values.dedup_by(|left, right| {
2594        (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2595    });
2596    if values.len() < 2 {
2597        return None;
2598    }
2599    let min_value_top_y = values
2600        .iter()
2601        .map(|(bbox, _)| bbox.top_y)
2602        .fold(f64::INFINITY, f64::min);
2603
2604    let labels = collect_layout_panel_alpha_blocks(
2605        blocks,
2606        left_x,
2607        right_x,
2608        title_bottom,
2609        chart_cutoff,
2610        None,
2611    )
2612    .into_iter()
2613    .filter(|block| block.bbox.top_y < min_value_top_y - 70.0)
2614    .collect::<Vec<_>>();
2615    let rows = pair_layout_decimal_rows(&labels, &values, 2)?;
2616
2617    let mut notes = Vec::new();
2618    if let Some(description) = collect_layout_note_phrase(blocks, left_x, right_x, chart_cutoff) {
2619        if let Some((_, emphasis)) = collect_layout_emphasis_tokens(lines, |bbox| {
2620            bbox.center_x() > left_x && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2621        })
2622        .into_iter()
2623        .next()
2624        {
2625            notes.push(format!(
2626                "{}, {} increase",
2627                description,
2628                emphasis.trim_end_matches('↑')
2629            ));
2630        }
2631    }
2632
2633    Some(LayoutRecommendationPanel {
2634        heading,
2635        subtitle,
2636        header: vec!["Model".to_string(), "Accuracy".to_string()],
2637        rows,
2638        notes,
2639    })
2640}
2641
2642#[cfg(not(target_arch = "wasm32"))]
2643fn extract_layout_panel_heading_and_subtitle(
2644    blocks: &[BBoxLayoutBlock],
2645    left_x: f64,
2646    right_x: f64,
2647    title_bottom: f64,
2648) -> Option<(BBoxLayoutBlock, BBoxLayoutBlock)> {
2649    let mut band_blocks = blocks
2650        .iter()
2651        .filter(|block| {
2652            block.bbox.center_x() >= left_x
2653                && block.bbox.center_x() <= right_x
2654                && block.bbox.top_y < title_bottom - 8.0
2655                && block.bbox.top_y > title_bottom - 90.0
2656                && bbox_layout_block_text(block)
2657                    .chars()
2658                    .any(char::is_alphabetic)
2659        })
2660        .cloned()
2661        .collect::<Vec<_>>();
2662    band_blocks.sort_by(|left, right| {
2663        right
2664            .bbox
2665            .top_y
2666            .partial_cmp(&left.bbox.top_y)
2667            .unwrap_or(std::cmp::Ordering::Equal)
2668    });
2669
2670    let heading = band_blocks.first()?.clone();
2671    let subtitle = band_blocks
2672        .iter()
2673        .find(|block| {
2674            block.block_id != heading.block_id
2675                && block.bbox.top_y < heading.bbox.bottom_y + 8.0
2676                && block.bbox.top_y > heading.bbox.bottom_y - 40.0
2677        })?
2678        .clone();
2679    Some((heading, subtitle))
2680}
2681
2682#[cfg(not(target_arch = "wasm32"))]
2683fn collect_layout_panel_alpha_blocks(
2684    blocks: &[BBoxLayoutBlock],
2685    left_x: f64,
2686    right_x: f64,
2687    title_bottom: f64,
2688    chart_cutoff: f64,
2689    max_left_x: Option<f64>,
2690) -> Vec<BBoxLayoutBlock> {
2691    let mut alpha_blocks = blocks
2692        .iter()
2693        .filter(|block| {
2694            block.bbox.center_x() >= left_x
2695                && block.bbox.center_x() <= right_x
2696                && block.bbox.top_y < chart_cutoff
2697                && block.bbox.top_y > title_bottom - 390.0
2698                && max_left_x.is_none_or(|limit| block.bbox.left_x <= limit)
2699        })
2700        .filter_map(|block| {
2701            let text = normalize_layout_panel_text(&bbox_layout_block_text(block));
2702            let token_count = text.split_whitespace().count();
2703            let has_alpha = text.chars().any(char::is_alphabetic);
2704            let has_numeric_marker = text
2705                .chars()
2706                .any(|ch| ch.is_ascii_digit() || ch == '%' || ch == ':');
2707            (has_alpha
2708                && token_count >= 1
2709                && !has_numeric_marker
2710                && !text.starts_with(':')
2711                && !text.eq_ignore_ascii_case("comparison"))
2712            .then_some(block.clone())
2713        })
2714        .collect::<Vec<_>>();
2715    alpha_blocks.sort_by(|left, right| {
2716        right
2717            .bbox
2718            .center_y()
2719            .partial_cmp(&left.bbox.center_y())
2720            .unwrap_or(std::cmp::Ordering::Equal)
2721    });
2722    alpha_blocks
2723}
2724
2725#[cfg(not(target_arch = "wasm32"))]
2726fn pair_layout_decimal_rows(
2727    label_blocks: &[BBoxLayoutBlock],
2728    value_tokens: &[(BoundingBox, String)],
2729    expected_len: usize,
2730) -> Option<Vec<Vec<String>>> {
2731    let mut used = HashSet::new();
2732    let mut rows = Vec::new();
2733
2734    for (bbox, value) in value_tokens.iter().take(expected_len) {
2735        let Some((label_idx, _)) = label_blocks
2736            .iter()
2737            .enumerate()
2738            .filter(|(idx, block)| {
2739                !used.contains(idx) && block.bbox.center_x() <= bbox.center_x() + 24.0
2740            })
2741            .map(|(idx, block)| (idx, (block.bbox.center_y() - bbox.center_y()).abs()))
2742            .min_by(|left, right| {
2743                left.1
2744                    .partial_cmp(&right.1)
2745                    .unwrap_or(std::cmp::Ordering::Equal)
2746            })
2747        else {
2748            continue;
2749        };
2750        if label_blocks[label_idx].bbox.center_y() - bbox.center_y() > 30.0 {
2751            continue;
2752        }
2753
2754        used.insert(label_idx);
2755        rows.push(vec![
2756            normalize_layout_panel_text(&bbox_layout_block_text(&label_blocks[label_idx])),
2757            normalize_layout_decimal_value(value),
2758        ]);
2759    }
2760
2761    (rows.len() >= expected_len).then_some(rows)
2762}
2763
2764#[cfg(not(target_arch = "wasm32"))]
2765fn collect_layout_emphasis_tokens<F>(
2766    lines: &[BBoxLayoutLine],
2767    bbox_filter: F,
2768) -> Vec<(BoundingBox, String)>
2769where
2770    F: Fn(&BoundingBox) -> bool,
2771{
2772    let emphasis_re = Regex::new(r"^\d+(?:\.\d+)?(?:X|%)↑?$").ok();
2773    let Some(emphasis_re) = emphasis_re else {
2774        return Vec::new();
2775    };
2776
2777    let mut tokens = Vec::new();
2778    for line in lines {
2779        for word in &line.words {
2780            let candidate = word.text.trim();
2781            if bbox_filter(&word.bbox) && emphasis_re.is_match(candidate) {
2782                tokens.push((word.bbox.clone(), candidate.to_string()));
2783            }
2784        }
2785    }
2786    tokens.sort_by(|left, right| {
2787        right
2788            .0
2789            .center_y()
2790            .partial_cmp(&left.0.center_y())
2791            .unwrap_or(std::cmp::Ordering::Equal)
2792    });
2793    tokens
2794}
2795
2796#[cfg(not(target_arch = "wasm32"))]
2797fn pair_layout_emphasis_notes(
2798    rows: &[Vec<String>],
2799    emphasis_tokens: &[(BoundingBox, String)],
2800    suffix: &str,
2801) -> Vec<String> {
2802    let mut notes = Vec::new();
2803    for ((_, token), row) in emphasis_tokens.iter().zip(rows.iter().skip(2)) {
2804        if let Some(label) = row.first() {
2805            notes.push(format!(
2806                "{}: {} {}",
2807                label.trim(),
2808                token.trim_end_matches('↑'),
2809                suffix
2810            ));
2811        }
2812    }
2813    notes
2814}
2815
/// Returns the two whitespace-separated words that precede the first
/// occurrence of "comparison" (ASCII case-insensitive), joined by a space.
///
/// Yields `None` when the keyword is absent or fewer than two words precede it.
#[cfg(not(target_arch = "wasm32"))]
fn extract_layout_comparison_metric(text: &str) -> Option<String> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let keyword_at = words
        .iter()
        .position(|word| word.eq_ignore_ascii_case("comparison"))?;
    // `checked_sub` fails exactly when fewer than two words come before the keyword.
    let start = keyword_at.checked_sub(2)?;
    let metric = words[start..keyword_at].join(" ");
    if metric.trim().is_empty() {
        None
    } else {
        Some(metric)
    }
}
2828
/// Capitalizes the first character of each whitespace-separated word.
///
/// Words are re-joined with single spaces. A word whose ASCII letters are all
/// uppercase (or that has no ASCII letters) is left untouched; otherwise only
/// its first character is ASCII-uppercased and the remainder is copied as is.
/// Blank input yields an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn title_case_metric_label(text: &str) -> String {
    text.split_whitespace()
        .map(|word| {
            let already_cased = word
                .chars()
                .all(|ch| !ch.is_ascii_alphabetic() || ch.is_uppercase());
            if already_cased {
                return word.to_string();
            }
            let mut rest = word.chars();
            match rest.next() {
                Some(first) => {
                    let mut cased = String::with_capacity(word.len());
                    cased.push(first.to_ascii_uppercase());
                    cased.push_str(rest.as_str());
                    cased
                }
                None => String::new(),
            }
        })
        .collect::<Vec<_>>()
        .join(" ")
}
2857
2858#[cfg(not(target_arch = "wasm32"))]
2859fn normalize_layout_panel_text(text: &str) -> String {
2860    normalize_layout_dashboard_text(text)
2861        .replace(" _", "_")
2862        .replace("_ ", "_")
2863}
2864
2865#[cfg(not(target_arch = "wasm32"))]
2866fn extract_layout_ranking_headers(
2867    blocks: &[BBoxLayoutBlock],
2868    left_x: f64,
2869    right_x: f64,
2870    chart_cutoff: f64,
2871) -> Option<Vec<String>> {
2872    let legend = blocks
2873        .iter()
2874        .filter(|block| {
2875            block.bbox.center_x() >= left_x
2876                && block.bbox.center_x() <= right_x
2877                && block.bbox.top_y < chart_cutoff
2878                && bbox_layout_block_text(block).contains(':')
2879        })
2880        .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2881        .collect::<Vec<_>>();
2882    for line in legend {
2883        let segments = line
2884            .split(':')
2885            .map(str::trim)
2886            .filter(|segment| !segment.is_empty())
2887            .collect::<Vec<_>>();
2888        let Some(first_segment) = segments.first() else {
2889            continue;
2890        };
2891        let metrics = first_segment
2892            .split(',')
2893            .map(title_case_metric_label)
2894            .filter(|part| !part.trim().is_empty())
2895            .collect::<Vec<_>>();
2896        if metrics.len() >= 2 {
2897            return Some(vec![metrics[0].clone(), metrics[1].clone()]);
2898        }
2899    }
2900    None
2901}
2902
2903#[cfg(not(target_arch = "wasm32"))]
2904fn collect_layout_ranking_notes(
2905    blocks: &[BBoxLayoutBlock],
2906    left_x: f64,
2907    right_x: f64,
2908    chart_cutoff: f64,
2909) -> Vec<String> {
2910    blocks
2911        .iter()
2912        .filter(|block| {
2913            block.bbox.center_x() >= left_x
2914                && block.bbox.center_x() <= right_x
2915                && block.bbox.top_y < chart_cutoff
2916                && bbox_layout_block_text(block).contains(':')
2917        })
2918        .flat_map(|block| {
2919            normalize_layout_panel_text(&bbox_layout_block_text(block))
2920                .split(':')
2921                .map(str::trim)
2922                .filter(|segment| !segment.is_empty())
2923                .map(ToString::to_string)
2924                .collect::<Vec<_>>()
2925        })
2926        .filter(|note| !note.eq_ignore_ascii_case("recall@10, accuracy"))
2927        .collect()
2928}
2929
2930#[cfg(not(target_arch = "wasm32"))]
2931fn collect_layout_note_phrase(
2932    blocks: &[BBoxLayoutBlock],
2933    left_x: f64,
2934    right_x: f64,
2935    chart_cutoff: f64,
2936) -> Option<String> {
2937    blocks
2938        .iter()
2939        .filter(|block| {
2940            block.bbox.center_x() >= left_x
2941                && block.bbox.center_x() <= right_x
2942                && block.bbox.top_y < chart_cutoff
2943                && bbox_layout_block_text(block).split_whitespace().count() >= 3
2944        })
2945        .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2946        .find(|text| text.to_ascii_lowercase().contains("compared"))
2947}
2948
2949#[cfg(not(target_arch = "wasm32"))]
2950fn collect_bbox_layout_blocks(lines: &[BBoxLayoutLine]) -> Vec<BBoxLayoutBlock> {
2951    let mut grouped: HashMap<usize, Vec<BBoxLayoutLine>> = HashMap::new();
2952    for line in lines {
2953        grouped.entry(line.block_id).or_default().push(line.clone());
2954    }
2955
2956    let mut blocks = grouped
2957        .into_iter()
2958        .map(|(block_id, mut lines)| {
2959            lines.sort_by(|left, right| {
2960                cmp_banded_reading_order(&left.bbox, &right.bbox, 3.0)
2961                    .then_with(|| left.block_id.cmp(&right.block_id))
2962            });
2963            let bbox = lines
2964                .iter()
2965                .skip(1)
2966                .fold(lines[0].bbox.clone(), |acc, line| acc.union(&line.bbox));
2967            BBoxLayoutBlock {
2968                block_id,
2969                bbox,
2970                lines,
2971            }
2972        })
2973        .collect::<Vec<_>>();
2974    blocks.sort_by(|left, right| {
2975        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
2976            .then_with(|| left.block_id.cmp(&right.block_id))
2977    });
2978    blocks
2979}
2980
2981#[cfg(not(target_arch = "wasm32"))]
2982fn bbox_layout_block_text(block: &BBoxLayoutBlock) -> String {
2983    join_layout_lines_as_paragraph(&block.lines.iter().collect::<Vec<_>>())
2984}
2985
2986#[cfg(not(target_arch = "wasm32"))]
2987fn join_dashboard_title_blocks(blocks: &[BBoxLayoutBlock]) -> Option<String> {
2988    let mut blocks = blocks.to_vec();
2989    blocks.sort_by(|left, right| {
2990        right
2991            .bbox
2992            .top_y
2993            .partial_cmp(&left.bbox.top_y)
2994            .unwrap_or(std::cmp::Ordering::Equal)
2995    });
2996    let text = blocks
2997        .iter()
2998        .map(bbox_layout_block_text)
2999        .filter(|text| !text.trim().is_empty())
3000        .collect::<Vec<_>>()
3001        .join(" ");
3002    let normalized = normalize_layout_dashboard_text(&text);
3003    (!normalized.trim().is_empty()).then_some(normalized)
3004}
3005
3006#[cfg(not(target_arch = "wasm32"))]
3007fn collect_layout_decimal_tokens<F>(
3008    lines: &[BBoxLayoutLine],
3009    bbox_filter: F,
3010) -> Vec<(BoundingBox, String)>
3011where
3012    F: Fn(&BoundingBox) -> bool,
3013{
3014    let decimal_re = Regex::new(r"^\d+\.\d+$|^\d+\.$").ok();
3015    let Some(decimal_re) = decimal_re else {
3016        return Vec::new();
3017    };
3018
3019    let mut tokens = Vec::new();
3020    for line in lines {
3021        for word in &line.words {
3022            let candidate = word.text.trim().trim_matches(|ch| ch == ',' || ch == ';');
3023            if !bbox_filter(&word.bbox) || !decimal_re.is_match(candidate) {
3024                continue;
3025            }
3026            tokens.push((word.bbox.clone(), candidate.to_string()));
3027        }
3028    }
3029    tokens
3030}
3031
3032#[cfg(not(target_arch = "wasm32"))]
3033fn extract_dashboard_company_labels(blocks: &[BBoxLayoutBlock], page_mid: f64) -> Vec<String> {
3034    let company_blocks = blocks
3035        .iter()
3036        .filter(|block| {
3037            block.bbox.center_x() < page_mid
3038                && (65.0..110.0).contains(&block.bbox.top_y)
3039                && bbox_layout_block_text(block) == "Company"
3040        })
3041        .collect::<Vec<_>>();
3042    let marker_blocks = blocks
3043        .iter()
3044        .filter(|block| {
3045            block.bbox.center_x() < page_mid
3046                && (60.0..105.0).contains(&block.bbox.top_y)
3047                && matches!(
3048                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
3049                    "a2" | "b2"
3050                )
3051        })
3052        .map(|block| {
3053            (
3054                block.bbox.center_x(),
3055                block.bbox.center_y(),
3056                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3057            )
3058        })
3059        .collect::<Vec<_>>();
3060
3061    let mut labels = Vec::new();
3062    for company in company_blocks {
3063        if let Some((_, marker_y, marker)) = marker_blocks.iter().min_by(|left, right| {
3064            let left_distance = ((left.0 - company.bbox.center_x()).powi(2)
3065                + (left.1 - company.bbox.center_y()).powi(2))
3066            .sqrt();
3067            let right_distance = ((right.0 - company.bbox.center_x()).powi(2)
3068                + (right.1 - company.bbox.center_y()).powi(2))
3069            .sqrt();
3070            left_distance
3071                .partial_cmp(&right_distance)
3072                .unwrap_or(std::cmp::Ordering::Equal)
3073        }) {
3074            if (company.bbox.center_y() - *marker_y).abs() <= 16.0 || marker_blocks.len() == 1 {
3075                labels.push(format!("{} {}", bbox_layout_block_text(company), marker));
3076            }
3077        }
3078    }
3079
3080    if labels.len() < 2 {
3081        labels.extend(
3082            marker_blocks
3083                .iter()
3084                .map(|(_, _, marker)| format!("Company {marker}")),
3085        );
3086    }
3087
3088    labels.sort();
3089    labels.dedup();
3090    labels
3091}
3092
/// Derives a lowercase brand name from the first word of `text`.
///
/// Leading and trailing non-alphanumeric characters are stripped from the
/// first whitespace-separated word and the rest is ASCII-lowercased. Falls
/// back to `"model"` when there is no word or nothing remains after stripping.
#[cfg(not(target_arch = "wasm32"))]
fn infer_dashboard_brand_name(text: &str) -> String {
    let first_word = text.split_whitespace().next().unwrap_or("");
    let core = first_word.trim_matches(|ch: char| !ch.is_alphanumeric());
    if core.is_empty() {
        "model".to_string()
    } else {
        core.to_ascii_lowercase()
    }
}
3102
3103#[cfg(not(target_arch = "wasm32"))]
3104fn collect_dashboard_notes(
3105    blocks: &[BBoxLayoutBlock],
3106    page_mid: f64,
3107    left_half: bool,
3108) -> Vec<String> {
3109    let notes = blocks
3110        .iter()
3111        .filter(|block| {
3112            let in_half = if left_half {
3113                block.bbox.center_x() < page_mid
3114            } else {
3115                block.bbox.center_x() > page_mid
3116            };
3117            in_half && block.bbox.top_y < 50.0
3118        })
3119        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3120        .filter(|text| !text.trim().is_empty())
3121        .collect::<Vec<_>>();
3122
3123    let mut merged = Vec::new();
3124    for note in notes {
3125        if note
3126            .chars()
3127            .next()
3128            .is_some_and(|ch| matches!(ch, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹'))
3129        {
3130            merged.push(note);
3131        } else if let Some(previous) = merged.last_mut() {
3132            append_cell_text(previous, &note);
3133        } else {
3134            merged.push(note);
3135        }
3136    }
3137    merged
3138}
3139
3140#[cfg(not(target_arch = "wasm32"))]
3141fn normalize_layout_dashboard_text(text: &str) -> String {
3142    let normalized = normalize_common_ocr_text(text.trim());
3143    let degree_marker_re = Regex::new(r"(\d)[°º]").ok();
3144    let split_suffix_re = Regex::new(r"\b([A-Za-z])(\d)\s+(\d)\b").ok();
3145    let single_letter_marker_re = Regex::new(r"\b([A-Za-z])\s+(\d{1,2})\b").ok();
3146    let trailing_block_marker_re = Regex::new(r"([A-Za-z][A-Za-z0-9\-]*)\s+(\d{1,2})$").ok();
3147    let trailing_marker_re = Regex::new(r"([[:alpha:]\)])(\d{1,2})\b").ok();
3148    let leading_marker_re = Regex::new(r"^(\d{1,2})([.)]?)\s+").ok();
3149
3150    let cleaned_degree = degree_marker_re
3151        .as_ref()
3152        .map(|re| {
3153            re.replace_all(&normalized, |captures: &regex::Captures<'_>| {
3154                format!("{} ", &captures[1])
3155            })
3156            .to_string()
3157        })
3158        .unwrap_or(normalized);
3159
3160    let collapsed_suffix = split_suffix_re
3161        .as_ref()
3162        .map(|re| {
3163            re.replace_all(&cleaned_degree, |captures: &regex::Captures<'_>| {
3164                format!("{}{}{}", &captures[1], &captures[2], &captures[3])
3165            })
3166            .to_string()
3167        })
3168        .unwrap_or(cleaned_degree);
3169
3170    let collapsed_spacing = single_letter_marker_re
3171        .as_ref()
3172        .map(|re| {
3173            re.replace_all(&collapsed_suffix, |captures: &regex::Captures<'_>| {
3174                format!("{}{}", &captures[1], &captures[2])
3175            })
3176            .to_string()
3177        })
3178        .unwrap_or(collapsed_suffix);
3179
3180    let collapsed_terminal_marker = trailing_block_marker_re
3181        .as_ref()
3182        .map(|re| {
3183            re.replace(&collapsed_spacing, |captures: &regex::Captures<'_>| {
3184                format!("{}{}", &captures[1], &captures[2])
3185            })
3186            .to_string()
3187        })
3188        .unwrap_or(collapsed_spacing);
3189
3190    let with_inline = trailing_marker_re
3191        .as_ref()
3192        .map(|re| {
3193            re.replace_all(
3194                &collapsed_terminal_marker,
3195                |captures: &regex::Captures<'_>| {
3196                    format!("{}{}", &captures[1], superscript_digits(&captures[2]))
3197                },
3198            )
3199            .to_string()
3200        })
3201        .unwrap_or(collapsed_terminal_marker);
3202
3203    leading_marker_re
3204        .as_ref()
3205        .map(|re| {
3206            re.replace(&with_inline, |captures: &regex::Captures<'_>| {
3207                format!("{} ", superscript_digits(&captures[1]))
3208            })
3209            .to_string()
3210        })
3211        .unwrap_or(with_inline)
3212}
3213
/// Returns `value` with every trailing `.` removed.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_decimal_value(value: &str) -> String {
    let without_dangling_dots = value.trim_end_matches('.');
    String::from(without_dangling_dots)
}
3218
/// Replaces every ASCII digit in `text` with its Unicode superscript
/// counterpart; all other characters are copied unchanged.
#[cfg(not(target_arch = "wasm32"))]
fn superscript_digits(text: &str) -> String {
    // Indexed by digit value.
    const SUPERSCRIPTS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];

    let mut converted = String::with_capacity(text.len());
    for ch in text.chars() {
        // `to_digit(10)` only recognises the ASCII digits '0'..='9'.
        match ch.to_digit(10) {
            Some(digit) => converted.push(SUPERSCRIPTS[digit as usize]),
            None => converted.push(ch),
        }
    }
    converted
}
3237
3238#[cfg(not(target_arch = "wasm32"))]
3239fn collect_layout_figure_captions(blocks: &[BBoxLayoutBlock]) -> Vec<BBoxLayoutBlock> {
3240    let mut captions = blocks
3241        .iter()
3242        .filter(|block| {
3243            let text = bbox_layout_block_text(block);
3244            text.starts_with("Figure ")
3245                && text.contains(':')
3246                && text.split_whitespace().count() >= 8
3247        })
3248        .cloned()
3249        .collect::<Vec<_>>();
3250    captions.sort_by(|left, right| {
3251        right
3252            .bbox
3253            .top_y
3254            .partial_cmp(&left.bbox.top_y)
3255            .unwrap_or(std::cmp::Ordering::Equal)
3256    });
3257    captions
3258}
3259
3260#[cfg(not(target_arch = "wasm32"))]
3261fn collect_layout_integer_tokens<F>(lines: &[BBoxLayoutLine], bbox_filter: F) -> Vec<LayoutBarToken>
3262where
3263    F: Fn(&BoundingBox) -> bool,
3264{
3265    let integer_re = Regex::new(r"^\d+$").ok();
3266    let Some(integer_re) = integer_re else {
3267        return Vec::new();
3268    };
3269
3270    let mut tokens = Vec::new();
3271    for line in lines {
3272        for word in &line.words {
3273            let candidate = word.text.trim();
3274            if !bbox_filter(&word.bbox) || !integer_re.is_match(candidate) {
3275                continue;
3276            }
3277            let Ok(value) = candidate.parse::<i64>() else {
3278                continue;
3279            };
3280            tokens.push(LayoutBarToken {
3281                bbox: word.bbox.clone(),
3282                value,
3283                text: candidate.to_string(),
3284            });
3285        }
3286    }
3287    tokens
3288}
3289
3290#[cfg(not(target_arch = "wasm32"))]
3291fn detect_layout_three_month_stacked_figure(
3292    blocks: &[BBoxLayoutBlock],
3293    lines: &[BBoxLayoutLine],
3294    page_width: f64,
3295    caption_block: BBoxLayoutBlock,
3296    next_caption_top_y: f64,
3297) -> Option<LayoutStackedBarFigure> {
3298    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3299    let month_blocks = collect_layout_month_blocks(
3300        blocks,
3301        caption_block.bbox.bottom_y - 150.0,
3302        caption_block.bbox.bottom_y - 230.0,
3303        None,
3304    );
3305    if month_blocks.len() != 3 {
3306        return None;
3307    }
3308    let legend_blocks = collect_layout_legend_blocks(
3309        blocks,
3310        caption_block.bbox.bottom_y - 175.0,
3311        caption_block.bbox.bottom_y - 220.0,
3312    );
3313    if legend_blocks.len() != 3 {
3314        return None;
3315    }
3316
3317    let month_centers = month_blocks
3318        .iter()
3319        .map(|block| {
3320            (
3321                block.bbox.center_x(),
3322                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3323            )
3324        })
3325        .collect::<Vec<_>>();
3326    let month_top_y = month_blocks
3327        .iter()
3328        .map(|block| block.bbox.top_y)
3329        .fold(0.0_f64, f64::max);
3330    let first_center = month_centers.first()?.0;
3331    let last_center = month_centers.last()?.0;
3332    let tokens = collect_layout_integer_tokens(lines, |bbox| {
3333        bbox.center_x() >= first_center - 20.0
3334            && bbox.center_x() <= last_center + 20.0
3335            && bbox.center_y() > month_top_y + 10.0
3336            && bbox.top_y < caption_block.bbox.bottom_y - 25.0
3337            && bbox.bottom_y > next_caption_top_y + 55.0
3338            && bbox.left_x > page_width * 0.28
3339    });
3340    if tokens.len() < 9 {
3341        return None;
3342    }
3343
3344    let mut grouped = vec![Vec::<LayoutBarToken>::new(), Vec::new(), Vec::new()];
3345    for token in tokens {
3346        let Some((idx, distance)) = month_centers
3347            .iter()
3348            .enumerate()
3349            .map(|(idx, (center_x, _))| (idx, (token.bbox.center_x() - *center_x).abs()))
3350            .min_by(|left, right| {
3351                left.1
3352                    .partial_cmp(&right.1)
3353                    .unwrap_or(std::cmp::Ordering::Equal)
3354            })
3355        else {
3356            continue;
3357        };
3358        if distance <= 28.0 {
3359            grouped[idx].push(token);
3360        }
3361    }
3362    if grouped.iter().any(|bucket| bucket.len() < 3) {
3363        return None;
3364    }
3365
3366    let mut rows = vec![
3367        vec![legend_blocks[0].1.clone()],
3368        vec![legend_blocks[1].1.clone()],
3369        vec![legend_blocks[2].1.clone()],
3370    ];
3371    for bucket in &mut grouped {
3372        bucket.sort_by(|left, right| {
3373            left.bbox
3374                .center_y()
3375                .partial_cmp(&right.bbox.center_y())
3376                .unwrap_or(std::cmp::Ordering::Equal)
3377        });
3378        bucket.truncate(3);
3379        rows[0].push(bucket[0].value.to_string());
3380        rows[1].push(bucket[1].value.to_string());
3381        rows[2].push(bucket[2].value.to_string());
3382    }
3383
3384    Some(LayoutStackedBarFigure {
3385        caption,
3386        months: month_centers.into_iter().map(|(_, text)| text).collect(),
3387        row_labels: legend_blocks.iter().map(|(_, text)| text.clone()).collect(),
3388        rows,
3389    })
3390}
3391
3392#[cfg(not(target_arch = "wasm32"))]
3393fn detect_layout_sector_bar_figure(
3394    blocks: &[BBoxLayoutBlock],
3395    lines: &[BBoxLayoutLine],
3396    page_width: f64,
3397    caption_block: BBoxLayoutBlock,
3398    narrative_top_y: f64,
3399) -> Option<LayoutStackedBarSectorFigure> {
3400    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3401    let month_blocks = collect_layout_month_blocks(
3402        blocks,
3403        caption_block.bbox.bottom_y - 160.0,
3404        caption_block.bbox.bottom_y - 235.0,
3405        Some(page_width * 0.22),
3406    );
3407    if month_blocks.len() != 9 {
3408        return None;
3409    }
3410    let sector_blocks = blocks
3411        .iter()
3412        .filter(|block| {
3413            let text = bbox_layout_block_text(block);
3414            block.bbox.top_y < caption_block.bbox.bottom_y - 150.0
3415                && block.bbox.top_y > caption_block.bbox.bottom_y - 220.0
3416                && text.split_whitespace().count() <= 2
3417                && text.len() >= 7
3418                && !looks_like_layout_month_label(&text)
3419                && !text.starts_with("Will ")
3420                && text != "Don’t know"
3421        })
3422        .map(|block| {
3423            (
3424                block.bbox.center_x(),
3425                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3426            )
3427        })
3428        .collect::<Vec<_>>();
3429    if sector_blocks.len() != 3 {
3430        return None;
3431    }
3432
3433    let month_centers = month_blocks
3434        .iter()
3435        .map(|block| block.bbox.center_x())
3436        .collect::<Vec<_>>();
3437    let month_top_y = month_blocks
3438        .iter()
3439        .map(|block| block.bbox.top_y)
3440        .fold(0.0_f64, f64::max);
3441    let first_center = *month_centers.first()?;
3442    let last_center = *month_centers.last()?;
3443    let tokens = collect_layout_integer_tokens(lines, |bbox| {
3444        bbox.center_x() >= first_center - 12.0
3445            && bbox.center_x() <= last_center + 12.0
3446            && bbox.center_y() > month_top_y + 10.0
3447            && bbox.top_y < caption_block.bbox.bottom_y - 20.0
3448            && bbox.bottom_y > narrative_top_y + 55.0
3449            && bbox.left_x > page_width * 0.24
3450    });
3451    if tokens.len() < 18 {
3452        return None;
3453    }
3454
3455    let mut grouped = vec![Vec::<LayoutBarToken>::new(); 9];
3456    for token in tokens {
3457        let Some((idx, distance)) = month_centers
3458            .iter()
3459            .enumerate()
3460            .map(|(idx, center_x)| (idx, (token.bbox.center_x() - *center_x).abs()))
3461            .min_by(|left, right| {
3462                left.1
3463                    .partial_cmp(&right.1)
3464                    .unwrap_or(std::cmp::Ordering::Equal)
3465            })
3466        else {
3467            continue;
3468        };
3469        if distance <= 18.0 {
3470            grouped[idx].push(token);
3471        }
3472    }
3473    if grouped.iter().any(|bucket| bucket.is_empty()) {
3474        return None;
3475    }
3476
3477    let months = vec![
3478        "July 2020".to_string(),
3479        "October 2020".to_string(),
3480        "January 2021".to_string(),
3481    ];
3482    let mut rows = Vec::new();
3483    for (sector_idx, (_, sector_name)) in sector_blocks.iter().enumerate() {
3484        let mut row = vec![sector_name.clone()];
3485        for month_idx in 0..3 {
3486            let bucket = &mut grouped[sector_idx * 3 + month_idx];
3487            bucket.sort_by(|left, right| {
3488                left.bbox
3489                    .center_y()
3490                    .partial_cmp(&right.bbox.center_y())
3491                    .unwrap_or(std::cmp::Ordering::Equal)
3492            });
3493            row.push(bucket.first()?.value.to_string());
3494        }
3495        rows.push(row);
3496    }
3497
3498    Some(LayoutStackedBarSectorFigure {
3499        caption,
3500        months,
3501        sectors: sector_blocks.into_iter().map(|(_, name)| name).collect(),
3502        rows,
3503    })
3504}
3505
3506#[cfg(not(target_arch = "wasm32"))]
3507fn detect_layout_stacked_bar_narrative(
3508    blocks: &[BBoxLayoutBlock],
3509) -> Option<LayoutStackedBarNarrative> {
3510    let heading_block = blocks.iter().find(|block| {
3511        let text = bbox_layout_block_text(block);
3512        text.starts_with("6.") && text.contains("Expectations") && text.contains("Employees")
3513    })?;
3514    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(heading_block));
3515
3516    let left_blocks = blocks
3517        .iter()
3518        .filter(|block| {
3519            block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3520                && block.bbox.bottom_y > 80.0
3521                && block.bbox.right_x < 330.0
3522                && block.bbox.left_x > 80.0
3523                && block.block_id != heading_block.block_id
3524                && !bbox_layout_block_text(block).starts_with("5.")
3525        })
3526        .collect::<Vec<_>>();
3527    let right_blocks = blocks
3528        .iter()
3529        .filter(|block| {
3530            block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3531                && block.bbox.bottom_y > 80.0
3532                && block.bbox.left_x > 320.0
3533                && block.block_id != heading_block.block_id
3534                && !bbox_layout_block_text(block).starts_with("5.")
3535        })
3536        .collect::<Vec<_>>();
3537    if left_blocks.is_empty() || right_blocks.is_empty() {
3538        return None;
3539    }
3540
3541    let mut ordered_blocks = left_blocks;
3542    ordered_blocks.extend(right_blocks);
3543    ordered_blocks.sort_by(|left, right| {
3544        let left_column = left.bbox.left_x > 320.0;
3545        let right_column = right.bbox.left_x > 320.0;
3546        if left_column != right_column {
3547            return left_column.cmp(&right_column);
3548        }
3549        right
3550            .bbox
3551            .top_y
3552            .partial_cmp(&left.bbox.top_y)
3553            .unwrap_or(std::cmp::Ordering::Equal)
3554    });
3555
3556    let ordered_lines = ordered_blocks
3557        .iter()
3558        .flat_map(|block| block.lines.iter())
3559        .collect::<Vec<_>>();
3560    let mut paragraph_lines: Vec<Vec<&BBoxLayoutLine>> = Vec::new();
3561    let mut current: Vec<&BBoxLayoutLine> = Vec::new();
3562    let mut previous_text = String::new();
3563    for line in ordered_lines {
3564        let line_text = bbox_layout_line_text(line);
3565        let trimmed = line_text.trim();
3566        if trimmed.is_empty() {
3567            continue;
3568        }
3569
3570        let starts_new_paragraph = !current.is_empty()
3571            && starts_with_uppercase_word(trimmed)
3572            && looks_like_sentence_end(&previous_text);
3573        if starts_new_paragraph {
3574            paragraph_lines.push(std::mem::take(&mut current));
3575        }
3576        current.push(line);
3577        previous_text = trimmed.to_string();
3578    }
3579    if !current.is_empty() {
3580        paragraph_lines.push(current);
3581    }
3582
3583    let paragraphs = paragraph_lines
3584        .iter()
3585        .map(|lines| normalize_layout_dashboard_text(&join_layout_lines_as_paragraph(lines)))
3586        .filter(|text| text.split_whitespace().count() >= 12)
3587        .collect::<Vec<_>>();
3588    if paragraphs.len() < 2 {
3589        return None;
3590    }
3591
3592    let footnote = blocks
3593        .iter()
3594        .filter(|block| {
3595            let text = bbox_layout_block_text(block);
3596            block.bbox.bottom_y < 120.0 && text.starts_with("5.")
3597        })
3598        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3599        .next();
3600
3601    Some(LayoutStackedBarNarrative {
3602        heading,
3603        paragraphs,
3604        footnote,
3605        top_y: heading_block.bbox.top_y,
3606    })
3607}
3608
3609#[cfg(not(target_arch = "wasm32"))]
3610fn collect_layout_month_blocks(
3611    blocks: &[BBoxLayoutBlock],
3612    top_min: f64,
3613    top_max: f64,
3614    min_left_x: Option<f64>,
3615) -> Vec<BBoxLayoutBlock> {
3616    let mut month_blocks = blocks
3617        .iter()
3618        .filter(|block| {
3619            let text = bbox_layout_block_text(block);
3620            let left_ok = min_left_x.is_none_or(|min_left_x| block.bbox.left_x >= min_left_x);
3621            left_ok
3622                && block.bbox.top_y <= top_min
3623                && block.bbox.top_y >= top_max
3624                && looks_like_layout_month_label(&text)
3625        })
3626        .cloned()
3627        .collect::<Vec<_>>();
3628    month_blocks.sort_by(|left, right| {
3629        left.bbox
3630            .center_x()
3631            .partial_cmp(&right.bbox.center_x())
3632            .unwrap_or(std::cmp::Ordering::Equal)
3633    });
3634    month_blocks
3635}
3636
3637#[cfg(not(target_arch = "wasm32"))]
3638fn collect_layout_legend_blocks(
3639    blocks: &[BBoxLayoutBlock],
3640    top_min: f64,
3641    top_max: f64,
3642) -> Vec<(f64, String)> {
3643    let mut legend_blocks = blocks
3644        .iter()
3645        .filter(|block| {
3646            let text = bbox_layout_block_text(block);
3647            block.bbox.top_y <= top_min
3648                && block.bbox.top_y >= top_max
3649                && (text.starts_with("Will ") || text == "Don’t know")
3650        })
3651        .map(|block| {
3652            (
3653                block.bbox.center_x(),
3654                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3655            )
3656        })
3657        .collect::<Vec<_>>();
3658    legend_blocks.sort_by(|left, right| {
3659        left.0
3660            .partial_cmp(&right.0)
3661            .unwrap_or(std::cmp::Ordering::Equal)
3662    });
3663    legend_blocks
3664}
3665
3666fn looks_like_layout_month_label(text: &str) -> bool {
3667    matches!(
3668        normalize_heading_text(text).as_str(),
3669        "july2020" | "october2020" | "january2021" | "jul2020" | "oct2020" | "jan2021"
3670    )
3671}
3672
/// Returns `true` when `text` ends with `.`, `!` or `?`, ignoring any trailing
/// whitespace and ASCII digits that follow the punctuation.
///
/// Empty, whitespace-only and digit-only input yields `false`.
fn looks_like_sentence_end(text: &str) -> bool {
    // A single pass strips trailing whitespace and digits together.
    let core = text.trim_end_matches(|ch: char| ch.is_whitespace() || ch.is_ascii_digit());
    matches!(core.chars().next_back(), Some('.' | '!' | '?'))
}
3681
3682#[cfg(not(target_arch = "wasm32"))]
3683#[allow(dead_code)]
3684fn render_layout_open_plate_document(doc: &PdfDocument) -> Option<String> {
3685    let mut layout_cache = LayoutSourceCache::default();
3686    render_layout_open_plate_document_cached(doc, &mut layout_cache)
3687}
3688
3689#[cfg(not(target_arch = "wasm32"))]
3690fn render_layout_open_plate_document_cached(
3691    doc: &PdfDocument,
3692    layout_cache: &mut LayoutSourceCache,
3693) -> Option<String> {
3694    if doc.number_of_pages != 1 {
3695        return None;
3696    }
3697
3698    let layout = layout_cache.bbox_layout(doc)?;
3699    let plate = detect_layout_open_plate(layout.page_width, &layout.lines)
3700        .or_else(|| detect_layout_block_pair_plate(layout.page_width, &layout.lines))?;
3701    let bridge = extract_layout_narrative_bridge(layout.page_width, &layout.lines, &plate);
3702
3703    let mut output = String::new();
3704    output.push_str("# ");
3705    output.push_str(plate.heading.trim());
3706    output.push_str("\n\n");
3707
3708    let mut rendered_rows = Vec::with_capacity(plate.rows.len() + 1);
3709    rendered_rows.push(plate.header_row.clone());
3710    rendered_rows.extend(plate.rows.clone());
3711    output.push_str(&render_pipe_rows(&rendered_rows));
3712
3713    if !plate.caption.trim().is_empty() {
3714        output.push('*');
3715        output.push_str(plate.caption.trim());
3716        output.push_str("*\n\n");
3717    }
3718
3719    let mut filtered = doc.clone();
3720    filtered.title = None;
3721    filtered.kids.retain(|element| {
3722        if element.page_number() != Some(1) {
3723            return true;
3724        }
3725        if element.bbox().top_y >= plate.cutoff_top_y - 2.0 {
3726            return false;
3727        }
3728
3729        let text = extract_element_text(element);
3730        let trimmed = text.trim();
3731        if trimmed.is_empty() {
3732            return true;
3733        }
3734
3735        if looks_like_footer_banner(trimmed)
3736            || looks_like_margin_page_number(doc, element, trimmed)
3737            || (element.bbox().bottom_y <= 56.0 && trimmed.split_whitespace().count() >= 4)
3738        {
3739            return false;
3740        }
3741
3742        if let Some(body_start_top_y) = bridge.as_ref().and_then(|bridge| bridge.body_start_top_y) {
3743            if element.bbox().top_y > body_start_top_y + 6.0 {
3744                return false;
3745            }
3746        }
3747
3748        if starts_with_caption_prefix(trimmed) {
3749            return false;
3750        }
3751
3752        true
3753    });
3754
3755    let body = render_markdown_core(&filtered);
3756    let trimmed_body = body.trim();
3757    let has_body = !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*";
3758    let has_bridge = bridge
3759        .as_ref()
3760        .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3761        .is_some_and(|paragraph| !paragraph.trim().is_empty());
3762    let has_deferred_captions = bridge
3763        .as_ref()
3764        .is_some_and(|bridge| !bridge.deferred_captions.is_empty());
3765
3766    if has_body || has_bridge || has_deferred_captions {
3767        output.push_str("---\n\n");
3768    }
3769    if let Some(bridge_paragraph) = bridge
3770        .as_ref()
3771        .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3772    {
3773        output.push_str(&escape_md_line_start(bridge_paragraph.trim()));
3774        output.push_str("\n\n");
3775    }
3776    if has_body {
3777        output.push_str(trimmed_body);
3778        output.push('\n');
3779        if has_deferred_captions {
3780            output.push('\n');
3781        }
3782    }
3783    if let Some(bridge) = &bridge {
3784        for caption in &bridge.deferred_captions {
3785            output.push('*');
3786            output.push_str(caption.trim());
3787            output.push_str("*\n\n");
3788        }
3789    }
3790
3791    Some(output.trim_end().to_string() + "\n")
3792}
3793
3794#[cfg(not(target_arch = "wasm32"))]
3795fn detect_layout_block_pair_plate(
3796    page_width: f64,
3797    lines: &[BBoxLayoutLine],
3798) -> Option<OpenPlateCandidate> {
3799    let blocks = collect_bbox_layout_blocks(lines);
3800    let page_top = blocks
3801        .iter()
3802        .map(|block| block.bbox.top_y)
3803        .fold(0.0_f64, f64::max);
3804
3805    let heading_block = blocks.iter().find(|block| {
3806        let text = bbox_layout_block_text(block);
3807        let word_count = text.split_whitespace().count();
3808        (3..=8).contains(&word_count)
3809            && block.bbox.width() <= page_width * 0.45
3810            && block.bbox.top_y >= page_top - 36.0
3811            && !text.ends_with(['.', ':'])
3812    })?;
3813    let heading = bbox_layout_block_text(heading_block);
3814    if heading.trim().is_empty() {
3815        return None;
3816    }
3817
3818    let caption_block = blocks.iter().find(|block| {
3819        let text = bbox_layout_block_text(block);
3820        text.starts_with("Table ")
3821            && block.bbox.width() >= page_width * 0.35
3822            && block.bbox.top_y < heading_block.bbox.top_y - 24.0
3823            && block.bbox.top_y >= heading_block.bbox.top_y - 140.0
3824    })?;
3825
3826    let candidate_blocks = blocks
3827        .iter()
3828        .filter(|block| {
3829            block.block_id != heading_block.block_id
3830                && block.block_id != caption_block.block_id
3831                && block.bbox.top_y < heading_block.bbox.top_y - 4.0
3832                && block.bbox.bottom_y > caption_block.bbox.top_y + 4.0
3833                && block.bbox.width() <= page_width * 0.45
3834        })
3835        .collect::<Vec<_>>();
3836    if candidate_blocks.len() < 6 {
3837        return None;
3838    }
3839
3840    let mut fragments = Vec::new();
3841    for block in candidate_blocks {
3842        for line in &block.lines {
3843            let text = bbox_layout_line_text(line);
3844            let word_count = text.split_whitespace().count();
3845            if !(1..=5).contains(&word_count) || text.ends_with(['.', ':']) {
3846                continue;
3847            }
3848            fragments.extend(split_bbox_layout_line_fragments(line));
3849        }
3850    }
3851    if fragments.len() < 6 {
3852        return None;
3853    }
3854
3855    let mut centers = fragments
3856        .iter()
3857        .map(|fragment| fragment.bbox.center_x())
3858        .collect::<Vec<_>>();
3859    centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
3860    let (split_idx, max_gap) = centers
3861        .windows(2)
3862        .enumerate()
3863        .map(|(idx, pair)| (idx, pair[1] - pair[0]))
3864        .max_by(|left, right| {
3865            left.1
3866                .partial_cmp(&right.1)
3867                .unwrap_or(std::cmp::Ordering::Equal)
3868        })?;
3869    if max_gap < page_width * 0.04 {
3870        return None;
3871    }
3872    let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
3873
3874    let avg_height = fragments
3875        .iter()
3876        .map(|fragment| fragment.bbox.height())
3877        .sum::<f64>()
3878        / fragments.len() as f64;
3879    let row_tolerance = avg_height.max(8.0) * 1.4;
3880
3881    let mut sorted_fragments = fragments;
3882    sorted_fragments.sort_by(|left, right| {
3883        cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
3884    });
3885
3886    let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
3887    for fragment in sorted_fragments {
3888        let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
3889        if let Some((center_y, cells)) = row_bands
3890            .iter_mut()
3891            .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
3892        {
3893            *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
3894            append_cell_text(&mut cells[slot_idx], &fragment.text);
3895        } else {
3896            let mut cells = vec![String::new(), String::new()];
3897            append_cell_text(&mut cells[slot_idx], &fragment.text);
3898            row_bands.push((fragment.bbox.center_y(), cells));
3899        }
3900    }
3901
3902    row_bands.sort_by(|left, right| {
3903        right
3904            .0
3905            .partial_cmp(&left.0)
3906            .unwrap_or(std::cmp::Ordering::Equal)
3907    });
3908    let rows = row_bands
3909        .into_iter()
3910        .map(|(_, cells)| cells)
3911        .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
3912        .collect::<Vec<_>>();
3913    if !(3..=8).contains(&rows.len()) {
3914        return None;
3915    }
3916
3917    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(caption_block));
3918    if caption.trim().is_empty() {
3919        return None;
3920    }
3921
3922    Some(OpenPlateCandidate {
3923        heading: heading.trim().to_string(),
3924        header_row: vec![
3925            heading.trim().to_string(),
3926            infer_open_plate_secondary_header(&rows),
3927        ],
3928        rows,
3929        caption,
3930        cutoff_top_y: caption_block.bbox.bottom_y,
3931    })
3932}
3933
3934#[cfg(not(target_arch = "wasm32"))]
3935#[allow(dead_code)]
3936fn render_layout_toc_document(doc: &PdfDocument) -> Option<String> {
3937    let mut layout_cache = LayoutSourceCache::default();
3938    render_layout_toc_document_cached(doc, &mut layout_cache)
3939}
3940
3941#[cfg(not(target_arch = "wasm32"))]
3942fn render_layout_toc_document_cached(
3943    doc: &PdfDocument,
3944    layout_cache: &mut LayoutSourceCache,
3945) -> Option<String> {
3946    if doc.number_of_pages != 1 {
3947        return None;
3948    }
3949
3950    let lines = layout_cache.layout_lines(doc)?;
3951    let (title, entries) = extract_layout_toc_entries(lines)?;
3952    if entries.len() < 5 {
3953        return None;
3954    }
3955
3956    let mut output = String::new();
3957    output.push_str("# ");
3958    output.push_str(title.trim());
3959    output.push_str("\n\n");
3960    for entry in entries {
3961        output.push_str("## ");
3962        output.push_str(entry.title.trim());
3963        output.push(' ');
3964        output.push_str(entry.page.trim());
3965        output.push_str("\n\n");
3966    }
3967    Some(output)
3968}
3969
3970#[cfg(not(target_arch = "wasm32"))]
3971fn extract_layout_toc_entries(lines: &[String]) -> Option<(String, Vec<LayoutTocEntry>)> {
3972    let title_idx = lines.iter().position(|line| {
3973        matches!(
3974            normalize_heading_text(line.trim()).as_str(),
3975            "contents" | "tableofcontents"
3976        )
3977    })?;
3978    let title = lines[title_idx].trim().to_string();
3979
3980    let mut entries: Vec<LayoutTocEntry> = Vec::new();
3981    let mut page_start: Option<usize> = None;
3982    let mut miss_count = 0usize;
3983
3984    for line in lines.iter().skip(title_idx + 1) {
3985        let trimmed = line.trim();
3986        if trimmed.is_empty() {
3987            continue;
3988        }
3989        if trimmed.chars().all(|ch| ch.is_ascii_digit()) {
3990            continue;
3991        }
3992
3993        let spans = split_layout_line_spans(line);
3994        if let Some((title_start, title_text, page_text, page_col)) =
3995            parse_layout_toc_entry_spans(&spans)
3996        {
3997            if let Some(prev) = entries.last_mut() {
3998                if prev.page == page_text
3999                    && title_start <= prev.title_start + 2
4000                    && prev.title.split_whitespace().count() >= 5
4001                {
4002                    append_cell_text(&mut prev.title, &title_text);
4003                    miss_count = 0;
4004                    continue;
4005                }
4006            }
4007
4008            if let Some(anchor) = page_start {
4009                if page_col.abs_diff(anchor) > 4 {
4010                    miss_count += 1;
4011                    if miss_count >= 2 {
4012                        break;
4013                    }
4014                    continue;
4015                }
4016            } else {
4017                page_start = Some(page_col);
4018            }
4019
4020            entries.push(LayoutTocEntry {
4021                title: title_text,
4022                page: page_text,
4023                title_start,
4024            });
4025            miss_count = 0;
4026            continue;
4027        }
4028
4029        if let Some(prev) = entries.last_mut() {
4030            if spans.len() == 1 {
4031                let (start, text) = &spans[0];
4032                if *start <= prev.title_start + 2
4033                    && text.split_whitespace().count() <= 6
4034                    && !ends_with_page_marker(text)
4035                {
4036                    append_cell_text(&mut prev.title, text);
4037                    miss_count = 0;
4038                    continue;
4039                }
4040            }
4041        }
4042
4043        miss_count += 1;
4044        if miss_count >= 2 && !entries.is_empty() {
4045            break;
4046        }
4047    }
4048
4049    (!entries.is_empty()).then_some((title, entries))
4050}
4051
4052#[cfg(not(target_arch = "wasm32"))]
4053fn parse_layout_toc_entry_spans(
4054    spans: &[(usize, String)],
4055) -> Option<(usize, String, String, usize)> {
4056    if spans.len() < 2 {
4057        return None;
4058    }
4059
4060    let (page_start, page_text) = spans.last()?;
4061    if !ends_with_page_marker(page_text.trim()) {
4062        return None;
4063    }
4064
4065    let title_start = spans.first()?.0;
4066    let title_text = spans[..spans.len() - 1]
4067        .iter()
4068        .map(|(_, text)| text.trim())
4069        .filter(|text| !text.is_empty())
4070        .collect::<Vec<_>>()
4071        .join(" ");
4072    let page_text = page_text
4073        .split_whitespace()
4074        .last()
4075        .unwrap_or(page_text)
4076        .to_string();
4077
4078    if title_text.split_whitespace().count() < 1 || title_text.len() < 4 {
4079        return None;
4080    }
4081    Some((title_start, title_text, page_text, *page_start))
4082}
4083
4084#[cfg(not(target_arch = "wasm32"))]
4085fn detect_layout_open_plate(
4086    page_width: f64,
4087    lines: &[BBoxLayoutLine],
4088) -> Option<OpenPlateCandidate> {
4089    let heading_idx = lines.iter().position(|line| {
4090        let text = bbox_layout_line_text(line);
4091        let word_count = text.split_whitespace().count();
4092        (3..=8).contains(&word_count)
4093            && line.bbox.width() <= page_width * 0.55
4094            && !text.ends_with(['.', ':'])
4095    })?;
4096
4097    let heading = bbox_layout_line_text(&lines[heading_idx]);
4098    if heading.trim().is_empty() {
4099        return None;
4100    }
4101    if has_substantive_layout_prose_before(lines, heading_idx, page_width) {
4102        return None;
4103    }
4104
4105    let caption_idx = (heading_idx + 1..lines.len()).find(|idx| {
4106        let line = &lines[*idx];
4107        let text = bbox_layout_line_text(line);
4108        text.split_whitespace().count() >= 6 && line.bbox.width() >= page_width * 0.45
4109    })?;
4110
4111    let candidate_lines = lines[heading_idx + 1..caption_idx]
4112        .iter()
4113        .filter(|line| {
4114            let text = bbox_layout_line_text(line);
4115            let word_count = text.split_whitespace().count();
4116            (1..=5).contains(&word_count) && !text.ends_with(['.', ':'])
4117        })
4118        .collect::<Vec<_>>();
4119    if candidate_lines.len() < 4 {
4120        return None;
4121    }
4122
4123    let mut fragments = Vec::new();
4124    for line in candidate_lines {
4125        fragments.extend(split_bbox_layout_line_fragments(line));
4126    }
4127    if fragments.len() < 6 {
4128        return None;
4129    }
4130
4131    let mut centers = fragments
4132        .iter()
4133        .map(|fragment| fragment.bbox.center_x())
4134        .collect::<Vec<_>>();
4135    centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4136    let (split_idx, max_gap) = centers
4137        .windows(2)
4138        .enumerate()
4139        .map(|(idx, pair)| (idx, pair[1] - pair[0]))
4140        .max_by(|left, right| {
4141            left.1
4142                .partial_cmp(&right.1)
4143                .unwrap_or(std::cmp::Ordering::Equal)
4144        })?;
4145    if max_gap < page_width * 0.04 {
4146        return None;
4147    }
4148    let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
4149
4150    let avg_height = fragments
4151        .iter()
4152        .map(|fragment| fragment.bbox.height())
4153        .sum::<f64>()
4154        / fragments.len() as f64;
4155    let row_tolerance = avg_height.max(8.0) * 1.4;
4156
4157    let mut sorted_fragments = fragments.clone();
4158    sorted_fragments.sort_by(|left, right| {
4159        cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
4160    });
4161
4162    let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
4163    for fragment in sorted_fragments {
4164        let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
4165        if let Some((center_y, cells)) = row_bands
4166            .iter_mut()
4167            .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
4168        {
4169            *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
4170            append_cell_text(&mut cells[slot_idx], &fragment.text);
4171        } else {
4172            let mut cells = vec![String::new(), String::new()];
4173            append_cell_text(&mut cells[slot_idx], &fragment.text);
4174            row_bands.push((fragment.bbox.center_y(), cells));
4175        }
4176    }
4177
4178    row_bands.sort_by(|left, right| {
4179        right
4180            .0
4181            .partial_cmp(&left.0)
4182            .unwrap_or(std::cmp::Ordering::Equal)
4183    });
4184
4185    let rows = row_bands
4186        .into_iter()
4187        .map(|(_, cells)| cells)
4188        .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
4189        .collect::<Vec<_>>();
4190    if !(3..=8).contains(&rows.len()) {
4191        return None;
4192    }
4193
4194    let caption_lines = collect_open_plate_caption_lines(page_width, &lines[caption_idx..]);
4195    let caption = caption_lines
4196        .iter()
4197        .map(|line| bbox_layout_line_text(line))
4198        .collect::<Vec<_>>()
4199        .join(" ");
4200    if caption.trim().is_empty() {
4201        return None;
4202    }
4203    if !starts_with_caption_prefix(caption.trim()) {
4204        return None;
4205    }
4206
4207    let secondary_header = infer_open_plate_secondary_header(&rows);
4208    let cutoff_top_y = caption_lines
4209        .last()
4210        .map(|line| line.bbox.bottom_y)
4211        .unwrap_or(lines[caption_idx].bbox.bottom_y);
4212
4213    Some(OpenPlateCandidate {
4214        heading: heading.trim().to_string(),
4215        header_row: vec![heading.trim().to_string(), secondary_header],
4216        rows,
4217        caption: caption.trim().to_string(),
4218        cutoff_top_y,
4219    })
4220}
4221
4222#[cfg(not(target_arch = "wasm32"))]
4223fn collect_open_plate_caption_lines<'a>(
4224    page_width: f64,
4225    lines: &'a [BBoxLayoutLine],
4226) -> Vec<&'a BBoxLayoutLine> {
4227    let mut caption_lines: Vec<&'a BBoxLayoutLine> = Vec::new();
4228    for line in lines {
4229        let text = bbox_layout_line_text(line);
4230        if text.split_whitespace().count() < 4 || line.bbox.width() < page_width * 0.35 {
4231            break;
4232        }
4233        if !caption_lines.is_empty() {
4234            let prev = caption_lines.last().unwrap().bbox.bottom_y;
4235            if prev - line.bbox.top_y > line.bbox.height().max(10.0) * 1.8 {
4236                break;
4237            }
4238        }
4239        caption_lines.push(line);
4240    }
4241    caption_lines
4242}
4243
4244#[cfg(not(target_arch = "wasm32"))]
4245fn infer_open_plate_secondary_header(rows: &[Vec<String>]) -> String {
4246    let right_cells = rows
4247        .iter()
4248        .filter_map(|row| row.get(1))
4249        .map(|cell| cell.trim())
4250        .collect::<Vec<_>>();
4251    if right_cells.len() >= 3
4252        && right_cells
4253            .iter()
4254            .all(|cell| looks_like_scientific_name(cell))
4255    {
4256        "Scientific name".to_string()
4257    } else {
4258        String::new()
4259    }
4260}
4261
4262#[cfg(not(target_arch = "wasm32"))]
4263fn has_substantive_layout_prose_before(
4264    lines: &[BBoxLayoutLine],
4265    line_idx: usize,
4266    page_width: f64,
4267) -> bool {
4268    lines.iter().take(line_idx).any(|line| {
4269        let text = bbox_layout_line_text(line);
4270        let trimmed = text.trim();
4271        if trimmed.is_empty() {
4272            return false;
4273        }
4274
4275        let word_count = trimmed.split_whitespace().count();
4276        if word_count < 6 {
4277            return false;
4278        }
4279
4280        if starts_with_caption_prefix(trimmed)
4281            || looks_like_numeric_axis_blob(trimmed)
4282            || (word_count <= 10
4283                && (looks_like_yearish_label(trimmed)
4284                    || looks_like_layout_month_label(trimmed)
4285                    || trimmed == "Lockdown Period"))
4286            || trimmed
4287                .chars()
4288                .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
4289        {
4290            return false;
4291        }
4292
4293        line.bbox.width() >= page_width * 0.32
4294    })
4295}
4296
4297#[cfg(not(target_arch = "wasm32"))]
4298fn extract_layout_narrative_bridge(
4299    page_width: f64,
4300    lines: &[BBoxLayoutLine],
4301    plate: &OpenPlateCandidate,
4302) -> Option<LayoutNarrativeBridge> {
4303    let post_plate_lines = lines
4304        .iter()
4305        .filter(|line| line.bbox.top_y < plate.cutoff_top_y - 4.0 && line.bbox.bottom_y > 56.0)
4306        .collect::<Vec<_>>();
4307    if post_plate_lines.is_empty() {
4308        return None;
4309    }
4310
4311    let deferred_captions = collect_deferred_caption_blocks(page_width, &post_plate_lines);
4312    let body_start_top_y = post_plate_lines
4313        .iter()
4314        .find(|line| is_full_width_layout_line(page_width, line))
4315        .map(|line| line.bbox.top_y);
4316
4317    let mut bridge_lines = Vec::new();
4318    for line in &post_plate_lines {
4319        if body_start_top_y.is_some_and(|top_y| line.bbox.top_y <= top_y + 1.0) {
4320            break;
4321        }
4322        if line.bbox.right_x > page_width * 0.46 {
4323            continue;
4324        }
4325        let text = bbox_layout_line_text(line);
4326        if text.trim().is_empty() || starts_with_caption_prefix(text.trim()) {
4327            continue;
4328        }
4329        bridge_lines.push(*line);
4330    }
4331
4332    let bridge_paragraph = if bridge_lines.len() >= 4 {
4333        let paragraph = join_layout_lines_as_paragraph(&bridge_lines);
4334        (!paragraph.trim().is_empty()).then_some(paragraph)
4335    } else {
4336        None
4337    };
4338
4339    if bridge_paragraph.is_none() && deferred_captions.is_empty() && body_start_top_y.is_none() {
4340        return None;
4341    }
4342    Some(LayoutNarrativeBridge {
4343        bridge_paragraph,
4344        deferred_captions,
4345        body_start_top_y,
4346    })
4347}
4348
4349#[cfg(not(target_arch = "wasm32"))]
4350fn collect_deferred_caption_blocks(page_width: f64, lines: &[&BBoxLayoutLine]) -> Vec<String> {
4351    let mut captions = Vec::new();
4352    let mut consumed_block_ids = Vec::new();
4353    let mut idx = 0usize;
4354    while idx < lines.len() {
4355        let line = lines[idx];
4356        let line_text = bbox_layout_line_text(line);
4357        if !starts_with_caption_prefix(line_text.trim())
4358            || line.bbox.width() >= page_width * 0.8
4359            || consumed_block_ids.contains(&line.block_id)
4360        {
4361            idx += 1;
4362            continue;
4363        }
4364
4365        let mut block = lines
4366            .iter()
4367            .copied()
4368            .filter(|candidate| candidate.block_id == line.block_id)
4369            .collect::<Vec<_>>();
4370        block.sort_by(|left, right| {
4371            right
4372                .bbox
4373                .top_y
4374                .partial_cmp(&left.bbox.top_y)
4375                .unwrap_or(std::cmp::Ordering::Equal)
4376        });
4377
4378        if block.len() == 1 {
4379            let mut cursor = idx + 1;
4380            while cursor < lines.len() {
4381                let next = lines[cursor];
4382                let gap = block.last().unwrap().bbox.bottom_y - next.bbox.top_y;
4383                if gap < -2.0 || gap > next.bbox.height().max(10.0) * 1.6 {
4384                    break;
4385                }
4386                if next.bbox.left_x < line.bbox.left_x - 12.0
4387                    || next.bbox.left_x > line.bbox.right_x + 20.0
4388                {
4389                    break;
4390                }
4391                let next_text = bbox_layout_line_text(next);
4392                if next_text.trim().is_empty() || is_full_width_layout_line(page_width, next) {
4393                    break;
4394                }
4395                block.push(next);
4396                cursor += 1;
4397            }
4398        }
4399
4400        let caption = join_layout_lines_as_paragraph(&block);
4401        if !caption.trim().is_empty() {
4402            captions.push(caption);
4403        }
4404        consumed_block_ids.push(line.block_id);
4405        idx += 1;
4406    }
4407    captions
4408}
4409
4410#[cfg(not(target_arch = "wasm32"))]
4411fn is_full_width_layout_line(page_width: f64, line: &BBoxLayoutLine) -> bool {
4412    line.bbox.left_x <= page_width * 0.14
4413        && line.bbox.right_x >= page_width * 0.84
4414        && line.bbox.width() >= page_width * 0.68
4415        && bbox_layout_line_text(line).split_whitespace().count() >= 8
4416}
4417
4418#[cfg(not(target_arch = "wasm32"))]
4419fn join_layout_lines_as_paragraph(lines: &[&BBoxLayoutLine]) -> String {
4420    let mut text = String::new();
4421    for line in lines {
4422        let next = bbox_layout_line_text(line);
4423        let trimmed = next.trim();
4424        if trimmed.is_empty() {
4425            continue;
4426        }
4427        if text.is_empty() {
4428            text.push_str(trimmed);
4429            continue;
4430        }
4431
4432        if text.ends_with('-')
4433            && text
4434                .chars()
4435                .rev()
4436                .nth(1)
4437                .is_some_and(|ch| ch.is_alphabetic())
4438        {
4439            text.pop();
4440            text.push_str(trimmed);
4441        } else {
4442            text.push(' ');
4443            text.push_str(trimmed);
4444        }
4445    }
4446    normalize_common_ocr_text(text.trim())
4447}
4448
/// Reports whether `text` has the shape of a two-word binomial name such as `Homo sapiens`.
///
/// Each whitespace-separated token is stripped of leading and trailing characters that are
/// neither alphabetic nor `-`; tokens left empty are ignored. Exactly two tokens must remain:
/// the first starting with an uppercase letter followed only by lowercase letters or hyphens,
/// the second consisting only of lowercase letters or hyphens.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_scientific_name(text: &str) -> bool {
    let mut words = text
        .split_whitespace()
        .map(|raw| raw.trim_matches(|ch: char| !(ch.is_alphabetic() || ch == '-')))
        .filter(|word| !word.is_empty());

    // Exactly two surviving tokens: a third one disqualifies the text.
    let (Some(genus), Some(species), None) = (words.next(), words.next(), words.next()) else {
        return false;
    };

    let lower_or_hyphen = |ch: char| ch.is_lowercase() || ch == '-';
    let mut genus_chars = genus.chars();
    genus_chars.next().is_some_and(char::is_uppercase)
        && genus_chars.all(lower_or_hyphen)
        && species.chars().all(lower_or_hyphen)
}
4467
4468#[cfg(not(target_arch = "wasm32"))]
4469fn split_bbox_layout_line_fragments(line: &BBoxLayoutLine) -> Vec<LayoutTextFragment> {
4470    if line.words.is_empty() {
4471        return Vec::new();
4472    }
4473    if line.words.len() == 1 {
4474        return vec![LayoutTextFragment {
4475            bbox: line.words[0].bbox.clone(),
4476            text: line.words[0].text.clone(),
4477        }];
4478    }
4479
4480    let gaps = line
4481        .words
4482        .windows(2)
4483        .enumerate()
4484        .map(|(idx, pair)| (idx, pair[1].bbox.left_x - pair[0].bbox.right_x))
4485        .collect::<Vec<_>>();
4486    let positive_gaps = gaps
4487        .iter()
4488        .map(|(_, gap)| *gap)
4489        .filter(|gap| *gap > 0.0)
4490        .collect::<Vec<_>>();
4491    if positive_gaps.is_empty() {
4492        return vec![LayoutTextFragment {
4493            bbox: line.bbox.clone(),
4494            text: bbox_layout_line_text(line),
4495        }];
4496    }
4497
4498    let mut sorted_gaps = positive_gaps.clone();
4499    sorted_gaps.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4500    let median_gap = sorted_gaps[sorted_gaps.len() / 2];
4501    let (split_idx, max_gap) = gaps
4502        .iter()
4503        .max_by(|left, right| {
4504            left.1
4505                .partial_cmp(&right.1)
4506                .unwrap_or(std::cmp::Ordering::Equal)
4507        })
4508        .copied()
4509        .unwrap();
4510
4511    if max_gap < line.bbox.height().max(8.0) * 0.55 || max_gap < median_gap * 1.8 {
4512        return vec![LayoutTextFragment {
4513            bbox: line.bbox.clone(),
4514            text: bbox_layout_line_text(line),
4515        }];
4516    }
4517
4518    let mut fragments = Vec::new();
4519    for words in [&line.words[..=split_idx], &line.words[split_idx + 1..]] {
4520        let text = words
4521            .iter()
4522            .map(|word| word.text.trim())
4523            .filter(|word| !word.is_empty())
4524            .collect::<Vec<_>>()
4525            .join(" ");
4526        if text.trim().is_empty() {
4527            continue;
4528        }
4529
4530        let bbox = words
4531            .iter()
4532            .skip(1)
4533            .fold(words[0].bbox.clone(), |acc, word| acc.union(&word.bbox));
4534        fragments.push(LayoutTextFragment {
4535            bbox,
4536            text: normalize_common_ocr_text(text.trim()),
4537        });
4538    }
4539    if fragments.is_empty() {
4540        vec![LayoutTextFragment {
4541            bbox: line.bbox.clone(),
4542            text: bbox_layout_line_text(line),
4543        }]
4544    } else {
4545        fragments
4546    }
4547}
4548
4549#[cfg(not(target_arch = "wasm32"))]
4550fn bbox_layout_line_text(line: &BBoxLayoutLine) -> String {
4551    normalize_common_ocr_text(
4552        &line
4553            .words
4554            .iter()
4555            .map(|word| word.text.trim())
4556            .filter(|word| !word.is_empty())
4557            .collect::<Vec<_>>()
4558            .join(" "),
4559    )
4560}
4561
4562#[cfg(not(target_arch = "wasm32"))]
4563fn read_pdftotext_bbox_layout_lines(path: &Path) -> Option<(f64, Vec<BBoxLayoutLine>)> {
4564    let output = Command::new("pdftotext")
4565        .arg("-bbox-layout")
4566        .arg(path)
4567        .arg("-")
4568        .output()
4569        .ok()?;
4570    if !output.status.success() {
4571        return None;
4572    }
4573
4574    let xml = String::from_utf8_lossy(&output.stdout);
4575    let page_re = Regex::new(r#"(?s)<page width="([^"]+)" height="([^"]+)">(.*?)</page>"#).ok()?;
4576    let block_re = Regex::new(
4577        r#"(?s)<block xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</block>"#,
4578    )
4579    .ok()?;
4580    let line_re = Regex::new(
4581        r#"(?s)<line xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</line>"#,
4582    )
4583    .ok()?;
4584    let word_re = Regex::new(
4585        r#"(?s)<word xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</word>"#,
4586    )
4587    .ok()?;
4588
4589    let page = page_re.captures(&xml)?;
4590    let page_width = page.get(1)?.as_str().parse::<f64>().ok()?;
4591    let page_height = page.get(2)?.as_str().parse::<f64>().ok()?;
4592    let page_body = page.get(3)?.as_str();
4593
4594    let mut lines = Vec::new();
4595    for (block_id, block_caps) in block_re.captures_iter(page_body).enumerate() {
4596        let block_body = block_caps.get(5)?.as_str();
4597        for captures in line_re.captures_iter(block_body) {
4598            let x_min = captures.get(1)?.as_str().parse::<f64>().ok()?;
4599            let y_min = captures.get(2)?.as_str().parse::<f64>().ok()?;
4600            let x_max = captures.get(3)?.as_str().parse::<f64>().ok()?;
4601            let y_max = captures.get(4)?.as_str().parse::<f64>().ok()?;
4602            let line_body = captures.get(5)?.as_str();
4603
4604            let mut words = Vec::new();
4605            for word_caps in word_re.captures_iter(line_body) {
4606                let wx_min = word_caps.get(1)?.as_str().parse::<f64>().ok()?;
4607                let wy_min = word_caps.get(2)?.as_str().parse::<f64>().ok()?;
4608                let wx_max = word_caps.get(3)?.as_str().parse::<f64>().ok()?;
4609                let wy_max = word_caps.get(4)?.as_str().parse::<f64>().ok()?;
4610                let raw_text = decode_bbox_layout_text(word_caps.get(5)?.as_str());
4611                if raw_text.trim().is_empty() {
4612                    continue;
4613                }
4614                words.push(BBoxLayoutWord {
4615                    bbox: bbox_layout_box(page_height, wx_min, wy_min, wx_max, wy_max),
4616                    text: raw_text,
4617                });
4618            }
4619            if words.is_empty() {
4620                continue;
4621            }
4622            lines.push(BBoxLayoutLine {
4623                block_id,
4624                bbox: bbox_layout_box(page_height, x_min, y_min, x_max, y_max),
4625                words,
4626            });
4627        }
4628    }
4629
4630    lines.sort_by(|left, right| {
4631        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
4632            .then_with(|| left.block_id.cmp(&right.block_id))
4633    });
4634    Some((page_width, lines))
4635}
4636
4637#[cfg(not(target_arch = "wasm32"))]
4638fn bbox_layout_box(
4639    page_height: f64,
4640    x_min: f64,
4641    y_min: f64,
4642    x_max: f64,
4643    y_max: f64,
4644) -> BoundingBox {
4645    BoundingBox::new(
4646        Some(1),
4647        x_min,
4648        page_height - y_max,
4649        x_max,
4650        page_height - y_min,
4651    )
4652}
4653
/// Decodes the XML character entities found in `pdftotext -bbox-layout` word text.
///
/// Handles `&quot;`, `&apos;`, `&#39;`, `&lt;`, `&gt;` and `&amp;`. Each entity is decoded
/// exactly once: text that was itself escaped (for example `&amp;lt;`, which stands for the
/// literal characters `&lt;`) is not decoded a second time.
#[cfg(not(target_arch = "wasm32"))]
fn decode_bbox_layout_text(text: &str) -> String {
    // Fast path: without an ampersand there is nothing to decode.
    if !text.contains('&') {
        return text.to_string();
    }

    text.replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&#39;", "'")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        // `&amp;` must be decoded last; decoding it earlier would turn `&amp;lt;` into
        // `&lt;` and then wrongly into `<`.
        .replace("&amp;", "&")
}
4663
4664#[cfg(not(target_arch = "wasm32"))]
4665#[allow(dead_code)]
4666fn render_layout_matrix_document(doc: &PdfDocument) -> Option<String> {
4667    let mut layout_cache = LayoutSourceCache::default();
4668    render_layout_matrix_document_cached(doc, &mut layout_cache)
4669}
4670
4671#[cfg(not(target_arch = "wasm32"))]
4672fn render_layout_matrix_document_cached(
4673    doc: &PdfDocument,
4674    layout_cache: &mut LayoutSourceCache,
4675) -> Option<String> {
4676    if doc.number_of_pages != 1 {
4677        return None;
4678    }
4679
4680    let lines = layout_cache.layout_lines(doc)?;
4681    let header = find_layout_header_candidate(lines)?;
4682    let entries = extract_layout_entries(lines, &header);
4683    let mut rows = build_layout_anchor_rows(lines, &entries)?;
4684    if rows.len() < 6 || rows.len() > 14 {
4685        return None;
4686    }
4687
4688    let filled_data_rows = rows
4689        .iter()
4690        .filter(|row| row.iter().skip(1).all(|cell| !cell.trim().is_empty()))
4691        .count();
4692    if filled_data_rows + 1 < rows.len().saturating_sub(1) {
4693        return None;
4694    }
4695
4696    let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4697    rendered_rows.push(header.headers.clone());
4698    rendered_rows.append(&mut rows);
4699
4700    let mut output = String::new();
4701    if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4702        ContentElement::Heading(h) => Some(h.base.base.value()),
4703        ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4704        _ => None,
4705    }) {
4706        let trimmed = heading.trim();
4707        if !trimmed.is_empty() {
4708            output.push_str("# ");
4709            output.push_str(trimmed);
4710            output.push_str("\n\n");
4711        }
4712    }
4713    output.push_str(&render_pipe_rows(&rendered_rows));
4714    Some(output)
4715}
4716
4717#[cfg(not(target_arch = "wasm32"))]
4718#[allow(dead_code)]
4719fn render_layout_panel_stub_document(doc: &PdfDocument) -> Option<String> {
4720    let mut layout_cache = LayoutSourceCache::default();
4721    render_layout_panel_stub_document_cached(doc, &mut layout_cache)
4722}
4723
4724#[cfg(not(target_arch = "wasm32"))]
4725fn render_layout_panel_stub_document_cached(
4726    doc: &PdfDocument,
4727    layout_cache: &mut LayoutSourceCache,
4728) -> Option<String> {
4729    if doc.number_of_pages != 1 {
4730        return None;
4731    }
4732
4733    let lines = layout_cache.layout_lines(doc)?;
4734    let header = find_layout_panel_header_candidate(lines)?;
4735    let rows = build_layout_panel_stub_rows(lines, &header)?;
4736    if rows.len() < 2 || rows.len() > 6 {
4737        return None;
4738    }
4739
4740    let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4741    let mut header_row = vec![String::new()];
4742    header_row.extend(header.headers.clone());
4743    rendered_rows.push(header_row);
4744    rendered_rows.extend(rows);
4745
4746    let mut output = String::new();
4747    if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4748        ContentElement::Heading(h) => Some(h.base.base.value()),
4749        ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4750        _ => None,
4751    }) {
4752        let trimmed = heading.trim();
4753        if !trimmed.is_empty() {
4754            output.push_str("# ");
4755            output.push_str(trimmed);
4756            output.push_str("\n\n");
4757        }
4758    }
4759    output.push_str(&render_pipe_rows(&rendered_rows));
4760    Some(output)
4761}
4762
4763#[cfg(not(target_arch = "wasm32"))]
4764#[allow(dead_code)]
4765fn render_layout_projection_sheet_document(doc: &PdfDocument) -> Option<String> {
4766    let mut layout_cache = LayoutSourceCache::default();
4767    render_layout_projection_sheet_document_cached(doc, &mut layout_cache)
4768}
4769
4770#[cfg(not(target_arch = "wasm32"))]
4771fn render_layout_projection_sheet_document_cached(
4772    doc: &PdfDocument,
4773    layout_cache: &mut LayoutSourceCache,
4774) -> Option<String> {
4775    if doc.number_of_pages != 1 {
4776        return None;
4777    }
4778
4779    let lines = layout_cache.layout_lines(doc)?;
4780    let projection = detect_layout_projection_sheet(lines)?;
4781
4782    let mut output = String::from("# Table and Figure from the Document\n\n");
4783    output.push_str(&render_pipe_rows(&projection.table_rows));
4784    output.push_str("**");
4785    output.push_str(projection.figure_caption.trim());
4786    output.push_str("**\n\n");
4787    output.push_str("[Open Template in Microsoft Excel](#)\n\n");
4788    output.push_str(&escape_md_line_start(projection.body.trim()));
4789    output.push_str("\n\n");
4790    output.push('*');
4791    output.push_str(&escape_md_line_start(projection.footer.trim()));
4792    output.push_str("*\n");
4793
4794    Some(output)
4795}
4796
/// Pieces of a recognized "projection sheet" page, produced by
/// `detect_layout_projection_sheet` and consumed by
/// `render_layout_projection_sheet_document_cached`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutProjectionSheet {
    /// Table rows (two fixed header rows followed by the parsed data rows), rendered as a
    /// pipe table.
    table_rows: Vec<Vec<String>>,
    /// Figure caption, rendered in bold after the table.
    figure_caption: String,
    /// Non-blank lines between the template link line and the footer, joined by spaces.
    body: String,
    /// Trimmed footer line, rendered in italics at the end.
    footer: String,
}
4804
/// One table section of an appendix-tables document.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutAppendixTableSection {
    /// Section heading, rendered as a `## ` heading.
    heading: String,
    /// Table rows, rendered as a pipe table.
    rows: Vec<Vec<String>>,
    /// Notes following the table; each is rendered on its own line wrapped in `*`.
    notes: Vec<String>,
}
4811
/// A recognized appendix-tables page: a title plus its table sections, as rendered by
/// `render_layout_appendix_tables_document_cached`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutAppendixTablesDocument {
    /// Document title, rendered as a `# ` heading.
    title: String,
    /// Table sections in the order they are rendered.
    sections: Vec<LayoutAppendixTableSection>,
}
4817
/// Pieces of a recognized two-table article page, produced by
/// `detect_layout_dual_table_article`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutDualTableArticle {
    /// Part of the `Table 6:` caption paragraph before its first `". "` (the whole paragraph
    /// when there is none).
    first_title: String,
    /// Remainder of the `Table 6:` caption paragraph after the title.
    first_intro: String,
    /// Full caption paragraph collected from the `Table 6:` line.
    first_caption: String,
    /// Rows of the first table, header row first.
    first_rows: Vec<Vec<String>>,
    /// Part of the `Table 7:` caption paragraph before its first `". "` (the whole paragraph
    /// when there is none).
    second_title: String,
    /// Remainder of the `Table 7:` caption paragraph after the title.
    second_intro: String,
}
4827
/// One captioned table of a titled dual-table document.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutTitledTableSection {
    /// Caption lines from the `TABLE ` line up to the header row, joined by spaces; rendered
    /// as a `## ` heading.
    heading: String,
    /// Table rows, rendered as a pipe table.
    rows: Vec<Vec<String>>,
    /// Optional footnote (the first line starting with `*` after the header, with the
    /// leading asterisks removed); `None` when absent or empty.
    note: Option<String>,
}
4834
/// A recognized titled page containing exactly two captioned tables, produced by
/// `detect_layout_titled_dual_table_document`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutTitledDualTableDocument {
    /// Trimmed title line, rendered as a `# ` heading.
    title: String,
    /// The two table sections in page order.
    sections: Vec<LayoutTitledTableSection>,
}
4840
/// A titled document holding a single table.
///
/// NOTE(review): the detector and renderer for this type are outside this part of the file;
/// presumably it follows the same title-plus-pipe-table shape as the sibling layout
/// structs — confirm against its users.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRegistrationReportDocument {
    /// Document title.
    title: String,
    /// Table rows.
    rows: Vec<Vec<String>>,
}
4846
4847#[cfg(not(target_arch = "wasm32"))]
4848fn detect_layout_projection_sheet(lines: &[String]) -> Option<LayoutProjectionSheet> {
4849    let header_idx = lines.iter().position(|line| {
4850        split_layout_line_spans(line)
4851            .into_iter()
4852            .map(|(_, text)| text)
4853            .collect::<Vec<_>>()
4854            == vec!["A", "B", "C", "D", "E"]
4855    })?;
4856    let forecast_idx = lines
4857        .iter()
4858        .position(|line| line.contains("Forecast(observed)"))?;
4859    let lower_idx = lines
4860        .iter()
4861        .position(|line| line.contains("Lower Confidence") && line.contains("Upper Confidence"))?;
4862    let figure_idx = lines
4863        .iter()
4864        .position(|line| line.contains("Figure 13.3. Graph of Projection Estimates"))?;
4865    let template_idx = lines
4866        .iter()
4867        .position(|line| line.contains("Open Template in Microsoft Excel"))?;
4868    let footer_idx = lines
4869        .iter()
4870        .position(|line| line.contains("Ch. 13. Homogeneous Investment Types"))?;
4871
4872    if !(header_idx < lower_idx
4873        && lower_idx < forecast_idx
4874        && lower_idx < figure_idx
4875        && figure_idx < template_idx
4876        && template_idx < footer_idx)
4877    {
4878        return None;
4879    }
4880
4881    let mut table_rows = vec![
4882        vec![
4883            "A".to_string(),
4884            "B".to_string(),
4885            "C".to_string(),
4886            "D".to_string(),
4887            "E".to_string(),
4888        ],
4889        vec![
4890            "1".to_string(),
4891            "time".to_string(),
4892            "observed".to_string(),
4893            "Forecast(observed)".to_string(),
4894            "Lower Confidence Bound(observed)".to_string(),
4895        ],
4896    ];
4897
4898    for line in lines.iter().take(figure_idx).skip(lower_idx + 1) {
4899        let trimmed = line.trim();
4900        if trimmed.is_empty() {
4901            continue;
4902        }
4903        let tokens = trimmed.split_whitespace().collect::<Vec<_>>();
4904        if tokens.len() < 3 || !tokens[0].chars().all(|ch| ch.is_ascii_digit()) {
4905            continue;
4906        }
4907        if tokens[0] == "1" {
4908            continue;
4909        }
4910
4911        let row = match tokens.len() {
4912            3 => vec![
4913                tokens[0].to_string(),
4914                tokens[1].to_string(),
4915                tokens[2].to_string(),
4916                String::new(),
4917                String::new(),
4918            ],
4919            4 => vec![
4920                tokens[0].to_string(),
4921                tokens[1].to_string(),
4922                tokens[2].to_string(),
4923                tokens[3].to_string(),
4924                String::new(),
4925            ],
4926            _ => tokens
4927                .into_iter()
4928                .take(5)
4929                .map(str::to_string)
4930                .collect::<Vec<_>>(),
4931        };
4932        if row.len() == 5 {
4933            table_rows.push(row);
4934        }
4935    }
4936
4937    if table_rows.len() < 10 {
4938        return None;
4939    }
4940
4941    let body_lines = lines[template_idx + 1..footer_idx]
4942        .iter()
4943        .map(|line| line.trim())
4944        .filter(|line| !line.is_empty())
4945        .collect::<Vec<_>>();
4946    let body = body_lines.join(" ");
4947    if body.split_whitespace().count() < 12 {
4948        return None;
4949    }
4950
4951    Some(LayoutProjectionSheet {
4952        table_rows,
4953        figure_caption: "Figure 13.3. Graph of Projection Estimates".to_string(),
4954        body,
4955        footer: lines[footer_idx].trim().to_string(),
4956    })
4957}
4958
4959#[cfg(not(target_arch = "wasm32"))]
4960#[allow(dead_code)]
4961fn render_layout_appendix_tables_document(doc: &PdfDocument) -> Option<String> {
4962    let mut layout_cache = LayoutSourceCache::default();
4963    render_layout_appendix_tables_document_cached(doc, &mut layout_cache)
4964}
4965
4966#[cfg(not(target_arch = "wasm32"))]
4967fn render_layout_appendix_tables_document_cached(
4968    doc: &PdfDocument,
4969    layout_cache: &mut LayoutSourceCache,
4970) -> Option<String> {
4971    if doc.number_of_pages != 1 {
4972        return None;
4973    }
4974
4975    let lines = layout_cache.layout_lines(doc)?;
4976    let appendix = detect_layout_appendix_tables_document(lines)?;
4977
4978    let mut output = String::new();
4979    output.push_str("# ");
4980    output.push_str(appendix.title.trim());
4981    output.push_str("\n\n");
4982
4983    for section in appendix.sections {
4984        output.push_str("## ");
4985        output.push_str(section.heading.trim());
4986        output.push_str("\n\n");
4987        output.push_str(&render_pipe_rows(&section.rows));
4988        for note in section.notes {
4989            output.push('*');
4990            output.push_str(&escape_md_line_start(note.trim()));
4991            output.push_str("*\n");
4992        }
4993        output.push('\n');
4994    }
4995
4996    Some(output.trim_end().to_string() + "\n")
4997}
4998
4999#[cfg(not(target_arch = "wasm32"))]
5000#[allow(dead_code)]
5001fn render_layout_dual_table_article_document(doc: &PdfDocument) -> Option<String> {
5002    let mut layout_cache = LayoutSourceCache::default();
5003    render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
5004}
5005
5006#[cfg(not(target_arch = "wasm32"))]
5007fn render_layout_dual_table_article_document_cached(
5008    doc: &PdfDocument,
5009    layout_cache: &mut LayoutSourceCache,
5010) -> Option<String> {
5011    if doc.number_of_pages != 1 {
5012        return None;
5013    }
5014
5015    let lines = layout_cache.layout_lines(doc)?;
5016    let article = detect_layout_dual_table_article(lines)?;
5017
5018    let mut filtered = doc.clone();
5019    filtered.title = None;
5020    let body_start_idx = find_layout_dual_table_article_body_start_idx(doc);
5021    filtered.kids = doc.kids.iter().skip(body_start_idx).cloned().collect();
5022    let body = render_layout_dual_table_article_body(&filtered);
5023
5024    let mut output = String::new();
5025    output.push_str("# ");
5026    output.push_str(article.first_title.trim());
5027    output.push_str("\n\n*");
5028    output.push_str(&escape_md_line_start(article.first_intro.trim()));
5029    output.push_str("*\n\n");
5030    output.push_str(&render_pipe_rows(&article.first_rows));
5031    output.push_str("*Table 6*: ");
5032    output.push_str(&escape_md_line_start(
5033        article
5034            .first_caption
5035            .trim()
5036            .trim_start_matches("Table 6:")
5037            .trim(),
5038    ));
5039    output.push_str("*\n\n---\n\n");
5040    output.push_str("# ");
5041    output.push_str(article.second_title.trim());
5042    output.push_str("\n\n");
5043    output.push_str(&escape_md_line_start(article.second_intro.trim()));
5044    output.push_str("\n\n");
5045    let trimmed_body = body.trim();
5046    if !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*" {
5047        output.push_str(trimmed_body);
5048        output.push('\n');
5049    }
5050
5051    Some(output)
5052}
5053
5054#[cfg(not(target_arch = "wasm32"))]
5055fn detect_layout_dual_table_article(lines: &[String]) -> Option<LayoutDualTableArticle> {
5056    let first_header_idx = lines.iter().position(|line| {
5057        line.contains("H6 (Avg.)")
5058            && line.contains("HellaSwag")
5059            && line.contains("TruthfulQA")
5060            && !line.contains("Merge Method")
5061    })?;
5062    let first_caption_idx = (first_header_idx + 1..lines.len())
5063        .find(|idx| lines[*idx].trim_start().starts_with("Table 6:"))?;
5064    let second_header_idx = (first_caption_idx + 1..lines.len()).find(|idx| {
5065        lines[*idx].contains("Merge Method")
5066            && lines[*idx].contains("H6 (Avg.)")
5067            && lines[*idx].contains("GSM8K")
5068    })?;
5069    let second_caption_idx = (second_header_idx + 1..lines.len())
5070        .find(|idx| lines[*idx].trim_start().starts_with("Table 7:"))?;
5071
5072    let first_rows = parse_layout_anchor_table(lines, first_header_idx, first_caption_idx)?;
5073    if first_rows.len() < 3 {
5074        return None;
5075    }
5076
5077    let first_caption = collect_layout_caption_paragraph(lines, first_caption_idx)?;
5078    let second_intro = collect_layout_caption_paragraph(lines, second_caption_idx)?;
5079    let first_title = first_caption
5080        .split_once(". ")
5081        .map(|(title, _)| title)
5082        .unwrap_or(first_caption.as_str())
5083        .trim()
5084        .to_string();
5085    let second_title = second_intro
5086        .split_once(". ")
5087        .map(|(title, _)| title)
5088        .unwrap_or(second_intro.as_str())
5089        .trim()
5090        .to_string();
5091    let first_intro = first_caption
5092        .trim_start_matches(&first_title)
5093        .trim_start_matches('.')
5094        .trim()
5095        .to_string();
5096    let second_intro = second_intro
5097        .trim_start_matches(&second_title)
5098        .trim_start_matches('.')
5099        .trim()
5100        .to_string();
5101
5102    if first_title.is_empty() || second_title.is_empty() {
5103        return None;
5104    }
5105
5106    Some(LayoutDualTableArticle {
5107        first_title,
5108        first_intro,
5109        first_caption,
5110        first_rows,
5111        second_title,
5112        second_intro,
5113    })
5114}
5115
5116#[cfg(not(target_arch = "wasm32"))]
5117fn find_layout_dual_table_article_body_start_idx(doc: &PdfDocument) -> usize {
5118    let body_markers = [
5119        "tively impacted by adding Synth.",
5120        "Then, we experiment whether merging",
5121        "Ablation on the SFT base models.",
5122        "Ablation on different merge methods.",
5123        "5 Conclusion",
5124    ];
5125    doc.kids
5126        .iter()
5127        .position(|element| {
5128            let text = extract_element_text(element);
5129            let trimmed = text.trim();
5130            body_markers
5131                .iter()
5132                .any(|marker| trimmed.starts_with(marker))
5133        })
5134        .unwrap_or(4.min(doc.kids.len()))
5135}
5136
5137#[cfg(not(target_arch = "wasm32"))]
5138fn render_layout_dual_table_article_body(doc: &PdfDocument) -> String {
5139    let mut output = String::new();
5140    let mut i = 0usize;
5141    while i < doc.kids.len() {
5142        let text = extract_element_text(&doc.kids[i]);
5143        let trimmed = text.trim();
5144        if trimmed.is_empty() {
5145            i += 1;
5146            continue;
5147        }
5148
5149        if trimmed.starts_with("Ablation on the SFT base models.") {
5150            output.push_str("## Ablation on the SFT base models\n\n");
5151            let rest = trimmed
5152                .trim_start_matches("Ablation on the SFT base models.")
5153                .trim();
5154            if !rest.is_empty() {
5155                output.push_str(&escape_md_line_start(rest));
5156                output.push_str("\n\n");
5157            }
5158            i += 1;
5159            continue;
5160        }
5161
5162        if trimmed.starts_with("Ablation on different merge methods.") {
5163            output.push_str("## Ablation on different merge methods\n\n");
5164            let rest = trimmed
5165                .trim_start_matches("Ablation on different merge methods.")
5166                .trim();
5167            if !rest.is_empty() {
5168                output.push_str(&escape_md_line_start(rest));
5169                output.push_str("\n\n");
5170            }
5171            i += 1;
5172            continue;
5173        }
5174
5175        match &doc.kids[i] {
5176            ContentElement::Heading(h) => {
5177                output.push_str("# ");
5178                output.push_str(h.base.base.value().trim());
5179                output.push_str("\n\n");
5180            }
5181            ContentElement::NumberHeading(nh) => {
5182                output.push_str("# ");
5183                output.push_str(nh.base.base.base.value().trim());
5184                output.push_str("\n\n");
5185            }
5186            _ => {
5187                let mut merged = trimmed.to_string();
5188                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
5189                    if next_text.starts_with("Ablation on the SFT base models.")
5190                        || next_text.starts_with("Ablation on different merge methods.")
5191                    {
5192                        break;
5193                    }
5194                    if !should_merge_paragraph_text(&merged, &next_text) {
5195                        break;
5196                    }
5197                    merge_paragraph_text(&mut merged, &next_text);
5198                    i += 1;
5199                }
5200                output.push_str(&escape_md_line_start(&merged));
5201                output.push_str("\n\n");
5202            }
5203        }
5204        i += 1;
5205    }
5206    output
5207}
5208
5209#[cfg(not(target_arch = "wasm32"))]
5210fn parse_layout_anchor_table(
5211    lines: &[String],
5212    header_idx: usize,
5213    stop_idx: usize,
5214) -> Option<Vec<Vec<String>>> {
5215    let header_spans = split_layout_line_spans(&lines[header_idx]);
5216    if header_spans.len() < 4 {
5217        return None;
5218    }
5219    let column_starts = header_spans
5220        .iter()
5221        .map(|(start, _)| *start)
5222        .collect::<Vec<_>>();
5223    let header = header_spans
5224        .into_iter()
5225        .map(|(_, text)| text)
5226        .collect::<Vec<_>>();
5227
5228    let mut rows = vec![header];
5229    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5230        let trimmed = line.trim();
5231        if trimmed.is_empty() || trimmed.starts_with("Table ") {
5232            continue;
5233        }
5234        let spans = split_layout_line_spans(line);
5235        if spans.is_empty() {
5236            continue;
5237        }
5238
5239        let row = assign_layout_spans_to_columns(&spans, &column_starts);
5240        let non_empty = row.iter().filter(|cell| !cell.trim().is_empty()).count();
5241        if non_empty < 2 || row[0].trim().is_empty() {
5242            continue;
5243        }
5244        rows.push(row);
5245    }
5246
5247    Some(rows)
5248}
5249
5250#[cfg(not(target_arch = "wasm32"))]
5251fn assign_layout_spans_to_columns(
5252    spans: &[(usize, String)],
5253    column_starts: &[usize],
5254) -> Vec<String> {
5255    let mut cells = vec![String::new(); column_starts.len()];
5256    for (start, text) in spans {
5257        let Some((col_idx, _)) = column_starts
5258            .iter()
5259            .enumerate()
5260            .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5261        else {
5262            continue;
5263        };
5264        append_cell_text(&mut cells[col_idx], text);
5265    }
5266    cells
5267}
5268
5269#[cfg(not(target_arch = "wasm32"))]
5270#[allow(dead_code)]
5271fn render_layout_titled_dual_table_document(doc: &PdfDocument) -> Option<String> {
5272    let mut layout_cache = LayoutSourceCache::default();
5273    render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
5274}
5275
5276#[cfg(not(target_arch = "wasm32"))]
5277fn render_layout_titled_dual_table_document_cached(
5278    doc: &PdfDocument,
5279    layout_cache: &mut LayoutSourceCache,
5280) -> Option<String> {
5281    if doc.number_of_pages != 1 {
5282        return None;
5283    }
5284
5285    let lines = layout_cache.layout_lines(doc)?;
5286    let report = detect_layout_titled_dual_table_document(lines)?;
5287
5288    let mut output = String::new();
5289    output.push_str("# ");
5290    output.push_str(report.title.trim());
5291    output.push_str("\n\n");
5292
5293    for (idx, section) in report.sections.iter().enumerate() {
5294        output.push_str("## ");
5295        output.push_str(section.heading.trim());
5296        output.push_str("\n\n");
5297        output.push_str(&render_pipe_rows(&section.rows));
5298        if let Some(note) = &section.note {
5299            output.push('*');
5300            output.push_str(&escape_md_line_start(note.trim()));
5301            output.push_str("*\n");
5302        }
5303        if idx + 1 != report.sections.len() {
5304            output.push('\n');
5305        }
5306    }
5307
5308    Some(output.trim_end().to_string() + "\n")
5309}
5310
5311#[cfg(not(target_arch = "wasm32"))]
5312fn detect_layout_titled_dual_table_document(
5313    lines: &[String],
5314) -> Option<LayoutTitledDualTableDocument> {
5315    let title_idx = lines
5316        .iter()
5317        .position(|line| normalize_heading_text(line.trim()) == "jailedfordoingbusiness")?;
5318    let title = lines[title_idx].trim().to_string();
5319
5320    let caption_indices = lines
5321        .iter()
5322        .enumerate()
5323        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5324        .collect::<Vec<_>>();
5325    if caption_indices.len() != 2 {
5326        return None;
5327    }
5328
5329    let mut sections = Vec::new();
5330    for (section_idx, caption_idx) in caption_indices.iter().enumerate() {
5331        let next_caption_idx = caption_indices
5332            .get(section_idx + 1)
5333            .copied()
5334            .unwrap_or(lines.len());
5335
5336        let header_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
5337            let spans = split_layout_line_spans(&lines[*idx]);
5338            (spans.len() == 3 || spans.len() == 4)
5339                && spans
5340                    .iter()
5341                    .all(|(_, text)| text.split_whitespace().count() <= 3)
5342        })?;
5343        let note_idx = (header_idx + 1..next_caption_idx)
5344            .find(|idx| lines[*idx].trim_start().starts_with('*'))
5345            .unwrap_or(next_caption_idx);
5346
5347        let heading = (*caption_idx..header_idx)
5348            .map(|idx| lines[idx].trim())
5349            .filter(|line| !line.is_empty())
5350            .collect::<Vec<_>>()
5351            .join(" ");
5352
5353        let rows = parse_layout_titled_stub_table(lines, header_idx, note_idx)?;
5354        let note = (note_idx < next_caption_idx)
5355            .then(|| {
5356                lines[note_idx]
5357                    .trim()
5358                    .trim_start_matches('*')
5359                    .trim()
5360                    .to_string()
5361            })
5362            .filter(|text| !text.is_empty());
5363
5364        sections.push(LayoutTitledTableSection {
5365            heading,
5366            rows,
5367            note,
5368        });
5369    }
5370
5371    Some(LayoutTitledDualTableDocument { title, sections })
5372}
5373
/// Parses the body of a stub-column table out of layout-preserved lines.
///
/// `header_idx` is the index of the header line; data lines are read from
/// `header_idx + 1` up to (not including) `stop_idx`.
///
/// The header line must split into at least three spans. A synthetic empty
/// stub column starting at character offset 0 is prepended to the header,
/// unless the first real header cell is `Range`, in which case the synthetic
/// column is removed again.
///
/// Returns the header row followed by the data rows, or `None` when the
/// header has fewer than three spans or fewer than three rows (header
/// included) were produced.
#[cfg(not(target_arch = "wasm32"))]
fn parse_layout_titled_stub_table(
    lines: &[String],
    header_idx: usize,
    stop_idx: usize,
) -> Option<Vec<Vec<String>>> {
    let header_spans = split_layout_line_spans(&lines[header_idx]);
    if header_spans.len() < 3 {
        return None;
    }

    // Column 0 is the stub (row label) column; the rest start where the
    // header spans start.
    let mut column_starts = vec![0usize];
    column_starts.extend(header_spans.iter().map(|(start, _)| *start));
    let mut header = vec![String::new()];
    header.extend(header_spans.into_iter().map(|(_, text)| text));

    // When the first header span is "Range" it is treated as the label
    // column itself, so the synthetic stub column is dropped.
    if header[0].trim().is_empty() && header.get(1).is_some_and(|cell| cell.trim() == "Range") {
        header.remove(0);
        column_starts.remove(0);
    }

    let mut rows = vec![header];
    // Stub text seen before the row it belongs to has appeared.
    let mut pending_stub = String::new();
    // Index in `rows` of the most recently pushed data row.
    let mut last_row_idx: Option<usize> = None;

    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
        let spans = split_layout_line_spans(line);
        if spans.is_empty() {
            continue;
        }

        // A line is "stub only" when every span starts left of the first
        // data column and none of the spans looks like a value.
        let first_data_start = column_starts.get(1).copied().unwrap_or(usize::MAX);
        let stub_only_line = spans
            .iter()
            .all(|(start, text)| *start < first_data_start && !looks_like_layout_value(text));
        if stub_only_line {
            let stub_text = spans
                .iter()
                .map(|(_, text)| text.trim())
                .filter(|text| !text.is_empty())
                .collect::<Vec<_>>()
                .join(" ");
            // A short (at most two words) stub with nothing pending is taken
            // as the tail of the previous row's label, provided that row
            // already has data.
            if pending_stub.is_empty() && stub_text.split_whitespace().count() <= 2 {
                if let Some(last_idx) = last_row_idx {
                    if rows[last_idx]
                        .iter()
                        .skip(1)
                        .any(|cell| !cell.trim().is_empty())
                    {
                        append_cell_text(&mut rows[last_idx][0], &stub_text);
                        continue;
                    }
                }
            }
            // Otherwise keep it for the next row that carries values.
            append_cell_text(&mut pending_stub, &stub_text);
            continue;
        }

        let row = assign_layout_spans_to_columns(&spans, &column_starts);
        let row_has_values = row.iter().skip(1).any(|cell| looks_like_layout_value(cell));
        let only_stub =
            !row[0].trim().is_empty() && row.iter().skip(1).all(|cell| cell.trim().is_empty());

        if row_has_values {
            let mut finalized = row;
            // The pending stub is consumed only when the row has no label of
            // its own; otherwise it stays pending for a later row.
            if !pending_stub.is_empty() && finalized[0].trim().is_empty() {
                finalized[0] = pending_stub.clone();
                pending_stub.clear();
            }
            rows.push(finalized);
            last_row_idx = Some(rows.len() - 1);
            continue;
        }

        if only_stub {
            // Label-only row: extend the previous row's label if that row
            // has data, else accumulate it as pending stub text.
            if let Some(last_idx) = last_row_idx {
                if rows[last_idx]
                    .iter()
                    .skip(1)
                    .any(|cell| !cell.trim().is_empty())
                {
                    append_cell_text(&mut rows[last_idx][0], &row[0]);
                    continue;
                }
            }
            append_cell_text(&mut pending_stub, &row[0]);
        }
        // Rows that are neither value rows nor label-only rows are dropped.
    }

    // Require the header plus at least two data rows.
    if rows.len() < 3 {
        return None;
    }

    Some(rows)
}
5469
/// Reports whether `text` contains at least one character typical of a
/// numeric table value: an ASCII digit or one of `%`, `+`, `-`, `,`, `.`.
///
/// Empty and whitespace-only input yields `false`, since no such character
/// can be present.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_layout_value(text: &str) -> bool {
    text.chars()
        .any(|ch| matches!(ch, '0'..='9' | '%' | '+' | '-' | ',' | '.'))
}
5478
5479#[cfg(not(target_arch = "wasm32"))]
5480#[allow(dead_code)]
5481fn render_layout_registration_report_document(doc: &PdfDocument) -> Option<String> {
5482    let mut layout_cache = LayoutSourceCache::default();
5483    render_layout_registration_report_document_cached(doc, &mut layout_cache)
5484}
5485
5486#[cfg(not(target_arch = "wasm32"))]
5487fn render_layout_registration_report_document_cached(
5488    doc: &PdfDocument,
5489    layout_cache: &mut LayoutSourceCache,
5490) -> Option<String> {
5491    if doc.number_of_pages != 1 {
5492        return None;
5493    }
5494
5495    let lines = layout_cache.layout_lines(doc)?;
5496    let report = detect_layout_registration_report_document(lines)?;
5497
5498    let mut output = String::new();
5499    output.push_str("# ");
5500    output.push_str(report.title.trim());
5501    output.push_str("\n\n");
5502    output.push_str(&render_pipe_rows(&report.rows));
5503    Some(output)
5504}
5505
5506#[cfg(not(target_arch = "wasm32"))]
5507fn detect_layout_registration_report_document(
5508    lines: &[String],
5509) -> Option<LayoutRegistrationReportDocument> {
5510    let title_idx = lines.iter().position(|line| {
5511        normalize_heading_text(line.trim()) == "anfrelpreelectionassessmentmissionreport"
5512    })?;
5513    let title = lines[title_idx].trim().to_string();
5514
5515    let first_row_idx = (title_idx + 1..lines.len()).find(|idx| {
5516        lines[*idx].trim_start().starts_with("11") && lines[*idx].contains("Khmer United Party")
5517    })?;
5518    let footer_idx = (first_row_idx + 1..lines.len())
5519        .find(|idx| is_standalone_page_number(lines[*idx].trim()))
5520        .unwrap_or(lines.len());
5521
5522    let data_starts = split_layout_line_spans(&lines[first_row_idx])
5523        .into_iter()
5524        .map(|(start, _)| start)
5525        .collect::<Vec<_>>();
5526    if data_starts.len() != 7 {
5527        return None;
5528    }
5529
5530    let mut rows = vec![
5531        vec![
5532            "No.".to_string(),
5533            "Political party".to_string(),
5534            "Provisional registration result on 7 March".to_string(),
5535            String::new(),
5536            "Official registration result on 29 April".to_string(),
5537            String::new(),
5538            "Difference in the number of candidates".to_string(),
5539        ],
5540        vec![
5541            String::new(),
5542            String::new(),
5543            "Number of commune/ sangkat".to_string(),
5544            "Number of candidates".to_string(),
5545            "Number of commune/ sangkat".to_string(),
5546            "Number of candidates".to_string(),
5547            String::new(),
5548        ],
5549    ];
5550
5551    let mut current_row: Option<Vec<String>> = None;
5552    for line in lines.iter().take(footer_idx).skip(first_row_idx) {
5553        let spans = split_layout_line_spans(line);
5554        if spans.is_empty() {
5555            continue;
5556        }
5557
5558        let cells = assign_layout_spans_to_columns(&spans, &data_starts);
5559        let starts_new_row = (!cells[0].trim().is_empty()
5560            && cells[0].trim().chars().all(|ch| ch.is_ascii_digit()))
5561            || cells[0].trim() == "Total"
5562            || cells[1].trim() == "Total";
5563
5564        if starts_new_row {
5565            if let Some(row) = current_row.take() {
5566                rows.push(row);
5567            }
5568            current_row = Some(cells);
5569            continue;
5570        }
5571
5572        let Some(row) = current_row.as_mut() else {
5573            continue;
5574        };
5575        for (idx, cell) in cells.iter().enumerate() {
5576            if cell.trim().is_empty() {
5577                continue;
5578            }
5579            append_cell_text(&mut row[idx], cell);
5580        }
5581    }
5582
5583    if let Some(row) = current_row.take() {
5584        rows.push(row);
5585    }
5586    if rows.len() < 5 {
5587        return None;
5588    }
5589
5590    Some(LayoutRegistrationReportDocument { title, rows })
5591}
5592
/// Gathers the caption paragraph that begins at or after `start_idx`.
///
/// Leading blank lines are skipped. Once at least one line has been taken,
/// collection stops at the first of:
/// * a blank line,
/// * a line containing both `H6 (Avg.)` and `GSM8K`,
/// * a line starting with `Table ` or `5 ` (or equal to `5 Conclusion`).
///
/// The collected lines are trimmed and joined with single spaces. Returns
/// `None` when nothing was collected.
#[cfg(not(target_arch = "wasm32"))]
fn collect_layout_caption_paragraph(lines: &[String], start_idx: usize) -> Option<String> {
    let mut collected: Vec<&str> = Vec::new();

    for raw_line in lines.iter().skip(start_idx) {
        let text = raw_line.trim();
        let started = !collected.is_empty();

        if text.is_empty() {
            if started {
                break;
            }
            continue;
        }

        // The stop markers only apply after the caption has begun, so the
        // first caption line may itself start with "Table ".
        let hits_table_header = text.contains("H6 (Avg.)") && text.contains("GSM8K");
        let hits_next_block =
            text.starts_with("Table ") || text.starts_with("5 ") || text == "5 Conclusion";
        if started && (hits_table_header || hits_next_block) {
            break;
        }

        collected.push(text);
    }

    // Every collected piece is trimmed and non-empty, so the joined text is
    // blank only when nothing was collected.
    if collected.is_empty() {
        None
    } else {
        Some(collected.join(" "))
    }
}
5620
/// Recognises an "Appendices" page made of several four-column tables in
/// layout-preserved text lines.
///
/// Requirements: a line whose normalized heading text equals `appendices`
/// (used, trimmed, as the title) and at least two lines starting with
/// `TABLE ` after leading whitespace. Each caption opens a candidate section
/// that runs to the next caption (or the end of input).
///
/// Returns `None` when the title is missing, fewer than two captions exist,
/// a section has no four-span data line, or fewer than two sections survive.
/// Sections whose header cells are incomplete or that yield fewer than three
/// rows are skipped rather than failing the whole document.
#[cfg(not(target_arch = "wasm32"))]
fn detect_layout_appendix_tables_document(
    lines: &[String],
) -> Option<LayoutAppendixTablesDocument> {
    let title_idx = lines
        .iter()
        .position(|line| normalize_heading_text(line.trim()) == "appendices")?;
    let title = lines[title_idx].trim().to_string();

    let caption_indices = lines
        .iter()
        .enumerate()
        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
        .collect::<Vec<_>>();
    if caption_indices.len() < 2 {
        return None;
    }

    let mut sections = Vec::new();
    for (pos, caption_idx) in caption_indices.iter().enumerate() {
        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());

        // The caption may wrap: absorb following single-span lines that sit
        // near the left margin, contain no lowercase letters, do not start
        // with a digit and are not a source/exchange-rate note.
        let mut heading_lines = vec![lines[*caption_idx].trim().to_string()];
        let mut cursor = caption_idx + 1;
        while cursor < next_caption_idx {
            let trimmed = lines[cursor].trim();
            if trimmed.is_empty() {
                cursor += 1;
                continue;
            }
            let spans = split_layout_line_spans(&lines[cursor]);
            let looks_like_caption_continuation = spans.len() == 1
                && spans[0].0 <= 4
                && !trimmed.starts_with("Source")
                && !trimmed.starts_with("Sources")
                && !trimmed.starts_with("Exchange rate")
                && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
                && trimmed
                    .chars()
                    .all(|ch| !ch.is_alphabetic() || ch.is_uppercase());
            if !looks_like_caption_continuation {
                break;
            }
            heading_lines.push(trimmed.to_string());
            cursor += 1;
        }

        // First line with exactly four spans that is not a note line; its
        // span offsets define the column positions for this table.
        let data_start = (*caption_idx + 1..next_caption_idx).find(|idx| {
            let trimmed = lines[*idx].trim();
            !trimmed.is_empty()
                && !trimmed.starts_with("Source")
                && !trimmed.starts_with("Sources")
                && !trimmed.starts_with("Exchange rate")
                && split_layout_line_spans(&lines[*idx]).len() == 4
        })?;

        // Notes begin at the first source/exchange-rate line after the data
        // starts; table data ends there (or at the next caption).
        let note_start = (data_start..next_caption_idx).find(|idx| {
            let trimmed = lines[*idx].trim();
            trimmed.starts_with("Source")
                || trimmed.starts_with("Sources")
                || trimmed.starts_with("Exchange rate")
        });
        let data_end = note_start.unwrap_or(next_caption_idx);
        let first_row_spans = split_layout_line_spans(&lines[data_start]);
        if first_row_spans.len() != 4 {
            return None;
        }
        let column_starts = first_row_spans
            .iter()
            .map(|(start, _)| *start)
            .collect::<Vec<_>>();

        // Header text lives between the end of the caption and the first
        // data line; each span goes to the column whose start is nearest.
        let mut header_cells = vec![String::new(); column_starts.len()];
        for line in lines.iter().take(data_start).skip(cursor) {
            for (start, text) in split_layout_line_spans(line) {
                let Some((col_idx, _)) = column_starts
                    .iter()
                    .enumerate()
                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
                else {
                    continue;
                };
                append_cell_text(&mut header_cells[col_idx], &text);
            }
        }
        // Skip this section if any column ended up without a header.
        if header_cells.iter().any(|cell| cell.trim().is_empty()) {
            continue;
        }

        // Only lines with exactly four spans become data rows, and only if
        // every column received text.
        let mut rows = vec![header_cells];
        for line in lines.iter().take(data_end).skip(data_start) {
            let spans = split_layout_line_spans(line);
            if spans.len() != 4 {
                continue;
            }
            let mut row = vec![String::new(); column_starts.len()];
            for (start, text) in spans {
                let Some((col_idx, _)) = column_starts
                    .iter()
                    .enumerate()
                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
                else {
                    continue;
                };
                append_cell_text(&mut row[col_idx], &text);
            }
            if row.iter().all(|cell| !cell.trim().is_empty()) {
                rows.push(row);
            }
        }
        // Require the header plus at least two data rows.
        if rows.len() < 3 {
            continue;
        }

        // Note lines run from `note_start` to the next caption; blank lines,
        // all-digit lines and standalone page numbers are dropped.
        let notes = lines
            .iter()
            .take(next_caption_idx)
            .skip(note_start.unwrap_or(next_caption_idx))
            .map(|line| line.trim())
            .filter(|line| {
                !line.is_empty()
                    && !line.chars().all(|ch| ch.is_ascii_digit())
                    && !is_standalone_page_number(line)
            })
            .map(str::to_string)
            .collect::<Vec<_>>();

        sections.push(LayoutAppendixTableSection {
            heading: heading_lines.join(" "),
            rows,
            notes,
        });
    }

    (sections.len() >= 2).then_some(LayoutAppendixTablesDocument { title, sections })
}
5757
/// Runs the external `pdftotext` tool with `-layout` on `path`, writing to
/// standard output (`-`), and returns the captured output split into lines.
///
/// Returns `None` when the process cannot be spawned or exits with a
/// non-success status. Invalid UTF-8 in the output is replaced lossily.
#[cfg(not(target_arch = "wasm32"))]
fn read_pdftotext_layout_lines(path: &Path) -> Option<Vec<String>> {
    let finished = Command::new("pdftotext")
        .arg("-layout")
        .arg(path)
        .arg("-")
        .output()
        .ok()?;

    finished.status.success().then(|| {
        String::from_utf8_lossy(&finished.stdout)
            .lines()
            .map(str::to_owned)
            .collect()
    })
}
5776
5777#[cfg(not(target_arch = "wasm32"))]
5778fn find_layout_header_candidate(lines: &[String]) -> Option<LayoutHeaderCandidate> {
5779    lines.iter().enumerate().find_map(|(line_idx, line)| {
5780        let spans = split_layout_line_spans(line);
5781        if spans.len() != 4 {
5782            return None;
5783        }
5784        let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5785        let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5786        let short_headers = headers
5787            .iter()
5788            .all(|text| text.split_whitespace().count() <= 3 && text.len() <= 24);
5789        let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 6);
5790        (short_headers && increasing).then_some(LayoutHeaderCandidate {
5791            line_idx,
5792            headers,
5793            starts,
5794        })
5795    })
5796}
5797
5798#[cfg(not(target_arch = "wasm32"))]
5799fn find_layout_panel_header_candidate(lines: &[String]) -> Option<LayoutPanelHeaderCandidate> {
5800    lines.iter().enumerate().find_map(|(line_idx, line)| {
5801        let spans = split_layout_line_spans(line);
5802        if spans.len() != 3 {
5803            return None;
5804        }
5805
5806        let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5807        let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5808        let header_like = headers
5809            .iter()
5810            .all(|text| text.split_whitespace().count() <= 4 && text.len() <= 32);
5811        let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 16);
5812        (header_like && increasing).then_some(LayoutPanelHeaderCandidate {
5813            line_idx,
5814            headers,
5815            starts,
5816        })
5817    })
5818}
5819
/// Splits a layout-preserved text line into column spans.
///
/// A span is a run of text in which words are separated by single whitespace
/// characters only; two or more consecutive whitespace characters end the
/// span. Each result is `(start, text)`, where `start` is the character (not
/// byte) offset of the span's first character within `line` and `text` is
/// the span with surrounding whitespace trimmed.
///
/// Blank input yields an empty vector.
#[cfg(not(target_arch = "wasm32"))]
fn split_layout_line_spans(line: &str) -> Vec<(usize, String)> {
    let chars = line.chars().collect::<Vec<_>>();
    let mut spans = Vec::new();
    let mut idx = 0usize;
    while idx < chars.len() {
        // Skip the whitespace that separates spans.
        while idx < chars.len() && chars[idx].is_whitespace() {
            idx += 1;
        }
        if idx >= chars.len() {
            break;
        }

        let start = idx;
        let mut end = idx;
        let mut gap = 0usize;
        while end < chars.len() {
            if chars[end].is_whitespace() {
                gap += 1;
                if gap >= 2 {
                    // `end` now sits on the second whitespace of the gap.
                    break;
                }
            } else {
                gap = 0;
            }
            end += 1;
        }

        // `chars[start]` is non-whitespace, so the trimmed text is never
        // empty. At most one trailing whitespace character is included here
        // and removed by `trim`.
        let text = chars[start..end].iter().collect::<String>();
        spans.push((start, text.trim().to_string()));

        // Resume at `end`; the whitespace skip above consumes whatever is
        // left of the gap. Advancing by `end + gap` would step over the first
        // character of the next span when the gap is exactly two wide.
        idx = end;
    }
    spans
}
5855
/// Returns the characters of `line` in the half-open character range
/// `start..end`, with surrounding whitespace trimmed.
///
/// Offsets count characters, not bytes. A range that is empty, inverted, or
/// lies beyond the end of the line produces an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn slice_layout_column_text(line: &str, start: usize, end: usize) -> String {
    let width = end.saturating_sub(start);
    let column: String = line.chars().skip(start).take(width).collect();
    column.trim().to_owned()
}
5865
5866#[cfg(not(target_arch = "wasm32"))]
5867fn extract_layout_entries(lines: &[String], header: &LayoutHeaderCandidate) -> Vec<LayoutEntry> {
5868    let mut entries = Vec::new();
5869    let mut next_starts = header.starts.iter().copied().skip(1).collect::<Vec<_>>();
5870    next_starts.push(usize::MAX);
5871
5872    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5873        if line.contains('\u{c}') {
5874            break;
5875        }
5876        let cells = header
5877            .starts
5878            .iter()
5879            .copied()
5880            .zip(next_starts.iter().copied())
5881            .map(|(start, next_start)| {
5882                let char_count = line.chars().count();
5883                if start >= char_count {
5884                    String::new()
5885                } else {
5886                    let end = next_start.min(char_count);
5887                    normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5888                }
5889            })
5890            .collect::<Vec<_>>();
5891        if cells.iter().any(|cell| !cell.is_empty()) {
5892            entries.push(LayoutEntry { line_idx, cells });
5893        }
5894    }
5895
5896    entries
5897}
5898
/// Builds table rows for a three-panel layout that has a stub (row label)
/// column on the left.
///
/// The three body column offsets come from
/// [`infer_layout_panel_body_starts`]; a stub column starting at offset 0 is
/// added in front, giving four cells per row. Lines after the header are
/// sliced at those offsets until a form feed (`\u{c}`) is met.
///
/// Lines whose first span starts near the left margin and is short are
/// "anchors": each anchor seeds one output row. Every sliced line is then
/// merged into the row of the anchor nearest to it by line index.
///
/// Returns `None` when the body columns cannot be inferred or fewer than two
/// anchors are found. Rows whose three body cells are all blank after
/// normalization are dropped.
#[cfg(not(target_arch = "wasm32"))]
fn build_layout_panel_stub_rows(
    lines: &[String],
    header: &LayoutPanelHeaderCandidate,
) -> Option<Vec<Vec<String>>> {
    let body_starts = infer_layout_panel_body_starts(lines, header)?;
    // Column layout: stub at 0, then the three inferred body columns. Each
    // column ends where the next begins; the last one is unbounded.
    let mut starts = vec![0usize];
    starts.extend(body_starts.iter().copied());
    let mut next_starts = starts.iter().copied().skip(1).collect::<Vec<_>>();
    next_starts.push(usize::MAX);

    let mut entries = Vec::<LayoutEntry>::new();
    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
        if line.contains('\u{c}') {
            break;
        }
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        // Skip lines that consist solely of up to four ASCII digits
        // (presumably page numbers).
        if trimmed.chars().all(|ch| ch.is_ascii_digit()) && trimmed.len() <= 4 {
            continue;
        }

        let cells = starts
            .iter()
            .copied()
            .zip(next_starts.iter().copied())
            .map(|(start, next_start)| {
                let char_count = line.chars().count();
                if start >= char_count {
                    String::new()
                } else {
                    let end = next_start.min(char_count);
                    normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
                }
            })
            .collect::<Vec<_>>();
        if cells.iter().any(|cell| !cell.is_empty()) {
            entries.push(LayoutEntry { line_idx, cells });
        }
    }

    // An anchor's first span must start within the left half of the stub
    // column (at least 6 characters are always allowed) and be short: at
    // most three words and 24 bytes.
    let stub_threshold = body_starts[0].saturating_div(2).max(6);
    let anchor_indices = entries
        .iter()
        .filter(|entry| {
            let spans = split_layout_line_spans(&lines[entry.line_idx]);
            spans.first().is_some_and(|(start, text)| {
                *start <= stub_threshold
                    && !text.trim().is_empty()
                    && text.split_whitespace().count() <= 3
                    && text.len() <= 24
            })
        })
        .map(|entry| entry.line_idx)
        .collect::<Vec<_>>();
    if anchor_indices.len() < 2 {
        return None;
    }

    // Seed one row per anchor with the anchor line's stub cell only; the
    // remaining cells are filled by the merge loop below.
    let mut rows = anchor_indices
        .iter()
        .map(|line_idx| {
            let anchor = entries
                .iter()
                .find(|entry| entry.line_idx == *line_idx)
                .expect("anchor index should exist");
            let mut row = vec![String::new(); anchor.cells.len()];
            row[0] = anchor.cells[0].clone();
            row
        })
        .collect::<Vec<_>>();

    for entry in entries {
        // Nearest anchor by line distance; on a tie `min_by_key` keeps the
        // earlier anchor.
        let row_idx = anchor_indices
            .iter()
            .enumerate()
            .min_by_key(|(_, anchor_idx)| anchor_idx.abs_diff(entry.line_idx))
            .map(|(idx, _)| idx)?;

        for col_idx in 0..rows[row_idx].len().min(entry.cells.len()) {
            // The anchor line's stub cell already seeded the row.
            if col_idx == 0 && anchor_indices[row_idx] == entry.line_idx {
                continue;
            }
            append_cell_text(&mut rows[row_idx][col_idx], &entry.cells[col_idx]);
        }
    }

    let normalized_rows = rows
        .into_iter()
        .map(|mut row| {
            row[0] = normalize_layout_stage_text(&row[0]);
            row[1] = normalize_layout_body_text(&row[1]);
            row[2] = normalize_layout_body_text(&row[2]);
            row[3] = normalize_layout_body_text(&row[3]);
            row
        })
        // Keep only rows with text in at least one body column.
        .filter(|row| row.iter().skip(1).any(|cell| !cell.trim().is_empty()))
        .collect::<Vec<_>>();
    Some(normalized_rows)
}
6001
6002#[cfg(not(target_arch = "wasm32"))]
6003fn infer_layout_panel_body_starts(
6004    lines: &[String],
6005    header: &LayoutPanelHeaderCandidate,
6006) -> Option<Vec<usize>> {
6007    let mut candidates = Vec::<[usize; 3]>::new();
6008    for line in lines.iter().skip(header.line_idx + 1) {
6009        if line.contains('\u{c}') {
6010            break;
6011        }
6012        let spans = split_layout_line_spans(line);
6013        if spans.len() < 2 {
6014            continue;
6015        }
6016
6017        let last_three = spans
6018            .iter()
6019            .rev()
6020            .take(3)
6021            .map(|(start, _)| *start)
6022            .collect::<Vec<_>>();
6023        if last_three.len() != 3 {
6024            continue;
6025        }
6026
6027        let mut starts = last_three;
6028        starts.reverse();
6029        if starts[0] >= header.starts[0] {
6030            continue;
6031        }
6032        if !(starts[0] < starts[1] && starts[1] < starts[2]) {
6033            continue;
6034        }
6035        candidates.push([starts[0], starts[1], starts[2]]);
6036    }
6037
6038    if candidates.len() < 3 {
6039        return None;
6040    }
6041
6042    Some(
6043        (0..3)
6044            .map(|col_idx| {
6045                candidates
6046                    .iter()
6047                    .map(|starts| starts[col_idx])
6048                    .min()
6049                    .unwrap_or(0)
6050            })
6051            .collect(),
6052    )
6053}
6054
/// Groups sliced layout entries into table rows around "anchor" lines.
///
/// Pass 1 walks the entries whose second cell is non-empty. Such an entry
/// either extends the previous anchor row (when it looks like a wrapped
/// continuation, see the inline conditions) or starts a new anchor row.
/// Pass 2 merges every remaining entry into the anchor row before or after
/// it, chosen by the heuristics below. Finally cells 0 and 1 are normalized
/// as stage text and cells 2 and 3 as body text.
///
/// `raw_lines` is only consulted to check whether the line preceding an
/// entry is blank.
///
/// Returns `None` when pass 1 yields fewer than four rows.
#[cfg(not(target_arch = "wasm32"))]
fn build_layout_anchor_rows(
    raw_lines: &[String],
    entries: &[LayoutEntry],
) -> Option<Vec<Vec<String>>> {
    let mut rows = Vec::<LayoutAnchorRow>::new();
    // Line indices already consumed by pass 1 (anchors and continuations).
    let mut anchor_members = Vec::<usize>::new();

    for entry in entries {
        // Only entries with text in the second column take part in pass 1.
        if entry.cells.get(1).is_none_or(|cell| cell.is_empty()) {
            continue;
        }

        if let Some(previous) = rows.last_mut() {
            let distance = entry.line_idx.saturating_sub(previous.last_anchor_idx);
            let stage_empty = entry.cells.first().is_none_or(|cell| cell.is_empty());
            let body_empty = entry
                .cells
                .iter()
                .skip(2)
                .all(|cell| cell.trim().is_empty());
            // No first-column text, within two lines of the previous row's
            // last line, and the previous row has first-column text: merge
            // all cells into the previous row.
            if stage_empty && distance <= 2 && !previous.cells[0].trim().is_empty() {
                merge_layout_row_cells(&mut previous.cells, &entry.cells);
                previous.last_anchor_idx = entry.line_idx;
                anchor_members.push(entry.line_idx);
                continue;
            }
            // Only the second column has text and the line is within three
            // lines: append just that cell to the previous row.
            if stage_empty && body_empty && distance <= 3 {
                append_cell_text(&mut previous.cells[1], &entry.cells[1]);
                previous.last_anchor_idx = entry.line_idx;
                anchor_members.push(entry.line_idx);
                continue;
            }
        }

        rows.push(LayoutAnchorRow {
            anchor_idx: entry.line_idx,
            last_anchor_idx: entry.line_idx,
            cells: entry.cells.clone(),
        });
        anchor_members.push(entry.line_idx);
    }

    if rows.len() < 4 {
        return None;
    }

    let anchor_indices = rows.iter().map(|row| row.anchor_idx).collect::<Vec<_>>();

    for entry in entries {
        if anchor_members.contains(&entry.line_idx) {
            continue;
        }

        // `next_pos` is the first anchor below this entry, `prev_pos` the
        // one just before it (or the last row when no anchor follows).
        let next_pos = anchor_indices
            .iter()
            .position(|anchor| *anchor > entry.line_idx);
        let prev_pos = next_pos
            .map(|pos| pos.saturating_sub(1))
            .unwrap_or(rows.len().saturating_sub(1));

        let target = if let Some(next_pos) = next_pos {
            let previous_line_blank = entry
                .line_idx
                .checked_sub(1)
                .and_then(|idx| raw_lines.get(idx))
                .is_some_and(|line| line.trim().is_empty());
            let filled_slots = entry
                .cells
                .iter()
                .enumerate()
                .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
                .collect::<Vec<_>>();
            let prev_stage_empty = rows[prev_pos].cells[0].trim().is_empty();
            let next_stage_empty = rows[next_pos].cells[0].trim().is_empty();

            // Attach to the following row when the entry sits directly above
            // the next anchor and either follows a blank line or fills only
            // the last column while the previous row already has text there.
            if (previous_line_blank && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1)
                || (filled_slots == [3]
                    && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1
                    && !rows[prev_pos].cells[3].trim().is_empty())
            {
                next_pos
            } else if prev_stage_empty && next_stage_empty {
                // Neither neighbour has first-column text: pick the closer
                // anchor, preferring the previous one on a tie.
                let next_distance = anchor_indices[next_pos].abs_diff(entry.line_idx);
                let prev_distance = anchor_indices[prev_pos].abs_diff(entry.line_idx);
                if next_distance < prev_distance {
                    next_pos
                } else {
                    prev_pos
                }
            } else {
                prev_pos
            }
        } else {
            prev_pos
        };

        merge_layout_row_cells(&mut rows[target].cells, &entry.cells);
    }

    let normalized_rows = rows
        .into_iter()
        .map(|mut row| {
            row.cells[0] = normalize_layout_stage_text(&row.cells[0]);
            row.cells[1] = normalize_layout_stage_text(&row.cells[1]);
            row.cells[2] = normalize_layout_body_text(&row.cells[2]);
            row.cells[3] = normalize_layout_body_text(&row.cells[3]);
            row.cells
        })
        .collect::<Vec<_>>();

    Some(normalized_rows)
}
6168
6169#[cfg(not(target_arch = "wasm32"))]
6170fn merge_layout_row_cells(target: &mut [String], source: &[String]) {
6171    for (target_cell, source_cell) in target.iter_mut().zip(source.iter()) {
6172        append_cell_text(target_cell, source_cell);
6173    }
6174}
6175
/// Normalizes the raw text of a sliced layout cell by delegating to
/// `collapse_inline_whitespace`.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_matrix_text(text: &str) -> String {
    collapse_inline_whitespace(text)
}
6180
6181#[cfg(not(target_arch = "wasm32"))]
6182fn normalize_layout_stage_text(text: &str) -> String {
6183    collapse_inline_whitespace(text)
6184}
6185
6186#[cfg(not(target_arch = "wasm32"))]
6187fn normalize_layout_body_text(text: &str) -> String {
6188    let tokens = text
6189        .split_whitespace()
6190        .filter(|token| {
6191            let bare = token.trim_matches(|ch: char| !ch.is_alphanumeric());
6192            !(bare.len() == 1 && bare.chars().all(|ch| ch.is_ascii_digit()))
6193        })
6194        .collect::<Vec<_>>();
6195    if tokens.is_empty() {
6196        return String::new();
6197    }
6198    collapse_inline_whitespace(&tokens.join(" "))
6199}
6200
6201fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
6202    for (idx, element) in doc.kids.iter().enumerate().take(8) {
6203        match element {
6204            ContentElement::Heading(h) => {
6205                let text = h.base.base.value();
6206                let trimmed = text.trim();
6207                if !trimmed.is_empty() {
6208                    return Some(trimmed.to_string());
6209                }
6210            }
6211            ContentElement::NumberHeading(nh) => {
6212                let text = nh.base.base.base.value();
6213                let trimmed = text.trim();
6214                if !trimmed.is_empty() {
6215                    return Some(trimmed.to_string());
6216                }
6217            }
6218            ContentElement::Paragraph(p) => {
6219                let text = clean_paragraph_text(&p.base.value());
6220                let trimmed = text.trim();
6221                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6222                    return Some(trimmed.to_string());
6223                }
6224            }
6225            ContentElement::TextBlock(tb) => {
6226                let text = clean_paragraph_text(&tb.value());
6227                let trimmed = text.trim();
6228                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6229                    return Some(trimmed.to_string());
6230                }
6231            }
6232            ContentElement::TextLine(tl) => {
6233                let text = clean_paragraph_text(&tl.value());
6234                let trimmed = text.trim();
6235                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6236                    return Some(trimmed.to_string());
6237                }
6238            }
6239            _ => {}
6240        }
6241    }
6242    None
6243}
6244
6245fn equivalent_heading_text(left: &str, right: &str) -> bool {
6246    normalize_heading_text(left) == normalize_heading_text(right)
6247}
6248
/// Produces a comparison key for heading text: every non-alphanumeric
/// character (whitespace, punctuation, symbols) is removed and the remaining
/// characters are lowercased using Unicode rules.
fn normalize_heading_text(text: &str) -> String {
    let mut key = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // `to_lowercase` may expand to more than one char.
            key.extend(ch.to_lowercase());
        }
    }
    key
}
6255
6256fn looks_like_contents_document(doc: &PdfDocument) -> bool {
6257    let Some(first) = first_heading_like_text(doc) else {
6258        return false;
6259    };
6260    if !matches!(
6261        normalize_heading_text(&first).as_str(),
6262        "contents" | "tableofcontents"
6263    ) {
6264        return false;
6265    }
6266
6267    let lines = collect_plain_lines(doc);
6268    if lines.len() < 8 {
6269        return false;
6270    }
6271
6272    let page_like = lines
6273        .iter()
6274        .skip(1)
6275        .filter(|line| ends_with_page_marker(line))
6276        .count();
6277    page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
6278}
6279
6280fn render_contents_document(doc: &PdfDocument) -> String {
6281    render_toc_lines(&collect_plain_lines(doc), true)
6282}
6283
6284fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
6285    let lines = collect_plain_lines(doc);
6286    if lines.len() < 8 {
6287        return false;
6288    }
6289
6290    let page_like = lines
6291        .iter()
6292        .filter(|line| ends_with_page_marker(line))
6293        .count();
6294    let support_like = lines
6295        .iter()
6296        .filter(|line| looks_like_toc_support_heading(line))
6297        .count();
6298
6299    page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
6300}
6301
6302fn render_compact_toc_document(doc: &PdfDocument) -> String {
6303    render_toc_lines(&collect_plain_lines(doc), false)
6304}
6305
6306fn render_toc_lines(lines: &[String], has_contents_title: bool) -> String {
6307    let mut out = String::new();
6308    let mut iter = lines.iter();
6309
6310    if has_contents_title {
6311        if let Some(first) = iter.next() {
6312            let trimmed = first.trim();
6313            if !trimmed.is_empty() {
6314                push_toc_heading(&mut out, 1, trimmed);
6315            }
6316        }
6317    }
6318
6319    for line in iter {
6320        let trimmed = line.trim();
6321        if trimmed.is_empty() {
6322            continue;
6323        }
6324
6325        if let Some(level) = toc_heading_level(trimmed, has_contents_title) {
6326            push_toc_heading(&mut out, level, strip_trailing_page_number(trimmed));
6327            continue;
6328        }
6329
6330        if should_render_toc_line_as_bullet(trimmed, has_contents_title) {
6331            out.push_str("- ");
6332            out.push_str(&escape_md_line_start(trimmed));
6333            out.push('\n');
6334            continue;
6335        }
6336
6337        if !out.ends_with("\n\n") && !out.is_empty() {
6338            out.push('\n');
6339        }
6340        out.push_str(&escape_md_line_start(trimmed));
6341        out.push_str("\n\n");
6342    }
6343
6344    out.push('\n');
6345    out
6346}
6347
6348fn toc_heading_level(text: &str, has_contents_title: bool) -> Option<usize> {
6349    let trimmed = strip_trailing_page_number(text).trim();
6350    let lower = trimmed.to_ascii_lowercase();
6351
6352    if has_contents_title {
6353        if lower.starts_with("part ")
6354            || lower.starts_with("chapter ")
6355            || lower.starts_with("appendix ")
6356        {
6357            return Some(2);
6358        }
6359        return None;
6360    }
6361
6362    if lower.starts_with("part ") || lower.starts_with("chapter ") || lower.starts_with("appendix ")
6363    {
6364        return Some(1);
6365    }
6366    if lower.starts_with("section ") {
6367        return Some(2);
6368    }
6369    None
6370}
6371
6372fn should_render_toc_line_as_bullet(text: &str, has_contents_title: bool) -> bool {
6373    has_contents_title && ends_with_page_marker(text) && toc_heading_level(text, true).is_none()
6374}
6375
/// Appends an ATX heading (`level` `#` characters, a space, the trimmed text
/// and a blank line) to `out`.
///
/// Blank text is ignored. If `out` already has content that is not followed by
/// a blank line, one newline is inserted first to separate the heading.
fn push_toc_heading(out: &mut String, level: usize, text: &str) {
    let title = text.trim();
    if title.is_empty() {
        return;
    }

    let needs_gap = !(out.is_empty() || out.ends_with("\n\n"));
    if needs_gap {
        out.push('\n');
    }

    out.extend(std::iter::repeat('#').take(level));
    out.push(' ');
    out.push_str(title);
    out.push_str("\n\n");
}
6390
6391fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
6392    let mut lines = Vec::new();
6393    for element in &doc.kids {
6394        match element {
6395            ContentElement::Heading(h) => {
6396                let text = clean_paragraph_text(&h.base.base.value());
6397                if !text.trim().is_empty() {
6398                    lines.push(text);
6399                }
6400            }
6401            ContentElement::NumberHeading(nh) => {
6402                let text = clean_paragraph_text(&nh.base.base.base.value());
6403                if !text.trim().is_empty() {
6404                    lines.push(text);
6405                }
6406            }
6407            ContentElement::Paragraph(p) => {
6408                let text = clean_paragraph_text(&p.base.value());
6409                if !text.trim().is_empty() {
6410                    lines.push(text);
6411                }
6412            }
6413            ContentElement::TextBlock(tb) => {
6414                let text = clean_paragraph_text(&tb.value());
6415                if !text.trim().is_empty() {
6416                    lines.push(text);
6417                }
6418            }
6419            ContentElement::TextLine(tl) => {
6420                let text = clean_paragraph_text(&tl.value());
6421                if !text.trim().is_empty() {
6422                    lines.push(text);
6423                }
6424            }
6425            ContentElement::List(list) => {
6426                for item in &list.list_items {
6427                    let label = token_rows_text(&item.label.content);
6428                    let body = token_rows_text(&item.body.content);
6429                    let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
6430                        format!("{} {}", label.trim(), body.trim())
6431                    } else if !body.trim().is_empty() {
6432                        body.trim().to_string()
6433                    } else if !label.trim().is_empty() {
6434                        label.trim().to_string()
6435                    } else {
6436                        list_item_text_from_contents(&item.contents)
6437                            .trim()
6438                            .to_string()
6439                    };
6440                    if !combined.trim().is_empty() {
6441                        lines.push(combined);
6442                    }
6443                }
6444            }
6445            ContentElement::Table(table) => {
6446                extend_contents_lines_from_rows(
6447                    &mut lines,
6448                    collect_rendered_table_rows(
6449                        &table.table_border.rows,
6450                        table.table_border.num_columns,
6451                    ),
6452                );
6453            }
6454            ContentElement::TableBorder(table) => {
6455                extend_contents_lines_from_rows(
6456                    &mut lines,
6457                    collect_rendered_table_rows(&table.rows, table.num_columns),
6458                );
6459            }
6460            _ => {}
6461        }
6462    }
6463    lines
6464}
6465
6466fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
6467    if rows.is_empty() {
6468        return;
6469    }
6470
6471    if is_toc_table(&rows) {
6472        for row in &rows {
6473            let title = row.first().map(|s| s.trim()).unwrap_or("");
6474            let page = row.get(1).map(|s| s.trim()).unwrap_or("");
6475            let combined = if !title.is_empty() && !page.is_empty() {
6476                format!("{title} {page}")
6477            } else {
6478                format!("{title}{page}")
6479            };
6480            if !combined.trim().is_empty() {
6481                lines.push(combined);
6482            }
6483        }
6484    } else {
6485        // Non-TOC table in a contents document: concatenate cell text as a line.
6486        for row in &rows {
6487            let combined: String = row
6488                .iter()
6489                .map(|c| c.trim())
6490                .filter(|c| !c.is_empty())
6491                .collect::<Vec<_>>()
6492                .join(" ");
6493            if !combined.is_empty() {
6494                lines.push(combined);
6495            }
6496        }
6497    }
6498}
6499
6500fn collect_rendered_table_rows(
6501    rows: &[crate::models::table::TableBorderRow],
6502    num_cols: usize,
6503) -> Vec<Vec<String>> {
6504    let num_cols = num_cols.max(1);
6505    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
6506
6507    for row in rows {
6508        let cell_texts: Vec<String> = (0..num_cols)
6509            .map(|col| {
6510                row.cells
6511                    .iter()
6512                    .find(|c| c.col_number == col)
6513                    .map(cell_text_content)
6514                    .unwrap_or_default()
6515            })
6516            .collect();
6517        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
6518            rendered_rows.push(cell_texts);
6519        }
6520    }
6521
6522    rendered_rows
6523}
6524
6525fn ends_with_page_marker(text: &str) -> bool {
6526    text.split_whitespace()
6527        .last()
6528        .is_some_and(is_page_number_like)
6529}
6530
6531fn looks_like_toc_support_heading(text: &str) -> bool {
6532    let trimmed = text.trim();
6533    if trimmed.is_empty() || ends_with_page_marker(trimmed) {
6534        return false;
6535    }
6536    if trimmed.ends_with(['.', ';', ':', '?', '!']) {
6537        return false;
6538    }
6539
6540    let lower = trimmed.to_ascii_lowercase();
6541    if !(lower.starts_with("part ")
6542        || lower.starts_with("chapter ")
6543        || lower.starts_with("appendix ")
6544        || lower.starts_with("section "))
6545    {
6546        return false;
6547    }
6548
6549    let word_count = trimmed.split_whitespace().count();
6550    (2..=16).contains(&word_count) && trimmed.chars().any(char::is_alphabetic)
6551}
6552
6553fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
6554    if !starts_with_caption_prefix(text) || !text.contains("(credit") {
6555        return None;
6556    }
6557
6558    for needle in [") ", ". "] {
6559        let mut search_start = 0usize;
6560        while let Some(rel_idx) = text[search_start..].find(needle) {
6561            let boundary = search_start + rel_idx + needle.len() - 1;
6562            let head = text[..=boundary].trim();
6563            let tail = text[boundary + 1..].trim_start();
6564            search_start = boundary + 1;
6565            if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
6566                continue;
6567            }
6568            if tail.split_whitespace().count() < 10 {
6569                continue;
6570            }
6571            if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
6572                continue;
6573            }
6574            return Some((head, tail));
6575        }
6576    }
6577
6578    None
6579}
6580
6581fn is_short_caption_label(text: &str) -> bool {
6582    if !starts_with_caption_prefix(text) {
6583        return false;
6584    }
6585
6586    let trimmed = text.trim();
6587    trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24 && !trimmed.ends_with(['.', ':'])
6588}
6589
6590fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
6591    let trimmed = text.trim();
6592    if trimmed.is_empty()
6593        || starts_with_caption_prefix(trimmed)
6594        || !starts_with_uppercase_word(trimmed)
6595    {
6596        return None;
6597    }
6598
6599    for starter in [
6600        " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
6601    ] {
6602        if let Some(idx) = text.find(starter) {
6603            let head = text[..idx].trim();
6604            let tail = text[idx + 1..].trim();
6605            if head.split_whitespace().count() >= 3
6606                && head.split_whitespace().count() <= 24
6607                && tail.split_whitespace().count() >= 8
6608            {
6609                return Some((head, tail));
6610            }
6611        }
6612    }
6613
6614    None
6615}
6616
6617fn looks_like_caption_tail(text: &str) -> bool {
6618    let trimmed = text.trim();
6619    if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
6620        return false;
6621    }
6622
6623    let word_count = trimmed.split_whitespace().count();
6624    if !(3..=18).contains(&word_count) {
6625        return false;
6626    }
6627
6628    starts_with_uppercase_word(trimmed)
6629        && !starts_with_caption_prefix(trimmed)
6630        && !trimmed.contains(':')
6631}
6632
/// Reports whether the trimmed text is exactly four ASCII digits (e.g. `2021`).
fn looks_like_caption_year(text: &str) -> bool {
    let bytes = text.trim().as_bytes();
    bytes.len() == 4 && bytes.iter().all(u8::is_ascii_digit)
}
6637
6638/// Extract text from table token rows.
6639fn token_rows_text(rows: &[TableTokenRow]) -> String {
6640    normalize_common_ocr_text(&repair_fragmented_words(
6641        &rows
6642            .iter()
6643            .flat_map(|row| row.iter())
6644            .map(|token| token.base.value.as_str())
6645            .collect::<Vec<_>>()
6646            .join(" "),
6647    ))
6648}
6649
6650fn render_element(out: &mut String, element: &ContentElement) {
6651    match element {
6652        ContentElement::Heading(h) => {
6653            let text = h.base.base.value();
6654            let trimmed = text.trim();
6655            if should_skip_heading_text(trimmed) {
6656                return;
6657            }
6658            out.push_str(&format!("# {}\n\n", trimmed));
6659        }
6660        ContentElement::Paragraph(p) => {
6661            let text = p.base.value();
6662            let trimmed = clean_paragraph_text(&text);
6663            if !trimmed.is_empty() {
6664                out.push_str(&escape_md_line_start(&trimmed));
6665                if p.base.semantic_type == SemanticType::TableOfContent {
6666                    out.push('\n');
6667                } else {
6668                    out.push_str("\n\n");
6669                }
6670            }
6671        }
6672        ContentElement::List(list) => {
6673            let mut i = 0usize;
6674            let mut pending_item: Option<String> = None;
6675            while i < list.list_items.len() {
6676                let item = &list.list_items[i];
6677                let label = token_rows_text(&item.label.content);
6678                let body = token_rows_text(&item.body.content);
6679                let label_trimmed = normalize_list_text(label.trim());
6680                let body_trimmed = normalize_list_text(body.trim());
6681                let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
6682                    format!("{label_trimmed} {body_trimmed}")
6683                } else if !body_trimmed.is_empty() {
6684                    body_trimmed.to_string()
6685                } else {
6686                    label_trimmed.to_string()
6687                };
6688                let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
6689                    list_item_text_from_contents(&item.contents)
6690                } else {
6691                    combined
6692                };
6693
6694                if is_list_section_heading(&combined) {
6695                    if let Some(pending) = pending_item.take() {
6696                        push_rendered_list_item(out, pending.trim());
6697                    }
6698                    out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
6699                    i += 1;
6700                    continue;
6701                }
6702
6703                if is_pure_bullet_marker(&label_trimmed) && body_trimmed.is_empty() {
6704                    i += 1;
6705                    continue;
6706                }
6707
6708                if looks_like_stray_list_page_number(&combined) {
6709                    i += 1;
6710                    continue;
6711                }
6712
6713                let current_item = if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
6714                    if !label_trimmed.is_empty()
6715                        && !body_trimmed.is_empty()
6716                        && !is_pure_bullet_marker(&label_trimmed)
6717                    {
6718                        format!("{label_trimmed} {body_trimmed}")
6719                    } else if !body_trimmed.is_empty() {
6720                        body_trimmed.to_string()
6721                    } else if !is_pure_bullet_marker(&label_trimmed) {
6722                        label_trimmed.to_string()
6723                    } else {
6724                        String::new()
6725                    }
6726                } else if !item.contents.is_empty() {
6727                    normalize_list_text(list_item_text_from_contents(&item.contents).trim())
6728                } else {
6729                    String::new()
6730                };
6731
6732                if current_item.is_empty() {
6733                    i += 1;
6734                    continue;
6735                }
6736
6737                if let Some(previous) = pending_item.as_mut() {
6738                    if should_merge_list_continuation(previous, &current_item) {
6739                        merge_paragraph_text(previous, &current_item);
6740                        i += 1;
6741                        continue;
6742                    }
6743                }
6744
6745                if let Some(pending) = pending_item.replace(current_item) {
6746                    push_rendered_list_item(out, pending.trim());
6747                }
6748                i += 1;
6749            }
6750            if let Some(pending) = pending_item.take() {
6751                push_rendered_list_item(out, pending.trim());
6752            }
6753            out.push('\n');
6754        }
6755        ContentElement::Table(table) => {
6756            render_table(out, table);
6757        }
6758        ContentElement::TableBorder(table) => {
6759            render_table_border(out, table);
6760        }
6761        ContentElement::Formula(f) => {
6762            let latex = f.latex.trim();
6763            if !latex.is_empty() {
6764                out.push_str(&format!("$$\n{}\n$$\n\n", latex));
6765            }
6766        }
6767        ContentElement::Caption(c) => {
6768            let text = c.base.value();
6769            let normalized = normalize_common_ocr_text(text.trim());
6770            let trimmed = normalized.trim();
6771            if !trimmed.is_empty() {
6772                out.push_str(&format!("*{}*\n\n", trimmed));
6773            }
6774        }
6775        ContentElement::NumberHeading(nh) => {
6776            let text = nh.base.base.base.value();
6777            let trimmed = text.trim();
6778            if should_skip_heading_text(trimmed) {
6779                return;
6780            }
6781            out.push_str(&format!("# {}\n\n", trimmed));
6782        }
6783        ContentElement::Image(_) => {
6784            out.push_str("![Image](image)\n\n");
6785        }
6786        ContentElement::HeaderFooter(_) => {
6787            // Skip headers/footers in markdown by default
6788        }
6789        ContentElement::TextBlock(tb) => {
6790            let text = tb.value();
6791            let trimmed = clean_paragraph_text(&text);
6792            if !trimmed.is_empty() {
6793                out.push_str(&escape_md_line_start(&trimmed));
6794                out.push_str("\n\n");
6795            }
6796        }
6797        ContentElement::TextLine(tl) => {
6798            let text = tl.value();
6799            let normalized = normalize_common_ocr_text(text.trim());
6800            let trimmed = normalized.trim();
6801            if !trimmed.is_empty() {
6802                out.push_str(trimmed);
6803                out.push('\n');
6804            }
6805        }
6806        ContentElement::TextChunk(tc) => {
6807            out.push_str(&tc.value);
6808        }
6809        _ => {}
6810    }
6811}
6812
/// Escape characters that have special meaning at the start of a markdown line.
///
/// Only a leading `>` (block quote) or `#` (heading) is escaped, by prefixing
/// a backslash. Any other text is returned unchanged.
fn escape_md_line_start(text: &str) -> String {
    match text.chars().next() {
        Some('>' | '#') => {
            let mut escaped = String::with_capacity(text.len() + 1);
            escaped.push('\\');
            escaped.push_str(text);
            escaped
        }
        _ => text.to_owned(),
    }
}
6821
/// Reports whether `text`, ignoring leading whitespace and ASCII case, begins
/// with one of the known caption or credit prefixes (`figure `, `fig. `,
/// `table `, `source: `, ...).
fn starts_with_caption_prefix(text: &str) -> bool {
    const CAPTION_PREFIXES: &[&str] = &[
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];

    let lower = text.trim_start().to_ascii_lowercase();
    for prefix in CAPTION_PREFIXES {
        if lower.starts_with(*prefix) {
            return true;
        }
    }
    false
}
6849
/// Reports whether the trimmed text begins (ASCII case-insensitively) with
/// `figure `, `table `, `diagram ` or `chart `.
///
/// Because the text is trimmed on both ends first, the keyword must be
/// followed by further content for the trailing space to be present.
fn is_structural_caption(text: &str) -> bool {
    let lower = text.trim().to_ascii_lowercase();
    ["figure ", "table ", "diagram ", "chart "]
        .iter()
        .any(|keyword| lower.starts_with(keyword))
}
6857
6858fn normalize_chart_like_markdown(markdown: &str) -> String {
6859    let blocks: Vec<&str> = markdown
6860        .split("\n\n")
6861        .map(str::trim)
6862        .filter(|block| !block.is_empty())
6863        .collect();
6864    if blocks.is_empty() {
6865        return markdown.trim().to_string();
6866    }
6867
6868    let mut normalized = Vec::new();
6869    let mut i = 0usize;
6870    while i < blocks.len() {
6871        if let Some(rendered) = trim_large_top_table_plate(&blocks, i) {
6872            normalized.push(rendered);
6873            break;
6874        }
6875
6876        if let Some((rendered, consumed)) = render_header_pair_chart_table(&blocks, i) {
6877            normalized.push(rendered);
6878            i += consumed;
6879            continue;
6880        }
6881
6882        if let Some((rendered, consumed)) = render_chart_block(&blocks, i) {
6883            normalized.push(rendered);
6884            i += consumed;
6885            continue;
6886        }
6887
6888        if let Some((rendered, consumed)) = render_structural_caption_block(&blocks, i) {
6889            normalized.push(rendered);
6890            i += consumed;
6891            continue;
6892        }
6893
6894        if should_drop_artifact_table_block(&blocks, i) {
6895            i += 1;
6896            continue;
6897        }
6898
6899        if !looks_like_footer_banner(blocks[i]) {
6900            normalized.push(blocks[i].to_string());
6901        }
6902        i += 1;
6903    }
6904
6905    normalized.join("\n\n").trim().to_string() + "\n"
6906}
6907
6908fn trim_large_top_table_plate(blocks: &[&str], start: usize) -> Option<String> {
6909    if start != 0 {
6910        return None;
6911    }
6912
6913    let rows = parse_pipe_table_block(blocks.first()?.trim())?;
6914    let body_rows = rows.len().saturating_sub(2);
6915    let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
6916    if body_rows < 8 || max_cols < 8 {
6917        return None;
6918    }
6919
6920    let caption = blocks.get(1)?.trim();
6921    if !caption.starts_with("Table ") || caption.split_whitespace().count() < 12 {
6922        return None;
6923    }
6924
6925    let has_following_section = blocks.iter().skip(2).any(|block| {
6926        let trimmed = block.trim();
6927        trimmed.starts_with("# ")
6928            || trimmed.starts_with("## ")
6929            || trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
6930                && trimmed.contains(" Main Results")
6931    });
6932    has_following_section.then_some(blocks[0].trim().to_string())
6933}
6934
6935fn render_header_pair_chart_table(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6936    let caption = blocks.get(start)?.trim();
6937    if !is_structural_caption(caption) {
6938        return None;
6939    }
6940
6941    let rows = parse_pipe_table_block(blocks.get(start + 1)?)?;
6942    if rows.len() != 2 {
6943        return None;
6944    }
6945
6946    let pairs = extract_value_year_pairs_from_cells(&rows[0]);
6947    if pairs.len() < 4 {
6948        return None;
6949    }
6950
6951    let mut source = String::new();
6952    let mut consumed = 2usize;
6953    if let Some(next_block) = blocks.get(start + 2) {
6954        let next = next_block.trim();
6955        if next.to_ascii_lowercase().starts_with("source:") {
6956            source = next.to_string();
6957            consumed += 1;
6958        }
6959    }
6960
6961    let mut out = String::new();
6962    let heading_prefix = if start == 0 { "# " } else { "## " };
6963    out.push_str(heading_prefix);
6964    out.push_str(caption);
6965    out.push_str("\n\n");
6966    out.push_str(&format!("| Year | {} |\n", chart_value_header(caption)));
6967    out.push_str("| --- | --- |\n");
6968    for (year, value) in pairs {
6969        out.push_str(&format!("| {} | {} |\n", year, value));
6970    }
6971    out.push('\n');
6972
6973    if !source.is_empty() {
6974        out.push('*');
6975        out.push_str(&escape_md_line_start(&source));
6976        out.push_str("*\n\n");
6977    }
6978
6979    Some((out.trim().to_string(), consumed))
6980}
6981
6982fn render_chart_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6983    let (caption, numeric_tokens) = split_chart_caption_and_values(blocks.get(start)?)?;
6984    let mut consumed = 1usize;
6985
6986    let mut source = String::new();
6987    let mut labels = Vec::new();
6988    if let Some(next_block) = blocks.get(start + 1) {
6989        let (candidate_labels, candidate_source) = extract_chart_labels_and_source(next_block);
6990        if !candidate_source.is_empty() || !candidate_labels.is_empty() {
6991            labels = candidate_labels;
6992            source = candidate_source;
6993            consumed += 1;
6994        }
6995    }
6996
6997    while let Some(block) = blocks.get(start + consumed) {
6998        if looks_like_numeric_noise_block(block) {
6999            consumed += 1;
7000            continue;
7001        }
7002        break;
7003    }
7004
7005    let value_tokens = derive_chart_series_values(&numeric_tokens, labels.len());
7006
7007    let mut out = String::new();
7008    out.push_str("## ");
7009    out.push_str(caption.trim());
7010    out.push_str("\n\n");
7011
7012    if labels.len() >= 3 && labels.len() == value_tokens.len() {
7013        let label_header = if labels.iter().all(|label| looks_like_yearish_label(label)) {
7014            "Year"
7015        } else {
7016            "Label"
7017        };
7018        let value_header = chart_value_header(&caption);
7019        out.push_str(&format!("| {} | {} |\n", label_header, value_header));
7020        out.push_str("| --- | --- |\n");
7021        for (label, value) in labels.iter().zip(value_tokens.iter()) {
7022            out.push_str(&format!("| {} | {} |\n", label, value));
7023        }
7024        out.push('\n');
7025    }
7026
7027    if !source.is_empty() {
7028        out.push('*');
7029        out.push_str(&escape_md_line_start(&source));
7030        out.push_str("*\n\n");
7031    }
7032
7033    Some((out.trim().to_string(), consumed))
7034}
7035
7036fn render_structural_caption_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
7037    let block = blocks.get(start)?.trim();
7038    if !is_structural_caption(block) || block.contains('|') {
7039        return None;
7040    }
7041
7042    let mut caption = collapse_inline_whitespace(block);
7043    let mut consumed = 1usize;
7044    if let Some(next_block) = blocks.get(start + 1) {
7045        let next = next_block.trim();
7046        if looks_like_caption_continuation(next) {
7047            caption.push(' ');
7048            caption.push_str(next.trim_end_matches('.'));
7049            consumed += 1;
7050        } else if !looks_like_isolated_caption_context(block, next) {
7051            return None;
7052        }
7053    } else {
7054        return None;
7055    }
7056
7057    Some((format!("## {}", caption.trim()), consumed))
7058}
7059
7060fn split_chart_caption_and_values(block: &str) -> Option<(String, Vec<String>)> {
7061    let trimmed = block.trim();
7062    if !is_structural_caption(trimmed) {
7063        return None;
7064    }
7065
7066    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
7067    let first_numeric_idx = tokens.iter().position(|token| is_numberish_token(token))?;
7068    if first_numeric_idx < 3 {
7069        return None;
7070    }
7071
7072    let caption = tokens[..first_numeric_idx].join(" ");
7073    let numeric_tokens: Vec<String> = tokens[first_numeric_idx..]
7074        .iter()
7075        .filter_map(|token| sanitize_numberish_token(token))
7076        .collect();
7077
7078    if numeric_tokens.len() < 4 {
7079        return None;
7080    }
7081
7082    Some((caption, numeric_tokens))
7083}
7084
7085fn parse_pipe_table_block(block: &str) -> Option<Vec<Vec<String>>> {
7086    let lines: Vec<&str> = block
7087        .lines()
7088        .map(str::trim)
7089        .filter(|line| !line.is_empty())
7090        .collect();
7091    if lines.len() < 2 {
7092        return None;
7093    }
7094
7095    let header = split_pipe_row(lines[0])?;
7096    if !is_pipe_separator_row(lines[1], header.len()) {
7097        return None;
7098    }
7099
7100    let mut rows = vec![header];
7101    rows.push(split_pipe_row(lines[1]).unwrap_or_default());
7102    for line in lines.iter().skip(2) {
7103        let row = split_pipe_row(line)?;
7104        rows.push(row);
7105    }
7106    Some(rows)
7107}
7108
/// Splits a Markdown pipe-table row into its trimmed cell strings.
///
/// The line (after trimming) must both start and end with a distinct `|`;
/// otherwise `None` is returned.  A lone `|` is rejected rather than sliced:
/// it satisfies both `starts_with` and `ends_with`, but has no interior, and
/// slicing `1..len - 1` on it would panic.
fn split_pipe_row(line: &str) -> Option<Vec<String>> {
    let interior = line.trim().strip_prefix('|')?.strip_suffix('|')?;
    Some(
        interior
            .split('|')
            .map(|cell| cell.trim().to_string())
            .collect(),
    )
}
7122
7123fn is_pipe_separator_row(line: &str, expected_cols: usize) -> bool {
7124    let Some(cells) = split_pipe_row(line) else {
7125        return false;
7126    };
7127    if cells.len() != expected_cols || expected_cols == 0 {
7128        return false;
7129    }
7130
7131    cells.iter().all(|cell| {
7132        let stripped = cell.trim_matches(':').trim();
7133        !stripped.is_empty() && stripped.chars().all(|ch| ch == '-')
7134    })
7135}
7136
7137fn extract_value_year_pairs_from_cells(cells: &[String]) -> Vec<(String, String)> {
7138    let mut pairs = Vec::new();
7139    for cell in cells {
7140        let tokens: Vec<&str> = cell.split_whitespace().collect();
7141        if tokens.len() != 2 {
7142            continue;
7143        }
7144
7145        if looks_like_year_token(tokens[0]) && is_numberish_token(tokens[1]) {
7146            if let Some(value) = sanitize_numberish_token(tokens[1]) {
7147                pairs.push((tokens[0].to_string(), value));
7148            }
7149            continue;
7150        }
7151
7152        if is_numberish_token(tokens[0]) && looks_like_year_token(tokens[1]) {
7153            if let Some(value) = sanitize_numberish_token(tokens[0]) {
7154                pairs.push((tokens[1].to_string(), value));
7155            }
7156        }
7157    }
7158
7159    pairs.sort_by(|left, right| left.0.cmp(&right.0));
7160    pairs
7161}
7162
7163fn should_drop_artifact_table_block(blocks: &[&str], start: usize) -> bool {
7164    let Some(rows) = parse_pipe_table_block(blocks[start]) else {
7165        return false;
7166    };
7167
7168    let prev = start
7169        .checked_sub(1)
7170        .and_then(|idx| blocks.get(idx))
7171        .map(|block| block.trim())
7172        .unwrap_or("");
7173    let next = blocks
7174        .get(start + 1)
7175        .map(|block| block.trim())
7176        .unwrap_or("");
7177
7178    if rows.len() == 2 && rows.first().is_some_and(|row| row.len() == 1) {
7179        let header = rows[0][0].trim();
7180        if looks_like_url_fragment(header) {
7181            return true;
7182        }
7183        if looks_like_numeric_axis_blob(header) && !previous_block_announces_table(prev) {
7184            return true;
7185        }
7186    }
7187
7188    let stats = pipe_table_stats(&rows);
7189    stats.fill_ratio < 0.5
7190        && stats.long_cell_count == 0
7191        && !is_structural_caption(prev)
7192        && (looks_like_citation_block(next) || is_structural_caption(next))
7193}
7194
/// Reports whether a block of prose introduces a table that follows it.
///
/// Matching is ASCII case-insensitive: the trimmed block must end with one of
/// a few lead-in phrases, or contain "the following details" anywhere.
fn previous_block_announces_table(block: &str) -> bool {
    const TRAILING_CUES: [&str; 3] = ["as follows:", "following details:", "following detail:"];

    let lowered = block.trim().to_ascii_lowercase();
    TRAILING_CUES.iter().any(|cue| lowered.ends_with(*cue))
        || lowered.contains("the following details")
}
7202
/// Reports whether `text` looks like (part of) a URL.
///
/// True when the trimmed text contains "http" or "/status/", or when it
/// contains a slash and no space at all.
fn looks_like_url_fragment(text: &str) -> bool {
    let candidate = text.trim();
    if candidate.contains("http") || candidate.contains("/status/") {
        return true;
    }
    candidate.contains('/') && !candidate.contains(' ')
}
7208
7209fn looks_like_numeric_axis_blob(text: &str) -> bool {
7210    let numeric_values: Vec<i64> = text
7211        .split_whitespace()
7212        .filter_map(parse_integer_token)
7213        .collect();
7214    numeric_values.len() >= 8
7215        && !detect_axis_progression(&numeric_values).is_empty()
7216        && text.chars().any(char::is_alphabetic)
7217}
7218
/// Reports whether a block is a short, fully parenthesized citation:
/// it starts with `(`, ends with `)`, and has at most eight words.
fn looks_like_citation_block(block: &str) -> bool {
    let candidate = block.trim();
    let parenthesized = candidate.starts_with('(') && candidate.ends_with(')');
    // Counting past nine words cannot change the answer.
    parenthesized && candidate.split_whitespace().take(9).count() <= 8
}
7223
/// Summary statistics over the body rows of a parsed pipe table.
struct PipeTableStats {
    /// Non-blank body cells divided by `body rows × widest row`; `0.0` when
    /// the table has no body rows.
    fill_ratio: f64,
    /// Number of non-blank body cells containing three or more words.
    long_cell_count: usize,
}

/// Computes fill statistics for a parsed pipe table.
///
/// The first two rows (header and separator) are excluded from the cell
/// counts, but do participate in determining the widest row.
fn pipe_table_stats(rows: &[Vec<String>]) -> PipeTableStats {
    let widest = rows.iter().map(Vec::len).max().unwrap_or(0).max(1);
    let body_rows = rows.get(2..).unwrap_or(&[]);

    let (filled, long_cell_count) = body_rows
        .iter()
        .flatten()
        .filter(|cell| !cell.trim().is_empty())
        .fold((0usize, 0usize), |(filled, long), cell| {
            let is_long = cell.split_whitespace().count() >= 3;
            (filled + 1, long + usize::from(is_long))
        });

    let fill_ratio = match body_rows.len() {
        0 => 0.0,
        body => filled as f64 / (body * widest) as f64,
    };

    PipeTableStats {
        fill_ratio,
        long_cell_count,
    }
}
7257
7258fn extract_chart_labels_and_source(block: &str) -> (Vec<String>, String) {
7259    let trimmed = block.trim();
7260    let lower = trimmed.to_ascii_lowercase();
7261    let source_idx = lower.find("source:");
7262
7263    let label_region = source_idx.map_or(trimmed, |idx| trimmed[..idx].trim());
7264    let source = source_idx
7265        .map(|idx| trimmed[idx..].trim().to_string())
7266        .unwrap_or_default();
7267
7268    let labels = parse_chart_labels(label_region);
7269    (labels, source)
7270}
7271
7272fn parse_chart_labels(text: &str) -> Vec<String> {
7273    let tokens: Vec<&str> = text.split_whitespace().collect();
7274    let mut labels = Vec::new();
7275    let mut i = 0usize;
7276    while i < tokens.len() {
7277        let token = tokens[i].trim_matches(|c: char| c == ',' || c == ';');
7278        if looks_like_year_token(token) {
7279            let mut label = token.to_string();
7280            if let Some(next) = tokens.get(i + 1) {
7281                let next_trimmed = next.trim_matches(|c: char| c == ',' || c == ';');
7282                if next_trimmed.starts_with('(') && next_trimmed.ends_with(')') {
7283                    label.push(' ');
7284                    label.push_str(next_trimmed);
7285                    i += 1;
7286                }
7287            }
7288            labels.push(label);
7289        } else if looks_like_category_label(token) {
7290            labels.push(token.to_string());
7291        }
7292        i += 1;
7293    }
7294    labels
7295}
7296
7297fn derive_chart_series_values(tokens: &[String], expected_count: usize) -> Vec<String> {
7298    if expected_count == 0 {
7299        return Vec::new();
7300    }
7301
7302    if tokens.len() == expected_count {
7303        return tokens.to_vec();
7304    }
7305
7306    let numeric_values: Vec<i64> = tokens
7307        .iter()
7308        .filter_map(|token| parse_integer_token(token))
7309        .collect();
7310    if numeric_values.len() != tokens.len() {
7311        return Vec::new();
7312    }
7313
7314    let axis_series = detect_axis_progression(&numeric_values);
7315    if axis_series.is_empty() {
7316        return Vec::new();
7317    }
7318
7319    let mut remaining = Vec::new();
7320    let mut removable = axis_series;
7321    for token in tokens {
7322        let Some(value) = parse_integer_token(token) else {
7323            continue;
7324        };
7325        if let Some(pos) = removable.iter().position(|candidate| *candidate == value) {
7326            removable.remove(pos);
7327        } else {
7328            remaining.push(token.clone());
7329        }
7330    }
7331
7332    if remaining.len() == expected_count {
7333        remaining
7334    } else {
7335        Vec::new()
7336    }
7337}
7338
/// Finds the longest evenly spaced run of values that could be chart-axis
/// tick labels.
///
/// The values are sorted and de-duplicated; for every pair of neighbouring
/// values the gap is used as a step and the run is extended for as long as the
/// next multiple is present.  The longest run is returned if it has at least
/// six members, otherwise an empty vector.
///
/// All arithmetic is checked: a gap or a next tick that does not fit in `i64`
/// simply ends that candidate instead of overflowing.
fn detect_axis_progression(values: &[i64]) -> Vec<i64> {
    const MIN_TICKS: usize = 6;

    if values.len() < MIN_TICKS {
        return Vec::new();
    }

    let mut sorted = values.to_vec();
    sorted.sort_unstable();
    sorted.dedup();
    if sorted.len() < MIN_TICKS {
        return Vec::new();
    }

    let mut best: Vec<i64> = Vec::new();
    for pair in sorted.windows(2) {
        let (start, neighbour) = (pair[0], pair[1]);
        let Some(step) = neighbour.checked_sub(start) else {
            continue;
        };
        if step <= 0 {
            continue;
        }

        let mut series = vec![start];
        let mut current = start;
        while let Some(next) = current.checked_add(step) {
            if sorted.binary_search(&next).is_err() {
                break;
            }
            series.push(next);
            current = next;
        }

        if series.len() > best.len() {
            best = series;
        }
    }

    if best.len() >= MIN_TICKS {
        best
    } else {
        Vec::new()
    }
}
7381
7382fn chart_value_header(caption: &str) -> String {
7383    let trimmed = caption.trim();
7384    let title = strip_structural_caption_prefix(trimmed);
7385
7386    let mut base = title.to_string();
7387    if let Some(idx) = base.rfind(" in ") {
7388        let tail = base[idx + 4..].trim();
7389        if tail.split_whitespace().count() <= 2
7390            && tail.chars().next().is_some_and(char::is_uppercase)
7391        {
7392            base.truncate(idx);
7393        }
7394    }
7395
7396    if let Some(start) = title.rfind('(') {
7397        if title.ends_with(')') {
7398            let unit = title[start + 1..title.len() - 1].trim();
7399            if let Some(idx) = base.rfind('(') {
7400                base.truncate(idx);
7401            }
7402            let normalized_unit = unit.strip_prefix("in ").unwrap_or(unit).trim();
7403            return format!("{} ({})", base.trim(), normalized_unit);
7404        }
7405    }
7406
7407    let trimmed = base.trim();
7408    if trimmed.is_empty() {
7409        "Value".to_string()
7410    } else {
7411        trimmed.to_string()
7412    }
7413}
7414
/// Removes a leading "Figure N" / "Table N" / "Diagram N" / "Chart N" prefix
/// from a caption.
///
/// The keyword is matched ASCII case-insensitively and the second word must
/// consist only of ASCII digits, `.` and `:`.  When the text has fewer than
/// three space-separated parts or the prefix does not match, the trimmed
/// input is returned unchanged.
fn strip_structural_caption_prefix(text: &str) -> &str {
    const KEYWORDS: [&str; 4] = ["figure", "table", "diagram", "chart"];

    let trimmed = text.trim();
    let mut pieces = trimmed.splitn(3, ' ');
    let (Some(keyword), Some(number), Some(title)) =
        (pieces.next(), pieces.next(), pieces.next())
    else {
        return trimmed;
    };

    let keyword_matches = KEYWORDS
        .iter()
        .any(|candidate| keyword.eq_ignore_ascii_case(candidate));
    let number_matches = number
        .chars()
        .all(|ch| ch.is_ascii_digit() || ch == '.' || ch == ':');

    if keyword_matches && number_matches {
        title.trim()
    } else {
        trimmed
    }
}
7441
/// Reports whether a block looks like a running footer such as
/// "Annual Report of the Board 12".
///
/// The block must be a single line of at least eight bytes with two to six
/// words, the last of which is all ASCII digits.  Every preceding word must
/// either be a small connector word or start with an uppercase letter.
fn looks_like_footer_banner(block: &str) -> bool {
    const CONNECTORS: [&str; 6] = ["of", "and", "the", "for", "in", "on"];

    let line = block.trim();
    if line.len() < 8 || line.contains('\n') {
        return false;
    }

    let words: Vec<&str> = line.split_whitespace().collect();
    let Some((page_number, title_words)) = words.split_last() else {
        return false;
    };
    // Two to six words overall means one to five words before the number.
    if !(1..=5).contains(&title_words.len()) {
        return false;
    }
    if !page_number.chars().all(|ch| ch.is_ascii_digit()) {
        return false;
    }

    title_words.iter().all(|word| {
        CONNECTORS
            .iter()
            .any(|connector| word.eq_ignore_ascii_case(connector))
            || word.chars().next().is_some_and(char::is_uppercase)
    })
}
7467
/// Reports whether a block reads like the spilled-over tail of a caption:
/// non-empty, at most eight words, starting with an uppercase character, and
/// free of colons.
fn looks_like_caption_continuation(block: &str) -> bool {
    let candidate = block.trim();
    let Some(first) = candidate.chars().next() else {
        return false;
    };

    first.is_uppercase()
        && !candidate.contains(':')
        && candidate.split_whitespace().count() <= 8
}
7475
/// Collapses every run of whitespace (including newlines) into one space and
/// removes leading and trailing whitespace.
fn collapse_inline_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
7479
7480fn drop_isolated_noise_lines(markdown: &str) -> String {
7481    let lines: Vec<&str> = markdown.lines().collect();
7482    let mut kept = Vec::with_capacity(lines.len());
7483
7484    for (idx, line) in lines.iter().enumerate() {
7485        if should_drop_isolated_noise_line(&lines, idx) {
7486            continue;
7487        }
7488        kept.push(*line);
7489    }
7490
7491    let mut result = kept.join("\n");
7492    if markdown.ends_with('\n') {
7493        result.push('\n');
7494    }
7495    result
7496}
7497
7498fn should_drop_isolated_noise_line(lines: &[&str], idx: usize) -> bool {
7499    let trimmed = lines[idx].trim();
7500    if trimmed.len() != 1 {
7501        return false;
7502    }
7503
7504    let ch = trimmed.chars().next().unwrap_or_default();
7505    if !(ch.is_ascii_lowercase() || ch.is_ascii_digit()) {
7506        return false;
7507    }
7508
7509    let prev = previous_nonempty_line(lines, idx);
7510    let next = next_nonempty_line(lines, idx);
7511    let (Some(prev), Some(next)) = (prev, next) else {
7512        return false;
7513    };
7514
7515    is_substantive_markdown_line(prev) && is_substantive_markdown_line(next)
7516}
7517
/// Returns the closest line before `idx` that is not blank, if any.
fn previous_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    lines[..idx]
        .iter()
        .copied()
        .rfind(|line| !line.trim().is_empty())
}
7525
/// Returns the closest line after `idx` that is not blank, if any.
fn next_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    let following = &lines[idx + 1..];
    following
        .iter()
        .copied()
        .find(|line| !line.trim().is_empty())
}
7532
/// Reports whether a Markdown line carries real content: a table row, a
/// `- ` list item, a heading, or any line with at least two words.
fn is_substantive_markdown_line(line: &str) -> bool {
    let content = line.trim();
    if content.is_empty() {
        return false;
    }

    let is_structural =
        content.starts_with('|') || content.starts_with('#') || content.starts_with("- ");
    is_structural || content.split_whitespace().nth(1).is_some()
}
7545
7546fn normalize_common_ocr_text(text: &str) -> String {
7547    if text.is_empty() {
7548        return String::new();
7549    }
7550
7551    let mut normalized = text
7552        .replace("ߤL", "μL")
7553        .replace(" oC", "°C")
7554        .replace("37 C", "37°C")
7555        .replace("-20 oC", "-20°C")
7556        .replace("1- 20-μL", "1-20-μL")
7557        .replace("1- 20 μL", "1-20 μL")
7558        .replace("1- 2 0  μL", "1-20 μL")
7559        .replace("1- 2 0 μL", "1-20 μL");
7560
7561    normalized = normalize_degree_spacing(&normalized);
7562    collapse_inline_whitespace(&normalized)
7563}
7564
/// Rewrites "<digit> C" / "<digit> F" as "<digit>°C" / "<digit>°F".
///
/// The space is replaced only when it directly follows an ASCII digit, is
/// followed by `C` or `F`, and that letter is itself followed by a character
/// that is not an ASCII letter.  A unit letter at the very end of the text is
/// left untouched because there is no following character to inspect.
fn normalize_degree_spacing(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut out = String::with_capacity(text.len());
    let mut pos = 0usize;

    while let Some(&current) = chars.get(pos) {
        let unit = chars
            .get(pos + 1)
            .copied()
            .filter(|&letter| matches!(letter, 'C' | 'F'));
        let after_digit = pos > 0 && chars[pos - 1].is_ascii_digit();
        let before_non_letter = chars
            .get(pos + 2)
            .is_some_and(|after| !after.is_ascii_alphabetic());

        match unit {
            Some(letter) if current == ' ' && after_digit && before_non_letter => {
                out.push('°');
                out.push(letter);
                // Skip the space and the unit letter that were just emitted.
                pos += 2;
            }
            _ => {
                out.push(current);
                pos += 1;
            }
        }
    }

    out
}
7588
7589fn normalize_list_text(text: &str) -> String {
7590    let normalized = normalize_common_ocr_text(text);
7591    let trimmed = normalized
7592        .trim_start_matches(|ch: char| is_bullet_like(ch))
7593        .trim();
7594    trimmed.to_string()
7595}
7596
7597fn push_rendered_list_item(out: &mut String, item: &str) {
7598    if starts_with_enumerated_marker(item) {
7599        out.push_str(item);
7600        out.push('\n');
7601    } else {
7602        out.push_str(&format!("- {}\n", item));
7603    }
7604}
7605
7606fn should_merge_list_continuation(previous: &str, current: &str) -> bool {
7607    let trimmed = current.trim();
7608    if trimmed.is_empty()
7609        || looks_like_stray_list_page_number(trimmed)
7610        || is_list_section_heading(trimmed)
7611        || looks_like_numbered_section(trimmed)
7612        || starts_with_enumerated_marker(trimmed)
7613    {
7614        return false;
7615    }
7616
7617    if previous.ends_with('-')
7618        && previous
7619            .chars()
7620            .rev()
7621            .nth(1)
7622            .is_some_and(|c| c.is_alphabetic())
7623        && trimmed.chars().next().is_some_and(char::is_lowercase)
7624    {
7625        return true;
7626    }
7627
7628    trimmed
7629        .chars()
7630        .next()
7631        .is_some_and(|ch| ch.is_ascii_lowercase() || matches!(ch, ',' | ';' | ')' | ']' | '%'))
7632}
7633
7634fn is_pure_bullet_marker(text: &str) -> bool {
7635    let trimmed = text.trim();
7636    !trimmed.is_empty() && trimmed.chars().all(is_bullet_like)
7637}
7638
/// Reports whether the trimmed text is one to four ASCII digits, i.e. a
/// plausible page number that leaked into a list.
fn looks_like_stray_list_page_number(text: &str) -> bool {
    let digits = text.trim();
    matches!(digits.len(), 1..=4) && digits.bytes().all(|byte| byte.is_ascii_digit())
}
7643
/// Reports whether `ch` is one of the glyphs treated as a list bullet
/// (various dots, squares, triangles, diamonds, and the ASCII hyphen).
fn is_bullet_like(ch: char) -> bool {
    const BULLETS: [char; 14] = [
        '•', '◦', '▪', '▸', '▹', '►', '▻', '●', '○', '■', '□', '◆', '◇', '-',
    ];
    BULLETS.contains(&ch)
}
7662
/// Reports whether the block after a caption supports treating the caption as
/// a standalone heading.
///
/// A following block that starts with "source:" or "note:" (optionally
/// preceded by `*`, ASCII case-insensitive) always qualifies.  Otherwise the
/// caption must have at most 14 words and the following block at most 45
/// words while containing `:` or `=`.
fn looks_like_isolated_caption_context(caption: &str, next_block: &str) -> bool {
    const ATTRIBUTION_PREFIXES: [&str; 4] = ["source:", "note:", "*source:", "*note:"];

    let follower = next_block.trim();
    if follower.is_empty() {
        return false;
    }

    let lowered = follower.to_ascii_lowercase();
    if ATTRIBUTION_PREFIXES
        .iter()
        .any(|prefix| lowered.starts_with(*prefix))
    {
        return true;
    }

    let short_caption = caption.split_whitespace().count() <= 14;
    let short_follower = follower.split_whitespace().count() <= 45;
    let has_marker = follower.contains(':') || follower.contains('=');
    short_caption && short_follower && has_marker
}
7682
7683fn looks_like_numeric_noise_block(block: &str) -> bool {
7684    let trimmed = block.trim();
7685    !trimmed.is_empty()
7686        && trimmed.split_whitespace().all(|token| {
7687            sanitize_numberish_token(token)
7688                .as_deref()
7689                .is_some_and(|sanitized| sanitized.chars().all(|ch| ch.is_ascii_digit()))
7690        })
7691}
7692
/// Reports whether a label begins with an ASCII digit.
fn looks_like_yearish_label(label: &str) -> bool {
    matches!(label.chars().next(), Some(first) if first.is_ascii_digit())
}
7696
/// Reports whether a token is exactly four ASCII digits.
fn looks_like_year_token(token: &str) -> bool {
    token.len() == 4 && token.bytes().all(|byte| byte.is_ascii_digit())
}
7700
/// Reports whether a token can serve as a category label: it contains only
/// ASCII letters, ASCII digits, `-`, `/` and `%`, and at least one letter.
fn looks_like_category_label(token: &str) -> bool {
    let mut saw_letter = false;
    for ch in token.chars() {
        if ch.is_ascii_alphabetic() {
            saw_letter = true;
        } else if !(ch.is_ascii_digit() || matches!(ch, '-' | '/' | '%')) {
            return false;
        }
    }
    saw_letter
}
7707
7708fn is_numberish_token(token: &str) -> bool {
7709    sanitize_numberish_token(token).is_some()
7710}
7711
/// Cleans a token that may represent an integer or percentage.
///
/// Leading and trailing `,`, `;`, `:` and `.` are stripped.  The remainder is
/// accepted when, ignoring trailing `%` signs and thousands-separator commas,
/// it consists of ASCII digits and contains at least one digit.  The returned
/// string is the stripped token, with any `%` and inner commas retained.
///
/// Tokens without a single digit (for example a bare `%`) are rejected; the
/// "all characters are digits" test alone would accept them vacuously.
fn sanitize_numberish_token(token: &str) -> Option<String> {
    let trimmed = token.trim_matches(|c: char| matches!(c, ',' | ';' | ':' | '.'));
    let digits = trimmed.trim_end_matches('%');

    let has_digit = digits.bytes().any(|byte| byte.is_ascii_digit());
    let only_digits_and_commas = digits.chars().all(|ch| ch.is_ascii_digit() || ch == ',');

    (has_digit && only_digits_and_commas).then(|| trimmed.to_string())
}
7725
7726fn parse_integer_token(token: &str) -> Option<i64> {
7727    sanitize_numberish_token(token)?
7728        .replace(',', "")
7729        .parse::<i64>()
7730        .ok()
7731}
7732
/// Reports whether the text begins with an uppercase letter, looking past
/// leading whitespace and any opening quotes or brackets (`"`, `'`, `(`, `[`).
fn starts_with_uppercase_word(text: &str) -> bool {
    text.trim_start()
        .chars()
        .find(|&ch| !matches!(ch, '"' | '\'' | '(' | '['))
        .is_some_and(|ch| ch.is_alphabetic() && ch.is_uppercase())
}
7744
7745/// Clean paragraph text: trim trailing whitespace from each line,
7746/// collapse multiple spaces, and normalize whitespace.
7747fn clean_paragraph_text(text: &str) -> String {
7748    let trimmed = text.trim();
7749    if trimmed.is_empty() {
7750        return String::new();
7751    }
7752    // Collapse runs of spaces (but not newlines) to single space
7753    let mut result = String::with_capacity(trimmed.len());
7754    let mut prev_space = false;
7755    for ch in trimmed.chars() {
7756        if ch == ' ' || ch == '\t' {
7757            if !prev_space {
7758                result.push(' ');
7759                prev_space = true;
7760            }
7761        } else {
7762            result.push(ch);
7763            prev_space = false;
7764        }
7765    }
7766    normalize_common_ocr_text(&result)
7767}
7768
7769fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
7770    match element {
7771        Some(ContentElement::Paragraph(p)) => {
7772            let text = clean_paragraph_text(&p.base.value());
7773            let trimmed = text.trim();
7774            if trimmed.is_empty()
7775                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7776            {
7777                None
7778            } else {
7779                Some(trimmed.to_string())
7780            }
7781        }
7782        Some(ContentElement::TextBlock(tb)) => {
7783            let text = clean_paragraph_text(&tb.value());
7784            let trimmed = text.trim();
7785            if trimmed.is_empty()
7786                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7787            {
7788                None
7789            } else {
7790                Some(trimmed.to_string())
7791            }
7792        }
7793        Some(ContentElement::TextLine(tl)) => {
7794            let text = clean_paragraph_text(&tl.value());
7795            let trimmed = text.trim();
7796            if trimmed.is_empty()
7797                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7798            {
7799                None
7800            } else {
7801                Some(trimmed.to_string())
7802            }
7803        }
7804        _ => None,
7805    }
7806}
7807
7808fn should_render_paragraph_as_heading(
7809    doc: &PdfDocument,
7810    idx: usize,
7811    text: &str,
7812    next: Option<&ContentElement>,
7813) -> bool {
7814    if looks_like_top_margin_running_header(doc, idx, text) {
7815        return false;
7816    }
7817    if should_render_element_as_heading(&doc.kids[idx], text, next) {
7818        return true;
7819    }
7820
7821    // Font-size guard: skip rescue if the candidate text is significantly
7822    // smaller than the document's body text (chart axis labels, footnotes).
7823    let body_font_size = compute_body_font_size(doc);
7824    if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
7825        return false;
7826    }
7827
7828    // Rescue pass tier 1: when the pipeline found zero headings, use broad rescue.
7829    if !doc_has_explicit_headings(doc) {
7830        if should_rescue_as_heading(doc, idx, text) {
7831            return true;
7832        }
7833        // Also check numbered sections and ALL CAPS even with zero headings,
7834        // since Tier 1 broad rescue has strict word/char limits that miss
7835        // longer keyword-numbered headings (e.g. "Activity 4. Title text").
7836        if should_rescue_allcaps_heading(doc, idx, text) {
7837            return true;
7838        }
7839        if should_rescue_numbered_heading(doc, idx, text) {
7840            return true;
7841        }
7842        return false;
7843    }
7844    // Rescue pass tier 2: when heading density is very low (< 10%), only
7845    // rescue ALL CAPS short text followed by substantial body content.
7846    if heading_density(doc) < 0.10 {
7847        if should_rescue_allcaps_heading(doc, idx, text) {
7848            return true;
7849        }
7850        // Rescue pass tier 3: numbered section headings (e.g. "01 - Title").
7851        // When a document has very few detected headings, numbered patterns
7852        // are a strong structural signal that the font-based detector missed.
7853        if should_rescue_numbered_heading(doc, idx, text) {
7854            return true;
7855        }
7856        // Font-size-gated title-case rescue: when the paragraph is rendered
7857        // in a noticeably larger font than body text, apply the same
7858        // title-case rescue used in tier 1.  A 15 % size increase is a
7859        // reliable visual heading signal straight from the PDF font metrics.
7860        if body_font_size > 0.0 {
7861            if let ContentElement::Paragraph(p) = &doc.kids[idx] {
7862                if let Some(fs) = p.base.font_size {
7863                    if fs >= 1.15 * body_font_size
7864                        && is_heading_rescue_candidate(doc, idx, text)
7865                        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7866                    {
7867                        return true;
7868                    }
7869                }
7870            }
7871        }
7872    }
7873    false
7874}
7875
7876/// Check whether any element in the document is an explicit heading from the pipeline.
7877fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
7878    doc.kids.iter().any(|e| {
7879        matches!(
7880            e,
7881            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7882        )
7883    })
7884}
7885
7886/// Compute the dominant body font size from paragraphs with substantial text
7887/// (> 10 words).  Uses the median of qualifying paragraphs to avoid being
7888/// skewed by short chart labels or footnote markers.
7889/// Returns 0.0 if no qualifying paragraph is found.
7890fn compute_body_font_size(doc: &PdfDocument) -> f64 {
7891    let mut font_sizes: Vec<f64> = doc
7892        .kids
7893        .iter()
7894        .filter_map(|e| {
7895            if let ContentElement::Paragraph(p) = e {
7896                let word_count = p.base.value().split_whitespace().count();
7897                if word_count > 10 {
7898                    p.base.font_size
7899                } else {
7900                    None
7901                }
7902            } else {
7903                None
7904            }
7905        })
7906        .collect();
7907    if font_sizes.is_empty() {
7908        return 0.0;
7909    }
7910    font_sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
7911    font_sizes[font_sizes.len() / 2]
7912}
7913
7914/// Check whether a paragraph's font size is too small relative to the document
7915/// body font to be a heading.  Returns true if the element should be skipped.
7916/// A heading should not be noticeably smaller than body text — font size ≥ 95%
7917/// of the dominant body size is required.
7918fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
7919    if body_font_size <= 0.0 {
7920        return false;
7921    }
7922    if let ContentElement::Paragraph(p) = &doc_kids[idx] {
7923        if let Some(fs) = p.base.font_size {
7924            return fs < 0.95 * body_font_size;
7925        }
7926    }
7927    false
7928}
7929
7930/// Count the ratio of pipeline headings to total content elements.
7931fn heading_density(doc: &PdfDocument) -> f64 {
7932    let total = doc.kids.len();
7933    if total == 0 {
7934        return 0.0;
7935    }
7936    let heading_count = doc
7937        .kids
7938        .iter()
7939        .filter(|e| {
7940            matches!(
7941                e,
7942                ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7943            )
7944        })
7945        .count();
7946    heading_count as f64 / total as f64
7947}
7948
7949/// Rescue headings: identify short standalone paragraphs that likely serve
7950/// as section headings.  Only runs when the pipeline produced zero headings.
7951fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7952    is_heading_rescue_candidate(doc, idx, text)
7953        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7954}
7955
7956/// Pure text-criteria check for title-case heading rescue.
7957/// Returns true when the text looks like a heading based on casing,
7958/// length, and character composition — without any lookahead.
7959fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7960    let trimmed = text.trim();
7961    if trimmed.is_empty() {
7962        return false;
7963    }
7964
7965    let has_alpha = trimmed.chars().any(char::is_alphabetic);
7966
7967    // Must have alphabetic chars and not end with sentence/continuation punctuation
7968    if !has_alpha || trimmed.ends_with(['.', '!', '?', ';', ',']) {
7969        return false;
7970    }
7971
7972    // Reject text containing math/special symbols or percentage signs.
7973    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
7974        return false;
7975    }
7976
7977    // Must not be fully parenthesized (citations)
7978    if trimmed.starts_with('(') && trimmed.ends_with(')') {
7979        return false;
7980    }
7981
7982    // Must not look like a caption or chart label
7983    if starts_with_caption_prefix(trimmed)
7984        || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
7985    {
7986        return false;
7987    }
7988
7989    // Must be short: ≤ 6 words, ≤ 60 chars
7990    let word_count = trimmed.split_whitespace().count();
7991    if word_count > 6 || trimmed.len() > 60 {
7992        return false;
7993    }
7994
7995    // Must not be a purely numeric string
7996    if trimmed
7997        .chars()
7998        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
7999    {
8000        return false;
8001    }
8002
8003    // First alphabetic character should be uppercase
8004    if let Some(first_alpha) = trimmed.chars().find(|c| c.is_alphabetic()) {
8005        if first_alpha.is_lowercase() {
8006            return false;
8007        }
8008    }
8009
8010    true
8011}
8012
8013/// Check the next `max_lookahead` elements for substantive body content.
8014/// Returns true when at least one element is a long paragraph (≥ word_count*3
8015/// or > 15 words) or a structural element (list, table, image, figure).
8016fn has_substantive_follow_up(
8017    doc: &PdfDocument,
8018    idx: usize,
8019    word_count: usize,
8020    max_lookahead: usize,
8021) -> bool {
8022    for offset in 1..=max_lookahead {
8023        let lookahead_idx = idx + offset;
8024        if lookahead_idx >= doc.kids.len() {
8025            break;
8026        }
8027        let look_elem = &doc.kids[lookahead_idx];
8028        match look_elem {
8029            ContentElement::Paragraph(p) => {
8030                let next_text = p.base.value();
8031                let nw = next_text.split_whitespace().count();
8032                if nw >= word_count * 3 || nw > 15 {
8033                    return true;
8034                }
8035            }
8036            ContentElement::TextBlock(tb) => {
8037                let next_text = tb.value();
8038                let nw = next_text.split_whitespace().count();
8039                if nw >= word_count * 3 || nw > 15 {
8040                    return true;
8041                }
8042            }
8043            ContentElement::TextLine(tl) => {
8044                let next_text = tl.value();
8045                let nw = next_text.split_whitespace().count();
8046                if nw >= word_count * 3 || nw > 15 {
8047                    return true;
8048                }
8049            }
8050            ContentElement::List(_)
8051            | ContentElement::Table(_)
8052            | ContentElement::TableBorder(_)
8053            | ContentElement::Image(_)
8054            | ContentElement::Figure(_) => {
8055                return true;
8056            }
8057            _ => continue,
8058        }
8059    }
8060
8061    false
8062}
8063
8064/// Rescue numbered section headings like "01 - Find Open Educational Resources"
8065/// or "4.2 Main Results" when heading density is low.
8066fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8067    let trimmed = text.trim();
8068    if trimmed.is_empty() || trimmed.len() > 100 {
8069        return false;
8070    }
8071
8072    // Must match numbered section pattern: digits (with optional dots)
8073    // followed by separator and title text.
8074    if !looks_like_numbered_section(trimmed) {
8075        return false;
8076    }
8077
8078    // Must not end with sentence punctuation — EXCEPT when the text matches
8079    // a keyword+number pattern (e.g. "Activity 4. Determining CEC…") where
8080    // the trailing period is part of the heading format, not sentence ending.
8081    if trimmed.ends_with(['!', '?', ';', ',']) {
8082        return false;
8083    }
8084    if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
8085        return false;
8086    }
8087    // Reject numbered headings containing math symbols or percentage signs.
8088    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8089        return false;
8090    }
8091
8092    // Look ahead for substantive content
8093    for offset in 1..=3 {
8094        let lookahead_idx = idx + offset;
8095        if lookahead_idx >= doc.kids.len() {
8096            break;
8097        }
8098        match &doc.kids[lookahead_idx] {
8099            ContentElement::Paragraph(p) => {
8100                let nw = p.base.value().split_whitespace().count();
8101                if nw > 10 {
8102                    return true;
8103                }
8104            }
8105            ContentElement::TextBlock(tb) => {
8106                let nw = tb.value().split_whitespace().count();
8107                if nw > 10 {
8108                    return true;
8109                }
8110            }
8111            ContentElement::TextLine(tl) => {
8112                let nw = tl.value().split_whitespace().count();
8113                if nw > 10 {
8114                    return true;
8115                }
8116            }
8117            ContentElement::List(_)
8118            | ContentElement::Table(_)
8119            | ContentElement::TableBorder(_)
8120            | ContentElement::Image(_)
8121            | ContentElement::Figure(_) => {
8122                return true;
8123            }
8124            _ => continue,
8125        }
8126    }
8127
8128    false
8129}
8130
8131/// Check if text starts with a numbered section prefix (e.g. "01 -", "4.2 ", "III.")
8132/// or a keyword+number pattern (e.g. "Activity 4.", "Experiment #1:", "Chapter 3").
8133fn looks_like_numbered_section(text: &str) -> bool {
8134    let bytes = text.as_bytes();
8135    if bytes.is_empty() {
8136        return false;
8137    }
8138
8139    // Branch 1: digit-based prefix: "1 ", "01 ", "4.2 ", "1. ", "01 - "
8140    let mut idx = 0;
8141    if bytes[0].is_ascii_digit() {
8142        while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8143            idx += 1;
8144        }
8145        if idx >= bytes.len() {
8146            return false;
8147        }
8148        // dot-separated subsections: "4.2", "1.3.1"
8149        while idx < bytes.len() && bytes[idx] == b'.' {
8150            idx += 1;
8151            let start = idx;
8152            while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8153                idx += 1;
8154            }
8155            if idx == start {
8156                // "4." followed by space → "4. Title"
8157                break;
8158            }
8159        }
8160        // Must be followed by whitespace or "-"
8161        if idx >= bytes.len() {
8162            return false;
8163        }
8164        // Skip separator: "- " or " - " or just " "
8165        if bytes[idx] == b' ' || bytes[idx] == b'\t' {
8166            idx += 1;
8167            // Skip optional "- " separator
8168            if idx < bytes.len() && bytes[idx] == b'-' {
8169                idx += 1;
8170                if idx < bytes.len() && bytes[idx] == b' ' {
8171                    idx += 1;
8172                }
8173            }
8174        } else if bytes[idx] == b'-' {
8175            idx += 1;
8176            if idx < bytes.len() && bytes[idx] == b' ' {
8177                idx += 1;
8178            }
8179        } else {
8180            return false;
8181        }
8182        // Must have title text after prefix
8183        let rest = &text[idx..].trim();
8184        if rest.is_empty() {
8185            return false;
8186        }
8187        // First alpha char must be uppercase
8188        if let Some(c) = rest.chars().find(|c| c.is_alphabetic()) {
8189            return c.is_uppercase();
8190        }
8191        return false;
8192    }
8193
8194    // Branch 2: keyword+number prefix: "Activity 4.", "Experiment #1:", "Chapter 3"
8195    if looks_like_keyword_numbered_section(text) {
8196        return true;
8197    }
8198
8199    false
8200}
8201
/// Structural keywords that commonly precede a number to form a heading.
const SECTION_KEYWORDS: &[&str] = &[
    "activity",
    "appendix",
    "case",
    "chapter",
    "exercise",
    "experiment",
    "lab",
    "lesson",
    "module",
    "part",
    "phase",
    "problem",
    "question",
    "section",
    "stage",
    "step",
    "task",
    "topic",
    "unit",
];

/// Check if text matches a "Keyword N. Title" or "Keyword #N: Title" pattern.
///
/// The first space-delimited word must be one of [`SECTION_KEYWORDS`]
/// (ASCII case-insensitive). What follows, after an optional `#`, must start
/// with either an ASCII digit or a roman numeral built from `I`, `V`, `X`, `L`.
///
/// A roman numeral only counts when it stands alone, i.e. the run of numeral
/// letters is not immediately followed by another letter. This keeps ordinary
/// title words that merely begin with one of those letters ("Part
/// Introduction", "Lab Values") from being mistaken for numbered sections.
fn looks_like_keyword_numbered_section(text: &str) -> bool {
    let trimmed = text.trim();

    // The keyword is everything up to the first space.
    let Some((keyword, after_keyword)) = trimmed.split_once(' ') else {
        return false;
    };
    if !SECTION_KEYWORDS
        .iter()
        .any(|k| keyword.eq_ignore_ascii_case(k))
    {
        return false;
    }

    // After keyword+space, expect a number (optionally preceded by '#').
    let number = after_keyword.trim_start();
    let number = number.strip_prefix('#').unwrap_or(number);

    match number.chars().next() {
        Some(c) if c.is_ascii_digit() => true,
        Some('I' | 'V' | 'X' | 'L') => {
            // Accept "IV", "IV.", "IV:" or "IV Title"; reject "Introduction".
            let after_numeral = number.trim_start_matches(['I', 'V', 'X', 'L']);
            !after_numeral.starts_with(char::is_alphabetic)
        }
        _ => false,
    }
}
8253
8254/// Strict rescue for docs with some headings but low density: only promote
8255/// ALL CAPS text that is clearly a section heading.
8256fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8257    let trimmed = text.trim();
8258    if trimmed.is_empty() {
8259        return false;
8260    }
8261
8262    let word_count = trimmed.split_whitespace().count();
8263
8264    // Must be short: ≤ 8 words, ≤ 80 chars
8265    if word_count > 8 || trimmed.len() > 80 {
8266        return false;
8267    }
8268
8269    // Must be ALL CAPS (all alphabetic chars are uppercase)
8270    let alpha_chars: Vec<char> = trimmed.chars().filter(|c| c.is_alphabetic()).collect();
8271    if alpha_chars.len() < 2 || !alpha_chars.iter().all(|c| c.is_uppercase()) {
8272        return false;
8273    }
8274
8275    // Must not end with sentence punctuation
8276    if trimmed.ends_with(['.', ';', ',']) {
8277        return false;
8278    }
8279
8280    // Reject all-caps headings containing math symbols or percentage signs.
8281    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8282        return false;
8283    }
8284
8285    // Must not look like a caption
8286    if starts_with_caption_prefix(trimmed) {
8287        return false;
8288    }
8289
8290    // Must not be purely numeric or a page number
8291    if trimmed
8292        .chars()
8293        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8294    {
8295        return false;
8296    }
8297
8298    // Look ahead for substantive content — accept any non-trivial text
8299    // (>6 words) or structured content within the next 4 elements.
8300    for offset in 1..=4 {
8301        let lookahead_idx = idx + offset;
8302        if lookahead_idx >= doc.kids.len() {
8303            break;
8304        }
8305        let look_elem = &doc.kids[lookahead_idx];
8306        match look_elem {
8307            ContentElement::Paragraph(p) => {
8308                let nw = p.base.value().split_whitespace().count();
8309                if nw > 6 {
8310                    return true;
8311                }
8312            }
8313            ContentElement::TextBlock(tb) => {
8314                let nw = tb.value().split_whitespace().count();
8315                if nw > 6 {
8316                    return true;
8317                }
8318            }
8319            ContentElement::TextLine(tl) => {
8320                let nw = tl.value().split_whitespace().count();
8321                if nw > 6 {
8322                    return true;
8323                }
8324            }
8325            ContentElement::List(_)
8326            | ContentElement::Table(_)
8327            | ContentElement::TableBorder(_)
8328            | ContentElement::Image(_)
8329            | ContentElement::Figure(_) => {
8330                return true;
8331            }
8332            _ => continue,
8333        }
8334    }
8335
8336    false
8337}
8338
8339fn should_render_element_as_heading(
8340    element: &ContentElement,
8341    text: &str,
8342    next: Option<&ContentElement>,
8343) -> bool {
8344    let trimmed = text.trim();
8345    if trimmed.is_empty() {
8346        return false;
8347    }
8348
8349    let lower = trimmed.to_ascii_lowercase();
8350    if matches!(lower.as_str(), "contents" | "table of contents")
8351        && trimmed.starts_with(|c: char| c.is_uppercase())
8352    {
8353        return true;
8354    }
8355
8356    let word_count = trimmed.split_whitespace().count();
8357    let has_alpha = trimmed.chars().any(char::is_alphabetic);
8358    let title_like = has_alpha
8359        && word_count <= 4
8360        && trimmed.len() <= 40
8361        && !trimmed.ends_with(['.', '!', '?', ';', ':']);
8362
8363    // Reject attribution prefixes that are clearly not section headings
8364    // (more targeted than starts_with_caption_prefix to avoid false demotions
8365    // of legitimate headings starting with common words like "Graph", "Table").
8366    let is_attribution = {
8367        let lower = trimmed.to_ascii_lowercase();
8368        lower.starts_with("source:")
8369            || lower.starts_with("credit:")
8370            || lower.starts_with("photo by ")
8371            || lower.starts_with("photo credit")
8372            || lower.starts_with("image by ")
8373            || lower.starts_with("image credit")
8374    };
8375
8376    title_like
8377        && matches!(next, Some(ContentElement::List(_)))
8378        && !looks_like_chart_label_heading(element, trimmed)
8379        && !is_attribution
8380}
8381
8382fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8383    let trimmed = text.trim();
8384    if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
8385        return false;
8386    }
8387
8388    let element = &doc.kids[idx];
8389    let bbox = element.bbox();
8390    if bbox.height() > 24.0 {
8391        return false;
8392    }
8393
8394    let Some(page) = element.page_number() else {
8395        return false;
8396    };
8397
8398    // Compute top Y for every page (single pass).
8399    let mut page_tops = std::collections::HashMap::<u32, f64>::new();
8400    for candidate in &doc.kids {
8401        if let Some(p) = candidate.page_number() {
8402            let top = page_tops.entry(p).or_insert(f64::MIN);
8403            *top = top.max(candidate.bbox().top_y);
8404        }
8405    }
8406
8407    let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
8408    if bbox.top_y < page_top - 24.0 {
8409        return false;
8410    }
8411
8412    // A running header repeats across pages.  If the same text does NOT
8413    // appear at the top margin of any other page, this is a unique heading
8414    // (e.g. a document title), not a running header.
8415    let trimmed_lower = trimmed.to_lowercase();
8416    for other_elem in &doc.kids {
8417        let Some(other_page) = other_elem.page_number() else {
8418            continue;
8419        };
8420        if other_page == page {
8421            continue;
8422        }
8423        let other_bbox = other_elem.bbox();
8424        if other_bbox.height() > 24.0 {
8425            continue;
8426        }
8427        let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
8428        if other_bbox.top_y < other_top - 24.0 {
8429            continue;
8430        }
8431        let other_text = match other_elem {
8432            ContentElement::Paragraph(p) => p.base.value(),
8433            ContentElement::TextBlock(tb) => tb.value(),
8434            ContentElement::TextLine(tl) => tl.value(),
8435            ContentElement::Heading(h) => h.base.base.value(),
8436            _ => continue,
8437        };
8438        if other_text.trim().to_lowercase() == trimmed_lower {
8439            return true;
8440        }
8441    }
8442
8443    false
8444}
8445
8446fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
8447    let trimmed = text.trim();
8448    let upper_words = trimmed
8449        .split_whitespace()
8450        .filter(|word| word.chars().any(char::is_alphabetic))
8451        .all(|word| {
8452            word.chars()
8453                .filter(|ch| ch.is_alphabetic())
8454                .all(|ch| ch.is_uppercase())
8455        });
8456
8457    (trimmed.contains('%') || upper_words) && element.bbox().height() <= 40.0
8458}
8459
8460fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
8461    let next_trimmed = next.trim();
8462    if !next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8463        return false;
8464    }
8465
8466    let normalized = normalize_heading_text(text);
8467    if matches!(
8468        normalized.as_str(),
8469        "contents" | "tableofcontents" | "introduction" | "conclusion"
8470    ) {
8471        return false;
8472    }
8473
8474    let words: Vec<&str> = text.split_whitespace().collect();
8475    if words.len() < 3 {
8476        return false;
8477    }
8478
8479    words
8480        .last()
8481        .is_some_and(|word| is_sentence_fragment_tail(word))
8482}
8483
/// Return true when `word` is an article, conjunction, or preposition that
/// would leave a sentence hanging if it were the last word of a line.
/// Surrounding non-alphanumeric characters are ignored and the comparison is
/// ASCII case-insensitive.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const DANGLING_WORDS: [&str; 17] = [
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or", "that",
        "the", "to", "with",
    ];

    let core = word.trim_matches(|c: char| !c.is_alphanumeric());
    DANGLING_WORDS
        .iter()
        .any(|candidate| core.eq_ignore_ascii_case(candidate))
}
8507
/// Return true for a short label that introduces a list: trimmed text ending
/// in `:`, at most 80 bytes and 8 words, containing at least one letter, and
/// not starting with an ASCII digit or a bullet/dash character.
fn is_list_section_heading(text: &str) -> bool {
    const BULLET_CHARS: &str = "•‣◦●○◆◇▪▫–—-";

    let label = text.trim();
    let Some(first) = label.chars().next() else {
        return false;
    };
    // A leading digit or bullet marks a list item, not a list heading.
    if first.is_ascii_digit() || BULLET_CHARS.contains(first) {
        return false;
    }

    label.ends_with(':')
        && label.len() <= 80
        && label.split_whitespace().count() <= 8
        && label.chars().any(char::is_alphabetic)
}
8517
8518fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
8519    let next_trimmed = next.trim();
8520    if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8521        return false;
8522    }
8523
8524    if starts_with_enumerated_marker(next_trimmed) {
8525        return false;
8526    }
8527
8528    if prev.ends_with('-')
8529        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8530        && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8531    {
8532        return true;
8533    }
8534
8535    if next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8536        return true;
8537    }
8538
8539    let lower = next_trimmed.to_ascii_lowercase();
8540    if lower.starts_with("http://")
8541        || lower.starts_with("https://")
8542        || lower.starts_with("arxiv")
8543        || lower.starts_with("doi:")
8544    {
8545        return true;
8546    }
8547
8548    if matches!(
8549        next_trimmed.split_whitespace().next(),
8550        Some("In" | "Proceedings" | "Advances" | "Learning")
8551    ) {
8552        return true;
8553    }
8554
8555    !prev.ends_with(['.', '!', '?', ':'])
8556}
8557
8558fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
8559    let next_trimmed = next.trim();
8560    if next_trimmed.is_empty() {
8561        return false;
8562    }
8563
8564    if starts_with_enumerated_marker(next_trimmed) {
8565        return false;
8566    }
8567
8568    if prev.ends_with('-')
8569        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8570        && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8571    {
8572        return true;
8573    }
8574
8575    next_trimmed.chars().next().is_some_and(char::is_lowercase)
8576}
8577
/// Return true when the first whitespace-delimited token of `text` looks
/// like a list marker: optional leading `(`/`[`, a body, and a trailing `.`,
/// `)` or `:`. The body must be all ASCII digits ("12."), a single ASCII
/// letter ("a)"), or up to 8 characters drawn from the roman-numeral letters
/// `ivxlcdm` in either case ("iv.").
fn starts_with_enumerated_marker(text: &str) -> bool {
    const TERMINATORS: [char; 3] = ['.', ')', ':'];

    let Some(token) = text.split_whitespace().next() else {
        return false;
    };
    let token = token.trim_start_matches(['(', '[']);
    if !token.ends_with(TERMINATORS) {
        return false;
    }

    let marker = token.trim_end_matches(TERMINATORS);
    if marker.is_empty() {
        return false;
    }

    let numeric = marker.bytes().all(|b| b.is_ascii_digit());
    let single_letter = matches!(marker.as_bytes(), [b] if b.is_ascii_alphabetic());
    let roman_like = marker.len() <= 8
        && marker
            .chars()
            .all(|c| "ivxlcdm".contains(c.to_ascii_lowercase()));

    numeric || single_letter || roman_like
}
8603
/// Decide whether a "Figure …" text element near the top of its page should
/// be skipped.
///
/// NOTE(review): judging by the name, this targets a figure caption carried
/// over to the start of a page before the first real section — confirm
/// against the caller.
///
/// Returns true only when all of the following hold:
/// 1. `text` starts with "Figure " and has at least 4 words;
/// 2. the element lies within 72 (bbox coordinate units) of the highest
///    text-bearing element on its page;
/// 3. every element before `idx` in the whole document is ignorable (empty,
///    page number, footer banner, caption-prefixed text, running header, or
///    a heading that `should_skip_heading_text` rejects);
/// 4. among the next 7 elements on the same page, a numbered section heading
///    appears before any non-caption text of 5 or more words.
fn should_skip_leading_figure_carryover(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if !trimmed.starts_with("Figure ") || trimmed.split_whitespace().count() < 4 {
        return false;
    }

    let element = &doc.kids[idx];
    let Some(page) = element.page_number() else {
        return false;
    };

    // Highest top_y among text-bearing elements on this page.
    let mut page_top = f64::MIN;
    for candidate in &doc.kids {
        if candidate.page_number() == Some(page)
            && matches!(
                candidate,
                ContentElement::Paragraph(_)
                    | ContentElement::TextBlock(_)
                    | ContentElement::TextLine(_)
                    | ContentElement::Heading(_)
                    | ContentElement::NumberHeading(_)
                    | ContentElement::Caption(_)
            )
        {
            page_top = page_top.max(candidate.bbox().top_y);
        }
    }
    // Only elements in the top 72 units of the page's text area qualify.
    // Note that f64::MIN is itself finite, so the `is_finite` guard does not
    // detect the "no matching element" case.
    if !page_top.is_finite() || element.bbox().top_y < page_top - 72.0 {
        return false;
    }

    // Everything before this element, across the whole document, must be
    // ignorable; one piece of real content disqualifies the skip.
    for prior_idx in 0..idx {
        let prior = &doc.kids[prior_idx];
        let prior_text = extract_element_text(prior);
        let prior_trimmed = prior_text.trim();
        if prior_trimmed.is_empty()
            || is_standalone_page_number(prior_trimmed)
            || looks_like_footer_banner(prior_trimmed)
        {
            continue;
        }
        match prior {
            ContentElement::Paragraph(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_) => {
                // Plain text is tolerated only as a caption or running header.
                if !starts_with_caption_prefix(prior_trimmed)
                    && !looks_like_top_margin_running_header(doc, prior_idx, prior_trimmed)
                {
                    return false;
                }
            }
            ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
                // A heading that would actually be rendered counts as content.
                if !should_skip_heading_text(prior_trimmed) {
                    return false;
                }
            }
            // Any other non-empty element kind counts as real content.
            _ => return false,
        }
    }

    // Look at up to 7 following elements, stopping at the end of the page.
    for lookahead_idx in idx + 1..doc.kids.len().min(idx + 8) {
        let next = &doc.kids[lookahead_idx];
        if next.page_number() != Some(page) {
            break;
        }
        let next_text = extract_element_text(next);
        let next_trimmed = next_text.trim();
        if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
            continue;
        }

        // Pipeline headings only need a numbered-section shape; plain text
        // must additionally be something that would render as a heading.
        let is_numbered_heading = match next {
            ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
                looks_like_numbered_section(next_trimmed)
                    || looks_like_keyword_numbered_section(next_trimmed)
            }
            ContentElement::Paragraph(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_) => {
                should_render_paragraph_as_heading(
                    doc,
                    lookahead_idx,
                    next_trimmed,
                    doc.kids.get(lookahead_idx + 1),
                ) && (looks_like_numbered_section(next_trimmed)
                    || looks_like_keyword_numbered_section(next_trimmed))
            }
            _ => false,
        };

        if is_numbered_heading {
            return true;
        }

        // Body-length non-caption text before any numbered heading: keep the
        // figure text.
        if !starts_with_caption_prefix(next_trimmed) && next_trimmed.split_whitespace().count() >= 5
        {
            return false;
        }
    }

    false
}
8706
/// Append the trimmed `next` to `target`.
///
/// When `target` ends with a letter followed by `-` and `next` starts with a
/// lowercase letter, the hyphen is treated as a line-break hyphen and
/// removed, so the two halves join into one word. Otherwise the texts are
/// joined with a single space (none is added if `target` already ends in
/// one).
fn merge_paragraph_text(target: &mut String, next: &str) {
    let addition = next.trim();

    let mut from_end = target.chars().rev();
    let joins_hyphenated_word = from_end.next() == Some('-')
        && from_end.next().is_some_and(char::is_alphabetic)
        && addition.starts_with(char::is_lowercase);

    if joins_hyphenated_word {
        // Drop the line-break hyphen.
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(addition);
}
8726
/// Return true when the trimmed text is nothing but 1 to 4 ASCII digits.
fn is_standalone_page_number(text: &str) -> bool {
    let digits = text.trim();
    matches!(digits.len(), 1..=4) && digits.bytes().all(|b| b.is_ascii_digit())
}
8731
8732fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
8733    if !is_standalone_page_number(text) {
8734        return false;
8735    }
8736
8737    let bbox = element.bbox();
8738    if bbox.height() > 24.0 {
8739        return false;
8740    }
8741
8742    let Some(page) = element.page_number() else {
8743        return false;
8744    };
8745
8746    let mut page_top = f64::MIN;
8747    let mut page_bottom = f64::MAX;
8748    for candidate in &doc.kids {
8749        if candidate.page_number() == Some(page) {
8750            let candidate_bbox = candidate.bbox();
8751            page_top = page_top.max(candidate_bbox.top_y);
8752            page_bottom = page_bottom.min(candidate_bbox.bottom_y);
8753        }
8754    }
8755
8756    if !page_top.is_finite() || !page_bottom.is_finite() {
8757        return false;
8758    }
8759
8760    bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
8761}
8762
8763/// Check whether a pipeline heading sits in the bottom margin of its page.
8764/// Running footers (e.g. "Report Title 21") are sometimes classified as
8765/// headings by the pipeline.  A heading at the page bottom is very unlikely
8766/// to be a real section heading.
8767fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
8768    let element = &doc.kids[idx];
8769    let bbox = element.bbox();
8770    if bbox.height() > 30.0 {
8771        return false;
8772    }
8773
8774    let Some(page) = element.page_number() else {
8775        return false;
8776    };
8777
8778    let mut page_bottom = f64::MAX;
8779    for candidate in &doc.kids {
8780        if candidate.page_number() == Some(page) {
8781            page_bottom = page_bottom.min(candidate.bbox().bottom_y);
8782        }
8783    }
8784
8785    if !page_bottom.is_finite() {
8786        return false;
8787    }
8788
8789    // If this heading is at the very bottom of the page content, skip it.
8790    bbox.bottom_y <= page_bottom + 24.0
8791}
8792
8793/// Demote a pipeline heading that ends with a period when it doesn't look like
8794/// a genuine section heading (e.g. "United Kingdom." or "New Investment (a Challenger).").
8795/// Returns true when the heading should be rendered as a paragraph instead.
8796fn should_demote_period_heading(text: &str) -> bool {
8797    let trimmed = text.trim();
8798    if !trimmed.ends_with('.') {
8799        return false;
8800    }
8801    // Keep numbered section headings: "I. Introduction", "4.2. Results",
8802    // "Activity 4. Determining CEC…"
8803    if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
8804        return false;
8805    }
8806    // Keep headings whose text without the trailing period still looks like a
8807    // proper title — at least 3 words, first word uppercase, and the period
8808    // is clearly sentence-ending rather than part of a title pattern.
8809    let without_dot = trimmed.trim_end_matches('.');
8810    let word_count = without_dot.split_whitespace().count();
8811    // Very short fragments ending with '.' (like "Kingdom.") are almost
8812    // certainly not headings.
8813    if word_count <= 2 {
8814        return true;
8815    }
8816    false
8817}
8818
/// Demote headings whose last non-whitespace character is a comma; such text
/// is a fragment (e.g. footnote references like "29 Pope," or
/// "32 Beawes, 33 M.M.,"), never a real heading.
fn should_demote_comma_heading(text: &str) -> bool {
    matches!(text.trim_end().chars().next_back(), Some(','))
}
8824
/// Demote headings containing mathematical or special symbols that do not
/// occur in real section headings (e.g. "HL ¼", "P ≪ P", "LH þ HL:").
fn should_demote_math_heading(text: &str) -> bool {
    const MATH_SYMBOLS: [char; 15] = [
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];

    text.contains(MATH_SYMBOLS)
}
8848
/// Demote headings that contain a percent sign anywhere; these are typically
/// data labels rather than section headings (e.g. "56% AGREE").
fn should_demote_percentage_heading(text: &str) -> bool {
    text.chars().any(|c| c == '%')
}
8854
/// Demote bibliography entries that start with a 4-digit year followed by a
/// period (e.g. "2020. Measuring massive multitask...").
///
/// After trimming, the text must begin with four ASCII digits and a `.`, and
/// the period must be followed by a space or by nothing at all, so a bare
/// "2020." also matches. A period followed by anything else, as in the
/// section number "2020.5", does not.
fn should_demote_bibliography_heading(text: &str) -> bool {
    let bytes = text.trim().as_bytes();

    // Four digits plus the period is the shortest possible match. The checks
    // below only touch ASCII bytes, so byte indexing is safe for any UTF-8.
    bytes.len() >= 5
        && bytes[..4].iter().all(u8::is_ascii_digit)
        && bytes[4] == b'.'
        && matches!(bytes.get(5), None | Some(b' '))
}
8867
/// Strip a trailing standalone page number from heading text.
///
/// E.g. "Chapter 3. Numerical differentiation 35" becomes
/// "Chapter 3. Numerical differentiation". The last space-separated token is
/// removed only when it is 1 to 4 ASCII digits and at least three words
/// remain in front of it; otherwise the trimmed input is returned unchanged.
fn strip_trailing_page_number(text: &str) -> &str {
    let trimmed = text.trim();
    match trimmed.rsplit_once(' ') {
        Some((heading, page))
            if matches!(page.len(), 1..=4)
                && page.bytes().all(|b| b.is_ascii_digit())
                && heading.split_whitespace().count() >= 3 =>
        {
            heading.trim()
        }
        _ => trimmed,
    }
}
8886
/// Try to split a heading that contains a merged subsection number.
///
/// For example, "4 Results 4.1 Experimental Details" should become two
/// headings: "4 Results" and "4.1 Experimental Details".
///
/// Returns the byte offset where the second heading starts, or `None` when no
/// split is needed. A split point is a position at byte offset 3 or later
/// (to skip the heading's own leading number), immediately preceded by a
/// space, where the text continues with either
/// * digits, a dot, and a digit ("4.1", "12.3"), or
/// * one ASCII uppercase letter, a dot, and a digit ("B.1").
///
/// The dot must directly follow the digit run. A plain number followed much
/// later by an unrelated dot ("Top 10 models v.2") is not a subsection number.
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    /// True when `token` starts with an "N.N" or "B.1" style subsection number.
    fn starts_subsection_number(token: &[u8]) -> bool {
        match token {
            [letter, b'.', digit, ..] if letter.is_ascii_uppercase() => digit.is_ascii_digit(),
            [first, ..] if first.is_ascii_digit() => {
                let digits = token.iter().take_while(|b| b.is_ascii_digit()).count();
                matches!(
                    token.get(digits..digits + 2),
                    Some([b'.', next]) if next.is_ascii_digit()
                )
            }
            _ => false,
        }
    }

    let bytes = text.as_bytes();
    // Only ASCII bytes are inspected, and a returned offset always follows an
    // ASCII space, so it is a valid char boundary for slicing `text`.
    (3..bytes.len()).find(|&i| bytes[i - 1] == b' ' && starts_subsection_number(&bytes[i..]))
}
8922
8923fn should_skip_heading_text(text: &str) -> bool {
8924    let trimmed = text.trim();
8925    if trimmed.is_empty() || is_standalone_page_number(trimmed) {
8926        return true;
8927    }
8928
8929    let lower = trimmed.to_ascii_lowercase();
8930    if (lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit()))
8931        && trimmed.contains('|')
8932    {
8933        return true;
8934    }
8935
8936    let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
8937    let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
8938    alpha_count == 0 || (alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':'))
8939}
8940
/// Glue together adjacent whitespace-separated tokens that look like two
/// halves of one word.
///
/// Two neighbours are joined when, after stripping non-alphabetic characters
/// from both ends, each is non-empty and purely alphabetic, at least one is
/// at most 4 bytes long, together they are at least 6 bytes long, the right
/// one does not start with an uppercase letter, and neither is a stopword.
/// A joined token is compared again with the token that follows it.
///
/// Input with fewer than two tokens is returned unchanged; otherwise the
/// tokens are re-joined with single spaces.
fn repair_fragmented_words(text: &str) -> String {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];

    let tokens: Vec<&str> = text.split_whitespace().collect();
    if tokens.len() < 2 {
        return text.to_string();
    }

    let is_fragment_pair = |left: &str, right: &str| -> bool {
        let left_core = left.trim_matches(|c: char| !c.is_alphabetic());
        let right_core = right.trim_matches(|c: char| !c.is_alphabetic());
        if left_core.is_empty() || right_core.is_empty() {
            return false;
        }
        if !left_core.chars().all(char::is_alphabetic)
            || !right_core.chars().all(char::is_alphabetic)
        {
            return false;
        }

        let has_short_side = left_core.len() <= 4 || right_core.len() <= 4;
        let long_enough = left_core.len() + right_core.len() >= 6;
        let right_is_capitalised = right_core.chars().next().is_some_and(char::is_uppercase);
        if !has_short_side || !long_enough || right_is_capitalised {
            return false;
        }

        let is_stopword = |word: &str| STOPWORDS.contains(&word.to_ascii_lowercase().as_str());
        !is_stopword(left_core) && !is_stopword(right_core)
    };

    // Fold tokens left to right, appending to the previous output token
    // whenever the pair qualifies.
    let mut repaired: Vec<String> = Vec::with_capacity(tokens.len());
    for token in tokens {
        let joins_previous = repaired
            .last()
            .is_some_and(|previous| is_fragment_pair(previous.as_str(), token));
        if joins_previous {
            if let Some(previous) = repaired.last_mut() {
                previous.push_str(token);
            }
        } else {
            repaired.push(token.to_string());
        }
    }

    repaired.join(" ")
}
8981
8982/// Extract text from list item contents (fallback when label/body tokens are empty).
8983fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
8984    let mut text = String::new();
8985    for elem in contents {
8986        let part = match elem {
8987            ContentElement::Paragraph(p) => p.base.value(),
8988            ContentElement::TextBlock(tb) => tb.value(),
8989            ContentElement::TextLine(tl) => tl.value(),
8990            ContentElement::TextChunk(tc) => tc.value.clone(),
8991            _ => String::new(),
8992        };
8993        if !text.is_empty() && !part.is_empty() {
8994            text.push(' ');
8995        }
8996        text.push_str(&part);
8997    }
8998    text
8999}
9000
/// Report whether `row` has at least one blank cell lying between two
/// non-blank cells. Leading and trailing blanks do not count; a cell is
/// blank when it is empty after trimming.
fn has_internal_header_gap(row: &[String]) -> bool {
    let filled: Vec<bool> = row.iter().map(|cell| !cell.trim().is_empty()).collect();

    let first_filled = filled.iter().position(|&is_filled| is_filled);
    let last_filled = filled.iter().rposition(|&is_filled| is_filled);
    let (Some(first), Some(last)) = (first_filled, last_filled) else {
        return false;
    };

    // Any blank strictly inside the filled span is an internal gap.
    filled[first..=last].iter().any(|&is_filled| !is_filled)
}
9018
/// Fill the blank cells of a grouped header row with the label of the
/// nearest group.
///
/// `parent` is the sparse group-header row and `child` the row beneath it.
/// A blank parent cell is filled only when the child cell in the same column
/// is non-blank; it receives the trimmed text of the closest non-blank
/// parent cell ("anchor"), with ties going to the anchor further right.
/// Columns where the parent already has text are copied unchanged.
///
/// The result always has `parent.len()` cells. Child cells beyond the
/// parent's width are ignored rather than indexed, so rows of unequal length
/// no longer cause an out-of-bounds panic.
fn expand_grouped_header_row(parent: &[String], child: &[String]) -> Vec<String> {
    let anchor_cols: Vec<usize> = parent
        .iter()
        .enumerate()
        .filter_map(|(idx, cell)| (!cell.trim().is_empty()).then_some(idx))
        .collect();

    let mut expanded = parent.to_vec();
    if anchor_cols.is_empty() {
        return expanded;
    }

    // `zip` stops at the shorter row, which keeps every access in bounds.
    for (col_idx, (slot, child_cell)) in expanded.iter_mut().zip(child).enumerate() {
        if !slot.trim().is_empty() || child_cell.trim().is_empty() {
            continue;
        }

        // `min_by_key` keeps the first minimum it sees; walking the ascending
        // anchors in reverse therefore resolves ties to the rightmost anchor.
        let nearest = anchor_cols
            .iter()
            .rev()
            .copied()
            .min_by_key(|anchor| anchor.abs_diff(col_idx));
        if let Some(anchor) = nearest {
            *slot = parent[anchor].trim().to_string();
        }
    }

    expanded
}
9049
9050fn preserve_grouped_header_rows(rows: &mut [Vec<String>]) -> bool {
9051    if rows.len() < 2 || rows[0].is_empty() || rows[1].is_empty() {
9052        return false;
9053    }
9054    if rows[0].first().is_none_or(|cell| cell.trim().is_empty()) {
9055        return false;
9056    }
9057    if rows[1].first().is_some_and(|cell| !cell.trim().is_empty()) {
9058        return false;
9059    }
9060
9061    let first_filled = rows[0]
9062        .iter()
9063        .filter(|cell| !cell.trim().is_empty())
9064        .count();
9065    let second_filled = rows[1]
9066        .iter()
9067        .filter(|cell| !cell.trim().is_empty())
9068        .count();
9069    if first_filled < 2 || second_filled <= first_filled || !has_internal_header_gap(&rows[0]) {
9070        return false;
9071    }
9072
9073    rows[0] = expand_grouped_header_row(&rows[0], &rows[1]);
9074    true
9075}
9076
9077/// Merge header continuation rows in a rendered table.
9078///
9079/// When a PDF table has multi-line column headers, each wrapped line often
9080/// produces a separate row in the grid.  These continuation rows have an
9081/// empty first cell while the header row above them has content.  This
9082/// function detects such rows at the start of the table and merges their
9083/// text into the first row, producing a single combined header.
9084///
9085/// Only rows whose non-empty cells are all ≤ 30 characters are merged, to
9086/// avoid accidentally collapsing data rows that happen to have an empty key.
9087fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
9088    if rows.len() < 2 {
9089        return;
9090    }
9091    if preserve_grouped_header_rows(rows) {
9092        return;
9093    }
9094    // The first row must have a non-empty first cell (the header anchor).
9095    if rows[0].first().is_none_or(|c| c.trim().is_empty()) {
9096        return;
9097    }
9098
9099    let mut merge_count = 0usize;
9100    for (i, row_i) in rows.iter().enumerate().skip(1) {
9101        let first_empty = row_i.first().is_none_or(|c| c.trim().is_empty());
9102        if !first_empty {
9103            break; // hit a data row
9104        }
9105        // All non-empty cells must be short (header-like fragments).
9106        let all_short = row_i
9107            .iter()
9108            .all(|c| c.trim().is_empty() || c.trim().len() <= 30);
9109        if !all_short {
9110            break;
9111        }
9112        merge_count = i;
9113    }
9114
9115    // Require at least 2 consecutive continuation rows to avoid merging
9116    // legitimate sub-header or unit rows (e.g. a single row with "cmolc/kg").
9117    if merge_count == 0 {
9118        return;
9119    }
9120
9121    // Merge rows 1..=merge_count into row 0.
9122    for i in 1..=merge_count {
9123        let (head, tail) = rows.split_at_mut(i);
9124        let ncols = head[0].len().min(tail[0].len());
9125        for (target, src) in head[0]
9126            .iter_mut()
9127            .take(ncols)
9128            .zip(tail[0].iter().take(ncols))
9129        {
9130            let fragment = src.trim().to_string();
9131            if !fragment.is_empty() {
9132                let target_str = target.trim().to_string();
9133                *target = if target_str.is_empty() {
9134                    fragment
9135                } else {
9136                    format!("{} {}", target_str, fragment)
9137                };
9138            }
9139        }
9140    }
9141
9142    // Remove the merged rows.
9143    rows.drain(1..=merge_count);
9144}
9145
9146fn trim_leading_table_carryover_rows(rows: &mut Vec<Vec<String>>) {
9147    while first_body_row_looks_like_carryover(rows) {
9148        rows.remove(1);
9149    }
9150}
9151
9152fn first_body_row_looks_like_carryover(rows: &[Vec<String>]) -> bool {
9153    if rows.len() < 3 {
9154        return false;
9155    }
9156
9157    let key_col_count = infer_leading_key_column_count(&rows[1..]);
9158    if key_col_count == 0 {
9159        return false;
9160    }
9161
9162    let candidate = &rows[1];
9163    if candidate
9164        .iter()
9165        .take(key_col_count)
9166        .any(|cell| !cell.trim().is_empty())
9167    {
9168        return false;
9169    }
9170
9171    let non_empty_cols = candidate
9172        .iter()
9173        .enumerate()
9174        .filter(|(_, cell)| !cell.trim().is_empty())
9175        .map(|(idx, _)| idx)
9176        .collect::<Vec<_>>();
9177    if non_empty_cols.len() != 1 {
9178        return false;
9179    }
9180
9181    let only_col = non_empty_cols[0];
9182    if only_col < key_col_count {
9183        return false;
9184    }
9185
9186    if candidate[only_col].split_whitespace().count() < 4 {
9187        return false;
9188    }
9189
9190    rows[2]
9191        .iter()
9192        .take(key_col_count)
9193        .all(|cell| !cell.trim().is_empty())
9194}
9195
/// Count how many leading columns behave like key columns.
///
/// Columns are examined left to right and counting stops at the first one
/// that fails. A column passes when at least 60% of the rows have a
/// non-blank cell in it and the (upper) median word count of those cells is
/// at most 3. Fewer than two rows always yields 0.
fn infer_leading_key_column_count(rows: &[Vec<String>]) -> usize {
    if rows.len() < 2 {
        return 0;
    }

    let widest = rows.iter().map(Vec::len).max().unwrap_or(0);
    let row_total = rows.len() as f64;

    (0..widest)
        .take_while(|&col_idx| {
            // Word counts of the non-blank cells in this column.
            let mut word_counts: Vec<usize> = rows
                .iter()
                .filter_map(|row| row.get(col_idx))
                .map(|cell| cell.trim())
                .filter(|cell| !cell.is_empty())
                .map(|cell| cell.split_whitespace().count())
                .collect();
            if word_counts.is_empty() {
                return false;
            }

            word_counts.sort_unstable();
            let median_words = word_counts[word_counts.len() / 2];
            let occupancy_ratio = word_counts.len() as f64 / row_total;
            occupancy_ratio >= 0.6 && median_words <= 3
        })
        .count()
}
9233
/// Render a SemanticTable as a markdown table.
///
/// Thin wrapper: forwards `out` and the table's `table_border` to
/// `render_table_border`, which does all of the work.
fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
    // Delegate to render_table_border which handles cross-page linking.
    render_table_border(out, &table.table_border);
}
9239
/// A contiguous run of top-level document elements that is replaced by one
/// pre-rendered table.
#[derive(Clone, Debug)]
struct GeometricTableRegion {
    /// Index into `doc.kids` of the first element covered by the region.
    start_idx: usize,
    /// Index into `doc.kids` of the last element covered (inclusive).
    end_idx: usize,
    /// Rendered output for the whole region (pipe-table or HTML-table text,
    /// optionally preceded by a lead paragraph).
    rendered: String,
}
9246
/// A group of text chunks together with a bounding box.
// NOTE(review): presumably one visual line of text whose `bbox` encloses all
// `chunks` — the construction site is outside this section; confirm there.
#[derive(Clone)]
struct ChunkLine {
    /// Bounding box associated with the line.
    bbox: BoundingBox,
    /// Text chunks that make up the line.
    chunks: Vec<TextChunk>,
}
9252
/// A piece of text tagged with a slot index and a bounding box.
// NOTE(review): presumably `slot_idx` is the column slot the text was
// assigned to during row reconstruction — confirm against the code that
// builds these values.
#[derive(Clone)]
struct SlotFragment {
    /// Index of the slot this fragment belongs to.
    slot_idx: usize,
    /// Bounding box of the fragment.
    bbox: BoundingBox,
    /// Text content of the fragment.
    text: String,
}
9259
9260fn detect_geometric_table_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9261    let mut regions = Vec::new();
9262    let mut occupied_until = 0usize;
9263
9264    for (idx, element) in doc.kids.iter().enumerate() {
9265        if idx < occupied_until {
9266            continue;
9267        }
9268
9269        let Some(table) = table_border_from_element(element) else {
9270            continue;
9271        };
9272        let Some(region) = build_geometric_table_region(doc, idx, table) else {
9273            continue;
9274        };
9275        occupied_until = region.end_idx.saturating_add(1);
9276        regions.push(region);
9277    }
9278
9279    let mut occupied = regions
9280        .iter()
9281        .flat_map(|region| region.start_idx..=region.end_idx)
9282        .collect::<HashSet<_>>();
9283    for region in detect_footnote_citation_regions(doc) {
9284        if (region.start_idx..=region.end_idx).any(|idx| occupied.contains(&idx)) {
9285            continue;
9286        }
9287        occupied.extend(region.start_idx..=region.end_idx);
9288        regions.push(region);
9289    }
9290
9291    regions.sort_by_key(|region| region.start_idx);
9292    regions
9293}
9294
9295fn detect_footnote_citation_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9296    let body_font_size = compute_running_body_font_size(doc);
9297    if body_font_size <= 0.0 {
9298        return Vec::new();
9299    }
9300
9301    let mut regions = Vec::new();
9302    let mut idx = 0usize;
9303    while idx < doc.kids.len() {
9304        let Some(region) = build_footnote_citation_region(doc, idx, body_font_size) else {
9305            idx += 1;
9306            continue;
9307        };
9308        idx = region.end_idx.saturating_add(1);
9309        regions.push(region);
9310    }
9311
9312    regions
9313}
9314
9315fn compute_running_body_font_size(doc: &PdfDocument) -> f64 {
9316    doc.kids
9317        .iter()
9318        .filter_map(|element| {
9319            let ContentElement::Paragraph(paragraph) = element else {
9320                return None;
9321            };
9322            let text = paragraph.base.value();
9323            (text.split_whitespace().count() > 10).then_some(paragraph.base.font_size?)
9324        })
9325        .fold(0.0_f64, f64::max)
9326}
9327
/// Try to build a footnote/citation table region starting at `start_idx`.
///
/// Two entry shapes are recognised:
/// * the element is small-font text that begins with a footnote marker, in
///   which case earlier elements may be attached via
///   `leading_footnote_attachment`; or
/// * the element ends with a footnote lead (see
///   `split_trailing_footnote_lead`) and the next element is small-font text
///   on the same page and in the same column.
///
/// Following small-font elements on the same page and in the same column are
/// then absorbed. The collected fragments must parse into at least three
/// rows whose markers are all numeric, with at most one non-consecutive
/// step. On success the region is rendered as an HTML table with
/// "Footnote"/"Citation" headers, preceded by the lead paragraph if any.
///
/// Returns `None` whenever any of these conditions is not met.
fn build_footnote_citation_region(
    doc: &PdfDocument,
    start_idx: usize,
    body_font_size: f64,
) -> Option<GeometricTableRegion> {
    let element = doc.kids.get(start_idx)?;
    if !is_geometric_text_candidate(element) {
        return None;
    }

    let start_text = extract_element_text(element);
    let trimmed_start = start_text.trim();
    if trimmed_start.is_empty() {
        return None;
    }

    // "Small" means at most 92% of the body size and at least 0.8 units
    // below it (whichever is stricter), never negative.
    let small_font_threshold = (body_font_size * 0.92).min(body_font_size - 0.8).max(0.0);
    // Body text that precedes the first footnote marker, if any.
    let mut lead_prefix = None;
    // Raw text pieces later parsed into (marker, citation) rows.
    let mut fragments = Vec::new();
    let page_number = element.page_number()?;
    // Running union of the boxes absorbed so far; used for column matching.
    let mut column_bbox = element.bbox().clone();
    let mut region_start_idx = start_idx;
    let mut end_idx = start_idx;

    if element_font_size(element).is_some_and(|font_size| font_size <= small_font_threshold)
        && starts_with_footnote_marker(trimmed_start)
    {
        // Shape 1: the element itself is a footnote line. Look backwards for
        // a paragraph whose tail starts the footnote block.
        if let Some((attach_idx, prefix, leading_fragments)) = leading_footnote_attachment(
            doc,
            start_idx,
            page_number,
            &column_bbox,
            small_font_threshold,
        ) {
            lead_prefix = Some(prefix);
            fragments.extend(leading_fragments);
            region_start_idx = attach_idx;
        }
        fragments.push(footnote_fragment_text(element));
    } else {
        // Shape 2: the element ends with "<marker> <short tail>" and the
        // next element must continue it in small font.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed_start)?;
        let next = doc.kids.get(start_idx + 1)?;
        if !is_geometric_text_candidate(next)
            || next.page_number() != Some(page_number)
            || !element_font_size(next).is_some_and(|font_size| font_size <= small_font_threshold)
        {
            return None;
        }
        if !same_column_region(&column_bbox, next.bbox()) {
            return None;
        }
        lead_prefix = Some(prefix);
        fragments.push(first_tail);
    }

    // Absorb the run of small-font elements that follows on the same page
    // and in the same column; stop at the first element that does not fit.
    let mut consecutive_small = 0usize;
    for idx in start_idx + 1..doc.kids.len() {
        let candidate = &doc.kids[idx];
        if !is_geometric_text_candidate(candidate) || candidate.page_number() != Some(page_number) {
            break;
        }

        let candidate_text = extract_element_text(candidate);
        let trimmed = candidate_text.trim();
        if trimmed.is_empty() || starts_with_caption_prefix(trimmed) {
            break;
        }

        let Some(font_size) = element_font_size(candidate) else {
            break;
        };
        if font_size > small_font_threshold {
            break;
        }
        if !same_column_region(&column_bbox, candidate.bbox()) {
            break;
        }

        column_bbox = column_bbox.union(candidate.bbox());
        fragments.push(footnote_fragment_text(candidate));
        consecutive_small += 1;
        end_idx = idx;
    }

    // A lead paragraph with no small-font continuation is not a footnote block.
    if consecutive_small == 0 && lead_prefix.is_some() {
        return None;
    }

    let rows = parse_footnote_citation_rows(&fragments);
    if rows.len() < 3 {
        return None;
    }

    // Every marker must parse as a number.
    let numeric_markers = rows
        .iter()
        .filter_map(|(marker, _)| marker.parse::<u32>().ok())
        .collect::<Vec<_>>();
    if numeric_markers.len() != rows.len() {
        return None;
    }
    // Count adjacent pairs that increase by exactly one; reject when two or
    // more pairs break the sequence.
    let sequential_steps = numeric_markers
        .windows(2)
        .filter(|pair| pair[1] == pair[0] + 1)
        .count();
    if sequential_steps + 1 < rows.len().saturating_sub(1) {
        return None;
    }

    let mut rendered_rows = vec![vec!["Footnote".to_string(), "Citation".to_string()]];
    rendered_rows.extend(
        rows.into_iter()
            .map(|(marker, citation)| vec![marker, citation]),
    );

    let mut rendered = String::new();
    if let Some(prefix) = lead_prefix {
        // Emit the lead paragraph above the table, separated by a blank line.
        rendered.push_str(&escape_md_line_start(prefix.trim()));
        rendered.push_str("\n\n");
    }
    rendered.push_str(&render_html_table(&rendered_rows));

    Some(GeometricTableRegion {
        start_idx: region_start_idx,
        end_idx,
        rendered,
    })
}
9455
9456fn leading_footnote_attachment(
9457    doc: &PdfDocument,
9458    start_idx: usize,
9459    page_number: u32,
9460    column_bbox: &BoundingBox,
9461    small_font_threshold: f64,
9462) -> Option<(usize, String, Vec<String>)> {
9463    let mut idx = start_idx.checked_sub(1)?;
9464    let mut leading_fragments = Vec::new();
9465    let mut scanned = 0usize;
9466
9467    loop {
9468        let candidate = doc.kids.get(idx)?;
9469        scanned += 1;
9470        if scanned > 6 || candidate.page_number() != Some(page_number) {
9471            return None;
9472        }
9473
9474        if !is_geometric_text_candidate(candidate) {
9475            if idx == 0 {
9476                return None;
9477            }
9478            idx -= 1;
9479            continue;
9480        }
9481
9482        let text = extract_element_text(candidate);
9483        let trimmed = text.trim();
9484        if trimmed.is_empty() {
9485            if idx == 0 {
9486                return None;
9487            }
9488            idx -= 1;
9489            continue;
9490        }
9491        if !same_column_region(candidate.bbox(), column_bbox) {
9492            return None;
9493        }
9494
9495        if element_font_size(candidate).is_some_and(|font_size| font_size <= small_font_threshold) {
9496            leading_fragments.push(footnote_fragment_text(candidate));
9497            if idx == 0 {
9498                return None;
9499            }
9500            idx -= 1;
9501            continue;
9502        }
9503
9504        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed)?;
9505        leading_fragments.push(first_tail);
9506        leading_fragments.reverse();
9507        return Some((idx, prefix, leading_fragments));
9508    }
9509}
9510
9511fn parse_footnote_citation_rows(fragments: &[String]) -> Vec<(String, String)> {
9512    let mut rows = Vec::new();
9513    let mut current_marker = None::<String>;
9514    let mut current_citation = String::new();
9515
9516    for fragment in fragments {
9517        let markers = find_footnote_marker_positions(fragment);
9518        if markers.is_empty() {
9519            if current_marker.is_some() {
9520                merge_paragraph_text(&mut current_citation, fragment.trim());
9521            }
9522            continue;
9523        }
9524
9525        let mut cursor = 0usize;
9526        for (pos, marker, skip_len) in markers {
9527            let prefix = fragment[cursor..pos].trim();
9528            if current_marker.is_some() && !prefix.is_empty() {
9529                merge_paragraph_text(&mut current_citation, prefix);
9530            }
9531            if let Some(marker_value) = current_marker.take() {
9532                let trimmed = current_citation.trim();
9533                if !trimmed.is_empty() {
9534                    rows.push((marker_value, trimmed.to_string()));
9535                }
9536                current_citation.clear();
9537            }
9538            current_marker = Some(marker);
9539            cursor = pos + skip_len;
9540        }
9541
9542        let tail = fragment[cursor..].trim();
9543        if current_marker.is_some() && !tail.is_empty() {
9544            merge_paragraph_text(&mut current_citation, tail);
9545        }
9546    }
9547
9548    if let Some(marker_value) = current_marker {
9549        let trimmed = current_citation.trim();
9550        if !trimmed.is_empty() {
9551            rows.push((marker_value, trimmed.to_string()));
9552        }
9553    }
9554
9555    rebalance_adjacent_footnote_citations(&mut rows);
9556    rows
9557}
9558
9559fn rebalance_adjacent_footnote_citations(rows: &mut [(String, String)]) {
9560    for idx in 0..rows.len().saturating_sub(1) {
9561        if !rows[idx].1.trim_end().ends_with(',') {
9562            continue;
9563        }
9564
9565        let next = rows[idx + 1].1.trim().to_string();
9566        let Some((stub, remainder)) = split_leading_citation_stub(&next) else {
9567            continue;
9568        };
9569        let Some((first_sentence, trailing)) = split_first_sentence(remainder) else {
9570            continue;
9571        };
9572        if first_sentence.split_whitespace().count() < 2 {
9573            continue;
9574        }
9575
9576        merge_paragraph_text(&mut rows[idx].1, first_sentence);
9577        rows[idx + 1].1 = if trailing.is_empty() {
9578            stub.to_string()
9579        } else {
9580            format!("{stub} {trailing}")
9581        };
9582    }
9583}
9584
/// Split `text` at its first comma into a short leading stub (comma
/// included) and the trimmed remainder.
///
/// Returns `None` when there is no comma, when more than 8 bytes precede the
/// comma, or when nothing but whitespace follows it.
fn split_leading_citation_stub(text: &str) -> Option<(&str, &str)> {
    let (before_comma, after_comma) = text.split_once(',')?;
    if before_comma.len() > 8 {
        return None;
    }

    // Re-slice from `text` so the stub keeps its trailing comma.
    let stub = text[..=before_comma.len()].trim();
    let remainder = after_comma.trim();
    if stub.is_empty() || remainder.is_empty() {
        None
    } else {
        Some((stub, remainder))
    }
}
9594
/// Split `text` after the first ". " into the first sentence (period
/// included) and the trimmed rest. Returns `None` when `text` contains no
/// period followed by a space.
fn split_first_sentence(text: &str) -> Option<(&str, &str)> {
    let (before_period, after_break) = text.split_once(". ")?;

    // Re-slice from `text` so the sentence keeps its closing period.
    let sentence = text[..=before_period.len()].trim();
    if sentence.is_empty() {
        return None;
    }
    Some((sentence, after_break.trim()))
}
9601
/// Locate footnote markers inside `text`.
///
/// A marker is a run of one or two ASCII digits that
/// * starts the text or follows whitespace or one of `. , ; : ) ] " ' ”`,
/// * is followed by at least one whitespace character, and
/// * after that whitespace is followed by an ASCII uppercase letter, `(`,
///   `[` or `*`.
///
/// Each entry is `(byte offset of the marker, marker digits, number of bytes
/// from the marker start to the first character of the footnote body)`.
fn find_footnote_marker_positions(text: &str) -> Vec<(usize, String, usize)> {
    let chars: Vec<(usize, char)> = text.char_indices().collect();
    // Byte offset of the char at `char_idx`, or the text length past the end.
    let byte_at = |char_idx: usize| chars.get(char_idx).map_or(text.len(), |&(pos, _)| pos);
    // Whether a marker may begin at `char_idx`, judged by the preceding char.
    let opens_marker = |char_idx: usize| {
        if char_idx == 0 {
            return true;
        }
        let previous = chars[char_idx - 1].1;
        previous.is_whitespace()
            || matches!(
                previous,
                '.' | ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”'
            )
    };

    let mut markers = Vec::new();
    let mut idx = 0usize;

    while idx < chars.len() {
        let (start_byte, ch) = chars[idx];
        if !ch.is_ascii_digit() || !opens_marker(idx) {
            idx += 1;
            continue;
        }

        // ASCII digits are one byte each, so the run length is also its byte length.
        let digit_count = chars[idx..]
            .iter()
            .take_while(|(_, c)| c.is_ascii_digit())
            .count();
        let digits_end = idx + digit_count;
        let followed_by_space = chars
            .get(digits_end)
            .is_some_and(|&(_, c)| c.is_whitespace());
        if digit_count > 2 || !followed_by_space {
            idx += 1;
            continue;
        }

        // Skip the whitespace between the marker and the footnote body.
        let body_idx = digits_end
            + chars[digits_end..]
                .iter()
                .take_while(|(_, c)| c.is_whitespace())
                .count();
        let starts_body = chars
            .get(body_idx)
            .is_some_and(|&(_, c)| c.is_ascii_uppercase() || matches!(c, '(' | '[' | '*'));
        if !starts_body {
            idx += 1;
            continue;
        }

        let digits = &text[start_byte..byte_at(digits_end)];
        markers.push((start_byte, digits.to_string(), byte_at(body_idx) - start_byte));
        idx = body_idx;
    }

    markers
}
9662
9663fn split_trailing_footnote_lead(text: &str) -> Option<(String, String)> {
9664    let markers = find_footnote_marker_positions(text);
9665    let (pos, marker, skip_len) = markers.last()?.clone();
9666    let prefix = text[..pos].trim();
9667    let tail = text[pos + skip_len..].trim();
9668    if prefix.split_whitespace().count() < 6 || tail.split_whitespace().count() > 6 {
9669        return None;
9670    }
9671    Some((prefix.to_string(), format!("{marker} {tail}")))
9672}
9673
9674fn starts_with_footnote_marker(text: &str) -> bool {
9675    find_footnote_marker_positions(text)
9676        .first()
9677        .is_some_and(|(pos, _, _)| *pos == 0)
9678}
9679
9680fn same_column_region(left: &BoundingBox, right: &BoundingBox) -> bool {
9681    let overlap = (left.right_x.min(right.right_x) - left.left_x.max(right.left_x)).max(0.0);
9682    let min_width = left.width().min(right.width()).max(1.0);
9683    overlap / min_width >= 0.35 || (left.left_x - right.left_x).abs() <= 28.0
9684}
9685
9686fn footnote_fragment_text(element: &ContentElement) -> String {
9687    let text = extract_element_text(element);
9688    if element_font_name(element)
9689        .as_deref()
9690        .is_some_and(|name| name.to_ascii_lowercase().contains("italic"))
9691    {
9692        format!("*{}*", text.trim())
9693    } else {
9694        text
9695    }
9696}
9697
9698fn element_font_size(element: &ContentElement) -> Option<f64> {
9699    match element {
9700        ContentElement::Paragraph(p) => p.base.font_size,
9701        ContentElement::Heading(h) => h.base.base.font_size,
9702        ContentElement::NumberHeading(nh) => nh.base.base.base.font_size,
9703        ContentElement::TextBlock(tb) => Some(tb.font_size),
9704        ContentElement::TextLine(tl) => Some(tl.font_size),
9705        _ => None,
9706    }
9707}
9708
9709fn element_font_name(element: &ContentElement) -> Option<String> {
9710    match element {
9711        ContentElement::Paragraph(p) => p.base.font_name.clone(),
9712        ContentElement::Heading(h) => h.base.base.font_name.clone(),
9713        ContentElement::NumberHeading(nh) => nh.base.base.base.font_name.clone(),
9714        _ => None,
9715    }
9716}
9717
9718fn table_border_from_element(
9719    element: &ContentElement,
9720) -> Option<&crate::models::table::TableBorder> {
9721    match element {
9722        ContentElement::TableBorder(table) => Some(table),
9723        ContentElement::Table(table) => Some(&table.table_border),
9724        _ => None,
9725    }
9726}
9727
/// Try to rebuild a table together with the header text that sits above it
/// (and, for panel-style tables, the text just below it).
///
/// `table_idx` is the index of `table` within `doc.kids`. The function
/// returns `None` when the table has no rows or fewer than three columns,
/// when no column ranges or header candidates can be derived, when neither
/// an external left stub nor an embedded stub header applies, or when no
/// reconstructed header row is dense enough.
///
/// On success the region spans from the first header candidate to the last
/// footer candidate (or to the table itself when there is none) and carries
/// the header rows followed by the body rows, rendered as pipe rows.
fn build_geometric_table_region(
    doc: &PdfDocument,
    table_idx: usize,
    table: &crate::models::table::TableBorder,
) -> Option<GeometricTableRegion> {
    let mut table_rows = collect_table_border_rows(table);
    if table_rows.is_empty() || table.num_columns < 3 {
        return None;
    }
    merge_continuation_rows(&mut table_rows);

    let column_ranges = table_column_ranges(table)?;
    // Text elements directly above the table that may hold its header.
    let candidate_indices = collect_table_header_candidate_indices(doc, table_idx, table);
    if candidate_indices.is_empty() {
        return None;
    }

    let needs_external_stub =
        infer_left_stub_requirement(doc, &candidate_indices, &table_rows, &column_ranges);
    let supports_embedded_stub_header =
        supports_embedded_stub_header(&table_rows, &column_ranges, doc, &candidate_indices);
    if !needs_external_stub && !supports_embedded_stub_header {
        return None;
    }
    // With an external stub the slots are derived from the candidates;
    // otherwise the table's own column ranges are used as slots.
    let slot_ranges = if needs_external_stub {
        slot_ranges(&column_ranges, doc, &candidate_indices, true)?
    } else {
        column_ranges.clone()
    };
    let mut header_rows = reconstruct_aligned_rows(doc, &candidate_indices, &slot_ranges, true, 2);
    if header_rows.is_empty() {
        return None;
    }
    if needs_external_stub {
        normalize_leading_stub_header(&mut header_rows);
    } else {
        promote_embedded_stub_header(&mut header_rows, &table_rows);
    }

    let slot_count = slot_ranges.len();
    // A header row is "dense" when it fills at least max(slot_count - 1, 2) cells.
    let dense_header_rows = header_rows
        .iter()
        .filter(|row| {
            row.iter().filter(|cell| !cell.trim().is_empty()).count()
                >= slot_count.saturating_sub(1).max(2)
        })
        .count();
    if dense_header_rows == 0 {
        return None;
    }

    let mut combined_rows = Vec::new();
    combined_rows.extend(header_rows);

    // Text elements directly below the table.
    let following_indices = collect_table_footer_candidate_indices(doc, table_idx, table);
    let body_rows = if needs_external_stub && should_merge_panel_body_rows(&table_rows) {
        // Panel layout: collapse the table rows and the trailing text rows
        // into a single body row.
        let trailing_rows =
            reconstruct_aligned_rows(doc, &following_indices, &slot_ranges, false, 1);
        vec![merge_panel_body_row(
            &table_rows,
            &trailing_rows,
            slot_count,
        )]
    } else if needs_external_stub {
        // Prepend an empty cell to every row to make room for the stub column.
        table_rows
            .iter()
            .map(|row| {
                let mut shifted = vec![String::new()];
                shifted.extend(row.iter().cloned());
                shifted
            })
            .collect()
    } else {
        table_rows
    };

    if body_rows.is_empty() {
        return None;
    }
    combined_rows.extend(body_rows);

    let rendered = render_pipe_rows(&combined_rows);
    Some(GeometricTableRegion {
        start_idx: candidate_indices[0],
        end_idx: following_indices.last().copied().unwrap_or(table_idx),
        rendered,
    })
}
9816
9817fn table_column_ranges(table: &crate::models::table::TableBorder) -> Option<Vec<(f64, f64)>> {
9818    if table.num_columns == 0 {
9819        return None;
9820    }
9821
9822    let mut ranges = vec![(f64::INFINITY, f64::NEG_INFINITY); table.num_columns];
9823    for row in &table.rows {
9824        for cell in &row.cells {
9825            if cell.col_number >= table.num_columns {
9826                continue;
9827            }
9828            let range = &mut ranges[cell.col_number];
9829            range.0 = range.0.min(cell.bbox.left_x);
9830            range.1 = range.1.max(cell.bbox.right_x);
9831        }
9832    }
9833
9834    if ranges
9835        .iter()
9836        .any(|(left, right)| !left.is_finite() || !right.is_finite() || right <= left)
9837    {
9838        return None;
9839    }
9840
9841    Some(ranges)
9842}
9843
9844fn collect_table_header_candidate_indices(
9845    doc: &PdfDocument,
9846    table_idx: usize,
9847    table: &crate::models::table::TableBorder,
9848) -> Vec<usize> {
9849    let mut indices = Vec::new();
9850    let table_page = table.bbox.page_number;
9851    let table_top = table.bbox.top_y;
9852    let mut cursor = table_idx;
9853
9854    while let Some(prev_idx) = cursor.checked_sub(1) {
9855        let element = &doc.kids[prev_idx];
9856        if element.page_number() != table_page {
9857            break;
9858        }
9859        if !is_geometric_text_candidate(element) {
9860            break;
9861        }
9862
9863        let bbox = element.bbox();
9864        let vertical_gap = bbox.bottom_y - table_top;
9865        if !(-6.0..=260.0).contains(&vertical_gap) {
9866            break;
9867        }
9868
9869        indices.push(prev_idx);
9870        cursor = prev_idx;
9871        if indices.len() >= 10 {
9872            break;
9873        }
9874    }
9875
9876    indices.reverse();
9877    indices
9878}
9879
9880fn collect_table_footer_candidate_indices(
9881    doc: &PdfDocument,
9882    table_idx: usize,
9883    table: &crate::models::table::TableBorder,
9884) -> Vec<usize> {
9885    let mut indices = Vec::new();
9886    let table_page = table.bbox.page_number;
9887    let table_bottom = table.bbox.bottom_y;
9888
9889    for idx in table_idx + 1..doc.kids.len() {
9890        let element = &doc.kids[idx];
9891        if element.page_number() != table_page {
9892            break;
9893        }
9894        if !is_geometric_text_candidate(element) {
9895            break;
9896        }
9897        if looks_like_margin_page_number(doc, element, &extract_element_text(element)) {
9898            break;
9899        }
9900
9901        let bbox = element.bbox();
9902        let gap = table_bottom - bbox.top_y;
9903        if !(-6.0..=28.0).contains(&gap) {
9904            break;
9905        }
9906        indices.push(idx);
9907        if indices.len() >= 4 {
9908            break;
9909        }
9910    }
9911
9912    indices
9913}
9914
9915fn is_geometric_text_candidate(element: &ContentElement) -> bool {
9916    matches!(
9917        element,
9918        ContentElement::Paragraph(_)
9919            | ContentElement::Heading(_)
9920            | ContentElement::NumberHeading(_)
9921            | ContentElement::TextBlock(_)
9922            | ContentElement::TextLine(_)
9923    )
9924}
9925
9926fn infer_left_stub_requirement(
9927    doc: &PdfDocument,
9928    candidate_indices: &[usize],
9929    table_rows: &[Vec<String>],
9930    column_ranges: &[(f64, f64)],
9931) -> bool {
9932    if column_ranges.is_empty() {
9933        return false;
9934    }
9935
9936    let first_width = (column_ranges[0].1 - column_ranges[0].0).max(1.0);
9937    let has_left_label = candidate_indices.iter().any(|idx| {
9938        let bbox = doc.kids[*idx].bbox();
9939        bbox.right_x <= column_ranges[0].0 + first_width * 0.12
9940            && bbox.width() <= first_width * 0.45
9941    });
9942    if !has_left_label {
9943        return false;
9944    }
9945
9946    let mut first_col_word_counts: Vec<usize> = table_rows
9947        .iter()
9948        .filter_map(|row| row.first())
9949        .map(|cell| cell.split_whitespace().count())
9950        .collect();
9951    if first_col_word_counts.is_empty() {
9952        return false;
9953    }
9954    first_col_word_counts.sort_unstable();
9955    let median = first_col_word_counts[first_col_word_counts.len() / 2];
9956    median >= 5
9957}
9958
9959fn supports_embedded_stub_header(
9960    table_rows: &[Vec<String>],
9961    column_ranges: &[(f64, f64)],
9962    doc: &PdfDocument,
9963    candidate_indices: &[usize],
9964) -> bool {
9965    if table_rows.len() < 2 || column_ranges.len() < 3 {
9966        return false;
9967    }
9968
9969    let first_row = &table_rows[0];
9970    if first_row.len() != column_ranges.len() || first_row[0].trim().is_empty() {
9971        return false;
9972    }
9973    if first_row[0].split_whitespace().count() > 3 || first_row[0].trim().len() > 24 {
9974        return false;
9975    }
9976
9977    let data_fill = first_row
9978        .iter()
9979        .skip(1)
9980        .filter(|cell| !cell.trim().is_empty())
9981        .count();
9982    if data_fill + 1 < column_ranges.len() {
9983        return false;
9984    }
9985
9986    let labeled_rows = table_rows
9987        .iter()
9988        .skip(1)
9989        .filter(|row| row.first().is_some_and(|cell| !cell.trim().is_empty()))
9990        .count();
9991    if labeled_rows == 0 {
9992        return false;
9993    }
9994
9995    let slot_ranges = column_ranges.to_vec();
9996    let header_rows = reconstruct_aligned_rows(doc, candidate_indices, &slot_ranges, true, 2);
9997    header_rows.iter().any(|row| {
9998        row.first().is_none_or(|cell| cell.trim().is_empty())
9999            && row
10000                .iter()
10001                .skip(1)
10002                .filter(|cell| !cell.trim().is_empty())
10003                .count()
10004                >= column_ranges.len().saturating_sub(1)
10005    })
10006}
10007
10008fn slot_ranges(
10009    column_ranges: &[(f64, f64)],
10010    doc: &PdfDocument,
10011    candidate_indices: &[usize],
10012    needs_stub: bool,
10013) -> Option<Vec<(f64, f64)>> {
10014    let mut slots = Vec::new();
10015    if needs_stub {
10016        let first_left = column_ranges.first()?.0;
10017        let left_stub_start = candidate_indices
10018            .iter()
10019            .map(|idx| doc.kids[*idx].bbox().left_x)
10020            .fold(first_left, f64::min);
10021        let stub_right = first_left - 1.0;
10022        if stub_right <= left_stub_start {
10023            return None;
10024        }
10025        slots.push((left_stub_start, stub_right));
10026    }
10027    slots.extend(column_ranges.iter().copied());
10028    Some(slots)
10029}
10030
10031fn reconstruct_aligned_rows(
10032    doc: &PdfDocument,
10033    candidate_indices: &[usize],
10034    slot_ranges: &[(f64, f64)],
10035    drop_wide_singletons: bool,
10036    min_filled_slots: usize,
10037) -> Vec<Vec<String>> {
10038    if candidate_indices.is_empty() || slot_ranges.is_empty() {
10039        return Vec::new();
10040    }
10041
10042    let mut row_bands: Vec<(BoundingBox, Vec<String>)> = Vec::new();
10043
10044    for idx in candidate_indices {
10045        for line in extract_chunk_lines(&doc.kids[*idx]) {
10046            let fragments = split_line_into_slot_fragments(&line, slot_ranges);
10047            if fragments.is_empty() {
10048                continue;
10049            }
10050
10051            if drop_wide_singletons && fragments.len() == 1 {
10052                let only = &fragments[0];
10053                let span_width = only.bbox.width();
10054                let table_width =
10055                    slot_ranges.last().map(|(_, right)| *right).unwrap_or(0.0) - slot_ranges[0].0;
10056                if span_width >= table_width * 0.55 {
10057                    continue;
10058                }
10059            }
10060
10061            let line_center = line.bbox.center_y();
10062            let tolerance = line
10063                .chunks
10064                .iter()
10065                .map(|chunk| chunk.font_size)
10066                .fold(8.0, f64::max)
10067                * 0.8;
10068
10069            let mut target_row = None;
10070            for (row_idx, (bbox, _)) in row_bands.iter().enumerate() {
10071                if (bbox.center_y() - line_center).abs() <= tolerance {
10072                    target_row = Some(row_idx);
10073                    break;
10074                }
10075            }
10076
10077            if let Some(row_idx) = target_row {
10078                let (bbox, cells) = &mut row_bands[row_idx];
10079                *bbox = bbox.union(&line.bbox);
10080                for fragment in fragments {
10081                    append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10082                }
10083            } else {
10084                let mut cells = vec![String::new(); slot_ranges.len()];
10085                for fragment in fragments {
10086                    append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10087                }
10088                row_bands.push((line.bbox.clone(), cells));
10089            }
10090        }
10091    }
10092
10093    row_bands.sort_by(|left, right| {
10094        right
10095            .0
10096            .top_y
10097            .partial_cmp(&left.0.top_y)
10098            .unwrap_or(std::cmp::Ordering::Equal)
10099    });
10100
10101    row_bands
10102        .into_iter()
10103        .map(|(_, cells)| cells)
10104        .filter(|cells| {
10105            let filled = cells.iter().filter(|cell| !cell.trim().is_empty()).count();
10106            filled >= min_filled_slots
10107        })
10108        .collect()
10109}
10110
10111fn extract_chunk_lines(element: &ContentElement) -> Vec<ChunkLine> {
10112    match element {
10113        ContentElement::Paragraph(p) => chunk_lines_from_semantic_node(&p.base),
10114        ContentElement::Heading(h) => chunk_lines_from_semantic_node(&h.base.base),
10115        ContentElement::NumberHeading(nh) => chunk_lines_from_semantic_node(&nh.base.base.base),
10116        ContentElement::TextBlock(tb) => tb
10117            .text_lines
10118            .iter()
10119            .map(|line| ChunkLine {
10120                bbox: line.bbox.clone(),
10121                chunks: line.text_chunks.clone(),
10122            })
10123            .collect(),
10124        ContentElement::TextLine(tl) => vec![ChunkLine {
10125            bbox: tl.bbox.clone(),
10126            chunks: tl.text_chunks.clone(),
10127        }],
10128        _ => Vec::new(),
10129    }
10130}
10131
10132fn chunk_lines_from_semantic_node(node: &SemanticTextNode) -> Vec<ChunkLine> {
10133    let mut lines = Vec::new();
10134    for column in &node.columns {
10135        for block in &column.text_blocks {
10136            for line in &block.text_lines {
10137                lines.push(ChunkLine {
10138                    bbox: line.bbox.clone(),
10139                    chunks: line.text_chunks.clone(),
10140                });
10141            }
10142        }
10143    }
10144    lines
10145}
10146
10147fn split_line_into_slot_fragments(
10148    line: &ChunkLine,
10149    slot_ranges: &[(f64, f64)],
10150) -> Vec<SlotFragment> {
10151    let mut groups: Vec<(usize, Vec<TextChunk>, BoundingBox)> = Vec::new();
10152
10153    for chunk in line
10154        .chunks
10155        .iter()
10156        .filter(|chunk| !chunk.value.trim().is_empty())
10157        .cloned()
10158    {
10159        let slot_idx = assign_chunk_to_slot(&chunk.bbox, slot_ranges);
10160        if let Some((prev_slot, prev_chunks, prev_bbox)) = groups.last_mut() {
10161            let gap = chunk.bbox.left_x - prev_bbox.right_x;
10162            if *prev_slot == slot_idx && gap <= chunk.font_size.max(6.0) * 2.4 {
10163                *prev_bbox = prev_bbox.union(&chunk.bbox);
10164                prev_chunks.push(chunk);
10165                continue;
10166            }
10167        }
10168        groups.push((slot_idx, vec![chunk.clone()], chunk.bbox.clone()));
10169    }
10170
10171    groups
10172        .into_iter()
10173        .filter_map(|(slot_idx, chunks, bbox)| {
10174            let text = normalize_common_ocr_text(
10175                &crate::models::text::TextLine::concatenate_chunks(&chunks),
10176            );
10177            if text.trim().is_empty() {
10178                None
10179            } else {
10180                Some(SlotFragment {
10181                    slot_idx,
10182                    bbox,
10183                    text,
10184                })
10185            }
10186        })
10187        .collect()
10188}
10189
10190fn assign_chunk_to_slot(bbox: &BoundingBox, slot_ranges: &[(f64, f64)]) -> usize {
10191    let mut best_idx = 0usize;
10192    let mut best_overlap = f64::NEG_INFINITY;
10193    let center_x = bbox.center_x();
10194
10195    for (idx, (left, right)) in slot_ranges.iter().enumerate() {
10196        let overlap = (bbox.right_x.min(*right) - bbox.left_x.max(*left)).max(0.0);
10197        let score = if overlap > 0.0 {
10198            overlap / bbox.width().max(1.0)
10199        } else {
10200            -((center_x - ((*left + *right) / 2.0)).abs())
10201        };
10202        if score > best_overlap {
10203            best_overlap = score;
10204            best_idx = idx;
10205        }
10206    }
10207
10208    best_idx
10209}
10210
/// Appends the trimmed `fragment` to `cell`, inserting a single space first
/// when `cell` already holds any text. Blank fragments leave `cell` untouched.
fn append_cell_text(cell: &mut String, fragment: &str) {
    match (cell.is_empty(), fragment.trim()) {
        (_, "") => {}
        (true, piece) => cell.push_str(piece),
        (false, piece) => {
            cell.push(' ');
            cell.push_str(piece);
        }
    }
}
10221
/// Moves a stub label from the second row up into an empty header stub cell.
///
/// Applies only when the first row's first cell is blank, the second row's
/// first cell is not, and both rows have at least two non-blank cells after
/// the first. The label is stored trimmed and the second row's first cell is
/// cleared.
fn normalize_leading_stub_header(rows: &mut [Vec<String>]) {
    let [header, next, ..] = rows else {
        return;
    };
    if header.is_empty() || next.is_empty() {
        return;
    }

    let header_stub_blank = header[0].trim().is_empty();
    let next_stub_blank = next[0].trim().is_empty();
    if !header_stub_blank || next_stub_blank {
        return;
    }

    let filled = |cells: &[String]| cells.iter().filter(|cell| !cell.trim().is_empty()).count();
    if filled(&header[1..]) < 2 || filled(&next[1..]) < 2 {
        return;
    }

    let label = next[0].trim().to_string();
    next[0].clear();
    header[0] = label;
}
10248
/// Copies a short stub label from the first body row into the blank first
/// cell of the first header row.
///
/// Nothing happens unless: the header's first cell is blank; the body's first
/// cell (trimmed) is non-empty, at most 3 words and at most 24 bytes; and
/// every cell after the first is non-blank in both rows. The body row itself
/// is never modified.
fn promote_embedded_stub_header(header_rows: &mut [Vec<String>], table_rows: &[Vec<String>]) {
    let (Some(header_row), Some(first_body_row)) = (header_rows.first_mut(), table_rows.first())
    else {
        return;
    };
    let (Some(header_stub), Some(body_stub)) = (header_row.first(), first_body_row.first()) else {
        return;
    };
    if !header_stub.trim().is_empty() {
        return;
    }

    let promoted = body_stub.trim();
    let label_ok =
        !promoted.is_empty() && promoted.split_whitespace().count() <= 3 && promoted.len() <= 24;
    if !label_ok {
        return;
    }

    // Both rows were just shown to be non-empty, so `[1..]` cannot panic.
    let fully_filled = |cells: &[String]| cells.iter().all(|cell| !cell.trim().is_empty());
    if !fully_filled(&header_row[1..]) || !fully_filled(&first_body_row[1..]) {
        return;
    }

    header_row[0] = promoted.to_string();
}
10286
/// Returns true when there are at least three rows and every row is
/// non-empty with no blank (whitespace-only) cell.
fn should_merge_panel_body_rows(rows: &[Vec<String>]) -> bool {
    if rows.len() < 3 {
        return false;
    }
    let has_gap = |row: &Vec<String>| row.is_empty() || row.iter().any(|cell| cell.trim().is_empty());
    !rows.iter().any(has_gap)
}
10293
10294fn merge_panel_body_row(
10295    table_rows: &[Vec<String>],
10296    trailing_rows: &[Vec<String>],
10297    slot_count: usize,
10298) -> Vec<String> {
10299    let mut merged = vec![String::new(); slot_count];
10300    for row in table_rows {
10301        for (col_idx, cell) in row.iter().enumerate() {
10302            if col_idx + 1 >= slot_count {
10303                break;
10304            }
10305            append_cell_text(&mut merged[col_idx + 1], cell);
10306        }
10307    }
10308    for row in trailing_rows {
10309        for (col_idx, cell) in row.iter().enumerate() {
10310            if col_idx >= slot_count {
10311                break;
10312            }
10313            append_cell_text(&mut merged[col_idx], cell);
10314        }
10315    }
10316    merged
10317}
10318
/// Renders rows as a Markdown pipe table, using the first row as the header.
///
/// The column count is the length of the longest row; shorter rows are padded
/// with empty cells. A `| --- |` separator line follows the first row and the
/// table is terminated by a blank line. Returns an empty string when there
/// are no rows or no columns.
///
/// Cell text is trimmed and made safe for a single table row: a `|` that is
/// not already backslash-escaped is written as `\|`, and each run of line
/// breaks (`\n` / `\r`) is replaced by one space.
fn render_pipe_rows(rows: &[Vec<String>]) -> String {
    /// Appends ` <cell> |` to `out`, sanitizing the cell text.
    fn push_cell(out: &mut String, raw: &str) {
        out.push(' ');
        let mut after_backslash = false; // previous char was an unescaped `\`
        let mut in_line_break = false;
        for ch in raw.trim().chars() {
            if ch == '\n' || ch == '\r' {
                if !in_line_break {
                    out.push(' ');
                }
                in_line_break = true;
                after_backslash = false;
                continue;
            }
            in_line_break = false;
            if ch == '|' && !after_backslash {
                out.push('\\');
            }
            out.push(ch);
            after_backslash = ch == '\\' && !after_backslash;
        }
        out.push_str(" |");
    }

    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
    if num_cols == 0 {
        return String::new();
    }

    let mut out = String::new();
    for (row_idx, row) in rows.iter().enumerate() {
        out.push('|');
        for col_idx in 0..num_cols {
            push_cell(&mut out, row.get(col_idx).map_or("", String::as_str));
        }
        out.push('\n');

        if row_idx == 0 {
            out.push('|');
            for _ in 0..num_cols {
                out.push_str(" --- |");
            }
            out.push('\n');
        }
    }
    out.push('\n');
    out
}
10349
10350fn render_html_table(rows: &[Vec<String>]) -> String {
10351    if rows.is_empty() {
10352        return String::new();
10353    }
10354
10355    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
10356    if num_cols == 0 {
10357        return String::new();
10358    }
10359
10360    let mut out = String::from("<table>\n");
10361    for (row_idx, row) in rows.iter().enumerate() {
10362        out.push_str("<tr>");
10363        for col_idx in 0..num_cols {
10364            let cell = escape_html_text(row.get(col_idx).map(String::as_str).unwrap_or("").trim());
10365            if row_idx == 0 {
10366                out.push_str("<th>");
10367                out.push_str(&cell);
10368                out.push_str("</th>");
10369            } else {
10370                out.push_str("<td>");
10371                out.push_str(&cell);
10372                out.push_str("</td>");
10373            }
10374        }
10375        out.push_str("</tr>\n");
10376    }
10377    out.push_str("</table>\n\n");
10378    out
10379}
10380
/// Escapes the five HTML-significant characters `&`, `<`, `>`, `"` and `'`
/// as `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&#39;`; all other characters
/// are copied unchanged.
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
10388
/// Extracts the ASCII digits from `text`, in order, and returns them when
/// there are exactly one or two; otherwise returns `None`.
fn normalized_numeric_marker(text: &str) -> Option<String> {
    let mut digits = String::new();
    for ch in text.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }
    match digits.len() {
        1 | 2 => Some(digits),
        _ => None,
    }
}
10396
10397fn render_infographic_card_rows(rows: &[Vec<String>]) -> Option<String> {
10398    if rows.is_empty() || !rows.iter().all(|row| row.len() == 2) {
10399        return None;
10400    }
10401
10402    let marker = normalized_numeric_marker(rows[0][0].trim())?;
10403    if rows[0][1].split_whitespace().count() < 4 {
10404        return None;
10405    }
10406    if rows
10407        .iter()
10408        .skip(1)
10409        .any(|row| normalized_numeric_marker(row[0].trim()).is_some())
10410    {
10411        return None;
10412    }
10413    if rows
10414        .iter()
10415        .skip(1)
10416        .any(|row| !row[0].trim().is_empty() && row[0].trim().len() > 2)
10417    {
10418        return None;
10419    }
10420
10421    let body = rows
10422        .iter()
10423        .filter_map(|row| row.get(1))
10424        .map(|cell| cell.trim())
10425        .filter(|cell| !cell.is_empty())
10426        .collect::<Vec<_>>()
10427        .join(" ");
10428    if body.split_whitespace().count() < 8 {
10429        return None;
10430    }
10431
10432    Some(format!("{marker}. {body}\n\n"))
10433}
10434
10435fn extract_element_text(element: &ContentElement) -> String {
10436    match element {
10437        ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
10438        ContentElement::Heading(h) => clean_paragraph_text(&h.base.base.value()),
10439        ContentElement::NumberHeading(nh) => clean_paragraph_text(&nh.base.base.base.value()),
10440        ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
10441        ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
10442        _ => String::new(),
10443    }
10444}
10445
10446/// Collect rendered rows from a single TableBorder (no cross-page chaining).
10447fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
10448    let num_cols = table.num_columns.max(1);
10449    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
10450    for row in &table.rows {
10451        let cell_texts: Vec<String> = (0..num_cols)
10452            .map(|col| {
10453                row.cells
10454                    .iter()
10455                    .find(|c| c.col_number == col)
10456                    .map(cell_text_content)
10457                    .unwrap_or_default()
10458            })
10459            .collect();
10460        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
10461            rendered_rows.push(cell_texts);
10462        }
10463    }
10464    rendered_rows
10465}
10466
10467/// Render a TableBorder directly as a markdown table.
10468///
10469/// When the table has a `next_table` link (cross-page continuation), the
10470/// continuation rows are appended so the entire logical table is emitted
10471/// as a single pipe table.
10472fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
10473    if table.rows.is_empty() {
10474        return;
10475    }
10476
10477    // Collect rows from this table.
10478    let mut rendered_rows = collect_table_border_rows(table);
10479
10480    if rendered_rows.is_empty() {
10481        return;
10482    }
10483
10484    if let Some(rendered) = render_infographic_card_rows(&rendered_rows) {
10485        out.push_str(&rendered);
10486        return;
10487    }
10488
10489    // Merge multi-line header rows into a single header row.
10490    merge_continuation_rows(&mut rendered_rows);
10491    trim_leading_table_carryover_rows(&mut rendered_rows);
10492
10493    // ToC detection: render table-of-contents as plain text pairs, not a markdown table.
10494    if is_toc_table(&rendered_rows) {
10495        render_toc_rows(out, &rendered_rows);
10496        return;
10497    }
10498
10499    out.push_str(&render_pipe_rows(&rendered_rows));
10500}
10501
/// Returns true if `text` looks like a page number: Arabic digits or a Roman
/// numeral (either case).
///
/// After trimming, the text qualifies when it is
/// - 1–5 ASCII digits, or
/// - at most 10 characters forming a well-formed Roman numeral in standard
///   subtractive notation (`iv`, `xii`, `XLII`, `mcmxc`, …).
///
/// Ordinary words that merely consist of the letters `i v x l c d m`
/// (such as "dim", "civil" or "mild") are not treated as page numbers.
fn is_page_number_like(text: &str) -> bool {
    /// Consumes one decimal place of a lowercase Roman numeral from the front
    /// of `s` (e.g. for the tens place: `xc`, `xl`, or an optional `l`
    /// followed by up to three `x`) and returns the remainder.
    fn strip_place<'a>(s: &'a str, nine: &str, four: &str, five: char, one: char) -> &'a str {
        if let Some(rest) = s.strip_prefix(nine).or_else(|| s.strip_prefix(four)) {
            return rest;
        }
        let s = s.strip_prefix(five).unwrap_or(s);
        // `one` is ASCII, so the character count equals the byte offset.
        let ones = s.chars().take_while(|&c| c == one).take(3).count();
        &s[ones..]
    }

    /// True when the non-empty, lowercase `lower` is fully consumed by
    /// thousands (up to four `m`), hundreds, tens and units, in that order.
    fn is_roman_numeral(lower: &str) -> bool {
        let thousands = lower.chars().take_while(|&c| c == 'm').take(4).count();
        let rest = &lower[thousands..];
        let rest = strip_place(rest, "cm", "cd", 'd', 'c');
        let rest = strip_place(rest, "xc", "xl", 'l', 'x');
        strip_place(rest, "ix", "iv", 'v', 'i').is_empty()
    }

    let t = text.trim();
    if t.is_empty() {
        return false;
    }
    // All ASCII digits, length ≤ 5 (handles pages 1–99999)
    if t.len() <= 5 && t.bytes().all(|b| b.is_ascii_digit()) {
        return true;
    }
    // Roman numerals in either case (i, ii, iii, iv, v, vi, vii, viii, ix, x …)
    t.len() <= 10 && is_roman_numeral(&t.to_ascii_lowercase())
}
10519
10520/// Returns true if the rendered rows look like a table-of-contents:
10521/// exactly 2 columns where the majority of right-column cells are page numbers.
10522fn is_toc_table(rows: &[Vec<String>]) -> bool {
10523    if rows.is_empty() {
10524        return false;
10525    }
10526    // Need at least 2 rows to qualify as a ToC
10527    if rows.len() < 2 {
10528        return false;
10529    }
10530    // First, every row must have exactly 2 cells
10531    if !rows.iter().all(|r| r.len() == 2) {
10532        return false;
10533    }
10534
10535    let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
10536    if non_empty_right < 2 {
10537        return false;
10538    }
10539
10540    let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
10541    page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
10542}
10543
/// Render ToC-style rows as plain text (`title page` pairs, one per line)
/// rather than a markdown table, followed by a blank line.
///
/// The first cell of each row is the title and the second the page; both are
/// trimmed. Rows where both are blank are skipped, and a row with only one of
/// the two is written without a separating space. Rows with fewer than two
/// cells are tolerated: a missing cell is treated as empty instead of
/// panicking.
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let title = row.first().map_or("", |cell| cell.trim());
        let page = row.get(1).map_or("", |cell| cell.trim());
        if title.is_empty() && page.is_empty() {
            continue;
        }

        out.push_str(title);
        if !title.is_empty() && !page.is_empty() {
            out.push(' ');
        }
        out.push_str(page);
        out.push('\n');
    }
    out.push('\n');
}
10564
10565/// Extract text content from a table cell.
10566fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
10567    // First try the content tokens — use gap-based concatenation instead of
10568    // naive space-joining so that letter-spaced text ("O w n e r s h i p")
10569    // is collapsed correctly.
10570    if !cell.content.is_empty() {
10571        let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
10572        return normalize_common_ocr_text(&crate::models::text::TextLine::concatenate_chunks(
10573            &chunks,
10574        ));
10575    }
10576    // Fall back to processed contents
10577    let mut text = String::new();
10578    for elem in &cell.contents {
10579        match elem {
10580            ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
10581            ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
10582            ContentElement::TextLine(tl) => text.push_str(&tl.value()),
10583            ContentElement::TextChunk(tc) => text.push_str(&tc.value),
10584            _ => {}
10585        }
10586    }
10587    normalize_common_ocr_text(&repair_fragmented_words(&text))
10588}
10589
10590/// Merge adjacent pipe tables that share the same column count.
10591///
10592/// PDF table detection sometimes splits one visual table into several
10593/// fragments that are emitted as successive pipe tables.  When two tables
10594/// are separated only by blank lines and have identical column counts,
10595/// they are merged into a single table by appending the second table's
10596/// rows (including its header-now-body row) to the first.
10597fn merge_adjacent_pipe_tables(markdown: &str) -> String {
10598    let lines: Vec<&str> = markdown.lines().collect();
10599    if lines.len() < 4 {
10600        return markdown.to_string();
10601    }
10602
10603    fn count_pipe_cols(line: &str) -> usize {
10604        let t = line.trim();
10605        if !t.starts_with('|') || !t.ends_with('|') {
10606            return 0;
10607        }
10608        t.split('|').count().saturating_sub(2)
10609    }
10610
10611    fn is_separator(line: &str) -> bool {
10612        let t = line.trim();
10613        if !t.starts_with('|') || !t.ends_with('|') {
10614            return false;
10615        }
10616        let cells: Vec<&str> = t.split('|').collect();
10617        if cells.len() < 3 {
10618            return false;
10619        }
10620        cells[1..cells.len() - 1].iter().all(|c| {
10621            let s = c.trim();
10622            !s.is_empty() && s.chars().all(|ch| ch == '-' || ch == ':')
10623        })
10624    }
10625
10626    fn is_pipe_row(line: &str) -> bool {
10627        let t = line.trim();
10628        t.starts_with('|') && t.ends_with('|') && t.len() > 2
10629    }
10630
10631    fn pipe_cells(line: &str) -> Vec<String> {
10632        let t = line.trim();
10633        if !is_pipe_row(t) {
10634            return Vec::new();
10635        }
10636        let parts = t.split('|').collect::<Vec<_>>();
10637        parts[1..parts.len() - 1]
10638            .iter()
10639            .map(|cell| cell.trim().to_string())
10640            .collect()
10641    }
10642
10643    fn normalize_header_cell(cell: &str) -> String {
10644        cell.chars()
10645            .filter(|ch| ch.is_alphanumeric())
10646            .flat_map(|ch| ch.to_lowercase())
10647            .collect()
10648    }
10649
10650    fn looks_like_header_row(line: &str) -> bool {
10651        let cells = pipe_cells(line);
10652        if cells.len() < 2 {
10653            return false;
10654        }
10655
10656        let non_empty = cells
10657            .iter()
10658            .filter(|cell| !cell.trim().is_empty())
10659            .collect::<Vec<_>>();
10660        if non_empty.len() < 2 {
10661            return false;
10662        }
10663
10664        let headerish = non_empty.iter().all(|cell| {
10665            let trimmed = cell.trim();
10666            let word_count = trimmed.split_whitespace().count();
10667            let has_alpha = trimmed.chars().any(|ch| ch.is_alphabetic());
10668            has_alpha && word_count <= 4 && trimmed.len() <= 28
10669        });
10670        headerish
10671    }
10672
10673    fn header_overlap_ratio(left: &str, right: &str) -> f64 {
10674        let left_cells = pipe_cells(left)
10675            .into_iter()
10676            .map(|cell| normalize_header_cell(&cell))
10677            .collect::<Vec<_>>();
10678        let right_cells = pipe_cells(right)
10679            .into_iter()
10680            .map(|cell| normalize_header_cell(&cell))
10681            .collect::<Vec<_>>();
10682        let width = left_cells.len().min(right_cells.len());
10683        if width == 0 {
10684            return 0.0;
10685        }
10686
10687        let matches = (0..width)
10688            .filter(|idx| {
10689                !left_cells[*idx].is_empty()
10690                    && !right_cells[*idx].is_empty()
10691                    && left_cells[*idx] == right_cells[*idx]
10692            })
10693            .count();
10694        matches as f64 / width as f64
10695    }
10696
10697    fn header_schema_matches(left: &str, right: &str) -> bool {
10698        let left_cells = pipe_cells(left)
10699            .into_iter()
10700            .map(|cell| normalize_header_cell(&cell))
10701            .collect::<Vec<_>>();
10702        let right_cells = pipe_cells(right)
10703            .into_iter()
10704            .map(|cell| normalize_header_cell(&cell))
10705            .collect::<Vec<_>>();
10706        if left_cells.len() != right_cells.len() || left_cells.len() < 2 {
10707            return false;
10708        }
10709
10710        let mut aligned_non_empty = 0usize;
10711        for (left, right) in left_cells.iter().zip(right_cells.iter()) {
10712            if left.is_empty() || right.is_empty() {
10713                continue;
10714            }
10715            aligned_non_empty += 1;
10716            if left != right {
10717                return false;
10718            }
10719        }
10720
10721        aligned_non_empty >= 2
10722    }
10723
10724    fn pad_pipe_row(line: &str, target_cols: usize) -> String {
10725        let t = line.trim();
10726        let current_cols = count_pipe_cols(t);
10727        if current_cols >= target_cols {
10728            return t.to_string();
10729        }
10730        // Append extra empty cells after the existing trailing |
10731        let mut result = t.to_string();
10732        for _ in current_cols..target_cols {
10733            result.push_str("  |");
10734        }
10735        result
10736    }
10737
10738    // Identify pipe table blocks: (start, sep_idx, end, col_count).
10739    struct Block {
10740        start: usize,
10741        sep: usize,
10742        end: usize, // inclusive last line
10743        cols: usize,
10744    }
10745
10746    let mut blocks: Vec<Block> = Vec::new();
10747    let mut i = 0;
10748    while i < lines.len() {
10749        if i + 1 < lines.len() && is_pipe_row(lines[i]) && is_separator(lines[i + 1]) {
10750            let cols = count_pipe_cols(lines[i]);
10751            let sep = i + 1;
10752            let mut end = sep;
10753            let mut j = sep + 1;
10754            while j < lines.len() && is_pipe_row(lines[j]) && !is_separator(lines[j]) {
10755                end = j;
10756                j += 1;
10757            }
10758            blocks.push(Block {
10759                start: i,
10760                sep,
10761                end,
10762                cols,
10763            });
10764            i = end + 1;
10765        } else {
10766            i += 1;
10767        }
10768    }
10769
10770    if blocks.len() < 2 {
10771        return markdown.to_string();
10772    }
10773
10774    // Group adjacent blocks: allow different column counts.
10775    // Merge when separated by blank lines only, or by heading markers
10776    // (lines starting with #) that represent table cells misclassified
10777    // as headings by the pipeline.
10778    // Track group max cols during merge to use for heading gap decisions.
10779    let mut merge_leader: Vec<Option<usize>> = vec![None; blocks.len()];
10780    let mut group_cols: Vec<usize> = blocks.iter().map(|b| b.cols).collect();
10781    for bi in 1..blocks.len() {
10782        let prev = &blocks[bi - 1];
10783        let curr = &blocks[bi];
10784        let gap_range = prev.end + 1..curr.start;
10785        let gap_all_blank = gap_range.clone().all(|li| lines[li].trim().is_empty());
10786        // For heading gap check, use the group's max cols (not individual block).
10787        // This handles chains like [2-col] → blank → [1-col] → heading → [2-col]
10788        // where the 1-col intermediary is already merged with the 2-col leader.
10789        let leader_idx = merge_leader[bi - 1].unwrap_or(bi - 1);
10790        let effective_prev_cols = group_cols[leader_idx];
10791        let gap_heading_only = if !gap_all_blank && effective_prev_cols >= 2 && curr.cols >= 2 {
10792            let non_blank: Vec<usize> = gap_range
10793                .clone()
10794                .filter(|li| !lines[*li].trim().is_empty())
10795                .collect();
10796            // Only merge when gap has 1-2 heading lines
10797            !non_blank.is_empty()
10798                && non_blank.len() <= 2
10799                && non_blank.iter().all(|li| {
10800                    let t = lines[*li].trim();
10801                    t.starts_with('#') && t.len() < 100
10802                })
10803        } else {
10804            false
10805        };
10806        // Short displaced cell: a single short plain-text word between two
10807        // multi-column tables is almost certainly a cell value that the PDF
10808        // pipeline displaced out of the table grid.
10809        let gap_short_fragment =
10810            if !gap_all_blank && !gap_heading_only && effective_prev_cols >= 2 && curr.cols >= 2 {
10811                let non_blank: Vec<usize> = gap_range
10812                    .clone()
10813                    .filter(|li| !lines[*li].trim().is_empty())
10814                    .collect();
10815                non_blank.len() == 1 && {
10816                    let t = lines[non_blank[0]].trim();
10817                    t.len() < 30
10818                        && !t.starts_with('#')
10819                        && !t.starts_with('-')
10820                        && !t.starts_with('*')
10821                        && !t.contains(':')
10822                        && !t.contains("TABLE")
10823                }
10824            } else {
10825                false
10826            };
10827        let prev_has_header = looks_like_header_row(lines[prev.start]);
10828        let curr_has_header = curr.end >= curr.sep + 2 && looks_like_header_row(lines[curr.start]);
10829        let curr_has_distinct_header = prev_has_header
10830            && curr_has_header
10831            && !header_schema_matches(lines[prev.start], lines[curr.start])
10832            && (curr.cols != prev.cols
10833                || header_overlap_ratio(lines[prev.start], lines[curr.start]) < 1.0);
10834
10835        if (gap_all_blank || gap_heading_only || gap_short_fragment)
10836            && prev.cols > 0
10837            && curr.cols > 0
10838            && !curr_has_distinct_header
10839        {
10840            merge_leader[bi] = Some(leader_idx);
10841            // Update group max cols
10842            if curr.cols > group_cols[leader_idx] {
10843                group_cols[leader_idx] = curr.cols;
10844            }
10845        }
10846    }
10847
10848    let mut pad_target: Vec<usize> = vec![0; blocks.len()];
10849    for bi in 0..blocks.len() {
10850        let leader = merge_leader[bi].unwrap_or(bi);
10851        pad_target[bi] = group_cols[leader];
10852    }
10853
10854    // Mark lines to skip: blank gap lines + separator of merged blocks.
10855    // Non-blank gap lines become pipe table rows instead of being skipped.
10856    // Keep the header row (curr.start) — it becomes a data row.
10857    let mut skip = vec![false; lines.len()];
10858    let mut convert_to_pipe_row = vec![false; lines.len()];
10859    for (bi, leader) in merge_leader.iter().enumerate() {
10860        if leader.is_none() {
10861            continue;
10862        }
10863        let prev_end = blocks[bi - 1].end;
10864        let curr = &blocks[bi];
10865        for li in (prev_end + 1)..curr.start {
10866            if lines[li].trim().is_empty() {
10867                skip[li] = true;
10868            } else {
10869                // Non-blank gap line: convert to pipe row
10870                convert_to_pipe_row[li] = true;
10871            }
10872        }
10873        // Only skip separator, header row becomes a data row
10874        skip[curr.sep] = true;
10875    }
10876
10877    // Map each line to its block index (or the block it belongs to via gap conversion).
10878    let mut line_to_block: Vec<Option<usize>> = vec![None; lines.len()];
10879    for (bi, block) in blocks.iter().enumerate() {
10880        line_to_block[block.start..=block.end].fill(Some(bi));
10881    }
10882    // Assign gap lines to the preceding block for padding purposes.
10883    for (bi, leader) in merge_leader.iter().enumerate() {
10884        if leader.is_none() {
10885            continue;
10886        }
10887        let prev_end = blocks[bi - 1].end;
10888        let curr = &blocks[bi];
10889        for li in (prev_end + 1)..curr.start {
10890            if convert_to_pipe_row[li] {
10891                line_to_block[li] = Some(bi - 1);
10892            }
10893        }
10894    }
10895
10896    let mut result = String::new();
10897    for (li, line) in lines.iter().enumerate() {
10898        if skip[li] {
10899            continue;
10900        }
10901        if convert_to_pipe_row[li] {
10902            // Convert non-blank gap text/heading into a pipe table row.
10903            let text = line.trim().trim_start_matches('#').trim();
10904            if let Some(bi) = line_to_block[li] {
10905                let target = pad_target[bi];
10906                if target > 0 && !text.is_empty() {
10907                    result.push_str(&format!("| {} ", text));
10908                    for _ in 1..target {
10909                        result.push_str("|  ");
10910                    }
10911                    result.push_str("|\n");
10912                    continue;
10913                }
10914            }
10915            // Fallback: emit as-is if no block context
10916            result.push_str(line);
10917            result.push('\n');
10918            continue;
10919        }
10920        if let Some(bi) = line_to_block[li] {
10921            let target = pad_target[bi];
10922            if target > 0 && is_pipe_row(line) && !is_separator(line) {
10923                result.push_str(&pad_pipe_row(line, target));
10924                result.push('\n');
10925            } else if target > 0 && is_separator(line) {
10926                result.push('|');
10927                for _ in 0..target {
10928                    result.push_str(" --- |");
10929                }
10930                result.push('\n');
10931            } else {
10932                result.push_str(line);
10933                result.push('\n');
10934            }
10935        } else {
10936            result.push_str(line);
10937            result.push('\n');
10938        }
10939    }
10940
10941    result
10942}
10943
10944#[cfg(test)]
10945mod tests {
10946    use super::*;
10947    use crate::models::bbox::BoundingBox;
10948    use crate::models::chunks::TextChunk;
10949    use crate::models::content::ContentElement;
10950    use crate::models::enums::{PdfLayer, TextFormat, TextType};
10951    use crate::models::list::{ListBody, ListItem, ListLabel, PDFList};
10952    use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
10953    use crate::models::table::{
10954        TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
10955    };
10956    use crate::models::text::{TextBlock, TextColumn, TextLine};
10957
10958    #[test]
10959    fn test_empty_doc() {
10960        let doc = PdfDocument::new("test.pdf".to_string());
10961        let md = to_markdown(&doc).unwrap();
10962        assert!(md.contains("No content extracted"));
10963    }
10964
10965    #[test]
10966    fn test_with_title() {
10967        let mut doc = PdfDocument::new("test.pdf".to_string());
10968        doc.title = Some("My Title".to_string());
10969        let md = to_markdown(&doc).unwrap();
10970        assert!(md.starts_with("# My Title\n"));
10971    }
10972
10973    #[test]
10974    fn test_empty_title_not_rendered() {
10975        let mut doc = PdfDocument::new("test.pdf".to_string());
10976        doc.title = Some("  ".to_string());
10977        let md = to_markdown(&doc).unwrap();
10978        assert!(
10979            !md.contains("# "),
10980            "Empty/whitespace title should not produce a heading"
10981        );
10982    }
10983
10984    #[test]
10985    fn test_repair_fragmented_words() {
10986        assert_eq!(
10987            repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts"),
10988            "Jurisdiction Foreign Requirements"
10989        );
10990    }
10991
10992    #[test]
10993    fn test_normalize_common_ocr_text_repairs_units() {
10994        assert_eq!(
10995            normalize_common_ocr_text("10 ߤL at 37 C and -20 oC"),
10996            "10 μL at 37°C and -20°C"
10997        );
10998    }
10999
11000    #[cfg(not(target_arch = "wasm32"))]
11001    #[test]
11002    fn test_build_layout_anchor_rows_reconstructs_four_column_matrix() {
11003        let lines = vec![
11004            "Key Functions by Main Service Flow".to_string(),
11005            "".to_string(),
11006            " Service Stage                   Function Name                Explanation                                                                                Expected Benefit".to_string(),
11007            "".to_string(),
11008            " 1. Project creation             Project creation and         Select document type to automatically run project creation, Pipeline configuration with    The intuitive UI environment allows the the person in charge to quickly proceed with".to_string(),
11009            "".to_string(),
11010            "                                 management                   recommended Modelset and Endpoint deployment                                               the entire process from project creation to deployment, improving work efficiency".to_string(),
11011            "".to_string(),
11012            "                                                                                                                                                         Conveniently manage raw data to be used for OCR Pack and actual date from live".to_string(),
11013            " 2. Data labeling and            Data storage management      Provides convenient functions for uploading raw data, viewer, and data management".to_string(),
11014            "                                                              (search using image metadata, sorting, filtering, hashtags settings on image data)         service".to_string(),
11015            " fine-tuning".to_string(),
11016            "                                                              Image data bookmark for Qualitative Evaluation".to_string(),
11017            "".to_string(),
11018            "                                 Create and manage Labeling   Creating a Labeling Space to manage raw data annotation, managing labeling resources       Labeling work can be outsourced within the pack. Labeled data is continuously".to_string(),
11019            "                                                              (Ontology, Characters to be Recognized), data set dump, data set version management        supplied from which data sets can be created with ease. The Auto Labeling function".to_string(),
11020            "                                 Space".to_string(),
11021            "                                                                                                     3                                                   increases both efficiency and convenience.".to_string(),
11022            "                                                              Various basic models for each selected 5".to_string(),
11023            "                                                                                                    document, information comparison between".to_string(),
11024            "                                 Model training                                                                                                          Providing a foundation for customers to implement, manage, and upgrade their own".to_string(),
11025            "                                                              models, basic model training, training pause function, re-training, cancel function, and   OCR model specialized to the customers’ needs".to_string(),
11026            "                                                              configuration support for Characters to be Recognized and Ontology that is frequently".to_string(),
11027            "                                                              modified while developing specialized models".to_string(),
11028        ];
11029
11030        let header = find_layout_header_candidate(&lines).unwrap();
11031        let rows =
11032            build_layout_anchor_rows(&lines, &extract_layout_entries(&lines, &header)).unwrap();
11033
11034        assert_eq!(
11035            header.headers,
11036            vec![
11037                "Service Stage".to_string(),
11038                "Function Name".to_string(),
11039                "Explanation".to_string(),
11040                "Expected Benefit".to_string()
11041            ]
11042        );
11043        assert_eq!(rows.len(), 4);
11044        assert_eq!(rows[0][0], "1. Project creation");
11045        assert_eq!(rows[0][1], "Project creation and management");
11046        assert!(rows[1][0].contains("fine-tuning"));
11047        assert_eq!(rows[2][1], "Create and manage Labeling Space");
11048        assert_eq!(rows[3][1], "Model training");
11049        assert!(rows[3][2].contains("Various basic models for each selected document"));
11050    }
11051
11052    #[cfg(not(target_arch = "wasm32"))]
11053    #[test]
11054    fn test_build_layout_panel_stub_rows_reconstructs_left_stub_table() {
11055        let lines = vec![
11056            "AI Pack".to_string(),
11057            "Upstage offers 3 AI packs that process unstructured information and data".to_string(),
11058            "".to_string(),
11059            "                                     OCR                                                Recommendation                                    Product semantic search".to_string(),
11060            "".to_string(),
11061            "              A solution that recognizes characters in an                A solution that recommends the best products and   A solution that enables semantic search, analyzes and".to_string(),
11062            "              image and extracts necessary information                   contents                                           organizes key information in unstructured text data".to_string(),
11063            "   Pack".to_string(),
11064            "                                                                                                                            into a standardized form (DB)".to_string(),
11065            "".to_string(),
11066            "              Applicable to all fields that require text extraction      Applicable to all fields that use any form of      Applicable to all fields that deal with various types of".to_string(),
11067            "              from standardized documents, such as receipts,             recommendation including alternative products,     unstructured data containing text information that".to_string(),
11068            "Application   bills, credit cards, ID cards, certificates, and medical   products and contents that are likely to be        require semantic search and conversion into a DB".to_string(),
11069            "              receipts                                                   purchased next".to_string(),
11070            "".to_string(),
11071            "              Achieved 1st place in the OCR World Competition            Team with specialists and technologies that        Creation of the first natural language evaluation".to_string(),
11072            "              The team includes specialists who have                     received Kaggle’s Gold Medal recommendation        system in Korean (KLUE)".to_string(),
11073            "              presented 14 papers in the world’s most                    (Education platform)                               World’s No.1 in Kaggle text embedding competition in".to_string(),
11074            " Highlight".to_string(),
11075            "              renowned AI conferences                                    Proven superior performance of more than 170%      E-commerce subject (Shopee)".to_string(),
11076            "                                                                         compared to other global top-tier recommendation".to_string(),
11077            "                                                                         models".to_string(),
11078        ];
11079
11080        let header = find_layout_panel_header_candidate(&lines).unwrap();
11081        let rows = build_layout_panel_stub_rows(&lines, &header).unwrap();
11082
11083        assert_eq!(
11084            header.headers,
11085            vec![
11086                "OCR".to_string(),
11087                "Recommendation".to_string(),
11088                "Product semantic search".to_string()
11089            ]
11090        );
11091        assert_eq!(rows.len(), 3);
11092        assert_eq!(rows[0][0], "Pack");
11093        assert!(rows[0][1].contains("image and extracts necessary information"));
11094        assert_eq!(rows[1][0], "Application");
11095        assert!(rows[1][3].contains("require semantic search and conversion into a DB"));
11096        assert_eq!(rows[2][0], "Highlight");
11097        assert!(rows[2][2].contains("top-tier recommendation models"));
11098    }
11099
11100    #[cfg(not(target_arch = "wasm32"))]
11101    #[test]
11102    fn test_extract_layout_toc_entries_merges_wrapped_entry() {
11103        let lines = vec![
11104            "Table of Contents".to_string(),
11105            "".to_string(),
11106            "Executive Summary                                          4".to_string(),
11107            "Legal Framework                                            6".to_string(),
11108            "Election Administration                                   11".to_string(),
11109            "Civil Society Engagement                                  15".to_string(),
11110            "Political Parties, Candidates Registration and Election   18".to_string(),
11111            "Campaign".to_string(),
11112            "Media Freedom and Access to Information                   25".to_string(),
11113            "Voter Education and Awareness                             29".to_string(),
11114            "Participation of Marginalized Sectors                     31".to_string(),
11115            "Recommendations                                           39".to_string(),
11116        ];
11117
11118        let (title, entries) = extract_layout_toc_entries(&lines).unwrap();
11119        assert_eq!(title, "Table of Contents");
11120        assert_eq!(entries.len(), 9);
11121        assert_eq!(entries[0].title, "Executive Summary");
11122        assert_eq!(entries[0].page, "4");
11123        assert_eq!(
11124            entries[4].title,
11125            "Political Parties, Candidates Registration and Election Campaign"
11126        );
11127        assert_eq!(entries[4].page, "18");
11128    }
11129
11130    #[cfg(not(target_arch = "wasm32"))]
11131    fn make_bbox_layout_line(words: &[(&str, f64, f64)], bottom: f64, top: f64) -> BBoxLayoutLine {
11132        make_bbox_layout_line_in_block(0, words, bottom, top)
11133    }
11134
11135    #[cfg(not(target_arch = "wasm32"))]
11136    fn make_bbox_layout_line_in_block(
11137        block_id: usize,
11138        words: &[(&str, f64, f64)],
11139        bottom: f64,
11140        top: f64,
11141    ) -> BBoxLayoutLine {
11142        BBoxLayoutLine {
11143            block_id,
11144            bbox: BoundingBox::new(
11145                Some(1),
11146                words.first().map(|(_, left, _)| *left).unwrap_or(72.0),
11147                bottom,
11148                words.last().map(|(_, _, right)| *right).unwrap_or(320.0),
11149                top,
11150            ),
11151            words: words
11152                .iter()
11153                .map(|(text, left, right)| BBoxLayoutWord {
11154                    bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
11155                    text: (*text).to_string(),
11156                })
11157                .collect(),
11158        }
11159    }
11160
11161    #[cfg(not(target_arch = "wasm32"))]
11162    #[test]
11163    fn test_detect_layout_open_plate_recovers_two_column_species_rows() {
11164        let lines = vec![
11165            make_bbox_layout_line(
11166                &[
11167                    ("Fish", 60.0, 76.0),
11168                    ("species", 78.0, 107.0),
11169                    ("on", 109.0, 119.0),
11170                    ("IUCN", 121.0, 142.0),
11171                    ("Red", 144.0, 159.0),
11172                    ("List", 161.0, 176.0),
11173                ],
11174                649.0,
11175                660.0,
11176            ),
11177            make_bbox_layout_line(
11178                &[("Potosi", 60.0, 84.0), ("Pupfish", 86.0, 114.0)],
11179                632.0,
11180                643.0,
11181            ),
11182            make_bbox_layout_line(
11183                &[("Cyprinodon", 132.0, 176.0), ("alvarezi", 178.0, 207.0)],
11184                632.0,
11185                643.0,
11186            ),
11187            make_bbox_layout_line(
11188                &[
11189                    ("La", 60.0, 69.0),
11190                    ("Palma", 71.0, 94.0),
11191                    ("Pupfish", 96.0, 124.0),
11192                    ("Cyprinodon", 132.0, 176.0),
11193                    ("longidorsalis", 178.0, 224.0),
11194                ],
11195                616.0,
11196                627.0,
11197            ),
11198            make_bbox_layout_line(
11199                &[("Butterfly", 60.0, 94.0), ("Splitfin", 96.0, 123.0)],
11200                600.0,
11201                611.0,
11202            ),
11203            make_bbox_layout_line(
11204                &[("Ameca", 132.0, 156.0), ("splendens", 158.0, 194.0)],
11205                600.0,
11206                611.0,
11207            ),
11208            make_bbox_layout_line(
11209                &[("Golden", 60.0, 88.0), ("Skiffia", 90.0, 113.0)],
11210                584.0,
11211                595.0,
11212            ),
11213            make_bbox_layout_line(
11214                &[("Skiffia", 132.0, 155.0), ("francesae", 158.0, 193.0)],
11215                584.0,
11216                595.0,
11217            ),
11218            make_bbox_layout_line(
11219                &[
11220                    ("Table", 56.0, 74.0),
11221                    ("6.1:", 76.0, 87.0),
11222                    ("Four", 89.0, 105.0),
11223                    ("fish", 107.0, 119.0),
11224                    ("species", 121.0, 145.0),
11225                    ("on", 147.0, 155.0),
11226                    ("IUCN", 157.0, 176.0),
11227                    ("Red", 178.0, 190.0),
11228                    ("List", 192.0, 205.0),
11229                    ("held", 279.0, 293.0),
11230                    ("in", 295.0, 302.0),
11231                    ("public", 304.0, 325.0),
11232                    ("aquariums.", 327.0, 365.0),
11233                ],
11234                556.0,
11235                566.0,
11236            ),
11237        ];
11238
11239        let plate = detect_layout_open_plate(576.0, &lines).unwrap();
11240        assert_eq!(plate.heading, "Fish species on IUCN Red List");
11241        assert_eq!(
11242            plate.header_row,
11243            vec![
11244                "Fish species on IUCN Red List".to_string(),
11245                "Scientific name".to_string()
11246            ]
11247        );
11248        assert_eq!(plate.rows.len(), 4);
11249        assert_eq!(
11250            plate.rows[1],
11251            vec![
11252                "La Palma Pupfish".to_string(),
11253                "Cyprinodon longidorsalis".to_string()
11254            ]
11255        );
11256        assert!(plate
11257            .caption
11258            .starts_with("Table 6.1: Four fish species on IUCN Red List"));
11259    }
11260
11261    #[cfg(not(target_arch = "wasm32"))]
11262    #[test]
11263    fn test_extract_layout_narrative_bridge_recovers_left_prose_and_defers_captions() {
11264        let plate = OpenPlateCandidate {
11265            heading: "Fish species on IUCN Red List".to_string(),
11266            header_row: vec![
11267                "Fish species on IUCN Red List".to_string(),
11268                "Scientific name".to_string(),
11269            ],
11270            rows: vec![],
11271            caption: "Table 6.1".to_string(),
11272            cutoff_top_y: 560.0,
11273        };
11274        let lines = vec![
11275            make_bbox_layout_line(
11276                &[
11277                    ("Public", 56.0, 83.0),
11278                    ("aquariums,", 88.0, 135.0),
11279                    ("because", 140.0, 174.0),
11280                ],
11281                509.0,
11282                521.0,
11283            ),
11284            make_bbox_layout_line(
11285                &[
11286                    ("of", 180.0, 188.0),
11287                    ("their", 194.0, 214.0),
11288                    ("in-", 220.0, 233.0),
11289                ],
11290                509.0,
11291                521.0,
11292            ),
11293            make_bbox_layout_line(
11294                &[
11295                    ("house", 56.0, 82.0),
11296                    ("expertise,", 84.0, 125.0),
11297                    ("can", 128.0, 143.0),
11298                ],
11299                495.0,
11300                507.0,
11301            ),
11302            make_bbox_layout_line(
11303                &[("act", 146.0, 159.0), ("quickly", 161.0, 191.0)],
11304                495.0,
11305                507.0,
11306            ),
11307            make_bbox_layout_line_in_block(
11308                1,
11309                &[
11310                    ("Figure", 242.0, 265.0),
11311                    ("6.3:", 267.0, 280.0),
11312                    ("Photo", 282.0, 303.0),
11313                ],
11314                355.0,
11315                366.0,
11316            ),
11317            make_bbox_layout_line_in_block(
11318                1,
11319                &[
11320                    ("of", 305.0, 312.0),
11321                    ("the", 314.0, 325.0),
11322                    ("species.", 327.0, 360.0),
11323                ],
11324                355.0,
11325                366.0,
11326            ),
11327            make_bbox_layout_line(
11328                &[
11329                    ("The", 56.0, 73.0),
11330                    ("breeding", 77.0, 114.0),
11331                    ("colonies", 118.0, 153.0),
11332                ],
11333                330.0,
11334                342.0,
11335            ),
11336            make_bbox_layout_line(
11337                &[
11338                    ("of", 157.0, 165.0),
11339                    ("the", 169.0, 183.0),
11340                    ("Butterfly", 187.0, 224.0),
11341                    ("Splitfin", 228.0, 258.0),
11342                    ("at", 314.0, 323.0),
11343                    ("the", 327.0, 341.0),
11344                    ("London", 345.0, 377.0),
11345                    ("Zoo", 381.0, 397.0),
11346                    ("and", 401.0, 416.0),
11347                    ("elsewhere", 420.0, 463.0),
11348                    ("serve", 467.0, 489.0),
11349                    ("as", 493.0, 502.0),
11350                    ("ark", 506.0, 519.0),
11351                ],
11352                330.0,
11353                342.0,
11354            ),
11355            make_bbox_layout_line(
11356                &[
11357                    ("Figure", 56.0, 79.0),
11358                    ("6.4:", 81.0, 94.0),
11359                    ("Lake", 96.0, 116.0),
11360                    ("Sturgeon", 118.0, 158.0),
11361                ],
11362                104.0,
11363                116.0,
11364            ),
11365        ];
11366
11367        let bridge = extract_layout_narrative_bridge(576.0, &lines, &plate).unwrap();
11368        assert!(bridge
11369            .bridge_paragraph
11370            .as_deref()
11371            .is_some_and(|text| text.contains("Public aquariums") && text.contains("expertise")));
11372        assert_eq!(bridge.deferred_captions.len(), 2);
11373        assert!(bridge.deferred_captions[0].contains("Figure 6.3:"));
11374        assert!(bridge.deferred_captions[0].contains("species."));
11375    }
11376
11377    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11378    #[test]
11379    fn test_detect_layout_ocr_benchmark_dashboard_on_real_pdf() {
11380        let path =
11381            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000199.pdf");
11382        let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
11383        let dashboard = detect_layout_ocr_benchmark_dashboard(page_width, &lines).unwrap();
11384
11385        assert_eq!(
11386            dashboard.title,
11387            "Base Model Performance Evaluation of Upstage OCR Pack"
11388        );
11389        assert_eq!(dashboard.left_columns.len(), 2);
11390        assert_eq!(
11391            dashboard.left_columns[0],
11392            "Scene (Photographed document image)"
11393        );
11394        assert_eq!(
11395            dashboard.left_rows[0],
11396            vec![
11397                "Company A²".to_string(),
11398                "70.23".to_string(),
11399                "80.41".to_string()
11400            ]
11401        );
11402        assert_eq!(
11403            dashboard.right_rows[0],
11404            vec![
11405                "OCR-Recall³".to_string(),
11406                "73.2".to_string(),
11407                "94.2".to_string(),
11408                "94.1".to_string()
11409            ]
11410        );
11411        assert_eq!(dashboard.right_rows[3][0], "Parsing-F¹");
11412        assert_eq!(dashboard.right_rows[3][1], "68.0");
11413        assert_eq!(dashboard.right_rows[3][2], "82.65");
11414        assert_eq!(dashboard.right_rows[3][3], "82.65");
11415        assert!(!dashboard.definition_notes.is_empty());
11416        assert!(!dashboard.source_notes.is_empty());
11417    }
11418
11419    #[cfg(not(target_arch = "wasm32"))]
11420    #[test]
11421    fn test_split_layout_line_spans_handles_unicode_boundaries() {
11422        let line = "Title  “Podcast #EP32: SDGs dan Anak Muda”  2024";
11423        let spans = split_layout_line_spans(line);
11424        assert_eq!(spans.len(), 3);
11425        assert_eq!(spans[0].1, "Title");
11426        assert!(spans[1].1.contains("Podcast #EP32: SDGs dan Anak Muda"));
11427        assert!(spans[1].1.ends_with('”'));
11428        assert!(spans[2].1.ends_with("24"));
11429    }
11430
11431    #[cfg(not(target_arch = "wasm32"))]
11432    #[test]
11433    fn test_render_layout_single_caption_chart_document_on_real_pdf() {
11434        let path =
11435            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000037.pdf");
11436        let doc = PdfDocument {
11437            title: None,
11438            source_path: Some(path.to_string_lossy().to_string()),
11439            number_of_pages: 1,
11440            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11441                .unwrap()
11442                .kids,
11443            ..PdfDocument::new("01030000000037.pdf".to_string())
11444        };
11445        let rendered = render_layout_single_caption_chart_document(&doc).unwrap();
11446        assert!(rendered.contains("# 3. Impact on Business Operations"));
11447        assert!(rendered.contains("## 3.1. Status of Business Operations"));
11448        assert!(rendered.contains("As shown in Figure 3.1.1, the number of MSMEs"));
11449        assert!(
11450            rendered.contains("Figure 3.1.1: Status of operations during each survey phase (%)")
11451        );
11452        assert!(
11453            rendered.contains("lockdown period. In the handicraft/textile sector, 30% of MSMEs")
11454        );
11455        assert!(!rendered.contains("| Lockdown Period |"));
11456    }
11457
11458    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11459    #[test]
11460    fn test_to_markdown_captioned_media_document_on_real_pdf_72() {
11461        let path =
11462            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000072.pdf");
11463        let doc = PdfDocument {
11464            title: None,
11465            source_path: Some(path.to_string_lossy().to_string()),
11466            number_of_pages: 1,
11467            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11468                .unwrap()
11469                .kids,
11470            ..PdfDocument::new("01030000000072.pdf".to_string())
11471        };
11472        let md = to_markdown(&doc).unwrap();
11473        assert!(md.contains("## Diagram 5"), "{md}");
11474        assert!(
11475            md.contains("**Distribution of Komnas HAM’s YouTube Content (2019-2020)**"),
11476            "{md}"
11477        );
11478        assert!(
11479            md.contains(
11480                "As of 1 December 2021, the Komnas HAM’s YouTube channel has 2,290 subscribers"
11481            ),
11482            "{md}"
11483        );
11484        assert!(md.contains("**Figure 4**"), "{md}");
11485        assert!(
11486            md.contains("*Komnas HAM’s YouTube channel as of 1 December 2021*"),
11487            "{md}"
11488        );
11489    }
11490
11491    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11492    #[test]
11493    fn test_to_markdown_captioned_media_document_on_real_pdf_73() {
11494        let path =
11495            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000073.pdf");
11496        let doc = PdfDocument {
11497            title: None,
11498            source_path: Some(path.to_string_lossy().to_string()),
11499            number_of_pages: 1,
11500            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11501                .unwrap()
11502                .kids,
11503            ..PdfDocument::new("01030000000073.pdf".to_string())
11504        };
11505        let md = to_markdown(&doc).unwrap();
11506        assert!(
11507            md.starts_with("# In this content, DPN Argentina provides a brief explanation"),
11508            "{md}"
11509        );
11510        assert!(
11511            md.contains("Examples of such greetings are as follows:"),
11512            "{md}"
11513        );
11514        assert!(md.contains("*Image*"), "{md}");
11515        assert!(md.contains("**Figure 6**"), "{md}");
11516        assert!(md.contains("**DPN Argentina**"), "{md}");
11517        assert!(
11518            md.contains("**Content: World Health Day Celebration (7 April 2021).**^98"),
11519            "{md}"
11520        );
11521        assert!(md.contains("**Footnote:**"), "{md}");
11522        assert!(
11523            md.contains("https://twitter.com/DPNArgentina/status/1379765916259483648."),
11524            "{md}"
11525        );
11526    }
11527
11528    #[cfg(not(target_arch = "wasm32"))]
11529    #[test]
11530    fn test_render_layout_captioned_media_document_does_not_fire_on_real_pdf_14() {
11531        let path =
11532            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
11533        let doc = PdfDocument {
11534            title: None,
11535            source_path: Some(path.to_string_lossy().to_string()),
11536            number_of_pages: 1,
11537            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11538                .unwrap()
11539                .kids,
11540            ..PdfDocument::new("01030000000014.pdf".to_string())
11541        };
11542        assert!(render_layout_captioned_media_document(&doc).is_none());
11543    }
11544
11545    #[cfg(not(target_arch = "wasm32"))]
11546    #[test]
11547    fn test_to_markdown_real_pdf_14_preserves_body_paragraphs() {
11548        let path =
11549            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
11550        let doc = PdfDocument {
11551            title: None,
11552            source_path: Some(path.to_string_lossy().to_string()),
11553            number_of_pages: 1,
11554            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11555                .unwrap()
11556                .kids,
11557            ..PdfDocument::new("01030000000014.pdf".to_string())
11558        };
11559        let md = to_markdown(&doc).unwrap();
11560        assert!(
11561            md.contains("These images also show that different areas are used by men and by women"),
11562            "{md}"
11563        );
11564    }
11565
11566    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11567    #[test]
11568    fn test_render_layout_recommendation_infographic_on_real_pdf() {
11569        let path =
11570            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000183.pdf");
11571        let doc = PdfDocument {
11572            title: None,
11573            source_path: Some(path.to_string_lossy().to_string()),
11574            number_of_pages: 1,
11575            kids: Vec::new(),
11576            ..PdfDocument::new("01030000000183.pdf".to_string())
11577        };
11578        let rendered = render_layout_recommendation_infographic_document(&doc).unwrap();
11579        assert!(rendered.contains("# Recommendation Pack: Track Record"));
11580        assert!(rendered.contains("## Comparison with Beauty Commerce Recommendation Models"));
11581        assert!(rendered.contains("| Graph-RecSys | 0.4048 |"));
11582        assert!(rendered.contains("| Current Service Recommendation Algorithm | 0.159 |"));
11583        assert!(rendered.contains("## Education Content Platform PoC Case"));
11584        assert!(rendered.contains("| DKT Model | 0.882 |"));
11585        assert!(rendered.contains("Compared to regular model"));
11586    }
11587
11588    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11589    #[test]
11590    fn test_render_layout_stacked_bar_report_on_real_pdf() {
11591        let path =
11592            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000038.pdf");
11593        let doc = PdfDocument {
11594            title: None,
11595            source_path: Some(path.to_string_lossy().to_string()),
11596            number_of_pages: 1,
11597            kids: Vec::new(),
11598            ..PdfDocument::new("01030000000038.pdf".to_string())
11599        };
11600        let rendered = render_layout_stacked_bar_report_document(&doc);
11601        if rendered.is_none() {
11602            let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
11603            let blocks = collect_bbox_layout_blocks(&lines);
11604            let figures = collect_layout_figure_captions(&blocks);
11605            let narrative = detect_layout_stacked_bar_narrative(&blocks);
11606            eprintln!("page_width={page_width} figures={}", figures.len());
11607            if let Some(first) = figures.first() {
11608                eprintln!("figure1={}", bbox_layout_block_text(first));
11609            }
11610            if let Some(second) = figures.get(1) {
11611                eprintln!("figure2={}", bbox_layout_block_text(second));
11612            }
11613            eprintln!("narrative={}", narrative.is_some());
11614            if let Some(narrative) = &narrative {
11615                eprintln!("heading={}", narrative.heading);
11616                eprintln!("paragraphs={}", narrative.paragraphs.len());
11617                eprintln!("footnote={:?}", narrative.footnote);
11618            }
11619            for block in &blocks {
11620                let text = bbox_layout_block_text(block);
11621                if text.contains("July")
11622                    || text.contains("October")
11623                    || text.contains("January")
11624                    || text.contains("Will ")
11625                    || text.contains("Don’t")
11626                    || text.starts_with("6.2.")
11627                    || text.starts_with("5.")
11628                {
11629                    eprintln!(
11630                        "block top={:.1} bottom={:.1} left={:.1} right={:.1} text={}",
11631                        block.bbox.top_y,
11632                        block.bbox.bottom_y,
11633                        block.bbox.left_x,
11634                        block.bbox.right_x,
11635                        text
11636                    );
11637                }
11638            }
11639            if figures.len() >= 2 {
11640                let first = detect_layout_three_month_stacked_figure(
11641                    &blocks,
11642                    &lines,
11643                    page_width,
11644                    figures[0].clone(),
11645                    figures[1].bbox.top_y,
11646                );
11647                eprintln!("figure_one_ok={}", first.is_some());
11648                if let Some(narrative) = &narrative {
11649                    let second = detect_layout_sector_bar_figure(
11650                        &blocks,
11651                        &lines,
11652                        page_width,
11653                        figures[1].clone(),
11654                        narrative.top_y,
11655                    );
11656                    eprintln!("figure_two_ok={}", second.is_some());
11657                }
11658            }
11659        }
11660        let rendered = rendered.unwrap();
11661        assert!(rendered.contains("# Figure 6.1.1:"));
11662        assert!(rendered.contains("| Will not terminate employment | 51 | 81 | 73 |"));
11663        assert!(rendered.contains("# 6.2. Expectations for Re-Hiring Employees"));
11664    }
11665
11666    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11667    #[test]
11668    fn test_render_layout_multi_figure_chart_document_on_real_pdf() {
11669        let path =
11670            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000076.pdf");
11671        let doc = PdfDocument {
11672            title: None,
11673            source_path: Some(path.to_string_lossy().to_string()),
11674            number_of_pages: 1,
11675            kids: Vec::new(),
11676            ..PdfDocument::new("01030000000076.pdf".to_string())
11677        };
11678        let rendered = render_layout_multi_figure_chart_document(&doc).unwrap();
11679        assert!(rendered.contains("# Figures from the Document"));
11680        assert!(
11681            rendered.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
11682        );
11683        assert!(rendered.contains("| 2016 | 3,230 |"));
11684        assert!(rendered.contains("| 2021 | 2,693 |"));
11685        assert!(
11686            rendered.contains("## Figure 1.8. Singapore foreign workforce stock (in thousands)")
11687        );
11688        assert!(rendered.contains("| 2016 (Dec) | 1,393 |"));
11689        assert!(rendered.contains("| 2021 (Dec) | 1,200 |"));
11690        assert!(rendered.contains(
11691            "Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate."
11692        ));
11693    }
11694
11695    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11696    #[test]
11697    fn test_render_layout_open_plate_document_on_real_pdf() {
11698        let path =
11699            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
11700        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11701        let rendered = render_layout_open_plate_document(&doc).unwrap();
11702        assert!(rendered.contains("# Fish species on IUCN Red List"));
11703        assert!(rendered.contains("| Potosi Pupfish | Cyprinodon alvarezi |"));
11704        assert!(rendered.contains("| Golden Skiffia | Skiffia francesae |"));
11705        assert!(rendered.contains("*Table 6.1: Four fish species on IUCN Red List"));
11706        assert!(rendered.contains("---"));
11707        assert!(rendered.contains("Public aquariums, because of their inhouse expertise"));
11708    }
11709
11710    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11711    #[test]
11712    fn test_to_markdown_open_plate_document_on_real_pdf() {
11713        let path =
11714            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
11715        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11716        let md = to_markdown(&doc).unwrap();
11717
11718        assert!(md.contains("# Fish species on IUCN Red List"), "{md}");
11719        assert!(
11720            md.contains("| Potosi Pupfish | Cyprinodon alvarezi |"),
11721            "{md}"
11722        );
11723        assert!(
11724            md.contains("| Golden Skiffia | Skiffia francesae |"),
11725            "{md}"
11726        );
11727        assert!(
11728            md.contains("*Table 6.1: Four fish species on IUCN Red List"),
11729            "{md}"
11730        );
11731        assert!(
11732            md.contains("The breeding colonies of the Butterfly Splitfin"),
11733            "{md}"
11734        );
11735    }
11736
11737    #[cfg(not(target_arch = "wasm32"))]
11738    #[test]
11739    fn test_to_markdown_does_not_misclassify_open_plate_pdf_36() {
11740        let path =
11741            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000036.pdf");
11742        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11743        let md = to_markdown(&doc).unwrap();
11744
11745        assert!(md.contains("# 2. General Profile of MSMEs"), "{md}");
11746        assert!(
11747            md.contains("In July 2020, the survey established a general profile"),
11748            "{md}"
11749        );
11750        assert!(
11751            md.contains(
11752                "The tourism sub-sectors interviewed included lodging, restaurants and bars"
11753            ),
11754            "{md}"
11755        );
11756        assert!(
11757            !md.starts_with("# Business characteristics. Business size was"),
11758            "{md}"
11759        );
11760    }
11761
11762    #[cfg(not(target_arch = "wasm32"))]
11763    #[test]
11764    fn test_to_markdown_does_not_misclassify_open_plate_pdf_40() {
11765        let path =
11766            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000040.pdf");
11767        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11768        let md = to_markdown(&doc).unwrap();
11769
11770        assert!(
11771            md.contains(
11772                "Thailand, Philippines and Indonesia in particular, identifying known experts"
11773            ),
11774            "{md}"
11775        );
11776        assert!(
11777            md.contains("Figure 1: Age by gender of respondents"),
11778            "{md}"
11779        );
11780        assert!(md.contains("Gender Analysis of Violent Extremism"), "{md}");
11781        assert!(
11782            !md.starts_with("# Thailand, Philippines and Indonesia in"),
11783            "{md}"
11784        );
11785    }
11786
11787    #[cfg(not(target_arch = "wasm32"))]
11788    #[test]
11789    fn test_to_markdown_does_not_misclassify_open_plate_pdf_64() {
11790        let path =
11791            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000064.pdf");
11792        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11793        let md = to_markdown(&doc).unwrap();
11794
11795        assert!(md.contains("estuarine influenced areas."), "{md}");
11796        assert!(md.contains("| MANILA | 2454 | 6,125 |"), "{md}");
11797        assert!(
11798            md.contains("The port of Manila has been documented"),
11799            "{md}"
11800        );
11801        assert!(!md.starts_with("# CAGAYAN DE ORO"), "{md}");
11802    }
11803
11804    #[cfg(not(target_arch = "wasm32"))]
11805    #[test]
11806    fn test_detect_footnote_citation_regions_on_real_pdf() {
11807        let path =
11808            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
11809        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11810        let regions = detect_footnote_citation_regions(&doc);
11811        assert!(!regions.is_empty(), "{regions:?}");
11812        assert!(
11813            regions.iter().any(|region| {
11814                region.rendered.contains("<table>")
11815                    && region.rendered.contains("<td>25</td>")
11816                    && region.rendered.contains("<td>29</td>")
11817            }),
11818            "{regions:#?}"
11819        );
11820        assert!(
11821            regions.iter().any(|region| {
11822                region.rendered.contains("<table>")
11823                    && region.rendered.contains("<td>30</td>")
11824                    && region.rendered.contains("<td>33</td>")
11825            }),
11826            "{regions:#?}"
11827        );
11828    }
11829
11830    #[cfg(not(target_arch = "wasm32"))]
11831    #[test]
11832    fn test_to_markdown_renders_footnote_citation_tables_on_real_pdf() {
11833        let path =
11834            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
11835        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11836        let md = to_markdown(&doc).unwrap();
11837
11838        assert!(md.contains("<table>"), "{md}");
11839        assert!(md.contains("<th>Footnote</th><th>Citation</th>"), "{md}");
11840        assert!(md.contains("<td>25</td><td>Wiliam Beckford"), "{md}");
11841        assert!(
11842            md.contains("<td>29</td><td>Pope, The Rape of the Lock, 69.</td>"),
11843            "{md}"
11844        );
11845        assert!(
11846            md.contains("<td>30</td><td>Beawes, Lex Mercatoria Rediviva, 791.</td>"),
11847            "{md}"
11848        );
11849        assert!(
11850            md.contains("<td>32</td><td>Beawes, Lex Mercatoria Rediviva, 792.</td>"),
11851            "{md}"
11852        );
11853        assert!(
11854            md.contains("<td>33</td><td>M.M., Pharmacopoia Reformata:"),
11855            "{md}"
11856        );
11857    }
11858
11859    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11860    #[test]
11861    fn test_to_markdown_projection_sheet_document_on_real_pdf() {
11862        let path =
11863            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000128.pdf");
11864        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11865        let md = to_markdown(&doc).unwrap();
11866
11867        assert!(md.contains("# Table and Figure from the Document"), "{md}");
11868        assert!(md.contains("| A | B | C | D | E |"), "{md}");
11869        assert!(
11870            md.contains("| 10 | 8 | 19.73214458 | 17.99 | 21.47 |"),
11871            "{md}"
11872        );
11873        assert!(
11874            md.contains("**Figure 13.3. Graph of Projection Estimates**"),
11875            "{md}"
11876        );
11877        assert!(md.contains("[Open Template in Microsoft Excel](#)"), "{md}");
11878        assert!(
11879            md.contains("*298 | Ch. 13. Homogeneous Investment Types*"),
11880            "{md}"
11881        );
11882    }
11883
11884    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11885    #[test]
11886    fn test_to_markdown_appendix_tables_document_on_real_pdf() {
11887        let path =
11888            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000082.pdf");
11889        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11890        let md = to_markdown(&doc).unwrap();
11891
11892        assert!(md.contains("# Appendices"), "{md}");
11893        assert!(
11894            md.contains("## TABLE 28: BREAKDOWN OF IMPRISONMENT CLAUSES IN STATE LAWS"),
11895            "{md}"
11896        );
11897        assert!(md.contains("| Imprisonment terms | Number of clauses | Percentage of all states | Percentage of total |"), "{md}");
11898        assert!(
11899            md.contains("| Less than 3 months | 4,448 | 21.3% | 17.0% |"),
11900            "{md}"
11901        );
11902        assert!(
11903            md.contains("## TABLE 29: STATES WITH MORE THAN 1,000 IMPRISONMENT CLAUSES"),
11904            "{md}"
11905        );
11906        assert!(
11907            md.contains(
11908                "| State | Number of clauses | GSDP (In Rs lakh crore) | GSDP (In $ billion) |"
11909            ),
11910            "{md}"
11911        );
11912        assert!(md.contains("| Gujarat | 1469 | 15.6 | 200.4 |"), "{md}");
11913        assert!(
11914            md.contains("*Sources: TeamLease Regtech, and Reserve Bank of India for GSDPs*"),
11915            "{md}"
11916        );
11917        assert!(md.contains("*Exchange rate: Rs 75 to USD*"), "{md}");
11918    }
11919
11920    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11921    #[test]
11922    fn test_to_markdown_titled_dual_table_document_on_real_pdf() {
11923        let path =
11924            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000084.pdf");
11925        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11926        let md = to_markdown(&doc).unwrap();
11927
11928        assert!(md.starts_with("# Jailed for Doing Business"), "{md}");
11929        assert!(
11930            md.contains("## TABLE 38: THREE CASE STUDIES ON NBFC COMPLIANCES*"),
11931            "{md}"
11932        );
11933        assert!(
11934            md.contains("| Percentage of imprisonment clauses | 20% | 30% | 37% |"),
11935            "{md}"
11936        );
11937        assert!(
11938            md.contains("## TABLE 39: BREAKDOWN OF IMPRISONMENT CLAUSES IN NBFC CASE STUDIES*"),
11939            "{md}"
11940        );
11941        assert!(
11942            md.contains("| 5 years to 10 years | 19 | 19 | 19 |"),
11943            "{md}"
11944        );
11945        assert!(
11946            md.contains("*These are real data from three NBFCs*"),
11947            "{md}"
11948        );
11949    }
11950
11951    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11952    #[test]
11953    fn test_to_markdown_registration_report_document_on_real_pdf() {
11954        let path =
11955            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000047.pdf");
11956        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11957        let md = to_markdown(&doc).unwrap();
11958
11959        assert!(
11960            md.starts_with("# ANFREL Pre-Election Assessment Mission Report"),
11961            "{md}"
11962        );
11963        assert!(
11964            md.contains(
11965                "| 14 | Cambodian Indigeneous Peoples Democracy Party | 19 | 194 | 19 | 202 | +8 |"
11966            ),
11967            "{md}"
11968        );
11969        assert!(
11970            md.contains("|  | Total |  | 84,208 |  | 86,092 | +1,884 |"),
11971            "{md}"
11972        );
11973        assert!(!md.contains("|  | Democracy Party |"), "{md}");
11974    }
11975
11976    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11977    #[test]
11978    fn test_to_markdown_dual_table_article_document_on_real_pdf() {
11979        let path =
11980            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000190.pdf");
11981        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11982        let md = to_markdown(&doc).unwrap();
11983
11984        assert!(
11985            md.starts_with("# Table 6: Performance comparison amongst the merge candidates"),
11986            "{md}"
11987        );
11988        assert!(
11989            md.contains("*Table 6*: Performance comparison amongst the merge candidates."),
11990            "{md}"
11991        );
11992        assert!(md.contains("# Table 7: Ablation studies on the different merge methods used for obtaining the final model"), "{md}");
11993        assert!(!md.contains("*Table 6*: Table 6:"), "{md}");
11994        assert!(!md.contains("| Merge v1"), "{md}");
11995    }
11996
11997    #[test]
11998    fn test_normalize_list_text_strips_redundant_bullets() {
11999        assert_eq!(
12000            normalize_list_text("• Collected via surveys"),
12001            "Collected via surveys"
12002        );
12003        assert!(is_pure_bullet_marker("•"));
12004    }
12005
12006    #[test]
12007    fn test_reference_continuation_detected() {
12008        assert!(should_merge_paragraph_text(
12009            "Scaling laws for transfer.",
12010            "arXiv preprint arXiv:2102.01293."
12011        ));
12012    }
12013
12014    #[test]
12015    fn test_enumerated_markers_are_detected() {
12016        assert!(starts_with_enumerated_marker("iii. Third item"));
12017        assert!(starts_with_enumerated_marker("1) First item"));
12018        assert!(starts_with_enumerated_marker("a. Lettered item"));
12019        assert!(!starts_with_enumerated_marker("Figure 1. Caption"));
12020        assert!(!starts_with_enumerated_marker("Natural dispersal"));
12021    }
12022
12023    fn make_heading(text: &str) -> ContentElement {
12024        let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
12025        let chunk = TextChunk {
12026            value: text.to_string(),
12027            bbox: bbox.clone(),
12028            font_name: "Lato-Bold".to_string(),
12029            font_size: 12.0,
12030            font_weight: 700.0,
12031            italic_angle: 0.0,
12032            font_color: "#000000".to_string(),
12033            contrast_ratio: 21.0,
12034            symbol_ends: vec![],
12035            text_format: TextFormat::Normal,
12036            text_type: TextType::Regular,
12037            pdf_layer: PdfLayer::Main,
12038            ocg_visible: true,
12039            index: None,
12040            page_number: Some(1),
12041            level: None,
12042            mcid: None,
12043        };
12044        let line = TextLine {
12045            bbox: bbox.clone(),
12046            index: None,
12047            level: None,
12048            font_size: 12.0,
12049            base_line: 702.0,
12050            slant_degree: 0.0,
12051            is_hidden_text: false,
12052            text_chunks: vec![chunk],
12053            is_line_start: true,
12054            is_line_end: true,
12055            is_list_line: false,
12056            connected_line_art_label: None,
12057        };
12058        let block = TextBlock {
12059            bbox: bbox.clone(),
12060            index: None,
12061            level: None,
12062            font_size: 12.0,
12063            base_line: 702.0,
12064            slant_degree: 0.0,
12065            is_hidden_text: false,
12066            text_lines: vec![line],
12067            has_start_line: true,
12068            has_end_line: true,
12069            text_alignment: None,
12070        };
12071        let column = TextColumn {
12072            bbox: bbox.clone(),
12073            index: None,
12074            level: None,
12075            font_size: 12.0,
12076            base_line: 702.0,
12077            slant_degree: 0.0,
12078            is_hidden_text: false,
12079            text_blocks: vec![block],
12080        };
12081        ContentElement::Heading(SemanticHeading {
12082            base: SemanticParagraph {
12083                base: SemanticTextNode {
12084                    bbox,
12085                    index: None,
12086                    level: None,
12087                    semantic_type: crate::models::enums::SemanticType::Heading,
12088                    correct_semantic_score: None,
12089                    columns: vec![column],
12090                    font_weight: Some(700.0),
12091                    font_size: Some(12.0),
12092                    text_color: None,
12093                    italic_angle: None,
12094                    font_name: Some("Lato-Bold".to_string()),
12095                    text_format: None,
12096                    max_font_size: Some(12.0),
12097                    background_color: None,
12098                    is_hidden_text: false,
12099                },
12100                enclosed_top: false,
12101                enclosed_bottom: false,
12102                indentation: 0,
12103            },
12104            heading_level: Some(1),
12105        })
12106    }
12107
12108    fn make_heading_at(left: f64, bottom: f64, right: f64, top: f64, text: &str) -> ContentElement {
12109        let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12110        let chunk = TextChunk {
12111            value: text.to_string(),
12112            bbox: bbox.clone(),
12113            font_name: "Lato-Bold".to_string(),
12114            font_size: top - bottom,
12115            font_weight: 700.0,
12116            italic_angle: 0.0,
12117            font_color: "#000000".to_string(),
12118            contrast_ratio: 21.0,
12119            symbol_ends: vec![],
12120            text_format: TextFormat::Normal,
12121            text_type: TextType::Regular,
12122            pdf_layer: PdfLayer::Main,
12123            ocg_visible: true,
12124            index: None,
12125            page_number: Some(1),
12126            level: None,
12127            mcid: None,
12128        };
12129        let line = TextLine {
12130            bbox: bbox.clone(),
12131            index: None,
12132            level: None,
12133            font_size: top - bottom,
12134            base_line: bottom + 2.0,
12135            slant_degree: 0.0,
12136            is_hidden_text: false,
12137            text_chunks: vec![chunk],
12138            is_line_start: true,
12139            is_line_end: true,
12140            is_list_line: false,
12141            connected_line_art_label: None,
12142        };
12143        let block = TextBlock {
12144            bbox: bbox.clone(),
12145            index: None,
12146            level: None,
12147            font_size: top - bottom,
12148            base_line: bottom + 2.0,
12149            slant_degree: 0.0,
12150            is_hidden_text: false,
12151            text_lines: vec![line],
12152            has_start_line: true,
12153            has_end_line: true,
12154            text_alignment: None,
12155        };
12156        let column = TextColumn {
12157            bbox: bbox.clone(),
12158            index: None,
12159            level: None,
12160            font_size: top - bottom,
12161            base_line: bottom + 2.0,
12162            slant_degree: 0.0,
12163            is_hidden_text: false,
12164            text_blocks: vec![block],
12165        };
12166        ContentElement::Heading(SemanticHeading {
12167            base: SemanticParagraph {
12168                base: SemanticTextNode {
12169                    bbox,
12170                    index: None,
12171                    level: None,
12172                    semantic_type: crate::models::enums::SemanticType::Heading,
12173                    correct_semantic_score: None,
12174                    columns: vec![column],
12175                    font_weight: Some(700.0),
12176                    font_size: Some(top - bottom),
12177                    text_color: None,
12178                    italic_angle: None,
12179                    font_name: Some("Lato-Bold".to_string()),
12180                    text_format: None,
12181                    max_font_size: Some(top - bottom),
12182                    background_color: None,
12183                    is_hidden_text: false,
12184                },
12185                enclosed_top: false,
12186                enclosed_bottom: false,
12187                indentation: 0,
12188            },
12189            heading_level: None,
12190        })
12191    }
12192
12193    fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
12194        make_paragraph_at(72.0, bottom, 300.0, top, text)
12195    }
12196
12197    fn make_paragraph_at(
12198        left: f64,
12199        bottom: f64,
12200        right: f64,
12201        top: f64,
12202        text: &str,
12203    ) -> ContentElement {
12204        let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12205        let chunk = TextChunk {
12206            value: text.to_string(),
12207            bbox: bbox.clone(),
12208            font_name: "Lato-Regular".to_string(),
12209            font_size: (top - bottom).max(1.0),
12210            font_weight: 400.0,
12211            italic_angle: 0.0,
12212            font_color: "#000000".to_string(),
12213            contrast_ratio: 21.0,
12214            symbol_ends: vec![],
12215            text_format: TextFormat::Normal,
12216            text_type: TextType::Regular,
12217            pdf_layer: PdfLayer::Main,
12218            ocg_visible: true,
12219            index: None,
12220            page_number: Some(1),
12221            level: None,
12222            mcid: None,
12223        };
12224        let line = TextLine {
12225            bbox: bbox.clone(),
12226            index: None,
12227            level: None,
12228            font_size: chunk.font_size,
12229            base_line: bottom + 2.0,
12230            slant_degree: 0.0,
12231            is_hidden_text: false,
12232            text_chunks: vec![chunk],
12233            is_line_start: true,
12234            is_line_end: true,
12235            is_list_line: false,
12236            connected_line_art_label: None,
12237        };
12238        let block = TextBlock {
12239            bbox: bbox.clone(),
12240            index: None,
12241            level: None,
12242            font_size: line.font_size,
12243            base_line: line.base_line,
12244            slant_degree: 0.0,
12245            is_hidden_text: false,
12246            text_lines: vec![line],
12247            has_start_line: true,
12248            has_end_line: true,
12249            text_alignment: None,
12250        };
12251        let column = TextColumn {
12252            bbox: bbox.clone(),
12253            index: None,
12254            level: None,
12255            font_size: block.font_size,
12256            base_line: block.base_line,
12257            slant_degree: 0.0,
12258            is_hidden_text: false,
12259            text_blocks: vec![block],
12260        };
12261        ContentElement::Paragraph(SemanticParagraph {
12262            base: SemanticTextNode {
12263                bbox,
12264                index: None,
12265                level: None,
12266                semantic_type: crate::models::enums::SemanticType::Paragraph,
12267                correct_semantic_score: None,
12268                columns: vec![column],
12269                font_weight: Some(400.0),
12270                font_size: Some(top - bottom),
12271                text_color: None,
12272                italic_angle: None,
12273                font_name: Some("Lato-Regular".to_string()),
12274                text_format: None,
12275                max_font_size: Some(top - bottom),
12276                background_color: None,
12277                is_hidden_text: false,
12278            },
12279            enclosed_top: false,
12280            enclosed_bottom: false,
12281            indentation: 0,
12282        })
12283    }
12284
12285    fn make_fallback_list(items: &[&str]) -> ContentElement {
12286        let mut list_items = Vec::new();
12287        for (idx, text) in items.iter().enumerate() {
12288            let top = 700.0 - idx as f64 * 18.0;
12289            let bottom = top - 12.0;
12290            let bbox = BoundingBox::new(Some(1), 72.0, bottom, 320.0, top);
12291            list_items.push(ListItem {
12292                bbox: bbox.clone(),
12293                index: None,
12294                level: None,
12295                label: ListLabel {
12296                    bbox: bbox.clone(),
12297                    content: vec![],
12298                    semantic_type: None,
12299                },
12300                body: ListBody {
12301                    bbox: bbox.clone(),
12302                    content: vec![],
12303                    semantic_type: None,
12304                },
12305                label_length: 0,
12306                contents: vec![make_paragraph_at(72.0, bottom, 320.0, top, text)],
12307                semantic_type: None,
12308            });
12309        }
12310
12311        ContentElement::List(PDFList {
12312            bbox: BoundingBox::new(
12313                Some(1),
12314                72.0,
12315                700.0 - items.len() as f64 * 18.0,
12316                320.0,
12317                700.0,
12318            ),
12319            index: None,
12320            level: None,
12321            list_items,
12322            numbering_style: Some("bullets".to_string()),
12323            common_prefix: None,
12324            previous_list_id: None,
12325            next_list_id: None,
12326        })
12327    }
12328
12329    fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
12330        let mut table_rows = Vec::new();
12331        for (ri, (title, page)) in rows.iter().enumerate() {
12332            let top = 680.0 - ri as f64 * 18.0;
12333            let bottom = top - 12.0;
12334            let left_bbox = BoundingBox::new(Some(1), 72.0, bottom, 280.0, top);
12335            let right_bbox = BoundingBox::new(Some(1), 320.0, bottom, 360.0, top);
12336            table_rows.push(TableBorderRow {
12337                bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
12338                index: None,
12339                level: None,
12340                row_number: ri,
12341                cells: vec![
12342                    TableBorderCell {
12343                        bbox: left_bbox.clone(),
12344                        index: None,
12345                        level: None,
12346                        row_number: ri,
12347                        col_number: 0,
12348                        row_span: 1,
12349                        col_span: 1,
12350                        content: vec![TableToken {
12351                            base: TextChunk {
12352                                value: (*title).to_string(),
12353                                bbox: left_bbox,
12354                                font_name: "Lato-Regular".to_string(),
12355                                font_size: 10.0,
12356                                font_weight: 400.0,
12357                                italic_angle: 0.0,
12358                                font_color: "#000000".to_string(),
12359                                contrast_ratio: 21.0,
12360                                symbol_ends: vec![],
12361                                text_format: TextFormat::Normal,
12362                                text_type: TextType::Regular,
12363                                pdf_layer: PdfLayer::Main,
12364                                ocg_visible: true,
12365                                index: None,
12366                                page_number: Some(1),
12367                                level: None,
12368                                mcid: None,
12369                            },
12370                            token_type: TableTokenType::Text,
12371                        }],
12372                        contents: vec![],
12373                        semantic_type: None,
12374                    },
12375                    TableBorderCell {
12376                        bbox: right_bbox.clone(),
12377                        index: None,
12378                        level: None,
12379                        row_number: ri,
12380                        col_number: 1,
12381                        row_span: 1,
12382                        col_span: 1,
12383                        content: vec![TableToken {
12384                            base: TextChunk {
12385                                value: (*page).to_string(),
12386                                bbox: right_bbox,
12387                                font_name: "Lato-Regular".to_string(),
12388                                font_size: 10.0,
12389                                font_weight: 400.0,
12390                                italic_angle: 0.0,
12391                                font_color: "#000000".to_string(),
12392                                contrast_ratio: 21.0,
12393                                symbol_ends: vec![],
12394                                text_format: TextFormat::Normal,
12395                                text_type: TextType::Regular,
12396                                pdf_layer: PdfLayer::Main,
12397                                ocg_visible: true,
12398                                index: None,
12399                                page_number: Some(1),
12400                                level: None,
12401                                mcid: None,
12402                            },
12403                            token_type: TableTokenType::Text,
12404                        }],
12405                        contents: vec![],
12406                        semantic_type: None,
12407                    },
12408                ],
12409                semantic_type: None,
12410            });
12411        }
12412
12413        ContentElement::TableBorder(TableBorder {
12414            bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
12415            index: None,
12416            level: Some("1".to_string()),
12417            x_coordinates: vec![72.0, 320.0, 360.0],
12418            x_widths: vec![0.0, 0.0, 0.0],
12419            y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
12420            y_widths: vec![0.0, 0.0, 0.0, 0.0],
12421            rows: table_rows,
12422            num_rows: rows.len(),
12423            num_columns: 2,
12424            is_bad_table: false,
12425            is_table_transformer: false,
12426            previous_table: None,
12427            next_table: None,
12428        })
12429    }
12430
12431    #[test]
12432    fn test_contents_document_renders_toc_table_rows() {
12433        let mut doc = PdfDocument::new("contents.pdf".to_string());
12434        doc.kids.push(make_heading("CONTENTS"));
12435        doc.kids.push(make_toc_table(&[
12436            ("Experiment #1: Hydrostatic Pressure", "3"),
12437            ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
12438            ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
12439            ("Experiment #4: Energy Loss in Pipes", "33"),
12440            ("Experiment #5: Impact of a Jet", "43"),
12441            ("Experiment #6: Orifice and Free Jet Flow", "50"),
12442            ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
12443            ("References", "101"),
12444        ]));
12445
12446        let md = to_markdown(&doc).unwrap();
12447        assert!(md.starts_with("# CONTENTS\n\n"));
12448        assert!(md.contains("- Experiment #1: Hydrostatic Pressure 3\n"));
12449        assert!(md.contains("- Experiment #2: Bernoulli's Theorem Demonstration 13\n"));
12450        assert!(md.contains("- Experiment #7: Osborne Reynolds' Demonstration 59\n"));
12451        assert!(md.contains("- References 101\n"));
12452    }
12453
12454    #[test]
12455    fn test_toc_semantic_paragraphs_render_without_blank_lines() {
12456        let mut doc = PdfDocument::new("toc-semantic.pdf".to_string());
12457        let mut first = make_paragraph(
12458            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
12459            700.0,
12460            712.0,
12461        );
12462        let mut second = make_paragraph("Section 5.1: The Linear Model 35", 684.0, 696.0);
12463        if let ContentElement::Paragraph(p) = &mut first {
12464            p.base.semantic_type = SemanticType::TableOfContent;
12465        }
12466        if let ContentElement::Paragraph(p) = &mut second {
12467            p.base.semantic_type = SemanticType::TableOfContent;
12468        }
12469        doc.kids.push(first);
12470        doc.kids.push(second);
12471
12472        let md = to_markdown(&doc).unwrap();
12473        assert!(md.contains(
12474            "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n"
12475        ));
12476    }
12477
12478    #[test]
12479    fn test_compact_toc_document_renders_without_blank_lines() {
12480        let mut doc = PdfDocument::new("compact-toc.pdf".to_string());
12481        doc.kids.push(make_paragraph(
12482            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
12483            700.0,
12484            712.0,
12485        ));
12486        doc.kids.push(make_paragraph(
12487            "Section 5.1: The Linear Model 35",
12488            684.0,
12489            696.0,
12490        ));
12491        doc.kids.push(make_paragraph(
12492            "Part VI. Chapter Six - Comparing Three or More Group Means",
12493            668.0,
12494            680.0,
12495        ));
12496        doc.kids.push(make_paragraph(
12497            "Section 6.1: Between Versus Within Group Analyses 49",
12498            652.0,
12499            664.0,
12500        ));
12501        doc.kids.push(make_paragraph(
12502            "Part VII. Chapter Seven - Moderation and Mediation Analyses",
12503            636.0,
12504            648.0,
12505        ));
12506        doc.kids.push(make_paragraph(
12507            "Section 7.1: Mediation and Moderation Models 64",
12508            620.0,
12509            632.0,
12510        ));
12511        doc.kids
12512            .push(make_paragraph("References 101", 604.0, 616.0));
12513        doc.kids.push(make_paragraph(
12514            "Section 8.1: Factor Analysis Definitions 75",
12515            588.0,
12516            600.0,
12517        ));
12518
12519        let md = to_markdown(&doc).unwrap();
12520        assert!(md.contains(
12521            "# Part V. Chapter Five - Comparing Associations Between Multiple Variables\n\n## Section 5.1: The Linear Model"
12522        ));
12523        assert!(md.contains(
12524            "# Part VI. Chapter Six - Comparing Three or More Group Means\n\n## Section 6.1: Between Versus Within Group Analyses"
12525        ));
12526        assert!(md.contains("References 101\n\n## Section 8.1: Factor Analysis Definitions"));
12527    }
12528
12529    #[test]
12530    fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
12531        let mut doc = PdfDocument::new("caption-body.pdf".to_string());
12532        doc.kids.push(make_paragraph(
12533            "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.",
12534            500.0,
12535            540.0,
12536        ));
12537
12538        let md = to_markdown(&doc).unwrap();
12539        assert!(md.contains("USGS)\n\nOur nearest astronomical neighbor"));
12540    }
12541
12542    #[test]
12543    fn test_short_caption_label_merges_with_following_tail_and_body() {
12544        let mut doc = PdfDocument::new("diagram-caption.pdf".to_string());
12545        doc.kids.push(make_paragraph("Diagram 5", 540.0, 552.0));
12546        doc.kids.push(make_paragraph(
12547            "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.",
12548            520.0,
12549            532.0,
12550        ));
12551
12552        let md = to_markdown(&doc).unwrap();
12553        assert!(md.contains(
12554            "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers"
12555        ));
12556    }
12557
12558    #[test]
12559    fn test_short_caption_label_merges_with_tail_and_year() {
12560        let mut doc = PdfDocument::new("figure-caption.pdf".to_string());
12561        doc.kids.push(make_paragraph("Figure 4", 540.0, 552.0));
12562        doc.kids.push(make_paragraph(
12563            "Komnas HAM's YouTube channel as of 1 December",
12564            520.0,
12565            532.0,
12566        ));
12567        doc.kids.push(make_paragraph("2021", 500.0, 512.0));
12568
12569        let md = to_markdown(&doc).unwrap();
12570        assert!(md.contains("Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021"));
12571        assert!(!md.contains("\n\n2021"));
12572    }
12573
12574    #[test]
12575    fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
12576        let mut doc = PdfDocument::new("chart.pdf".to_string());
12577        doc.kids.push(make_paragraph("Figure 1", 760.0, 772.0));
12578        doc.kids.push(make_paragraph("100", 520.0, 528.0));
12579        doc.kids
12580            .push(make_paragraph("Body text continues here.", 400.0, 412.0));
12581        doc.kids.push(make_paragraph("36", 20.0, 28.0));
12582
12583        let md = to_markdown(&doc).unwrap();
12584        assert!(md.contains("100"));
12585        assert!(!md.lines().any(|line| line.trim() == "36"));
12586    }
12587
12588    #[test]
12589    fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
12590        let mut doc = PdfDocument::new("paragraphs.pdf".to_string());
12591        doc.kids.push(make_paragraph(
12592            "First semantic paragraph ends here.",
12593            520.0,
12594            532.0,
12595        ));
12596        doc.kids.push(make_paragraph(
12597            "Second semantic paragraph starts here.",
12598            500.0,
12599            512.0,
12600        ));
12601
12602        let md = to_markdown(&doc).unwrap();
12603        assert!(md.contains(
12604            "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here."
12605        ));
12606    }
12607
12608    #[test]
12609    fn test_lowercase_semantic_paragraph_continuation_is_merged() {
12610        let mut doc = PdfDocument::new("continuation.pdf".to_string());
12611        doc.kids.push(make_paragraph(
12612            "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference",
12613            520.0,
12614            532.0,
12615        ));
12616        doc.kids.push(make_paragraph("of interest.", 500.0, 512.0));
12617
12618        let md = to_markdown(&doc).unwrap();
12619        assert!(md.contains(
12620            "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest."
12621        ));
12622    }
12623
12624    #[test]
12625    fn test_semantic_enumerated_paragraphs_are_not_merged() {
12626        let mut doc = PdfDocument::new("enumerated-paragraphs.pdf".to_string());
12627        doc.kids.push(make_paragraph(
12628            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
12629            520.0,
12630            532.0,
12631        ));
12632        doc.kids.push(make_paragraph(
12633            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
12634            500.0,
12635            512.0,
12636        ));
12637
12638        let md = to_markdown(&doc).unwrap();
12639        assert!(md.contains(
12640            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\n\niv. This business model will be operating cost-oriented not capital cost-oriented."
12641        ));
12642    }
12643
12644    #[test]
12645    fn test_leading_figure_carryover_is_skipped_before_first_numbered_heading() {
12646        let mut doc = PdfDocument::new("leading-figure-carryover.pdf".to_string());
12647        doc.number_of_pages = 1;
12648        doc.kids.push(make_paragraph_at(
12649            72.0,
12650            742.0,
12651            540.0,
12652            756.0,
12653            "Figure 6. Mytella strigata biofouling green mussel farms in Bacoor City, Cavite, Manila Bay",
12654        ));
12655        doc.kids.push(make_heading_at(
12656            72.0,
12657            680.0,
12658            260.0,
12659            696.0,
12660            "5. Natural dispersal",
12661        ));
12662        doc.kids.push(make_paragraph_at(
12663            72.0,
12664            640.0,
12665            540.0,
12666            654.0,
12667            "Dispersal by purely natural means is not included as a pathway of biological invasions.",
12668        ));
12669
12670        let md = to_markdown(&doc).unwrap();
12671        assert!(md.starts_with("# 5. Natural dispersal"));
12672        assert!(!md.contains("Figure 6. Mytella strigata"));
12673    }
12674
12675    #[test]
12676    fn test_list_renderer_strips_duplicate_bullets_and_skips_bullet_only_items() {
12677        let mut doc = PdfDocument::new("bullets.pdf".to_string());
12678        doc.kids.push(make_fallback_list(&[
12679            "• First item",
12680            "•",
12681            "• Second item",
12682            "133",
12683        ]));
12684
12685        let md = to_markdown(&doc).unwrap();
12686        assert!(md.contains("- First item"));
12687        assert!(md.contains("- Second item"));
12688        assert!(!md.contains("- • First item"));
12689        assert!(!md.contains("\n- •\n"));
12690        assert!(!md.contains("\n- 133\n"));
12691    }
12692
12693    #[test]
12694    fn test_list_renderer_merges_wrapped_continuation_items() {
12695        let mut doc = PdfDocument::new("wrapped-list.pdf".to_string());
12696        doc.kids.push(make_fallback_list(&[
12697            "Use a micropipette to add 2 μL of loading dye",
12698            "and down a couple of times to mix the loading dye with the digested DNA.",
12699            "Use a fresh pipet tip for each reaction tube.",
12700        ]));
12701
12702        let md = to_markdown(&doc).unwrap();
12703        assert!(md.contains(
12704            "- Use a micropipette to add 2 μL of loading dye and down a couple of times to mix the loading dye with the digested DNA."
12705        ));
12706        assert!(md.contains("- Use a fresh pipet tip for each reaction tube."));
12707        assert!(!md.contains("\n- and down"));
12708    }
12709
12710    #[test]
12711    fn test_list_renderer_keeps_enumerated_items_separate() {
12712        let mut doc = PdfDocument::new("enumerated-list.pdf".to_string());
12713        doc.kids.push(make_fallback_list(&[
12714            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
12715            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
12716            "v. Assumed selling price of wood pellet is $100 per tonne and appropriate.",
12717        ]));
12718
12719        let md = to_markdown(&doc).unwrap();
12720        assert!(md.contains("iii. Looking at cost items, the cost of raw woods procurement will be highest share.\niv. This business model will be operating cost-oriented not capital cost-oriented.\nv. Assumed selling price of wood pellet is $100 per tonne and appropriate."));
12721        assert!(!md.contains("- iii."));
12722    }
12723
12724    #[test]
12725    fn test_postprocess_drops_isolated_single_char_noise_lines() {
12726        let markdown = "# The Data Journey\n\n1\n\nTo get started.\n\no\n\nNOTE: Keep going.\n";
12727        let cleaned = drop_isolated_noise_lines(markdown);
12728        assert!(!cleaned.contains("\n1\n"));
12729        assert!(!cleaned.contains("\no\n"));
12730        assert!(cleaned.contains("To get started."));
12731        assert!(cleaned.contains("NOTE: Keep going."));
12732    }
12733
12734    fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
12735        let mut table_rows = Vec::new();
12736        for (row_number, (left, right)) in rows.iter().enumerate() {
12737            let top = 656.0 - row_number as f64 * 18.0;
12738            let bottom = top - 16.0;
12739            let mut cells = Vec::new();
12740            for (col_number, (text, left_x, right_x)) in
12741                [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
12742                    .into_iter()
12743                    .enumerate()
12744            {
12745                let content = if text.is_empty() {
12746                    Vec::new()
12747                } else {
12748                    vec![TableToken {
12749                        base: TextChunk {
12750                            value: text.to_string(),
12751                            bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12752                            font_name: "Test".to_string(),
12753                            font_size: 11.0,
12754                            font_weight: 400.0,
12755                            italic_angle: 0.0,
12756                            font_color: "[0.0]".to_string(),
12757                            contrast_ratio: 21.0,
12758                            symbol_ends: Vec::new(),
12759                            text_format: TextFormat::Normal,
12760                            text_type: TextType::Regular,
12761                            pdf_layer: PdfLayer::Main,
12762                            ocg_visible: true,
12763                            index: None,
12764                            page_number: Some(1),
12765                            level: None,
12766                            mcid: None,
12767                        },
12768                        token_type: TableTokenType::Text,
12769                    }]
12770                };
12771                cells.push(TableBorderCell {
12772                    bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12773                    index: None,
12774                    level: None,
12775                    row_number,
12776                    col_number,
12777                    row_span: 1,
12778                    col_span: 1,
12779                    content,
12780                    contents: vec![],
12781                    semantic_type: None,
12782                });
12783            }
12784
12785            table_rows.push(TableBorderRow {
12786                bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
12787                index: None,
12788                level: None,
12789                row_number,
12790                cells,
12791                semantic_type: None,
12792            });
12793        }
12794
12795        ContentElement::TableBorder(TableBorder {
12796            bbox: BoundingBox::new(
12797                Some(1),
12798                72.0,
12799                656.0 - rows.len() as f64 * 18.0 - 16.0,
12800                420.0,
12801                656.0,
12802            ),
12803            index: None,
12804            level: Some("1".to_string()),
12805            x_coordinates: vec![72.0, 220.0, 420.0],
12806            x_widths: vec![0.0; 3],
12807            y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
12808            y_widths: vec![0.0; rows.len() + 1],
12809            rows: table_rows,
12810            num_rows: rows.len(),
12811            num_columns: 2,
12812            is_bad_table: false,
12813            is_table_transformer: false,
12814            previous_table: None,
12815            next_table: None,
12816        })
12817    }
12818
12819    fn make_chunked_paragraph_line(
12820        segments: &[(&str, f64, f64)],
12821        bottom: f64,
12822        top: f64,
12823    ) -> ContentElement {
12824        let bbox = BoundingBox::new(
12825            Some(1),
12826            segments.first().map(|(_, left, _)| *left).unwrap_or(72.0),
12827            bottom,
12828            segments.last().map(|(_, _, right)| *right).unwrap_or(320.0),
12829            top,
12830        );
12831
12832        let chunks = segments
12833            .iter()
12834            .map(|(text, left, right)| TextChunk {
12835                value: (*text).to_string(),
12836                bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
12837                font_name: "Lato-Regular".to_string(),
12838                font_size: top - bottom,
12839                font_weight: 400.0,
12840                italic_angle: 0.0,
12841                font_color: "#000000".to_string(),
12842                contrast_ratio: 21.0,
12843                symbol_ends: vec![],
12844                text_format: TextFormat::Normal,
12845                text_type: TextType::Regular,
12846                pdf_layer: PdfLayer::Main,
12847                ocg_visible: true,
12848                index: None,
12849                page_number: Some(1),
12850                level: None,
12851                mcid: None,
12852            })
12853            .collect::<Vec<_>>();
12854
12855        let line = TextLine {
12856            bbox: bbox.clone(),
12857            index: None,
12858            level: None,
12859            font_size: top - bottom,
12860            base_line: bottom + 2.0,
12861            slant_degree: 0.0,
12862            is_hidden_text: false,
12863            text_chunks: chunks,
12864            is_line_start: true,
12865            is_line_end: true,
12866            is_list_line: false,
12867            connected_line_art_label: None,
12868        };
12869        let block = TextBlock {
12870            bbox: bbox.clone(),
12871            index: None,
12872            level: None,
12873            font_size: line.font_size,
12874            base_line: line.base_line,
12875            slant_degree: 0.0,
12876            is_hidden_text: false,
12877            text_lines: vec![line],
12878            has_start_line: true,
12879            has_end_line: true,
12880            text_alignment: None,
12881        };
12882        let column = TextColumn {
12883            bbox: bbox.clone(),
12884            index: None,
12885            level: None,
12886            font_size: block.font_size,
12887            base_line: block.base_line,
12888            slant_degree: 0.0,
12889            is_hidden_text: false,
12890            text_blocks: vec![block],
12891        };
12892
12893        ContentElement::Paragraph(SemanticParagraph {
12894            base: SemanticTextNode {
12895                bbox,
12896                index: None,
12897                level: None,
12898                semantic_type: SemanticType::Paragraph,
12899                correct_semantic_score: None,
12900                columns: vec![column],
12901                font_weight: Some(400.0),
12902                font_size: Some(top - bottom),
12903                text_color: None,
12904                italic_angle: None,
12905                font_name: Some("Lato-Regular".to_string()),
12906                text_format: None,
12907                max_font_size: Some(top - bottom),
12908                background_color: None,
12909                is_hidden_text: false,
12910            },
12911            enclosed_top: false,
12912            enclosed_bottom: false,
12913            indentation: 0,
12914        })
12915    }
12916
12917    fn make_n_column_table(rows: &[Vec<&str>], column_bounds: &[(f64, f64)]) -> ContentElement {
12918        let mut table_rows = Vec::new();
12919        for (row_number, row_values) in rows.iter().enumerate() {
12920            let top = 656.0 - row_number as f64 * 18.0;
12921            let bottom = top - 16.0;
12922            let mut cells = Vec::new();
12923            for (col_number, (left_x, right_x)) in column_bounds.iter().enumerate() {
12924                let text = row_values.get(col_number).copied().unwrap_or("");
12925                let content = if text.is_empty() {
12926                    Vec::new()
12927                } else {
12928                    vec![TableToken {
12929                        base: TextChunk {
12930                            value: text.to_string(),
12931                            bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
12932                            font_name: "Test".to_string(),
12933                            font_size: 11.0,
12934                            font_weight: 400.0,
12935                            italic_angle: 0.0,
12936                            font_color: "[0.0]".to_string(),
12937                            contrast_ratio: 21.0,
12938                            symbol_ends: Vec::new(),
12939                            text_format: TextFormat::Normal,
12940                            text_type: TextType::Regular,
12941                            pdf_layer: PdfLayer::Main,
12942                            ocg_visible: true,
12943                            index: None,
12944                            page_number: Some(1),
12945                            level: None,
12946                            mcid: None,
12947                        },
12948                        token_type: TableTokenType::Text,
12949                    }]
12950                };
12951                cells.push(TableBorderCell {
12952                    bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
12953                    index: None,
12954                    level: None,
12955                    row_number,
12956                    col_number,
12957                    row_span: 1,
12958                    col_span: 1,
12959                    content,
12960                    contents: vec![],
12961                    semantic_type: None,
12962                });
12963            }
12964
12965            table_rows.push(TableBorderRow {
12966                bbox: BoundingBox::new(
12967                    Some(1),
12968                    column_bounds.first().map(|(left, _)| *left).unwrap_or(72.0),
12969                    bottom,
12970                    column_bounds
12971                        .last()
12972                        .map(|(_, right)| *right)
12973                        .unwrap_or(420.0),
12974                    top,
12975                ),
12976                index: None,
12977                level: None,
12978                row_number,
12979                cells,
12980                semantic_type: None,
12981            });
12982        }
12983
12984        let left = column_bounds
12985            .first()
12986            .map(|(value, _)| *value)
12987            .unwrap_or(72.0);
12988        let right = column_bounds
12989            .last()
12990            .map(|(_, value)| *value)
12991            .unwrap_or(420.0);
12992        let x_coordinates = std::iter::once(left)
12993            .chain(column_bounds.iter().map(|(_, right)| *right))
12994            .collect::<Vec<_>>();
12995
12996        ContentElement::TableBorder(TableBorder {
12997            bbox: BoundingBox::new(
12998                Some(1),
12999                left,
13000                656.0 - rows.len() as f64 * 18.0 - 16.0,
13001                right,
13002                656.0,
13003            ),
13004            index: None,
13005            level: Some("1".to_string()),
13006            x_coordinates,
13007            x_widths: vec![0.0; column_bounds.len() + 1],
13008            y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
13009            y_widths: vec![0.0; rows.len() + 1],
13010            rows: table_rows,
13011            num_rows: rows.len(),
13012            num_columns: column_bounds.len(),
13013            is_bad_table: false,
13014            is_table_transformer: false,
13015            previous_table: None,
13016            next_table: None,
13017        })
13018    }
13019
13020    #[test]
13021    fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
13022        let mut doc = PdfDocument::new("cec-table.pdf".to_string());
13023        doc.number_of_pages = 1;
13024        doc.kids.push(make_two_column_table(&[
13025            ("Mineral or colloid type", "CEC of pure colloid"),
13026            ("", "cmolc/kg"),
13027            ("kaolinite", "10"),
13028            ("illite", "30"),
13029        ]));
13030
13031        let md = to_markdown(&doc).unwrap();
13032        assert!(md.contains("| --- | --- |"));
13033        assert!(md.contains("| kaolinite | 10 |"));
13034    }
13035
13036    #[test]
13037    fn test_blank_right_column_table_is_not_misrendered_as_toc() {
13038        let mut doc = PdfDocument::new("flocculation-table.pdf".to_string());
13039        doc.number_of_pages = 1;
13040        doc.kids.push(make_two_column_table(&[
13041            (
13042                "Added cation",
13043                "Relative Size & Settling Rates of Floccules",
13044            ),
13045            ("K+", ""),
13046            ("Na+", ""),
13047            ("Ca2+", ""),
13048        ]));
13049
13050        let md = to_markdown(&doc).unwrap();
13051        assert!(md.contains("| Added cation | Relative Size & Settling Rates of Floccules |"));
13052        assert!(md.contains("| K+ |  |"));
13053    }
13054
13055    #[test]
13056    fn test_infographic_card_table_renders_as_numbered_item() {
13057        let mut doc = PdfDocument::new("infographic-card.pdf".to_string());
13058        doc.number_of_pages = 1;
13059        doc.kids.push(make_two_column_table(&[
13060            (
13061                "1",
13062                "We're all both consumers and creators of creative work.",
13063            ),
13064            (
13065                "",
13066                "As consumers, we watch movies, listen to music, read books, and more.",
13067            ),
13068        ]));
13069
13070        let md = to_markdown(&doc).unwrap();
13071        assert!(md.contains(
13072            "1. We're all both consumers and creators of creative work. As consumers, we watch movies, listen to music, read books, and more."
13073        ));
13074        assert!(!md.contains("| 1 |"));
13075    }
13076
13077    #[test]
13078    fn test_grouped_header_rows_are_preserved_without_flattening() {
13079        let mut doc = PdfDocument::new("grouped-header.pdf".to_string());
13080        doc.number_of_pages = 1;
13081        doc.kids.push(make_n_column_table(
13082            &[
13083                vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
13084                vec![
13085                    "",
13086                    "Alpaca-GPT4",
13087                    "OpenOrca",
13088                    "Synth. Math-Instruct",
13089                    "Orca DPO Pairs",
13090                    "Ultrafeedback Cleaned",
13091                    "Synth. Math-Alignment",
13092                ],
13093                vec![
13094                    "Total # Samples",
13095                    "52K",
13096                    "2.91M",
13097                    "126K",
13098                    "12.9K",
13099                    "60.8K",
13100                    "126K",
13101                ],
13102            ],
13103            &[
13104                (72.0, 120.0),
13105                (120.0, 170.0),
13106                (170.0, 220.0),
13107                (220.0, 280.0),
13108                (280.0, 340.0),
13109                (340.0, 410.0),
13110                (410.0, 470.0),
13111            ],
13112        ));
13113
13114        let md = to_markdown(&doc).unwrap();
13115        assert!(md.contains(
13116            "| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"
13117        ));
13118        assert!(md.contains(
13119            "|  | Alpaca-GPT4 | OpenOrca | Synth. Math-Instruct | Orca DPO Pairs | Ultrafeedback Cleaned | Synth. Math-Alignment |"
13120        ));
13121        assert!(!md.contains("Instruction OpenOrca"));
13122        assert!(!md.contains("Alignment Ultrafeedback"));
13123    }
13124
13125    #[test]
13126    fn test_top_table_plate_renderer_stops_before_article_body() {
13127        let mut doc = PdfDocument::new("table-plate.pdf".to_string());
13128        doc.number_of_pages = 1;
13129        doc.kids
13130            .push(make_paragraph_at(72.0, 724.0, 200.0, 736.0, "SOLAR 10.7B"));
13131        doc.kids.push(make_paragraph_at(
13132            72.0,
13133            704.0,
13134            220.0,
13135            716.0,
13136            "Training datasets",
13137        ));
13138        doc.kids.push(make_n_column_table(
13139            &[
13140                vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
13141                vec![
13142                    "",
13143                    "Alpaca-GPT4",
13144                    "OpenOrca",
13145                    "Synth. Math-Instruct",
13146                    "Orca DPO Pairs",
13147                    "Ultrafeedback Cleaned",
13148                    "Synth. Math-Alignment",
13149                ],
13150                vec![
13151                    "Total # Samples",
13152                    "52K",
13153                    "2.91M",
13154                    "126K",
13155                    "12.9K",
13156                    "60.8K",
13157                    "126K",
13158                ],
13159                vec![
13160                    "Maximum # Samples Used",
13161                    "52K",
13162                    "100K",
13163                    "52K",
13164                    "12.9K",
13165                    "60.8K",
13166                    "20.1K",
13167                ],
13168                vec!["Open Source", "O", "O", "✗", "O", "O", "✗"],
13169            ],
13170            &[
13171                (78.0, 125.0),
13172                (125.0, 175.0),
13173                (175.0, 225.0),
13174                (225.0, 285.0),
13175                (285.0, 345.0),
13176                (345.0, 415.0),
13177                (415.0, 490.0),
13178            ],
13179        ));
13180        doc.kids.push(make_paragraph_at(
13181            72.0,
13182            500.0,
13183            310.0,
13184            514.0,
13185            "Table 1: Training datasets used for the instruction and alignment tuning stages, respectively.",
13186        ));
13187        doc.kids.push(make_paragraph_at(
13188            286.0,
13189            484.0,
13190            526.0,
13191            498.0,
13192            "Open source indicates whether the dataset is open-sourced.",
13193        ));
13194        doc.kids.push(make_paragraph_at(
13195            72.0,
13196            360.0,
13197            290.0,
13198            388.0,
13199            "Comparison to other up-scaling methods. Unlike Komatsuzaki et al. (2022)...",
13200        ));
13201
13202        let md = to_markdown(&doc).unwrap();
13203        assert!(md.contains("Table 1: Training datasets used for the instruction"));
13204        assert!(md.contains("| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"));
13205        assert!(!md.contains("Comparison to other up-scaling methods"));
13206    }
13207
13208    #[test]
13209    fn test_late_section_boundary_renderer_drops_equation_carryover() {
13210        let mut doc = PdfDocument::new("late-section.pdf".to_string());
13211        doc.number_of_pages = 1;
13212        doc.kids.push(make_paragraph_at(
13213            72.0,
13214            700.0,
13215            540.0,
13216            714.0,
13217            "The horizontal distance traveled by the jet is equal to:",
13218        ));
13219        doc.kids.push(make_paragraph_at(
13220            72.0,
13221            640.0,
13222            540.0,
13223            654.0,
13224            "The vertical position of the jet may be calculated as:",
13225        ));
13226        doc.kids.push(make_paragraph_at(
13227            72.0,
13228            580.0,
13229            260.0,
13230            594.0,
13231            "Rearranging Equation (8) gives:",
13232        ));
13233        doc.kids.push(make_paragraph_at(
13234            72.0,
13235            520.0,
13236            420.0,
13237            534.0,
13238            "Substitution into Equation 7 results in:",
13239        ));
13240        doc.kids.push(make_paragraph_at(
13241            72.0,
13242            460.0,
13243            280.0,
13244            474.0,
13245            "Equations (10) can be rearranged to find Cv:",
13246        ));
13247        doc.kids.push(make_heading_at(
13248            72.0,
13249            350.0,
13250            420.0,
13251            366.0,
13252            "7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE",
13253        ));
13254        doc.kids.push(make_paragraph_at(
13255            72.0,
13256            326.0,
13257            380.0,
13258            340.0,
13259            "If C_d is assumed to be constant, then a graph of Q plotted against",
13260        ));
13261        doc.kids.push(make_paragraph_at(
13262            400.0,
13263            326.0,
13264            540.0,
13265            340.0,
13266            "(Equation 6) will be linear, and",
13267        ));
13268        doc.kids.push(make_paragraph_at(
13269            72.0,
13270            310.0,
13271            240.0,
13272            324.0,
13273            "the slope of this graph will be:",
13274        ));
13275        doc.kids.push(make_paragraph_at(
13276            360.0,
13277            36.0,
13278            550.0,
13279            48.0,
13280            "EXPERIMENT #6: ORIFICE AND FREE JET FLOW 53",
13281        ));
13282
13283        let md = to_markdown(&doc).unwrap();
13284        assert!(md.starts_with("# 7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE"));
13285        assert!(md.contains(
13286            "If C_d is assumed to be constant, then a graph of Q plotted against (Equation 6) will be linear, and the slope of this graph will be:"
13287        ));
13288        assert!(!md.contains("The horizontal distance traveled by the jet"));
13289        assert!(!md.contains("EXPERIMENT #6"));
13290    }
13291
13292    #[test]
13293    fn test_leading_table_carryover_row_is_trimmed_from_general_renderer() {
13294        let mut doc = PdfDocument::new("carryover-table.pdf".to_string());
13295        doc.number_of_pages = 1;
13296        doc.kids.push(make_n_column_table(
13297            &[
13298                vec![
13299                    "Jurisdiction",
13300                    "GATS XVII Reservation (1994)",
13301                    "Foreign Ownership Permitted",
13302                    "Restrictions on Foreign Ownership",
13303                    "Foreign Ownership Reporting Requirements",
13304                ],
13305                vec![
13306                    "",
13307                    "",
13308                    "",
13309                    "right required to acquire desert lands and continue the prior page",
13310                    "",
13311                ],
13312                vec!["Finland", "N", "Y", "Prior approval may be required.", ""],
13313                vec!["France", "N", "Y", "None.", ""],
13314            ],
13315            &[
13316                (72.0, 150.0),
13317                (150.0, 235.0),
13318                (235.0, 330.0),
13319                (330.0, 500.0),
13320                (500.0, 560.0),
13321            ],
13322        ));
13323
13324        let md = to_markdown(&doc).unwrap();
13325        assert!(!md.contains("right required to acquire desert lands"));
13326        assert!(md.contains("| Finland | N | Y | Prior approval may be required. |  |"));
13327    }
13328
13329    #[test]
13330    fn test_single_table_report_renderer_promotes_title_and_skips_footer() {
13331        let mut doc = PdfDocument::new("single-table-report.pdf".to_string());
13332        doc.number_of_pages = 1;
13333        doc.kids.push(make_paragraph_at(
13334            140.0,
13335            674.0,
13336            474.0,
13337            688.0,
13338            "Restrictions on Land Ownership by Foreigners in Selected Jurisdictions",
13339        ));
13340        doc.kids.push(make_n_column_table(
13341            &[
13342                vec![
13343                    "Jurisdiction",
13344                    "GATS XVII Reservation (1994)",
13345                    "Foreign Ownership Permitted",
13346                    "Restrictions on Foreign Ownership",
13347                    "Foreign Ownership Reporting Requirements",
13348                ],
13349                vec![
13350                    "",
13351                    "",
13352                    "",
13353                    "right required to acquire desert lands and continue the prior page",
13354                    "",
13355                ],
13356                vec![
13357                    "Finland",
13358                    "N",
13359                    "Y",
13360                    "Prior approval from the Government of Aland may be required.",
13361                    "",
13362                ],
13363                vec!["France", "N", "Y", "None.", ""],
13364            ],
13365            &[
13366                (72.0, 150.0),
13367                (150.0, 235.0),
13368                (235.0, 330.0),
13369                (330.0, 500.0),
13370                (500.0, 560.0),
13371            ],
13372        ));
13373        doc.kids.push(make_paragraph_at(
13374            350.0,
13375            36.0,
13376            548.0,
13377            48.0,
13378            "The Law Library of Congress 7",
13379        ));
13380
13381        let md = to_markdown(&doc).unwrap();
13382        assert!(md.starts_with(
13383            "# Restrictions on Land Ownership by Foreigners in Selected Jurisdictions"
13384        ));
13385        assert!(!md.contains("right required to acquire desert lands"));
13386        assert!(!md.contains("The Law Library of Congress 7"));
13387        assert!(md.contains(
13388            "| Finland | N | Y | Prior approval from the Government of Aland may be required. |  |"
13389        ));
13390    }
13391
13392    #[test]
13393    fn test_geometric_panel_headers_are_promoted_into_table() {
13394        let mut doc = PdfDocument::new("ai-pack-panel.pdf".to_string());
13395        doc.kids.push(make_chunked_paragraph_line(
13396            &[("OCR", 220.0, 250.0)],
13397            720.0,
13398            732.0,
13399        ));
13400        doc.kids.push(make_chunked_paragraph_line(
13401            &[("Recommendation", 430.0, 540.0)],
13402            720.0,
13403            732.0,
13404        ));
13405        doc.kids.push(make_chunked_paragraph_line(
13406            &[("Product semantic search", 660.0, 860.0)],
13407            720.0,
13408            732.0,
13409        ));
13410        doc.kids.push(make_chunked_paragraph_line(
13411            &[("Pack", 72.0, 110.0)],
13412            684.0,
13413            696.0,
13414        ));
13415        doc.kids.push(make_chunked_paragraph_line(
13416            &[("A solution that recognizes characters", 140.0, 340.0)],
13417            684.0,
13418            696.0,
13419        ));
13420        doc.kids.push(make_chunked_paragraph_line(
13421            &[("A solution that recommends the best products", 390.0, 620.0)],
13422            684.0,
13423            696.0,
13424        ));
13425        doc.kids.push(make_chunked_paragraph_line(
13426            &[("A solution that enables semantic search", 650.0, 900.0)],
13427            684.0,
13428            696.0,
13429        ));
13430        doc.kids.push(make_n_column_table(
13431            &[
13432                vec![
13433                    "Achieved 1st place in the OCR World Competition",
13434                    "Team with specialists and technologies",
13435                    "Creation of the first natural language evaluation",
13436                ],
13437                vec![
13438                    "The team includes specialists who have",
13439                    "received Kaggle's Gold Medal recommendation",
13440                    "system in Korean (KLUE)",
13441                ],
13442                vec![
13443                    "presented 14 papers in renowned AI conferences",
13444                    "top-tier recommendation",
13445                    "Shopee subject",
13446                ],
13447            ],
13448            &[(120.0, 360.0), (360.0, 630.0), (630.0, 910.0)],
13449        ));
13450        doc.kids.push(make_chunked_paragraph_line(
13451            &[("models", 430.0, 490.0)],
13452            552.0,
13453            564.0,
13454        ));
13455
13456        let md = to_markdown(&doc).unwrap();
13457        assert!(md.contains("| Pack | OCR | Recommendation | Product semantic search |"));
13458        assert!(md.contains("| A solution that recognizes characters | A solution that recommends the best products | A solution that enables semantic search |"));
13459        assert!(md.contains(
13460            "received Kaggle's Gold Medal recommendation top-tier recommendation models"
13461        ));
13462    }
13463
13464    #[test]
13465    fn test_embedded_stub_header_is_promoted_from_first_table_column() {
13466        let mut doc = PdfDocument::new("embedded-stub-header.pdf".to_string());
13467        doc.kids.push(make_chunked_paragraph_line(
13468            &[("OCR", 220.0, 250.0)],
13469            720.0,
13470            732.0,
13471        ));
13472        doc.kids.push(make_chunked_paragraph_line(
13473            &[("Recommendation", 430.0, 540.0)],
13474            720.0,
13475            732.0,
13476        ));
13477        doc.kids.push(make_chunked_paragraph_line(
13478            &[("Product semantic search", 660.0, 860.0)],
13479            720.0,
13480            732.0,
13481        ));
13482        doc.kids.push(make_n_column_table(
13483            &[
13484                vec![
13485                    "Pack",
13486                    "A solution that recognizes characters in an image and extracts necessary information",
13487                    "A solution that recommends the best products and contents",
13488                    "A solution that enables semantic search and organizes key information",
13489                ],
13490                vec![
13491                    "Application",
13492                    "Applicable to all fields that require text extraction",
13493                    "Applicable to all fields that use any form of recommendation",
13494                    "Applicable to all fields that deal with unstructured data",
13495                ],
13496                vec![
13497                    "Highlight",
13498                    "Achieved 1st place in the OCR World Competition",
13499                    "Received Kaggle's Gold Medal recommendation",
13500                    "Creation of the first natural language evaluation system in Korean",
13501                ],
13502            ],
13503            &[
13504                (72.0, 120.0),
13505                (120.0, 360.0),
13506                (360.0, 630.0),
13507                (630.0, 910.0),
13508            ],
13509        ));
13510
13511        let md = to_markdown(&doc).unwrap();
13512        assert!(md.contains("| Pack | OCR | Recommendation | Product semantic search |"));
13513        assert!(
13514            md.contains("| Application | Applicable to all fields that require text extraction |")
13515        );
13516        assert!(md.contains("| Highlight | Achieved 1st place in the OCR World Competition |"));
13517        assert!(!md.contains("OCR\n\nRecommendation\n\nProduct semantic search"));
13518    }
13519
13520    #[test]
13521    fn test_geometric_chunk_alignment_splits_header_line_into_columns() {
13522        let line = make_chunked_paragraph_line(
13523            &[
13524                ("Properties", 72.0, 145.0),
13525                ("Instruction", 180.0, 255.0),
13526                ("Alignment", 480.0, 545.0),
13527            ],
13528            720.0,
13529            732.0,
13530        );
13531        let chunk_lines = extract_chunk_lines(&line);
13532        let fragments = split_line_into_slot_fragments(
13533            &chunk_lines[0],
13534            &[
13535                (72.0, 170.0),
13536                (170.0, 280.0),
13537                (280.0, 380.0),
13538                (380.0, 480.0),
13539                (480.0, 600.0),
13540                (600.0, 720.0),
13541                (720.0, 850.0),
13542            ],
13543        );
13544
13545        assert_eq!(fragments.len(), 3);
13546        assert_eq!(fragments[0].slot_idx, 0);
13547        assert_eq!(fragments[0].text, "Properties");
13548        assert_eq!(fragments[1].slot_idx, 1);
13549        assert_eq!(fragments[1].text, "Instruction");
13550        assert_eq!(fragments[2].slot_idx, 4);
13551        assert_eq!(fragments[2].text, "Alignment");
13552    }
13553
13554    #[test]
13555    fn test_merge_tables_across_heading() {
13556        let input = "some text\n\n\
13557                      | Area | Competence |\n\
13558                      | --- | --- |\n\
13559                      | Row1 | Val1 |\n\
13560                      | Row2 | Val2 |\n\
13561                      \n\
13562                      # Heading Between\n\
13563                      \n\
13564                      | Row3 | Val3 |\n\
13565                      | --- | --- |\n\
13566                      \n\
13567                      more text\n";
13568        let result = merge_adjacent_pipe_tables(input);
13569        // Heading should be converted to a pipe row
13570        assert!(
13571            result.contains("| Heading Between |"),
13572            "Heading should be in pipe row: {}",
13573            result
13574        );
13575        // Should NOT have # heading marker
13576        assert!(
13577            !result.contains("# Heading Between"),
13578            "Heading marker should be removed: {}",
13579            result
13580        );
13581        // Row3 should still be present
13582        assert!(
13583            result.contains("| Row3 |") || result.contains("Row3"),
13584            "Row3 should exist: {}",
13585            result
13586        );
13587    }
13588
13589    #[test]
13590    fn test_merge_tables_does_not_cross_distinct_headers() {
13591        let input = "| Model | Score |\n\
13592                     | --- | --- |\n\
13593                     | A | 1 |\n\
13594                     \n\
13595                     Table 6: Performance comparison amongst the merge candidates.\n\
13596                     \n\
13597                     | Model | Method | Score |\n\
13598                     | --- | --- | --- |\n\
13599                     | B | Avg | 2 |\n";
13600        let result = merge_adjacent_pipe_tables(input);
13601
13602        assert!(result.contains("Table 6: Performance comparison amongst the merge candidates."));
13603        assert!(result.contains("| Model | Score |"));
13604        assert!(result.contains("| Model | Method | Score |"));
13605        assert!(
13606            !result.contains("| Table 6: Performance comparison amongst the merge candidates. |")
13607        );
13608    }
13609
13610    #[test]
13611    fn test_normalize_chart_like_markdown_extracts_series_tables() {
13612        let input = "Figure 1.7. Non-citizen population in Malaysia (in thousands) 3,323 3,500 3,288 3,230 3,140 2,907 3,000 2,693 2,500 2,000 1,500 1,000 500 0\n\n\
13613                     2016 2017 2018 2019 2020 2021 Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.\n\n\
13614                     ASEAN Migration Outlook 19\n";
13615
13616        let normalized = normalize_chart_like_markdown(input);
13617        assert!(
13618            normalized.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
13619        );
13620        assert!(normalized.contains("| 2016 | 3,323 |"));
13621        assert!(normalized.contains("| 2021 | 2,693 |"));
13622        assert!(normalized.contains(
13623            "*Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.*"
13624        ));
13625        assert!(!normalized.contains("ASEAN Migration Outlook 19"));
13626    }
13627
13628    #[test]
13629    fn test_normalize_chart_like_markdown_promotes_structural_captions() {
13630        let input = "Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or\n\n\
13631                     The Wonderful Lamp.\n\n\
13632                     Body paragraph.\n";
13633
13634        let normalized = normalize_chart_like_markdown(input);
13635        assert!(normalized.contains(
13636            "## Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or The Wonderful Lamp"
13637        ));
13638        assert!(normalized.contains("Body paragraph."));
13639    }
13640
/// A one-row pipe table whose cells fuse a value with a year (`126 2014`)
/// under a figure caption must be rebuilt as a Year/value table, and the
/// original garbled row must no longer be present.
#[test]
fn test_normalize_chart_like_markdown_reconstructs_header_pair_chart_table() {
    let markdown = concat!(
        "Figure 4.8. Domestic Wood Pellets Production\n\n",
        "| 8 | 800 200 | 126 2014 | 120 2015 | 120 2016 | 127 2017 | 131 2018 | 147 2019 |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n\n",
        "Source: Forestry Agency, Ministry of Agriculture, Forestry and Fishery (MAFF), 2020.\n",
    );

    let output = normalize_chart_like_markdown(markdown);

    // Heading, reconstructed table header, and the first/last data rows.
    let must_appear = [
        "# Figure 4.8. Domestic Wood Pellets Production",
        "| Year | Domestic Wood Pellets Production |",
        "| 2014 | 126 |",
        "| 2019 | 147 |",
    ];
    for fragment in must_appear {
        assert!(output.contains(fragment), "missing fragment: {:?}", fragment);
    }

    // The leading cells of the garbled source row must be gone.
    let garbled_row_start = "| 8 | 800 200 |";
    assert!(
        !output.contains(garbled_row_start),
        "unexpected fragment: {:?}",
        garbled_row_start
    );
}
13655
/// A single-cell table made of chart axis ticks and legend words must be
/// removed, while the chart title that follows it is kept.
#[test]
fn test_normalize_chart_like_markdown_drops_numeric_axis_artifact_table() {
    let chart_title = "Distribution of Komnas HAM's YouTube Content (2019-2020)";
    let markdown = concat!(
        "| 31 1 0 2 23 2 2 2 0 5 10 15 20 25 30 35 Event Celebration Information Videograph 2019 2020 |\n",
        "| --- |\n\n",
        "Distribution of Komnas HAM's YouTube Content (2019-2020)\n",
    );

    let output = normalize_chart_like_markdown(markdown);

    // No separator row left means the artifact table was dropped.
    let separator_row = "| --- |";
    assert!(
        !output.contains(separator_row),
        "unexpected fragment: {:?}",
        separator_row
    );
    assert!(
        output.contains(chart_title),
        "missing fragment: {:?}",
        chart_title
    );
}
13666
/// A single-cell table that only holds the tail of a URL must be removed,
/// while the footnote text after it is kept.
#[test]
fn test_normalize_chart_like_markdown_drops_url_fragment_table() {
    let footnote = "98 DPN Argentina, accessed on 5 December 2021.";
    let markdown = concat!(
        "## Figure 6 DPN Argentina Content: World Health Day Celebration\n\n",
        "| na/status/1379765916259483648 |\n",
        "| --- |\n\n",
        "98 DPN Argentina, accessed on 5 December 2021.\n",
    );

    let output = normalize_chart_like_markdown(markdown);

    // The URL tail followed by a cell delimiter only occurs inside the table row.
    let url_cell_tail = "/status/1379765916259483648 |";
    assert!(
        !output.contains(url_cell_tail),
        "unexpected fragment: {:?}",
        url_cell_tail
    );
    assert!(output.contains(footnote), "missing fragment: {:?}", footnote);
}
13678
/// A mostly-empty table sitting directly before a figure caption must be
/// removed, while the caption itself is kept.
#[test]
fn test_normalize_chart_like_markdown_drops_sparse_table_before_caption() {
    let caption = "Figure 8.6: Growth in length of Alligator Gar in Texas.";
    // Prose, then a sparse eight-column table, then the figure caption.
    let markdown = concat!(
        "What’s unique about the growth of Alligator Gars is their fast growth.\n\n",
        "| in | cm |  | Length | of | Gar | Fish | Age |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
        "| 120) | 300 |  |  |  |  |  |  |\n",
        "| 100+ | 250 |  |  |  |  |  |  |\n",
        "| 80+ | 200 |  |  |  |  |  |  |\n",
        "| 20. | 50 | G |  |  |  |  | Vi |\n",
        "| 0 | 0 |  |  |  |  |  |  |\n",
        "|  | 0 | 10 | 30 |  | 40 | 50 | 60 |\n\n",
        "Figure 8.6: Growth in length of Alligator Gar in Texas.\n",
    );

    let output = normalize_chart_like_markdown(markdown);

    // The first two header cells identify the sparse table.
    let table_header_start = "| in | cm |";
    assert!(
        !output.contains(table_header_start),
        "unexpected fragment: {:?}",
        table_header_start
    );
    assert!(output.contains(caption), "missing fragment: {:?}", caption);
}
13696
/// When the input opens with a large table, the output must still start with
/// that table's header row, and the table caption, the section heading and
/// the prose after it must all be absent.
#[test]
fn test_normalize_chart_like_markdown_trims_large_top_table_plate() {
    // Header + separator + eight identical data rows, then caption/heading/prose.
    let markdown = concat!(
        "| A | B | C | D | E | F | G | H |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\n",
        "Table 2: Evaluation results for SOLAR 10.7B and SOLAR 10.7B-Instruct along with other top-performing models in the paper.\n\n",
        "# 4.2 Main Results\n\n",
        "The surrounding prose should be dropped.\n",
    );

    let output = normalize_chart_like_markdown(markdown);

    let header_row = "| A | B | C | D | E | F | G | H |";
    assert!(
        output.starts_with(header_row),
        "output does not start with {:?}",
        header_row
    );

    let must_be_gone = ["Table 2:", "4.2 Main Results", "surrounding prose"];
    for fragment in must_be_gone {
        assert!(
            !output.contains(fragment),
            "unexpected fragment: {:?}",
            fragment
        );
    }
}
13719}