Skip to main content

edgeparse_core/output/
markdown.rs

1//! Markdown output generator.
2
3#[cfg(not(target_arch = "wasm32"))]
4use regex::Regex;
5use std::collections::{HashMap, HashSet};
6#[cfg(not(target_arch = "wasm32"))]
7use std::path::Path;
8#[cfg(not(target_arch = "wasm32"))]
9use std::process::Command;
10
11use crate::models::bbox::BoundingBox;
12use crate::models::chunks::TextChunk;
13use crate::models::content::ContentElement;
14use crate::models::document::PdfDocument;
15use crate::models::enums::SemanticType;
16use crate::models::semantic::SemanticTextNode;
17use crate::models::table::TableTokenRow;
18use crate::EdgePdfError;
19
/// Parsed bbox-layout data for the source PDF, as stored by
/// `LayoutSourceCache::bbox_layout`.
20#[cfg(not(target_arch = "wasm32"))]
21struct CachedBBoxLayout {
    /// First value of the tuple returned by `read_pdftotext_bbox_layout_lines`.
22    page_width: f64,
    /// Second value of the tuple returned by `read_pdftotext_bbox_layout_lines`.
23    lines: Vec<BBoxLayoutLine>,
    /// Result of `collect_bbox_layout_blocks(&lines)`, computed once at load time.
24    blocks: Vec<BBoxLayoutBlock>,
25}
26
/// Lazily-filled cache of layout data read from the document's source file.
///
/// Each field is a two-level `Option`: the outer `None` means "not attempted
/// yet"; `Some(None)` means a load was attempted and produced nothing;
/// `Some(Some(_))` holds the loaded data. `Default` starts with both fields
/// unattempted.
27#[cfg(not(target_arch = "wasm32"))]
28#[derive(Default)]
29struct LayoutSourceCache {
    /// Filled by `LayoutSourceCache::bbox_layout`.
30    bbox_layout: Option<Option<CachedBBoxLayout>>,
    /// Filled by `LayoutSourceCache::layout_lines`.
31    layout_lines: Option<Option<Vec<String>>>,
32}
33
34#[cfg(not(target_arch = "wasm32"))]
35impl LayoutSourceCache {
36    fn bbox_layout(&mut self, doc: &PdfDocument) -> Option<&CachedBBoxLayout> {
37        if self.bbox_layout.is_none() {
38            let loaded = doc.source_path.as_deref().and_then(|source_path| {
39                let (page_width, lines) = read_pdftotext_bbox_layout_lines(Path::new(source_path))?;
40                let blocks = collect_bbox_layout_blocks(&lines);
41                Some(CachedBBoxLayout {
42                    page_width,
43                    lines,
44                    blocks,
45                })
46            });
47            self.bbox_layout = Some(loaded);
48        }
49        self.bbox_layout.as_ref().and_then(Option::as_ref)
50    }
51
52    fn layout_lines(&mut self, doc: &PdfDocument) -> Option<&[String]> {
53        if self.layout_lines.is_none() {
54            let loaded = doc
55                .source_path
56                .as_deref()
57                .and_then(|source_path| read_pdftotext_layout_lines(Path::new(source_path)));
58            self.layout_lines = Some(loaded);
59        }
60        self.layout_lines
61            .as_ref()
62            .and_then(Option::as_ref)
63            .map(Vec::as_slice)
64    }
65}
66
67/// Generate Markdown representation of a PdfDocument.
68///
69/// # Errors
70/// Returns `EdgePdfError::OutputError` on write failures.
71pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
72    #[cfg(not(target_arch = "wasm32"))]
73    let mut layout_cache = LayoutSourceCache::default();
74    #[cfg(not(target_arch = "wasm32"))]
75    if let Some(rendered) = render_layout_open_plate_document_cached(doc, &mut layout_cache) {
76        return Ok(rendered);
77    }
78    #[cfg(not(target_arch = "wasm32"))]
79    if let Some(rendered) =
80        render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
81    {
82        return Ok(rendered);
83    }
84    #[cfg(not(target_arch = "wasm32"))]
85    if let Some(rendered) = render_layout_captioned_media_document_cached(doc, &mut layout_cache) {
86        return Ok(rendered);
87    }
88    #[cfg(not(target_arch = "wasm32"))]
89    if let Some(rendered) =
90        render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
91    {
92        return Ok(rendered);
93    }
94    #[cfg(not(target_arch = "wasm32"))]
95    if let Some(rendered) = render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
96    {
97        return Ok(rendered);
98    }
99    #[cfg(not(target_arch = "wasm32"))]
100    if let Some(rendered) = render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
101    {
102        return Ok(rendered);
103    }
104    #[cfg(not(target_arch = "wasm32"))]
105    if let Some(rendered) =
106        render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
107    {
108        return Ok(rendered);
109    }
110    #[cfg(not(target_arch = "wasm32"))]
111    if let Some(rendered) = render_layout_toc_document_cached(doc, &mut layout_cache) {
112        return Ok(rendered);
113    }
114    if looks_like_contents_document(doc) {
115        return Ok(render_contents_document(doc));
116    }
117    if looks_like_compact_toc_document(doc) {
118        return Ok(render_compact_toc_document(doc));
119    }
120    #[cfg(not(target_arch = "wasm32"))]
121    if let Some(rendered) = render_layout_projection_sheet_document_cached(doc, &mut layout_cache) {
122        return Ok(rendered);
123    }
124    #[cfg(not(target_arch = "wasm32"))]
125    if let Some(rendered) = render_layout_appendix_tables_document_cached(doc, &mut layout_cache) {
126        return Ok(rendered);
127    }
128    #[cfg(not(target_arch = "wasm32"))]
129    if let Some(rendered) = render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
130    {
131        return Ok(rendered);
132    }
133    #[cfg(not(target_arch = "wasm32"))]
134    if let Some(rendered) = render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
135    {
136        return Ok(rendered);
137    }
138    #[cfg(not(target_arch = "wasm32"))]
139    if let Some(rendered) =
140        render_layout_registration_report_document_cached(doc, &mut layout_cache)
141    {
142        return Ok(rendered);
143    }
144    if let Some(rendered) = render_top_table_plate_document(doc) {
145        return Ok(rendered);
146    }
147    if let Some(rendered) = render_single_table_report_document(doc) {
148        return Ok(rendered);
149    }
150    if let Some(rendered) = render_late_section_boundary_document(doc) {
151        return Ok(rendered);
152    }
153    #[cfg(not(target_arch = "wasm32"))]
154    if let Some(rendered) = render_layout_matrix_document_cached(doc, &mut layout_cache) {
155        return Ok(rendered);
156    }
157    #[cfg(not(target_arch = "wasm32"))]
158    if let Some(rendered) = render_layout_panel_stub_document_cached(doc, &mut layout_cache) {
159        return Ok(rendered);
160    }
161
162    Ok(render_markdown_core(doc))
163}
164
165fn render_markdown_core(doc: &PdfDocument) -> String {
166    let mut output = String::new();
167
168    // Title
169    if let Some(ref title) = doc.title {
170        let trimmed = title.trim();
171        if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
172            if should_render_document_title_as_plaintext(doc, trimmed) {
173                output.push_str(trimmed);
174                output.push_str("\n\n");
175            } else {
176                output.push_str(&format!("# {}\n\n", trimmed));
177            }
178        }
179    }
180
181    if doc.kids.is_empty() {
182        output.push_str("*No content extracted.*\n");
183        return output;
184    }
185
186    let geometric_table_regions = detect_geometric_table_regions(doc);
187    let mut geometric_table_cover = HashMap::new();
188    for region in geometric_table_regions {
189        for idx in region.start_idx..=region.end_idx {
190            geometric_table_cover.insert(idx, region.clone());
191        }
192    }
193
194    let mut i = 0usize;
195    while i < doc.kids.len() {
196        if let Some(region) = geometric_table_cover.get(&i) {
197            output.push_str(&region.rendered);
198            i = region.end_idx + 1;
199            continue;
200        }
201
202        match &doc.kids[i] {
203            ContentElement::Heading(h) => {
204                let text = h.base.base.value();
205                let trimmed = text.trim();
206                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
207                    i += 1;
208                    continue;
209                }
210
211                // Demote carried-over table header rows that were promoted to
212                // headings by the pipeline but only duplicate the table above.
213                if looks_like_table_header_duplicate_heading(doc, i, trimmed) {
214                    output.push_str(&escape_md_line_start(trimmed));
215                    output.push_str("\n\n");
216                    i += 1;
217                    continue;
218                }
219
220                // Demote headings that sit in the bottom margin of the page
221                // (running footers misclassified as headings by the pipeline).
222                if looks_like_bottom_margin_heading(doc, i) {
223                    output.push_str(&escape_md_line_start(trimmed));
224                    output.push_str("\n\n");
225                    i += 1;
226                    continue;
227                }
228
229                // Demote pipeline headings that look like sentence fragments
230                // ending with a period but are not numbered section headings.
231                if should_demote_period_heading(trimmed) {
232                    output.push_str(&escape_md_line_start(trimmed));
233                    output.push_str("\n\n");
234                    i += 1;
235                    continue;
236                }
237
238                // Demote headings ending with comma (footnotes / data labels).
239                if should_demote_comma_heading(trimmed) {
240                    output.push_str(&escape_md_line_start(trimmed));
241                    output.push_str("\n\n");
242                    i += 1;
243                    continue;
244                }
245
246                // Demote headings containing math symbols.
247                if should_demote_math_heading(trimmed) {
248                    output.push_str(&escape_md_line_start(trimmed));
249                    output.push_str("\n\n");
250                    i += 1;
251                    continue;
252                }
253
254                // Demote headings containing percentage signs.
255                if should_demote_percentage_heading(trimmed) {
256                    output.push_str(&escape_md_line_start(trimmed));
257                    output.push_str("\n\n");
258                    i += 1;
259                    continue;
260                }
261
262                // Demote headings that start with a known caption prefix
263                // (e.g. "Source:", "Figure", "Table") — these are captions,
264                // not section headings, regardless of pipeline classification.
265                if starts_with_caption_prefix(trimmed) {
266                    output.push_str(&escape_md_line_start(trimmed));
267                    output.push_str("\n\n");
268                    i += 1;
269                    continue;
270                }
271
272                // Demote bibliography entries: lines starting with a 4-digit
273                // year followed by a period (e.g. "2020. Title of paper...").
274                if should_demote_bibliography_heading(trimmed) {
275                    output.push_str(&escape_md_line_start(trimmed));
276                    output.push_str("\n\n");
277                    i += 1;
278                    continue;
279                }
280
281                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
282                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
283                        let mut merged = trimmed.to_string();
284                        merge_paragraph_text(&mut merged, &next_text);
285                        output.push_str(&escape_md_line_start(merged.trim()));
286                        output.push_str("\n\n");
287                        i += 2;
288                        continue;
289                    }
290                }
291
292                // Merge consecutive heading fragments.
293                // When the PDF splits a title across multiple text elements,
294                // each becomes a separate heading; merge them into one.
295                let mut merged_heading = trimmed.to_string();
296                while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
297                    let next_text = next_h.base.base.value();
298                    let next_trimmed = next_text.trim();
299                    if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
300                        i += 1;
301                        continue;
302                    }
303                    // Only merge if the combined text stays under max heading length
304                    if merged_heading.len() + 1 + next_trimmed.len() > 200 {
305                        break;
306                    }
307                    merge_paragraph_text(&mut merged_heading, next_trimmed);
308                    i += 1;
309                }
310
311                let cleaned_heading = strip_trailing_page_number(merged_heading.trim());
312
313                // Check if this heading contains a merged subsection
314                if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
315                    let first = cleaned_heading[..split_pos].trim();
316                    let second = cleaned_heading[split_pos..].trim();
317                    output.push_str(&format!("# {}\n\n", first));
318                    output.push_str(&format!("# {}\n\n", second));
319                } else {
320                    output.push_str(&format!("# {}\n\n", cleaned_heading));
321                }
322            }
323            ContentElement::NumberHeading(nh) => {
324                let text = nh.base.base.base.value();
325                let trimmed = text.trim();
326                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
327                    i += 1;
328                    continue;
329                }
330
331                // Demote number headings ending with comma (footnotes).
332                if should_demote_comma_heading(trimmed) {
333                    output.push_str(&escape_md_line_start(trimmed));
334                    output.push_str("\n\n");
335                    i += 1;
336                    continue;
337                }
338
339                // Demote number headings containing math symbols.
340                if should_demote_math_heading(trimmed) {
341                    output.push_str(&escape_md_line_start(trimmed));
342                    output.push_str("\n\n");
343                    i += 1;
344                    continue;
345                }
346
347                // Demote number headings containing percentage signs.
348                if should_demote_percentage_heading(trimmed) {
349                    output.push_str(&escape_md_line_start(trimmed));
350                    output.push_str("\n\n");
351                    i += 1;
352                    continue;
353                }
354
355                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
356                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
357                        let mut merged = trimmed.to_string();
358                        merge_paragraph_text(&mut merged, &next_text);
359                        output.push_str(&escape_md_line_start(merged.trim()));
360                        output.push_str("\n\n");
361                        i += 2;
362                        continue;
363                    }
364                }
365
366                let cleaned = strip_trailing_page_number(trimmed);
367
368                // Check if this heading contains a merged subsection
369                if let Some(split_pos) = find_merged_subsection_split(cleaned) {
370                    let first = cleaned[..split_pos].trim();
371                    let second = cleaned[split_pos..].trim();
372                    output.push_str(&format!("# {}\n\n", first));
373                    output.push_str(&format!("# {}\n\n", second));
374                } else {
375                    output.push_str(&format!("# {}\n\n", cleaned));
376                }
377            }
378            ContentElement::Paragraph(_)
379            | ContentElement::TextBlock(_)
380            | ContentElement::TextLine(_) => {
381                let element = &doc.kids[i];
382                let text = match &doc.kids[i] {
383                    ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
384                    ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
385                    ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
386                    _ => unreachable!(),
387                };
388                let trimmed = text.trim();
389                if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
390                    i += 1;
391                    continue;
392                }
393                if should_skip_leading_figure_carryover(doc, i, trimmed) {
394                    i += 1;
395                    continue;
396                }
397
398                if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
399                    let cleaned = strip_trailing_page_number(trimmed);
400                    // Check if this heading contains a merged subsection
401                    if let Some(split_pos) = find_merged_subsection_split(cleaned) {
402                        let first = cleaned[..split_pos].trim();
403                        let second = cleaned[split_pos..].trim();
404                        output.push_str(&format!("# {}\n\n", first));
405                        output.push_str(&format!("# {}\n\n", second));
406                    } else {
407                        output.push_str(&format!("# {}\n\n", cleaned));
408                    }
409                    i += 1;
410                    continue;
411                }
412
413                if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
414                {
415                    output.push_str(&escape_md_line_start(trimmed));
416                    output.push('\n');
417                    i += 1;
418                    continue;
419                }
420
421                if is_short_caption_label(trimmed) {
422                    if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
423                        if let Some((caption_tail, body)) =
424                            split_following_caption_tail_and_body(&next_text)
425                        {
426                            let mut caption = trimmed.to_string();
427                            caption.push('\n');
428                            caption.push_str(caption_tail);
429                            output.push_str(&escape_md_line_start(caption.trim()));
430                            output.push_str("\n\n");
431                            output.push_str(&escape_md_line_start(body));
432                            output.push_str("\n\n");
433                            i += 2;
434                            continue;
435                        }
436
437                        if looks_like_caption_tail(&next_text) {
438                            let mut caption = trimmed.to_string();
439                            caption.push('\n');
440                            caption.push_str(next_text.trim());
441
442                            if let Some(year_text) =
443                                next_mergeable_paragraph_text(doc.kids.get(i + 2))
444                            {
445                                if looks_like_caption_year(&year_text) {
446                                    caption.push('\n');
447                                    caption.push_str(year_text.trim());
448                                    i += 1;
449                                }
450                            }
451
452                            output.push_str(&escape_md_line_start(caption.trim()));
453                            output.push_str("\n\n");
454                            i += 2;
455                            continue;
456                        }
457                    }
458                }
459
460                if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
461                    output.push_str(&escape_md_line_start(caption));
462                    output.push_str("\n\n");
463                    output.push_str(&escape_md_line_start(body));
464                    output.push_str("\n\n");
465                    i += 1;
466                    continue;
467                }
468
469                let mut merged = trimmed.to_string();
470                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
471                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
472                        should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
473                    } else {
474                        should_merge_paragraph_text(&merged, &next_text)
475                    };
476                    if !can_merge {
477                        break;
478                    }
479                    merge_paragraph_text(&mut merged, &next_text);
480                    i += 1;
481                }
482
483                output.push_str(&escape_md_line_start(merged.trim()));
484                output.push_str("\n\n");
485            }
486            other => render_element(&mut output, other),
487        }
488        i += 1;
489    }
490
491    // Post-processing: merge adjacent pipe tables that share the same
492    // column count.  The table detector sometimes emits highlighted or
493    // coloured rows as separate tables.
494    let output = merge_adjacent_pipe_tables(&output);
495    let output = normalize_chart_like_markdown(&output);
496    drop_isolated_noise_lines(&output)
497}
498
499fn cmp_banded_reading_order(
500    left: &BoundingBox,
501    right: &BoundingBox,
502    band_height: f64,
503) -> std::cmp::Ordering {
504    let safe_band = band_height.max(1.0);
505    let left_band = (left.top_y / safe_band).round() as i64;
506    let right_band = (right.top_y / safe_band).round() as i64;
507    right_band
508        .cmp(&left_band)
509        .then_with(|| {
510            left.left_x
511                .partial_cmp(&right.left_x)
512                .unwrap_or(std::cmp::Ordering::Equal)
513        })
514        .then_with(|| {
515            right
516                .top_y
517                .partial_cmp(&left.top_y)
518                .unwrap_or(std::cmp::Ordering::Equal)
519        })
520        .then_with(|| {
521            right
522                .bottom_y
523                .partial_cmp(&left.bottom_y)
524                .unwrap_or(std::cmp::Ordering::Equal)
525        })
526        .then_with(|| {
527            left.right_x
528                .partial_cmp(&right.right_x)
529                .unwrap_or(std::cmp::Ordering::Equal)
530        })
531}
532
533fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
534    first_heading_like_text(doc)
535        .filter(|first| !equivalent_heading_text(first, title))
536        .is_some()
537}
538
539fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
540    if title.split_whitespace().count() > 6 {
541        return false;
542    }
543
544    let mut early = doc.kids.iter().take(6);
545    let has_explicit_heading = early.clone().any(|element| {
546        matches!(
547            element,
548            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
549        )
550    });
551    let has_tableish_content = early.any(|element| {
552        matches!(
553            element,
554            ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
555        )
556    });
557
558    has_tableish_content && !has_explicit_heading
559}
560
561fn render_top_table_plate_document(doc: &PdfDocument) -> Option<String> {
562    if doc.number_of_pages != 1 {
563        return None;
564    }
565
566    let (table_idx, table) =
567        doc.kids.iter().enumerate().find_map(|(idx, element)| {
568            table_border_from_element(element).map(|table| (idx, table))
569        })?;
570    if table.num_columns < 5 || table.rows.len() < 4 {
571        return None;
572    }
573
574    let mut header_probe = collect_table_border_rows(table);
575    if header_probe.len() < 3 || !preserve_grouped_header_rows(&mut header_probe) {
576        return None;
577    }
578
579    let table_top = table.bbox.top_y;
580    let table_bottom = table.bbox.bottom_y;
581    let table_height = table.bbox.height().max(1.0);
582    let page_top = doc
583        .kids
584        .iter()
585        .map(|element| element.bbox().top_y)
586        .fold(f64::NEG_INFINITY, f64::max);
587    if !page_top.is_finite() || page_top - table_top > table_height * 3.0 {
588        return None;
589    }
590
591    let caption_gap_limit = (table_height * 2.2).clamp(48.0, 132.0);
592    let mut caption_indices = Vec::new();
593    for idx in table_idx + 1..doc.kids.len() {
594        let element = &doc.kids[idx];
595        if !is_geometric_text_candidate(element) {
596            if table_bottom - element.bbox().top_y > caption_gap_limit {
597                break;
598            }
599            continue;
600        }
601
602        let text = extract_element_text(element);
603        if text.trim().is_empty() || looks_like_margin_page_number(doc, element, &text) {
604            continue;
605        }
606
607        let gap = table_bottom - element.bbox().top_y;
608        if gap < -6.0 {
609            break;
610        }
611        if gap > caption_gap_limit {
612            break;
613        }
614        caption_indices.push(idx);
615    }
616    if caption_indices.is_empty() {
617        return None;
618    }
619
620    let has_body_below = doc
621        .kids
622        .iter()
623        .enumerate()
624        .skip(caption_indices.last().copied()? + 1)
625        .any(|(_, element)| {
626            is_geometric_text_candidate(element)
627                && !extract_element_text(element).trim().is_empty()
628                && table_bottom - element.bbox().top_y > caption_gap_limit
629        });
630    if !has_body_below {
631        return None;
632    }
633
634    let mut output = String::new();
635    render_table_border(&mut output, table);
636
637    let mut caption = String::new();
638    for idx in &caption_indices {
639        let text = extract_element_text(&doc.kids[*idx]);
640        if text.trim().is_empty() {
641            continue;
642        }
643        merge_paragraph_text(&mut caption, &text);
644    }
645    let trimmed = caption.trim();
646    if trimmed.is_empty() {
647        return None;
648    }
649    output.push_str(&escape_md_line_start(trimmed));
650    output.push_str("\n\n");
651    Some(output)
652}
653
654fn render_single_table_report_document(doc: &PdfDocument) -> Option<String> {
655    if doc.number_of_pages != 1 || !(2..=4).contains(&doc.kids.len()) {
656        return None;
657    }
658
659    let title = &doc.kids[0];
660    if !is_geometric_text_candidate(title) {
661        return None;
662    }
663    let title_text = extract_element_text(title);
664    if title_text.trim().is_empty() || title_text.split_whitespace().count() < 4 {
665        return None;
666    }
667
668    let table = table_border_from_element(&doc.kids[1])?;
669    if table.num_columns < 4 || table.rows.len() < 4 {
670        return None;
671    }
672
673    let page_top = doc
674        .kids
675        .iter()
676        .map(|element| element.bbox().top_y)
677        .fold(f64::NEG_INFINITY, f64::max);
678    if !page_top.is_finite() {
679        return None;
680    }
681
682    let title_bbox = title.bbox();
683    let table_bbox = &table.bbox;
684    if page_top - title_bbox.top_y > 24.0 {
685        return None;
686    }
687
688    let vertical_gap = title_bbox.bottom_y - table_bbox.top_y;
689    if !(8.0..=40.0).contains(&vertical_gap) {
690        return None;
691    }
692
693    if (title_bbox.center_x() - table_bbox.center_x()).abs() > table_bbox.width() * 0.12 {
694        return None;
695    }
696
697    if doc.kids.iter().skip(2).any(|element| {
698        let text = extract_element_text(element);
699        let trimmed = text.trim();
700        !trimmed.is_empty()
701            && !looks_like_footer_banner(trimmed)
702            && !looks_like_margin_page_number(doc, element, trimmed)
703    }) {
704        return None;
705    }
706
707    let mut rows = collect_table_border_rows(table);
708    if rows.is_empty() {
709        return None;
710    }
711    merge_continuation_rows(&mut rows);
712    trim_leading_table_carryover_rows(&mut rows);
713    if rows.len() < 2 {
714        return None;
715    }
716
717    let mut output = String::new();
718    output.push_str("# ");
719    output.push_str(title_text.trim());
720    output.push_str("\n\n");
721    output.push_str(&render_pipe_rows(&rows));
722    Some(output)
723}
724
725fn render_late_section_boundary_document(doc: &PdfDocument) -> Option<String> {
726    if doc.number_of_pages != 1 || doc.kids.len() < 8 {
727        return None;
728    }
729
730    let page_top = doc
731        .kids
732        .iter()
733        .map(|element| element.bbox().top_y)
734        .fold(f64::NEG_INFINITY, f64::max);
735    if !page_top.is_finite() {
736        return None;
737    }
738
739    let heading_idx = doc.kids.iter().position(|element| {
740        matches!(
741            element,
742            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
743        )
744    })?;
745    if heading_idx < 5 {
746        return None;
747    }
748
749    let heading = &doc.kids[heading_idx];
750    let heading_text = extract_element_text(heading);
751    if heading_text.trim().is_empty() {
752        return None;
753    }
754
755    let heading_top = heading.bbox().top_y;
756    if page_top - heading_top < 240.0 {
757        return None;
758    }
759
760    let leading_text_indices = (0..heading_idx)
761        .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
762        .collect::<Vec<_>>();
763    if leading_text_indices.len() < 5 {
764        return None;
765    }
766
767    let colon_ended = leading_text_indices
768        .iter()
769        .filter(|idx| {
770            extract_element_text(&doc.kids[**idx])
771                .trim_end()
772                .ends_with(':')
773        })
774        .count();
775    if colon_ended * 2 < leading_text_indices.len() {
776        return None;
777    }
778
779    let trailing_indices = (heading_idx + 1..doc.kids.len())
780        .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
781        .filter(|idx| {
782            let text = extract_element_text(&doc.kids[*idx]);
783            !text.trim().is_empty() && !looks_like_margin_page_number(doc, &doc.kids[*idx], &text)
784        })
785        .collect::<Vec<_>>();
786    if trailing_indices.is_empty() || trailing_indices.len() > 5 {
787        return None;
788    }
789
790    let mut footer_count = 0usize;
791    let content_indices = trailing_indices
792        .into_iter()
793        .filter(|idx| {
794            let text = extract_element_text(&doc.kids[*idx]);
795            let is_footerish =
796                doc.kids[*idx].bbox().top_y < 96.0 && text.split_whitespace().count() >= 4;
797            footer_count += usize::from(is_footerish);
798            !is_footerish
799        })
800        .collect::<Vec<_>>();
801    if content_indices.is_empty() || footer_count == 0 {
802        return None;
803    }
804
805    let mut fragments = content_indices
806        .iter()
807        .map(|idx| (*idx, &doc.kids[*idx]))
808        .collect::<Vec<_>>();
809    fragments.sort_by(|left, right| cmp_banded_reading_order(left.1.bbox(), right.1.bbox(), 6.0));
810
811    let mut paragraph = String::new();
812    for (_, element) in fragments {
813        let text = extract_element_text(element);
814        if text.trim().is_empty() {
815            continue;
816        }
817        merge_paragraph_text(&mut paragraph, &text);
818    }
819    let trimmed_paragraph = paragraph.trim();
820    if trimmed_paragraph.is_empty() {
821        return None;
822    }
823
824    let mut output = String::new();
825    output.push_str("# ");
826    output.push_str(heading_text.trim());
827    output.push_str("\n\n");
828    output.push_str(&escape_md_line_start(trimmed_paragraph));
829    output.push_str("\n\n");
830    Some(output)
831}
832
/// Header-row candidate used by the layout-text table detection (non-wasm builds only).
///
/// NOTE(review): nothing in this part of the file builds or reads this type; the field
/// notes are inferred from the names and should be confirmed at the use sites.
833#[cfg(not(target_arch = "wasm32"))]
834#[derive(Clone)]
835struct LayoutHeaderCandidate {
    // Presumably the index of the layout line that holds the header — confirm.
836    line_idx: usize,
    // One entry per header cell.
837    headers: Vec<String>,
    // Presumably the start offset of each header cell within the line — confirm.
838    starts: Vec<usize>,
839}
840
/// A layout line split into cells (non-wasm builds only).
///
/// NOTE(review): not used in this part of the file; `line_idx` presumably refers to the
/// position of the line in the layout text — confirm against the code that builds it.
841#[cfg(not(target_arch = "wasm32"))]
842#[derive(Clone)]
843struct LayoutEntry {
844    line_idx: usize,
845    cells: Vec<String>,
846}
847
/// A row of cells tied to one or more "anchor" lines (non-wasm builds only).
///
/// NOTE(review): not used in this part of the file. `anchor_idx` / `last_anchor_idx`
/// presumably delimit the first and last layout line merged into the row — confirm.
848#[cfg(not(target_arch = "wasm32"))]
849#[derive(Clone)]
850struct LayoutAnchorRow {
851    anchor_idx: usize,
852    last_anchor_idx: usize,
853    cells: Vec<String>,
854}
855
/// Header-row candidate for a panel-style layout (non-wasm builds only).
///
/// Has the same fields as `LayoutHeaderCandidate`.
/// NOTE(review): not used in this part of the file; `starts` presumably holds the start
/// offset of each header cell — confirm at the use sites.
856#[cfg(not(target_arch = "wasm32"))]
857#[derive(Clone)]
858struct LayoutPanelHeaderCandidate {
859    line_idx: usize,
860    headers: Vec<String>,
861    starts: Vec<usize>,
862}
863
/// One table-of-contents style entry: a title paired with a page label (non-wasm builds only).
///
/// NOTE(review): not used in this part of the file; `title_start` is presumably the
/// offset at which the title begins in its layout line — confirm.
864#[cfg(not(target_arch = "wasm32"))]
865#[derive(Clone)]
866struct LayoutTocEntry {
867    title: String,
    // Kept as text, not parsed into a number.
868    page: String,
869    title_start: usize,
870}
871
/// A single word with its bounding box; the element type of `BBoxLayoutLine::words`
/// (non-wasm builds only).
872#[cfg(not(target_arch = "wasm32"))]
873#[derive(Clone)]
874struct BBoxLayoutWord {
875    bbox: BoundingBox,
876    text: String,
877}
878
/// One text line of the bbox layout cached in `CachedBBoxLayout::lines`
/// (non-wasm builds only).
///
/// The footnote detection in this file reads `bbox.top_y` to measure the vertical
/// distance between consecutive lines.
879#[cfg(not(target_arch = "wasm32"))]
880#[derive(Clone)]
881struct BBoxLayoutLine {
    // Presumably the id of the `BBoxLayoutBlock` this line belongs to — confirm.
882    block_id: usize,
883    bbox: BoundingBox,
884    words: Vec<BBoxLayoutWord>,
885}
886
/// A piece of text together with its bounding box (non-wasm builds only).
///
/// NOTE(review): not used in this part of the file; how fragments are produced and
/// consumed must be checked at the use sites.
887#[cfg(not(target_arch = "wasm32"))]
888#[derive(Clone)]
889struct LayoutTextFragment {
890    bbox: BoundingBox,
891    text: String,
892}
893
/// Candidate for a heading + table + caption group (non-wasm builds only).
///
/// NOTE(review): not used in this part of the file. `cutoff_top_y` is presumably a
/// vertical boundary used to separate this group from the surrounding content — confirm.
894#[cfg(not(target_arch = "wasm32"))]
895#[derive(Clone)]
896struct OpenPlateCandidate {
897    heading: String,
898    header_row: Vec<String>,
    // Table body; each inner vector is one row of cells.
899    rows: Vec<Vec<String>>,
900    caption: String,
901    cutoff_top_y: f64,
902}
903
/// Optional bridge paragraph, a list of deferred captions and an optional vertical
/// position for the start of the body (non-wasm builds only).
///
/// NOTE(review): not used in this part of the file; the exact role of each field has to
/// be confirmed where the value is built.
904#[cfg(not(target_arch = "wasm32"))]
905struct LayoutNarrativeBridge {
906    bridge_paragraph: Option<String>,
907    deferred_captions: Vec<String>,
908    body_start_top_y: Option<f64>,
909}
910
/// A block of bbox-layout lines, as cached in `CachedBBoxLayout::blocks`
/// (non-wasm builds only).
///
/// The caption detection in this file uses `block_id` to tell blocks apart and `bbox`
/// for the geometric label/title matching.
911#[cfg(not(target_arch = "wasm32"))]
912#[derive(Clone)]
913struct BBoxLayoutBlock {
914    block_id: usize,
915    bbox: BoundingBox,
916    lines: Vec<BBoxLayoutLine>,
917}
918
/// Extracted content of a dashboard-style page: a title, a left table, a right table and
/// two lists of notes (non-wasm builds only).
///
/// NOTE(review): not used in this part of the file; the description is derived from the
/// field names only and should be confirmed against the detector and renderer.
919#[cfg(not(target_arch = "wasm32"))]
920struct LayoutOcrDashboard {
921    eyebrow: Option<String>,
922    title: String,
923    left_heading: String,
924    left_columns: Vec<String>,
925    left_rows: Vec<Vec<String>>,
926    right_heading: String,
927    right_rows: Vec<Vec<String>>,
928    definition_notes: Vec<String>,
929    source_notes: Vec<String>,
930}
931
/// One panel of a recommendation infographic (non-wasm builds only).
///
/// The infographic renderer in this file reads panels with exactly these fields: the
/// heading becomes a `## ` heading, the subtitle a paragraph, `header` plus `rows` a pipe
/// table, and non-empty `notes` a bullet list under `*Note:*`.
932#[cfg(not(target_arch = "wasm32"))]
933struct LayoutRecommendationPanel {
934    heading: String,
935    subtitle: String,
    // First table row.
936    header: Vec<String>,
    // Remaining table rows.
937    rows: Vec<Vec<String>>,
938    notes: Vec<String>,
939}
940
/// A recommendation infographic: optional eyebrow line, title and a list of panels
/// (non-wasm builds only).
///
/// The infographic renderer in this file emits the eyebrow (when present) as a `# `
/// heading, the title as a paragraph, and then every panel in order.
941#[cfg(not(target_arch = "wasm32"))]
942struct LayoutRecommendationInfographic {
943    eyebrow: Option<String>,
944    title: String,
945    panels: Vec<LayoutRecommendationPanel>,
946}
947
/// A token with a bounding box, an integer value and its text (non-wasm builds only).
///
/// NOTE(review): not used in this part of the file; `value` is presumably the number
/// parsed from `text` — confirm where tokens are created.
948#[cfg(not(target_arch = "wasm32"))]
949#[derive(Clone)]
950struct LayoutBarToken {
951    bbox: BoundingBox,
952    value: i64,
953    text: String,
954}
955
/// Tabular data recovered from a stacked-bar figure (non-wasm builds only).
///
/// NOTE(review): the stacked-bar report renderer in this file reads `caption`, `months`
/// and `rows` from its first figure, which is presumably a value of this type — confirm
/// against the detector's return type.
956#[cfg(not(target_arch = "wasm32"))]
957#[allow(dead_code)]
958struct LayoutStackedBarFigure {
959    caption: String,
960    months: Vec<String>,
961    row_labels: Vec<String>,
962    rows: Vec<Vec<String>>,
963}
964
/// Tabular data recovered from a per-sector stacked-bar figure (non-wasm builds only).
///
/// NOTE(review): the stacked-bar report renderer in this file reads `caption`, `months`
/// and `rows` from its second figure (rendered under a `Sector` column header), which is
/// presumably a value of this type — confirm against the detector's return type.
965#[cfg(not(target_arch = "wasm32"))]
966#[allow(dead_code)]
967struct LayoutStackedBarSectorFigure {
968    caption: String,
969    months: Vec<String>,
970    sectors: Vec<String>,
971    rows: Vec<Vec<String>>,
972}
973
/// Narrative text that accompanies the stacked-bar figures (non-wasm builds only).
///
/// NOTE(review): the stacked-bar report renderer in this file reads `heading`,
/// `paragraphs`, `footnote` and `top_y` from its narrative value, which is presumably of
/// this type; there `top_y` is handed to the second figure's detector.
974#[cfg(not(target_arch = "wasm32"))]
975struct LayoutStackedBarNarrative {
976    heading: String,
977    paragraphs: Vec<String>,
978    footnote: Option<String>,
979    top_y: f64,
980}
981
/// A captioned figure reduced to a label/value series (non-wasm builds only).
///
/// NOTE(review): the multi-figure chart renderer in this file reads figures with exactly
/// these fields (presumably this type): it prints a two-column table only when there are
/// at least four labels and as many values as labels, and prints `source` in italics.
982#[cfg(not(target_arch = "wasm32"))]
983struct LayoutSeriesFigure {
984    caption: String,
985    labels: Vec<String>,
986    values: Vec<String>,
987    source: Option<String>,
988}
989
/// A caption label paired with its title, as produced by `detect_layout_caption_sections`
/// (non-wasm builds only).
990#[cfg(not(target_arch = "wasm32"))]
991struct LayoutCaptionSection {
    // Normalized text of the short label block; the renderer checks it for the
    // "Figure " and "Diagram " prefixes.
992    label: String,
    // Title text with any trailing footnote marker already split off.
993    title: String,
    // Digits of the trailing footnote marker, when the title carried one.
994    footnote_number: Option<String>,
    // Larger of the label block's and title block's `top_y`; used to order sections.
995    top_y: f64,
996}
997
/// Item of the merged output stream of the captioned-media renderer: caption sections and
/// prose paragraphs are wrapped in this enum so they can be sorted together by `top_y`
/// (non-wasm builds only).
998#[cfg(not(target_arch = "wasm32"))]
999enum LayoutCaptionedMediaEvent {
    // Rendered through `render_layout_caption_section`.
1000    Caption(LayoutCaptionSection),
    // Rendered as an escaped paragraph followed by a blank line.
1001    Paragraph(String),
1002}
1003
/// Everything `build_layout_captioned_media_profile` gathers about a captioned-media page
/// (non-wasm builds only).
1004#[cfg(not(target_arch = "wasm32"))]
1005struct LayoutCaptionedMediaProfile {
    // Caption sections detected from the bbox layout blocks.
1006    sections: Vec<LayoutCaptionSection>,
    // `(top_y, text)` of the prose paragraphs, sorted by descending `top_y`; at most two.
1007    prose: Vec<(f64, String)>,
    // Footnote text detected from the bbox layout lines, if any.
1008    footnote: Option<String>,
    // Number of `Image`, `Figure` and `Picture` elements in the document.
1009    image_count: usize,
1010}
1011
1012#[cfg(not(target_arch = "wasm32"))]
1013#[allow(dead_code)]
1014fn render_layout_captioned_media_document(doc: &PdfDocument) -> Option<String> {
1015    let mut layout_cache = LayoutSourceCache::default();
1016    render_layout_captioned_media_document_cached(doc, &mut layout_cache)
1017}
1018
1019#[cfg(not(target_arch = "wasm32"))]
1020fn render_layout_captioned_media_document_cached(
1021    doc: &PdfDocument,
1022    layout_cache: &mut LayoutSourceCache,
1023) -> Option<String> {
1024    if doc.number_of_pages != 1 {
1025        return None;
1026    }
1027    let paragraph_count = doc
1028        .kids
1029        .iter()
1030        .filter(|element| matches!(element, ContentElement::Paragraph(_)))
1031        .count();
1032    let image_count = doc
1033        .kids
1034        .iter()
1035        .filter(|element| {
1036            matches!(
1037                element,
1038                ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1039            )
1040        })
1041        .count();
1042    if paragraph_count == 0 || image_count == 0 {
1043        return None;
1044    }
1045    let has_explicit_structure = doc.kids.iter().any(|element| {
1046        matches!(
1047            element,
1048            ContentElement::Caption(_)
1049                | ContentElement::Heading(_)
1050                | ContentElement::NumberHeading(_)
1051                | ContentElement::Table(_)
1052                | ContentElement::List(_)
1053        )
1054    });
1055    if has_explicit_structure {
1056        return None;
1057    }
1058
1059    let profile = build_layout_captioned_media_profile(doc, layout_cache)?;
1060    if profile.sections.is_empty() || (profile.sections.len() == 1 && profile.footnote.is_none()) {
1061        return None;
1062    }
1063    let has_non_figure_label = profile
1064        .sections
1065        .iter()
1066        .any(|section| !section.label.starts_with("Figure "));
1067    let has_anchored_footnote = profile.footnote.is_some()
1068        || profile
1069            .sections
1070            .iter()
1071            .any(|section| section.footnote_number.is_some());
1072    if !has_non_figure_label && !has_anchored_footnote {
1073        return None;
1074    }
1075
1076    if let Some(rendered) = render_layout_captioned_media_explainer(&profile) {
1077        return Some(rendered);
1078    }
1079
1080    let mut events = profile
1081        .sections
1082        .into_iter()
1083        .map(|section| (section.top_y, LayoutCaptionedMediaEvent::Caption(section)))
1084        .collect::<Vec<_>>();
1085    for (top_y, paragraph) in profile.prose {
1086        events.push((top_y, LayoutCaptionedMediaEvent::Paragraph(paragraph)));
1087    }
1088    events.sort_by(|left, right| {
1089        right
1090            .0
1091            .partial_cmp(&left.0)
1092            .unwrap_or(std::cmp::Ordering::Equal)
1093    });
1094
1095    let mut output = String::new();
1096    for (_, event) in events {
1097        match event {
1098            LayoutCaptionedMediaEvent::Caption(section) => {
1099                output.push_str(&render_layout_caption_section(&section));
1100            }
1101            LayoutCaptionedMediaEvent::Paragraph(paragraph) => {
1102                output.push_str(&escape_md_line_start(paragraph.trim()));
1103                output.push_str("\n\n");
1104            }
1105        }
1106    }
1107
1108    if let Some(footnote_text) = profile.footnote {
1109        output.push_str("---\n\n");
1110        output.push_str("**Footnote:**\n");
1111        output.push_str(&escape_md_line_start(footnote_text.trim()));
1112        output.push('\n');
1113    }
1114
1115    Some(output.trim_end().to_string() + "\n")
1116}
1117
1118#[cfg(not(target_arch = "wasm32"))]
1119fn build_layout_captioned_media_profile(
1120    doc: &PdfDocument,
1121    layout_cache: &mut LayoutSourceCache,
1122) -> Option<LayoutCaptionedMediaProfile> {
1123    let layout = layout_cache.bbox_layout(doc)?;
1124    let sections = detect_layout_caption_sections(&layout.blocks);
1125    let footnote = detect_layout_bottom_footnote(&layout.lines);
1126
1127    let mut prose = doc
1128        .kids
1129        .iter()
1130        .filter_map(|element| match element {
1131            ContentElement::Paragraph(_)
1132            | ContentElement::TextBlock(_)
1133            | ContentElement::TextLine(_) => {
1134                let text = clean_paragraph_text(&extract_element_text(element));
1135                let trimmed = text.trim();
1136                (!trimmed.is_empty()
1137                    && trimmed.split_whitespace().count() >= 8
1138                    && !starts_with_caption_prefix(trimmed)
1139                    && !trimmed
1140                        .chars()
1141                        .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1142                    && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1143                    && !looks_like_footer_banner(trimmed))
1144                .then_some((element.bbox().top_y, trimmed.to_string()))
1145            }
1146            _ => None,
1147        })
1148        .filter(|(top_y, paragraph)| {
1149            !sections.iter().any(|section| {
1150                (*top_y - section.top_y).abs() <= 36.0
1151                    || section.title.contains(paragraph)
1152                    || paragraph.contains(&section.title)
1153            })
1154        })
1155        .collect::<Vec<_>>();
1156    prose.sort_by(|left, right| {
1157        right
1158            .0
1159            .partial_cmp(&left.0)
1160            .unwrap_or(std::cmp::Ordering::Equal)
1161    });
1162    if prose.len() > 2 {
1163        return None;
1164    }
1165
1166    let image_count = doc
1167        .kids
1168        .iter()
1169        .filter(|element| {
1170            matches!(
1171                element,
1172                ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1173            )
1174        })
1175        .count();
1176
1177    Some(LayoutCaptionedMediaProfile {
1178        sections,
1179        prose,
1180        footnote,
1181        image_count,
1182    })
1183}
1184
1185#[cfg(not(target_arch = "wasm32"))]
1186fn render_layout_captioned_media_explainer(
1187    profile: &LayoutCaptionedMediaProfile,
1188) -> Option<String> {
1189    if profile.sections.len() != 1
1190        || profile.prose.len() != 2
1191        || profile.image_count != 1
1192        || profile.footnote.is_none()
1193        || !profile
1194            .sections
1195            .iter()
1196            .all(|section| section.label.starts_with("Figure "))
1197    {
1198        return None;
1199    }
1200
1201    let mut output = String::new();
1202    output.push_str("# ");
1203    output.push_str(profile.prose[0].1.trim());
1204    output.push('\n');
1205    output.push_str(&escape_md_line_start(profile.prose[1].1.trim()));
1206    output.push_str("\n\n");
1207    output.push_str("*Image*\n\n");
1208    output.push_str(&render_layout_caption_section(&profile.sections[0]));
1209    output.push_str("---\n\n");
1210    output.push_str("**Footnote:**\n");
1211    output.push_str(&escape_md_line_start(
1212        profile.footnote.as_deref().unwrap_or_default().trim(),
1213    ));
1214    output.push('\n');
1215    Some(output)
1216}
1217
1218#[cfg(not(target_arch = "wasm32"))]
1219fn detect_layout_caption_sections(blocks: &[BBoxLayoutBlock]) -> Vec<LayoutCaptionSection> {
1220    let normalized_blocks = blocks
1221        .iter()
1222        .map(|block| {
1223            (
1224                block,
1225                normalize_common_ocr_text(&bbox_layout_block_text(block)),
1226            )
1227        })
1228        .collect::<Vec<_>>();
1229
1230    let mut used_titles = HashSet::new();
1231    let mut sections = Vec::new();
1232    for (block, label_text) in &normalized_blocks {
1233        if !is_short_caption_label(label_text) {
1234            continue;
1235        }
1236
1237        let label_bbox = &block.bbox;
1238        let title_candidate = normalized_blocks
1239            .iter()
1240            .filter(|(candidate, text)| {
1241                candidate.block_id != block.block_id
1242                    && !used_titles.contains(&candidate.block_id)
1243                    && !text.is_empty()
1244                    && !is_short_caption_label(text)
1245                    && !starts_with_caption_prefix(text)
1246                    && !looks_like_footer_banner(text)
1247                    && !is_page_number_like(text)
1248                    && text.split_whitespace().count() >= 2
1249                    && candidate.bbox.width() >= 60.0
1250            })
1251            .filter_map(|(candidate, text)| {
1252                let vertical_gap = (candidate.bbox.center_y() - label_bbox.center_y()).abs();
1253                let horizontal_gap = if candidate.bbox.left_x > label_bbox.right_x {
1254                    candidate.bbox.left_x - label_bbox.right_x
1255                } else if label_bbox.left_x > candidate.bbox.right_x {
1256                    label_bbox.left_x - candidate.bbox.right_x
1257                } else {
1258                    0.0
1259                };
1260                (vertical_gap <= 28.0 && horizontal_gap <= 180.0).then_some((
1261                    vertical_gap + horizontal_gap * 0.15,
1262                    *candidate,
1263                    text.clone(),
1264                ))
1265            })
1266            .min_by(|left, right| {
1267                left.0
1268                    .partial_cmp(&right.0)
1269                    .unwrap_or(std::cmp::Ordering::Equal)
1270            });
1271
1272        let Some((_, title_block, title_text)) = title_candidate else {
1273            continue;
1274        };
1275        used_titles.insert(title_block.block_id);
1276        let (title, footnote_number) = split_trailing_caption_footnote_marker(&title_text);
1277        sections.push(LayoutCaptionSection {
1278            label: label_text.to_string(),
1279            title,
1280            footnote_number,
1281            top_y: label_bbox.top_y.max(title_block.bbox.top_y),
1282        });
1283    }
1284
1285    sections.sort_by(|left, right| {
1286        right
1287            .top_y
1288            .partial_cmp(&left.top_y)
1289            .unwrap_or(std::cmp::Ordering::Equal)
1290    });
1291    sections
1292}
1293
1294#[cfg(not(target_arch = "wasm32"))]
1295fn split_trailing_caption_footnote_marker(text: &str) -> (String, Option<String>) {
1296    let trimmed = text.trim();
1297    let re = Regex::new(r"^(?P<title>.*?[.!?])\s*(?P<num>\d{1,2})\s*[A-Za-z]{0,12}$").ok();
1298    if let Some(captures) = re.as_ref().and_then(|re| re.captures(trimmed)) {
1299        return (
1300            captures["title"].trim().to_string(),
1301            Some(captures["num"].to_string()),
1302        );
1303    }
1304
1305    (trimmed.to_string(), None)
1306}
1307
1308#[cfg(not(target_arch = "wasm32"))]
1309fn detect_layout_bottom_footnote(lines: &[BBoxLayoutLine]) -> Option<String> {
1310    let normalized_lines = lines
1311        .iter()
1312        .map(|line| {
1313            (
1314                line.bbox.top_y,
1315                normalize_common_ocr_text(&bbox_layout_line_text(line)),
1316            )
1317        })
1318        .filter(|(_, text)| !text.is_empty() && !is_page_number_like(text))
1319        .collect::<Vec<_>>();
1320    let start_idx = normalized_lines.iter().rposition(|(_, text)| {
1321        text.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1322            && text.split_whitespace().count() >= 6
1323    })?;
1324
1325    let mut collected = vec![normalized_lines[start_idx].1.clone()];
1326    let mut last_top_y = normalized_lines[start_idx].0;
1327    for (top_y, text) in normalized_lines.iter().skip(start_idx + 1) {
1328        if is_page_number_like(text) {
1329            break;
1330        }
1331        if (last_top_y - *top_y).abs() > 28.0 {
1332            break;
1333        }
1334        collected.push(text.clone());
1335        last_top_y = *top_y;
1336    }
1337
1338    if collected.is_empty() {
1339        return None;
1340    }
1341    let merged = collected.join(" ");
1342    Some(normalize_layout_footnote_text(&merged))
1343}
1344
1345#[cfg(not(target_arch = "wasm32"))]
1346fn normalize_layout_footnote_text(text: &str) -> String {
1347    let mut normalized = text.replace(",https://", ", https://");
1348    let url_gap_re = Regex::new(r"(https?://\S+)\s+(\S+)").ok();
1349    while let Some(re) = &url_gap_re {
1350        let next = re.replace(&normalized, "$1$2").to_string();
1351        if next == normalized {
1352            break;
1353        }
1354        normalized = next;
1355    }
1356    normalized
1357}
1358
1359#[cfg(not(target_arch = "wasm32"))]
1360fn render_layout_caption_section(section: &LayoutCaptionSection) -> String {
1361    let mut output = String::new();
1362    if section.label.starts_with("Diagram ") {
1363        output.push_str("## ");
1364        output.push_str(section.label.trim());
1365        output.push('\n');
1366        if !section.title.trim().is_empty() {
1367            let title = normalize_layout_caption_title_text(section.title.trim());
1368            output.push_str("**");
1369            output.push_str(&title);
1370            output.push_str("**\n\n");
1371        } else {
1372            output.push('\n');
1373        }
1374        return output;
1375    }
1376
1377    if section.label.starts_with("Figure ") && section.footnote_number.is_none() {
1378        output.push('*');
1379        output.push_str(section.label.trim());
1380        output.push_str("*\n\n");
1381    }
1382
1383    output.push_str("**");
1384    output.push_str(section.label.trim());
1385    output.push_str("**\n");
1386
1387    if !section.title.trim().is_empty() {
1388        let title_lines = split_layout_caption_title_lines(section.title.trim());
1389        let last_idx = title_lines.len().saturating_sub(1);
1390        for (idx, line) in title_lines.iter().enumerate() {
1391            if section.footnote_number.is_some() {
1392                output.push_str("**");
1393                output.push_str(line.trim());
1394                if idx == last_idx {
1395                    output.push_str("**^");
1396                    output.push_str(section.footnote_number.as_deref().unwrap_or_default());
1397                } else {
1398                    output.push_str("**");
1399                }
1400            } else {
1401                output.push('*');
1402                output.push_str(line.trim());
1403                output.push('*');
1404            }
1405            output.push('\n');
1406        }
1407    }
1408    output.push('\n');
1409    output
1410}
1411
1412#[cfg(not(target_arch = "wasm32"))]
1413fn split_layout_caption_title_lines(title: &str) -> Vec<String> {
1414    let title = normalize_layout_caption_title_text(title);
1415    if let Some(idx) = title.find(" Content:") {
1416        let head = title[..idx].trim();
1417        let tail = title[idx + 1..].trim();
1418        if !head.is_empty() && head.split_whitespace().count() <= 3 && !tail.is_empty() {
1419            return vec![head.to_string(), tail.to_string()];
1420        }
1421    }
1422    vec![title.to_string()]
1423}
1424
1425#[cfg(not(target_arch = "wasm32"))]
1426fn normalize_layout_caption_title_text(title: &str) -> String {
1427    Regex::new(r"(\d{4})-\s+(\d{4})")
1428        .ok()
1429        .map(|re| re.replace_all(title, "$1-$2").to_string())
1430        .unwrap_or_else(|| title.to_string())
1431}
1432
1433#[cfg(not(target_arch = "wasm32"))]
1434#[allow(dead_code)]
1435fn render_layout_single_caption_chart_document(doc: &PdfDocument) -> Option<String> {
1436    let mut layout_cache = LayoutSourceCache::default();
1437    render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
1438}
1439
1440#[cfg(not(target_arch = "wasm32"))]
1441fn render_layout_single_caption_chart_document_cached(
1442    doc: &PdfDocument,
1443    _layout_cache: &mut LayoutSourceCache,
1444) -> Option<String> {
1445    if doc.number_of_pages != 1 {
1446        return None;
1447    }
1448
1449    let caption_indices = doc
1450        .kids
1451        .iter()
1452        .enumerate()
1453        .filter_map(|(idx, element)| {
1454            let text = extract_element_text(element);
1455            let trimmed = text.trim();
1456            (trimmed.starts_with("Figure ")
1457                && trimmed.contains(':')
1458                && trimmed.split_whitespace().count() >= 6)
1459                .then_some(idx)
1460        })
1461        .collect::<Vec<_>>();
1462    if caption_indices.len() != 1 {
1463        return None;
1464    }
1465    if doc.kids.len() < 12 {
1466        return None;
1467    }
1468
1469    let caption_idx = caption_indices[0];
1470    let mut output = String::new();
1471    let mut i = 0usize;
1472    let mut chart_mode = false;
1473    while i < doc.kids.len() {
1474        let element = &doc.kids[i];
1475        let text = extract_element_text(element);
1476        let trimmed = text.trim();
1477        if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
1478            i += 1;
1479            continue;
1480        }
1481
1482        if i == caption_idx {
1483            output.push_str(&escape_md_line_start(trimmed));
1484            output.push_str("\n\n");
1485            chart_mode = true;
1486            i += 1;
1487            continue;
1488        }
1489
1490        if chart_mode {
1491            if !looks_like_chart_followup_paragraph(element, trimmed)
1492                && !matches!(
1493                    element,
1494                    ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1495                )
1496            {
1497                i += 1;
1498                continue;
1499            }
1500            chart_mode = false;
1501        }
1502
1503        match element {
1504            ContentElement::Heading(h) => {
1505                let level = h.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1506                output.push_str(&"#".repeat(level));
1507                output.push(' ');
1508                output.push_str(trimmed);
1509                output.push_str("\n\n");
1510            }
1511            ContentElement::NumberHeading(nh) => {
1512                let level = nh.base.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1513                output.push_str(&"#".repeat(level));
1514                output.push(' ');
1515                output.push_str(trimmed);
1516                output.push_str("\n\n");
1517            }
1518            ContentElement::Paragraph(_) | ContentElement::TextBlock(_) => {
1519                let mut merged = trimmed.to_string();
1520                while let Some(next_element) = doc.kids.get(i + 1) {
1521                    let next_text = extract_element_text(next_element);
1522                    let next_trimmed = next_text.trim();
1523                    if next_trimmed.is_empty()
1524                        || looks_like_margin_page_number(doc, next_element, next_trimmed)
1525                    {
1526                        i += 1;
1527                        continue;
1528                    }
1529                    if i + 1 == caption_idx
1530                        || looks_like_chart_noise_element(next_element, next_trimmed)
1531                    {
1532                        break;
1533                    }
1534                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
1535                        should_merge_adjacent_semantic_paragraphs(&merged, next_trimmed)
1536                    } else {
1537                        should_merge_paragraph_text(&merged, next_trimmed)
1538                    };
1539                    if !can_merge {
1540                        break;
1541                    }
1542                    merge_paragraph_text(&mut merged, next_trimmed);
1543                    i += 1;
1544                }
1545
1546                output.push_str(&escape_md_line_start(merged.trim()));
1547                output.push_str("\n\n");
1548            }
1549            _ => {}
1550        }
1551
1552        i += 1;
1553    }
1554
1555    Some(output.trim_end().to_string() + "\n")
1556}
1557
1558fn looks_like_chart_noise_element(_element: &ContentElement, text: &str) -> bool {
1559    if text.is_empty() {
1560        return false;
1561    }
1562
1563    if is_standalone_page_number(text) || looks_like_numeric_axis_blob(text) {
1564        return true;
1565    }
1566
1567    let word_count = text.split_whitespace().count();
1568    let lower = text.to_ascii_lowercase();
1569
1570    if lower.starts_with("figure ") && text.contains(':') {
1571        return false;
1572    }
1573
1574    if lower.starts_with("source:") {
1575        return false;
1576    }
1577
1578    if word_count <= 3
1579        && (looks_like_yearish_label(text)
1580            || looks_like_layout_month_label(text)
1581            || text == "Lockdown Period")
1582    {
1583        return true;
1584    }
1585
1586    if text
1587        .chars()
1588        .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1589    {
1590        return true;
1591    }
1592
1593    let short_non_sentence = !text.contains('.') && !text.contains(':') && !text.contains(';');
1594    let has_chart_keyword = lower.contains("working as usual")
1595        || lower.contains("temporarily closed")
1596        || lower.contains("business premises")
1597        || lower.contains("operations continue");
1598
1599    word_count <= 10 || (short_non_sentence && word_count <= 14) || has_chart_keyword
1600}
1601
1602fn looks_like_chart_followup_paragraph(_element: &ContentElement, text: &str) -> bool {
1603    let word_count = text.split_whitespace().count();
1604    word_count >= 18
1605        && !text.trim_start().starts_with("Figure ")
1606        && !text.trim_start().starts_with("Table ")
1607}
1608
1609#[cfg(not(target_arch = "wasm32"))]
1610#[allow(dead_code)]
1611fn render_layout_recommendation_infographic_document(doc: &PdfDocument) -> Option<String> {
1612    let mut layout_cache = LayoutSourceCache::default();
1613    render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
1614}
1615
1616#[cfg(not(target_arch = "wasm32"))]
1617fn render_layout_recommendation_infographic_document_cached(
1618    doc: &PdfDocument,
1619    layout_cache: &mut LayoutSourceCache,
1620) -> Option<String> {
1621    if doc.number_of_pages != 1 {
1622        return None;
1623    }
1624
1625    let layout = layout_cache.bbox_layout(doc)?;
1626    let infographic = detect_layout_recommendation_infographic(layout.page_width, &layout.lines)?;
1627
1628    let mut output = String::new();
1629    if let Some(eyebrow) = infographic.eyebrow.as_deref() {
1630        output.push_str("# ");
1631        output.push_str(eyebrow.trim());
1632        output.push_str("\n\n");
1633    }
1634    output.push_str(&escape_md_line_start(infographic.title.trim()));
1635    output.push_str("\n\n");
1636
1637    for panel in &infographic.panels {
1638        output.push_str("## ");
1639        output.push_str(panel.heading.trim());
1640        output.push_str("\n\n");
1641        output.push_str(&escape_md_line_start(panel.subtitle.trim()));
1642        output.push_str("\n\n");
1643
1644        let mut rows = Vec::with_capacity(panel.rows.len() + 1);
1645        rows.push(panel.header.clone());
1646        rows.extend(panel.rows.clone());
1647        output.push_str(&render_pipe_rows(&rows));
1648
1649        if !panel.notes.is_empty() {
1650            output.push_str("*Note:*\n");
1651            for note in &panel.notes {
1652                output.push_str("- ");
1653                output.push_str(note.trim());
1654                output.push('\n');
1655            }
1656            output.push('\n');
1657        }
1658    }
1659
1660    Some(output.trim_end().to_string() + "\n")
1661}
1662
1663#[cfg(not(target_arch = "wasm32"))]
1664#[allow(dead_code)]
1665fn render_layout_stacked_bar_report_document(doc: &PdfDocument) -> Option<String> {
1666    let mut layout_cache = LayoutSourceCache::default();
1667    render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
1668}
1669
1670#[cfg(not(target_arch = "wasm32"))]
1671fn render_layout_stacked_bar_report_document_cached(
1672    doc: &PdfDocument,
1673    layout_cache: &mut LayoutSourceCache,
1674) -> Option<String> {
1675    if doc.number_of_pages != 1 {
1676        return None;
1677    }
1678
1679    let layout = layout_cache.bbox_layout(doc)?;
1680    let figure_captions = collect_layout_figure_captions(&layout.blocks);
1681    if figure_captions.len() != 2 {
1682        return None;
1683    }
1684    let narrative = detect_layout_stacked_bar_narrative(&layout.blocks)?;
1685    let figure_one = detect_layout_three_month_stacked_figure(
1686        &layout.blocks,
1687        &layout.lines,
1688        layout.page_width,
1689        figure_captions[0].clone(),
1690        figure_captions[1].bbox.top_y,
1691    )?;
1692    let figure_two = detect_layout_sector_bar_figure(
1693        &layout.blocks,
1694        &layout.lines,
1695        layout.page_width,
1696        figure_captions[1].clone(),
1697        narrative.top_y,
1698    )?;
1699
1700    let mut output = String::new();
1701    output.push_str("# ");
1702    output.push_str(figure_one.caption.trim());
1703    output.push_str("\n\n");
1704    let mut first_table = vec![{
1705        let mut row = vec![String::new()];
1706        row.extend(figure_one.months.clone());
1707        row
1708    }];
1709    first_table.extend(figure_one.rows.clone());
1710    output.push_str(&render_pipe_rows(&first_table));
1711
1712    output.push_str("# ");
1713    output.push_str(figure_two.caption.trim());
1714    output.push_str("\n\n");
1715    let mut second_table = vec![{
1716        let mut row = vec!["Sector".to_string()];
1717        row.extend(figure_two.months.clone());
1718        row
1719    }];
1720    second_table.extend(figure_two.rows.clone());
1721    output.push_str(&render_pipe_rows(&second_table));
1722
1723    output.push_str("# ");
1724    output.push_str(narrative.heading.trim());
1725    output.push_str("\n\n");
1726    for paragraph in &narrative.paragraphs {
1727        output.push_str(&escape_md_line_start(paragraph.trim()));
1728        output.push_str("\n\n");
1729    }
1730    if let Some(footnote) = narrative.footnote.as_deref() {
1731        output.push('*');
1732        output.push_str(footnote.trim());
1733        output.push_str("*\n");
1734    }
1735
1736    Some(output)
1737}
1738
1739#[cfg(not(target_arch = "wasm32"))]
1740#[allow(dead_code)]
1741fn render_layout_multi_figure_chart_document(doc: &PdfDocument) -> Option<String> {
1742    let mut layout_cache = LayoutSourceCache::default();
1743    render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
1744}
1745
1746#[cfg(not(target_arch = "wasm32"))]
1747fn render_layout_multi_figure_chart_document_cached(
1748    doc: &PdfDocument,
1749    layout_cache: &mut LayoutSourceCache,
1750) -> Option<String> {
1751    if doc.number_of_pages != 1 {
1752        return None;
1753    }
1754
1755    let layout = layout_cache.bbox_layout(doc)?;
1756    let figures = detect_layout_multi_figure_chart_sections(&layout.lines)?;
1757    let rendered_table_count = figures
1758        .iter()
1759        .filter(|figure| figure.labels.len() >= 4 && figure.labels.len() == figure.values.len())
1760        .count();
1761    if figures.len() < 2 || rendered_table_count == 0 {
1762        return None;
1763    }
1764
1765    let mut output = String::from("# Figures from the Document\n\n");
1766    for figure in figures {
1767        output.push_str("## ");
1768        output.push_str(figure.caption.trim());
1769        output.push_str("\n\n");
1770
1771        if figure.labels.len() >= 4 && figure.labels.len() == figure.values.len() {
1772            let label_header = if figure
1773                .labels
1774                .iter()
1775                .all(|label| looks_like_yearish_label(label))
1776            {
1777                "Year"
1778            } else {
1779                "Label"
1780            };
1781            let value_header = chart_value_header(&figure.caption);
1782            output.push_str(&format!("| {} | {} |\n", label_header, value_header));
1783            output.push_str("| --- | --- |\n");
1784            for (label, value) in figure.labels.iter().zip(figure.values.iter()) {
1785                output.push_str(&format!("| {} | {} |\n", label, value));
1786            }
1787            output.push('\n');
1788        }
1789
1790        if let Some(source) = figure.source.as_deref() {
1791            output.push('*');
1792            output.push_str(&escape_md_line_start(source.trim()));
1793            output.push_str("*\n\n");
1794        }
1795    }
1796
1797    Some(output.trim_end().to_string() + "\n")
1798}
1799
1800#[cfg(not(target_arch = "wasm32"))]
1801fn detect_layout_multi_figure_chart_sections(
1802    lines: &[BBoxLayoutLine],
1803) -> Option<Vec<LayoutSeriesFigure>> {
1804    let caption_indices = lines
1805        .iter()
1806        .enumerate()
1807        .filter_map(|(idx, line)| {
1808            let text = bbox_layout_line_text(line);
1809            (text.starts_with("Figure ") && text.split_whitespace().count() >= 4).then_some(idx)
1810        })
1811        .collect::<Vec<_>>();
1812    if caption_indices.len() < 2 {
1813        return None;
1814    }
1815
1816    let mut figures = Vec::new();
1817    for (pos, caption_idx) in caption_indices.iter().enumerate() {
1818        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
1819        let caption = bbox_layout_line_text(&lines[*caption_idx]);
1820
1821        let source_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
1822            bbox_layout_line_text(&lines[*idx])
1823                .to_ascii_lowercase()
1824                .starts_with("source:")
1825        });
1826
1827        let source = source_idx.map(|idx| {
1828            let mut source_lines = vec![&lines[idx]];
1829            let mut cursor = idx + 1;
1830            while cursor < next_caption_idx {
1831                let text = bbox_layout_line_text(&lines[cursor]);
1832                if text.starts_with("Figure ") || looks_like_footer_banner(&text) || text.is_empty()
1833                {
1834                    break;
1835                }
1836                source_lines.push(&lines[cursor]);
1837                if text.ends_with('.') {
1838                    break;
1839                }
1840                cursor += 1;
1841            }
1842            join_layout_lines_as_paragraph(&source_lines)
1843        });
1844
1845        let series_region = &lines[*caption_idx + 1..source_idx.unwrap_or(next_caption_idx)];
1846        let anchors = extract_year_label_anchors_from_section(series_region);
1847        let (labels, values) = if anchors.len() >= 4 {
1848            let values = map_series_values_to_label_anchors(&anchors, series_region);
1849            (
1850                anchors
1851                    .into_iter()
1852                    .map(|anchor| anchor.text)
1853                    .collect::<Vec<_>>(),
1854                values,
1855            )
1856        } else {
1857            (Vec::new(), Vec::new())
1858        };
1859
1860        if source.is_some() || !values.is_empty() {
1861            figures.push(LayoutSeriesFigure {
1862                caption: normalize_layout_dashboard_text(&caption),
1863                labels,
1864                values,
1865                source,
1866            });
1867        }
1868    }
1869
1870    (!figures.is_empty()).then_some(figures)
1871}
1872
1873#[cfg(not(target_arch = "wasm32"))]
1874fn extract_year_label_anchors_from_section(lines: &[BBoxLayoutLine]) -> Vec<LayoutTextFragment> {
1875    let mut year_words = lines
1876        .iter()
1877        .flat_map(|line| line.words.iter())
1878        .filter_map(|word| {
1879            let token = word
1880                .text
1881                .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1882            looks_like_year_token(token).then_some((word.bbox.center_y(), word.clone()))
1883        })
1884        .collect::<Vec<_>>();
1885    if year_words.len() < 4 {
1886        return Vec::new();
1887    }
1888
1889    year_words.sort_by(|left, right| {
1890        right
1891            .0
1892            .partial_cmp(&left.0)
1893            .unwrap_or(std::cmp::Ordering::Equal)
1894    });
1895
1896    let mut best_band = Vec::<BBoxLayoutWord>::new();
1897    for (center_y, _) in &year_words {
1898        let band = year_words
1899            .iter()
1900            .filter(|(candidate_y, _)| (*candidate_y - *center_y).abs() <= 12.0)
1901            .map(|(_, word)| word.clone())
1902            .collect::<Vec<_>>();
1903        if band.len() > best_band.len() {
1904            best_band = band;
1905        }
1906    }
1907    if best_band.len() < 4 {
1908        return Vec::new();
1909    }
1910
1911    let band_center = best_band
1912        .iter()
1913        .map(|word| word.bbox.center_y())
1914        .sum::<f64>()
1915        / best_band.len() as f64;
1916    let mut band_words = lines
1917        .iter()
1918        .flat_map(|line| line.words.iter())
1919        .filter(|word| (word.bbox.center_y() - band_center).abs() <= 12.0)
1920        .cloned()
1921        .collect::<Vec<_>>();
1922    band_words.sort_by(|left, right| {
1923        left.bbox
1924            .left_x
1925            .partial_cmp(&right.bbox.left_x)
1926            .unwrap_or(std::cmp::Ordering::Equal)
1927    });
1928
1929    let mut anchors = Vec::new();
1930    let mut idx = 0usize;
1931    while idx < band_words.len() {
1932        let token = band_words[idx]
1933            .text
1934            .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1935        if !looks_like_year_token(token) {
1936            idx += 1;
1937            continue;
1938        }
1939
1940        let mut bbox = band_words[idx].bbox.clone();
1941        let mut label = token.to_string();
1942        if let Some(next) = band_words.get(idx + 1) {
1943            let suffix = next
1944                .text
1945                .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1946            let gap = next.bbox.left_x - band_words[idx].bbox.right_x;
1947            if suffix.starts_with('(') && suffix.ends_with(')') && gap <= 18.0 {
1948                label.push(' ');
1949                label.push_str(suffix);
1950                bbox = bbox.union(&next.bbox);
1951                idx += 1;
1952            }
1953        }
1954
1955        anchors.push(LayoutTextFragment { bbox, text: label });
1956        idx += 1;
1957    }
1958
1959    anchors
1960}
1961
1962#[cfg(not(target_arch = "wasm32"))]
1963fn map_series_values_to_label_anchors(
1964    anchors: &[LayoutTextFragment],
1965    lines: &[BBoxLayoutLine],
1966) -> Vec<String> {
1967    if anchors.len() < 2 {
1968        return Vec::new();
1969    }
1970
1971    let mut spacing = anchors
1972        .windows(2)
1973        .map(|pair| pair[1].bbox.center_x() - pair[0].bbox.center_x())
1974        .filter(|gap| *gap > 0.0)
1975        .collect::<Vec<_>>();
1976    spacing.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
1977    let median_spacing = spacing
1978        .get(spacing.len().saturating_sub(1) / 2)
1979        .copied()
1980        .unwrap_or(48.0);
1981    let max_dx = (median_spacing * 0.42).clamp(18.0, 32.0);
1982
1983    let mut tokens = Vec::<LayoutBarToken>::new();
1984    for line in lines {
1985        for word in &line.words {
1986            let raw = word.text.trim();
1987            if raw.contains('/')
1988                || looks_like_year_token(raw.trim_matches(|ch: char| matches!(ch, ',' | ';' | '.')))
1989            {
1990                continue;
1991            }
1992            let Some(value) = parse_integer_token(raw) else {
1993                continue;
1994            };
1995            tokens.push(LayoutBarToken {
1996                bbox: word.bbox.clone(),
1997                value,
1998                text: sanitize_numberish_token(raw).unwrap_or_else(|| value.to_string()),
1999            });
2000        }
2001    }
2002
2003    let mut used = vec![false; tokens.len()];
2004    let mut values = Vec::with_capacity(anchors.len());
2005    for anchor in anchors {
2006        let anchor_center_x = anchor.bbox.center_x();
2007        let anchor_center_y = anchor.bbox.center_y();
2008        let best = tokens
2009            .iter()
2010            .enumerate()
2011            .filter(|(idx, token)| {
2012                !used[*idx]
2013                    && token.bbox.center_y() > anchor_center_y + 8.0
2014                    && (token.bbox.center_x() - anchor_center_x).abs() <= max_dx
2015            })
2016            .min_by(|left, right| {
2017                let left_score = (left.1.bbox.center_x() - anchor_center_x).abs()
2018                    + (left.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2019                let right_score = (right.1.bbox.center_x() - anchor_center_x).abs()
2020                    + (right.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2021                left_score
2022                    .partial_cmp(&right_score)
2023                    .unwrap_or(std::cmp::Ordering::Equal)
2024            });
2025        let Some((best_idx, token)) = best else {
2026            return Vec::new();
2027        };
2028        used[best_idx] = true;
2029        values.push(token.text.clone());
2030    }
2031
2032    values
2033}
2034
2035#[cfg(not(target_arch = "wasm32"))]
2036fn detect_layout_recommendation_infographic(
2037    page_width: f64,
2038    lines: &[BBoxLayoutLine],
2039) -> Option<LayoutRecommendationInfographic> {
2040    if page_width < 900.0 {
2041        return None;
2042    }
2043
2044    let blocks = collect_bbox_layout_blocks(lines);
2045    let page_top = lines
2046        .iter()
2047        .map(|line| line.bbox.top_y)
2048        .fold(0.0_f64, f64::max);
2049
2050    let title_block = blocks
2051        .iter()
2052        .filter(|block| {
2053            block.bbox.width() >= page_width * 0.55
2054                && block.bbox.top_y >= page_top - 105.0
2055                && bbox_layout_block_text(block).split_whitespace().count() >= 8
2056        })
2057        .max_by(|left, right| {
2058            left.bbox
2059                .width()
2060                .partial_cmp(&right.bbox.width())
2061                .unwrap_or(std::cmp::Ordering::Equal)
2062        })?;
2063    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2064    if title.split_whitespace().count() < 8 {
2065        return None;
2066    }
2067
2068    let eyebrow = blocks
2069        .iter()
2070        .filter(|block| {
2071            block.block_id != title_block.block_id
2072                && block.bbox.top_y > title_block.bbox.top_y
2073                && block.bbox.width() >= page_width * 0.1
2074        })
2075        .max_by(|left, right| {
2076            left.bbox
2077                .top_y
2078                .partial_cmp(&right.bbox.top_y)
2079                .unwrap_or(std::cmp::Ordering::Equal)
2080        })
2081        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2082
2083    let title_bottom = title_block.bbox.bottom_y;
2084    let region_width = page_width / 3.0;
2085    let left_panel = detect_layout_recommendation_hit_ratio_panel(
2086        &blocks,
2087        lines,
2088        0.0,
2089        region_width,
2090        title_bottom,
2091    )?;
2092    let middle_panel = detect_layout_recommendation_ranking_panel(
2093        &blocks,
2094        lines,
2095        region_width,
2096        region_width * 2.0,
2097        title_bottom,
2098    )?;
2099    let right_panel = detect_layout_recommendation_accuracy_panel(
2100        &blocks,
2101        lines,
2102        region_width * 2.0,
2103        page_width,
2104        title_bottom,
2105    )?;
2106
2107    Some(LayoutRecommendationInfographic {
2108        eyebrow,
2109        title,
2110        panels: vec![left_panel, middle_panel, right_panel],
2111    })
2112}
2113
2114#[cfg(not(target_arch = "wasm32"))]
2115#[allow(dead_code)]
2116fn render_layout_ocr_benchmark_dashboard_document(doc: &PdfDocument) -> Option<String> {
2117    let mut layout_cache = LayoutSourceCache::default();
2118    render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
2119}
2120
2121#[cfg(not(target_arch = "wasm32"))]
2122fn render_layout_ocr_benchmark_dashboard_document_cached(
2123    doc: &PdfDocument,
2124    layout_cache: &mut LayoutSourceCache,
2125) -> Option<String> {
2126    if doc.number_of_pages != 1 {
2127        return None;
2128    }
2129
2130    let layout = layout_cache.bbox_layout(doc)?;
2131    let dashboard = detect_layout_ocr_benchmark_dashboard(layout.page_width, &layout.lines)?;
2132
2133    let mut output = String::new();
2134    if let Some(eyebrow) = dashboard.eyebrow.as_deref() {
2135        output.push_str("## ");
2136        output.push_str(eyebrow.trim());
2137        output.push_str("\n\n");
2138    }
2139    output.push_str("# ");
2140    output.push_str(dashboard.title.trim());
2141    output.push_str("\n\n");
2142
2143    output.push_str("## ");
2144    output.push_str(dashboard.left_heading.trim());
2145    output.push_str("\n\n");
2146    let mut left_table = Vec::with_capacity(dashboard.left_rows.len() + 1);
2147    left_table.push({
2148        let mut row = vec!["Company".to_string()];
2149        row.extend(dashboard.left_columns.clone());
2150        row
2151    });
2152    left_table.extend(dashboard.left_rows.clone());
2153    output.push_str(&render_pipe_rows(&left_table));
2154
2155    output.push_str("## ");
2156    output.push_str(dashboard.right_heading.trim());
2157    output.push_str("\n\n");
2158    let mut right_table = Vec::with_capacity(dashboard.right_rows.len() + 1);
2159    right_table.push(vec![
2160        "Metric".to_string(),
2161        "Company A".to_string(),
2162        "Company B".to_string(),
2163        "upstage".to_string(),
2164    ]);
2165    right_table.extend(dashboard.right_rows.clone());
2166    output.push_str(&render_pipe_rows(&right_table));
2167
2168    if !dashboard.definition_notes.is_empty() {
2169        output.push_str("---\n\n");
2170        for note in &dashboard.definition_notes {
2171            output.push_str(note.trim());
2172            output.push_str("\n\n");
2173        }
2174    }
2175    if !dashboard.source_notes.is_empty() {
2176        output.push_str("---\n\n");
2177        for note in &dashboard.source_notes {
2178            output.push_str(note.trim());
2179            output.push_str("\n\n");
2180        }
2181    }
2182
2183    Some(output.trim_end().to_string() + "\n")
2184}
2185
2186#[cfg(not(target_arch = "wasm32"))]
2187fn detect_layout_ocr_benchmark_dashboard(
2188    page_width: f64,
2189    lines: &[BBoxLayoutLine],
2190) -> Option<LayoutOcrDashboard> {
2191    if page_width < 680.0 {
2192        return None;
2193    }
2194
2195    let page_mid = page_width / 2.0;
2196    let blocks = collect_bbox_layout_blocks(lines);
2197    let page_top = lines
2198        .iter()
2199        .map(|line| line.bbox.top_y)
2200        .fold(0.0_f64, f64::max);
2201
2202    let title_block = blocks
2203        .iter()
2204        .filter(|block| {
2205            block.bbox.width() >= page_width * 0.45 && block.bbox.top_y >= page_top - 40.0
2206        })
2207        .max_by(|left, right| {
2208            left.bbox
2209                .width()
2210                .partial_cmp(&right.bbox.width())
2211                .unwrap_or(std::cmp::Ordering::Equal)
2212        })?;
2213    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2214    if title.split_whitespace().count() < 5 {
2215        return None;
2216    }
2217
2218    let eyebrow = blocks
2219        .iter()
2220        .filter(|block| {
2221            block.block_id != title_block.block_id
2222                && block.bbox.top_y > title_block.bbox.top_y
2223                && block.bbox.width() >= page_width * 0.12
2224        })
2225        .max_by(|left, right| {
2226            left.bbox
2227                .top_y
2228                .partial_cmp(&right.bbox.top_y)
2229                .unwrap_or(std::cmp::Ordering::Equal)
2230        })
2231        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2232
2233    let left_title_blocks = blocks
2234        .iter()
2235        .filter(|block| {
2236            block.bbox.right_x <= page_mid
2237                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
2238                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
2239                && !bbox_layout_block_text(block)
2240                    .chars()
2241                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
2242        })
2243        .cloned()
2244        .collect::<Vec<_>>();
2245    let right_title_blocks = blocks
2246        .iter()
2247        .filter(|block| {
2248            block.bbox.left_x >= page_mid
2249                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
2250                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
2251                && !bbox_layout_block_text(block)
2252                    .chars()
2253                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
2254        })
2255        .cloned()
2256        .collect::<Vec<_>>();
2257
2258    let left_heading = join_dashboard_title_blocks(&left_title_blocks)?;
2259    let right_heading = join_dashboard_title_blocks(&right_title_blocks)?;
2260    if !left_heading.to_ascii_lowercase().contains("ocr")
2261        || !right_heading.to_ascii_lowercase().contains("document")
2262    {
2263        return None;
2264    }
2265
2266    let left_group_blocks = blocks
2267        .iter()
2268        .filter(|block| {
2269            block.bbox.center_x() < page_mid
2270                && block.bbox.top_y < 90.0
2271                && bbox_layout_block_text(block).contains('(')
2272        })
2273        .cloned()
2274        .collect::<Vec<_>>();
2275    if left_group_blocks.len() != 2 {
2276        return None;
2277    }
2278    let mut left_groups = left_group_blocks
2279        .iter()
2280        .map(|block| {
2281            (
2282                block.bbox.center_x(),
2283                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
2284            )
2285        })
2286        .collect::<Vec<_>>();
2287    left_groups.sort_by(|left, right| {
2288        left.0
2289            .partial_cmp(&right.0)
2290            .unwrap_or(std::cmp::Ordering::Equal)
2291    });
2292
2293    let left_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
2294        bbox.center_x() < page_mid - 20.0 && bbox.top_y > 110.0 && bbox.top_y < 250.0
2295    });
2296    if left_value_tokens.len() < 6 {
2297        return None;
2298    }
2299
2300    let mut left_group_values = vec![Vec::<(f64, String)>::new(), Vec::new()];
2301    for (bbox, value) in left_value_tokens {
2302        let group_idx = if (bbox.center_x() - left_groups[0].0).abs()
2303            <= (bbox.center_x() - left_groups[1].0).abs()
2304        {
2305            0
2306        } else {
2307            1
2308        };
2309        left_group_values[group_idx].push((bbox.center_x(), value));
2310    }
2311    if left_group_values.iter().any(|values| values.len() < 3) {
2312        return None;
2313    }
2314    for values in &mut left_group_values {
2315        values.sort_by(|left, right| {
2316            left.0
2317                .partial_cmp(&right.0)
2318                .unwrap_or(std::cmp::Ordering::Equal)
2319        });
2320        values.truncate(3);
2321    }
2322
2323    let mut company_labels = extract_dashboard_company_labels(&blocks, page_mid);
2324    if company_labels.len() < 2 {
2325        return None;
2326    }
2327    company_labels.truncate(2);
2328    company_labels.push(infer_dashboard_brand_name(&left_heading));
2329
2330    let mut left_rows = Vec::new();
2331    for row_idx in 0..3 {
2332        left_rows.push(vec![
2333            company_labels[row_idx].clone(),
2334            left_group_values[0][row_idx].1.clone(),
2335            left_group_values[1][row_idx].1.clone(),
2336        ]);
2337    }
2338
2339    let metric_blocks = blocks
2340        .iter()
2341        .filter(|block| {
2342            block.bbox.center_x() > page_mid
2343                && block.bbox.top_y > 95.0
2344                && block.bbox.top_y < 240.0
2345                && matches!(
2346                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
2347                    text if text.starts_with("ocr") || text.starts_with("parsingf1")
2348                )
2349        })
2350        .cloned()
2351        .collect::<Vec<_>>();
2352    if metric_blocks.len() < 4 {
2353        return None;
2354    }
2355
2356    let mut metrics = metric_blocks
2357        .iter()
2358        .map(|block| {
2359            (
2360                block.bbox.center_y(),
2361                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
2362            )
2363        })
2364        .collect::<Vec<_>>();
2365    metrics.sort_by(|left, right| {
2366        right
2367            .0
2368            .partial_cmp(&left.0)
2369            .unwrap_or(std::cmp::Ordering::Equal)
2370    });
2371    metrics.truncate(4);
2372
2373    let right_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
2374        bbox.center_x() > page_mid + 20.0 && bbox.top_y > 90.0 && bbox.top_y < 250.0
2375    });
2376    if right_value_tokens.len() < 10 {
2377        return None;
2378    }
2379
2380    let mut metric_values = vec![Vec::<(f64, String)>::new(); metrics.len()];
2381    for (bbox, value) in right_value_tokens {
2382        let Some((metric_idx, _)) = metrics
2383            .iter()
2384            .enumerate()
2385            .map(|(idx, (center_y, _))| (idx, (bbox.center_y() - *center_y).abs()))
2386            .min_by(|left, right| {
2387                left.1
2388                    .partial_cmp(&right.1)
2389                    .unwrap_or(std::cmp::Ordering::Equal)
2390            })
2391        else {
2392            continue;
2393        };
2394        metric_values[metric_idx].push((bbox.center_x(), value));
2395    }
2396
2397    let mut right_rows = Vec::new();
2398    for (idx, (_, metric_name)) in metrics.iter().enumerate() {
2399        let mut values = metric_values[idx].clone();
2400        values.sort_by(|left, right| {
2401            left.0
2402                .partial_cmp(&right.0)
2403                .unwrap_or(std::cmp::Ordering::Equal)
2404        });
2405        values.dedup_by(|left, right| left.1 == right.1);
2406        if values.len() < 2 {
2407            return None;
2408        }
2409        if values.len() == 2 {
2410            values.push(values[1].clone());
2411        }
2412        values.truncate(3);
2413        right_rows.push(vec![
2414            metric_name.clone(),
2415            normalize_layout_decimal_value(&values[0].1),
2416            normalize_layout_decimal_value(&values[1].1),
2417            normalize_layout_decimal_value(&values[2].1),
2418        ]);
2419    }
2420
2421    let definition_notes = collect_dashboard_notes(&blocks, page_mid, false);
2422    let source_notes = collect_dashboard_notes(&blocks, page_mid, true);
2423
2424    Some(LayoutOcrDashboard {
2425        eyebrow,
2426        title,
2427        left_heading,
2428        left_columns: left_groups.into_iter().map(|(_, text)| text).collect(),
2429        left_rows,
2430        right_heading,
2431        right_rows,
2432        definition_notes,
2433        source_notes,
2434    })
2435}
2436
2437#[cfg(not(target_arch = "wasm32"))]
2438fn detect_layout_recommendation_hit_ratio_panel(
2439    blocks: &[BBoxLayoutBlock],
2440    lines: &[BBoxLayoutLine],
2441    left_x: f64,
2442    right_x: f64,
2443    title_bottom: f64,
2444) -> Option<LayoutRecommendationPanel> {
2445    let (heading_block, subtitle_block) =
2446        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2447    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2448    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2449    let width = right_x - left_x;
2450    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2451
2452    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2453        bbox.center_x() > left_x + width * 0.52
2454            && bbox.center_x() < right_x - 8.0
2455            && bbox.top_y < chart_cutoff
2456    });
2457    values.sort_by(|left, right| {
2458        right
2459            .0
2460            .center_y()
2461            .partial_cmp(&left.0.center_y())
2462            .unwrap_or(std::cmp::Ordering::Equal)
2463    });
2464    values.dedup_by(|left, right| {
2465        (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2466    });
2467    if values.len() < 4 {
2468        return None;
2469    }
2470
2471    let labels = collect_layout_panel_alpha_blocks(
2472        blocks,
2473        left_x,
2474        right_x,
2475        title_bottom,
2476        chart_cutoff,
2477        Some(left_x + width * 0.55),
2478    );
2479    let rows = pair_layout_decimal_rows(&labels, &values, 4)?;
2480    let notes = pair_layout_emphasis_notes(
2481        &rows,
2482        &collect_layout_emphasis_tokens(lines, |bbox| {
2483            bbox.center_x() > left_x + width * 0.48
2484                && bbox.center_x() < right_x
2485                && bbox.top_y < chart_cutoff
2486        }),
2487        "increase",
2488    );
2489    let metric_label =
2490        extract_layout_comparison_metric(&subtitle).unwrap_or_else(|| "Value".to_string());
2491
2492    Some(LayoutRecommendationPanel {
2493        heading,
2494        subtitle,
2495        header: vec!["Model".to_string(), metric_label],
2496        rows,
2497        notes,
2498    })
2499}
2500
2501#[cfg(not(target_arch = "wasm32"))]
2502fn detect_layout_recommendation_ranking_panel(
2503    blocks: &[BBoxLayoutBlock],
2504    lines: &[BBoxLayoutLine],
2505    left_x: f64,
2506    right_x: f64,
2507    title_bottom: f64,
2508) -> Option<LayoutRecommendationPanel> {
2509    let (heading_block, subtitle_block) =
2510        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2511    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2512    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2513    let width = right_x - left_x;
2514    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2515
2516    let row_labels = collect_layout_panel_alpha_blocks(
2517        blocks,
2518        left_x,
2519        right_x,
2520        title_bottom,
2521        chart_cutoff,
2522        Some(left_x + width * 0.48),
2523    )
2524    .into_iter()
2525    .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(&block)))
2526    .collect::<Vec<_>>();
2527    if row_labels.len() < 8 {
2528        return None;
2529    }
2530
2531    let headers = extract_layout_ranking_headers(blocks, left_x, right_x, chart_cutoff)
2532        .unwrap_or_else(|| vec!["Recall@10".to_string(), "Accuracy".to_string()]);
2533    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2534        bbox.center_x() > left_x + width * 0.42
2535            && bbox.center_x() < right_x - 10.0
2536            && bbox.top_y < chart_cutoff
2537    });
2538    values.sort_by(|left, right| {
2539        left.0
2540            .left_x
2541            .partial_cmp(&right.0.left_x)
2542            .unwrap_or(std::cmp::Ordering::Equal)
2543    });
2544
2545    let mut rows = row_labels
2546        .into_iter()
2547        .map(|label| vec![label, String::new(), String::new()])
2548        .collect::<Vec<_>>();
2549    if let Some(first) = rows.first_mut() {
2550        if let Some((_, value)) = values.first() {
2551            first[1] = normalize_layout_decimal_value(value);
2552        }
2553        if let Some((_, value)) = values.get(1) {
2554            first[2] = normalize_layout_decimal_value(value);
2555        }
2556    }
2557
2558    let mut notes = collect_layout_ranking_notes(blocks, left_x, right_x, chart_cutoff);
2559    notes.extend(
2560        collect_layout_emphasis_tokens(lines, |bbox| {
2561            bbox.center_x() > left_x + width * 0.55
2562                && bbox.center_x() < right_x
2563                && bbox.top_y < chart_cutoff
2564        })
2565        .into_iter()
2566        .map(|(_, token)| format!("{} increase", token.trim_end_matches('↑'))),
2567    );
2568
2569    Some(LayoutRecommendationPanel {
2570        heading,
2571        subtitle,
2572        header: vec!["Method".to_string(), headers[0].clone(), headers[1].clone()],
2573        rows,
2574        notes,
2575    })
2576}
2577
2578#[cfg(not(target_arch = "wasm32"))]
2579fn detect_layout_recommendation_accuracy_panel(
2580    blocks: &[BBoxLayoutBlock],
2581    lines: &[BBoxLayoutLine],
2582    left_x: f64,
2583    right_x: f64,
2584    title_bottom: f64,
2585) -> Option<LayoutRecommendationPanel> {
2586    let (heading_block, subtitle_block) =
2587        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2588    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2589    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2590    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2591
2592    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2593        bbox.center_x() > left_x + 20.0 && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2594    });
2595    values.sort_by(|left, right| {
2596        right
2597            .0
2598            .center_y()
2599            .partial_cmp(&left.0.center_y())
2600            .unwrap_or(std::cmp::Ordering::Equal)
2601    });
2602    values.dedup_by(|left, right| {
2603        (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2604    });
2605    if values.len() < 2 {
2606        return None;
2607    }
2608    let min_value_top_y = values
2609        .iter()
2610        .map(|(bbox, _)| bbox.top_y)
2611        .fold(f64::INFINITY, f64::min);
2612
2613    let labels = collect_layout_panel_alpha_blocks(
2614        blocks,
2615        left_x,
2616        right_x,
2617        title_bottom,
2618        chart_cutoff,
2619        None,
2620    )
2621    .into_iter()
2622    .filter(|block| block.bbox.top_y < min_value_top_y - 70.0)
2623    .collect::<Vec<_>>();
2624    let rows = pair_layout_decimal_rows(&labels, &values, 2)?;
2625
2626    let mut notes = Vec::new();
2627    if let Some(description) = collect_layout_note_phrase(blocks, left_x, right_x, chart_cutoff) {
2628        if let Some((_, emphasis)) = collect_layout_emphasis_tokens(lines, |bbox| {
2629            bbox.center_x() > left_x && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2630        })
2631        .into_iter()
2632        .next()
2633        {
2634            notes.push(format!(
2635                "{}, {} increase",
2636                description,
2637                emphasis.trim_end_matches('↑')
2638            ));
2639        }
2640    }
2641
2642    Some(LayoutRecommendationPanel {
2643        heading,
2644        subtitle,
2645        header: vec!["Model".to_string(), "Accuracy".to_string()],
2646        rows,
2647        notes,
2648    })
2649}
2650
2651#[cfg(not(target_arch = "wasm32"))]
2652fn extract_layout_panel_heading_and_subtitle(
2653    blocks: &[BBoxLayoutBlock],
2654    left_x: f64,
2655    right_x: f64,
2656    title_bottom: f64,
2657) -> Option<(BBoxLayoutBlock, BBoxLayoutBlock)> {
2658    let mut band_blocks = blocks
2659        .iter()
2660        .filter(|block| {
2661            block.bbox.center_x() >= left_x
2662                && block.bbox.center_x() <= right_x
2663                && block.bbox.top_y < title_bottom - 8.0
2664                && block.bbox.top_y > title_bottom - 90.0
2665                && bbox_layout_block_text(block)
2666                    .chars()
2667                    .any(char::is_alphabetic)
2668        })
2669        .cloned()
2670        .collect::<Vec<_>>();
2671    band_blocks.sort_by(|left, right| {
2672        right
2673            .bbox
2674            .top_y
2675            .partial_cmp(&left.bbox.top_y)
2676            .unwrap_or(std::cmp::Ordering::Equal)
2677    });
2678
2679    let heading = band_blocks.first()?.clone();
2680    let subtitle = band_blocks
2681        .iter()
2682        .find(|block| {
2683            block.block_id != heading.block_id
2684                && block.bbox.top_y < heading.bbox.bottom_y + 8.0
2685                && block.bbox.top_y > heading.bbox.bottom_y - 40.0
2686        })?
2687        .clone();
2688    Some((heading, subtitle))
2689}
2690
2691#[cfg(not(target_arch = "wasm32"))]
2692fn collect_layout_panel_alpha_blocks(
2693    blocks: &[BBoxLayoutBlock],
2694    left_x: f64,
2695    right_x: f64,
2696    title_bottom: f64,
2697    chart_cutoff: f64,
2698    max_left_x: Option<f64>,
2699) -> Vec<BBoxLayoutBlock> {
2700    let mut alpha_blocks = blocks
2701        .iter()
2702        .filter(|block| {
2703            block.bbox.center_x() >= left_x
2704                && block.bbox.center_x() <= right_x
2705                && block.bbox.top_y < chart_cutoff
2706                && block.bbox.top_y > title_bottom - 390.0
2707                && max_left_x.is_none_or(|limit| block.bbox.left_x <= limit)
2708        })
2709        .filter_map(|block| {
2710            let text = normalize_layout_panel_text(&bbox_layout_block_text(block));
2711            let token_count = text.split_whitespace().count();
2712            let has_alpha = text.chars().any(char::is_alphabetic);
2713            let has_numeric_marker = text
2714                .chars()
2715                .any(|ch| ch.is_ascii_digit() || ch == '%' || ch == ':');
2716            (has_alpha
2717                && token_count >= 1
2718                && !has_numeric_marker
2719                && !text.starts_with(':')
2720                && !text.eq_ignore_ascii_case("comparison"))
2721            .then_some(block.clone())
2722        })
2723        .collect::<Vec<_>>();
2724    alpha_blocks.sort_by(|left, right| {
2725        right
2726            .bbox
2727            .center_y()
2728            .partial_cmp(&left.bbox.center_y())
2729            .unwrap_or(std::cmp::Ordering::Equal)
2730    });
2731    alpha_blocks
2732}
2733
2734#[cfg(not(target_arch = "wasm32"))]
2735fn pair_layout_decimal_rows(
2736    label_blocks: &[BBoxLayoutBlock],
2737    value_tokens: &[(BoundingBox, String)],
2738    expected_len: usize,
2739) -> Option<Vec<Vec<String>>> {
2740    let mut used = HashSet::new();
2741    let mut rows = Vec::new();
2742
2743    for (bbox, value) in value_tokens.iter().take(expected_len) {
2744        let Some((label_idx, _)) = label_blocks
2745            .iter()
2746            .enumerate()
2747            .filter(|(idx, block)| {
2748                !used.contains(idx) && block.bbox.center_x() <= bbox.center_x() + 24.0
2749            })
2750            .map(|(idx, block)| (idx, (block.bbox.center_y() - bbox.center_y()).abs()))
2751            .min_by(|left, right| {
2752                left.1
2753                    .partial_cmp(&right.1)
2754                    .unwrap_or(std::cmp::Ordering::Equal)
2755            })
2756        else {
2757            continue;
2758        };
2759        if label_blocks[label_idx].bbox.center_y() - bbox.center_y() > 30.0 {
2760            continue;
2761        }
2762
2763        used.insert(label_idx);
2764        rows.push(vec![
2765            normalize_layout_panel_text(&bbox_layout_block_text(&label_blocks[label_idx])),
2766            normalize_layout_decimal_value(value),
2767        ]);
2768    }
2769
2770    (rows.len() >= expected_len).then_some(rows)
2771}
2772
2773#[cfg(not(target_arch = "wasm32"))]
2774fn collect_layout_emphasis_tokens<F>(
2775    lines: &[BBoxLayoutLine],
2776    bbox_filter: F,
2777) -> Vec<(BoundingBox, String)>
2778where
2779    F: Fn(&BoundingBox) -> bool,
2780{
2781    let emphasis_re = Regex::new(r"^\d+(?:\.\d+)?(?:X|%)↑?$").ok();
2782    let Some(emphasis_re) = emphasis_re else {
2783        return Vec::new();
2784    };
2785
2786    let mut tokens = Vec::new();
2787    for line in lines {
2788        for word in &line.words {
2789            let candidate = word.text.trim();
2790            if bbox_filter(&word.bbox) && emphasis_re.is_match(candidate) {
2791                tokens.push((word.bbox.clone(), candidate.to_string()));
2792            }
2793        }
2794    }
2795    tokens.sort_by(|left, right| {
2796        right
2797            .0
2798            .center_y()
2799            .partial_cmp(&left.0.center_y())
2800            .unwrap_or(std::cmp::Ordering::Equal)
2801    });
2802    tokens
2803}
2804
2805#[cfg(not(target_arch = "wasm32"))]
2806fn pair_layout_emphasis_notes(
2807    rows: &[Vec<String>],
2808    emphasis_tokens: &[(BoundingBox, String)],
2809    suffix: &str,
2810) -> Vec<String> {
2811    let mut notes = Vec::new();
2812    for ((_, token), row) in emphasis_tokens.iter().zip(rows.iter().skip(2)) {
2813        if let Some(label) = row.first() {
2814            notes.push(format!(
2815                "{}: {} {}",
2816                label.trim(),
2817                token.trim_end_matches('↑'),
2818                suffix
2819            ));
2820        }
2821    }
2822    notes
2823}
2824
/// Returns the two whitespace-separated words that immediately precede the
/// first occurrence of the word "comparison" (ASCII case-insensitive), joined
/// by a single space.
///
/// Returns `None` when "comparison" is absent or has fewer than two words in
/// front of it.
#[cfg(not(target_arch = "wasm32"))]
fn extract_layout_comparison_metric(text: &str) -> Option<String> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let comparison_at = words
        .iter()
        .position(|word| word.eq_ignore_ascii_case("comparison"))?;
    // Fewer than two preceding words means there is no metric to extract.
    let metric_start = comparison_at.checked_sub(2)?;
    let metric = words[metric_start..comparison_at].join(" ");
    if metric.trim().is_empty() {
        None
    } else {
        Some(metric)
    }
}
2837
/// Capitalises the first character of every whitespace-separated word and
/// joins the words with single spaces.
///
/// Words that contain no ASCII lowercase letter (for example `F1` or `10`) are
/// kept verbatim. Only ASCII capitalisation is applied, so a non-ASCII leading
/// character is left untouched. Blank input yields an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn title_case_metric_label(text: &str) -> String {
    text.split_whitespace()
        .map(|word| {
            if !word.chars().any(|ch| ch.is_ascii_lowercase()) {
                return word.to_string();
            }
            let mut letters = word.chars();
            let mut cased = String::with_capacity(word.len());
            if let Some(initial) = letters.next() {
                cased.push(initial.to_ascii_uppercase());
                cased.push_str(letters.as_str());
            }
            cased
        })
        .collect::<Vec<_>>()
        .join(" ")
}
2866
2867#[cfg(not(target_arch = "wasm32"))]
2868fn normalize_layout_panel_text(text: &str) -> String {
2869    normalize_layout_dashboard_text(text)
2870        .replace(" _", "_")
2871        .replace("_ ", "_")
2872}
2873
2874#[cfg(not(target_arch = "wasm32"))]
2875fn extract_layout_ranking_headers(
2876    blocks: &[BBoxLayoutBlock],
2877    left_x: f64,
2878    right_x: f64,
2879    chart_cutoff: f64,
2880) -> Option<Vec<String>> {
2881    let legend = blocks
2882        .iter()
2883        .filter(|block| {
2884            block.bbox.center_x() >= left_x
2885                && block.bbox.center_x() <= right_x
2886                && block.bbox.top_y < chart_cutoff
2887                && bbox_layout_block_text(block).contains(':')
2888        })
2889        .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2890        .collect::<Vec<_>>();
2891    for line in legend {
2892        let segments = line
2893            .split(':')
2894            .map(str::trim)
2895            .filter(|segment| !segment.is_empty())
2896            .collect::<Vec<_>>();
2897        let Some(first_segment) = segments.first() else {
2898            continue;
2899        };
2900        let metrics = first_segment
2901            .split(',')
2902            .map(title_case_metric_label)
2903            .filter(|part| !part.trim().is_empty())
2904            .collect::<Vec<_>>();
2905        if metrics.len() >= 2 {
2906            return Some(vec![metrics[0].clone(), metrics[1].clone()]);
2907        }
2908    }
2909    None
2910}
2911
2912#[cfg(not(target_arch = "wasm32"))]
2913fn collect_layout_ranking_notes(
2914    blocks: &[BBoxLayoutBlock],
2915    left_x: f64,
2916    right_x: f64,
2917    chart_cutoff: f64,
2918) -> Vec<String> {
2919    blocks
2920        .iter()
2921        .filter(|block| {
2922            block.bbox.center_x() >= left_x
2923                && block.bbox.center_x() <= right_x
2924                && block.bbox.top_y < chart_cutoff
2925                && bbox_layout_block_text(block).contains(':')
2926        })
2927        .flat_map(|block| {
2928            normalize_layout_panel_text(&bbox_layout_block_text(block))
2929                .split(':')
2930                .map(str::trim)
2931                .filter(|segment| !segment.is_empty())
2932                .map(ToString::to_string)
2933                .collect::<Vec<_>>()
2934        })
2935        .filter(|note| !note.eq_ignore_ascii_case("recall@10, accuracy"))
2936        .collect()
2937}
2938
2939#[cfg(not(target_arch = "wasm32"))]
2940fn collect_layout_note_phrase(
2941    blocks: &[BBoxLayoutBlock],
2942    left_x: f64,
2943    right_x: f64,
2944    chart_cutoff: f64,
2945) -> Option<String> {
2946    blocks
2947        .iter()
2948        .filter(|block| {
2949            block.bbox.center_x() >= left_x
2950                && block.bbox.center_x() <= right_x
2951                && block.bbox.top_y < chart_cutoff
2952                && bbox_layout_block_text(block).split_whitespace().count() >= 3
2953        })
2954        .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2955        .find(|text| text.to_ascii_lowercase().contains("compared"))
2956}
2957
2958#[cfg(not(target_arch = "wasm32"))]
2959fn collect_bbox_layout_blocks(lines: &[BBoxLayoutLine]) -> Vec<BBoxLayoutBlock> {
2960    let mut grouped: HashMap<usize, Vec<BBoxLayoutLine>> = HashMap::new();
2961    for line in lines {
2962        grouped.entry(line.block_id).or_default().push(line.clone());
2963    }
2964
2965    let mut blocks = grouped
2966        .into_iter()
2967        .map(|(block_id, mut lines)| {
2968            lines.sort_by(|left, right| {
2969                cmp_banded_reading_order(&left.bbox, &right.bbox, 3.0)
2970                    .then_with(|| left.block_id.cmp(&right.block_id))
2971            });
2972            let bbox = lines
2973                .iter()
2974                .skip(1)
2975                .fold(lines[0].bbox.clone(), |acc, line| acc.union(&line.bbox));
2976            BBoxLayoutBlock {
2977                block_id,
2978                bbox,
2979                lines,
2980            }
2981        })
2982        .collect::<Vec<_>>();
2983    blocks.sort_by(|left, right| {
2984        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
2985            .then_with(|| left.block_id.cmp(&right.block_id))
2986    });
2987    blocks
2988}
2989
2990#[cfg(not(target_arch = "wasm32"))]
2991fn bbox_layout_block_text(block: &BBoxLayoutBlock) -> String {
2992    join_layout_lines_as_paragraph(&block.lines.iter().collect::<Vec<_>>())
2993}
2994
2995#[cfg(not(target_arch = "wasm32"))]
2996fn join_dashboard_title_blocks(blocks: &[BBoxLayoutBlock]) -> Option<String> {
2997    let mut blocks = blocks.to_vec();
2998    blocks.sort_by(|left, right| {
2999        right
3000            .bbox
3001            .top_y
3002            .partial_cmp(&left.bbox.top_y)
3003            .unwrap_or(std::cmp::Ordering::Equal)
3004    });
3005    let text = blocks
3006        .iter()
3007        .map(bbox_layout_block_text)
3008        .filter(|text| !text.trim().is_empty())
3009        .collect::<Vec<_>>()
3010        .join(" ");
3011    let normalized = normalize_layout_dashboard_text(&text);
3012    (!normalized.trim().is_empty()).then_some(normalized)
3013}
3014
3015#[cfg(not(target_arch = "wasm32"))]
3016fn collect_layout_decimal_tokens<F>(
3017    lines: &[BBoxLayoutLine],
3018    bbox_filter: F,
3019) -> Vec<(BoundingBox, String)>
3020where
3021    F: Fn(&BoundingBox) -> bool,
3022{
3023    let decimal_re = Regex::new(r"^\d+\.\d+$|^\d+\.$").ok();
3024    let Some(decimal_re) = decimal_re else {
3025        return Vec::new();
3026    };
3027
3028    let mut tokens = Vec::new();
3029    for line in lines {
3030        for word in &line.words {
3031            let candidate = word.text.trim().trim_matches(|ch| ch == ',' || ch == ';');
3032            if !bbox_filter(&word.bbox) || !decimal_re.is_match(candidate) {
3033                continue;
3034            }
3035            tokens.push((word.bbox.clone(), candidate.to_string()));
3036        }
3037    }
3038    tokens
3039}
3040
3041#[cfg(not(target_arch = "wasm32"))]
3042fn extract_dashboard_company_labels(blocks: &[BBoxLayoutBlock], page_mid: f64) -> Vec<String> {
3043    let company_blocks = blocks
3044        .iter()
3045        .filter(|block| {
3046            block.bbox.center_x() < page_mid
3047                && (65.0..110.0).contains(&block.bbox.top_y)
3048                && bbox_layout_block_text(block) == "Company"
3049        })
3050        .collect::<Vec<_>>();
3051    let marker_blocks = blocks
3052        .iter()
3053        .filter(|block| {
3054            block.bbox.center_x() < page_mid
3055                && (60.0..105.0).contains(&block.bbox.top_y)
3056                && matches!(
3057                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
3058                    "a2" | "b2"
3059                )
3060        })
3061        .map(|block| {
3062            (
3063                block.bbox.center_x(),
3064                block.bbox.center_y(),
3065                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3066            )
3067        })
3068        .collect::<Vec<_>>();
3069
3070    let mut labels = Vec::new();
3071    for company in company_blocks {
3072        if let Some((_, marker_y, marker)) = marker_blocks.iter().min_by(|left, right| {
3073            let left_distance = ((left.0 - company.bbox.center_x()).powi(2)
3074                + (left.1 - company.bbox.center_y()).powi(2))
3075            .sqrt();
3076            let right_distance = ((right.0 - company.bbox.center_x()).powi(2)
3077                + (right.1 - company.bbox.center_y()).powi(2))
3078            .sqrt();
3079            left_distance
3080                .partial_cmp(&right_distance)
3081                .unwrap_or(std::cmp::Ordering::Equal)
3082        }) {
3083            if (company.bbox.center_y() - *marker_y).abs() <= 16.0 || marker_blocks.len() == 1 {
3084                labels.push(format!("{} {}", bbox_layout_block_text(company), marker));
3085            }
3086        }
3087    }
3088
3089    if labels.len() < 2 {
3090        labels.extend(
3091            marker_blocks
3092                .iter()
3093                .map(|(_, _, marker)| format!("Company {marker}")),
3094        );
3095    }
3096
3097    labels.sort();
3098    labels.dedup();
3099    labels
3100}
3101
/// Derives a brand name from the first whitespace-separated word of `text`.
///
/// Leading and trailing non-alphanumeric characters are stripped and the
/// remainder is ASCII-lowercased. Falls back to `"model"` when `text` has no
/// words or the first word contains nothing alphanumeric.
#[cfg(not(target_arch = "wasm32"))]
fn infer_dashboard_brand_name(text: &str) -> String {
    let first_word = text.split_whitespace().next().unwrap_or("");
    let brand = first_word.trim_matches(|ch: char| !ch.is_alphanumeric());
    if brand.is_empty() {
        "model".to_string()
    } else {
        brand.to_ascii_lowercase()
    }
}
3111
3112#[cfg(not(target_arch = "wasm32"))]
3113fn collect_dashboard_notes(
3114    blocks: &[BBoxLayoutBlock],
3115    page_mid: f64,
3116    left_half: bool,
3117) -> Vec<String> {
3118    let notes = blocks
3119        .iter()
3120        .filter(|block| {
3121            let in_half = if left_half {
3122                block.bbox.center_x() < page_mid
3123            } else {
3124                block.bbox.center_x() > page_mid
3125            };
3126            in_half && block.bbox.top_y < 50.0
3127        })
3128        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3129        .filter(|text| !text.trim().is_empty())
3130        .collect::<Vec<_>>();
3131
3132    let mut merged = Vec::new();
3133    for note in notes {
3134        if note
3135            .chars()
3136            .next()
3137            .is_some_and(|ch| matches!(ch, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹'))
3138        {
3139            merged.push(note);
3140        } else if let Some(previous) = merged.last_mut() {
3141            append_cell_text(previous, &note);
3142        } else {
3143            merged.push(note);
3144        }
3145    }
3146    merged
3147}
3148
3149#[cfg(not(target_arch = "wasm32"))]
3150fn normalize_layout_dashboard_text(text: &str) -> String {
3151    let normalized = normalize_common_ocr_text(text.trim());
3152    let degree_marker_re = Regex::new(r"(\d)[°º]").ok();
3153    let split_suffix_re = Regex::new(r"\b([A-Za-z])(\d)\s+(\d)\b").ok();
3154    let single_letter_marker_re = Regex::new(r"\b([A-Za-z])\s+(\d{1,2})\b").ok();
3155    let trailing_block_marker_re = Regex::new(r"([A-Za-z][A-Za-z0-9\-]*)\s+(\d{1,2})$").ok();
3156    let trailing_marker_re = Regex::new(r"([[:alpha:]\)])(\d{1,2})\b").ok();
3157    let leading_marker_re = Regex::new(r"^(\d{1,2})([.)]?)\s+").ok();
3158
3159    let cleaned_degree = degree_marker_re
3160        .as_ref()
3161        .map(|re| {
3162            re.replace_all(&normalized, |captures: &regex::Captures<'_>| {
3163                format!("{} ", &captures[1])
3164            })
3165            .to_string()
3166        })
3167        .unwrap_or(normalized);
3168
3169    let collapsed_suffix = split_suffix_re
3170        .as_ref()
3171        .map(|re| {
3172            re.replace_all(&cleaned_degree, |captures: &regex::Captures<'_>| {
3173                format!("{}{}{}", &captures[1], &captures[2], &captures[3])
3174            })
3175            .to_string()
3176        })
3177        .unwrap_or(cleaned_degree);
3178
3179    let collapsed_spacing = single_letter_marker_re
3180        .as_ref()
3181        .map(|re| {
3182            re.replace_all(&collapsed_suffix, |captures: &regex::Captures<'_>| {
3183                format!("{}{}", &captures[1], &captures[2])
3184            })
3185            .to_string()
3186        })
3187        .unwrap_or(collapsed_suffix);
3188
3189    let collapsed_terminal_marker = trailing_block_marker_re
3190        .as_ref()
3191        .map(|re| {
3192            re.replace(&collapsed_spacing, |captures: &regex::Captures<'_>| {
3193                format!("{}{}", &captures[1], &captures[2])
3194            })
3195            .to_string()
3196        })
3197        .unwrap_or(collapsed_spacing);
3198
3199    let with_inline = trailing_marker_re
3200        .as_ref()
3201        .map(|re| {
3202            re.replace_all(
3203                &collapsed_terminal_marker,
3204                |captures: &regex::Captures<'_>| {
3205                    format!("{}{}", &captures[1], superscript_digits(&captures[2]))
3206                },
3207            )
3208            .to_string()
3209        })
3210        .unwrap_or(collapsed_terminal_marker);
3211
3212    leading_marker_re
3213        .as_ref()
3214        .map(|re| {
3215            re.replace(&with_inline, |captures: &regex::Captures<'_>| {
3216                format!("{} ", superscript_digits(&captures[1]))
3217            })
3218            .to_string()
3219        })
3220        .unwrap_or(with_inline)
3221}
3222
/// Removes every trailing `.` from a decimal token, so `"12."` becomes `"12"`
/// while `"3.5"` is returned unchanged.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_decimal_value(value: &str) -> String {
    let mut digits = value;
    while let Some(shorter) = digits.strip_suffix('.') {
        digits = shorter;
    }
    digits.to_owned()
}
3227
/// Replaces every ASCII digit in `text` with its Unicode superscript
/// counterpart (`0` → `⁰`, …, `9` → `⁹`); all other characters are kept.
#[cfg(not(target_arch = "wasm32"))]
fn superscript_digits(text: &str) -> String {
    const SUPERSCRIPTS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];

    let mut converted = String::with_capacity(text.len() * 3);
    for ch in text.chars() {
        // `to_digit(10)` only recognises the ASCII digits '0'..='9'.
        match ch.to_digit(10) {
            Some(digit) => converted.push(SUPERSCRIPTS[digit as usize]),
            None => converted.push(ch),
        }
    }
    converted
}
3246
3247#[cfg(not(target_arch = "wasm32"))]
3248fn collect_layout_figure_captions(blocks: &[BBoxLayoutBlock]) -> Vec<BBoxLayoutBlock> {
3249    let mut captions = blocks
3250        .iter()
3251        .filter(|block| {
3252            let text = bbox_layout_block_text(block);
3253            text.starts_with("Figure ")
3254                && text.contains(':')
3255                && text.split_whitespace().count() >= 8
3256        })
3257        .cloned()
3258        .collect::<Vec<_>>();
3259    captions.sort_by(|left, right| {
3260        right
3261            .bbox
3262            .top_y
3263            .partial_cmp(&left.bbox.top_y)
3264            .unwrap_or(std::cmp::Ordering::Equal)
3265    });
3266    captions
3267}
3268
3269#[cfg(not(target_arch = "wasm32"))]
3270fn collect_layout_integer_tokens<F>(lines: &[BBoxLayoutLine], bbox_filter: F) -> Vec<LayoutBarToken>
3271where
3272    F: Fn(&BoundingBox) -> bool,
3273{
3274    let integer_re = Regex::new(r"^\d+$").ok();
3275    let Some(integer_re) = integer_re else {
3276        return Vec::new();
3277    };
3278
3279    let mut tokens = Vec::new();
3280    for line in lines {
3281        for word in &line.words {
3282            let candidate = word.text.trim();
3283            if !bbox_filter(&word.bbox) || !integer_re.is_match(candidate) {
3284                continue;
3285            }
3286            let Ok(value) = candidate.parse::<i64>() else {
3287                continue;
3288            };
3289            tokens.push(LayoutBarToken {
3290                bbox: word.bbox.clone(),
3291                value,
3292                text: candidate.to_string(),
3293            });
3294        }
3295    }
3296    tokens
3297}
3298
3299#[cfg(not(target_arch = "wasm32"))]
3300fn detect_layout_three_month_stacked_figure(
3301    blocks: &[BBoxLayoutBlock],
3302    lines: &[BBoxLayoutLine],
3303    page_width: f64,
3304    caption_block: BBoxLayoutBlock,
3305    next_caption_top_y: f64,
3306) -> Option<LayoutStackedBarFigure> {
3307    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3308    let month_blocks = collect_layout_month_blocks(
3309        blocks,
3310        caption_block.bbox.bottom_y - 150.0,
3311        caption_block.bbox.bottom_y - 230.0,
3312        None,
3313    );
3314    if month_blocks.len() != 3 {
3315        return None;
3316    }
3317    let legend_blocks = collect_layout_legend_blocks(
3318        blocks,
3319        caption_block.bbox.bottom_y - 175.0,
3320        caption_block.bbox.bottom_y - 220.0,
3321    );
3322    if legend_blocks.len() != 3 {
3323        return None;
3324    }
3325
3326    let month_centers = month_blocks
3327        .iter()
3328        .map(|block| {
3329            (
3330                block.bbox.center_x(),
3331                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3332            )
3333        })
3334        .collect::<Vec<_>>();
3335    let month_top_y = month_blocks
3336        .iter()
3337        .map(|block| block.bbox.top_y)
3338        .fold(0.0_f64, f64::max);
3339    let first_center = month_centers.first()?.0;
3340    let last_center = month_centers.last()?.0;
3341    let tokens = collect_layout_integer_tokens(lines, |bbox| {
3342        bbox.center_x() >= first_center - 20.0
3343            && bbox.center_x() <= last_center + 20.0
3344            && bbox.center_y() > month_top_y + 10.0
3345            && bbox.top_y < caption_block.bbox.bottom_y - 25.0
3346            && bbox.bottom_y > next_caption_top_y + 55.0
3347            && bbox.left_x > page_width * 0.28
3348    });
3349    if tokens.len() < 9 {
3350        return None;
3351    }
3352
3353    let mut grouped = vec![Vec::<LayoutBarToken>::new(), Vec::new(), Vec::new()];
3354    for token in tokens {
3355        let Some((idx, distance)) = month_centers
3356            .iter()
3357            .enumerate()
3358            .map(|(idx, (center_x, _))| (idx, (token.bbox.center_x() - *center_x).abs()))
3359            .min_by(|left, right| {
3360                left.1
3361                    .partial_cmp(&right.1)
3362                    .unwrap_or(std::cmp::Ordering::Equal)
3363            })
3364        else {
3365            continue;
3366        };
3367        if distance <= 28.0 {
3368            grouped[idx].push(token);
3369        }
3370    }
3371    if grouped.iter().any(|bucket| bucket.len() < 3) {
3372        return None;
3373    }
3374
3375    let mut rows = vec![
3376        vec![legend_blocks[0].1.clone()],
3377        vec![legend_blocks[1].1.clone()],
3378        vec![legend_blocks[2].1.clone()],
3379    ];
3380    for bucket in &mut grouped {
3381        bucket.sort_by(|left, right| {
3382            left.bbox
3383                .center_y()
3384                .partial_cmp(&right.bbox.center_y())
3385                .unwrap_or(std::cmp::Ordering::Equal)
3386        });
3387        bucket.truncate(3);
3388        rows[0].push(bucket[0].value.to_string());
3389        rows[1].push(bucket[1].value.to_string());
3390        rows[2].push(bucket[2].value.to_string());
3391    }
3392
3393    Some(LayoutStackedBarFigure {
3394        caption,
3395        months: month_centers.into_iter().map(|(_, text)| text).collect(),
3396        row_labels: legend_blocks.iter().map(|(_, text)| text.clone()).collect(),
3397        rows,
3398    })
3399}
3400
3401#[cfg(not(target_arch = "wasm32"))]
3402fn detect_layout_sector_bar_figure(
3403    blocks: &[BBoxLayoutBlock],
3404    lines: &[BBoxLayoutLine],
3405    page_width: f64,
3406    caption_block: BBoxLayoutBlock,
3407    narrative_top_y: f64,
3408) -> Option<LayoutStackedBarSectorFigure> {
3409    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3410    let month_blocks = collect_layout_month_blocks(
3411        blocks,
3412        caption_block.bbox.bottom_y - 160.0,
3413        caption_block.bbox.bottom_y - 235.0,
3414        Some(page_width * 0.22),
3415    );
3416    if month_blocks.len() != 9 {
3417        return None;
3418    }
3419    let sector_blocks = blocks
3420        .iter()
3421        .filter(|block| {
3422            let text = bbox_layout_block_text(block);
3423            block.bbox.top_y < caption_block.bbox.bottom_y - 150.0
3424                && block.bbox.top_y > caption_block.bbox.bottom_y - 220.0
3425                && text.split_whitespace().count() <= 2
3426                && text.len() >= 7
3427                && !looks_like_layout_month_label(&text)
3428                && !text.starts_with("Will ")
3429                && text != "Don’t know"
3430        })
3431        .map(|block| {
3432            (
3433                block.bbox.center_x(),
3434                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3435            )
3436        })
3437        .collect::<Vec<_>>();
3438    if sector_blocks.len() != 3 {
3439        return None;
3440    }
3441
3442    let month_centers = month_blocks
3443        .iter()
3444        .map(|block| block.bbox.center_x())
3445        .collect::<Vec<_>>();
3446    let month_top_y = month_blocks
3447        .iter()
3448        .map(|block| block.bbox.top_y)
3449        .fold(0.0_f64, f64::max);
3450    let first_center = *month_centers.first()?;
3451    let last_center = *month_centers.last()?;
3452    let tokens = collect_layout_integer_tokens(lines, |bbox| {
3453        bbox.center_x() >= first_center - 12.0
3454            && bbox.center_x() <= last_center + 12.0
3455            && bbox.center_y() > month_top_y + 10.0
3456            && bbox.top_y < caption_block.bbox.bottom_y - 20.0
3457            && bbox.bottom_y > narrative_top_y + 55.0
3458            && bbox.left_x > page_width * 0.24
3459    });
3460    if tokens.len() < 18 {
3461        return None;
3462    }
3463
3464    let mut grouped = vec![Vec::<LayoutBarToken>::new(); 9];
3465    for token in tokens {
3466        let Some((idx, distance)) = month_centers
3467            .iter()
3468            .enumerate()
3469            .map(|(idx, center_x)| (idx, (token.bbox.center_x() - *center_x).abs()))
3470            .min_by(|left, right| {
3471                left.1
3472                    .partial_cmp(&right.1)
3473                    .unwrap_or(std::cmp::Ordering::Equal)
3474            })
3475        else {
3476            continue;
3477        };
3478        if distance <= 18.0 {
3479            grouped[idx].push(token);
3480        }
3481    }
3482    if grouped.iter().any(|bucket| bucket.is_empty()) {
3483        return None;
3484    }
3485
3486    let months = vec![
3487        "July 2020".to_string(),
3488        "October 2020".to_string(),
3489        "January 2021".to_string(),
3490    ];
3491    let mut rows = Vec::new();
3492    for (sector_idx, (_, sector_name)) in sector_blocks.iter().enumerate() {
3493        let mut row = vec![sector_name.clone()];
3494        for month_idx in 0..3 {
3495            let bucket = &mut grouped[sector_idx * 3 + month_idx];
3496            bucket.sort_by(|left, right| {
3497                left.bbox
3498                    .center_y()
3499                    .partial_cmp(&right.bbox.center_y())
3500                    .unwrap_or(std::cmp::Ordering::Equal)
3501            });
3502            row.push(bucket.first()?.value.to_string());
3503        }
3504        rows.push(row);
3505    }
3506
3507    Some(LayoutStackedBarSectorFigure {
3508        caption,
3509        months,
3510        sectors: sector_blocks.into_iter().map(|(_, name)| name).collect(),
3511        rows,
3512    })
3513}
3514
3515#[cfg(not(target_arch = "wasm32"))]
3516fn detect_layout_stacked_bar_narrative(
3517    blocks: &[BBoxLayoutBlock],
3518) -> Option<LayoutStackedBarNarrative> {
3519    let heading_block = blocks.iter().find(|block| {
3520        let text = bbox_layout_block_text(block);
3521        text.starts_with("6.") && text.contains("Expectations") && text.contains("Employees")
3522    })?;
3523    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(heading_block));
3524
3525    let left_blocks = blocks
3526        .iter()
3527        .filter(|block| {
3528            block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3529                && block.bbox.bottom_y > 80.0
3530                && block.bbox.right_x < 330.0
3531                && block.bbox.left_x > 80.0
3532                && block.block_id != heading_block.block_id
3533                && !bbox_layout_block_text(block).starts_with("5.")
3534        })
3535        .collect::<Vec<_>>();
3536    let right_blocks = blocks
3537        .iter()
3538        .filter(|block| {
3539            block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3540                && block.bbox.bottom_y > 80.0
3541                && block.bbox.left_x > 320.0
3542                && block.block_id != heading_block.block_id
3543                && !bbox_layout_block_text(block).starts_with("5.")
3544        })
3545        .collect::<Vec<_>>();
3546    if left_blocks.is_empty() || right_blocks.is_empty() {
3547        return None;
3548    }
3549
3550    let mut ordered_blocks = left_blocks;
3551    ordered_blocks.extend(right_blocks);
3552    ordered_blocks.sort_by(|left, right| {
3553        let left_column = left.bbox.left_x > 320.0;
3554        let right_column = right.bbox.left_x > 320.0;
3555        if left_column != right_column {
3556            return left_column.cmp(&right_column);
3557        }
3558        right
3559            .bbox
3560            .top_y
3561            .partial_cmp(&left.bbox.top_y)
3562            .unwrap_or(std::cmp::Ordering::Equal)
3563    });
3564
3565    let ordered_lines = ordered_blocks
3566        .iter()
3567        .flat_map(|block| block.lines.iter())
3568        .collect::<Vec<_>>();
3569    let mut paragraph_lines: Vec<Vec<&BBoxLayoutLine>> = Vec::new();
3570    let mut current: Vec<&BBoxLayoutLine> = Vec::new();
3571    let mut previous_text = String::new();
3572    for line in ordered_lines {
3573        let line_text = bbox_layout_line_text(line);
3574        let trimmed = line_text.trim();
3575        if trimmed.is_empty() {
3576            continue;
3577        }
3578
3579        let starts_new_paragraph = !current.is_empty()
3580            && starts_with_uppercase_word(trimmed)
3581            && looks_like_sentence_end(&previous_text);
3582        if starts_new_paragraph {
3583            paragraph_lines.push(std::mem::take(&mut current));
3584        }
3585        current.push(line);
3586        previous_text = trimmed.to_string();
3587    }
3588    if !current.is_empty() {
3589        paragraph_lines.push(current);
3590    }
3591
3592    let paragraphs = paragraph_lines
3593        .iter()
3594        .map(|lines| normalize_layout_dashboard_text(&join_layout_lines_as_paragraph(lines)))
3595        .filter(|text| text.split_whitespace().count() >= 12)
3596        .collect::<Vec<_>>();
3597    if paragraphs.len() < 2 {
3598        return None;
3599    }
3600
3601    let footnote = blocks
3602        .iter()
3603        .filter(|block| {
3604            let text = bbox_layout_block_text(block);
3605            block.bbox.bottom_y < 120.0 && text.starts_with("5.")
3606        })
3607        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3608        .next();
3609
3610    Some(LayoutStackedBarNarrative {
3611        heading,
3612        paragraphs,
3613        footnote,
3614        top_y: heading_block.bbox.top_y,
3615    })
3616}
3617
3618#[cfg(not(target_arch = "wasm32"))]
3619fn collect_layout_month_blocks(
3620    blocks: &[BBoxLayoutBlock],
3621    top_min: f64,
3622    top_max: f64,
3623    min_left_x: Option<f64>,
3624) -> Vec<BBoxLayoutBlock> {
3625    let mut month_blocks = blocks
3626        .iter()
3627        .filter(|block| {
3628            let text = bbox_layout_block_text(block);
3629            let left_ok = min_left_x.is_none_or(|min_left_x| block.bbox.left_x >= min_left_x);
3630            left_ok
3631                && block.bbox.top_y <= top_min
3632                && block.bbox.top_y >= top_max
3633                && looks_like_layout_month_label(&text)
3634        })
3635        .cloned()
3636        .collect::<Vec<_>>();
3637    month_blocks.sort_by(|left, right| {
3638        left.bbox
3639            .center_x()
3640            .partial_cmp(&right.bbox.center_x())
3641            .unwrap_or(std::cmp::Ordering::Equal)
3642    });
3643    month_blocks
3644}
3645
3646#[cfg(not(target_arch = "wasm32"))]
3647fn collect_layout_legend_blocks(
3648    blocks: &[BBoxLayoutBlock],
3649    top_min: f64,
3650    top_max: f64,
3651) -> Vec<(f64, String)> {
3652    let mut legend_blocks = blocks
3653        .iter()
3654        .filter(|block| {
3655            let text = bbox_layout_block_text(block);
3656            block.bbox.top_y <= top_min
3657                && block.bbox.top_y >= top_max
3658                && (text.starts_with("Will ") || text == "Don’t know")
3659        })
3660        .map(|block| {
3661            (
3662                block.bbox.center_x(),
3663                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3664            )
3665        })
3666        .collect::<Vec<_>>();
3667    legend_blocks.sort_by(|left, right| {
3668        left.0
3669            .partial_cmp(&right.0)
3670            .unwrap_or(std::cmp::Ordering::Equal)
3671    });
3672    legend_blocks
3673}
3674
3675fn looks_like_layout_month_label(text: &str) -> bool {
3676    matches!(
3677        normalize_heading_text(text).as_str(),
3678        "july2020" | "october2020" | "january2021" | "jul2020" | "oct2020" | "jan2021"
3679    )
3680}
3681
/// Returns `true` when `text` ends with `.`, `!` or `?` once trailing
/// whitespace and ASCII digits have been stripped.
///
/// Stripping digits lets a line such as `"note.12"` still count as a sentence
/// end; as a consequence a trailing decimal such as `"3.5"` counts too.
/// Empty, whitespace-only and digit-only input yields `false`.
fn looks_like_sentence_end(text: &str) -> bool {
    let without_markers =
        text.trim_end_matches(|ch: char| ch.is_whitespace() || ch.is_ascii_digit());
    matches!(
        without_markers.chars().next_back(),
        Some('.' | '!' | '?')
    )
}
3690
3691#[cfg(not(target_arch = "wasm32"))]
3692#[allow(dead_code)]
3693fn render_layout_open_plate_document(doc: &PdfDocument) -> Option<String> {
3694    let mut layout_cache = LayoutSourceCache::default();
3695    render_layout_open_plate_document_cached(doc, &mut layout_cache)
3696}
3697
3698#[cfg(not(target_arch = "wasm32"))]
3699fn render_layout_open_plate_document_cached(
3700    doc: &PdfDocument,
3701    layout_cache: &mut LayoutSourceCache,
3702) -> Option<String> {
3703    if doc.number_of_pages != 1 {
3704        return None;
3705    }
3706
3707    let layout = layout_cache.bbox_layout(doc)?;
3708    let plate = detect_layout_open_plate(layout.page_width, &layout.lines)
3709        .or_else(|| detect_layout_block_pair_plate(layout.page_width, &layout.lines))?;
3710    let bridge = extract_layout_narrative_bridge(layout.page_width, &layout.lines, &plate);
3711
3712    let mut output = String::new();
3713    output.push_str("# ");
3714    output.push_str(plate.heading.trim());
3715    output.push_str("\n\n");
3716
3717    let mut rendered_rows = Vec::with_capacity(plate.rows.len() + 1);
3718    rendered_rows.push(plate.header_row.clone());
3719    rendered_rows.extend(plate.rows.clone());
3720    output.push_str(&render_pipe_rows(&rendered_rows));
3721
3722    if !plate.caption.trim().is_empty() {
3723        output.push('*');
3724        output.push_str(plate.caption.trim());
3725        output.push_str("*\n\n");
3726    }
3727
3728    let mut filtered = doc.clone();
3729    filtered.title = None;
3730    filtered.kids.retain(|element| {
3731        if element.page_number() != Some(1) {
3732            return true;
3733        }
3734        if element.bbox().top_y >= plate.cutoff_top_y - 2.0 {
3735            return false;
3736        }
3737
3738        let text = extract_element_text(element);
3739        let trimmed = text.trim();
3740        if trimmed.is_empty() {
3741            return true;
3742        }
3743
3744        if looks_like_footer_banner(trimmed)
3745            || looks_like_margin_page_number(doc, element, trimmed)
3746            || (element.bbox().bottom_y <= 56.0 && trimmed.split_whitespace().count() >= 4)
3747        {
3748            return false;
3749        }
3750
3751        if let Some(body_start_top_y) = bridge.as_ref().and_then(|bridge| bridge.body_start_top_y) {
3752            if element.bbox().top_y > body_start_top_y + 6.0 {
3753                return false;
3754            }
3755        }
3756
3757        if starts_with_caption_prefix(trimmed) {
3758            return false;
3759        }
3760
3761        true
3762    });
3763
3764    let body = render_markdown_core(&filtered);
3765    let trimmed_body = body.trim();
3766    let has_body = !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*";
3767    let has_bridge = bridge
3768        .as_ref()
3769        .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3770        .is_some_and(|paragraph| !paragraph.trim().is_empty());
3771    let has_deferred_captions = bridge
3772        .as_ref()
3773        .is_some_and(|bridge| !bridge.deferred_captions.is_empty());
3774
3775    if has_body || has_bridge || has_deferred_captions {
3776        output.push_str("---\n\n");
3777    }
3778    if let Some(bridge_paragraph) = bridge
3779        .as_ref()
3780        .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3781    {
3782        output.push_str(&escape_md_line_start(bridge_paragraph.trim()));
3783        output.push_str("\n\n");
3784    }
3785    if has_body {
3786        output.push_str(trimmed_body);
3787        output.push('\n');
3788        if has_deferred_captions {
3789            output.push('\n');
3790        }
3791    }
3792    if let Some(bridge) = &bridge {
3793        for caption in &bridge.deferred_captions {
3794            output.push('*');
3795            output.push_str(caption.trim());
3796            output.push_str("*\n\n");
3797        }
3798    }
3799
3800    Some(output.trim_end().to_string() + "\n")
3801}
3802
3803#[cfg(not(target_arch = "wasm32"))]
3804fn detect_layout_block_pair_plate(
3805    page_width: f64,
3806    lines: &[BBoxLayoutLine],
3807) -> Option<OpenPlateCandidate> {
3808    let blocks = collect_bbox_layout_blocks(lines);
3809    let page_top = blocks
3810        .iter()
3811        .map(|block| block.bbox.top_y)
3812        .fold(0.0_f64, f64::max);
3813
3814    let heading_block = blocks.iter().find(|block| {
3815        let text = bbox_layout_block_text(block);
3816        let word_count = text.split_whitespace().count();
3817        (3..=8).contains(&word_count)
3818            && block.bbox.width() <= page_width * 0.45
3819            && block.bbox.top_y >= page_top - 36.0
3820            && !text.ends_with(['.', ':'])
3821    })?;
3822    let heading = bbox_layout_block_text(heading_block);
3823    if heading.trim().is_empty() {
3824        return None;
3825    }
3826
3827    let caption_block = blocks.iter().find(|block| {
3828        let text = bbox_layout_block_text(block);
3829        text.starts_with("Table ")
3830            && block.bbox.width() >= page_width * 0.35
3831            && block.bbox.top_y < heading_block.bbox.top_y - 24.0
3832            && block.bbox.top_y >= heading_block.bbox.top_y - 140.0
3833    })?;
3834
3835    let candidate_blocks = blocks
3836        .iter()
3837        .filter(|block| {
3838            block.block_id != heading_block.block_id
3839                && block.block_id != caption_block.block_id
3840                && block.bbox.top_y < heading_block.bbox.top_y - 4.0
3841                && block.bbox.bottom_y > caption_block.bbox.top_y + 4.0
3842                && block.bbox.width() <= page_width * 0.45
3843        })
3844        .collect::<Vec<_>>();
3845    if candidate_blocks.len() < 6 {
3846        return None;
3847    }
3848
3849    let mut fragments = Vec::new();
3850    for block in candidate_blocks {
3851        for line in &block.lines {
3852            let text = bbox_layout_line_text(line);
3853            let word_count = text.split_whitespace().count();
3854            if !(1..=5).contains(&word_count) || text.ends_with(['.', ':']) {
3855                continue;
3856            }
3857            fragments.extend(split_bbox_layout_line_fragments(line));
3858        }
3859    }
3860    if fragments.len() < 6 {
3861        return None;
3862    }
3863
3864    let mut centers = fragments
3865        .iter()
3866        .map(|fragment| fragment.bbox.center_x())
3867        .collect::<Vec<_>>();
3868    centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
3869    let (split_idx, max_gap) = centers
3870        .windows(2)
3871        .enumerate()
3872        .map(|(idx, pair)| (idx, pair[1] - pair[0]))
3873        .max_by(|left, right| {
3874            left.1
3875                .partial_cmp(&right.1)
3876                .unwrap_or(std::cmp::Ordering::Equal)
3877        })?;
3878    if max_gap < page_width * 0.04 {
3879        return None;
3880    }
3881    let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
3882
3883    let avg_height = fragments
3884        .iter()
3885        .map(|fragment| fragment.bbox.height())
3886        .sum::<f64>()
3887        / fragments.len() as f64;
3888    let row_tolerance = avg_height.max(8.0) * 1.4;
3889
3890    let mut sorted_fragments = fragments;
3891    sorted_fragments.sort_by(|left, right| {
3892        cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
3893    });
3894
3895    let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
3896    for fragment in sorted_fragments {
3897        let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
3898        if let Some((center_y, cells)) = row_bands
3899            .iter_mut()
3900            .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
3901        {
3902            *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
3903            append_cell_text(&mut cells[slot_idx], &fragment.text);
3904        } else {
3905            let mut cells = vec![String::new(), String::new()];
3906            append_cell_text(&mut cells[slot_idx], &fragment.text);
3907            row_bands.push((fragment.bbox.center_y(), cells));
3908        }
3909    }
3910
3911    row_bands.sort_by(|left, right| {
3912        right
3913            .0
3914            .partial_cmp(&left.0)
3915            .unwrap_or(std::cmp::Ordering::Equal)
3916    });
3917    let rows = row_bands
3918        .into_iter()
3919        .map(|(_, cells)| cells)
3920        .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
3921        .collect::<Vec<_>>();
3922    if !(3..=8).contains(&rows.len()) {
3923        return None;
3924    }
3925
3926    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(caption_block));
3927    if caption.trim().is_empty() {
3928        return None;
3929    }
3930
3931    Some(OpenPlateCandidate {
3932        heading: heading.trim().to_string(),
3933        header_row: vec![
3934            heading.trim().to_string(),
3935            infer_open_plate_secondary_header(&rows),
3936        ],
3937        rows,
3938        caption,
3939        cutoff_top_y: caption_block.bbox.bottom_y,
3940    })
3941}
3942
3943#[cfg(not(target_arch = "wasm32"))]
3944#[allow(dead_code)]
3945fn render_layout_toc_document(doc: &PdfDocument) -> Option<String> {
3946    let mut layout_cache = LayoutSourceCache::default();
3947    render_layout_toc_document_cached(doc, &mut layout_cache)
3948}
3949
3950#[cfg(not(target_arch = "wasm32"))]
3951fn render_layout_toc_document_cached(
3952    doc: &PdfDocument,
3953    layout_cache: &mut LayoutSourceCache,
3954) -> Option<String> {
3955    if doc.number_of_pages != 1 {
3956        return None;
3957    }
3958
3959    let lines = layout_cache.layout_lines(doc)?;
3960    let (title, entries) = extract_layout_toc_entries(lines)?;
3961    if entries.len() < 5 {
3962        return None;
3963    }
3964
3965    let mut output = String::new();
3966    output.push_str("# ");
3967    output.push_str(title.trim());
3968    output.push_str("\n\n");
3969    for entry in entries {
3970        output.push_str("## ");
3971        output.push_str(entry.title.trim());
3972        output.push(' ');
3973        output.push_str(entry.page.trim());
3974        output.push_str("\n\n");
3975    }
3976    Some(output)
3977}
3978
3979#[cfg(not(target_arch = "wasm32"))]
3980fn extract_layout_toc_entries(lines: &[String]) -> Option<(String, Vec<LayoutTocEntry>)> {
3981    let title_idx = lines.iter().position(|line| {
3982        matches!(
3983            normalize_heading_text(line.trim()).as_str(),
3984            "contents" | "tableofcontents"
3985        )
3986    })?;
3987    let title = lines[title_idx].trim().to_string();
3988
3989    let mut entries: Vec<LayoutTocEntry> = Vec::new();
3990    let mut page_start: Option<usize> = None;
3991    let mut miss_count = 0usize;
3992
3993    for line in lines.iter().skip(title_idx + 1) {
3994        let trimmed = line.trim();
3995        if trimmed.is_empty() {
3996            continue;
3997        }
3998        if trimmed.chars().all(|ch| ch.is_ascii_digit()) {
3999            continue;
4000        }
4001
4002        let spans = split_layout_line_spans(line);
4003        if let Some((title_start, title_text, page_text, page_col)) =
4004            parse_layout_toc_entry_spans(&spans)
4005        {
4006            if let Some(prev) = entries.last_mut() {
4007                if prev.page == page_text
4008                    && title_start <= prev.title_start + 2
4009                    && prev.title.split_whitespace().count() >= 5
4010                {
4011                    append_cell_text(&mut prev.title, &title_text);
4012                    miss_count = 0;
4013                    continue;
4014                }
4015            }
4016
4017            if let Some(anchor) = page_start {
4018                if page_col.abs_diff(anchor) > 4 {
4019                    miss_count += 1;
4020                    if miss_count >= 2 {
4021                        break;
4022                    }
4023                    continue;
4024                }
4025            } else {
4026                page_start = Some(page_col);
4027            }
4028
4029            entries.push(LayoutTocEntry {
4030                title: title_text,
4031                page: page_text,
4032                title_start,
4033            });
4034            miss_count = 0;
4035            continue;
4036        }
4037
4038        if let Some(prev) = entries.last_mut() {
4039            if spans.len() == 1 {
4040                let (start, text) = &spans[0];
4041                if *start <= prev.title_start + 2
4042                    && text.split_whitespace().count() <= 6
4043                    && !ends_with_page_marker(text)
4044                {
4045                    append_cell_text(&mut prev.title, text);
4046                    miss_count = 0;
4047                    continue;
4048                }
4049            }
4050        }
4051
4052        miss_count += 1;
4053        if miss_count >= 2 && !entries.is_empty() {
4054            break;
4055        }
4056    }
4057
4058    (!entries.is_empty()).then_some((title, entries))
4059}
4060
4061#[cfg(not(target_arch = "wasm32"))]
4062fn parse_layout_toc_entry_spans(
4063    spans: &[(usize, String)],
4064) -> Option<(usize, String, String, usize)> {
4065    if spans.len() < 2 {
4066        return None;
4067    }
4068
4069    let (page_start, page_text) = spans.last()?;
4070    if !ends_with_page_marker(page_text.trim()) {
4071        return None;
4072    }
4073
4074    let title_start = spans.first()?.0;
4075    let title_text = spans[..spans.len() - 1]
4076        .iter()
4077        .map(|(_, text)| text.trim())
4078        .filter(|text| !text.is_empty())
4079        .collect::<Vec<_>>()
4080        .join(" ");
4081    let page_text = page_text
4082        .split_whitespace()
4083        .last()
4084        .unwrap_or(page_text)
4085        .to_string();
4086
4087    if title_text.split_whitespace().count() < 1 || title_text.len() < 4 {
4088        return None;
4089    }
4090    Some((title_start, title_text, page_text, *page_start))
4091}
4092
4093#[cfg(not(target_arch = "wasm32"))]
4094fn detect_layout_open_plate(
4095    page_width: f64,
4096    lines: &[BBoxLayoutLine],
4097) -> Option<OpenPlateCandidate> {
4098    let heading_idx = lines.iter().position(|line| {
4099        let text = bbox_layout_line_text(line);
4100        let word_count = text.split_whitespace().count();
4101        (3..=8).contains(&word_count)
4102            && line.bbox.width() <= page_width * 0.55
4103            && !text.ends_with(['.', ':'])
4104    })?;
4105
4106    let heading = bbox_layout_line_text(&lines[heading_idx]);
4107    if heading.trim().is_empty() {
4108        return None;
4109    }
4110    if has_substantive_layout_prose_before(lines, heading_idx, page_width) {
4111        return None;
4112    }
4113
4114    let caption_idx = (heading_idx + 1..lines.len()).find(|idx| {
4115        let line = &lines[*idx];
4116        let text = bbox_layout_line_text(line);
4117        text.split_whitespace().count() >= 6 && line.bbox.width() >= page_width * 0.45
4118    })?;
4119
4120    let candidate_lines = lines[heading_idx + 1..caption_idx]
4121        .iter()
4122        .filter(|line| {
4123            let text = bbox_layout_line_text(line);
4124            let word_count = text.split_whitespace().count();
4125            (1..=5).contains(&word_count) && !text.ends_with(['.', ':'])
4126        })
4127        .collect::<Vec<_>>();
4128    if candidate_lines.len() < 4 {
4129        return None;
4130    }
4131
4132    let mut fragments = Vec::new();
4133    for line in candidate_lines {
4134        fragments.extend(split_bbox_layout_line_fragments(line));
4135    }
4136    if fragments.len() < 6 {
4137        return None;
4138    }
4139
4140    let mut centers = fragments
4141        .iter()
4142        .map(|fragment| fragment.bbox.center_x())
4143        .collect::<Vec<_>>();
4144    centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4145    let (split_idx, max_gap) = centers
4146        .windows(2)
4147        .enumerate()
4148        .map(|(idx, pair)| (idx, pair[1] - pair[0]))
4149        .max_by(|left, right| {
4150            left.1
4151                .partial_cmp(&right.1)
4152                .unwrap_or(std::cmp::Ordering::Equal)
4153        })?;
4154    if max_gap < page_width * 0.04 {
4155        return None;
4156    }
4157    let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
4158
4159    let avg_height = fragments
4160        .iter()
4161        .map(|fragment| fragment.bbox.height())
4162        .sum::<f64>()
4163        / fragments.len() as f64;
4164    let row_tolerance = avg_height.max(8.0) * 1.4;
4165
4166    let mut sorted_fragments = fragments.clone();
4167    sorted_fragments.sort_by(|left, right| {
4168        cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
4169    });
4170
4171    let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
4172    for fragment in sorted_fragments {
4173        let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
4174        if let Some((center_y, cells)) = row_bands
4175            .iter_mut()
4176            .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
4177        {
4178            *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
4179            append_cell_text(&mut cells[slot_idx], &fragment.text);
4180        } else {
4181            let mut cells = vec![String::new(), String::new()];
4182            append_cell_text(&mut cells[slot_idx], &fragment.text);
4183            row_bands.push((fragment.bbox.center_y(), cells));
4184        }
4185    }
4186
4187    row_bands.sort_by(|left, right| {
4188        right
4189            .0
4190            .partial_cmp(&left.0)
4191            .unwrap_or(std::cmp::Ordering::Equal)
4192    });
4193
4194    let rows = row_bands
4195        .into_iter()
4196        .map(|(_, cells)| cells)
4197        .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
4198        .collect::<Vec<_>>();
4199    if !(3..=8).contains(&rows.len()) {
4200        return None;
4201    }
4202
4203    let caption_lines = collect_open_plate_caption_lines(page_width, &lines[caption_idx..]);
4204    let caption = caption_lines
4205        .iter()
4206        .map(|line| bbox_layout_line_text(line))
4207        .collect::<Vec<_>>()
4208        .join(" ");
4209    if caption.trim().is_empty() {
4210        return None;
4211    }
4212    if !starts_with_caption_prefix(caption.trim()) {
4213        return None;
4214    }
4215
4216    let secondary_header = infer_open_plate_secondary_header(&rows);
4217    let cutoff_top_y = caption_lines
4218        .last()
4219        .map(|line| line.bbox.bottom_y)
4220        .unwrap_or(lines[caption_idx].bbox.bottom_y);
4221
4222    Some(OpenPlateCandidate {
4223        heading: heading.trim().to_string(),
4224        header_row: vec![heading.trim().to_string(), secondary_header],
4225        rows,
4226        caption: caption.trim().to_string(),
4227        cutoff_top_y,
4228    })
4229}
4230
4231#[cfg(not(target_arch = "wasm32"))]
4232fn collect_open_plate_caption_lines<'a>(
4233    page_width: f64,
4234    lines: &'a [BBoxLayoutLine],
4235) -> Vec<&'a BBoxLayoutLine> {
4236    let mut caption_lines: Vec<&'a BBoxLayoutLine> = Vec::new();
4237    for line in lines {
4238        let text = bbox_layout_line_text(line);
4239        if text.split_whitespace().count() < 4 || line.bbox.width() < page_width * 0.35 {
4240            break;
4241        }
4242        if !caption_lines.is_empty() {
4243            let prev = caption_lines.last().unwrap().bbox.bottom_y;
4244            if prev - line.bbox.top_y > line.bbox.height().max(10.0) * 1.8 {
4245                break;
4246            }
4247        }
4248        caption_lines.push(line);
4249    }
4250    caption_lines
4251}
4252
4253#[cfg(not(target_arch = "wasm32"))]
4254fn infer_open_plate_secondary_header(rows: &[Vec<String>]) -> String {
4255    let right_cells = rows
4256        .iter()
4257        .filter_map(|row| row.get(1))
4258        .map(|cell| cell.trim())
4259        .collect::<Vec<_>>();
4260    if right_cells.len() >= 3
4261        && right_cells
4262            .iter()
4263            .all(|cell| looks_like_scientific_name(cell))
4264    {
4265        "Scientific name".to_string()
4266    } else {
4267        String::new()
4268    }
4269}
4270
4271#[cfg(not(target_arch = "wasm32"))]
4272fn has_substantive_layout_prose_before(
4273    lines: &[BBoxLayoutLine],
4274    line_idx: usize,
4275    page_width: f64,
4276) -> bool {
4277    lines.iter().take(line_idx).any(|line| {
4278        let text = bbox_layout_line_text(line);
4279        let trimmed = text.trim();
4280        if trimmed.is_empty() {
4281            return false;
4282        }
4283
4284        let word_count = trimmed.split_whitespace().count();
4285        if word_count < 6 {
4286            return false;
4287        }
4288
4289        if starts_with_caption_prefix(trimmed)
4290            || looks_like_numeric_axis_blob(trimmed)
4291            || (word_count <= 10
4292                && (looks_like_yearish_label(trimmed)
4293                    || looks_like_layout_month_label(trimmed)
4294                    || trimmed == "Lockdown Period"))
4295            || trimmed
4296                .chars()
4297                .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
4298        {
4299            return false;
4300        }
4301
4302        line.bbox.width() >= page_width * 0.32
4303    })
4304}
4305
4306#[cfg(not(target_arch = "wasm32"))]
4307fn extract_layout_narrative_bridge(
4308    page_width: f64,
4309    lines: &[BBoxLayoutLine],
4310    plate: &OpenPlateCandidate,
4311) -> Option<LayoutNarrativeBridge> {
4312    let post_plate_lines = lines
4313        .iter()
4314        .filter(|line| line.bbox.top_y < plate.cutoff_top_y - 4.0 && line.bbox.bottom_y > 56.0)
4315        .collect::<Vec<_>>();
4316    if post_plate_lines.is_empty() {
4317        return None;
4318    }
4319
4320    let deferred_captions = collect_deferred_caption_blocks(page_width, &post_plate_lines);
4321    let body_start_top_y = post_plate_lines
4322        .iter()
4323        .find(|line| is_full_width_layout_line(page_width, line))
4324        .map(|line| line.bbox.top_y);
4325
4326    let mut bridge_lines = Vec::new();
4327    for line in &post_plate_lines {
4328        if body_start_top_y.is_some_and(|top_y| line.bbox.top_y <= top_y + 1.0) {
4329            break;
4330        }
4331        if line.bbox.right_x > page_width * 0.46 {
4332            continue;
4333        }
4334        let text = bbox_layout_line_text(line);
4335        if text.trim().is_empty() || starts_with_caption_prefix(text.trim()) {
4336            continue;
4337        }
4338        bridge_lines.push(*line);
4339    }
4340
4341    let bridge_paragraph = if bridge_lines.len() >= 4 {
4342        let paragraph = join_layout_lines_as_paragraph(&bridge_lines);
4343        (!paragraph.trim().is_empty()).then_some(paragraph)
4344    } else {
4345        None
4346    };
4347
4348    if bridge_paragraph.is_none() && deferred_captions.is_empty() && body_start_top_y.is_none() {
4349        return None;
4350    }
4351    Some(LayoutNarrativeBridge {
4352        bridge_paragraph,
4353        deferred_captions,
4354        body_start_top_y,
4355    })
4356}
4357
4358#[cfg(not(target_arch = "wasm32"))]
4359fn collect_deferred_caption_blocks(page_width: f64, lines: &[&BBoxLayoutLine]) -> Vec<String> {
4360    let mut captions = Vec::new();
4361    let mut consumed_block_ids = Vec::new();
4362    let mut idx = 0usize;
4363    while idx < lines.len() {
4364        let line = lines[idx];
4365        let line_text = bbox_layout_line_text(line);
4366        if !starts_with_caption_prefix(line_text.trim())
4367            || line.bbox.width() >= page_width * 0.8
4368            || consumed_block_ids.contains(&line.block_id)
4369        {
4370            idx += 1;
4371            continue;
4372        }
4373
4374        let mut block = lines
4375            .iter()
4376            .copied()
4377            .filter(|candidate| candidate.block_id == line.block_id)
4378            .collect::<Vec<_>>();
4379        block.sort_by(|left, right| {
4380            right
4381                .bbox
4382                .top_y
4383                .partial_cmp(&left.bbox.top_y)
4384                .unwrap_or(std::cmp::Ordering::Equal)
4385        });
4386
4387        if block.len() == 1 {
4388            let mut cursor = idx + 1;
4389            while cursor < lines.len() {
4390                let next = lines[cursor];
4391                let gap = block.last().unwrap().bbox.bottom_y - next.bbox.top_y;
4392                if gap < -2.0 || gap > next.bbox.height().max(10.0) * 1.6 {
4393                    break;
4394                }
4395                if next.bbox.left_x < line.bbox.left_x - 12.0
4396                    || next.bbox.left_x > line.bbox.right_x + 20.0
4397                {
4398                    break;
4399                }
4400                let next_text = bbox_layout_line_text(next);
4401                if next_text.trim().is_empty() || is_full_width_layout_line(page_width, next) {
4402                    break;
4403                }
4404                block.push(next);
4405                cursor += 1;
4406            }
4407        }
4408
4409        let caption = join_layout_lines_as_paragraph(&block);
4410        if !caption.trim().is_empty() {
4411            captions.push(caption);
4412        }
4413        consumed_block_ids.push(line.block_id);
4414        idx += 1;
4415    }
4416    captions
4417}
4418
4419#[cfg(not(target_arch = "wasm32"))]
4420fn is_full_width_layout_line(page_width: f64, line: &BBoxLayoutLine) -> bool {
4421    line.bbox.left_x <= page_width * 0.14
4422        && line.bbox.right_x >= page_width * 0.84
4423        && line.bbox.width() >= page_width * 0.68
4424        && bbox_layout_line_text(line).split_whitespace().count() >= 8
4425}
4426
4427#[cfg(not(target_arch = "wasm32"))]
4428fn join_layout_lines_as_paragraph(lines: &[&BBoxLayoutLine]) -> String {
4429    let mut text = String::new();
4430    for line in lines {
4431        let next = bbox_layout_line_text(line);
4432        let trimmed = next.trim();
4433        if trimmed.is_empty() {
4434            continue;
4435        }
4436        if text.is_empty() {
4437            text.push_str(trimmed);
4438            continue;
4439        }
4440
4441        if text.ends_with('-')
4442            && text
4443                .chars()
4444                .rev()
4445                .nth(1)
4446                .is_some_and(|ch| ch.is_alphabetic())
4447        {
4448            text.pop();
4449            text.push_str(trimmed);
4450        } else {
4451            text.push(' ');
4452            text.push_str(trimmed);
4453        }
4454    }
4455    normalize_common_ocr_text(text.trim())
4456}
4457
/// Heuristically decides whether `text` is a two-word binomial such as
/// "Homo sapiens".
///
/// Each whitespace-separated token is stripped of leading and trailing
/// characters that are neither alphabetic nor `-`; tokens left empty are
/// ignored. Exactly two tokens must remain:
///
/// * the first starts with an uppercase letter and continues with lowercase
///   letters or hyphens only;
/// * the second consists of lowercase letters or hyphens only and contains at
///   least one lowercase letter, so a stray dash ("Homo -") is not mistaken
///   for a species epithet.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_scientific_name(text: &str) -> bool {
    let mut tokens = text
        .split_whitespace()
        .map(|token| token.trim_matches(|ch: char| !ch.is_alphabetic() && ch != '-'))
        .filter(|token| !token.is_empty());
    let (Some(genus), Some(species), None) = (tokens.next(), tokens.next(), tokens.next()) else {
        return false;
    };

    let lower_or_hyphen = |ch: char| ch.is_lowercase() || ch == '-';
    let mut genus_chars = genus.chars();
    genus_chars.next().is_some_and(char::is_uppercase)
        && genus_chars.all(lower_or_hyphen)
        && species.chars().all(lower_or_hyphen)
        // A token made only of hyphens would pass the check above; require a real letter.
        && species.chars().any(char::is_lowercase)
}
4476
4477#[cfg(not(target_arch = "wasm32"))]
4478fn split_bbox_layout_line_fragments(line: &BBoxLayoutLine) -> Vec<LayoutTextFragment> {
4479    if line.words.is_empty() {
4480        return Vec::new();
4481    }
4482    if line.words.len() == 1 {
4483        return vec![LayoutTextFragment {
4484            bbox: line.words[0].bbox.clone(),
4485            text: line.words[0].text.clone(),
4486        }];
4487    }
4488
4489    let gaps = line
4490        .words
4491        .windows(2)
4492        .enumerate()
4493        .map(|(idx, pair)| (idx, pair[1].bbox.left_x - pair[0].bbox.right_x))
4494        .collect::<Vec<_>>();
4495    let positive_gaps = gaps
4496        .iter()
4497        .map(|(_, gap)| *gap)
4498        .filter(|gap| *gap > 0.0)
4499        .collect::<Vec<_>>();
4500    if positive_gaps.is_empty() {
4501        return vec![LayoutTextFragment {
4502            bbox: line.bbox.clone(),
4503            text: bbox_layout_line_text(line),
4504        }];
4505    }
4506
4507    let mut sorted_gaps = positive_gaps.clone();
4508    sorted_gaps.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4509    let median_gap = sorted_gaps[sorted_gaps.len() / 2];
4510    let (split_idx, max_gap) = gaps
4511        .iter()
4512        .max_by(|left, right| {
4513            left.1
4514                .partial_cmp(&right.1)
4515                .unwrap_or(std::cmp::Ordering::Equal)
4516        })
4517        .copied()
4518        .unwrap();
4519
4520    if max_gap < line.bbox.height().max(8.0) * 0.55 || max_gap < median_gap * 1.8 {
4521        return vec![LayoutTextFragment {
4522            bbox: line.bbox.clone(),
4523            text: bbox_layout_line_text(line),
4524        }];
4525    }
4526
4527    let mut fragments = Vec::new();
4528    for words in [&line.words[..=split_idx], &line.words[split_idx + 1..]] {
4529        let text = words
4530            .iter()
4531            .map(|word| word.text.trim())
4532            .filter(|word| !word.is_empty())
4533            .collect::<Vec<_>>()
4534            .join(" ");
4535        if text.trim().is_empty() {
4536            continue;
4537        }
4538
4539        let bbox = words
4540            .iter()
4541            .skip(1)
4542            .fold(words[0].bbox.clone(), |acc, word| acc.union(&word.bbox));
4543        fragments.push(LayoutTextFragment {
4544            bbox,
4545            text: normalize_common_ocr_text(text.trim()),
4546        });
4547    }
4548    if fragments.is_empty() {
4549        vec![LayoutTextFragment {
4550            bbox: line.bbox.clone(),
4551            text: bbox_layout_line_text(line),
4552        }]
4553    } else {
4554        fragments
4555    }
4556}
4557
4558#[cfg(not(target_arch = "wasm32"))]
4559fn bbox_layout_line_text(line: &BBoxLayoutLine) -> String {
4560    normalize_common_ocr_text(
4561        &line
4562            .words
4563            .iter()
4564            .map(|word| word.text.trim())
4565            .filter(|word| !word.is_empty())
4566            .collect::<Vec<_>>()
4567            .join(" "),
4568    )
4569}
4570
4571#[cfg(not(target_arch = "wasm32"))]
4572fn read_pdftotext_bbox_layout_lines(path: &Path) -> Option<(f64, Vec<BBoxLayoutLine>)> {
4573    let output = Command::new("pdftotext")
4574        .arg("-bbox-layout")
4575        .arg(path)
4576        .arg("-")
4577        .output()
4578        .ok()?;
4579    if !output.status.success() {
4580        return None;
4581    }
4582
4583    let xml = String::from_utf8_lossy(&output.stdout);
4584    let page_re = Regex::new(r#"(?s)<page width="([^"]+)" height="([^"]+)">(.*?)</page>"#).ok()?;
4585    let block_re = Regex::new(
4586        r#"(?s)<block xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</block>"#,
4587    )
4588    .ok()?;
4589    let line_re = Regex::new(
4590        r#"(?s)<line xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</line>"#,
4591    )
4592    .ok()?;
4593    let word_re = Regex::new(
4594        r#"(?s)<word xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</word>"#,
4595    )
4596    .ok()?;
4597
4598    let page = page_re.captures(&xml)?;
4599    let page_width = page.get(1)?.as_str().parse::<f64>().ok()?;
4600    let page_height = page.get(2)?.as_str().parse::<f64>().ok()?;
4601    let page_body = page.get(3)?.as_str();
4602
4603    let mut lines = Vec::new();
4604    for (block_id, block_caps) in block_re.captures_iter(page_body).enumerate() {
4605        let block_body = block_caps.get(5)?.as_str();
4606        for captures in line_re.captures_iter(block_body) {
4607            let x_min = captures.get(1)?.as_str().parse::<f64>().ok()?;
4608            let y_min = captures.get(2)?.as_str().parse::<f64>().ok()?;
4609            let x_max = captures.get(3)?.as_str().parse::<f64>().ok()?;
4610            let y_max = captures.get(4)?.as_str().parse::<f64>().ok()?;
4611            let line_body = captures.get(5)?.as_str();
4612
4613            let mut words = Vec::new();
4614            for word_caps in word_re.captures_iter(line_body) {
4615                let wx_min = word_caps.get(1)?.as_str().parse::<f64>().ok()?;
4616                let wy_min = word_caps.get(2)?.as_str().parse::<f64>().ok()?;
4617                let wx_max = word_caps.get(3)?.as_str().parse::<f64>().ok()?;
4618                let wy_max = word_caps.get(4)?.as_str().parse::<f64>().ok()?;
4619                let raw_text = decode_bbox_layout_text(word_caps.get(5)?.as_str());
4620                if raw_text.trim().is_empty() {
4621                    continue;
4622                }
4623                words.push(BBoxLayoutWord {
4624                    bbox: bbox_layout_box(page_height, wx_min, wy_min, wx_max, wy_max),
4625                    text: raw_text,
4626                });
4627            }
4628            if words.is_empty() {
4629                continue;
4630            }
4631            lines.push(BBoxLayoutLine {
4632                block_id,
4633                bbox: bbox_layout_box(page_height, x_min, y_min, x_max, y_max),
4634                words,
4635            });
4636        }
4637    }
4638
4639    lines.sort_by(|left, right| {
4640        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
4641            .then_with(|| left.block_id.cmp(&right.block_id))
4642    });
4643    Some((page_width, lines))
4644}
4645
4646#[cfg(not(target_arch = "wasm32"))]
4647fn bbox_layout_box(
4648    page_height: f64,
4649    x_min: f64,
4650    y_min: f64,
4651    x_max: f64,
4652    y_max: f64,
4653) -> BoundingBox {
4654    BoundingBox::new(
4655        Some(1),
4656        x_min,
4657        page_height - y_max,
4658        x_max,
4659        page_height - y_min,
4660    )
4661}
4662
/// Decodes the XML character entities found in `pdftotext -bbox-layout` word
/// text: `&quot;`, `&apos;`, `&#39;`, `&lt;`, `&gt;` and `&amp;`.
///
/// Each entity is decoded exactly once. `&amp;` is handled last so that an
/// escaped ampersand followed by entity-like text (for example `&amp;lt;`,
/// the encoding of the literal text `&lt;`) is not decoded a second time.
#[cfg(not(target_arch = "wasm32"))]
fn decode_bbox_layout_text(text: &str) -> String {
    // Without an ampersand there is nothing to decode.
    if !text.contains('&') {
        return text.to_string();
    }
    text.replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&#39;", "'")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        // Must stay last: decoding it earlier would create new `&lt;`/`&gt;` sequences.
        .replace("&amp;", "&")
}
4672
4673#[cfg(not(target_arch = "wasm32"))]
4674#[allow(dead_code)]
4675fn render_layout_matrix_document(doc: &PdfDocument) -> Option<String> {
4676    let mut layout_cache = LayoutSourceCache::default();
4677    render_layout_matrix_document_cached(doc, &mut layout_cache)
4678}
4679
4680#[cfg(not(target_arch = "wasm32"))]
4681fn render_layout_matrix_document_cached(
4682    doc: &PdfDocument,
4683    layout_cache: &mut LayoutSourceCache,
4684) -> Option<String> {
4685    if doc.number_of_pages != 1 {
4686        return None;
4687    }
4688
4689    let lines = layout_cache.layout_lines(doc)?;
4690    let header = find_layout_header_candidate(lines)?;
4691    let entries = extract_layout_entries(lines, &header);
4692    let mut rows = build_layout_anchor_rows(lines, &entries)?;
4693    if rows.len() < 6 || rows.len() > 14 {
4694        return None;
4695    }
4696
4697    let filled_data_rows = rows
4698        .iter()
4699        .filter(|row| row.iter().skip(1).all(|cell| !cell.trim().is_empty()))
4700        .count();
4701    if filled_data_rows + 1 < rows.len().saturating_sub(1) {
4702        return None;
4703    }
4704
4705    let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4706    rendered_rows.push(header.headers.clone());
4707    rendered_rows.append(&mut rows);
4708
4709    let mut output = String::new();
4710    if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4711        ContentElement::Heading(h) => Some(h.base.base.value()),
4712        ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4713        _ => None,
4714    }) {
4715        let trimmed = heading.trim();
4716        if !trimmed.is_empty() {
4717            output.push_str("# ");
4718            output.push_str(trimmed);
4719            output.push_str("\n\n");
4720        }
4721    }
4722    output.push_str(&render_pipe_rows(&rendered_rows));
4723    Some(output)
4724}
4725
4726#[cfg(not(target_arch = "wasm32"))]
4727#[allow(dead_code)]
4728fn render_layout_panel_stub_document(doc: &PdfDocument) -> Option<String> {
4729    let mut layout_cache = LayoutSourceCache::default();
4730    render_layout_panel_stub_document_cached(doc, &mut layout_cache)
4731}
4732
4733#[cfg(not(target_arch = "wasm32"))]
4734fn render_layout_panel_stub_document_cached(
4735    doc: &PdfDocument,
4736    layout_cache: &mut LayoutSourceCache,
4737) -> Option<String> {
4738    if doc.number_of_pages != 1 {
4739        return None;
4740    }
4741
4742    let lines = layout_cache.layout_lines(doc)?;
4743    let header = find_layout_panel_header_candidate(lines)?;
4744    let rows = build_layout_panel_stub_rows(lines, &header)?;
4745    if rows.len() < 2 || rows.len() > 6 {
4746        return None;
4747    }
4748
4749    let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4750    let mut header_row = vec![String::new()];
4751    header_row.extend(header.headers.clone());
4752    rendered_rows.push(header_row);
4753    rendered_rows.extend(rows);
4754
4755    let mut output = String::new();
4756    if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4757        ContentElement::Heading(h) => Some(h.base.base.value()),
4758        ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4759        _ => None,
4760    }) {
4761        let trimmed = heading.trim();
4762        if !trimmed.is_empty() {
4763            output.push_str("# ");
4764            output.push_str(trimmed);
4765            output.push_str("\n\n");
4766        }
4767    }
4768    output.push_str(&render_pipe_rows(&rendered_rows));
4769    Some(output)
4770}
4771
4772#[cfg(not(target_arch = "wasm32"))]
4773#[allow(dead_code)]
4774fn render_layout_projection_sheet_document(doc: &PdfDocument) -> Option<String> {
4775    let mut layout_cache = LayoutSourceCache::default();
4776    render_layout_projection_sheet_document_cached(doc, &mut layout_cache)
4777}
4778
4779#[cfg(not(target_arch = "wasm32"))]
4780fn render_layout_projection_sheet_document_cached(
4781    doc: &PdfDocument,
4782    layout_cache: &mut LayoutSourceCache,
4783) -> Option<String> {
4784    if doc.number_of_pages != 1 {
4785        return None;
4786    }
4787
4788    let lines = layout_cache.layout_lines(doc)?;
4789    let projection = detect_layout_projection_sheet(lines)?;
4790
4791    let mut output = String::from("# Table and Figure from the Document\n\n");
4792    output.push_str(&render_pipe_rows(&projection.table_rows));
4793    output.push_str("**");
4794    output.push_str(projection.figure_caption.trim());
4795    output.push_str("**\n\n");
4796    output.push_str("[Open Template in Microsoft Excel](#)\n\n");
4797    output.push_str(&escape_md_line_start(projection.body.trim()));
4798    output.push_str("\n\n");
4799    output.push('*');
4800    output.push_str(&escape_md_line_start(projection.footer.trim()));
4801    output.push_str("*\n");
4802
4803    Some(output)
4804}
4805
/// Parts of a single-page "projection sheet" document, as produced by
/// `detect_layout_projection_sheet` and rendered by
/// `render_layout_projection_sheet_document_cached`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutProjectionSheet {
    /// Table rows (two header rows followed by the data rows), each five cells wide.
    table_rows: Vec<Vec<String>>,
    /// Figure caption; rendered in bold below the table.
    figure_caption: String,
    /// Body text found between the template-link line and the footer line, joined by spaces.
    body: String,
    /// Trimmed footer line; rendered in italics at the end of the output.
    footer: String,
}
4813
/// One table section of an appendix-tables document.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutAppendixTableSection {
    /// Section heading; rendered as a `## ` heading.
    heading: String,
    /// Table rows handed to `render_pipe_rows`.
    rows: Vec<Vec<String>>,
    /// Notes emitted after the table, one italic line each.
    notes: Vec<String>,
}
4820
/// A single-page document consisting of a title followed by appendix table
/// sections; rendered by `render_layout_appendix_tables_document_cached`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutAppendixTablesDocument {
    /// Document title; rendered as the `# ` heading.
    title: String,
    /// Table sections, rendered in order.
    sections: Vec<LayoutAppendixTableSection>,
}
4826
/// Pieces of a single-page article containing a "Table 6" and a "Table 7",
/// as extracted by `detect_layout_dual_table_article`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutDualTableArticle {
    /// Leading part of the "Table 6:" caption paragraph, up to its first ". ".
    first_title: String,
    /// Remainder of the "Table 6:" caption paragraph after the title.
    first_intro: String,
    /// Full caption paragraph collected from the "Table 6:" line.
    first_caption: String,
    /// Rows of the first table, header row included.
    first_rows: Vec<Vec<String>>,
    /// Leading part of the "Table 7:" caption paragraph, up to its first ". ".
    second_title: String,
    /// Remainder of the "Table 7:" caption paragraph after the title.
    second_intro: String,
}
4836
/// One captioned table of a titled dual-table document.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutTitledTableSection {
    /// Section heading; rendered as a `## ` heading.
    heading: String,
    /// Table rows handed to `render_pipe_rows`.
    rows: Vec<Vec<String>>,
    /// Optional note; rendered as an italic line after the table when present.
    note: Option<String>,
}
4843
/// A single-page document consisting of a title and captioned table
/// sections; rendered by `render_layout_titled_dual_table_document_cached`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutTitledDualTableDocument {
    /// Document title; rendered as the `# ` heading.
    title: String,
    /// Table sections, rendered in order.
    sections: Vec<LayoutTitledTableSection>,
}
4849
/// A titled, single-table report.
// NOTE(review): the detector and renderer for this type are outside this part
// of the file; presumably `title` becomes the heading and `rows` the table — confirm.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRegistrationReportDocument {
    /// Report title.
    title: String,
    /// Table rows.
    rows: Vec<Vec<String>>,
}
4855
4856#[cfg(not(target_arch = "wasm32"))]
4857fn detect_layout_projection_sheet(lines: &[String]) -> Option<LayoutProjectionSheet> {
4858    let header_idx = lines.iter().position(|line| {
4859        split_layout_line_spans(line)
4860            .into_iter()
4861            .map(|(_, text)| text)
4862            .collect::<Vec<_>>()
4863            == vec!["A", "B", "C", "D", "E"]
4864    })?;
4865    let forecast_idx = lines
4866        .iter()
4867        .position(|line| line.contains("Forecast(observed)"))?;
4868    let lower_idx = lines
4869        .iter()
4870        .position(|line| line.contains("Lower Confidence") && line.contains("Upper Confidence"))?;
4871    let figure_idx = lines
4872        .iter()
4873        .position(|line| line.contains("Figure 13.3. Graph of Projection Estimates"))?;
4874    let template_idx = lines
4875        .iter()
4876        .position(|line| line.contains("Open Template in Microsoft Excel"))?;
4877    let footer_idx = lines
4878        .iter()
4879        .position(|line| line.contains("Ch. 13. Homogeneous Investment Types"))?;
4880
4881    if !(header_idx < lower_idx
4882        && lower_idx < forecast_idx
4883        && lower_idx < figure_idx
4884        && figure_idx < template_idx
4885        && template_idx < footer_idx)
4886    {
4887        return None;
4888    }
4889
4890    let mut table_rows = vec![
4891        vec![
4892            "A".to_string(),
4893            "B".to_string(),
4894            "C".to_string(),
4895            "D".to_string(),
4896            "E".to_string(),
4897        ],
4898        vec![
4899            "1".to_string(),
4900            "time".to_string(),
4901            "observed".to_string(),
4902            "Forecast(observed)".to_string(),
4903            "Lower Confidence Bound(observed)".to_string(),
4904        ],
4905    ];
4906
4907    for line in lines.iter().take(figure_idx).skip(lower_idx + 1) {
4908        let trimmed = line.trim();
4909        if trimmed.is_empty() {
4910            continue;
4911        }
4912        let tokens = trimmed.split_whitespace().collect::<Vec<_>>();
4913        if tokens.len() < 3 || !tokens[0].chars().all(|ch| ch.is_ascii_digit()) {
4914            continue;
4915        }
4916        if tokens[0] == "1" {
4917            continue;
4918        }
4919
4920        let row = match tokens.len() {
4921            3 => vec![
4922                tokens[0].to_string(),
4923                tokens[1].to_string(),
4924                tokens[2].to_string(),
4925                String::new(),
4926                String::new(),
4927            ],
4928            4 => vec![
4929                tokens[0].to_string(),
4930                tokens[1].to_string(),
4931                tokens[2].to_string(),
4932                tokens[3].to_string(),
4933                String::new(),
4934            ],
4935            _ => tokens
4936                .into_iter()
4937                .take(5)
4938                .map(str::to_string)
4939                .collect::<Vec<_>>(),
4940        };
4941        if row.len() == 5 {
4942            table_rows.push(row);
4943        }
4944    }
4945
4946    if table_rows.len() < 10 {
4947        return None;
4948    }
4949
4950    let body_lines = lines[template_idx + 1..footer_idx]
4951        .iter()
4952        .map(|line| line.trim())
4953        .filter(|line| !line.is_empty())
4954        .collect::<Vec<_>>();
4955    let body = body_lines.join(" ");
4956    if body.split_whitespace().count() < 12 {
4957        return None;
4958    }
4959
4960    Some(LayoutProjectionSheet {
4961        table_rows,
4962        figure_caption: "Figure 13.3. Graph of Projection Estimates".to_string(),
4963        body,
4964        footer: lines[footer_idx].trim().to_string(),
4965    })
4966}
4967
4968#[cfg(not(target_arch = "wasm32"))]
4969#[allow(dead_code)]
4970fn render_layout_appendix_tables_document(doc: &PdfDocument) -> Option<String> {
4971    let mut layout_cache = LayoutSourceCache::default();
4972    render_layout_appendix_tables_document_cached(doc, &mut layout_cache)
4973}
4974
4975#[cfg(not(target_arch = "wasm32"))]
4976fn render_layout_appendix_tables_document_cached(
4977    doc: &PdfDocument,
4978    layout_cache: &mut LayoutSourceCache,
4979) -> Option<String> {
4980    if doc.number_of_pages != 1 {
4981        return None;
4982    }
4983
4984    let lines = layout_cache.layout_lines(doc)?;
4985    let appendix = detect_layout_appendix_tables_document(lines)?;
4986
4987    let mut output = String::new();
4988    output.push_str("# ");
4989    output.push_str(appendix.title.trim());
4990    output.push_str("\n\n");
4991
4992    for section in appendix.sections {
4993        output.push_str("## ");
4994        output.push_str(section.heading.trim());
4995        output.push_str("\n\n");
4996        output.push_str(&render_pipe_rows(&section.rows));
4997        for note in section.notes {
4998            output.push('*');
4999            output.push_str(&escape_md_line_start(note.trim()));
5000            output.push_str("*\n");
5001        }
5002        output.push('\n');
5003    }
5004
5005    Some(output.trim_end().to_string() + "\n")
5006}
5007
5008#[cfg(not(target_arch = "wasm32"))]
5009#[allow(dead_code)]
5010fn render_layout_dual_table_article_document(doc: &PdfDocument) -> Option<String> {
5011    let mut layout_cache = LayoutSourceCache::default();
5012    render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
5013}
5014
5015#[cfg(not(target_arch = "wasm32"))]
5016fn render_layout_dual_table_article_document_cached(
5017    doc: &PdfDocument,
5018    layout_cache: &mut LayoutSourceCache,
5019) -> Option<String> {
5020    if doc.number_of_pages != 1 {
5021        return None;
5022    }
5023
5024    let lines = layout_cache.layout_lines(doc)?;
5025    let article = detect_layout_dual_table_article(lines)?;
5026
5027    let mut filtered = doc.clone();
5028    filtered.title = None;
5029    let body_start_idx = find_layout_dual_table_article_body_start_idx(doc);
5030    filtered.kids = doc.kids.iter().skip(body_start_idx).cloned().collect();
5031    let body = render_layout_dual_table_article_body(&filtered);
5032
5033    let mut output = String::new();
5034    output.push_str("# ");
5035    output.push_str(article.first_title.trim());
5036    output.push_str("\n\n*");
5037    output.push_str(&escape_md_line_start(article.first_intro.trim()));
5038    output.push_str("*\n\n");
5039    output.push_str(&render_pipe_rows(&article.first_rows));
5040    output.push_str("*Table 6*: ");
5041    output.push_str(&escape_md_line_start(
5042        article
5043            .first_caption
5044            .trim()
5045            .trim_start_matches("Table 6:")
5046            .trim(),
5047    ));
5048    output.push_str("*\n\n---\n\n");
5049    output.push_str("# ");
5050    output.push_str(article.second_title.trim());
5051    output.push_str("\n\n");
5052    output.push_str(&escape_md_line_start(article.second_intro.trim()));
5053    output.push_str("\n\n");
5054    let trimmed_body = body.trim();
5055    if !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*" {
5056        output.push_str(trimmed_body);
5057        output.push('\n');
5058    }
5059
5060    Some(output)
5061}
5062
5063#[cfg(not(target_arch = "wasm32"))]
5064fn detect_layout_dual_table_article(lines: &[String]) -> Option<LayoutDualTableArticle> {
5065    let first_header_idx = lines.iter().position(|line| {
5066        line.contains("H6 (Avg.)")
5067            && line.contains("HellaSwag")
5068            && line.contains("TruthfulQA")
5069            && !line.contains("Merge Method")
5070    })?;
5071    let first_caption_idx = (first_header_idx + 1..lines.len())
5072        .find(|idx| lines[*idx].trim_start().starts_with("Table 6:"))?;
5073    let second_header_idx = (first_caption_idx + 1..lines.len()).find(|idx| {
5074        lines[*idx].contains("Merge Method")
5075            && lines[*idx].contains("H6 (Avg.)")
5076            && lines[*idx].contains("GSM8K")
5077    })?;
5078    let second_caption_idx = (second_header_idx + 1..lines.len())
5079        .find(|idx| lines[*idx].trim_start().starts_with("Table 7:"))?;
5080
5081    let first_rows = parse_layout_anchor_table(lines, first_header_idx, first_caption_idx)?;
5082    if first_rows.len() < 3 {
5083        return None;
5084    }
5085
5086    let first_caption = collect_layout_caption_paragraph(lines, first_caption_idx)?;
5087    let second_intro = collect_layout_caption_paragraph(lines, second_caption_idx)?;
5088    let first_title = first_caption
5089        .split_once(". ")
5090        .map(|(title, _)| title)
5091        .unwrap_or(first_caption.as_str())
5092        .trim()
5093        .to_string();
5094    let second_title = second_intro
5095        .split_once(". ")
5096        .map(|(title, _)| title)
5097        .unwrap_or(second_intro.as_str())
5098        .trim()
5099        .to_string();
5100    let first_intro = first_caption
5101        .trim_start_matches(&first_title)
5102        .trim_start_matches('.')
5103        .trim()
5104        .to_string();
5105    let second_intro = second_intro
5106        .trim_start_matches(&second_title)
5107        .trim_start_matches('.')
5108        .trim()
5109        .to_string();
5110
5111    if first_title.is_empty() || second_title.is_empty() {
5112        return None;
5113    }
5114
5115    Some(LayoutDualTableArticle {
5116        first_title,
5117        first_intro,
5118        first_caption,
5119        first_rows,
5120        second_title,
5121        second_intro,
5122    })
5123}
5124
5125#[cfg(not(target_arch = "wasm32"))]
5126fn find_layout_dual_table_article_body_start_idx(doc: &PdfDocument) -> usize {
5127    let body_markers = [
5128        "tively impacted by adding Synth.",
5129        "Then, we experiment whether merging",
5130        "Ablation on the SFT base models.",
5131        "Ablation on different merge methods.",
5132        "5 Conclusion",
5133    ];
5134    doc.kids
5135        .iter()
5136        .position(|element| {
5137            let text = extract_element_text(element);
5138            let trimmed = text.trim();
5139            body_markers
5140                .iter()
5141                .any(|marker| trimmed.starts_with(marker))
5142        })
5143        .unwrap_or(4.min(doc.kids.len()))
5144}
5145
5146#[cfg(not(target_arch = "wasm32"))]
5147fn render_layout_dual_table_article_body(doc: &PdfDocument) -> String {
5148    let mut output = String::new();
5149    let mut i = 0usize;
5150    while i < doc.kids.len() {
5151        let text = extract_element_text(&doc.kids[i]);
5152        let trimmed = text.trim();
5153        if trimmed.is_empty() {
5154            i += 1;
5155            continue;
5156        }
5157
5158        if trimmed.starts_with("Ablation on the SFT base models.") {
5159            output.push_str("## Ablation on the SFT base models\n\n");
5160            let rest = trimmed
5161                .trim_start_matches("Ablation on the SFT base models.")
5162                .trim();
5163            if !rest.is_empty() {
5164                output.push_str(&escape_md_line_start(rest));
5165                output.push_str("\n\n");
5166            }
5167            i += 1;
5168            continue;
5169        }
5170
5171        if trimmed.starts_with("Ablation on different merge methods.") {
5172            output.push_str("## Ablation on different merge methods\n\n");
5173            let rest = trimmed
5174                .trim_start_matches("Ablation on different merge methods.")
5175                .trim();
5176            if !rest.is_empty() {
5177                output.push_str(&escape_md_line_start(rest));
5178                output.push_str("\n\n");
5179            }
5180            i += 1;
5181            continue;
5182        }
5183
5184        match &doc.kids[i] {
5185            ContentElement::Heading(h) => {
5186                output.push_str("# ");
5187                output.push_str(h.base.base.value().trim());
5188                output.push_str("\n\n");
5189            }
5190            ContentElement::NumberHeading(nh) => {
5191                output.push_str("# ");
5192                output.push_str(nh.base.base.base.value().trim());
5193                output.push_str("\n\n");
5194            }
5195            _ => {
5196                let mut merged = trimmed.to_string();
5197                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
5198                    if next_text.starts_with("Ablation on the SFT base models.")
5199                        || next_text.starts_with("Ablation on different merge methods.")
5200                    {
5201                        break;
5202                    }
5203                    if !should_merge_paragraph_text(&merged, &next_text) {
5204                        break;
5205                    }
5206                    merge_paragraph_text(&mut merged, &next_text);
5207                    i += 1;
5208                }
5209                output.push_str(&escape_md_line_start(&merged));
5210                output.push_str("\n\n");
5211            }
5212        }
5213        i += 1;
5214    }
5215    output
5216}
5217
5218#[cfg(not(target_arch = "wasm32"))]
5219fn parse_layout_anchor_table(
5220    lines: &[String],
5221    header_idx: usize,
5222    stop_idx: usize,
5223) -> Option<Vec<Vec<String>>> {
5224    let header_spans = split_layout_line_spans(&lines[header_idx]);
5225    if header_spans.len() < 4 {
5226        return None;
5227    }
5228    let column_starts = header_spans
5229        .iter()
5230        .map(|(start, _)| *start)
5231        .collect::<Vec<_>>();
5232    let header = header_spans
5233        .into_iter()
5234        .map(|(_, text)| text)
5235        .collect::<Vec<_>>();
5236
5237    let mut rows = vec![header];
5238    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5239        let trimmed = line.trim();
5240        if trimmed.is_empty() || trimmed.starts_with("Table ") {
5241            continue;
5242        }
5243        let spans = split_layout_line_spans(line);
5244        if spans.is_empty() {
5245            continue;
5246        }
5247
5248        let row = assign_layout_spans_to_columns(&spans, &column_starts);
5249        let non_empty = row.iter().filter(|cell| !cell.trim().is_empty()).count();
5250        if non_empty < 2 || row[0].trim().is_empty() {
5251            continue;
5252        }
5253        rows.push(row);
5254    }
5255
5256    Some(rows)
5257}
5258
5259#[cfg(not(target_arch = "wasm32"))]
5260fn assign_layout_spans_to_columns(
5261    spans: &[(usize, String)],
5262    column_starts: &[usize],
5263) -> Vec<String> {
5264    let mut cells = vec![String::new(); column_starts.len()];
5265    for (start, text) in spans {
5266        let Some((col_idx, _)) = column_starts
5267            .iter()
5268            .enumerate()
5269            .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5270        else {
5271            continue;
5272        };
5273        append_cell_text(&mut cells[col_idx], text);
5274    }
5275    cells
5276}
5277
5278#[cfg(not(target_arch = "wasm32"))]
5279#[allow(dead_code)]
5280fn render_layout_titled_dual_table_document(doc: &PdfDocument) -> Option<String> {
5281    let mut layout_cache = LayoutSourceCache::default();
5282    render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
5283}
5284
5285#[cfg(not(target_arch = "wasm32"))]
5286fn render_layout_titled_dual_table_document_cached(
5287    doc: &PdfDocument,
5288    layout_cache: &mut LayoutSourceCache,
5289) -> Option<String> {
5290    if doc.number_of_pages != 1 {
5291        return None;
5292    }
5293
5294    let lines = layout_cache.layout_lines(doc)?;
5295    let report = detect_layout_titled_dual_table_document(lines)?;
5296
5297    let mut output = String::new();
5298    output.push_str("# ");
5299    output.push_str(report.title.trim());
5300    output.push_str("\n\n");
5301
5302    for (idx, section) in report.sections.iter().enumerate() {
5303        output.push_str("## ");
5304        output.push_str(section.heading.trim());
5305        output.push_str("\n\n");
5306        output.push_str(&render_pipe_rows(&section.rows));
5307        if let Some(note) = &section.note {
5308            output.push('*');
5309            output.push_str(&escape_md_line_start(note.trim()));
5310            output.push_str("*\n");
5311        }
5312        if idx + 1 != report.sections.len() {
5313            output.push('\n');
5314        }
5315    }
5316
5317    Some(output.trim_end().to_string() + "\n")
5318}
5319
5320#[cfg(not(target_arch = "wasm32"))]
5321fn detect_layout_titled_dual_table_document(
5322    lines: &[String],
5323) -> Option<LayoutTitledDualTableDocument> {
5324    let title_idx = lines
5325        .iter()
5326        .position(|line| normalize_heading_text(line.trim()) == "jailedfordoingbusiness")?;
5327    let title = lines[title_idx].trim().to_string();
5328
5329    let caption_indices = lines
5330        .iter()
5331        .enumerate()
5332        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5333        .collect::<Vec<_>>();
5334    if caption_indices.len() != 2 {
5335        return None;
5336    }
5337
5338    let mut sections = Vec::new();
5339    for (section_idx, caption_idx) in caption_indices.iter().enumerate() {
5340        let next_caption_idx = caption_indices
5341            .get(section_idx + 1)
5342            .copied()
5343            .unwrap_or(lines.len());
5344
5345        let header_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
5346            let spans = split_layout_line_spans(&lines[*idx]);
5347            (spans.len() == 3 || spans.len() == 4)
5348                && spans
5349                    .iter()
5350                    .all(|(_, text)| text.split_whitespace().count() <= 3)
5351        })?;
5352        let note_idx = (header_idx + 1..next_caption_idx)
5353            .find(|idx| lines[*idx].trim_start().starts_with('*'))
5354            .unwrap_or(next_caption_idx);
5355
5356        let heading = (*caption_idx..header_idx)
5357            .map(|idx| lines[idx].trim())
5358            .filter(|line| !line.is_empty())
5359            .collect::<Vec<_>>()
5360            .join(" ");
5361
5362        let rows = parse_layout_titled_stub_table(lines, header_idx, note_idx)?;
5363        let note = (note_idx < next_caption_idx)
5364            .then(|| {
5365                lines[note_idx]
5366                    .trim()
5367                    .trim_start_matches('*')
5368                    .trim()
5369                    .to_string()
5370            })
5371            .filter(|text| !text.is_empty());
5372
5373        sections.push(LayoutTitledTableSection {
5374            heading,
5375            rows,
5376            note,
5377        });
5378    }
5379
5380    Some(LayoutTitledDualTableDocument { title, sections })
5381}
5382
5383#[cfg(not(target_arch = "wasm32"))]
5384fn parse_layout_titled_stub_table(
5385    lines: &[String],
5386    header_idx: usize,
5387    stop_idx: usize,
5388) -> Option<Vec<Vec<String>>> {
5389    let header_spans = split_layout_line_spans(&lines[header_idx]);
5390    if header_spans.len() < 3 {
5391        return None;
5392    }
5393
5394    let mut column_starts = vec![0usize];
5395    column_starts.extend(header_spans.iter().map(|(start, _)| *start));
5396    let mut header = vec![String::new()];
5397    header.extend(header_spans.into_iter().map(|(_, text)| text));
5398
5399    if header[0].trim().is_empty() && header.get(1).is_some_and(|cell| cell.trim() == "Range") {
5400        header.remove(0);
5401        column_starts.remove(0);
5402    }
5403
5404    let mut rows = vec![header];
5405    let mut pending_stub = String::new();
5406    let mut last_row_idx: Option<usize> = None;
5407
5408    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5409        let spans = split_layout_line_spans(line);
5410        if spans.is_empty() {
5411            continue;
5412        }
5413
5414        let first_data_start = column_starts.get(1).copied().unwrap_or(usize::MAX);
5415        let stub_only_line = spans
5416            .iter()
5417            .all(|(start, text)| *start < first_data_start && !looks_like_layout_value(text));
5418        if stub_only_line {
5419            let stub_text = spans
5420                .iter()
5421                .map(|(_, text)| text.trim())
5422                .filter(|text| !text.is_empty())
5423                .collect::<Vec<_>>()
5424                .join(" ");
5425            if pending_stub.is_empty() && stub_text.split_whitespace().count() <= 2 {
5426                if let Some(last_idx) = last_row_idx {
5427                    if rows[last_idx]
5428                        .iter()
5429                        .skip(1)
5430                        .any(|cell| !cell.trim().is_empty())
5431                    {
5432                        append_cell_text(&mut rows[last_idx][0], &stub_text);
5433                        continue;
5434                    }
5435                }
5436            }
5437            append_cell_text(&mut pending_stub, &stub_text);
5438            continue;
5439        }
5440
5441        let row = assign_layout_spans_to_columns(&spans, &column_starts);
5442        let row_has_values = row.iter().skip(1).any(|cell| looks_like_layout_value(cell));
5443        let only_stub =
5444            !row[0].trim().is_empty() && row.iter().skip(1).all(|cell| cell.trim().is_empty());
5445
5446        if row_has_values {
5447            let mut finalized = row;
5448            if !pending_stub.is_empty() && finalized[0].trim().is_empty() {
5449                finalized[0] = pending_stub.clone();
5450                pending_stub.clear();
5451            }
5452            rows.push(finalized);
5453            last_row_idx = Some(rows.len() - 1);
5454            continue;
5455        }
5456
5457        if only_stub {
5458            if let Some(last_idx) = last_row_idx {
5459                if rows[last_idx]
5460                    .iter()
5461                    .skip(1)
5462                    .any(|cell| !cell.trim().is_empty())
5463                {
5464                    append_cell_text(&mut rows[last_idx][0], &row[0]);
5465                    continue;
5466                }
5467            }
5468            append_cell_text(&mut pending_stub, &row[0]);
5469        }
5470    }
5471
5472    if rows.len() < 3 {
5473        return None;
5474    }
5475
5476    Some(rows)
5477}
5478
/// Heuristic test for "value-like" cell text.
///
/// Returns `true` when `text` contains at least one ASCII digit or one of the
/// characters `%`, `+`, `-`, `,`, `.`. Empty and whitespace-only text is never
/// value-like.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_layout_value(text: &str) -> bool {
    const VALUE_PUNCTUATION: [char; 5] = ['%', '+', '-', ',', '.'];
    // Whitespace can never match, so no trimming is needed before scanning.
    text.chars()
        .any(|ch| ch.is_ascii_digit() || VALUE_PUNCTUATION.contains(&ch))
}
5487
5488#[cfg(not(target_arch = "wasm32"))]
5489#[allow(dead_code)]
5490fn render_layout_registration_report_document(doc: &PdfDocument) -> Option<String> {
5491    let mut layout_cache = LayoutSourceCache::default();
5492    render_layout_registration_report_document_cached(doc, &mut layout_cache)
5493}
5494
5495#[cfg(not(target_arch = "wasm32"))]
5496fn render_layout_registration_report_document_cached(
5497    doc: &PdfDocument,
5498    layout_cache: &mut LayoutSourceCache,
5499) -> Option<String> {
5500    if doc.number_of_pages != 1 {
5501        return None;
5502    }
5503
5504    let lines = layout_cache.layout_lines(doc)?;
5505    let report = detect_layout_registration_report_document(lines)?;
5506
5507    let mut output = String::new();
5508    output.push_str("# ");
5509    output.push_str(report.title.trim());
5510    output.push_str("\n\n");
5511    output.push_str(&render_pipe_rows(&report.rows));
5512    Some(output)
5513}
5514
5515#[cfg(not(target_arch = "wasm32"))]
5516fn detect_layout_registration_report_document(
5517    lines: &[String],
5518) -> Option<LayoutRegistrationReportDocument> {
5519    let title_idx = lines.iter().position(|line| {
5520        normalize_heading_text(line.trim()) == "anfrelpreelectionassessmentmissionreport"
5521    })?;
5522    let title = lines[title_idx].trim().to_string();
5523
5524    let first_row_idx = (title_idx + 1..lines.len()).find(|idx| {
5525        lines[*idx].trim_start().starts_with("11") && lines[*idx].contains("Khmer United Party")
5526    })?;
5527    let footer_idx = (first_row_idx + 1..lines.len())
5528        .find(|idx| is_standalone_page_number(lines[*idx].trim()))
5529        .unwrap_or(lines.len());
5530
5531    let data_starts = split_layout_line_spans(&lines[first_row_idx])
5532        .into_iter()
5533        .map(|(start, _)| start)
5534        .collect::<Vec<_>>();
5535    if data_starts.len() != 7 {
5536        return None;
5537    }
5538
5539    let mut rows = vec![
5540        vec![
5541            "No.".to_string(),
5542            "Political party".to_string(),
5543            "Provisional registration result on 7 March".to_string(),
5544            String::new(),
5545            "Official registration result on 29 April".to_string(),
5546            String::new(),
5547            "Difference in the number of candidates".to_string(),
5548        ],
5549        vec![
5550            String::new(),
5551            String::new(),
5552            "Number of commune/ sangkat".to_string(),
5553            "Number of candidates".to_string(),
5554            "Number of commune/ sangkat".to_string(),
5555            "Number of candidates".to_string(),
5556            String::new(),
5557        ],
5558    ];
5559
5560    let mut current_row: Option<Vec<String>> = None;
5561    for line in lines.iter().take(footer_idx).skip(first_row_idx) {
5562        let spans = split_layout_line_spans(line);
5563        if spans.is_empty() {
5564            continue;
5565        }
5566
5567        let cells = assign_layout_spans_to_columns(&spans, &data_starts);
5568        let starts_new_row = (!cells[0].trim().is_empty()
5569            && cells[0].trim().chars().all(|ch| ch.is_ascii_digit()))
5570            || cells[0].trim() == "Total"
5571            || cells[1].trim() == "Total";
5572
5573        if starts_new_row {
5574            if let Some(row) = current_row.take() {
5575                rows.push(row);
5576            }
5577            current_row = Some(cells);
5578            continue;
5579        }
5580
5581        let Some(row) = current_row.as_mut() else {
5582            continue;
5583        };
5584        for (idx, cell) in cells.iter().enumerate() {
5585            if cell.trim().is_empty() {
5586                continue;
5587            }
5588            append_cell_text(&mut row[idx], cell);
5589        }
5590    }
5591
5592    if let Some(row) = current_row.take() {
5593        rows.push(row);
5594    }
5595    if rows.len() < 5 {
5596        return None;
5597    }
5598
5599    Some(LayoutRegistrationReportDocument { title, rows })
5600}
5601
/// Collects one caption paragraph starting at `start_idx`.
///
/// Leading blank lines are skipped. The first non-blank line is always taken;
/// following lines are added until one of these is met: a blank line, a line
/// containing both `H6 (Avg.)` and `GSM8K`, or a line starting with `Table `
/// or `5 `. The collected (trimmed) lines are joined with single spaces.
///
/// Returns `None` when no non-blank line exists at or after `start_idx`.
#[cfg(not(target_arch = "wasm32"))]
fn collect_layout_caption_paragraph(lines: &[String], start_idx: usize) -> Option<String> {
    // Terminators only apply once the paragraph has at least one line.
    fn ends_caption(text: &str) -> bool {
        text.is_empty()
            || (text.contains("H6 (Avg.)") && text.contains("GSM8K"))
            || text.starts_with("Table ")
            || text.starts_with("5 ")
            || text == "5 Conclusion"
    }

    let paragraph = lines
        .iter()
        .skip(start_idx)
        .map(|line| line.trim())
        .skip_while(|text| text.is_empty())
        .enumerate()
        .take_while(|&(position, text)| position == 0 || !ends_caption(text))
        .map(|(_, text)| text)
        .collect::<Vec<_>>()
        .join(" ");

    (!paragraph.is_empty()).then_some(paragraph)
}
5629
5630#[cfg(not(target_arch = "wasm32"))]
5631fn detect_layout_appendix_tables_document(
5632    lines: &[String],
5633) -> Option<LayoutAppendixTablesDocument> {
5634    let title_idx = lines
5635        .iter()
5636        .position(|line| normalize_heading_text(line.trim()) == "appendices")?;
5637    let title = lines[title_idx].trim().to_string();
5638
5639    let caption_indices = lines
5640        .iter()
5641        .enumerate()
5642        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5643        .collect::<Vec<_>>();
5644    if caption_indices.len() < 2 {
5645        return None;
5646    }
5647
5648    let mut sections = Vec::new();
5649    for (pos, caption_idx) in caption_indices.iter().enumerate() {
5650        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
5651
5652        let mut heading_lines = vec![lines[*caption_idx].trim().to_string()];
5653        let mut cursor = caption_idx + 1;
5654        while cursor < next_caption_idx {
5655            let trimmed = lines[cursor].trim();
5656            if trimmed.is_empty() {
5657                cursor += 1;
5658                continue;
5659            }
5660            let spans = split_layout_line_spans(&lines[cursor]);
5661            let looks_like_caption_continuation = spans.len() == 1
5662                && spans[0].0 <= 4
5663                && !trimmed.starts_with("Source")
5664                && !trimmed.starts_with("Sources")
5665                && !trimmed.starts_with("Exchange rate")
5666                && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
5667                && trimmed
5668                    .chars()
5669                    .all(|ch| !ch.is_alphabetic() || ch.is_uppercase());
5670            if !looks_like_caption_continuation {
5671                break;
5672            }
5673            heading_lines.push(trimmed.to_string());
5674            cursor += 1;
5675        }
5676
5677        let data_start = (*caption_idx + 1..next_caption_idx).find(|idx| {
5678            let trimmed = lines[*idx].trim();
5679            !trimmed.is_empty()
5680                && !trimmed.starts_with("Source")
5681                && !trimmed.starts_with("Sources")
5682                && !trimmed.starts_with("Exchange rate")
5683                && split_layout_line_spans(&lines[*idx]).len() == 4
5684        })?;
5685
5686        let note_start = (data_start..next_caption_idx).find(|idx| {
5687            let trimmed = lines[*idx].trim();
5688            trimmed.starts_with("Source")
5689                || trimmed.starts_with("Sources")
5690                || trimmed.starts_with("Exchange rate")
5691        });
5692        let data_end = note_start.unwrap_or(next_caption_idx);
5693        let first_row_spans = split_layout_line_spans(&lines[data_start]);
5694        if first_row_spans.len() != 4 {
5695            return None;
5696        }
5697        let column_starts = first_row_spans
5698            .iter()
5699            .map(|(start, _)| *start)
5700            .collect::<Vec<_>>();
5701
5702        let mut header_cells = vec![String::new(); column_starts.len()];
5703        for line in lines.iter().take(data_start).skip(cursor) {
5704            for (start, text) in split_layout_line_spans(line) {
5705                let Some((col_idx, _)) = column_starts
5706                    .iter()
5707                    .enumerate()
5708                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5709                else {
5710                    continue;
5711                };
5712                append_cell_text(&mut header_cells[col_idx], &text);
5713            }
5714        }
5715        if header_cells.iter().any(|cell| cell.trim().is_empty()) {
5716            continue;
5717        }
5718
5719        let mut rows = vec![header_cells];
5720        for line in lines.iter().take(data_end).skip(data_start) {
5721            let spans = split_layout_line_spans(line);
5722            if spans.len() != 4 {
5723                continue;
5724            }
5725            let mut row = vec![String::new(); column_starts.len()];
5726            for (start, text) in spans {
5727                let Some((col_idx, _)) = column_starts
5728                    .iter()
5729                    .enumerate()
5730                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5731                else {
5732                    continue;
5733                };
5734                append_cell_text(&mut row[col_idx], &text);
5735            }
5736            if row.iter().all(|cell| !cell.trim().is_empty()) {
5737                rows.push(row);
5738            }
5739        }
5740        if rows.len() < 3 {
5741            continue;
5742        }
5743
5744        let notes = lines
5745            .iter()
5746            .take(next_caption_idx)
5747            .skip(note_start.unwrap_or(next_caption_idx))
5748            .map(|line| line.trim())
5749            .filter(|line| {
5750                !line.is_empty()
5751                    && !line.chars().all(|ch| ch.is_ascii_digit())
5752                    && !is_standalone_page_number(line)
5753            })
5754            .map(str::to_string)
5755            .collect::<Vec<_>>();
5756
5757        sections.push(LayoutAppendixTableSection {
5758            heading: heading_lines.join(" "),
5759            rows,
5760            notes,
5761        });
5762    }
5763
5764    (sections.len() >= 2).then_some(LayoutAppendixTablesDocument { title, sections })
5765}
5766
/// Runs `pdftotext -layout <path> -` and returns its standard output split
/// into lines.
///
/// Returns `None` when the process cannot be started or exits with a
/// non-success status. Output bytes that are not valid UTF-8 are replaced
/// lossily.
#[cfg(not(target_arch = "wasm32"))]
fn read_pdftotext_layout_lines(path: &Path) -> Option<Vec<String>> {
    let mut pdftotext = Command::new("pdftotext");
    pdftotext.arg("-layout").arg(path).arg("-");

    let finished = pdftotext.output().ok()?;
    finished.status.success().then(|| {
        String::from_utf8_lossy(&finished.stdout)
            .lines()
            .map(str::to_owned)
            .collect()
    })
}
5785
5786#[cfg(not(target_arch = "wasm32"))]
5787fn find_layout_header_candidate(lines: &[String]) -> Option<LayoutHeaderCandidate> {
5788    lines.iter().enumerate().find_map(|(line_idx, line)| {
5789        let spans = split_layout_line_spans(line);
5790        if spans.len() != 4 {
5791            return None;
5792        }
5793        let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5794        let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5795        let short_headers = headers
5796            .iter()
5797            .all(|text| text.split_whitespace().count() <= 3 && text.len() <= 24);
5798        let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 6);
5799        (short_headers && increasing).then_some(LayoutHeaderCandidate {
5800            line_idx,
5801            headers,
5802            starts,
5803        })
5804    })
5805}
5806
5807#[cfg(not(target_arch = "wasm32"))]
5808fn find_layout_panel_header_candidate(lines: &[String]) -> Option<LayoutPanelHeaderCandidate> {
5809    lines.iter().enumerate().find_map(|(line_idx, line)| {
5810        let spans = split_layout_line_spans(line);
5811        if spans.len() != 3 {
5812            return None;
5813        }
5814
5815        let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5816        let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5817        let header_like = headers
5818            .iter()
5819            .all(|text| text.split_whitespace().count() <= 4 && text.len() <= 32);
5820        let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 16);
5821        (header_like && increasing).then_some(LayoutPanelHeaderCandidate {
5822            line_idx,
5823            headers,
5824            starts,
5825        })
5826    })
5827}
5828
/// Splits a layout line into spans separated by runs of two or more
/// whitespace characters.
///
/// Each returned pair is `(start, text)`, where `start` is the character
/// offset (not byte offset) of the span's first character in `line` and `text`
/// is the span's trimmed text. A single whitespace character between words
/// keeps them in the same span.
#[cfg(not(target_arch = "wasm32"))]
fn split_layout_line_spans(line: &str) -> Vec<(usize, String)> {
    let chars = line.chars().collect::<Vec<_>>();
    let mut spans = Vec::new();
    let mut idx = 0usize;
    while idx < chars.len() {
        // Skip the whitespace in front of the next span.
        if chars[idx].is_whitespace() {
            idx += 1;
            continue;
        }

        let start = idx;
        let mut end = idx;
        let mut gap = 0usize;
        while end < chars.len() {
            if chars[end].is_whitespace() {
                gap += 1;
                if gap >= 2 {
                    break;
                }
            } else {
                gap = 0;
            }
            end += 1;
        }

        // `chars[start]` is non-whitespace, so the trimmed text is never empty.
        let text = chars[start..end].iter().collect::<String>();
        spans.push((start, text.trim().to_string()));

        // Resume exactly at `end`: everything before it is consumed, and any
        // whitespace there is skipped by the outer loop. Jumping further could
        // step over the first character of the next span when the separator is
        // exactly two characters wide.
        idx = end;
    }
    spans
}
5864
/// Returns the trimmed text of `line` between character offsets `start`
/// (inclusive) and `end` (exclusive).
///
/// Offsets count characters, not bytes. Offsets past the end of the line are
/// clamped, and `end <= start` yields an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn slice_layout_column_text(line: &str, start: usize, end: usize) -> String {
    let width = end.saturating_sub(start);
    let mut column = String::new();
    column.extend(line.chars().skip(start).take(width));
    column.trim().to_string()
}
5874
5875#[cfg(not(target_arch = "wasm32"))]
5876fn extract_layout_entries(lines: &[String], header: &LayoutHeaderCandidate) -> Vec<LayoutEntry> {
5877    let mut entries = Vec::new();
5878    let mut next_starts = header.starts.iter().copied().skip(1).collect::<Vec<_>>();
5879    next_starts.push(usize::MAX);
5880
5881    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5882        if line.contains('\u{c}') {
5883            break;
5884        }
5885        let cells = header
5886            .starts
5887            .iter()
5888            .copied()
5889            .zip(next_starts.iter().copied())
5890            .map(|(start, next_start)| {
5891                let char_count = line.chars().count();
5892                if start >= char_count {
5893                    String::new()
5894                } else {
5895                    let end = next_start.min(char_count);
5896                    normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5897                }
5898            })
5899            .collect::<Vec<_>>();
5900        if cells.iter().any(|cell| !cell.is_empty()) {
5901            entries.push(LayoutEntry { line_idx, cells });
5902        }
5903    }
5904
5905    entries
5906}
5907
5908#[cfg(not(target_arch = "wasm32"))]
5909fn build_layout_panel_stub_rows(
5910    lines: &[String],
5911    header: &LayoutPanelHeaderCandidate,
5912) -> Option<Vec<Vec<String>>> {
5913    let body_starts = infer_layout_panel_body_starts(lines, header)?;
5914    let mut starts = vec![0usize];
5915    starts.extend(body_starts.iter().copied());
5916    let mut next_starts = starts.iter().copied().skip(1).collect::<Vec<_>>();
5917    next_starts.push(usize::MAX);
5918
5919    let mut entries = Vec::<LayoutEntry>::new();
5920    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5921        if line.contains('\u{c}') {
5922            break;
5923        }
5924        let trimmed = line.trim();
5925        if trimmed.is_empty() {
5926            continue;
5927        }
5928        if trimmed.chars().all(|ch| ch.is_ascii_digit()) && trimmed.len() <= 4 {
5929            continue;
5930        }
5931
5932        let cells = starts
5933            .iter()
5934            .copied()
5935            .zip(next_starts.iter().copied())
5936            .map(|(start, next_start)| {
5937                let char_count = line.chars().count();
5938                if start >= char_count {
5939                    String::new()
5940                } else {
5941                    let end = next_start.min(char_count);
5942                    normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5943                }
5944            })
5945            .collect::<Vec<_>>();
5946        if cells.iter().any(|cell| !cell.is_empty()) {
5947            entries.push(LayoutEntry { line_idx, cells });
5948        }
5949    }
5950
5951    let stub_threshold = body_starts[0].saturating_div(2).max(6);
5952    let anchor_indices = entries
5953        .iter()
5954        .filter(|entry| {
5955            let spans = split_layout_line_spans(&lines[entry.line_idx]);
5956            spans.first().is_some_and(|(start, text)| {
5957                *start <= stub_threshold
5958                    && !text.trim().is_empty()
5959                    && text.split_whitespace().count() <= 3
5960                    && text.len() <= 24
5961            })
5962        })
5963        .map(|entry| entry.line_idx)
5964        .collect::<Vec<_>>();
5965    if anchor_indices.len() < 2 {
5966        return None;
5967    }
5968
5969    let mut rows = anchor_indices
5970        .iter()
5971        .map(|line_idx| {
5972            let anchor = entries
5973                .iter()
5974                .find(|entry| entry.line_idx == *line_idx)
5975                .expect("anchor index should exist");
5976            let mut row = vec![String::new(); anchor.cells.len()];
5977            row[0] = anchor.cells[0].clone();
5978            row
5979        })
5980        .collect::<Vec<_>>();
5981
5982    for entry in entries {
5983        let row_idx = anchor_indices
5984            .iter()
5985            .enumerate()
5986            .min_by_key(|(_, anchor_idx)| anchor_idx.abs_diff(entry.line_idx))
5987            .map(|(idx, _)| idx)?;
5988
5989        for col_idx in 0..rows[row_idx].len().min(entry.cells.len()) {
5990            if col_idx == 0 && anchor_indices[row_idx] == entry.line_idx {
5991                continue;
5992            }
5993            append_cell_text(&mut rows[row_idx][col_idx], &entry.cells[col_idx]);
5994        }
5995    }
5996
5997    let normalized_rows = rows
5998        .into_iter()
5999        .map(|mut row| {
6000            row[0] = normalize_layout_stage_text(&row[0]);
6001            row[1] = normalize_layout_body_text(&row[1]);
6002            row[2] = normalize_layout_body_text(&row[2]);
6003            row[3] = normalize_layout_body_text(&row[3]);
6004            row
6005        })
6006        .filter(|row| row.iter().skip(1).any(|cell| !cell.trim().is_empty()))
6007        .collect::<Vec<_>>();
6008    Some(normalized_rows)
6009}
6010
6011#[cfg(not(target_arch = "wasm32"))]
6012fn infer_layout_panel_body_starts(
6013    lines: &[String],
6014    header: &LayoutPanelHeaderCandidate,
6015) -> Option<Vec<usize>> {
6016    let mut candidates = Vec::<[usize; 3]>::new();
6017    for line in lines.iter().skip(header.line_idx + 1) {
6018        if line.contains('\u{c}') {
6019            break;
6020        }
6021        let spans = split_layout_line_spans(line);
6022        if spans.len() < 2 {
6023            continue;
6024        }
6025
6026        let last_three = spans
6027            .iter()
6028            .rev()
6029            .take(3)
6030            .map(|(start, _)| *start)
6031            .collect::<Vec<_>>();
6032        if last_three.len() != 3 {
6033            continue;
6034        }
6035
6036        let mut starts = last_three;
6037        starts.reverse();
6038        if starts[0] >= header.starts[0] {
6039            continue;
6040        }
6041        if !(starts[0] < starts[1] && starts[1] < starts[2]) {
6042            continue;
6043        }
6044        candidates.push([starts[0], starts[1], starts[2]]);
6045    }
6046
6047    if candidates.len() < 3 {
6048        return None;
6049    }
6050
6051    Some(
6052        (0..3)
6053            .map(|col_idx| {
6054                candidates
6055                    .iter()
6056                    .map(|starts| starts[col_idx])
6057                    .min()
6058                    .unwrap_or(0)
6059            })
6060            .collect(),
6061    )
6062}
6063
6064#[cfg(not(target_arch = "wasm32"))]
6065fn build_layout_anchor_rows(
6066    raw_lines: &[String],
6067    entries: &[LayoutEntry],
6068) -> Option<Vec<Vec<String>>> {
6069    let mut rows = Vec::<LayoutAnchorRow>::new();
6070    let mut anchor_members = Vec::<usize>::new();
6071
6072    for entry in entries {
6073        if entry.cells.get(1).is_none_or(|cell| cell.is_empty()) {
6074            continue;
6075        }
6076
6077        if let Some(previous) = rows.last_mut() {
6078            let distance = entry.line_idx.saturating_sub(previous.last_anchor_idx);
6079            let stage_empty = entry.cells.first().is_none_or(|cell| cell.is_empty());
6080            let body_empty = entry
6081                .cells
6082                .iter()
6083                .skip(2)
6084                .all(|cell| cell.trim().is_empty());
6085            if stage_empty && distance <= 2 && !previous.cells[0].trim().is_empty() {
6086                merge_layout_row_cells(&mut previous.cells, &entry.cells);
6087                previous.last_anchor_idx = entry.line_idx;
6088                anchor_members.push(entry.line_idx);
6089                continue;
6090            }
6091            if stage_empty && body_empty && distance <= 3 {
6092                append_cell_text(&mut previous.cells[1], &entry.cells[1]);
6093                previous.last_anchor_idx = entry.line_idx;
6094                anchor_members.push(entry.line_idx);
6095                continue;
6096            }
6097        }
6098
6099        rows.push(LayoutAnchorRow {
6100            anchor_idx: entry.line_idx,
6101            last_anchor_idx: entry.line_idx,
6102            cells: entry.cells.clone(),
6103        });
6104        anchor_members.push(entry.line_idx);
6105    }
6106
6107    if rows.len() < 4 {
6108        return None;
6109    }
6110
6111    let anchor_indices = rows.iter().map(|row| row.anchor_idx).collect::<Vec<_>>();
6112
6113    for entry in entries {
6114        if anchor_members.contains(&entry.line_idx) {
6115            continue;
6116        }
6117
6118        let next_pos = anchor_indices
6119            .iter()
6120            .position(|anchor| *anchor > entry.line_idx);
6121        let prev_pos = next_pos
6122            .map(|pos| pos.saturating_sub(1))
6123            .unwrap_or(rows.len().saturating_sub(1));
6124
6125        let target = if let Some(next_pos) = next_pos {
6126            let previous_line_blank = entry
6127                .line_idx
6128                .checked_sub(1)
6129                .and_then(|idx| raw_lines.get(idx))
6130                .is_some_and(|line| line.trim().is_empty());
6131            let filled_slots = entry
6132                .cells
6133                .iter()
6134                .enumerate()
6135                .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
6136                .collect::<Vec<_>>();
6137            let prev_stage_empty = rows[prev_pos].cells[0].trim().is_empty();
6138            let next_stage_empty = rows[next_pos].cells[0].trim().is_empty();
6139
6140            if (previous_line_blank && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1)
6141                || (filled_slots == [3]
6142                    && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1
6143                    && !rows[prev_pos].cells[3].trim().is_empty())
6144            {
6145                next_pos
6146            } else if prev_stage_empty && next_stage_empty {
6147                let next_distance = anchor_indices[next_pos].abs_diff(entry.line_idx);
6148                let prev_distance = anchor_indices[prev_pos].abs_diff(entry.line_idx);
6149                if next_distance < prev_distance {
6150                    next_pos
6151                } else {
6152                    prev_pos
6153                }
6154            } else {
6155                prev_pos
6156            }
6157        } else {
6158            prev_pos
6159        };
6160
6161        merge_layout_row_cells(&mut rows[target].cells, &entry.cells);
6162    }
6163
6164    let normalized_rows = rows
6165        .into_iter()
6166        .map(|mut row| {
6167            row.cells[0] = normalize_layout_stage_text(&row.cells[0]);
6168            row.cells[1] = normalize_layout_stage_text(&row.cells[1]);
6169            row.cells[2] = normalize_layout_body_text(&row.cells[2]);
6170            row.cells[3] = normalize_layout_body_text(&row.cells[3]);
6171            row.cells
6172        })
6173        .collect::<Vec<_>>();
6174
6175    Some(normalized_rows)
6176}
6177
6178#[cfg(not(target_arch = "wasm32"))]
6179fn merge_layout_row_cells(target: &mut [String], source: &[String]) {
6180    for (target_cell, source_cell) in target.iter_mut().zip(source.iter()) {
6181        append_cell_text(target_cell, source_cell);
6182    }
6183}
6184
/// Returns `text` after passing it through `collapse_inline_whitespace`.
///
/// No other processing is applied here; the exact whitespace rules are
/// those of `collapse_inline_whitespace`.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_matrix_text(text: &str) -> String {
    collapse_inline_whitespace(text)
}
6189
/// Returns `text` after passing it through `collapse_inline_whitespace`.
///
/// Unlike `normalize_layout_body_text`, no tokens are dropped.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_stage_text(text: &str) -> String {
    collapse_inline_whitespace(text)
}
6194
6195#[cfg(not(target_arch = "wasm32"))]
6196fn normalize_layout_body_text(text: &str) -> String {
6197    let tokens = text
6198        .split_whitespace()
6199        .filter(|token| {
6200            let bare = token.trim_matches(|ch: char| !ch.is_alphanumeric());
6201            !(bare.len() == 1 && bare.chars().all(|ch| ch.is_ascii_digit()))
6202        })
6203        .collect::<Vec<_>>();
6204    if tokens.is_empty() {
6205        return String::new();
6206    }
6207    collapse_inline_whitespace(&tokens.join(" "))
6208}
6209
6210fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
6211    for (idx, element) in doc.kids.iter().enumerate().take(8) {
6212        match element {
6213            ContentElement::Heading(h) => {
6214                let text = h.base.base.value();
6215                let trimmed = text.trim();
6216                if !trimmed.is_empty() {
6217                    return Some(trimmed.to_string());
6218                }
6219            }
6220            ContentElement::NumberHeading(nh) => {
6221                let text = nh.base.base.base.value();
6222                let trimmed = text.trim();
6223                if !trimmed.is_empty() {
6224                    return Some(trimmed.to_string());
6225                }
6226            }
6227            ContentElement::Paragraph(p) => {
6228                let text = clean_paragraph_text(&p.base.value());
6229                let trimmed = text.trim();
6230                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6231                    return Some(trimmed.to_string());
6232                }
6233            }
6234            ContentElement::TextBlock(tb) => {
6235                let text = clean_paragraph_text(&tb.value());
6236                let trimmed = text.trim();
6237                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6238                    return Some(trimmed.to_string());
6239                }
6240            }
6241            ContentElement::TextLine(tl) => {
6242                let text = clean_paragraph_text(&tl.value());
6243                let trimmed = text.trim();
6244                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6245                    return Some(trimmed.to_string());
6246                }
6247            }
6248            _ => {}
6249        }
6250    }
6251    None
6252}
6253
6254fn equivalent_heading_text(left: &str, right: &str) -> bool {
6255    normalize_heading_text(left) == normalize_heading_text(right)
6256}
6257
/// Builds a comparison key for heading text: every non-alphanumeric
/// character is removed and the remaining characters are lower-cased.
fn normalize_heading_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            key.extend(ch.to_lowercase());
        }
    }
    key
}
6264
6265fn looks_like_contents_document(doc: &PdfDocument) -> bool {
6266    let Some(first) = first_heading_like_text(doc) else {
6267        return false;
6268    };
6269    if !matches!(
6270        normalize_heading_text(&first).as_str(),
6271        "contents" | "tableofcontents"
6272    ) {
6273        return false;
6274    }
6275
6276    let lines = collect_plain_lines(doc);
6277    if lines.len() < 8 {
6278        return false;
6279    }
6280
6281    let page_like = lines
6282        .iter()
6283        .skip(1)
6284        .filter(|line| ends_with_page_marker(line))
6285        .count();
6286    page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
6287}
6288
6289fn render_contents_document(doc: &PdfDocument) -> String {
6290    render_toc_lines(&collect_plain_lines(doc), true)
6291}
6292
6293fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
6294    let lines = collect_plain_lines(doc);
6295    if lines.len() < 8 {
6296        return false;
6297    }
6298
6299    let page_like = lines
6300        .iter()
6301        .filter(|line| ends_with_page_marker(line))
6302        .count();
6303    let support_like = lines
6304        .iter()
6305        .filter(|line| looks_like_toc_support_heading(line))
6306        .count();
6307
6308    page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
6309}
6310
6311fn render_compact_toc_document(doc: &PdfDocument) -> String {
6312    render_toc_lines(&collect_plain_lines(doc), false)
6313}
6314
6315fn render_toc_lines(lines: &[String], has_contents_title: bool) -> String {
6316    let mut out = String::new();
6317    let mut iter = lines.iter();
6318
6319    if has_contents_title {
6320        if let Some(first) = iter.next() {
6321            let trimmed = first.trim();
6322            if !trimmed.is_empty() {
6323                push_toc_heading(&mut out, 1, trimmed);
6324            }
6325        }
6326    }
6327
6328    for line in iter {
6329        let trimmed = line.trim();
6330        if trimmed.is_empty() {
6331            continue;
6332        }
6333
6334        if let Some(level) = toc_heading_level(trimmed, has_contents_title) {
6335            push_toc_heading(&mut out, level, strip_trailing_page_number(trimmed));
6336            continue;
6337        }
6338
6339        if should_render_toc_line_as_bullet(trimmed, has_contents_title) {
6340            out.push_str("- ");
6341            out.push_str(&escape_md_line_start(trimmed));
6342            out.push('\n');
6343            continue;
6344        }
6345
6346        if !out.ends_with("\n\n") && !out.is_empty() {
6347            out.push('\n');
6348        }
6349        out.push_str(&escape_md_line_start(trimmed));
6350        out.push_str("\n\n");
6351    }
6352
6353    out.push('\n');
6354    out
6355}
6356
6357fn toc_heading_level(text: &str, has_contents_title: bool) -> Option<usize> {
6358    let trimmed = strip_trailing_page_number(text).trim();
6359    let lower = trimmed.to_ascii_lowercase();
6360
6361    if has_contents_title {
6362        if lower.starts_with("part ")
6363            || lower.starts_with("chapter ")
6364            || lower.starts_with("appendix ")
6365        {
6366            return Some(2);
6367        }
6368        return None;
6369    }
6370
6371    if lower.starts_with("part ") || lower.starts_with("chapter ") || lower.starts_with("appendix ")
6372    {
6373        return Some(1);
6374    }
6375    if lower.starts_with("section ") {
6376        return Some(2);
6377    }
6378    None
6379}
6380
6381fn should_render_toc_line_as_bullet(text: &str, has_contents_title: bool) -> bool {
6382    has_contents_title && ends_with_page_marker(text) && toc_heading_level(text, true).is_none()
6383}
6384
/// Appends an ATX heading (`#`… + text) followed by a blank line to `out`.
///
/// * `level` — requested heading depth. It is clamped to `1..=6`, the only
///   depths Markdown recognises: `0` would otherwise emit a line with no `#`
///   at all, and more than six `#` is rendered as plain text.
/// * `text` — heading text; surrounding whitespace is trimmed, and nothing is
///   written when the result is empty.
///
/// If `out` already has content that does not end in a blank line, one
/// newline is inserted first so the heading starts its own block.
fn push_toc_heading(out: &mut String, level: usize, text: &str) {
    let title = text.trim();
    if title.is_empty() {
        return;
    }

    if !out.is_empty() && !out.ends_with("\n\n") {
        out.push('\n');
    }

    let depth = level.clamp(1, 6);
    out.push_str(&"#".repeat(depth));
    out.push(' ');
    out.push_str(title);
    out.push_str("\n\n");
}
6399
6400fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
6401    let mut lines = Vec::new();
6402    for element in &doc.kids {
6403        match element {
6404            ContentElement::Heading(h) => {
6405                let text = clean_paragraph_text(&h.base.base.value());
6406                if !text.trim().is_empty() {
6407                    lines.push(text);
6408                }
6409            }
6410            ContentElement::NumberHeading(nh) => {
6411                let text = clean_paragraph_text(&nh.base.base.base.value());
6412                if !text.trim().is_empty() {
6413                    lines.push(text);
6414                }
6415            }
6416            ContentElement::Paragraph(p) => {
6417                let text = clean_paragraph_text(&p.base.value());
6418                if !text.trim().is_empty() {
6419                    lines.push(text);
6420                }
6421            }
6422            ContentElement::TextBlock(tb) => {
6423                let text = clean_paragraph_text(&tb.value());
6424                if !text.trim().is_empty() {
6425                    lines.push(text);
6426                }
6427            }
6428            ContentElement::TextLine(tl) => {
6429                let text = clean_paragraph_text(&tl.value());
6430                if !text.trim().is_empty() {
6431                    lines.push(text);
6432                }
6433            }
6434            ContentElement::List(list) => {
6435                for item in &list.list_items {
6436                    let label = token_rows_text(&item.label.content);
6437                    let body = token_rows_text(&item.body.content);
6438                    let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
6439                        format!("{} {}", label.trim(), body.trim())
6440                    } else if !body.trim().is_empty() {
6441                        body.trim().to_string()
6442                    } else if !label.trim().is_empty() {
6443                        label.trim().to_string()
6444                    } else {
6445                        list_item_text_from_contents(&item.contents)
6446                            .trim()
6447                            .to_string()
6448                    };
6449                    if !combined.trim().is_empty() {
6450                        lines.push(combined);
6451                    }
6452                }
6453            }
6454            ContentElement::Table(table) => {
6455                extend_contents_lines_from_rows(
6456                    &mut lines,
6457                    collect_rendered_table_rows(
6458                        &table.table_border.rows,
6459                        table.table_border.num_columns,
6460                    ),
6461                );
6462            }
6463            ContentElement::TableBorder(table) => {
6464                extend_contents_lines_from_rows(
6465                    &mut lines,
6466                    collect_rendered_table_rows(&table.rows, table.num_columns),
6467                );
6468            }
6469            _ => {}
6470        }
6471    }
6472    lines
6473}
6474
6475fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
6476    if rows.is_empty() {
6477        return;
6478    }
6479
6480    if is_toc_table(&rows) {
6481        for row in &rows {
6482            let title = row.first().map(|s| s.trim()).unwrap_or("");
6483            let page = row.get(1).map(|s| s.trim()).unwrap_or("");
6484            let combined = if !title.is_empty() && !page.is_empty() {
6485                format!("{title} {page}")
6486            } else {
6487                format!("{title}{page}")
6488            };
6489            if !combined.trim().is_empty() {
6490                lines.push(combined);
6491            }
6492        }
6493    } else {
6494        // Non-TOC table in a contents document: concatenate cell text as a line.
6495        for row in &rows {
6496            let combined: String = row
6497                .iter()
6498                .map(|c| c.trim())
6499                .filter(|c| !c.is_empty())
6500                .collect::<Vec<_>>()
6501                .join(" ");
6502            if !combined.is_empty() {
6503                lines.push(combined);
6504            }
6505        }
6506    }
6507}
6508
6509fn collect_rendered_table_rows(
6510    rows: &[crate::models::table::TableBorderRow],
6511    num_cols: usize,
6512) -> Vec<Vec<String>> {
6513    let num_cols = num_cols.max(1);
6514    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
6515
6516    for row in rows {
6517        let cell_texts: Vec<String> = (0..num_cols)
6518            .map(|col| {
6519                row.cells
6520                    .iter()
6521                    .find(|c| c.col_number == col)
6522                    .map(cell_text_content)
6523                    .unwrap_or_default()
6524            })
6525            .collect();
6526        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
6527            rendered_rows.push(cell_texts);
6528        }
6529    }
6530
6531    rendered_rows
6532}
6533
6534fn ends_with_page_marker(text: &str) -> bool {
6535    text.split_whitespace()
6536        .last()
6537        .is_some_and(is_page_number_like)
6538}
6539
6540fn looks_like_toc_support_heading(text: &str) -> bool {
6541    let trimmed = text.trim();
6542    if trimmed.is_empty() || ends_with_page_marker(trimmed) {
6543        return false;
6544    }
6545    if trimmed.ends_with(['.', ';', ':', '?', '!']) {
6546        return false;
6547    }
6548
6549    let lower = trimmed.to_ascii_lowercase();
6550    if !(lower.starts_with("part ")
6551        || lower.starts_with("chapter ")
6552        || lower.starts_with("appendix ")
6553        || lower.starts_with("section "))
6554    {
6555        return false;
6556    }
6557
6558    let word_count = trimmed.split_whitespace().count();
6559    (2..=16).contains(&word_count) && trimmed.chars().any(char::is_alphabetic)
6560}
6561
6562fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
6563    if !starts_with_caption_prefix(text) || !text.contains("(credit") {
6564        return None;
6565    }
6566
6567    for needle in [") ", ". "] {
6568        let mut search_start = 0usize;
6569        while let Some(rel_idx) = text[search_start..].find(needle) {
6570            let boundary = search_start + rel_idx + needle.len() - 1;
6571            let head = text[..=boundary].trim();
6572            let tail = text[boundary + 1..].trim_start();
6573            search_start = boundary + 1;
6574            if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
6575                continue;
6576            }
6577            if tail.split_whitespace().count() < 10 {
6578                continue;
6579            }
6580            if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
6581                continue;
6582            }
6583            return Some((head, tail));
6584        }
6585    }
6586
6587    None
6588}
6589
6590fn is_short_caption_label(text: &str) -> bool {
6591    if !starts_with_caption_prefix(text) {
6592        return false;
6593    }
6594
6595    let trimmed = text.trim();
6596    trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24 && !trimmed.ends_with(['.', ':'])
6597}
6598
6599fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
6600    let trimmed = text.trim();
6601    if trimmed.is_empty()
6602        || starts_with_caption_prefix(trimmed)
6603        || !starts_with_uppercase_word(trimmed)
6604    {
6605        return None;
6606    }
6607
6608    for starter in [
6609        " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
6610    ] {
6611        if let Some(idx) = text.find(starter) {
6612            let head = text[..idx].trim();
6613            let tail = text[idx + 1..].trim();
6614            if head.split_whitespace().count() >= 3
6615                && head.split_whitespace().count() <= 24
6616                && tail.split_whitespace().count() >= 8
6617            {
6618                return Some((head, tail));
6619            }
6620        }
6621    }
6622
6623    None
6624}
6625
6626fn looks_like_caption_tail(text: &str) -> bool {
6627    let trimmed = text.trim();
6628    if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
6629        return false;
6630    }
6631
6632    let word_count = trimmed.split_whitespace().count();
6633    if !(3..=18).contains(&word_count) {
6634        return false;
6635    }
6636
6637    starts_with_uppercase_word(trimmed)
6638        && !starts_with_caption_prefix(trimmed)
6639        && !trimmed.contains(':')
6640}
6641
/// Returns `true` when the trimmed text is exactly four ASCII digits.
fn looks_like_caption_year(text: &str) -> bool {
    let digits = text.trim().as_bytes();
    digits.len() == 4 && digits.iter().all(u8::is_ascii_digit)
}
6646
6647/// Extract text from table token rows.
6648fn token_rows_text(rows: &[TableTokenRow]) -> String {
6649    normalize_common_ocr_text(&repair_fragmented_words(
6650        &rows
6651            .iter()
6652            .flat_map(|row| row.iter())
6653            .map(|token| token.base.value.as_str())
6654            .collect::<Vec<_>>()
6655            .join(" "),
6656    ))
6657}
6658
6659fn render_element(out: &mut String, element: &ContentElement) {
6660    match element {
6661        ContentElement::Heading(h) => {
6662            let text = h.base.base.value();
6663            let trimmed = text.trim();
6664            if should_skip_heading_text(trimmed) {
6665                return;
6666            }
6667            out.push_str(&format!("# {}\n\n", trimmed));
6668        }
6669        ContentElement::Paragraph(p) => {
6670            let text = p.base.value();
6671            let trimmed = clean_paragraph_text(&text);
6672            if !trimmed.is_empty() {
6673                out.push_str(&escape_md_line_start(&trimmed));
6674                if p.base.semantic_type == SemanticType::TableOfContent {
6675                    out.push('\n');
6676                } else {
6677                    out.push_str("\n\n");
6678                }
6679            }
6680        }
6681        ContentElement::List(list) => {
6682            let mut i = 0usize;
6683            let mut pending_item: Option<String> = None;
6684            while i < list.list_items.len() {
6685                let item = &list.list_items[i];
6686                let label = token_rows_text(&item.label.content);
6687                let body = token_rows_text(&item.body.content);
6688                let label_trimmed = normalize_list_text(label.trim());
6689                let body_trimmed = normalize_list_text(body.trim());
6690                let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
6691                    format!("{label_trimmed} {body_trimmed}")
6692                } else if !body_trimmed.is_empty() {
6693                    body_trimmed.to_string()
6694                } else {
6695                    label_trimmed.to_string()
6696                };
6697                let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
6698                    list_item_text_from_contents(&item.contents)
6699                } else {
6700                    combined
6701                };
6702
6703                if is_list_section_heading(&combined) {
6704                    if let Some(pending) = pending_item.take() {
6705                        push_rendered_list_item(out, pending.trim());
6706                    }
6707                    out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
6708                    i += 1;
6709                    continue;
6710                }
6711
6712                if is_pure_bullet_marker(&label_trimmed) && body_trimmed.is_empty() {
6713                    i += 1;
6714                    continue;
6715                }
6716
6717                if looks_like_stray_list_page_number(&combined) {
6718                    i += 1;
6719                    continue;
6720                }
6721
6722                let current_item = if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
6723                    if !label_trimmed.is_empty()
6724                        && !body_trimmed.is_empty()
6725                        && !is_pure_bullet_marker(&label_trimmed)
6726                    {
6727                        format!("{label_trimmed} {body_trimmed}")
6728                    } else if !body_trimmed.is_empty() {
6729                        body_trimmed.to_string()
6730                    } else if !is_pure_bullet_marker(&label_trimmed) {
6731                        label_trimmed.to_string()
6732                    } else {
6733                        String::new()
6734                    }
6735                } else if !item.contents.is_empty() {
6736                    normalize_list_text(list_item_text_from_contents(&item.contents).trim())
6737                } else {
6738                    String::new()
6739                };
6740
6741                if current_item.is_empty() {
6742                    i += 1;
6743                    continue;
6744                }
6745
6746                if let Some(previous) = pending_item.as_mut() {
6747                    if should_merge_list_continuation(previous, &current_item) {
6748                        merge_paragraph_text(previous, &current_item);
6749                        i += 1;
6750                        continue;
6751                    }
6752                }
6753
6754                if let Some(pending) = pending_item.replace(current_item) {
6755                    push_rendered_list_item(out, pending.trim());
6756                }
6757                i += 1;
6758            }
6759            if let Some(pending) = pending_item.take() {
6760                push_rendered_list_item(out, pending.trim());
6761            }
6762            out.push('\n');
6763        }
6764        ContentElement::Table(table) => {
6765            render_table(out, table);
6766        }
6767        ContentElement::TableBorder(table) => {
6768            render_table_border(out, table);
6769        }
6770        ContentElement::Formula(f) => {
6771            let latex = f.latex.trim();
6772            if !latex.is_empty() {
6773                out.push_str(&format!("$$\n{}\n$$\n\n", latex));
6774            }
6775        }
6776        ContentElement::Caption(c) => {
6777            let text = c.base.value();
6778            let normalized = normalize_common_ocr_text(text.trim());
6779            let trimmed = normalized.trim();
6780            if !trimmed.is_empty() {
6781                out.push_str(&format!("*{}*\n\n", trimmed));
6782            }
6783        }
6784        ContentElement::NumberHeading(nh) => {
6785            let text = nh.base.base.base.value();
6786            let trimmed = text.trim();
6787            if should_skip_heading_text(trimmed) {
6788                return;
6789            }
6790            out.push_str(&format!("# {}\n\n", trimmed));
6791        }
6792        ContentElement::Image(_) => {
6793            out.push_str("![Image](image)\n\n");
6794        }
6795        ContentElement::HeaderFooter(_) => {
6796            // Skip headers/footers in markdown by default
6797        }
6798        ContentElement::TextBlock(tb) => {
6799            let text = tb.value();
6800            let trimmed = clean_paragraph_text(&text);
6801            if !trimmed.is_empty() {
6802                out.push_str(&escape_md_line_start(&trimmed));
6803                out.push_str("\n\n");
6804            }
6805        }
6806        ContentElement::TextLine(tl) => {
6807            let text = tl.value();
6808            let normalized = normalize_common_ocr_text(text.trim());
6809            let trimmed = normalized.trim();
6810            if !trimmed.is_empty() {
6811                out.push_str(trimmed);
6812                out.push('\n');
6813            }
6814        }
6815        ContentElement::TextChunk(tc) => {
6816            out.push_str(&tc.value);
6817        }
6818        _ => {}
6819    }
6820}
6821
/// Escape characters that have special meaning at the start of a markdown line.
///
/// Only a leading `>` (block quote) or `#` (heading) is escaped, by
/// prefixing a backslash; any other text is returned unchanged.
fn escape_md_line_start(text: &str) -> String {
    match text.chars().next() {
        Some('>' | '#') => {
            let mut escaped = String::with_capacity(text.len() + 1);
            escaped.push('\\');
            escaped.push_str(text);
            escaped
        }
        _ => text.to_owned(),
    }
}
6830
/// Returns `true` when `text`, ignoring leading whitespace, begins with one
/// of the known caption/credit prefixes (`Figure `, `Fig. `, `Table `,
/// `Source: `, …). Matching is ASCII-case-insensitive.
///
/// Only the leading bytes are compared, so no lower-cased copy of the whole
/// input is allocated. All prefixes are ASCII, which makes the byte-wise
/// case-insensitive comparison equivalent to lower-casing first.
fn starts_with_caption_prefix(text: &str) -> bool {
    const CAPTION_PREFIXES: &[&str] = &[
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];

    let candidate = text.trim_start().as_bytes();
    CAPTION_PREFIXES.iter().any(|prefix| {
        candidate
            .get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
6858
/// Returns `true` when the trimmed text starts (ASCII-case-insensitively)
/// with `figure `, `table `, `diagram ` or `chart `.
fn is_structural_caption(text: &str) -> bool {
    const STRUCTURAL_PREFIXES: [&str; 4] = ["figure ", "table ", "diagram ", "chart "];

    let lowered = text.trim().to_ascii_lowercase();
    STRUCTURAL_PREFIXES
        .iter()
        .any(|prefix| lowered.starts_with(*prefix))
}
6866
6867fn normalize_chart_like_markdown(markdown: &str) -> String {
6868    let blocks: Vec<&str> = markdown
6869        .split("\n\n")
6870        .map(str::trim)
6871        .filter(|block| !block.is_empty())
6872        .collect();
6873    if blocks.is_empty() {
6874        return markdown.trim().to_string();
6875    }
6876
6877    let mut normalized = Vec::new();
6878    let mut i = 0usize;
6879    while i < blocks.len() {
6880        if let Some(rendered) = trim_large_top_table_plate(&blocks, i) {
6881            normalized.push(rendered);
6882            break;
6883        }
6884
6885        if let Some((rendered, consumed)) = render_header_pair_chart_table(&blocks, i) {
6886            normalized.push(rendered);
6887            i += consumed;
6888            continue;
6889        }
6890
6891        if let Some((rendered, consumed)) = render_chart_block(&blocks, i) {
6892            normalized.push(rendered);
6893            i += consumed;
6894            continue;
6895        }
6896
6897        if let Some((rendered, consumed)) = render_structural_caption_block(&blocks, i) {
6898            normalized.push(rendered);
6899            i += consumed;
6900            continue;
6901        }
6902
6903        if should_drop_artifact_table_block(&blocks, i) {
6904            i += 1;
6905            continue;
6906        }
6907
6908        if !looks_like_footer_banner(blocks[i]) {
6909            normalized.push(blocks[i].to_string());
6910        }
6911        i += 1;
6912    }
6913
6914    normalized.join("\n\n").trim().to_string() + "\n"
6915}
6916
6917fn trim_large_top_table_plate(blocks: &[&str], start: usize) -> Option<String> {
6918    if start != 0 {
6919        return None;
6920    }
6921
6922    let rows = parse_pipe_table_block(blocks.first()?.trim())?;
6923    let body_rows = rows.len().saturating_sub(2);
6924    let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
6925    if body_rows < 8 || max_cols < 8 {
6926        return None;
6927    }
6928
6929    let caption = blocks.get(1)?.trim();
6930    if !caption.starts_with("Table ") || caption.split_whitespace().count() < 12 {
6931        return None;
6932    }
6933
6934    let has_following_section = blocks.iter().skip(2).any(|block| {
6935        let trimmed = block.trim();
6936        trimmed.starts_with("# ")
6937            || trimmed.starts_with("## ")
6938            || trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
6939                && trimmed.contains(" Main Results")
6940    });
6941    has_following_section.then_some(blocks[0].trim().to_string())
6942}
6943
6944fn render_header_pair_chart_table(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6945    let caption = blocks.get(start)?.trim();
6946    if !is_structural_caption(caption) {
6947        return None;
6948    }
6949
6950    let rows = parse_pipe_table_block(blocks.get(start + 1)?)?;
6951    if rows.len() != 2 {
6952        return None;
6953    }
6954
6955    let pairs = extract_value_year_pairs_from_cells(&rows[0]);
6956    if pairs.len() < 4 {
6957        return None;
6958    }
6959
6960    let mut source = String::new();
6961    let mut consumed = 2usize;
6962    if let Some(next_block) = blocks.get(start + 2) {
6963        let next = next_block.trim();
6964        if next.to_ascii_lowercase().starts_with("source:") {
6965            source = next.to_string();
6966            consumed += 1;
6967        }
6968    }
6969
6970    let mut out = String::new();
6971    let heading_prefix = if start == 0 { "# " } else { "## " };
6972    out.push_str(heading_prefix);
6973    out.push_str(caption);
6974    out.push_str("\n\n");
6975    out.push_str(&format!("| Year | {} |\n", chart_value_header(caption)));
6976    out.push_str("| --- | --- |\n");
6977    for (year, value) in pairs {
6978        out.push_str(&format!("| {} | {} |\n", year, value));
6979    }
6980    out.push('\n');
6981
6982    if !source.is_empty() {
6983        out.push('*');
6984        out.push_str(&escape_md_line_start(&source));
6985        out.push_str("*\n\n");
6986    }
6987
6988    Some((out.trim().to_string(), consumed))
6989}
6990
6991fn render_chart_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6992    let (caption, numeric_tokens) = split_chart_caption_and_values(blocks.get(start)?)?;
6993    let mut consumed = 1usize;
6994
6995    let mut source = String::new();
6996    let mut labels = Vec::new();
6997    if let Some(next_block) = blocks.get(start + 1) {
6998        let (candidate_labels, candidate_source) = extract_chart_labels_and_source(next_block);
6999        if !candidate_source.is_empty() || !candidate_labels.is_empty() {
7000            labels = candidate_labels;
7001            source = candidate_source;
7002            consumed += 1;
7003        }
7004    }
7005
7006    while let Some(block) = blocks.get(start + consumed) {
7007        if looks_like_numeric_noise_block(block) {
7008            consumed += 1;
7009            continue;
7010        }
7011        break;
7012    }
7013
7014    let value_tokens = derive_chart_series_values(&numeric_tokens, labels.len());
7015
7016    let mut out = String::new();
7017    out.push_str("## ");
7018    out.push_str(caption.trim());
7019    out.push_str("\n\n");
7020
7021    if labels.len() >= 3 && labels.len() == value_tokens.len() {
7022        let label_header = if labels.iter().all(|label| looks_like_yearish_label(label)) {
7023            "Year"
7024        } else {
7025            "Label"
7026        };
7027        let value_header = chart_value_header(&caption);
7028        out.push_str(&format!("| {} | {} |\n", label_header, value_header));
7029        out.push_str("| --- | --- |\n");
7030        for (label, value) in labels.iter().zip(value_tokens.iter()) {
7031            out.push_str(&format!("| {} | {} |\n", label, value));
7032        }
7033        out.push('\n');
7034    }
7035
7036    if !source.is_empty() {
7037        out.push('*');
7038        out.push_str(&escape_md_line_start(&source));
7039        out.push_str("*\n\n");
7040    }
7041
7042    Some((out.trim().to_string(), consumed))
7043}
7044
7045fn render_structural_caption_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
7046    let block = blocks.get(start)?.trim();
7047    if !is_structural_caption(block) || block.contains('|') {
7048        return None;
7049    }
7050
7051    let mut caption = collapse_inline_whitespace(block);
7052    let mut consumed = 1usize;
7053    if let Some(next_block) = blocks.get(start + 1) {
7054        let next = next_block.trim();
7055        if looks_like_caption_continuation(next) {
7056            caption.push(' ');
7057            caption.push_str(next.trim_end_matches('.'));
7058            consumed += 1;
7059        } else if !looks_like_isolated_caption_context(block, next) {
7060            return None;
7061        }
7062    } else {
7063        return None;
7064    }
7065
7066    Some((format!("## {}", caption.trim()), consumed))
7067}
7068
7069fn split_chart_caption_and_values(block: &str) -> Option<(String, Vec<String>)> {
7070    let trimmed = block.trim();
7071    if !is_structural_caption(trimmed) {
7072        return None;
7073    }
7074
7075    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
7076    let first_numeric_idx = tokens.iter().position(|token| is_numberish_token(token))?;
7077    if first_numeric_idx < 3 {
7078        return None;
7079    }
7080
7081    let caption = tokens[..first_numeric_idx].join(" ");
7082    let numeric_tokens: Vec<String> = tokens[first_numeric_idx..]
7083        .iter()
7084        .filter_map(|token| sanitize_numberish_token(token))
7085        .collect();
7086
7087    if numeric_tokens.len() < 4 {
7088        return None;
7089    }
7090
7091    Some((caption, numeric_tokens))
7092}
7093
7094fn parse_pipe_table_block(block: &str) -> Option<Vec<Vec<String>>> {
7095    let lines: Vec<&str> = block
7096        .lines()
7097        .map(str::trim)
7098        .filter(|line| !line.is_empty())
7099        .collect();
7100    if lines.len() < 2 {
7101        return None;
7102    }
7103
7104    let header = split_pipe_row(lines[0])?;
7105    if !is_pipe_separator_row(lines[1], header.len()) {
7106        return None;
7107    }
7108
7109    let mut rows = vec![header];
7110    rows.push(split_pipe_row(lines[1]).unwrap_or_default());
7111    for line in lines.iter().skip(2) {
7112        let row = split_pipe_row(line)?;
7113        rows.push(row);
7114    }
7115    Some(rows)
7116}
7117
/// Splits one pipe-table line (`| a | b |`) into its trimmed cell strings.
///
/// Returns `None` unless the trimmed line both starts and ends with a
/// *distinct* `|`. A line consisting of a single `|` therefore yields `None`
/// instead of slicing with an inverted range, which used to panic.
fn split_pipe_row(line: &str) -> Option<Vec<String>> {
    // Stripping prefix and suffix in sequence requires two separate pipes.
    let inner = line.trim().strip_prefix('|')?.strip_suffix('|')?;

    Some(
        inner
            .split('|')
            .map(|cell| cell.trim().to_string())
            .collect(),
    )
}
7131
7132fn is_pipe_separator_row(line: &str, expected_cols: usize) -> bool {
7133    let Some(cells) = split_pipe_row(line) else {
7134        return false;
7135    };
7136    if cells.len() != expected_cols || expected_cols == 0 {
7137        return false;
7138    }
7139
7140    cells.iter().all(|cell| {
7141        let stripped = cell.trim_matches(':').trim();
7142        !stripped.is_empty() && stripped.chars().all(|ch| ch == '-')
7143    })
7144}
7145
7146fn extract_value_year_pairs_from_cells(cells: &[String]) -> Vec<(String, String)> {
7147    let mut pairs = Vec::new();
7148    for cell in cells {
7149        let tokens: Vec<&str> = cell.split_whitespace().collect();
7150        if tokens.len() != 2 {
7151            continue;
7152        }
7153
7154        if looks_like_year_token(tokens[0]) && is_numberish_token(tokens[1]) {
7155            if let Some(value) = sanitize_numberish_token(tokens[1]) {
7156                pairs.push((tokens[0].to_string(), value));
7157            }
7158            continue;
7159        }
7160
7161        if is_numberish_token(tokens[0]) && looks_like_year_token(tokens[1]) {
7162            if let Some(value) = sanitize_numberish_token(tokens[0]) {
7163                pairs.push((tokens[1].to_string(), value));
7164            }
7165        }
7166    }
7167
7168    pairs.sort_by(|left, right| left.0.cmp(&right.0));
7169    pairs
7170}
7171
7172fn should_drop_artifact_table_block(blocks: &[&str], start: usize) -> bool {
7173    let Some(rows) = parse_pipe_table_block(blocks[start]) else {
7174        return false;
7175    };
7176
7177    let prev = start
7178        .checked_sub(1)
7179        .and_then(|idx| blocks.get(idx))
7180        .map(|block| block.trim())
7181        .unwrap_or("");
7182    let next = blocks
7183        .get(start + 1)
7184        .map(|block| block.trim())
7185        .unwrap_or("");
7186
7187    if rows.len() == 2 && rows.first().is_some_and(|row| row.len() == 1) {
7188        let header = rows[0][0].trim();
7189        if looks_like_url_fragment(header) {
7190            return true;
7191        }
7192        if looks_like_numeric_axis_blob(header) && !previous_block_announces_table(prev) {
7193            return true;
7194        }
7195    }
7196
7197    let stats = pipe_table_stats(&rows);
7198    stats.fill_ratio < 0.5
7199        && stats.long_cell_count == 0
7200        && !is_structural_caption(prev)
7201        && (looks_like_citation_block(next) || is_structural_caption(next))
7202}
7203
/// Returns true when `block` reads like a lead-in sentence for a table,
/// e.g. it ends with "as follows:" or mentions "the following details".
/// Matching is ASCII case-insensitive.
fn previous_block_announces_table(block: &str) -> bool {
    const TRAILING_CUES: [&str; 3] = ["as follows:", "following details:", "following detail:"];

    let lowered = block.trim().to_ascii_lowercase();
    TRAILING_CUES.iter().any(|cue| lowered.ends_with(*cue))
        || lowered.contains("the following details")
}
7211
/// Heuristic: does `text` look like (part of) a URL?
///
/// True when the trimmed text contains "http" or "/status/", or when it
/// contains a slash and no space at all.
fn looks_like_url_fragment(text: &str) -> bool {
    let candidate = text.trim();
    if candidate.contains("http") || candidate.contains("/status/") {
        return true;
    }
    candidate.contains('/') && !candidate.contains(' ')
}
7217
7218fn looks_like_numeric_axis_blob(text: &str) -> bool {
7219    let numeric_values: Vec<i64> = text
7220        .split_whitespace()
7221        .filter_map(parse_integer_token)
7222        .collect();
7223    numeric_values.len() >= 8
7224        && !detect_axis_progression(&numeric_values).is_empty()
7225        && text.chars().any(char::is_alphabetic)
7226}
7227
/// Returns true for a short, fully parenthesized block of at most eight
/// whitespace-separated words, such as "(Smith et al., 2020)".
fn looks_like_citation_block(block: &str) -> bool {
    let text = block.trim();
    let parenthesized = text.starts_with('(') && text.ends_with(')');
    parenthesized && text.split_whitespace().nth(8).is_none()
}
7232
/// Density metrics for the body rows of a parsed pipe table, as produced by
/// `pipe_table_stats`.
struct PipeTableStats {
    /// Fraction of body cells that are non-empty, relative to
    /// `body rows × widest row`; 0.0 when the table has no body rows.
    fill_ratio: f64,
    /// Number of non-empty body cells holding three or more
    /// whitespace-separated words.
    long_cell_count: usize,
}
7237
7238fn pipe_table_stats(rows: &[Vec<String>]) -> PipeTableStats {
7239    let cols = rows.iter().map(Vec::len).max().unwrap_or(0).max(1);
7240    let body = rows.len().saturating_sub(2);
7241    let mut nonempty = 0usize;
7242    let mut long_cell_count = 0usize;
7243
7244    for row in rows.iter().skip(2) {
7245        for cell in row {
7246            if !cell.trim().is_empty() {
7247                nonempty += 1;
7248                if cell.split_whitespace().count() >= 3 {
7249                    long_cell_count += 1;
7250                }
7251            }
7252        }
7253    }
7254
7255    let fill_ratio = if body == 0 {
7256        0.0
7257    } else {
7258        nonempty as f64 / (body * cols) as f64
7259    };
7260
7261    PipeTableStats {
7262        fill_ratio,
7263        long_cell_count,
7264    }
7265}
7266
7267fn extract_chart_labels_and_source(block: &str) -> (Vec<String>, String) {
7268    let trimmed = block.trim();
7269    let lower = trimmed.to_ascii_lowercase();
7270    let source_idx = lower.find("source:");
7271
7272    let label_region = source_idx.map_or(trimmed, |idx| trimmed[..idx].trim());
7273    let source = source_idx
7274        .map(|idx| trimmed[idx..].trim().to_string())
7275        .unwrap_or_default();
7276
7277    let labels = parse_chart_labels(label_region);
7278    (labels, source)
7279}
7280
7281fn parse_chart_labels(text: &str) -> Vec<String> {
7282    let tokens: Vec<&str> = text.split_whitespace().collect();
7283    let mut labels = Vec::new();
7284    let mut i = 0usize;
7285    while i < tokens.len() {
7286        let token = tokens[i].trim_matches(|c: char| c == ',' || c == ';');
7287        if looks_like_year_token(token) {
7288            let mut label = token.to_string();
7289            if let Some(next) = tokens.get(i + 1) {
7290                let next_trimmed = next.trim_matches(|c: char| c == ',' || c == ';');
7291                if next_trimmed.starts_with('(') && next_trimmed.ends_with(')') {
7292                    label.push(' ');
7293                    label.push_str(next_trimmed);
7294                    i += 1;
7295                }
7296            }
7297            labels.push(label);
7298        } else if looks_like_category_label(token) {
7299            labels.push(token.to_string());
7300        }
7301        i += 1;
7302    }
7303    labels
7304}
7305
7306fn derive_chart_series_values(tokens: &[String], expected_count: usize) -> Vec<String> {
7307    if expected_count == 0 {
7308        return Vec::new();
7309    }
7310
7311    if tokens.len() == expected_count {
7312        return tokens.to_vec();
7313    }
7314
7315    let numeric_values: Vec<i64> = tokens
7316        .iter()
7317        .filter_map(|token| parse_integer_token(token))
7318        .collect();
7319    if numeric_values.len() != tokens.len() {
7320        return Vec::new();
7321    }
7322
7323    let axis_series = detect_axis_progression(&numeric_values);
7324    if axis_series.is_empty() {
7325        return Vec::new();
7326    }
7327
7328    let mut remaining = Vec::new();
7329    let mut removable = axis_series;
7330    for token in tokens {
7331        let Some(value) = parse_integer_token(token) else {
7332            continue;
7333        };
7334        if let Some(pos) = removable.iter().position(|candidate| *candidate == value) {
7335            removable.remove(pos);
7336        } else {
7337            remaining.push(token.clone());
7338        }
7339    }
7340
7341    if remaining.len() == expected_count {
7342        remaining
7343    } else {
7344        Vec::new()
7345    }
7346}
7347
/// Finds the longest arithmetic progression of at least six distinct values
/// that looks like a chart axis (e.g. `0, 10, 20, 30, 40, 50`).
///
/// Candidate steps are the differences between neighbouring values in the
/// sorted, de-duplicated input; each candidate series starts at the lower
/// value of its pair and only extends upward. The first longest series wins.
/// Returns an empty vector when fewer than six values are supplied or no
/// series reaches six terms.
///
/// All arithmetic is overflow-checked, so extreme inputs (such as values near
/// `i64::MAX`) end a series instead of panicking or wrapping.
fn detect_axis_progression(values: &[i64]) -> Vec<i64> {
    const MIN_AXIS_TICKS: usize = 6;

    if values.len() < MIN_AXIS_TICKS {
        return Vec::new();
    }

    let mut sorted = values.to_vec();
    sorted.sort_unstable();
    sorted.dedup();
    if sorted.len() < MIN_AXIS_TICKS {
        return Vec::new();
    }

    let mut best: Vec<i64> = Vec::new();
    for pair in sorted.windows(2) {
        // After sort + dedup the step is positive unless the subtraction
        // itself overflows, in which case the pair is skipped.
        let Some(step) = pair[1].checked_sub(pair[0]).filter(|step| *step > 0) else {
            continue;
        };

        let mut series = vec![pair[0]];
        let mut current = pair[0];
        // Stop as soon as the next tick is missing or would overflow i64.
        while let Some(next) = current.checked_add(step) {
            if sorted.binary_search(&next).is_err() {
                break;
            }
            series.push(next);
            current = next;
        }

        if series.len() > best.len() {
            best = series;
        }
    }

    if best.len() >= MIN_AXIS_TICKS {
        best
    } else {
        Vec::new()
    }
}
7390
7391fn chart_value_header(caption: &str) -> String {
7392    let trimmed = caption.trim();
7393    let title = strip_structural_caption_prefix(trimmed);
7394
7395    let mut base = title.to_string();
7396    if let Some(idx) = base.rfind(" in ") {
7397        let tail = base[idx + 4..].trim();
7398        if tail.split_whitespace().count() <= 2
7399            && tail.chars().next().is_some_and(char::is_uppercase)
7400        {
7401            base.truncate(idx);
7402        }
7403    }
7404
7405    if let Some(start) = title.rfind('(') {
7406        if title.ends_with(')') {
7407            let unit = title[start + 1..title.len() - 1].trim();
7408            if let Some(idx) = base.rfind('(') {
7409                base.truncate(idx);
7410            }
7411            let normalized_unit = unit.strip_prefix("in ").unwrap_or(unit).trim();
7412            return format!("{} ({})", base.trim(), normalized_unit);
7413        }
7414    }
7415
7416    let trimmed = base.trim();
7417    if trimmed.is_empty() {
7418        "Value".to_string()
7419    } else {
7420        trimmed.to_string()
7421    }
7422}
7423
/// Removes a leading "Figure N" / "Table N" / "Diagram N" / "Chart N" prefix
/// from a caption and returns the remaining title.
///
/// The text is split on single spaces into at most three fields. The prefix is
/// stripped only when the first field is one of the caption keywords (ASCII
/// case-insensitive) and the second field consists solely of ASCII digits,
/// `.` and `:`. In every other case the trimmed input is returned unchanged.
fn strip_structural_caption_prefix(text: &str) -> &str {
    const CAPTION_KINDS: [&str; 4] = ["figure", "table", "diagram", "chart"];

    let whole = text.trim();
    let mut fields = whole.splitn(3, ' ');
    let (Some(kind), Some(number), Some(title)) = (fields.next(), fields.next(), fields.next())
    else {
        return whole;
    };

    let is_caption_kind = CAPTION_KINDS
        .iter()
        .any(|known| kind.eq_ignore_ascii_case(known));
    let is_number_like = number
        .chars()
        .all(|ch| ch.is_ascii_digit() || ch == '.' || ch == ':');

    if is_caption_kind && is_number_like {
        title.trim()
    } else {
        whole
    }
}
7450
/// Heuristic for running footer banners such as "Annual Report of the Company 12".
///
/// Requires a single line of at least 8 bytes with 2–6 tokens, where the last
/// token is all ASCII digits and every preceding token is either a small
/// connective word ("of", "and", "the", "for", "in", "on") or starts with an
/// uppercase character.
fn looks_like_footer_banner(block: &str) -> bool {
    const CONNECTIVES: [&str; 6] = ["of", "and", "the", "for", "in", "on"];

    let line = block.trim();
    if line.len() < 8 || line.contains('\n') {
        return false;
    }

    let words: Vec<&str> = line.split_whitespace().collect();
    if words.len() < 2 || words.len() > 6 {
        return false;
    }

    let Some((page_number, title_words)) = words.split_last() else {
        return false;
    };
    if !page_number.bytes().all(|byte| byte.is_ascii_digit()) {
        return false;
    }

    title_words.iter().all(|word| {
        CONNECTIVES
            .iter()
            .any(|connective| word.eq_ignore_ascii_case(connective))
            || word.chars().next().is_some_and(char::is_uppercase)
    })
}
7476
/// Returns true when `block` could be the second line of a caption: non-empty,
/// at most eight words, starting with an uppercase character, and containing
/// no colon.
fn looks_like_caption_continuation(block: &str) -> bool {
    let text = block.trim();
    let Some(first) = text.chars().next() else {
        return false;
    };
    first.is_uppercase() && !text.contains(':') && text.split_whitespace().count() <= 8
}
7484
/// Collapses every run of whitespace (including newlines and tabs) into a
/// single space and removes leading and trailing whitespace.
fn collapse_inline_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
7488
7489fn drop_isolated_noise_lines(markdown: &str) -> String {
7490    let lines: Vec<&str> = markdown.lines().collect();
7491    let mut kept = Vec::with_capacity(lines.len());
7492
7493    for (idx, line) in lines.iter().enumerate() {
7494        if should_drop_isolated_noise_line(&lines, idx) {
7495            continue;
7496        }
7497        kept.push(*line);
7498    }
7499
7500    let mut result = kept.join("\n");
7501    if markdown.ends_with('\n') {
7502        result.push('\n');
7503    }
7504    result
7505}
7506
7507fn should_drop_isolated_noise_line(lines: &[&str], idx: usize) -> bool {
7508    let trimmed = lines[idx].trim();
7509    if trimmed.len() != 1 {
7510        return false;
7511    }
7512
7513    let ch = trimmed.chars().next().unwrap_or_default();
7514    if !(ch.is_ascii_lowercase() || ch.is_ascii_digit()) {
7515        return false;
7516    }
7517
7518    let prev = previous_nonempty_line(lines, idx);
7519    let next = next_nonempty_line(lines, idx);
7520    let (Some(prev), Some(next)) = (prev, next) else {
7521        return false;
7522    };
7523
7524    is_substantive_markdown_line(prev) && is_substantive_markdown_line(next)
7525}
7526
/// Returns the closest line before `idx` that is not blank after trimming,
/// or `None` when every earlier line is blank.
///
/// Panics if `idx` exceeds `lines.len()`.
fn previous_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    let earlier = &lines[..idx];
    earlier
        .iter()
        .rfind(|line| !line.trim().is_empty())
        .copied()
}
7534
/// Returns the closest line after `idx` that is not blank after trimming,
/// or `None` when every later line is blank.
///
/// Panics if `idx + 1` exceeds `lines.len()`.
fn next_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    for line in &lines[idx + 1..] {
        if !line.trim().is_empty() {
            return Some(*line);
        }
    }
    None
}
7541
/// Returns true when a Markdown line carries real content: a table row
/// (`|`), a list item (`- `), a heading (`#`), or plain text with at least
/// two words.
fn is_substantive_markdown_line(line: &str) -> bool {
    let text = line.trim();
    match text.chars().next() {
        None => false,
        Some('|') | Some('#') => true,
        Some(_) if text.starts_with("- ") => true,
        Some(_) => text.split_whitespace().nth(1).is_some(),
    }
}
7554
7555fn normalize_common_ocr_text(text: &str) -> String {
7556    if text.is_empty() {
7557        return String::new();
7558    }
7559
7560    let mut normalized = text
7561        .replace("ߤL", "μL")
7562        .replace(" oC", "°C")
7563        .replace("37 C", "37°C")
7564        .replace("-20 oC", "-20°C")
7565        .replace("1- 20-μL", "1-20-μL")
7566        .replace("1- 20 μL", "1-20 μL")
7567        .replace("1- 2 0  μL", "1-20 μL")
7568        .replace("1- 2 0 μL", "1-20 μL");
7569
7570    normalized = normalize_degree_spacing(&normalized);
7571    collapse_inline_whitespace(&normalized)
7572}
7573
/// Rewrites temperature readings written as `<digit> C` / `<digit> F` into
/// `<digit>°C` / `<digit>°F`.
///
/// A space is replaced by `°` when it follows an ASCII digit, precedes a
/// capital `C` or `F`, and that unit letter is not the start of a longer word
/// (the character after it is not an ASCII letter). The end of the text counts
/// as a word boundary, so a trailing reading such as "25 C" is normalized too.
fn normalize_degree_spacing(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut out = String::with_capacity(text.len());
    let mut i = 0usize;

    while i < chars.len() {
        let ch = chars[i];
        let unit = chars.get(i + 1).copied();
        // `None` (end of text) is a valid boundary after the unit letter.
        let unit_starts_word = chars
            .get(i + 2)
            .is_some_and(|after| after.is_ascii_alphabetic());

        let is_degree_gap = ch == ' '
            && i > 0
            && chars[i - 1].is_ascii_digit()
            && matches!(unit, Some('C' | 'F'))
            && !unit_starts_word;

        if let (true, Some(unit)) = (is_degree_gap, unit) {
            out.push('°');
            out.push(unit);
            // Skip both the space and the unit letter that were just emitted.
            i += 2;
            continue;
        }

        out.push(ch);
        i += 1;
    }

    out
}
7597
7598fn normalize_list_text(text: &str) -> String {
7599    let normalized = normalize_common_ocr_text(text);
7600    let trimmed = normalized
7601        .trim_start_matches(|ch: char| is_bullet_like(ch))
7602        .trim();
7603    trimmed.to_string()
7604}
7605
7606fn push_rendered_list_item(out: &mut String, item: &str) {
7607    if starts_with_enumerated_marker(item) {
7608        out.push_str(item);
7609        out.push('\n');
7610    } else {
7611        out.push_str(&format!("- {}\n", item));
7612    }
7613}
7614
7615fn should_merge_list_continuation(previous: &str, current: &str) -> bool {
7616    let trimmed = current.trim();
7617    if trimmed.is_empty()
7618        || looks_like_stray_list_page_number(trimmed)
7619        || is_list_section_heading(trimmed)
7620        || looks_like_numbered_section(trimmed)
7621        || starts_with_enumerated_marker(trimmed)
7622    {
7623        return false;
7624    }
7625
7626    if previous.ends_with('-')
7627        && previous
7628            .chars()
7629            .rev()
7630            .nth(1)
7631            .is_some_and(|c| c.is_alphabetic())
7632        && trimmed.chars().next().is_some_and(char::is_lowercase)
7633    {
7634        return true;
7635    }
7636
7637    trimmed
7638        .chars()
7639        .next()
7640        .is_some_and(|ch| ch.is_ascii_lowercase() || matches!(ch, ',' | ';' | ')' | ']' | '%'))
7641}
7642
7643fn is_pure_bullet_marker(text: &str) -> bool {
7644    let trimmed = text.trim();
7645    !trimmed.is_empty() && trimmed.chars().all(is_bullet_like)
7646}
7647
/// Returns true when the trimmed text is one to four ASCII digits, i.e. it
/// looks like a page number that leaked into a list.
fn looks_like_stray_list_page_number(text: &str) -> bool {
    let digits = text.trim();
    matches!(digits.len(), 1..=4) && digits.bytes().all(|byte| byte.is_ascii_digit())
}
7652
/// Returns true for characters commonly used as list bullets in extracted
/// text, including the plain ASCII hyphen.
fn is_bullet_like(ch: char) -> bool {
    const BULLET_GLYPHS: [char; 14] = [
        '•', '◦', '▪', '▸', '▹', '►', '▻', '●', '○', '■', '□', '◆', '◇', '-',
    ];
    BULLET_GLYPHS.contains(&ch)
}
7671
/// Decides whether the block following a caption is the kind of context that
/// justifies rendering the caption on its own.
///
/// A follow-up starting with "source:" or "note:" (optionally prefixed with
/// `*`, ASCII case-insensitive) always qualifies. Otherwise the caption must
/// have at most 14 words and the follow-up at most 45 words while containing
/// a `:` or `=`. An empty follow-up never qualifies.
fn looks_like_isolated_caption_context(caption: &str, next_block: &str) -> bool {
    const ATTRIBUTION_PREFIXES: [&str; 4] = ["source:", "note:", "*source:", "*note:"];

    let follow_up = next_block.trim();
    if follow_up.is_empty() {
        return false;
    }

    let lowered = follow_up.to_ascii_lowercase();
    if ATTRIBUTION_PREFIXES
        .iter()
        .any(|prefix| lowered.starts_with(*prefix))
    {
        return true;
    }

    let has_definition_marker = follow_up.contains(|ch: char| ch == ':' || ch == '=');
    caption.split_whitespace().count() <= 14
        && follow_up.split_whitespace().count() <= 45
        && has_definition_marker
}
7691
7692fn looks_like_numeric_noise_block(block: &str) -> bool {
7693    let trimmed = block.trim();
7694    !trimmed.is_empty()
7695        && trimmed.split_whitespace().all(|token| {
7696            sanitize_numberish_token(token)
7697                .as_deref()
7698                .is_some_and(|sanitized| sanitized.chars().all(|ch| ch.is_ascii_digit()))
7699        })
7700}
7701
/// Returns true when the label begins with an ASCII digit (e.g. "2020",
/// "1990s"); an empty label is not year-like.
fn looks_like_yearish_label(label: &str) -> bool {
    label.starts_with(|ch: char| ch.is_ascii_digit())
}
7705
/// Returns true when the token is exactly four ASCII digits.
fn looks_like_year_token(token: &str) -> bool {
    matches!(token.as_bytes(), [a, b, c, d] if [a, b, c, d].iter().all(|byte| byte.is_ascii_digit()))
}
7709
/// Returns true when the token contains at least one ASCII letter and nothing
/// but ASCII letters, ASCII digits, `-`, `/` and `%`.
fn looks_like_category_label(token: &str) -> bool {
    let mut has_letter = false;
    for ch in token.chars() {
        if ch.is_ascii_alphabetic() {
            has_letter = true;
        } else if !(ch.is_ascii_digit() || matches!(ch, '-' | '/' | '%')) {
            return false;
        }
    }
    has_letter
}
7716
7717fn is_numberish_token(token: &str) -> bool {
7718    sanitize_numberish_token(token).is_some()
7719}
7720
/// Cleans up a number-like token, returning `None` when it is not numeric.
///
/// Leading and trailing `,`, `;`, `:` and `.` are stripped. The remainder is
/// accepted when, ignoring trailing `%` signs and thousands-separator commas,
/// it consists of one or more ASCII digits. The returned string keeps inner
/// commas and any trailing `%` (e.g. `"1,234"`, `"45%"`).
///
/// Tokens with no digits at all (such as a bare `"%"`) are rejected; they used
/// to slip through because an empty digit string trivially passed the
/// all-digits check.
fn sanitize_numberish_token(token: &str) -> Option<String> {
    let trimmed = token.trim_matches(|c: char| matches!(c, ',' | ';' | ':' | '.'));

    let digits = trimmed.trim_end_matches('%').replace(',', "");
    if digits.is_empty() || !digits.bytes().all(|byte| byte.is_ascii_digit()) {
        return None;
    }

    // `trimmed` cannot end in ',', ';' or ':' any more, so it is returned as is.
    Some(trimmed.to_string())
}
7734
7735fn parse_integer_token(token: &str) -> Option<i64> {
7736    sanitize_numberish_token(token)?
7737        .replace(',', "")
7738        .parse::<i64>()
7739        .ok()
7740}
7741
/// Returns true when the first word of `text` begins with an uppercase letter.
///
/// Leading whitespace and opening quotes/brackets (`"`, `'`, `(`, `[`) are
/// skipped; any other non-letter before the first letter yields `false`.
fn starts_with_uppercase_word(text: &str) -> bool {
    text.trim_start()
        .chars()
        .find(|&ch| !matches!(ch, '"' | '\'' | '(' | '['))
        .is_some_and(|ch| ch.is_alphabetic() && ch.is_uppercase())
}
7753
7754/// Clean paragraph text: trim trailing whitespace from each line,
7755/// collapse multiple spaces, and normalize whitespace.
7756fn clean_paragraph_text(text: &str) -> String {
7757    let trimmed = text.trim();
7758    if trimmed.is_empty() {
7759        return String::new();
7760    }
7761    // Collapse runs of spaces (but not newlines) to single space
7762    let mut result = String::with_capacity(trimmed.len());
7763    let mut prev_space = false;
7764    for ch in trimmed.chars() {
7765        if ch == ' ' || ch == '\t' {
7766            if !prev_space {
7767                result.push(' ');
7768                prev_space = true;
7769            }
7770        } else {
7771            result.push(ch);
7772            prev_space = false;
7773        }
7774    }
7775    normalize_common_ocr_text(&result)
7776}
7777
7778fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
7779    match element {
7780        Some(ContentElement::Paragraph(p)) => {
7781            let text = clean_paragraph_text(&p.base.value());
7782            let trimmed = text.trim();
7783            if trimmed.is_empty()
7784                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7785            {
7786                None
7787            } else {
7788                Some(trimmed.to_string())
7789            }
7790        }
7791        Some(ContentElement::TextBlock(tb)) => {
7792            let text = clean_paragraph_text(&tb.value());
7793            let trimmed = text.trim();
7794            if trimmed.is_empty()
7795                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7796            {
7797                None
7798            } else {
7799                Some(trimmed.to_string())
7800            }
7801        }
7802        Some(ContentElement::TextLine(tl)) => {
7803            let text = clean_paragraph_text(&tl.value());
7804            let trimmed = text.trim();
7805            if trimmed.is_empty()
7806                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7807            {
7808                None
7809            } else {
7810                Some(trimmed.to_string())
7811            }
7812        }
7813        _ => None,
7814    }
7815}
7816
/// Decides whether the paragraph at `doc.kids[idx]` (with cleaned `text`)
/// should be emitted as a Markdown heading.
///
/// Checks run in a fixed order:
/// 1. Top-margin running headers are never headings.
/// 2. Hyphenated table-title continuations are always headings.
/// 3. The generic per-element heading test (`should_render_element_as_heading`).
/// 4. Paragraphs set noticeably smaller than the body font are rejected.
/// 5. Rescue passes, which depend on how many explicit headings the pipeline
///    found: broad rescue when there are none, a narrower rescue when heading
///    density is below 10 %, and no rescue otherwise.
///
/// `next` is the element following the paragraph, forwarded to the helpers
/// that use lookahead.
fn should_render_paragraph_as_heading(
    doc: &PdfDocument,
    idx: usize,
    text: &str,
    next: Option<&ContentElement>,
) -> bool {
    if looks_like_top_margin_running_header(doc, idx, text) {
        return false;
    }
    if looks_like_hyphenated_table_title_continuation(doc, idx, text, next) {
        return true;
    }
    if should_render_element_as_heading(&doc.kids[idx], text, next) {
        return true;
    }

    // Font-size guard: skip rescue if the candidate text is significantly
    // smaller than the document's body text (chart axis labels, footnotes).
    // NOTE(review): compute_body_font_size scans every element of the document
    // on each call to this function.
    let body_font_size = compute_body_font_size(doc);
    if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
        return false;
    }

    // Rescue pass tier 1: when the pipeline found zero headings, use broad rescue.
    if !doc_has_explicit_headings(doc) {
        if should_rescue_as_heading(doc, idx, text) {
            return true;
        }
        // Also check numbered sections and ALL CAPS even with zero headings,
        // since Tier 1 broad rescue has strict word/char limits that miss
        // longer keyword-numbered headings (e.g. "Activity 4. Title text").
        if should_rescue_allcaps_heading(doc, idx, text) {
            return true;
        }
        if should_rescue_numbered_heading(doc, idx, text) {
            return true;
        }
        return false;
    }
    // Rescue pass tier 2: when heading density is very low (< 10%), only
    // rescue ALL CAPS short text followed by substantial body content.
    if heading_density(doc) < 0.10 {
        if should_rescue_allcaps_heading(doc, idx, text) {
            return true;
        }
        // Rescue pass tier 3: numbered section headings (e.g. "01 - Title").
        // When a document has very few detected headings, numbered patterns
        // are a strong structural signal that the font-based detector missed.
        if should_rescue_numbered_heading(doc, idx, text) {
            return true;
        }
        // Font-size-gated title-case rescue: when the paragraph is rendered
        // in a noticeably larger font than body text, apply the same
        // title-case rescue used in tier 1.  A 15 % size increase is a
        // reliable visual heading signal straight from the PDF font metrics.
        if body_font_size > 0.0 {
            if let ContentElement::Paragraph(p) = &doc.kids[idx] {
                if let Some(fs) = p.base.font_size {
                    if fs >= 1.15 * body_font_size
                        && is_heading_rescue_candidate(doc, idx, text)
                        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
                    {
                        return true;
                    }
                }
            }
        }
    }
    // Enough explicit headings exist (density >= 10 %), or no rescue matched.
    false
}
7887
7888/// Check whether any element in the document is an explicit heading from the pipeline.
7889fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
7890    doc.kids.iter().any(|e| {
7891        matches!(
7892            e,
7893            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7894        )
7895    })
7896}
7897
7898/// Compute the dominant body font size from paragraphs with substantial text
7899/// (> 10 words).  Uses the median of qualifying paragraphs to avoid being
7900/// skewed by short chart labels or footnote markers.
7901/// Returns 0.0 if no qualifying paragraph is found.
7902fn compute_body_font_size(doc: &PdfDocument) -> f64 {
7903    let mut font_sizes: Vec<f64> = doc
7904        .kids
7905        .iter()
7906        .filter_map(|e| {
7907            if let ContentElement::Paragraph(p) = e {
7908                let word_count = p.base.value().split_whitespace().count();
7909                if word_count > 10 {
7910                    p.base.font_size
7911                } else {
7912                    None
7913                }
7914            } else {
7915                None
7916            }
7917        })
7918        .collect();
7919    if font_sizes.is_empty() {
7920        return 0.0;
7921    }
7922    font_sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
7923    font_sizes[font_sizes.len() / 2]
7924}
7925
7926/// Check whether a paragraph's font size is too small relative to the document
7927/// body font to be a heading.  Returns true if the element should be skipped.
7928/// A heading should not be noticeably smaller than body text — font size ≥ 95%
7929/// of the dominant body size is required.
7930fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
7931    if body_font_size <= 0.0 {
7932        return false;
7933    }
7934    if let ContentElement::Paragraph(p) = &doc_kids[idx] {
7935        if let Some(fs) = p.base.font_size {
7936            return fs < 0.95 * body_font_size;
7937        }
7938    }
7939    false
7940}
7941
7942/// Count the ratio of pipeline headings to total content elements.
7943fn heading_density(doc: &PdfDocument) -> f64 {
7944    let total = doc.kids.len();
7945    if total == 0 {
7946        return 0.0;
7947    }
7948    let heading_count = doc
7949        .kids
7950        .iter()
7951        .filter(|e| {
7952            matches!(
7953                e,
7954                ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7955            )
7956        })
7957        .count();
7958    heading_count as f64 / total as f64
7959}
7960
7961/// Rescue headings: identify short standalone paragraphs that likely serve
7962/// as section headings.  Only runs when the pipeline produced zero headings.
7963fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7964    is_heading_rescue_candidate(doc, idx, text)
7965        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7966}
7967
/// Pure text-criteria check for title-case heading rescue.
/// Returns true when the text looks like a heading based on casing,
/// length, and character composition — without any lookahead.
///
/// `doc.kids[idx]` is consulted only for the chart-label check on the element
/// itself; following elements are not inspected here.
fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }

    let has_alpha = trimmed.chars().any(char::is_alphabetic);

    // Must have alphabetic chars and not end with sentence/continuation punctuation
    // (only ASCII punctuation is checked).
    if !has_alpha || trimmed.ends_with(['.', '!', '?', ';', ',']) {
        return false;
    }

    // Reject text containing math/special symbols or percentage signs.
    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
        return false;
    }

    // Must not be fully parenthesized (citations)
    if trimmed.starts_with('(') && trimmed.ends_with(')') {
        return false;
    }

    // Must not look like a caption or chart label
    if starts_with_caption_prefix(trimmed)
        || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
    {
        return false;
    }

    // Must be short: ≤ 6 words, ≤ 60 bytes.
    // NOTE(review): `len()` counts UTF-8 bytes, not characters, so the limit
    // is effectively tighter for non-ASCII text — confirm this is intended.
    let word_count = trimmed.split_whitespace().count();
    if word_count > 6 || trimmed.len() > 60 {
        return false;
    }

    // Must not be a purely numeric string
    // (unreachable in practice: `has_alpha` above already guarantees a letter).
    if trimmed
        .chars()
        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
    {
        return false;
    }

    // First alphabetic character should be uppercase
    // (caseless scripts pass, since only `is_lowercase` rejects).
    if let Some(first_alpha) = trimmed.chars().find(|c| c.is_alphabetic()) {
        if first_alpha.is_lowercase() {
            return false;
        }
    }

    true
}
8024
8025/// Check the next `max_lookahead` elements for substantive body content.
8026/// Returns true when at least one element is a long paragraph (≥ word_count*3
8027/// or > 15 words) or a structural element (list, table, image, figure).
8028fn has_substantive_follow_up(
8029    doc: &PdfDocument,
8030    idx: usize,
8031    word_count: usize,
8032    max_lookahead: usize,
8033) -> bool {
8034    for offset in 1..=max_lookahead {
8035        let lookahead_idx = idx + offset;
8036        if lookahead_idx >= doc.kids.len() {
8037            break;
8038        }
8039        let look_elem = &doc.kids[lookahead_idx];
8040        match look_elem {
8041            ContentElement::Paragraph(p) => {
8042                let next_text = p.base.value();
8043                let nw = next_text.split_whitespace().count();
8044                if nw >= word_count * 3 || nw > 15 {
8045                    return true;
8046                }
8047            }
8048            ContentElement::TextBlock(tb) => {
8049                let next_text = tb.value();
8050                let nw = next_text.split_whitespace().count();
8051                if nw >= word_count * 3 || nw > 15 {
8052                    return true;
8053                }
8054            }
8055            ContentElement::TextLine(tl) => {
8056                let next_text = tl.value();
8057                let nw = next_text.split_whitespace().count();
8058                if nw >= word_count * 3 || nw > 15 {
8059                    return true;
8060                }
8061            }
8062            ContentElement::List(_)
8063            | ContentElement::Table(_)
8064            | ContentElement::TableBorder(_)
8065            | ContentElement::Image(_)
8066            | ContentElement::Figure(_) => {
8067                return true;
8068            }
8069            _ => continue,
8070        }
8071    }
8072
8073    false
8074}
8075
8076/// Rescue numbered section headings like "01 - Find Open Educational Resources"
8077/// or "4.2 Main Results" when heading density is low.
8078fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8079    let trimmed = text.trim();
8080    if trimmed.is_empty() || trimmed.len() > 100 {
8081        return false;
8082    }
8083
8084    // Must match numbered section pattern: digits (with optional dots)
8085    // followed by separator and title text.
8086    if !looks_like_numbered_section(trimmed) {
8087        return false;
8088    }
8089
8090    // Must not end with sentence punctuation — EXCEPT when the text matches
8091    // a keyword+number pattern (e.g. "Activity 4. Determining CEC…") where
8092    // the trailing period is part of the heading format, not sentence ending.
8093    if trimmed.ends_with(['!', '?', ';', ',']) {
8094        return false;
8095    }
8096    if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
8097        return false;
8098    }
8099    // Reject numbered headings containing math symbols or percentage signs.
8100    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8101        return false;
8102    }
8103
8104    // Look ahead for substantive content
8105    for offset in 1..=3 {
8106        let lookahead_idx = idx + offset;
8107        if lookahead_idx >= doc.kids.len() {
8108            break;
8109        }
8110        match &doc.kids[lookahead_idx] {
8111            ContentElement::Paragraph(p) => {
8112                let nw = p.base.value().split_whitespace().count();
8113                if nw > 10 {
8114                    return true;
8115                }
8116            }
8117            ContentElement::TextBlock(tb) => {
8118                let nw = tb.value().split_whitespace().count();
8119                if nw > 10 {
8120                    return true;
8121                }
8122            }
8123            ContentElement::TextLine(tl) => {
8124                let nw = tl.value().split_whitespace().count();
8125                if nw > 10 {
8126                    return true;
8127                }
8128            }
8129            ContentElement::List(_)
8130            | ContentElement::Table(_)
8131            | ContentElement::TableBorder(_)
8132            | ContentElement::Image(_)
8133            | ContentElement::Figure(_) => {
8134                return true;
8135            }
8136            _ => continue,
8137        }
8138    }
8139
8140    false
8141}
8142
8143/// Check if text starts with a numbered section prefix (e.g. "01 -", "4.2 ", "III.")
8144/// or a keyword+number pattern (e.g. "Activity 4.", "Experiment #1:", "Chapter 3").
8145fn looks_like_numbered_section(text: &str) -> bool {
8146    let bytes = text.as_bytes();
8147    if bytes.is_empty() {
8148        return false;
8149    }
8150
8151    // Branch 1: digit-based prefix: "1 ", "01 ", "4.2 ", "1. ", "01 - "
8152    let mut idx = 0;
8153    if bytes[0].is_ascii_digit() {
8154        while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8155            idx += 1;
8156        }
8157        if idx >= bytes.len() {
8158            return false;
8159        }
8160        // dot-separated subsections: "4.2", "1.3.1"
8161        while idx < bytes.len() && bytes[idx] == b'.' {
8162            idx += 1;
8163            let start = idx;
8164            while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8165                idx += 1;
8166            }
8167            if idx == start {
8168                // "4." followed by space → "4. Title"
8169                break;
8170            }
8171        }
8172        // Must be followed by whitespace or "-"
8173        if idx >= bytes.len() {
8174            return false;
8175        }
8176        // Skip separator: "- " or " - " or just " "
8177        if bytes[idx] == b' ' || bytes[idx] == b'\t' {
8178            idx += 1;
8179            // Skip optional "- " separator
8180            if idx < bytes.len() && bytes[idx] == b'-' {
8181                idx += 1;
8182                if idx < bytes.len() && bytes[idx] == b' ' {
8183                    idx += 1;
8184                }
8185            }
8186        } else if bytes[idx] == b'-' {
8187            idx += 1;
8188            if idx < bytes.len() && bytes[idx] == b' ' {
8189                idx += 1;
8190            }
8191        } else {
8192            return false;
8193        }
8194        // Must have title text after prefix
8195        let rest = &text[idx..].trim();
8196        if rest.is_empty() {
8197            return false;
8198        }
8199        // First alpha char must be uppercase
8200        if let Some(c) = rest.chars().find(|c| c.is_alphabetic()) {
8201            return c.is_uppercase();
8202        }
8203        return false;
8204    }
8205
8206    // Branch 2: keyword+number prefix: "Activity 4.", "Experiment #1:", "Chapter 3"
8207    if looks_like_keyword_numbered_section(text) {
8208        return true;
8209    }
8210
8211    false
8212}
8213
/// Structural keywords that commonly precede a number to form a heading.
const SECTION_KEYWORDS: &[&str] = &[
    "activity",
    "appendix",
    "case",
    "chapter",
    "exercise",
    "experiment",
    "lab",
    "lesson",
    "module",
    "part",
    "phase",
    "problem",
    "question",
    "section",
    "stage",
    "step",
    "task",
    "topic",
    "unit",
];

/// Check if text matches "Keyword N. Title" or "Keyword #N: Title" pattern.
///
/// The first space-delimited word must be one of [`SECTION_KEYWORDS`]
/// (ASCII case-insensitive). What follows, after an optional `#`, must start
/// with either an ASCII digit or a Roman numeral token.
///
/// A Roman numeral token starts with `I`, `V`, `X` or `L`, and its whole
/// leading alphanumeric run consists of the uppercase letters `IVXLCDM`
/// ("IV:", "II", "X."). Checking the whole token rather than just its first
/// letter keeps ordinary phrases such as "Case In Point" or
/// "Lab Instructions" from being mistaken for numbered sections.
fn looks_like_keyword_numbered_section(text: &str) -> bool {
    let trimmed = text.trim();

    // Split at the first space: keyword before it, numbering after it.
    let Some((keyword, remainder)) = trimmed.split_once(' ') else {
        return false;
    };
    if !SECTION_KEYWORDS
        .iter()
        .any(|known| keyword.eq_ignore_ascii_case(known))
    {
        return false;
    }

    // The number may be introduced by '#', as in "Experiment #1:".
    let remainder = remainder.trim_start();
    let numbering = remainder.strip_prefix('#').unwrap_or(remainder);

    let Some(first_char) = numbering.chars().next() else {
        return false;
    };
    if first_char.is_ascii_digit() {
        return true;
    }
    if !matches!(first_char, 'I' | 'V' | 'X' | 'L') {
        return false;
    }

    // Every letter/digit of the token up to the first punctuation or space
    // must be a Roman numeral letter.
    numbering
        .chars()
        .take_while(|ch| ch.is_alphanumeric())
        .all(|ch| matches!(ch, 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M'))
}
8265
8266/// Strict rescue for docs with some headings but low density: only promote
8267/// ALL CAPS text that is clearly a section heading.
8268fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8269    let trimmed = text.trim();
8270    if trimmed.is_empty() {
8271        return false;
8272    }
8273
8274    let word_count = trimmed.split_whitespace().count();
8275
8276    // Must be short: ≤ 8 words, ≤ 80 chars
8277    if word_count > 8 || trimmed.len() > 80 {
8278        return false;
8279    }
8280
8281    // Must be ALL CAPS (all alphabetic chars are uppercase)
8282    let alpha_chars: Vec<char> = trimmed.chars().filter(|c| c.is_alphabetic()).collect();
8283    if alpha_chars.len() < 2 || !alpha_chars.iter().all(|c| c.is_uppercase()) {
8284        return false;
8285    }
8286
8287    // Must not end with sentence punctuation
8288    if trimmed.ends_with(['.', ';', ',']) {
8289        return false;
8290    }
8291
8292    // Reject all-caps headings containing math symbols or percentage signs.
8293    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8294        return false;
8295    }
8296
8297    // Must not look like a caption
8298    if starts_with_caption_prefix(trimmed) {
8299        return false;
8300    }
8301
8302    // Must not be purely numeric or a page number
8303    if trimmed
8304        .chars()
8305        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8306    {
8307        return false;
8308    }
8309
8310    // Look ahead for substantive content — accept any non-trivial text
8311    // (>6 words) or structured content within the next 4 elements.
8312    for offset in 1..=4 {
8313        let lookahead_idx = idx + offset;
8314        if lookahead_idx >= doc.kids.len() {
8315            break;
8316        }
8317        let look_elem = &doc.kids[lookahead_idx];
8318        match look_elem {
8319            ContentElement::Paragraph(p) => {
8320                let nw = p.base.value().split_whitespace().count();
8321                if nw > 6 {
8322                    return true;
8323                }
8324            }
8325            ContentElement::TextBlock(tb) => {
8326                let nw = tb.value().split_whitespace().count();
8327                if nw > 6 {
8328                    return true;
8329                }
8330            }
8331            ContentElement::TextLine(tl) => {
8332                let nw = tl.value().split_whitespace().count();
8333                if nw > 6 {
8334                    return true;
8335                }
8336            }
8337            ContentElement::List(_)
8338            | ContentElement::Table(_)
8339            | ContentElement::TableBorder(_)
8340            | ContentElement::Image(_)
8341            | ContentElement::Figure(_) => {
8342                return true;
8343            }
8344            _ => continue,
8345        }
8346    }
8347
8348    false
8349}
8350
8351fn should_render_element_as_heading(
8352    element: &ContentElement,
8353    text: &str,
8354    next: Option<&ContentElement>,
8355) -> bool {
8356    let trimmed = text.trim();
8357    if trimmed.is_empty() {
8358        return false;
8359    }
8360
8361    let lower = trimmed.to_ascii_lowercase();
8362    if matches!(lower.as_str(), "contents" | "table of contents")
8363        && trimmed.starts_with(|c: char| c.is_uppercase())
8364    {
8365        return true;
8366    }
8367
8368    let word_count = trimmed.split_whitespace().count();
8369    let has_alpha = trimmed.chars().any(char::is_alphabetic);
8370    let title_like = has_alpha
8371        && word_count <= 4
8372        && trimmed.len() <= 40
8373        && !trimmed.ends_with(['.', '!', '?', ';', ':']);
8374
8375    // Reject attribution prefixes that are clearly not section headings
8376    // (more targeted than starts_with_caption_prefix to avoid false demotions
8377    // of legitimate headings starting with common words like "Graph", "Table").
8378    let is_attribution = {
8379        let lower = trimmed.to_ascii_lowercase();
8380        lower.starts_with("source:")
8381            || lower.starts_with("credit:")
8382            || lower.starts_with("photo by ")
8383            || lower.starts_with("photo credit")
8384            || lower.starts_with("image by ")
8385            || lower.starts_with("image credit")
8386    };
8387
8388    title_like
8389        && matches!(next, Some(ContentElement::List(_)))
8390        && !looks_like_chart_label_heading(element, trimmed)
8391        && !is_attribution
8392}
8393
8394fn looks_like_hyphenated_table_title_continuation(
8395    doc: &PdfDocument,
8396    idx: usize,
8397    text: &str,
8398    next: Option<&ContentElement>,
8399) -> bool {
8400    if !matches!(
8401        next,
8402        Some(ContentElement::Table(_)) | Some(ContentElement::TableBorder(_))
8403    ) {
8404        return false;
8405    }
8406
8407    let trimmed = text.trim();
8408    if trimmed.is_empty()
8409        || starts_with_caption_prefix(trimmed)
8410        || looks_like_numbered_section(trimmed)
8411        || looks_like_keyword_numbered_section(trimmed)
8412        || !trimmed.ends_with(':')
8413    {
8414        return false;
8415    }
8416
8417    let word_count = trimmed.split_whitespace().count();
8418    if !(3..=5).contains(&word_count) || trimmed.len() > 60 {
8419        return false;
8420    }
8421
8422    let Some(first_alpha) = trimmed.chars().find(|ch| ch.is_alphabetic()) else {
8423        return false;
8424    };
8425    if first_alpha.is_lowercase() {
8426        return false;
8427    }
8428
8429    let Some(prev_idx) = idx.checked_sub(1) else {
8430        return false;
8431    };
8432    let prev_text = extract_element_text(&doc.kids[prev_idx]);
8433    let prev_trimmed = prev_text.trim();
8434    !prev_trimmed.is_empty() && prev_trimmed.ends_with('-')
8435}
8436
8437fn looks_like_table_header_duplicate_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8438    let trimmed = text.trim();
8439    if trimmed.is_empty()
8440        || starts_with_caption_prefix(trimmed)
8441        || looks_like_numbered_section(trimmed)
8442        || looks_like_keyword_numbered_section(trimmed)
8443    {
8444        return false;
8445    }
8446
8447    let word_count = trimmed.split_whitespace().count();
8448    if !(3..=10).contains(&word_count) || trimmed.len() > 96 {
8449        return false;
8450    }
8451
8452    let Some(prev_idx) = idx.checked_sub(1) else {
8453        return false;
8454    };
8455    let Some(previous_table) = table_border_from_element(&doc.kids[prev_idx]) else {
8456        return false;
8457    };
8458    if previous_table.num_columns < 3 || previous_table.rows.len() < 3 {
8459        return false;
8460    }
8461
8462    let mut rendered_rows = collect_table_border_rows(previous_table);
8463    if rendered_rows.is_empty() {
8464        return false;
8465    }
8466    merge_continuation_rows(&mut rendered_rows);
8467    trim_leading_table_carryover_rows(&mut rendered_rows);
8468
8469    let Some(header_row) = rendered_rows.first() else {
8470        return false;
8471    };
8472    let header_text = header_row
8473        .iter()
8474        .map(|cell| cell.trim())
8475        .filter(|cell| !cell.is_empty())
8476        .collect::<Vec<_>>()
8477        .join(" ");
8478    if !equivalent_heading_text(trimmed, &header_text) {
8479        return false;
8480    }
8481
8482    let page_number = doc.kids[idx].page_number();
8483    let mut short_fragments = 0usize;
8484    let mut numeric_fragments = 0usize;
8485
8486    for candidate in doc.kids.iter().skip(idx + 1) {
8487        if candidate.page_number() != page_number {
8488            break;
8489        }
8490        if matches!(
8491            candidate,
8492            ContentElement::Table(_) | ContentElement::TableBorder(_)
8493        ) {
8494            break;
8495        }
8496
8497        let fragment = extract_element_text(candidate);
8498        let fragment_trimmed = fragment.trim();
8499        if fragment_trimmed.is_empty()
8500            || looks_like_margin_page_number(doc, candidate, fragment_trimmed)
8501        {
8502            continue;
8503        }
8504
8505        let fragment_words = fragment_trimmed.split_whitespace().count();
8506        if fragment_words > 6 {
8507            return false;
8508        }
8509
8510        short_fragments += 1;
8511        if fragment_trimmed.chars().any(|ch| ch.is_ascii_digit()) {
8512            numeric_fragments += 1;
8513        }
8514
8515        if short_fragments >= 3 {
8516            break;
8517        }
8518    }
8519
8520    short_fragments >= 2 && numeric_fragments >= 1
8521}
8522
8523fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8524    let trimmed = text.trim();
8525    if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
8526        return false;
8527    }
8528
8529    let element = &doc.kids[idx];
8530    let bbox = element.bbox();
8531    if bbox.height() > 24.0 {
8532        return false;
8533    }
8534
8535    let Some(page) = element.page_number() else {
8536        return false;
8537    };
8538
8539    // Compute top Y for every page (single pass).
8540    let mut page_tops = std::collections::HashMap::<u32, f64>::new();
8541    for candidate in &doc.kids {
8542        if let Some(p) = candidate.page_number() {
8543            let top = page_tops.entry(p).or_insert(f64::MIN);
8544            *top = top.max(candidate.bbox().top_y);
8545        }
8546    }
8547
8548    let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
8549    if bbox.top_y < page_top - 24.0 {
8550        return false;
8551    }
8552
8553    // A running header repeats across pages.  If the same text does NOT
8554    // appear at the top margin of any other page, this is a unique heading
8555    // (e.g. a document title), not a running header.
8556    let trimmed_lower = trimmed.to_lowercase();
8557    for other_elem in &doc.kids {
8558        let Some(other_page) = other_elem.page_number() else {
8559            continue;
8560        };
8561        if other_page == page {
8562            continue;
8563        }
8564        let other_bbox = other_elem.bbox();
8565        if other_bbox.height() > 24.0 {
8566            continue;
8567        }
8568        let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
8569        if other_bbox.top_y < other_top - 24.0 {
8570            continue;
8571        }
8572        let other_text = match other_elem {
8573            ContentElement::Paragraph(p) => p.base.value(),
8574            ContentElement::TextBlock(tb) => tb.value(),
8575            ContentElement::TextLine(tl) => tl.value(),
8576            ContentElement::Heading(h) => h.base.base.value(),
8577            _ => continue,
8578        };
8579        if other_text.trim().to_lowercase() == trimmed_lower {
8580            return true;
8581        }
8582    }
8583
8584    false
8585}
8586
8587fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
8588    let trimmed = text.trim();
8589    let upper_words = trimmed
8590        .split_whitespace()
8591        .filter(|word| word.chars().any(char::is_alphabetic))
8592        .all(|word| {
8593            word.chars()
8594                .filter(|ch| ch.is_alphabetic())
8595                .all(|ch| ch.is_uppercase())
8596        });
8597
8598    (trimmed.contains('%') || upper_words) && element.bbox().height() <= 40.0
8599}
8600
8601fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
8602    let next_trimmed = next.trim();
8603    if !next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8604        return false;
8605    }
8606
8607    let normalized = normalize_heading_text(text);
8608    if matches!(
8609        normalized.as_str(),
8610        "contents" | "tableofcontents" | "introduction" | "conclusion"
8611    ) {
8612        return false;
8613    }
8614
8615    let words: Vec<&str> = text.split_whitespace().collect();
8616    if words.len() < 3 {
8617        return false;
8618    }
8619
8620    words
8621        .last()
8622        .is_some_and(|word| is_sentence_fragment_tail(word))
8623}
8624
/// Report whether `word` is an article, conjunction or preposition that
/// rarely ends a real heading.
///
/// Non-alphanumeric characters are stripped from both ends first, and the
/// comparison ignores ASCII case.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const FRAGMENT_TAILS: &[&str] = &[
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or", "that",
        "the", "to", "with",
    ];

    let core = word.trim_matches(|c: char| !c.is_alphanumeric());
    FRAGMENT_TAILS
        .iter()
        .any(|tail| core.eq_ignore_ascii_case(tail))
}
8648
/// Report whether `text` reads as a short label introducing a list.
///
/// The trimmed text must end with `:`, be at most 80 bytes and 8 words,
/// contain a letter, and start with neither an ASCII digit nor a bullet or
/// dash character.
fn is_list_section_heading(text: &str) -> bool {
    const BULLET_MARKERS: &str = "•‣◦●○◆◇▪▫–—-";

    let label = text.trim();
    if !label.ends_with(':') || label.len() > 80 {
        return false;
    }
    if label.split_whitespace().count() > 8 || !label.chars().any(char::is_alphabetic) {
        return false;
    }

    match label.chars().next() {
        Some(first) => !first.is_ascii_digit() && !BULLET_MARKERS.contains(first),
        None => false,
    }
}
8658
8659fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
8660    let next_trimmed = next.trim();
8661    if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8662        return false;
8663    }
8664
8665    if starts_with_enumerated_marker(next_trimmed) {
8666        return false;
8667    }
8668
8669    if prev.ends_with('-')
8670        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8671        && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8672    {
8673        return true;
8674    }
8675
8676    if next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8677        return true;
8678    }
8679
8680    let lower = next_trimmed.to_ascii_lowercase();
8681    if lower.starts_with("http://")
8682        || lower.starts_with("https://")
8683        || lower.starts_with("arxiv")
8684        || lower.starts_with("doi:")
8685    {
8686        return true;
8687    }
8688
8689    if matches!(
8690        next_trimmed.split_whitespace().next(),
8691        Some("In" | "Proceedings" | "Advances" | "Learning")
8692    ) {
8693        return true;
8694    }
8695
8696    !prev.ends_with(['.', '!', '?', ':'])
8697}
8698
8699fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
8700    let next_trimmed = next.trim();
8701    if next_trimmed.is_empty() {
8702        return false;
8703    }
8704
8705    if starts_with_enumerated_marker(next_trimmed) {
8706        return false;
8707    }
8708
8709    if prev.ends_with('-')
8710        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8711        && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8712    {
8713        return true;
8714    }
8715
8716    next_trimmed.chars().next().is_some_and(char::is_lowercase)
8717}
8718
/// Report whether the first whitespace-delimited token of `text` is a list
/// enumerator such as "1.", "(a)", "iv)" or "B:".
///
/// Leading `(` and `[` are ignored. The token must end with `.`, `)` or `:`;
/// with those stripped, what remains must be all ASCII digits, a single
/// ASCII letter, or up to 8 letters drawn from `ivxlcdm` (ASCII
/// case-insensitive).
fn starts_with_enumerated_marker(text: &str) -> bool {
    let Some(raw_token) = text.split_whitespace().next() else {
        return false;
    };
    let token = raw_token.trim_start_matches(['(', '[']);
    let marker = token.trim_end_matches(['.', ')', ':']);

    // Nothing trimmed means no terminator; nothing left means no marker.
    if marker.len() == token.len() || marker.is_empty() {
        return false;
    }

    let numeric = marker.bytes().all(|b| b.is_ascii_digit());
    let single_letter = marker.len() == 1 && marker.bytes().all(|b| b.is_ascii_alphabetic());
    let roman_like = marker.len() <= 8
        && marker
            .chars()
            .all(|c| "ivxlcdm".contains(c.to_ascii_lowercase()));

    numeric || single_letter || roman_like
}
8744
8745fn should_skip_leading_figure_carryover(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8746    let trimmed = text.trim();
8747    if !trimmed.starts_with("Figure ") || trimmed.split_whitespace().count() < 4 {
8748        return false;
8749    }
8750
8751    let element = &doc.kids[idx];
8752    let Some(page) = element.page_number() else {
8753        return false;
8754    };
8755
8756    let mut page_top = f64::MIN;
8757    for candidate in &doc.kids {
8758        if candidate.page_number() == Some(page)
8759            && matches!(
8760                candidate,
8761                ContentElement::Paragraph(_)
8762                    | ContentElement::TextBlock(_)
8763                    | ContentElement::TextLine(_)
8764                    | ContentElement::Heading(_)
8765                    | ContentElement::NumberHeading(_)
8766                    | ContentElement::Caption(_)
8767            )
8768        {
8769            page_top = page_top.max(candidate.bbox().top_y);
8770        }
8771    }
8772    if !page_top.is_finite() || element.bbox().top_y < page_top - 72.0 {
8773        return false;
8774    }
8775
8776    for prior_idx in 0..idx {
8777        let prior = &doc.kids[prior_idx];
8778        let prior_text = extract_element_text(prior);
8779        let prior_trimmed = prior_text.trim();
8780        if prior_trimmed.is_empty()
8781            || is_standalone_page_number(prior_trimmed)
8782            || looks_like_footer_banner(prior_trimmed)
8783        {
8784            continue;
8785        }
8786        match prior {
8787            ContentElement::Paragraph(_)
8788            | ContentElement::TextBlock(_)
8789            | ContentElement::TextLine(_) => {
8790                if !starts_with_caption_prefix(prior_trimmed)
8791                    && !looks_like_top_margin_running_header(doc, prior_idx, prior_trimmed)
8792                {
8793                    return false;
8794                }
8795            }
8796            ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8797                if !should_skip_heading_text(prior_trimmed) {
8798                    return false;
8799                }
8800            }
8801            _ => return false,
8802        }
8803    }
8804
8805    for lookahead_idx in idx + 1..doc.kids.len().min(idx + 8) {
8806        let next = &doc.kids[lookahead_idx];
8807        if next.page_number() != Some(page) {
8808            break;
8809        }
8810        let next_text = extract_element_text(next);
8811        let next_trimmed = next_text.trim();
8812        if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8813            continue;
8814        }
8815
8816        let is_numbered_heading = match next {
8817            ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8818                looks_like_numbered_section(next_trimmed)
8819                    || looks_like_keyword_numbered_section(next_trimmed)
8820            }
8821            ContentElement::Paragraph(_)
8822            | ContentElement::TextBlock(_)
8823            | ContentElement::TextLine(_) => {
8824                should_render_paragraph_as_heading(
8825                    doc,
8826                    lookahead_idx,
8827                    next_trimmed,
8828                    doc.kids.get(lookahead_idx + 1),
8829                ) && (looks_like_numbered_section(next_trimmed)
8830                    || looks_like_keyword_numbered_section(next_trimmed))
8831            }
8832            _ => false,
8833        };
8834
8835        if is_numbered_heading {
8836            return true;
8837        }
8838
8839        if !starts_with_caption_prefix(next_trimmed) && next_trimmed.split_whitespace().count() >= 5
8840        {
8841            return false;
8842        }
8843    }
8844
8845    false
8846}
8847
/// Append the trimmed `next` to `target`, repairing a hyphenated word break.
///
/// When `target` ends with a letter followed by `-` and `next` starts with a
/// lowercase character, the hyphen is removed and the text joined directly
/// ("exam-" + "ple" → "example"). Otherwise the text is appended after a
/// single space, which is inserted only if `target` does not already end
/// with one.
fn merge_paragraph_text(target: &mut String, next: &str) {
    let addition = next.trim();

    let mut tail = target.chars().rev();
    let broken_word = tail.next() == Some('-')
        && tail.next().is_some_and(|c| c.is_alphabetic())
        && addition.starts_with(|c: char| c.is_lowercase());

    if broken_word {
        // Drop the soft hyphen.
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(addition);
}
8867
/// Report whether `text`, once trimmed, is one to four ASCII digits and
/// nothing else.
fn is_standalone_page_number(text: &str) -> bool {
    let digits = text.trim();
    matches!(digits.len(), 1..=4) && digits.bytes().all(|b| b.is_ascii_digit())
}
8872
8873fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
8874    if !is_standalone_page_number(text) {
8875        return false;
8876    }
8877
8878    let bbox = element.bbox();
8879    if bbox.height() > 24.0 {
8880        return false;
8881    }
8882
8883    let Some(page) = element.page_number() else {
8884        return false;
8885    };
8886
8887    let mut page_top = f64::MIN;
8888    let mut page_bottom = f64::MAX;
8889    for candidate in &doc.kids {
8890        if candidate.page_number() == Some(page) {
8891            let candidate_bbox = candidate.bbox();
8892            page_top = page_top.max(candidate_bbox.top_y);
8893            page_bottom = page_bottom.min(candidate_bbox.bottom_y);
8894        }
8895    }
8896
8897    if !page_top.is_finite() || !page_bottom.is_finite() {
8898        return false;
8899    }
8900
8901    bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
8902}
8903
8904/// Check whether a pipeline heading sits in the bottom margin of its page.
8905/// Running footers (e.g. "Report Title 21") are sometimes classified as
8906/// headings by the pipeline.  A heading at the page bottom is very unlikely
8907/// to be a real section heading.
8908fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
8909    let element = &doc.kids[idx];
8910    let bbox = element.bbox();
8911    if bbox.height() > 30.0 {
8912        return false;
8913    }
8914
8915    let Some(page) = element.page_number() else {
8916        return false;
8917    };
8918
8919    let mut page_bottom = f64::MAX;
8920    for candidate in &doc.kids {
8921        if candidate.page_number() == Some(page) {
8922            page_bottom = page_bottom.min(candidate.bbox().bottom_y);
8923        }
8924    }
8925
8926    if !page_bottom.is_finite() {
8927        return false;
8928    }
8929
8930    // If this heading is at the very bottom of the page content, skip it.
8931    bbox.bottom_y <= page_bottom + 24.0
8932}
8933
8934/// Demote a pipeline heading that ends with a period when it doesn't look like
8935/// a genuine section heading (e.g. "United Kingdom." or "New Investment (a Challenger).").
8936/// Returns true when the heading should be rendered as a paragraph instead.
8937fn should_demote_period_heading(text: &str) -> bool {
8938    let trimmed = text.trim();
8939    if !trimmed.ends_with('.') {
8940        return false;
8941    }
8942    // Keep numbered section headings: "I. Introduction", "4.2. Results",
8943    // "Activity 4. Determining CEC…"
8944    if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
8945        return false;
8946    }
8947    // Keep headings whose text without the trailing period still looks like a
8948    // proper title — at least 3 words, first word uppercase, and the period
8949    // is clearly sentence-ending rather than part of a title pattern.
8950    let without_dot = trimmed.trim_end_matches('.');
8951    let word_count = without_dot.split_whitespace().count();
8952    // Very short fragments ending with '.' (like "Kingdom.") are almost
8953    // certainly not headings.
8954    if word_count <= 2 {
8955        return true;
8956    }
8957    false
8958}
8959
/// Report whether a heading should be demoted because its last
/// non-whitespace character is a comma (e.g. footnote references such as
/// "29 Pope," or "32 Beawes, 33 M.M.,").
fn should_demote_comma_heading(text: &str) -> bool {
    matches!(text.trim_end().chars().next_back(), Some(','))
}
8965
/// Report whether a heading contains one of the mathematical/special symbols
/// that are treated as never appearing in real section headings
/// (e.g. "HL ¼", "P ≪ P", "LH þ HL:").
fn should_demote_math_heading(text: &str) -> bool {
    // Symbols whose presence marks the text as a formula fragment.
    const MATH_SYMBOLS: [char; 15] = [
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];

    text.chars().any(|symbol| MATH_SYMBOLS.contains(&symbol))
}
8989
/// Report whether a heading contains a percent sign; such text is typically
/// a data label rather than a section heading (e.g. "56% AGREE").
fn should_demote_percentage_heading(text: &str) -> bool {
    text.chars().any(|c| c == '%')
}
8995
/// Demote bibliography entries that start with a 4-digit year followed by
/// a period and a space (e.g. "2020. Measuring massive multitask...").
///
/// The check is made on the whitespace-trimmed text, so a bare "2020."
/// (nothing after the period) is not matched.
fn should_demote_bibliography_heading(text: &str) -> bool {
    // Need at least "YYYY. " — six bytes — to match.
    let Some(prefix) = text.trim().as_bytes().get(..6) else {
        return false;
    };

    prefix[..4].iter().all(u8::is_ascii_digit) && prefix[4] == b'.' && prefix[5] == b' '
}
9008
/// Remove a trailing standalone page number from heading text.
///
/// E.g. "Chapter 3. Numerical differentiation 35" becomes
/// "Chapter 3. Numerical differentiation". The last space-separated token is
/// dropped only when it is 1–4 ASCII digits and at least three words remain
/// in front of it; otherwise the trimmed input is returned unchanged.
fn strip_trailing_page_number(text: &str) -> &str {
    let heading = text.trim();

    let Some((title, last_token)) = heading.rsplit_once(' ') else {
        return heading;
    };

    let looks_like_page_number =
        (1..=4).contains(&last_token.len()) && last_token.bytes().all(|b| b.is_ascii_digit());

    if looks_like_page_number && title.split_whitespace().count() >= 3 {
        title.trim()
    } else {
        heading
    }
}
9027
/// Try to split a heading that contains a merged subsection number.
///
/// For example, "4 Results 4.1 Experimental Details" should become two
/// headings: "4 Results" and "4.1 Experimental Details".
///
/// Returns the byte offset at which the second heading starts, or `None`
/// when no split is needed. A split point must be preceded by a space and
/// start with either `digits.digit` (e.g. "4.1", "10.2") or
/// `UppercaseLetter.digit` (e.g. "B.1"). The number must be contiguous: a
/// lone digit followed by an unrelated period later in the text (as in
/// "Top 5 tools in v1.2") is not treated as a subsection number.
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    /// True when `rest` begins with a subsection number such as "4.1" or "B.1".
    fn starts_subsection_number(rest: &[u8]) -> bool {
        // Letter.digit form, e.g. "B.1".
        if let [letter, b'.', digit, ..] = rest {
            if letter.is_ascii_uppercase() && digit.is_ascii_digit() {
                return true;
            }
        }
        // digits.digit form, e.g. "4.1" or "10.2".
        let major_len = rest.iter().take_while(|b| b.is_ascii_digit()).count();
        major_len > 0
            && rest.get(major_len) == Some(&b'.')
            && rest
                .get(major_len + 1)
                .is_some_and(u8::is_ascii_digit)
    }

    let bytes = text.as_bytes();
    // Start after the first few bytes so the heading's own leading number is
    // never taken as the split point. A match always starts on an ASCII byte,
    // so the returned offset is a valid `str` boundary.
    (3..bytes.len()).find(|&i| bytes[i - 1] == b' ' && starts_subsection_number(&bytes[i..]))
}
9063
9064fn should_skip_heading_text(text: &str) -> bool {
9065    let trimmed = text.trim();
9066    if trimmed.is_empty() || is_standalone_page_number(trimmed) {
9067        return true;
9068    }
9069
9070    let lower = trimmed.to_ascii_lowercase();
9071    if (lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit()))
9072        && trimmed.contains('|')
9073    {
9074        return true;
9075    }
9076
9077    let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
9078    let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
9079    alpha_count == 0 || (alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':'))
9080}
9081
/// Re-join words that were split into fragments (e.g. "frag mented").
///
/// Adjacent whitespace-separated tokens are glued together when both have a
/// purely alphabetic core (after stripping non-alphabetic characters from
/// their ends), at least one core is at most 4 bytes long, the combined core
/// length is at least 6 bytes, the right core does not start with an
/// uppercase letter, and neither core is a stopword. A merged token is
/// compared against the next token again, so chains of fragments collapse.
///
/// Input with fewer than two tokens is returned unchanged; otherwise the
/// tokens are joined with single spaces.
fn repair_fragmented_words(text: &str) -> String {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];

    let words: Vec<&str> = text.split_whitespace().collect();
    if words.len() < 2 {
        return text.to_string();
    }

    // Decide whether `right` is the continuation of the fragment `left`.
    let is_fragment_pair = |left: &str, right: &str| -> bool {
        let left_core = left.trim_matches(|c: char| !c.is_alphabetic());
        let right_core = right.trim_matches(|c: char| !c.is_alphabetic());

        if left_core.is_empty() || right_core.is_empty() {
            return false;
        }
        if !left_core.chars().all(char::is_alphabetic)
            || !right_core.chars().all(char::is_alphabetic)
        {
            return false;
        }
        if left_core.len() > 4 && right_core.len() > 4 {
            return false;
        }
        if left_core.len() + right_core.len() < 6 {
            return false;
        }
        if right_core.chars().next().is_some_and(char::is_uppercase) {
            return false;
        }

        let is_stopword = |core: &str| STOPWORDS.contains(&core.to_ascii_lowercase().as_str());
        !is_stopword(left_core) && !is_stopword(right_core)
    };

    // Single left-to-right pass: each word either extends the previous output
    // token or starts a new one.
    let mut repaired: Vec<String> = Vec::with_capacity(words.len());
    for word in words {
        let extends_previous = repaired
            .last()
            .is_some_and(|previous| is_fragment_pair(previous.as_str(), word));
        if extends_previous {
            if let Some(previous) = repaired.last_mut() {
                previous.push_str(word);
            }
        } else {
            repaired.push(word.to_string());
        }
    }

    repaired.join(" ")
}
9122
9123/// Extract text from list item contents (fallback when label/body tokens are empty).
9124fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
9125    let mut text = String::new();
9126    for elem in contents {
9127        let part = match elem {
9128            ContentElement::Paragraph(p) => p.base.value(),
9129            ContentElement::TextBlock(tb) => tb.value(),
9130            ContentElement::TextLine(tl) => tl.value(),
9131            ContentElement::TextChunk(tc) => tc.value.clone(),
9132            _ => String::new(),
9133        };
9134        if !text.is_empty() && !part.is_empty() {
9135            text.push(' ');
9136        }
9137        text.push_str(&part);
9138    }
9139    text
9140}
9141
/// Report whether a header row has a blank cell sandwiched between two
/// filled cells (a cell is blank when it is empty after trimming).
///
/// Leading and trailing blanks do not count as gaps.
fn has_internal_header_gap(row: &[String]) -> bool {
    let is_filled = |cell: &String| !cell.trim().is_empty();

    let Some(first_filled) = row.iter().position(is_filled) else {
        return false;
    };
    let Some(last_filled) = row.iter().rposition(is_filled) else {
        return false;
    };

    // Any blank strictly between the outermost filled cells is an internal gap.
    row[first_filled..=last_filled]
        .iter()
        .any(|cell| !is_filled(cell))
}
9159
/// Fill the blank cells of a grouped (spanning) header row.
///
/// `parent` is the group header row and `child` the sub-header row below it.
/// For every column where the parent cell is blank and the child cell is
/// filled, the parent cell is replaced by the trimmed text of the nearest
/// filled parent cell ("anchor"). When two anchors are equally close, the
/// one further right wins.
///
/// The result always has `parent.len()` cells. Child cells beyond the end of
/// `parent` are ignored, so a child row longer than the parent does not
/// panic. If `parent` has no filled cell it is returned unchanged.
fn expand_grouped_header_row(parent: &[String], child: &[String]) -> Vec<String> {
    let anchor_cols: Vec<usize> = parent
        .iter()
        .enumerate()
        .filter_map(|(idx, cell)| (!cell.trim().is_empty()).then_some(idx))
        .collect();

    let mut expanded = parent.to_vec();
    if anchor_cols.is_empty() {
        return expanded;
    }

    // `zip` stops at the shorter row, which bounds `col_idx` by `parent.len()`.
    for (col_idx, (slot, child_cell)) in expanded.iter_mut().zip(child).enumerate() {
        if !slot.trim().is_empty() || child_cell.trim().is_empty() {
            continue;
        }

        // `min_by_key` keeps the first minimum it sees; walking the anchors
        // right-to-left therefore resolves ties in favour of the right anchor.
        let nearest = anchor_cols
            .iter()
            .rev()
            .copied()
            .min_by_key(|anchor| anchor.abs_diff(col_idx));
        if let Some(anchor) = nearest {
            *slot = parent[anchor].trim().to_string();
        }
    }

    expanded
}
9190
9191fn preserve_grouped_header_rows(rows: &mut [Vec<String>]) -> bool {
9192    if rows.len() < 2 || rows[0].is_empty() || rows[1].is_empty() {
9193        return false;
9194    }
9195    if rows[0].first().is_none_or(|cell| cell.trim().is_empty()) {
9196        return false;
9197    }
9198    if rows[1].first().is_some_and(|cell| !cell.trim().is_empty()) {
9199        return false;
9200    }
9201
9202    let first_filled = rows[0]
9203        .iter()
9204        .filter(|cell| !cell.trim().is_empty())
9205        .count();
9206    let second_filled = rows[1]
9207        .iter()
9208        .filter(|cell| !cell.trim().is_empty())
9209        .count();
9210    if first_filled < 2 || second_filled <= first_filled || !has_internal_header_gap(&rows[0]) {
9211        return false;
9212    }
9213
9214    rows[0] = expand_grouped_header_row(&rows[0], &rows[1]);
9215    true
9216}
9217
9218/// Merge header continuation rows in a rendered table.
9219///
9220/// When a PDF table has multi-line column headers, each wrapped line often
9221/// produces a separate row in the grid.  These continuation rows have an
9222/// empty first cell while the header row above them has content.  This
9223/// function detects such rows at the start of the table and merges their
9224/// text into the first row, producing a single combined header.
9225///
9226/// Only rows whose non-empty cells are all ≤ 30 characters are merged, to
9227/// avoid accidentally collapsing data rows that happen to have an empty key.
9228fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
9229    if rows.len() < 2 {
9230        return;
9231    }
9232    if preserve_grouped_header_rows(rows) {
9233        return;
9234    }
9235    // The first row must have a non-empty first cell (the header anchor).
9236    if rows[0].first().is_none_or(|c| c.trim().is_empty()) {
9237        return;
9238    }
9239
9240    let mut merge_count = 0usize;
9241    for (i, row_i) in rows.iter().enumerate().skip(1) {
9242        let first_empty = row_i.first().is_none_or(|c| c.trim().is_empty());
9243        if !first_empty {
9244            break; // hit a data row
9245        }
9246        // All non-empty cells must be short (header-like fragments).
9247        let all_short = row_i
9248            .iter()
9249            .all(|c| c.trim().is_empty() || c.trim().len() <= 30);
9250        if !all_short {
9251            break;
9252        }
9253        merge_count = i;
9254    }
9255
9256    // Require at least 2 consecutive continuation rows to avoid merging
9257    // legitimate sub-header or unit rows (e.g. a single row with "cmolc/kg").
9258    if merge_count == 0 {
9259        return;
9260    }
9261
9262    // Merge rows 1..=merge_count into row 0.
9263    for i in 1..=merge_count {
9264        let (head, tail) = rows.split_at_mut(i);
9265        let ncols = head[0].len().min(tail[0].len());
9266        for (target, src) in head[0]
9267            .iter_mut()
9268            .take(ncols)
9269            .zip(tail[0].iter().take(ncols))
9270        {
9271            let fragment = src.trim().to_string();
9272            if !fragment.is_empty() {
9273                let target_str = target.trim().to_string();
9274                *target = if target_str.is_empty() {
9275                    fragment
9276                } else {
9277                    format!("{} {}", target_str, fragment)
9278                };
9279            }
9280        }
9281    }
9282
9283    // Remove the merged rows.
9284    rows.drain(1..=merge_count);
9285}
9286
9287fn trim_leading_table_carryover_rows(rows: &mut Vec<Vec<String>>) {
9288    while first_body_row_looks_like_carryover(rows) {
9289        rows.remove(1);
9290    }
9291}
9292
9293fn first_body_row_looks_like_carryover(rows: &[Vec<String>]) -> bool {
9294    if rows.len() < 3 {
9295        return false;
9296    }
9297
9298    let key_col_count = infer_leading_key_column_count(&rows[1..]);
9299    if key_col_count == 0 {
9300        return false;
9301    }
9302
9303    let candidate = &rows[1];
9304    if candidate
9305        .iter()
9306        .take(key_col_count)
9307        .any(|cell| !cell.trim().is_empty())
9308    {
9309        return false;
9310    }
9311
9312    let non_empty_cols = candidate
9313        .iter()
9314        .enumerate()
9315        .filter(|(_, cell)| !cell.trim().is_empty())
9316        .map(|(idx, _)| idx)
9317        .collect::<Vec<_>>();
9318    if non_empty_cols.len() != 1 {
9319        return false;
9320    }
9321
9322    let only_col = non_empty_cols[0];
9323    if only_col < key_col_count {
9324        return false;
9325    }
9326
9327    if candidate[only_col].split_whitespace().count() < 4 {
9328        return false;
9329    }
9330
9331    rows[2]
9332        .iter()
9333        .take(key_col_count)
9334        .all(|cell| !cell.trim().is_empty())
9335}
9336
/// Count how many leading columns behave like key columns.
///
/// Columns are examined left to right. A column counts as a key column when
/// at least 60% of the rows have a non-blank cell in it and the median word
/// count of those cells is at most three. Counting stops at the first column
/// that fails either test or is entirely blank. Returns 0 when fewer than
/// two rows are given.
fn infer_leading_key_column_count(rows: &[Vec<String>]) -> usize {
    if rows.len() < 2 {
        return 0;
    }

    let widest = rows.iter().map(Vec::len).max().unwrap_or(0);

    (0..widest)
        .take_while(|&col| {
            // Word counts of the non-blank cells in this column.
            let mut words_per_cell: Vec<usize> = rows
                .iter()
                .filter_map(|row| row.get(col))
                .map(|cell| cell.trim())
                .filter(|cell| !cell.is_empty())
                .map(|cell| cell.split_whitespace().count())
                .collect();
            if words_per_cell.is_empty() {
                return false;
            }

            words_per_cell.sort_unstable();
            let median_words = words_per_cell[words_per_cell.len() / 2];
            let occupancy_ratio = words_per_cell.len() as f64 / rows.len() as f64;
            occupancy_ratio >= 0.6 && median_words <= 3
        })
        .count()
}
9374
/// Render a SemanticTable as a markdown table.
///
/// Thin wrapper: the table's `table_border` is passed, together with `out`,
/// to `render_table_border`, which does the actual rendering.
fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
    // Delegate to render_table_border which handles cross-page linking.
    render_table_border(out, &table.table_border);
}
9380
/// A run of consecutive `doc.kids` elements that has been pre-rendered as a
/// single table.
#[derive(Clone, Debug)]
struct GeometricTableRegion {
    /// Index in `doc.kids` of the first element covered (inclusive).
    start_idx: usize,
    /// Index in `doc.kids` of the last element covered (inclusive).
    end_idx: usize,
    /// Output text for the whole region, already rendered.
    rendered: String,
}
9387
/// A group of text chunks together with a bounding box.
// NOTE(review): presumably one visual line and the box enclosing its chunks —
// confirm against the code that builds it.
#[derive(Clone)]
struct ChunkLine {
    bbox: BoundingBox,
    chunks: Vec<TextChunk>,
}
9393
/// A piece of text tagged with a slot index and a bounding box.
// NOTE(review): presumably `slot_idx` is the table column slot the fragment
// was assigned to — confirm against the code that builds it.
#[derive(Clone)]
struct SlotFragment {
    slot_idx: usize,
    bbox: BoundingBox,
    text: String,
}
9400
9401fn detect_geometric_table_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9402    let mut regions = Vec::new();
9403    let mut occupied_until = 0usize;
9404
9405    for (idx, element) in doc.kids.iter().enumerate() {
9406        if idx < occupied_until {
9407            continue;
9408        }
9409
9410        let Some(table) = table_border_from_element(element) else {
9411            continue;
9412        };
9413        let Some(region) = build_geometric_table_region(doc, idx, table) else {
9414            continue;
9415        };
9416        occupied_until = region.end_idx.saturating_add(1);
9417        regions.push(region);
9418    }
9419
9420    let mut occupied = regions
9421        .iter()
9422        .flat_map(|region| region.start_idx..=region.end_idx)
9423        .collect::<HashSet<_>>();
9424    for region in detect_footnote_citation_regions(doc) {
9425        if (region.start_idx..=region.end_idx).any(|idx| occupied.contains(&idx)) {
9426            continue;
9427        }
9428        occupied.extend(region.start_idx..=region.end_idx);
9429        regions.push(region);
9430    }
9431
9432    regions.sort_by_key(|region| region.start_idx);
9433    regions
9434}
9435
9436fn detect_footnote_citation_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9437    let body_font_size = compute_running_body_font_size(doc);
9438    if body_font_size <= 0.0 {
9439        return Vec::new();
9440    }
9441
9442    let mut regions = Vec::new();
9443    let mut idx = 0usize;
9444    while idx < doc.kids.len() {
9445        let Some(region) = build_footnote_citation_region(doc, idx, body_font_size) else {
9446            idx += 1;
9447            continue;
9448        };
9449        idx = region.end_idx.saturating_add(1);
9450        regions.push(region);
9451    }
9452
9453    regions
9454}
9455
9456fn compute_running_body_font_size(doc: &PdfDocument) -> f64 {
9457    doc.kids
9458        .iter()
9459        .filter_map(|element| {
9460            let ContentElement::Paragraph(paragraph) = element else {
9461                return None;
9462            };
9463            let text = paragraph.base.value();
9464            (text.split_whitespace().count() > 10).then_some(paragraph.base.font_size?)
9465        })
9466        .fold(0.0_f64, f64::max)
9467}
9468
/// Try to build a footnote-citation table region starting at `start_idx`.
///
/// Two kinds of start are recognised:
/// * a small-font element whose text begins with a footnote marker; earlier
///   elements may be attached via `leading_footnote_attachment`;
/// * an element whose text ends in a short footnote lead (see
///   `split_trailing_footnote_lead`) and is followed by a small-font text
///   element on the same page and in the same column.
///
/// Following small-font elements on the same page and in the same column are
/// then absorbed. The collected fragments are parsed into
/// `(marker, citation)` rows; a region is produced only when there are at
/// least three rows, every marker is numeric, and the markers are sequential
/// apart from at most one break.
///
/// `body_font_size` is the reference size from which the "small font"
/// threshold is derived. Returns `None` whenever any of the checks fail.
fn build_footnote_citation_region(
    doc: &PdfDocument,
    start_idx: usize,
    body_font_size: f64,
) -> Option<GeometricTableRegion> {
    let element = doc.kids.get(start_idx)?;
    if !is_geometric_text_candidate(element) {
        return None;
    }

    let start_text = extract_element_text(element);
    let trimmed_start = start_text.trim();
    if trimmed_start.is_empty() {
        return None;
    }

    // "Small" means at most 92% of the body size and at least 0.8 below it,
    // whichever is stricter; never negative.
    let small_font_threshold = (body_font_size * 0.92).min(body_font_size - 0.8).max(0.0);
    // Body text preceding the first marker; rendered above the table.
    let mut lead_prefix = None;
    let mut fragments = Vec::new();
    let page_number = element.page_number()?;
    // Grows to cover every absorbed element; used for the same-column test.
    let mut column_bbox = element.bbox().clone();
    let mut region_start_idx = start_idx;
    let mut end_idx = start_idx;

    if element_font_size(element).is_some_and(|font_size| font_size <= small_font_threshold)
        && starts_with_footnote_marker(trimmed_start)
    {
        // Case 1: the element itself is a footnote line. Look backwards for
        // the body-text element that carries the first marker.
        if let Some((attach_idx, prefix, leading_fragments)) = leading_footnote_attachment(
            doc,
            start_idx,
            page_number,
            &column_bbox,
            small_font_threshold,
        ) {
            lead_prefix = Some(prefix);
            fragments.extend(leading_fragments);
            region_start_idx = attach_idx;
        }
        fragments.push(footnote_fragment_text(element));
    } else {
        // Case 2: the element ends with a footnote lead and the next element
        // must be a small-font continuation in the same column.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed_start)?;
        let next = doc.kids.get(start_idx + 1)?;
        if !is_geometric_text_candidate(next)
            || next.page_number() != Some(page_number)
            || !element_font_size(next).is_some_and(|font_size| font_size <= small_font_threshold)
        {
            return None;
        }
        if !same_column_region(&column_bbox, next.bbox()) {
            return None;
        }
        lead_prefix = Some(prefix);
        fragments.push(first_tail);
    }

    // Absorb the run of small-font elements that follows the start element.
    let mut consecutive_small = 0usize;
    for idx in start_idx + 1..doc.kids.len() {
        let candidate = &doc.kids[idx];
        if !is_geometric_text_candidate(candidate) || candidate.page_number() != Some(page_number) {
            break;
        }

        let candidate_text = extract_element_text(candidate);
        let trimmed = candidate_text.trim();
        if trimmed.is_empty() || starts_with_caption_prefix(trimmed) {
            break;
        }

        let Some(font_size) = element_font_size(candidate) else {
            break;
        };
        if font_size > small_font_threshold {
            break;
        }
        if !same_column_region(&column_bbox, candidate.bbox()) {
            break;
        }

        column_bbox = column_bbox.union(candidate.bbox());
        fragments.push(footnote_fragment_text(candidate));
        consecutive_small += 1;
        end_idx = idx;
    }

    // A lead prefix without any following footnote line is not a region.
    if consecutive_small == 0 && lead_prefix.is_some() {
        return None;
    }

    let rows = parse_footnote_citation_rows(&fragments);
    if rows.len() < 3 {
        return None;
    }

    // Every marker must parse as a number.
    let numeric_markers = rows
        .iter()
        .filter_map(|(marker, _)| marker.parse::<u32>().ok())
        .collect::<Vec<_>>();
    if numeric_markers.len() != rows.len() {
        return None;
    }
    // Count adjacent pairs that increase by exactly one; tolerate one break.
    let sequential_steps = numeric_markers
        .windows(2)
        .filter(|pair| pair[1] == pair[0] + 1)
        .count();
    if sequential_steps + 1 < rows.len().saturating_sub(1) {
        return None;
    }

    let mut rendered_rows = vec![vec!["Footnote".to_string(), "Citation".to_string()]];
    rendered_rows.extend(
        rows.into_iter()
            .map(|(marker, citation)| vec![marker, citation]),
    );

    // Output: optional lead paragraph, blank line, then the HTML table.
    let mut rendered = String::new();
    if let Some(prefix) = lead_prefix {
        rendered.push_str(&escape_md_line_start(prefix.trim()));
        rendered.push_str("\n\n");
    }
    rendered.push_str(&render_html_table(&rendered_rows));

    Some(GeometricTableRegion {
        start_idx: region_start_idx,
        end_idx,
        rendered,
    })
}
9596
/// Walk backwards from `start_idx` to find the body-text element whose tail
/// carries the first footnote of the block.
///
/// At most six elements on page `page_number` are inspected. Non-text and
/// empty elements are skipped; small-font elements in the same column are
/// collected as additional footnote fragments. The scan ends at the first
/// same-column element with a larger (or unknown) font size: if
/// `split_trailing_footnote_lead` can split its text, the result is
/// `(index of that element, body-text prefix, fragments in document order)`.
///
/// Returns `None` when the scan runs off the start of the document, leaves
/// the page, exceeds the scan limit, meets an element from another column,
/// or the terminating element has no trailing footnote lead.
fn leading_footnote_attachment(
    doc: &PdfDocument,
    start_idx: usize,
    page_number: u32,
    column_bbox: &BoundingBox,
    small_font_threshold: f64,
) -> Option<(usize, String, Vec<String>)> {
    let mut idx = start_idx.checked_sub(1)?;
    // Collected nearest-first; reversed before returning.
    let mut leading_fragments = Vec::new();
    let mut scanned = 0usize;

    loop {
        let candidate = doc.kids.get(idx)?;
        scanned += 1;
        if scanned > 6 || candidate.page_number() != Some(page_number) {
            return None;
        }

        // Skip elements that are not text candidates.
        if !is_geometric_text_candidate(candidate) {
            if idx == 0 {
                return None;
            }
            idx -= 1;
            continue;
        }

        // Skip elements without visible text.
        let text = extract_element_text(candidate);
        let trimmed = text.trim();
        if trimmed.is_empty() {
            if idx == 0 {
                return None;
            }
            idx -= 1;
            continue;
        }
        if !same_column_region(candidate.bbox(), column_bbox) {
            return None;
        }

        // Another small-font line: part of the footnote block, keep walking.
        if element_font_size(candidate).is_some_and(|font_size| font_size <= small_font_threshold) {
            leading_fragments.push(footnote_fragment_text(candidate));
            if idx == 0 {
                return None;
            }
            idx -= 1;
            continue;
        }

        // First non-small element: it must end with a footnote lead.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed)?;
        leading_fragments.push(first_tail);
        leading_fragments.reverse();
        return Some((idx, prefix, leading_fragments));
    }
}
9651
9652fn parse_footnote_citation_rows(fragments: &[String]) -> Vec<(String, String)> {
9653    let mut rows = Vec::new();
9654    let mut current_marker = None::<String>;
9655    let mut current_citation = String::new();
9656
9657    for fragment in fragments {
9658        let markers = find_footnote_marker_positions(fragment);
9659        if markers.is_empty() {
9660            if current_marker.is_some() {
9661                merge_paragraph_text(&mut current_citation, fragment.trim());
9662            }
9663            continue;
9664        }
9665
9666        let mut cursor = 0usize;
9667        for (pos, marker, skip_len) in markers {
9668            let prefix = fragment[cursor..pos].trim();
9669            if current_marker.is_some() && !prefix.is_empty() {
9670                merge_paragraph_text(&mut current_citation, prefix);
9671            }
9672            if let Some(marker_value) = current_marker.take() {
9673                let trimmed = current_citation.trim();
9674                if !trimmed.is_empty() {
9675                    rows.push((marker_value, trimmed.to_string()));
9676                }
9677                current_citation.clear();
9678            }
9679            current_marker = Some(marker);
9680            cursor = pos + skip_len;
9681        }
9682
9683        let tail = fragment[cursor..].trim();
9684        if current_marker.is_some() && !tail.is_empty() {
9685            merge_paragraph_text(&mut current_citation, tail);
9686        }
9687    }
9688
9689    if let Some(marker_value) = current_marker {
9690        let trimmed = current_citation.trim();
9691        if !trimmed.is_empty() {
9692            rows.push((marker_value, trimmed.to_string()));
9693        }
9694    }
9695
9696    rebalance_adjacent_footnote_citations(&mut rows);
9697    rows
9698}
9699
9700fn rebalance_adjacent_footnote_citations(rows: &mut [(String, String)]) {
9701    for idx in 0..rows.len().saturating_sub(1) {
9702        if !rows[idx].1.trim_end().ends_with(',') {
9703            continue;
9704        }
9705
9706        let next = rows[idx + 1].1.trim().to_string();
9707        let Some((stub, remainder)) = split_leading_citation_stub(&next) else {
9708            continue;
9709        };
9710        let Some((first_sentence, trailing)) = split_first_sentence(remainder) else {
9711            continue;
9712        };
9713        if first_sentence.split_whitespace().count() < 2 {
9714            continue;
9715        }
9716
9717        merge_paragraph_text(&mut rows[idx].1, first_sentence);
9718        rows[idx + 1].1 = if trailing.is_empty() {
9719            stub.to_string()
9720        } else {
9721            format!("{stub} {trailing}")
9722        };
9723    }
9724}
9725
/// Split a citation at its first comma into a short leading stub (comma
/// included) and the remainder, both trimmed.
///
/// Returns `None` when there is no comma, when the comma lies more than
/// eight bytes into the text, or when nothing follows the comma.
fn split_leading_citation_stub(text: &str) -> Option<(&str, &str)> {
    let (before_comma, after_comma) = text.split_once(',')?;
    if before_comma.len() > 8 {
        return None;
    }

    // Keep the comma as part of the stub.
    let stub = text[..before_comma.len() + 1].trim();
    let remainder = after_comma.trim();
    if stub.is_empty() || remainder.is_empty() {
        None
    } else {
        Some((stub, remainder))
    }
}
9735
/// Split text after its first ". " into the first sentence (period included)
/// and whatever follows, both trimmed.
///
/// Returns `None` when the text contains no ". " sequence.
fn split_first_sentence(text: &str) -> Option<(&str, &str)> {
    let (head, rest) = text.split_once(". ")?;

    // Re-slice so the sentence keeps its terminating period.
    let first = text[..head.len() + 1].trim();
    if first.is_empty() {
        return None;
    }
    Some((first, rest.trim()))
}
9742
/// Locate footnote markers inside `text`.
///
/// A marker is a run of one or two ASCII digits that
/// * starts the text or follows whitespace or one of `. , ; : ) ] " ' ”`,
/// * is followed by at least one whitespace character, and
/// * is then followed by an ASCII uppercase letter, `(`, `[` or `*`.
///
/// Each result is `(byte offset of the marker, marker digits, byte length
/// from the marker start to the first character of the text after it)`.
fn find_footnote_marker_positions(text: &str) -> Vec<(usize, String, usize)> {
    /// If a marker starts at char index `start`, return the char indices of
    /// the end of its digit run and of the first character of its body.
    fn marker_span(indexed: &[(usize, char)], start: usize) -> Option<(usize, usize)> {
        if !indexed[start].1.is_ascii_digit() {
            return None;
        }

        // The marker must sit at a word/punctuation boundary.
        let at_boundary = match start.checked_sub(1) {
            None => true,
            Some(prev) => {
                let before = indexed[prev].1;
                before.is_whitespace()
                    || matches!(
                        before,
                        '.' | ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”'
                    )
            }
        };
        if !at_boundary {
            return None;
        }

        // Digits are ASCII, so the run length equals its byte length.
        let digit_run = indexed[start..]
            .iter()
            .take_while(|entry| entry.1.is_ascii_digit())
            .count();
        if digit_run > 2 {
            return None;
        }
        let digits_end = start + digit_run;

        // Whitespace must separate the marker from its body.
        let separated = indexed
            .get(digits_end)
            .is_some_and(|entry| entry.1.is_whitespace());
        if !separated {
            return None;
        }
        let gap = indexed[digits_end..]
            .iter()
            .take_while(|entry| entry.1.is_whitespace())
            .count();
        let body_start = digits_end + gap;

        let body_lead = indexed.get(body_start)?.1;
        let opens_citation = body_lead.is_ascii_uppercase() || matches!(body_lead, '(' | '[' | '*');
        opens_citation.then_some((digits_end, body_start))
    }

    let indexed: Vec<(usize, char)> = text.char_indices().collect();
    let mut markers = Vec::new();
    let mut cursor = 0usize;

    while cursor < indexed.len() {
        let Some((digits_end, body_start)) = marker_span(&indexed, cursor) else {
            cursor += 1;
            continue;
        };

        let marker_byte = indexed[cursor].0;
        let digits_end_byte = indexed[digits_end].0;
        let body_byte = indexed[body_start].0;
        markers.push((
            marker_byte,
            text[marker_byte..digits_end_byte].to_string(),
            body_byte - marker_byte,
        ));
        // Resume scanning at the first character of the marker's body.
        cursor = body_start;
    }

    markers
}
9803
9804fn split_trailing_footnote_lead(text: &str) -> Option<(String, String)> {
9805    let markers = find_footnote_marker_positions(text);
9806    let (pos, marker, skip_len) = markers.last()?.clone();
9807    let prefix = text[..pos].trim();
9808    let tail = text[pos + skip_len..].trim();
9809    if prefix.split_whitespace().count() < 6 || tail.split_whitespace().count() > 6 {
9810        return None;
9811    }
9812    Some((prefix.to_string(), format!("{marker} {tail}")))
9813}
9814
9815fn starts_with_footnote_marker(text: &str) -> bool {
9816    find_footnote_marker_positions(text)
9817        .first()
9818        .is_some_and(|(pos, _, _)| *pos == 0)
9819}
9820
9821fn same_column_region(left: &BoundingBox, right: &BoundingBox) -> bool {
9822    let overlap = (left.right_x.min(right.right_x) - left.left_x.max(right.left_x)).max(0.0);
9823    let min_width = left.width().min(right.width()).max(1.0);
9824    overlap / min_width >= 0.35 || (left.left_x - right.left_x).abs() <= 28.0
9825}
9826
9827fn footnote_fragment_text(element: &ContentElement) -> String {
9828    let text = extract_element_text(element);
9829    if element_font_name(element)
9830        .as_deref()
9831        .is_some_and(|name| name.to_ascii_lowercase().contains("italic"))
9832    {
9833        format!("*{}*", text.trim())
9834    } else {
9835        text
9836    }
9837}
9838
/// Return the font size recorded for a content element.
///
/// Paragraphs, headings and numbered headings store an optional size on
/// their base text node; text blocks and text lines always have one. Every
/// other element kind yields `None`.
fn element_font_size(element: &ContentElement) -> Option<f64> {
    match element {
        ContentElement::Paragraph(p) => p.base.font_size,
        ContentElement::Heading(h) => h.base.base.font_size,
        ContentElement::NumberHeading(nh) => nh.base.base.base.font_size,
        ContentElement::TextBlock(tb) => Some(tb.font_size),
        ContentElement::TextLine(tl) => Some(tl.font_size),
        _ => None,
    }
}
9849
/// Return a copy of the font name recorded for a content element.
///
/// Only paragraphs, headings and numbered headings are consulted; every
/// other element kind yields `None`.
fn element_font_name(element: &ContentElement) -> Option<String> {
    match element {
        ContentElement::Paragraph(p) => p.base.font_name.clone(),
        ContentElement::Heading(h) => h.base.base.font_name.clone(),
        ContentElement::NumberHeading(nh) => nh.base.base.base.font_name.clone(),
        _ => None,
    }
}
9858
9859fn table_border_from_element(
9860    element: &ContentElement,
9861) -> Option<&crate::models::table::TableBorder> {
9862    match element {
9863        ContentElement::TableBorder(table) => Some(table),
9864        ContentElement::Table(table) => Some(&table.table_border),
9865        _ => None,
9866    }
9867}
9868
9869fn build_geometric_table_region(
9870    doc: &PdfDocument,
9871    table_idx: usize,
9872    table: &crate::models::table::TableBorder,
9873) -> Option<GeometricTableRegion> {
9874    let mut table_rows = collect_table_border_rows(table);
9875    if table_rows.is_empty() || table.num_columns < 3 {
9876        return None;
9877    }
9878    merge_continuation_rows(&mut table_rows);
9879
9880    let column_ranges = table_column_ranges(table)?;
9881    let candidate_indices = collect_table_header_candidate_indices(doc, table_idx, table);
9882    if candidate_indices.is_empty() {
9883        return None;
9884    }
9885
9886    let needs_external_stub =
9887        infer_left_stub_requirement(doc, &candidate_indices, &table_rows, &column_ranges);
9888    let supports_embedded_stub_header =
9889        supports_embedded_stub_header(&table_rows, &column_ranges, doc, &candidate_indices);
9890    if !needs_external_stub && !supports_embedded_stub_header {
9891        return None;
9892    }
9893    let slot_ranges = if needs_external_stub {
9894        slot_ranges(&column_ranges, doc, &candidate_indices, true)?
9895    } else {
9896        column_ranges.clone()
9897    };
9898    let mut header_rows = reconstruct_aligned_rows(doc, &candidate_indices, &slot_ranges, true, 2);
9899    if header_rows.is_empty() {
9900        return None;
9901    }
9902    if needs_external_stub {
9903        normalize_leading_stub_header(&mut header_rows);
9904    } else {
9905        promote_embedded_stub_header(&mut header_rows, &table_rows);
9906    }
9907
9908    let slot_count = slot_ranges.len();
9909    let dense_header_rows = header_rows
9910        .iter()
9911        .filter(|row| {
9912            row.iter().filter(|cell| !cell.trim().is_empty()).count()
9913                >= slot_count.saturating_sub(1).max(2)
9914        })
9915        .count();
9916    if dense_header_rows == 0 {
9917        return None;
9918    }
9919
9920    let mut combined_rows = Vec::new();
9921    combined_rows.extend(header_rows);
9922
9923    let following_indices = collect_table_footer_candidate_indices(doc, table_idx, table);
9924    let body_rows = if needs_external_stub && should_merge_panel_body_rows(&table_rows) {
9925        let trailing_rows =
9926            reconstruct_aligned_rows(doc, &following_indices, &slot_ranges, false, 1);
9927        vec![merge_panel_body_row(
9928            &table_rows,
9929            &trailing_rows,
9930            slot_count,
9931        )]
9932    } else if needs_external_stub {
9933        table_rows
9934            .iter()
9935            .map(|row| {
9936                let mut shifted = vec![String::new()];
9937                shifted.extend(row.iter().cloned());
9938                shifted
9939            })
9940            .collect()
9941    } else {
9942        table_rows
9943    };
9944
9945    if body_rows.is_empty() {
9946        return None;
9947    }
9948    combined_rows.extend(body_rows);
9949
9950    let rendered = render_pipe_rows(&combined_rows);
9951    Some(GeometricTableRegion {
9952        start_idx: candidate_indices[0],
9953        end_idx: following_indices.last().copied().unwrap_or(table_idx),
9954        rendered,
9955    })
9956}
9957
9958fn table_column_ranges(table: &crate::models::table::TableBorder) -> Option<Vec<(f64, f64)>> {
9959    if table.num_columns == 0 {
9960        return None;
9961    }
9962
9963    let mut ranges = vec![(f64::INFINITY, f64::NEG_INFINITY); table.num_columns];
9964    for row in &table.rows {
9965        for cell in &row.cells {
9966            if cell.col_number >= table.num_columns {
9967                continue;
9968            }
9969            let range = &mut ranges[cell.col_number];
9970            range.0 = range.0.min(cell.bbox.left_x);
9971            range.1 = range.1.max(cell.bbox.right_x);
9972        }
9973    }
9974
9975    if ranges
9976        .iter()
9977        .any(|(left, right)| !left.is_finite() || !right.is_finite() || right <= left)
9978    {
9979        return None;
9980    }
9981
9982    Some(ranges)
9983}
9984
9985fn collect_table_header_candidate_indices(
9986    doc: &PdfDocument,
9987    table_idx: usize,
9988    table: &crate::models::table::TableBorder,
9989) -> Vec<usize> {
9990    let mut indices = Vec::new();
9991    let table_page = table.bbox.page_number;
9992    let table_top = table.bbox.top_y;
9993    let mut cursor = table_idx;
9994
9995    while let Some(prev_idx) = cursor.checked_sub(1) {
9996        let element = &doc.kids[prev_idx];
9997        if element.page_number() != table_page {
9998            break;
9999        }
10000        if !is_geometric_text_candidate(element) {
10001            break;
10002        }
10003
10004        let bbox = element.bbox();
10005        let vertical_gap = bbox.bottom_y - table_top;
10006        if !(-6.0..=260.0).contains(&vertical_gap) {
10007            break;
10008        }
10009
10010        indices.push(prev_idx);
10011        cursor = prev_idx;
10012        if indices.len() >= 10 {
10013            break;
10014        }
10015    }
10016
10017    indices.reverse();
10018    indices
10019}
10020
10021fn collect_table_footer_candidate_indices(
10022    doc: &PdfDocument,
10023    table_idx: usize,
10024    table: &crate::models::table::TableBorder,
10025) -> Vec<usize> {
10026    let mut indices = Vec::new();
10027    let table_page = table.bbox.page_number;
10028    let table_bottom = table.bbox.bottom_y;
10029
10030    for idx in table_idx + 1..doc.kids.len() {
10031        let element = &doc.kids[idx];
10032        if element.page_number() != table_page {
10033            break;
10034        }
10035        if !is_geometric_text_candidate(element) {
10036            break;
10037        }
10038        if looks_like_margin_page_number(doc, element, &extract_element_text(element)) {
10039            break;
10040        }
10041
10042        let bbox = element.bbox();
10043        let gap = table_bottom - bbox.top_y;
10044        if !(-6.0..=28.0).contains(&gap) {
10045            break;
10046        }
10047        indices.push(idx);
10048        if indices.len() >= 4 {
10049            break;
10050        }
10051    }
10052
10053    indices
10054}
10055
10056fn is_geometric_text_candidate(element: &ContentElement) -> bool {
10057    matches!(
10058        element,
10059        ContentElement::Paragraph(_)
10060            | ContentElement::Heading(_)
10061            | ContentElement::NumberHeading(_)
10062            | ContentElement::TextBlock(_)
10063            | ContentElement::TextLine(_)
10064    )
10065}
10066
10067fn infer_left_stub_requirement(
10068    doc: &PdfDocument,
10069    candidate_indices: &[usize],
10070    table_rows: &[Vec<String>],
10071    column_ranges: &[(f64, f64)],
10072) -> bool {
10073    if column_ranges.is_empty() {
10074        return false;
10075    }
10076
10077    let first_width = (column_ranges[0].1 - column_ranges[0].0).max(1.0);
10078    let has_left_label = candidate_indices.iter().any(|idx| {
10079        let bbox = doc.kids[*idx].bbox();
10080        bbox.right_x <= column_ranges[0].0 + first_width * 0.12
10081            && bbox.width() <= first_width * 0.45
10082    });
10083    if !has_left_label {
10084        return false;
10085    }
10086
10087    let mut first_col_word_counts: Vec<usize> = table_rows
10088        .iter()
10089        .filter_map(|row| row.first())
10090        .map(|cell| cell.split_whitespace().count())
10091        .collect();
10092    if first_col_word_counts.is_empty() {
10093        return false;
10094    }
10095    first_col_word_counts.sort_unstable();
10096    let median = first_col_word_counts[first_col_word_counts.len() / 2];
10097    median >= 5
10098}
10099
10100fn supports_embedded_stub_header(
10101    table_rows: &[Vec<String>],
10102    column_ranges: &[(f64, f64)],
10103    doc: &PdfDocument,
10104    candidate_indices: &[usize],
10105) -> bool {
10106    if table_rows.len() < 2 || column_ranges.len() < 3 {
10107        return false;
10108    }
10109
10110    let first_row = &table_rows[0];
10111    if first_row.len() != column_ranges.len() || first_row[0].trim().is_empty() {
10112        return false;
10113    }
10114    if first_row[0].split_whitespace().count() > 3 || first_row[0].trim().len() > 24 {
10115        return false;
10116    }
10117
10118    let data_fill = first_row
10119        .iter()
10120        .skip(1)
10121        .filter(|cell| !cell.trim().is_empty())
10122        .count();
10123    if data_fill + 1 < column_ranges.len() {
10124        return false;
10125    }
10126
10127    let labeled_rows = table_rows
10128        .iter()
10129        .skip(1)
10130        .filter(|row| row.first().is_some_and(|cell| !cell.trim().is_empty()))
10131        .count();
10132    if labeled_rows == 0 {
10133        return false;
10134    }
10135
10136    let slot_ranges = column_ranges.to_vec();
10137    let header_rows = reconstruct_aligned_rows(doc, candidate_indices, &slot_ranges, true, 2);
10138    header_rows.iter().any(|row| {
10139        row.first().is_none_or(|cell| cell.trim().is_empty())
10140            && row
10141                .iter()
10142                .skip(1)
10143                .filter(|cell| !cell.trim().is_empty())
10144                .count()
10145                >= column_ranges.len().saturating_sub(1)
10146    })
10147}
10148
10149fn slot_ranges(
10150    column_ranges: &[(f64, f64)],
10151    doc: &PdfDocument,
10152    candidate_indices: &[usize],
10153    needs_stub: bool,
10154) -> Option<Vec<(f64, f64)>> {
10155    let mut slots = Vec::new();
10156    if needs_stub {
10157        let first_left = column_ranges.first()?.0;
10158        let left_stub_start = candidate_indices
10159            .iter()
10160            .map(|idx| doc.kids[*idx].bbox().left_x)
10161            .fold(first_left, f64::min);
10162        let stub_right = first_left - 1.0;
10163        if stub_right <= left_stub_start {
10164            return None;
10165        }
10166        slots.push((left_stub_start, stub_right));
10167    }
10168    slots.extend(column_ranges.iter().copied());
10169    Some(slots)
10170}
10171
10172fn reconstruct_aligned_rows(
10173    doc: &PdfDocument,
10174    candidate_indices: &[usize],
10175    slot_ranges: &[(f64, f64)],
10176    drop_wide_singletons: bool,
10177    min_filled_slots: usize,
10178) -> Vec<Vec<String>> {
10179    if candidate_indices.is_empty() || slot_ranges.is_empty() {
10180        return Vec::new();
10181    }
10182
10183    let mut row_bands: Vec<(BoundingBox, Vec<String>)> = Vec::new();
10184
10185    for idx in candidate_indices {
10186        for line in extract_chunk_lines(&doc.kids[*idx]) {
10187            let fragments = split_line_into_slot_fragments(&line, slot_ranges);
10188            if fragments.is_empty() {
10189                continue;
10190            }
10191
10192            if drop_wide_singletons && fragments.len() == 1 {
10193                let only = &fragments[0];
10194                let span_width = only.bbox.width();
10195                let table_width =
10196                    slot_ranges.last().map(|(_, right)| *right).unwrap_or(0.0) - slot_ranges[0].0;
10197                if span_width >= table_width * 0.55 {
10198                    continue;
10199                }
10200            }
10201
10202            let line_center = line.bbox.center_y();
10203            let tolerance = line
10204                .chunks
10205                .iter()
10206                .map(|chunk| chunk.font_size)
10207                .fold(8.0, f64::max)
10208                * 0.8;
10209
10210            let mut target_row = None;
10211            for (row_idx, (bbox, _)) in row_bands.iter().enumerate() {
10212                if (bbox.center_y() - line_center).abs() <= tolerance {
10213                    target_row = Some(row_idx);
10214                    break;
10215                }
10216            }
10217
10218            if let Some(row_idx) = target_row {
10219                let (bbox, cells) = &mut row_bands[row_idx];
10220                *bbox = bbox.union(&line.bbox);
10221                for fragment in fragments {
10222                    append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10223                }
10224            } else {
10225                let mut cells = vec![String::new(); slot_ranges.len()];
10226                for fragment in fragments {
10227                    append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10228                }
10229                row_bands.push((line.bbox.clone(), cells));
10230            }
10231        }
10232    }
10233
10234    row_bands.sort_by(|left, right| {
10235        right
10236            .0
10237            .top_y
10238            .partial_cmp(&left.0.top_y)
10239            .unwrap_or(std::cmp::Ordering::Equal)
10240    });
10241
10242    row_bands
10243        .into_iter()
10244        .map(|(_, cells)| cells)
10245        .filter(|cells| {
10246            let filled = cells.iter().filter(|cell| !cell.trim().is_empty()).count();
10247            filled >= min_filled_slots
10248        })
10249        .collect()
10250}
10251
10252fn extract_chunk_lines(element: &ContentElement) -> Vec<ChunkLine> {
10253    match element {
10254        ContentElement::Paragraph(p) => chunk_lines_from_semantic_node(&p.base),
10255        ContentElement::Heading(h) => chunk_lines_from_semantic_node(&h.base.base),
10256        ContentElement::NumberHeading(nh) => chunk_lines_from_semantic_node(&nh.base.base.base),
10257        ContentElement::TextBlock(tb) => tb
10258            .text_lines
10259            .iter()
10260            .map(|line| ChunkLine {
10261                bbox: line.bbox.clone(),
10262                chunks: line.text_chunks.clone(),
10263            })
10264            .collect(),
10265        ContentElement::TextLine(tl) => vec![ChunkLine {
10266            bbox: tl.bbox.clone(),
10267            chunks: tl.text_chunks.clone(),
10268        }],
10269        _ => Vec::new(),
10270    }
10271}
10272
10273fn chunk_lines_from_semantic_node(node: &SemanticTextNode) -> Vec<ChunkLine> {
10274    let mut lines = Vec::new();
10275    for column in &node.columns {
10276        for block in &column.text_blocks {
10277            for line in &block.text_lines {
10278                lines.push(ChunkLine {
10279                    bbox: line.bbox.clone(),
10280                    chunks: line.text_chunks.clone(),
10281                });
10282            }
10283        }
10284    }
10285    lines
10286}
10287
10288fn split_line_into_slot_fragments(
10289    line: &ChunkLine,
10290    slot_ranges: &[(f64, f64)],
10291) -> Vec<SlotFragment> {
10292    let mut groups: Vec<(usize, Vec<TextChunk>, BoundingBox)> = Vec::new();
10293
10294    for chunk in line
10295        .chunks
10296        .iter()
10297        .filter(|chunk| !chunk.value.trim().is_empty())
10298        .cloned()
10299    {
10300        let slot_idx = assign_chunk_to_slot(&chunk.bbox, slot_ranges);
10301        if let Some((prev_slot, prev_chunks, prev_bbox)) = groups.last_mut() {
10302            let gap = chunk.bbox.left_x - prev_bbox.right_x;
10303            if *prev_slot == slot_idx && gap <= chunk.font_size.max(6.0) * 2.4 {
10304                *prev_bbox = prev_bbox.union(&chunk.bbox);
10305                prev_chunks.push(chunk);
10306                continue;
10307            }
10308        }
10309        groups.push((slot_idx, vec![chunk.clone()], chunk.bbox.clone()));
10310    }
10311
10312    groups
10313        .into_iter()
10314        .filter_map(|(slot_idx, chunks, bbox)| {
10315            let text = normalize_common_ocr_text(
10316                &crate::models::text::TextLine::concatenate_chunks(&chunks),
10317            );
10318            if text.trim().is_empty() {
10319                None
10320            } else {
10321                Some(SlotFragment {
10322                    slot_idx,
10323                    bbox,
10324                    text,
10325                })
10326            }
10327        })
10328        .collect()
10329}
10330
10331fn assign_chunk_to_slot(bbox: &BoundingBox, slot_ranges: &[(f64, f64)]) -> usize {
10332    let mut best_idx = 0usize;
10333    let mut best_overlap = f64::NEG_INFINITY;
10334    let center_x = bbox.center_x();
10335
10336    for (idx, (left, right)) in slot_ranges.iter().enumerate() {
10337        let overlap = (bbox.right_x.min(*right) - bbox.left_x.max(*left)).max(0.0);
10338        let score = if overlap > 0.0 {
10339            overlap / bbox.width().max(1.0)
10340        } else {
10341            -((center_x - ((*left + *right) / 2.0)).abs())
10342        };
10343        if score > best_overlap {
10344            best_overlap = score;
10345            best_idx = idx;
10346        }
10347    }
10348
10349    best_idx
10350}
10351
/// Appends the trimmed `fragment` to `cell`, inserting a single space first
/// when the cell already holds text. Blank fragments leave the cell untouched.
fn append_cell_text(cell: &mut String, fragment: &str) {
    match (cell.is_empty(), fragment.trim()) {
        (_, "") => {}
        (true, piece) => cell.push_str(piece),
        (false, piece) => {
            cell.push(' ');
            cell.push_str(piece);
        }
    }
}
10362
/// Moves a stub label that landed on the second header row up to the first.
///
/// Applies only when the first row's leading cell is blank, the second row's
/// leading cell is not, and both rows have at least two non-blank cells after
/// the leading one. The (trimmed) label is written to `rows[0][0]` and
/// `rows[1][0]` is cleared. In every other case `rows` is left unchanged.
fn normalize_leading_stub_header(rows: &mut [Vec<String>]) {
    /// Number of non-blank cells after the leading (stub) cell.
    fn filled_after_stub(row: &[String]) -> usize {
        row.iter()
            .skip(1)
            .filter(|cell| !cell.trim().is_empty())
            .count()
    }

    let [top, below, ..] = rows else {
        return;
    };
    if top.is_empty() || below.is_empty() {
        return;
    }

    let top_has_label = !top[0].trim().is_empty();
    let below_has_label = !below[0].trim().is_empty();
    if top_has_label || !below_has_label {
        return;
    }
    if filled_after_stub(top) < 2 || filled_after_stub(below) < 2 {
        return;
    }

    top[0] = below[0].trim().to_string();
    below[0].clear();
}
10389
/// Copies the first body row's leading cell into the blank leading cell of
/// the first header row.
///
/// The copy happens only when the header's leading cell is blank, the body
/// label is short (non-empty, at most three words and 24 bytes after
/// trimming), and every cell after the leading one is non-blank in both the
/// header row and the body row. `table_rows` is never modified.
fn promote_embedded_stub_header(header_rows: &mut [Vec<String>], table_rows: &[Vec<String>]) {
    /// True when every cell after the leading one holds non-blank text.
    fn data_cells_all_filled(row: &[String]) -> bool {
        row.iter().skip(1).all(|cell| !cell.trim().is_empty())
    }

    let (Some(header), Some(body)) = (header_rows.first_mut(), table_rows.first()) else {
        return;
    };
    let (Some(header_stub), Some(body_stub)) = (header.first(), body.first()) else {
        return;
    };
    if !header_stub.trim().is_empty() {
        return;
    }

    let label = body_stub.trim();
    let label_too_long = label.split_whitespace().count() > 3 || label.len() > 24;
    if label.is_empty() || label_too_long {
        return;
    }

    if !data_cells_all_filled(header) || !data_cells_all_filled(body) {
        return;
    }

    header[0] = label.to_string();
}
10427
/// Returns true when the grid has at least three rows and every row is
/// non-empty with no blank cells — the shape that is collapsed into a single
/// "panel" body row.
fn should_merge_panel_body_rows(rows: &[Vec<String>]) -> bool {
    if rows.len() < 3 {
        return false;
    }
    let has_gap = rows
        .iter()
        .any(|row| row.is_empty() || row.iter().any(|cell| cell.trim().is_empty()));
    !has_gap
}
10434
10435fn merge_panel_body_row(
10436    table_rows: &[Vec<String>],
10437    trailing_rows: &[Vec<String>],
10438    slot_count: usize,
10439) -> Vec<String> {
10440    let mut merged = vec![String::new(); slot_count];
10441    for row in table_rows {
10442        for (col_idx, cell) in row.iter().enumerate() {
10443            if col_idx + 1 >= slot_count {
10444                break;
10445            }
10446            append_cell_text(&mut merged[col_idx + 1], cell);
10447        }
10448    }
10449    for row in trailing_rows {
10450        for (col_idx, cell) in row.iter().enumerate() {
10451            if col_idx >= slot_count {
10452                break;
10453            }
10454            append_cell_text(&mut merged[col_idx], cell);
10455        }
10456    }
10457    merged
10458}
10459
/// Renders `rows` as a Markdown pipe table followed by a blank line.
///
/// The first row is treated as the header and is followed by a `---`
/// separator row. Every row is padded with empty cells to the width of the
/// widest row, and each cell is trimmed. Returns an empty string when there
/// are no rows or every row is empty.
///
/// Literal `|` characters inside a cell are escaped as `\|` so that cell text
/// cannot be mistaken for a column delimiter; a pipe that is already preceded
/// by an escaping backslash is left as is.
fn render_pipe_rows(rows: &[Vec<String>]) -> String {
    /// Appends `cell` to `out`, escaping pipes that are not already escaped.
    fn push_escaped_cell(out: &mut String, cell: &str) {
        // True while the previous character was a backslash that itself was
        // not escaped, i.e. one that escapes the current character.
        let mut escaped = false;
        for ch in cell.chars() {
            if ch == '|' && !escaped {
                out.push('\\');
            }
            escaped = ch == '\\' && !escaped;
            out.push(ch);
        }
    }

    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
    if num_cols == 0 {
        return String::new();
    }

    let mut out = String::new();
    for (row_idx, row) in rows.iter().enumerate() {
        out.push('|');
        for col_idx in 0..num_cols {
            let cell = row.get(col_idx).map_or("", |cell| cell.trim());
            out.push(' ');
            push_escaped_cell(&mut out, cell);
            out.push_str(" |");
        }
        out.push('\n');

        // The separator row goes directly under the header row.
        if row_idx == 0 {
            out.push('|');
            out.push_str(&" --- |".repeat(num_cols));
            out.push('\n');
        }
    }
    out.push('\n');
    out
}
10490
10491fn render_html_table(rows: &[Vec<String>]) -> String {
10492    if rows.is_empty() {
10493        return String::new();
10494    }
10495
10496    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
10497    if num_cols == 0 {
10498        return String::new();
10499    }
10500
10501    let mut out = String::from("<table>\n");
10502    for (row_idx, row) in rows.iter().enumerate() {
10503        out.push_str("<tr>");
10504        for col_idx in 0..num_cols {
10505            let cell = escape_html_text(row.get(col_idx).map(String::as_str).unwrap_or("").trim());
10506            if row_idx == 0 {
10507                out.push_str("<th>");
10508                out.push_str(&cell);
10509                out.push_str("</th>");
10510            } else {
10511                out.push_str("<td>");
10512                out.push_str(&cell);
10513                out.push_str("</td>");
10514            }
10515        }
10516        out.push_str("</tr>\n");
10517    }
10518    out.push_str("</table>\n\n");
10519    out
10520}
10521
/// Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) in
/// `text` with their entity forms; all other characters are copied verbatim.
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
10529
/// Extracts the ASCII digits from `text` (dropping every other character) and
/// returns them when there are exactly one or two; otherwise returns `None`.
fn normalized_numeric_marker(text: &str) -> Option<String> {
    let digits: String = text.chars().filter(char::is_ascii_digit).collect();
    match digits.len() {
        1 | 2 => Some(digits),
        _ => None,
    }
}
10537
10538fn render_infographic_card_rows(rows: &[Vec<String>]) -> Option<String> {
10539    if rows.is_empty() || !rows.iter().all(|row| row.len() == 2) {
10540        return None;
10541    }
10542
10543    let marker = normalized_numeric_marker(rows[0][0].trim())?;
10544    if rows[0][1].split_whitespace().count() < 4 {
10545        return None;
10546    }
10547    if rows
10548        .iter()
10549        .skip(1)
10550        .any(|row| normalized_numeric_marker(row[0].trim()).is_some())
10551    {
10552        return None;
10553    }
10554    if rows
10555        .iter()
10556        .skip(1)
10557        .any(|row| !row[0].trim().is_empty() && row[0].trim().len() > 2)
10558    {
10559        return None;
10560    }
10561
10562    let body = rows
10563        .iter()
10564        .filter_map(|row| row.get(1))
10565        .map(|cell| cell.trim())
10566        .filter(|cell| !cell.is_empty())
10567        .collect::<Vec<_>>()
10568        .join(" ");
10569    if body.split_whitespace().count() < 8 {
10570        return None;
10571    }
10572
10573    Some(format!("{marker}. {body}\n\n"))
10574}
10575
10576fn extract_element_text(element: &ContentElement) -> String {
10577    match element {
10578        ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
10579        ContentElement::Heading(h) => clean_paragraph_text(&h.base.base.value()),
10580        ContentElement::NumberHeading(nh) => clean_paragraph_text(&nh.base.base.base.value()),
10581        ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
10582        ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
10583        _ => String::new(),
10584    }
10585}
10586
10587/// Collect rendered rows from a single TableBorder (no cross-page chaining).
10588fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
10589    let num_cols = table.num_columns.max(1);
10590    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
10591    for row in &table.rows {
10592        let cell_texts: Vec<String> = (0..num_cols)
10593            .map(|col| {
10594                row.cells
10595                    .iter()
10596                    .find(|c| c.col_number == col)
10597                    .map(cell_text_content)
10598                    .unwrap_or_default()
10599            })
10600            .collect();
10601        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
10602            rendered_rows.push(cell_texts);
10603        }
10604    }
10605    rendered_rows
10606}
10607
10608/// Render a TableBorder directly as a markdown table.
10609///
10610/// When the table has a `next_table` link (cross-page continuation), the
10611/// continuation rows are appended so the entire logical table is emitted
10612/// as a single pipe table.
10613fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
10614    if table.rows.is_empty() {
10615        return;
10616    }
10617
10618    // Collect rows from this table.
10619    let mut rendered_rows = collect_table_border_rows(table);
10620
10621    if rendered_rows.is_empty() {
10622        return;
10623    }
10624
10625    if let Some(rendered) = render_infographic_card_rows(&rendered_rows) {
10626        out.push_str(&rendered);
10627        return;
10628    }
10629
10630    // Merge multi-line header rows into a single header row.
10631    merge_continuation_rows(&mut rendered_rows);
10632    trim_leading_table_carryover_rows(&mut rendered_rows);
10633
10634    // ToC detection: render table-of-contents as plain text pairs, not a markdown table.
10635    if is_toc_table(&rendered_rows) {
10636        render_toc_rows(out, &rendered_rows);
10637        return;
10638    }
10639
10640    out.push_str(&render_pipe_rows(&rendered_rows));
10641}
10642
/// Returns true if `text` looks like a page number.
///
/// After trimming, two forms are accepted:
/// * one to five ASCII digits (pages 1–99999);
/// * a well-formed Roman numeral of at most ten letters, in either case
///   (`iv`, `XII`, `xlii`, …).
///
/// Roman numerals are validated structurally (thousands, then hundreds, tens
/// and units, each allowing the subtractive forms such as `iv`/`ix`), so
/// ordinary words that merely consist of the letters `ivxlcdm` — "did",
/// "mild", "civil" — are rejected. Additive forms like `iiii` are rejected
/// as well.
fn is_page_number_like(text: &str) -> bool {
    /// Consumes one decimal place of a Roman numeral from the front of
    /// `digits` and returns the remainder. `one`, `five` and `ten` are the
    /// letters for that place (e.g. `i`, `v`, `x` for units). Accepts
    /// `one ten` (9), `one five` (4), or an optional `five` followed by up to
    /// three `one`s; consuming nothing is valid (the place is zero).
    fn strip_roman_place(digits: &[u8], one: u8, five: u8, ten: u8) -> &[u8] {
        match digits {
            [first, second, rest @ ..]
                if *first == one && (*second == five || *second == ten) =>
            {
                rest
            }
            _ => {
                let after_five = match digits {
                    [first, rest @ ..] if *first == five => rest,
                    _ => digits,
                };
                let ones = after_five
                    .iter()
                    .take(3)
                    .take_while(|&&b| b == one)
                    .count();
                &after_five[ones..]
            }
        }
    }

    let t = text.trim();
    if t.is_empty() {
        return false;
    }
    // All ASCII digits, length ≤ 5 (handles pages 1–99999).
    if t.len() <= 5 && t.bytes().all(|b| b.is_ascii_digit()) {
        return true;
    }
    // Roman numerals used for pagination are short; cap the length.
    if t.len() > 10 {
        return false;
    }

    let lower = t.to_ascii_lowercase();
    let bytes = lower.as_bytes();
    let thousands = bytes.iter().take_while(|&&b| b == b'm').count();
    let rest = &bytes[thousands..];
    let rest = strip_roman_place(rest, b'c', b'd', b'm');
    let rest = strip_roman_place(rest, b'x', b'l', b'c');
    let rest = strip_roman_place(rest, b'i', b'v', b'x');
    // Anything left over means the text was not a well-formed numeral.
    rest.is_empty()
}
10660
10661/// Returns true if the rendered rows look like a table-of-contents:
10662/// exactly 2 columns where the majority of right-column cells are page numbers.
10663fn is_toc_table(rows: &[Vec<String>]) -> bool {
10664    if rows.is_empty() {
10665        return false;
10666    }
10667    // Need at least 2 rows to qualify as a ToC
10668    if rows.len() < 2 {
10669        return false;
10670    }
10671    // First, every row must have exactly 2 cells
10672    if !rows.iter().all(|r| r.len() == 2) {
10673        return false;
10674    }
10675
10676    let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
10677    if non_empty_right < 2 {
10678        return false;
10679    }
10680
10681    let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
10682    page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
10683}
10684
/// Writes table-of-contents rows to `out` as plain text, one `title page`
/// line per row, followed by a blank line.
///
/// Both cells are trimmed. Rows where both are blank are skipped; when only
/// one is present it is written on its own without a separating space.
/// Every row must have at least two cells.
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let title = row[0].trim();
        let page = row[1].trim();

        match (title.is_empty(), page.is_empty()) {
            (true, true) => continue,
            (false, false) => {
                out.push_str(title);
                out.push(' ');
                out.push_str(page);
            }
            // Exactly one side has text; the other contributes nothing.
            _ => {
                out.push_str(title);
                out.push_str(page);
            }
        }
        out.push('\n');
    }
    out.push('\n');
}
10705
10706/// Extract text content from a table cell.
10707fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
10708    // First try the content tokens — use gap-based concatenation instead of
10709    // naive space-joining so that letter-spaced text ("O w n e r s h i p")
10710    // is collapsed correctly.
10711    if !cell.content.is_empty() {
10712        let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
10713        return normalize_common_ocr_text(&crate::models::text::TextLine::concatenate_chunks(
10714            &chunks,
10715        ));
10716    }
10717    // Fall back to processed contents
10718    let mut text = String::new();
10719    for elem in &cell.contents {
10720        match elem {
10721            ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
10722            ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
10723            ContentElement::TextLine(tl) => text.push_str(&tl.value()),
10724            ContentElement::TextChunk(tc) => text.push_str(&tc.value),
10725            _ => {}
10726        }
10727    }
10728    normalize_common_ocr_text(&repair_fragmented_words(&text))
10729}
10730
/// Merge adjacent pipe tables that look like fragments of one visual table.
///
/// PDF table detection sometimes splits one visual table into several
/// fragments that are emitted as successive pipe tables.  Two consecutive
/// tables are merged when the gap between them contains
///
/// * only blank lines, or
/// * one or two heading lines (`#…`, shorter than 100 bytes), or
/// * a single short plain-text fragment (shorter than 30 bytes, not a
///   heading/list marker, no `:` and no `TABLE`),
///
/// where the last two cases additionally require both sides to have at least
/// two columns.  A merge is refused when the second table carries its own,
/// distinct header row.
///
/// Column counts may differ: every row of a merged group is padded with empty
/// cells up to the widest member.  The second table's separator is dropped and
/// its header row becomes a body row.  Non-blank gap lines are turned into
/// single-cell rows; heading lines with no text after the `#` markers are
/// dropped, because emitting them verbatim would split the merged table.
///
/// Inputs with fewer than four lines or fewer than two tables are returned
/// unchanged.  Otherwise the document is rebuilt line by line: every emitted
/// line is terminated with `\n`, table rows are trimmed, and separator rows
/// are normalized to the `| --- | … |` form.
fn merge_adjacent_pipe_tables(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    if lines.len() < 4 {
        return markdown.to_string();
    }

    /// Number of cells in a pipe row, or 0 when the line is not `|…|` shaped.
    fn count_pipe_cols(line: &str) -> usize {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return 0;
        }
        t.split('|').count().saturating_sub(2)
    }

    /// True for a header separator row such as `| --- | :-: |`.
    fn is_separator(line: &str) -> bool {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return false;
        }
        let cells: Vec<&str> = t.split('|').collect();
        if cells.len() < 3 {
            return false;
        }
        cells[1..cells.len() - 1].iter().all(|c| {
            let s = c.trim();
            !s.is_empty() && s.chars().all(|ch| ch == '-' || ch == ':')
        })
    }

    /// True for any `|…|` line (separator rows included).
    fn is_pipe_row(line: &str) -> bool {
        let t = line.trim();
        t.starts_with('|') && t.ends_with('|') && t.len() > 2
    }

    /// Trimmed cell texts of a pipe row; empty when the line is not a pipe row.
    fn pipe_cells(line: &str) -> Vec<String> {
        let t = line.trim();
        if !is_pipe_row(t) {
            return Vec::new();
        }
        let parts: Vec<&str> = t.split('|').collect();
        parts[1..parts.len() - 1]
            .iter()
            .map(|cell| cell.trim().to_string())
            .collect()
    }

    /// Lowercased alphanumeric-only form of a cell, used to compare headers.
    fn normalize_header_cell(cell: &str) -> String {
        cell.chars()
            .filter(|ch| ch.is_alphanumeric())
            .flat_map(|ch| ch.to_lowercase())
            .collect()
    }

    /// Normalized header cells of a pipe row.
    fn normalized_cells(line: &str) -> Vec<String> {
        pipe_cells(line)
            .iter()
            .map(|cell| normalize_header_cell(cell))
            .collect()
    }

    /// Heuristic: at least two non-empty cells, each a short label
    /// (contains a letter, at most 4 words, at most 28 bytes).
    fn looks_like_header_row(line: &str) -> bool {
        let cells = pipe_cells(line);
        if cells.len() < 2 {
            return false;
        }
        let non_empty: Vec<&str> = cells
            .iter()
            .map(|cell| cell.trim())
            .filter(|cell| !cell.is_empty())
            .collect();
        non_empty.len() >= 2
            && non_empty.iter().all(|cell| {
                cell.chars().any(|ch| ch.is_alphabetic())
                    && cell.split_whitespace().count() <= 4
                    && cell.len() <= 28
            })
    }

    /// Fraction of positionally aligned, non-empty header cells that are equal
    /// after normalization (relative to the narrower of the two rows).
    fn header_overlap_ratio(left: &str, right: &str) -> f64 {
        let left_cells = normalized_cells(left);
        let right_cells = normalized_cells(right);
        let width = left_cells.len().min(right_cells.len());
        if width == 0 {
            return 0.0;
        }
        let matches = left_cells
            .iter()
            .zip(right_cells.iter())
            .filter(|(l, r)| !l.is_empty() && !r.is_empty() && l == r)
            .count();
        matches as f64 / width as f64
    }

    /// True when both rows have the same width (>= 2) and every pair of
    /// non-empty aligned cells is equal, with at least two such pairs.
    fn header_schema_matches(left: &str, right: &str) -> bool {
        let left_cells = normalized_cells(left);
        let right_cells = normalized_cells(right);
        if left_cells.len() != right_cells.len() || left_cells.len() < 2 {
            return false;
        }

        let mut aligned_non_empty = 0usize;
        for (l, r) in left_cells.iter().zip(right_cells.iter()) {
            if l.is_empty() || r.is_empty() {
                continue;
            }
            if l != r {
                return false;
            }
            aligned_non_empty += 1;
        }
        aligned_non_empty >= 2
    }

    /// Trim the row and append empty cells until it has `target_cols` cells.
    fn pad_pipe_row(line: &str, target_cols: usize) -> String {
        let t = line.trim();
        let current_cols = count_pipe_cols(t);
        let mut padded = t.to_string();
        // Extra empty cells go after the existing trailing `|`.
        for _ in current_cols..target_cols {
            padded.push_str("  |");
        }
        padded
    }

    /// One pipe table: header line, separator line, and inclusive last row.
    struct Block {
        start: usize,
        sep: usize,
        end: usize, // inclusive last line
        cols: usize,
    }

    /// What the output pass does with a given input line.
    #[derive(Clone, Copy)]
    enum LineRole {
        /// Copy the line unchanged.
        Verbatim,
        /// Drop the line (blank gap line or separator of a merged table).
        Skip,
        /// Non-blank gap line: becomes a single-cell row padded to N columns.
        GapRow(usize),
        /// Line of a table block whose group is N columns wide.
        TableLine(usize),
    }

    // Identify pipe table blocks: a pipe row followed by a separator, then
    // every following non-separator pipe row.
    let mut blocks: Vec<Block> = Vec::new();
    let mut i = 0;
    while i < lines.len() {
        if i + 1 < lines.len() && is_pipe_row(lines[i]) && is_separator(lines[i + 1]) {
            let sep = i + 1;
            let mut end = sep;
            let mut j = sep + 1;
            while j < lines.len() && is_pipe_row(lines[j]) && !is_separator(lines[j]) {
                end = j;
                j += 1;
            }
            blocks.push(Block {
                start: i,
                sep,
                end,
                cols: count_pipe_cols(lines[i]),
            });
            i = end + 1;
        } else {
            i += 1;
        }
    }

    if blocks.len() < 2 {
        return markdown.to_string();
    }

    // Group adjacent blocks.  `merge_leader[bi]` is the first block of the
    // group `bi` was merged into; `group_cols[leader]` tracks the widest
    // member so far and is also what the heading/fragment gap rules compare
    // against.  That way a chain like [2-col] → blank → [1-col] → heading →
    // [2-col] still merges after the 1-col block joined the 2-col leader.
    let mut merge_leader: Vec<Option<usize>> = vec![None; blocks.len()];
    let mut group_cols: Vec<usize> = blocks.iter().map(|b| b.cols).collect();
    for bi in 1..blocks.len() {
        let prev = &blocks[bi - 1];
        let curr = &blocks[bi];
        let leader_idx = merge_leader[bi - 1].unwrap_or(bi - 1);

        let gap_text: Vec<&str> = lines[prev.end + 1..curr.start]
            .iter()
            .map(|line| line.trim())
            .filter(|t| !t.is_empty())
            .collect();
        let both_multi_col = group_cols[leader_idx] >= 2 && curr.cols >= 2;
        let gap_allows_merge = match gap_text.as_slice() {
            // Only blank lines between the tables.
            [] => true,
            _ if !both_multi_col => false,
            // One or two heading lines: table cells misclassified as
            // headings by the pipeline.
            headings
                if headings.len() <= 2
                    && headings
                        .iter()
                        .all(|t| t.starts_with('#') && t.len() < 100) =>
            {
                true
            }
            // A single short plain-text word is almost certainly a cell value
            // that the PDF pipeline displaced out of the table grid.
            [t] => {
                t.len() < 30
                    && !t.starts_with('#')
                    && !t.starts_with('-')
                    && !t.starts_with('*')
                    && !t.contains(':')
                    && !t.contains("TABLE")
            }
            _ => false,
        };
        if !gap_allows_merge {
            continue;
        }

        // A second table with at least two body rows and its own header that
        // differs from the previous header is a genuinely separate table.
        let prev_has_header = looks_like_header_row(lines[prev.start]);
        let curr_has_header = curr.end >= curr.sep + 2 && looks_like_header_row(lines[curr.start]);
        let curr_has_distinct_header = prev_has_header
            && curr_has_header
            && !header_schema_matches(lines[prev.start], lines[curr.start])
            && (curr.cols != prev.cols
                || header_overlap_ratio(lines[prev.start], lines[curr.start]) < 1.0);

        if prev.cols > 0 && curr.cols > 0 && !curr_has_distinct_header {
            merge_leader[bi] = Some(leader_idx);
            if curr.cols > group_cols[leader_idx] {
                group_cols[leader_idx] = curr.cols;
            }
        }
    }

    // Decide the fate of every line.  Block lines are padded to their group's
    // width; for merged blocks the gap is folded into the table and the
    // separator is dropped so the header row becomes a body row.
    let mut roles = vec![LineRole::Verbatim; lines.len()];
    for (bi, block) in blocks.iter().enumerate() {
        let target = group_cols[merge_leader[bi].unwrap_or(bi)];
        roles[block.start..=block.end].fill(LineRole::TableLine(target));
    }
    for (bi, block) in blocks.iter().enumerate() {
        if let Some(leader) = merge_leader[bi] {
            // `merge_leader[0]` is never set, so `bi >= 1` here.
            let gap = blocks[bi - 1].end + 1..block.start;
            let target = group_cols[leader];
            for (role, line) in roles[gap.clone()].iter_mut().zip(&lines[gap]) {
                *role = if line.trim().is_empty() {
                    LineRole::Skip
                } else {
                    LineRole::GapRow(target)
                };
            }
            roles[block.sep] = LineRole::Skip;
        }
    }

    // Every block header passed `is_pipe_row`, so every target width is >= 1.
    let mut result = String::with_capacity(markdown.len() + 1);
    for (line, role) in lines.iter().zip(&roles) {
        match *role {
            LineRole::Skip => {}
            LineRole::GapRow(target) => {
                let text = line.trim().trim_start_matches('#').trim();
                if text.is_empty() {
                    // Heading markers without any text carry no cell content;
                    // emitting the raw line would break the merged table.
                    continue;
                }
                result.push_str("| ");
                result.push_str(text);
                result.push(' ');
                for _ in 1..target {
                    result.push_str("|  ");
                }
                result.push_str("|\n");
            }
            LineRole::TableLine(target) => {
                if is_separator(line) {
                    result.push('|');
                    result.push_str(&" --- |".repeat(target));
                } else if is_pipe_row(line) {
                    result.push_str(&pad_pipe_row(line, target));
                } else {
                    result.push_str(line);
                }
                result.push('\n');
            }
            LineRole::Verbatim => {
                result.push_str(line);
                result.push('\n');
            }
        }
    }

    result
}
11084
11085#[cfg(test)]
11086mod tests {
11087    use super::*;
11088    use crate::models::bbox::BoundingBox;
11089    use crate::models::chunks::TextChunk;
11090    use crate::models::content::ContentElement;
11091    use crate::models::enums::{PdfLayer, TextFormat, TextType};
11092    use crate::models::list::{ListBody, ListItem, ListLabel, PDFList};
11093    use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
11094    use crate::models::table::{
11095        TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
11096    };
11097    use crate::models::text::{TextBlock, TextColumn, TextLine};
11098
11099    #[test]
11100    fn test_empty_doc() {
11101        let doc = PdfDocument::new("test.pdf".to_string());
11102        let md = to_markdown(&doc).unwrap();
11103        assert!(md.contains("No content extracted"));
11104    }
11105
11106    #[test]
11107    fn test_with_title() {
11108        let mut doc = PdfDocument::new("test.pdf".to_string());
11109        doc.title = Some("My Title".to_string());
11110        let md = to_markdown(&doc).unwrap();
11111        assert!(md.starts_with("# My Title\n"));
11112    }
11113
11114    #[test]
11115    fn test_empty_title_not_rendered() {
11116        let mut doc = PdfDocument::new("test.pdf".to_string());
11117        doc.title = Some("  ".to_string());
11118        let md = to_markdown(&doc).unwrap();
11119        assert!(
11120            !md.contains("# "),
11121            "Empty/whitespace title should not produce a heading"
11122        );
11123    }
11124
11125    #[test]
11126    fn test_repair_fragmented_words() {
11127        assert_eq!(
11128            repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts"),
11129            "Jurisdiction Foreign Requirements"
11130        );
11131    }
11132
11133    #[test]
11134    fn test_normalize_common_ocr_text_repairs_units() {
11135        assert_eq!(
11136            normalize_common_ocr_text("10 ߤL at 37 C and -20 oC"),
11137            "10 μL at 37°C and -20°C"
11138        );
11139    }
11140
11141    #[cfg(not(target_arch = "wasm32"))]
11142    #[test]
11143    fn test_build_layout_anchor_rows_reconstructs_four_column_matrix() {
11144        let lines = vec![
11145            "Key Functions by Main Service Flow".to_string(),
11146            "".to_string(),
11147            " Service Stage                   Function Name                Explanation                                                                                Expected Benefit".to_string(),
11148            "".to_string(),
11149            " 1. Project creation             Project creation and         Select document type to automatically run project creation, Pipeline configuration with    The intuitive UI environment allows the the person in charge to quickly proceed with".to_string(),
11150            "".to_string(),
11151            "                                 management                   recommended Modelset and Endpoint deployment                                               the entire process from project creation to deployment, improving work efficiency".to_string(),
11152            "".to_string(),
11153            "                                                                                                                                                         Conveniently manage raw data to be used for OCR Pack and actual date from live".to_string(),
11154            " 2. Data labeling and            Data storage management      Provides convenient functions for uploading raw data, viewer, and data management".to_string(),
11155            "                                                              (search using image metadata, sorting, filtering, hashtags settings on image data)         service".to_string(),
11156            " fine-tuning".to_string(),
11157            "                                                              Image data bookmark for Qualitative Evaluation".to_string(),
11158            "".to_string(),
11159            "                                 Create and manage Labeling   Creating a Labeling Space to manage raw data annotation, managing labeling resources       Labeling work can be outsourced within the pack. Labeled data is continuously".to_string(),
11160            "                                                              (Ontology, Characters to be Recognized), data set dump, data set version management        supplied from which data sets can be created with ease. The Auto Labeling function".to_string(),
11161            "                                 Space".to_string(),
11162            "                                                                                                     3                                                   increases both efficiency and convenience.".to_string(),
11163            "                                                              Various basic models for each selected 5".to_string(),
11164            "                                                                                                    document, information comparison between".to_string(),
11165            "                                 Model training                                                                                                          Providing a foundation for customers to implement, manage, and upgrade their own".to_string(),
11166            "                                                              models, basic model training, training pause function, re-training, cancel function, and   OCR model specialized to the customers’ needs".to_string(),
11167            "                                                              configuration support for Characters to be Recognized and Ontology that is frequently".to_string(),
11168            "                                                              modified while developing specialized models".to_string(),
11169        ];
11170
11171        let header = find_layout_header_candidate(&lines).unwrap();
11172        let rows =
11173            build_layout_anchor_rows(&lines, &extract_layout_entries(&lines, &header)).unwrap();
11174
11175        assert_eq!(
11176            header.headers,
11177            vec![
11178                "Service Stage".to_string(),
11179                "Function Name".to_string(),
11180                "Explanation".to_string(),
11181                "Expected Benefit".to_string()
11182            ]
11183        );
11184        assert_eq!(rows.len(), 4);
11185        assert_eq!(rows[0][0], "1. Project creation");
11186        assert_eq!(rows[0][1], "Project creation and management");
11187        assert!(rows[1][0].contains("fine-tuning"));
11188        assert_eq!(rows[2][1], "Create and manage Labeling Space");
11189        assert_eq!(rows[3][1], "Model training");
11190        assert!(rows[3][2].contains("Various basic models for each selected document"));
11191    }
11192
11193    #[cfg(not(target_arch = "wasm32"))]
11194    #[test]
11195    fn test_build_layout_panel_stub_rows_reconstructs_left_stub_table() {
11196        let lines = vec![
11197            "AI Pack".to_string(),
11198            "Upstage offers 3 AI packs that process unstructured information and data".to_string(),
11199            "".to_string(),
11200            "                                     OCR                                                Recommendation                                    Product semantic search".to_string(),
11201            "".to_string(),
11202            "              A solution that recognizes characters in an                A solution that recommends the best products and   A solution that enables semantic search, analyzes and".to_string(),
11203            "              image and extracts necessary information                   contents                                           organizes key information in unstructured text data".to_string(),
11204            "   Pack".to_string(),
11205            "                                                                                                                            into a standardized form (DB)".to_string(),
11206            "".to_string(),
11207            "              Applicable to all fields that require text extraction      Applicable to all fields that use any form of      Applicable to all fields that deal with various types of".to_string(),
11208            "              from standardized documents, such as receipts,             recommendation including alternative products,     unstructured data containing text information that".to_string(),
11209            "Application   bills, credit cards, ID cards, certificates, and medical   products and contents that are likely to be        require semantic search and conversion into a DB".to_string(),
11210            "              receipts                                                   purchased next".to_string(),
11211            "".to_string(),
11212            "              Achieved 1st place in the OCR World Competition            Team with specialists and technologies that        Creation of the first natural language evaluation".to_string(),
11213            "              The team includes specialists who have                     received Kaggle’s Gold Medal recommendation        system in Korean (KLUE)".to_string(),
11214            "              presented 14 papers in the world’s most                    (Education platform)                               World’s No.1 in Kaggle text embedding competition in".to_string(),
11215            " Highlight".to_string(),
11216            "              renowned AI conferences                                    Proven superior performance of more than 170%      E-commerce subject (Shopee)".to_string(),
11217            "                                                                         compared to other global top-tier recommendation".to_string(),
11218            "                                                                         models".to_string(),
11219        ];
11220
11221        let header = find_layout_panel_header_candidate(&lines).unwrap();
11222        let rows = build_layout_panel_stub_rows(&lines, &header).unwrap();
11223
11224        assert_eq!(
11225            header.headers,
11226            vec![
11227                "OCR".to_string(),
11228                "Recommendation".to_string(),
11229                "Product semantic search".to_string()
11230            ]
11231        );
11232        assert_eq!(rows.len(), 3);
11233        assert_eq!(rows[0][0], "Pack");
11234        assert!(rows[0][1].contains("image and extracts necessary information"));
11235        assert_eq!(rows[1][0], "Application");
11236        assert!(rows[1][3].contains("require semantic search and conversion into a DB"));
11237        assert_eq!(rows[2][0], "Highlight");
11238        assert!(rows[2][2].contains("top-tier recommendation models"));
11239    }
11240
11241    #[cfg(not(target_arch = "wasm32"))]
11242    #[test]
11243    fn test_extract_layout_toc_entries_merges_wrapped_entry() {
11244        let lines = vec![
11245            "Table of Contents".to_string(),
11246            "".to_string(),
11247            "Executive Summary                                          4".to_string(),
11248            "Legal Framework                                            6".to_string(),
11249            "Election Administration                                   11".to_string(),
11250            "Civil Society Engagement                                  15".to_string(),
11251            "Political Parties, Candidates Registration and Election   18".to_string(),
11252            "Campaign".to_string(),
11253            "Media Freedom and Access to Information                   25".to_string(),
11254            "Voter Education and Awareness                             29".to_string(),
11255            "Participation of Marginalized Sectors                     31".to_string(),
11256            "Recommendations                                           39".to_string(),
11257        ];
11258
11259        let (title, entries) = extract_layout_toc_entries(&lines).unwrap();
11260        assert_eq!(title, "Table of Contents");
11261        assert_eq!(entries.len(), 9);
11262        assert_eq!(entries[0].title, "Executive Summary");
11263        assert_eq!(entries[0].page, "4");
11264        assert_eq!(
11265            entries[4].title,
11266            "Political Parties, Candidates Registration and Election Campaign"
11267        );
11268        assert_eq!(entries[4].page, "18");
11269    }
11270
11271    #[cfg(not(target_arch = "wasm32"))]
11272    fn make_bbox_layout_line(words: &[(&str, f64, f64)], bottom: f64, top: f64) -> BBoxLayoutLine {
11273        make_bbox_layout_line_in_block(0, words, bottom, top)
11274    }
11275
11276    #[cfg(not(target_arch = "wasm32"))]
11277    fn make_bbox_layout_line_in_block(
11278        block_id: usize,
11279        words: &[(&str, f64, f64)],
11280        bottom: f64,
11281        top: f64,
11282    ) -> BBoxLayoutLine {
11283        BBoxLayoutLine {
11284            block_id,
11285            bbox: BoundingBox::new(
11286                Some(1),
11287                words.first().map(|(_, left, _)| *left).unwrap_or(72.0),
11288                bottom,
11289                words.last().map(|(_, _, right)| *right).unwrap_or(320.0),
11290                top,
11291            ),
11292            words: words
11293                .iter()
11294                .map(|(text, left, right)| BBoxLayoutWord {
11295                    bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
11296                    text: (*text).to_string(),
11297                })
11298                .collect(),
11299        }
11300    }
11301
11302    #[cfg(not(target_arch = "wasm32"))]
11303    #[test]
11304    fn test_detect_layout_open_plate_recovers_two_column_species_rows() {
11305        let lines = vec![
11306            make_bbox_layout_line(
11307                &[
11308                    ("Fish", 60.0, 76.0),
11309                    ("species", 78.0, 107.0),
11310                    ("on", 109.0, 119.0),
11311                    ("IUCN", 121.0, 142.0),
11312                    ("Red", 144.0, 159.0),
11313                    ("List", 161.0, 176.0),
11314                ],
11315                649.0,
11316                660.0,
11317            ),
11318            make_bbox_layout_line(
11319                &[("Potosi", 60.0, 84.0), ("Pupfish", 86.0, 114.0)],
11320                632.0,
11321                643.0,
11322            ),
11323            make_bbox_layout_line(
11324                &[("Cyprinodon", 132.0, 176.0), ("alvarezi", 178.0, 207.0)],
11325                632.0,
11326                643.0,
11327            ),
11328            make_bbox_layout_line(
11329                &[
11330                    ("La", 60.0, 69.0),
11331                    ("Palma", 71.0, 94.0),
11332                    ("Pupfish", 96.0, 124.0),
11333                    ("Cyprinodon", 132.0, 176.0),
11334                    ("longidorsalis", 178.0, 224.0),
11335                ],
11336                616.0,
11337                627.0,
11338            ),
11339            make_bbox_layout_line(
11340                &[("Butterfly", 60.0, 94.0), ("Splitfin", 96.0, 123.0)],
11341                600.0,
11342                611.0,
11343            ),
11344            make_bbox_layout_line(
11345                &[("Ameca", 132.0, 156.0), ("splendens", 158.0, 194.0)],
11346                600.0,
11347                611.0,
11348            ),
11349            make_bbox_layout_line(
11350                &[("Golden", 60.0, 88.0), ("Skiffia", 90.0, 113.0)],
11351                584.0,
11352                595.0,
11353            ),
11354            make_bbox_layout_line(
11355                &[("Skiffia", 132.0, 155.0), ("francesae", 158.0, 193.0)],
11356                584.0,
11357                595.0,
11358            ),
11359            make_bbox_layout_line(
11360                &[
11361                    ("Table", 56.0, 74.0),
11362                    ("6.1:", 76.0, 87.0),
11363                    ("Four", 89.0, 105.0),
11364                    ("fish", 107.0, 119.0),
11365                    ("species", 121.0, 145.0),
11366                    ("on", 147.0, 155.0),
11367                    ("IUCN", 157.0, 176.0),
11368                    ("Red", 178.0, 190.0),
11369                    ("List", 192.0, 205.0),
11370                    ("held", 279.0, 293.0),
11371                    ("in", 295.0, 302.0),
11372                    ("public", 304.0, 325.0),
11373                    ("aquariums.", 327.0, 365.0),
11374                ],
11375                556.0,
11376                566.0,
11377            ),
11378        ];
11379
11380        let plate = detect_layout_open_plate(576.0, &lines).unwrap();
11381        assert_eq!(plate.heading, "Fish species on IUCN Red List");
11382        assert_eq!(
11383            plate.header_row,
11384            vec![
11385                "Fish species on IUCN Red List".to_string(),
11386                "Scientific name".to_string()
11387            ]
11388        );
11389        assert_eq!(plate.rows.len(), 4);
11390        assert_eq!(
11391            plate.rows[1],
11392            vec![
11393                "La Palma Pupfish".to_string(),
11394                "Cyprinodon longidorsalis".to_string()
11395            ]
11396        );
11397        assert!(plate
11398            .caption
11399            .starts_with("Table 6.1: Four fish species on IUCN Red List"));
11400    }
11401
11402    #[cfg(not(target_arch = "wasm32"))]
11403    #[test]
11404    fn test_extract_layout_narrative_bridge_recovers_left_prose_and_defers_captions() {
11405        let plate = OpenPlateCandidate {
11406            heading: "Fish species on IUCN Red List".to_string(),
11407            header_row: vec![
11408                "Fish species on IUCN Red List".to_string(),
11409                "Scientific name".to_string(),
11410            ],
11411            rows: vec![],
11412            caption: "Table 6.1".to_string(),
11413            cutoff_top_y: 560.0,
11414        };
11415        let lines = vec![
11416            make_bbox_layout_line(
11417                &[
11418                    ("Public", 56.0, 83.0),
11419                    ("aquariums,", 88.0, 135.0),
11420                    ("because", 140.0, 174.0),
11421                ],
11422                509.0,
11423                521.0,
11424            ),
11425            make_bbox_layout_line(
11426                &[
11427                    ("of", 180.0, 188.0),
11428                    ("their", 194.0, 214.0),
11429                    ("in-", 220.0, 233.0),
11430                ],
11431                509.0,
11432                521.0,
11433            ),
11434            make_bbox_layout_line(
11435                &[
11436                    ("house", 56.0, 82.0),
11437                    ("expertise,", 84.0, 125.0),
11438                    ("can", 128.0, 143.0),
11439                ],
11440                495.0,
11441                507.0,
11442            ),
11443            make_bbox_layout_line(
11444                &[("act", 146.0, 159.0), ("quickly", 161.0, 191.0)],
11445                495.0,
11446                507.0,
11447            ),
11448            make_bbox_layout_line_in_block(
11449                1,
11450                &[
11451                    ("Figure", 242.0, 265.0),
11452                    ("6.3:", 267.0, 280.0),
11453                    ("Photo", 282.0, 303.0),
11454                ],
11455                355.0,
11456                366.0,
11457            ),
11458            make_bbox_layout_line_in_block(
11459                1,
11460                &[
11461                    ("of", 305.0, 312.0),
11462                    ("the", 314.0, 325.0),
11463                    ("species.", 327.0, 360.0),
11464                ],
11465                355.0,
11466                366.0,
11467            ),
11468            make_bbox_layout_line(
11469                &[
11470                    ("The", 56.0, 73.0),
11471                    ("breeding", 77.0, 114.0),
11472                    ("colonies", 118.0, 153.0),
11473                ],
11474                330.0,
11475                342.0,
11476            ),
11477            make_bbox_layout_line(
11478                &[
11479                    ("of", 157.0, 165.0),
11480                    ("the", 169.0, 183.0),
11481                    ("Butterfly", 187.0, 224.0),
11482                    ("Splitfin", 228.0, 258.0),
11483                    ("at", 314.0, 323.0),
11484                    ("the", 327.0, 341.0),
11485                    ("London", 345.0, 377.0),
11486                    ("Zoo", 381.0, 397.0),
11487                    ("and", 401.0, 416.0),
11488                    ("elsewhere", 420.0, 463.0),
11489                    ("serve", 467.0, 489.0),
11490                    ("as", 493.0, 502.0),
11491                    ("ark", 506.0, 519.0),
11492                ],
11493                330.0,
11494                342.0,
11495            ),
11496            make_bbox_layout_line(
11497                &[
11498                    ("Figure", 56.0, 79.0),
11499                    ("6.4:", 81.0, 94.0),
11500                    ("Lake", 96.0, 116.0),
11501                    ("Sturgeon", 118.0, 158.0),
11502                ],
11503                104.0,
11504                116.0,
11505            ),
11506        ];
11507
11508        let bridge = extract_layout_narrative_bridge(576.0, &lines, &plate).unwrap();
11509        assert!(bridge
11510            .bridge_paragraph
11511            .as_deref()
11512            .is_some_and(|text| text.contains("Public aquariums") && text.contains("expertise")));
11513        assert_eq!(bridge.deferred_captions.len(), 2);
11514        assert!(bridge.deferred_captions[0].contains("Figure 6.3:"));
11515        assert!(bridge.deferred_captions[0].contains("species."));
11516    }
11517
11518    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11519    #[test]
11520    fn test_detect_layout_ocr_benchmark_dashboard_on_real_pdf() {
11521        let path =
11522            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000199.pdf");
11523        let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
11524        let dashboard = detect_layout_ocr_benchmark_dashboard(page_width, &lines).unwrap();
11525
11526        assert_eq!(
11527            dashboard.title,
11528            "Base Model Performance Evaluation of Upstage OCR Pack"
11529        );
11530        assert_eq!(dashboard.left_columns.len(), 2);
11531        assert_eq!(
11532            dashboard.left_columns[0],
11533            "Scene (Photographed document image)"
11534        );
11535        assert_eq!(
11536            dashboard.left_rows[0],
11537            vec![
11538                "Company A²".to_string(),
11539                "70.23".to_string(),
11540                "80.41".to_string()
11541            ]
11542        );
11543        assert_eq!(
11544            dashboard.right_rows[0],
11545            vec![
11546                "OCR-Recall³".to_string(),
11547                "73.2".to_string(),
11548                "94.2".to_string(),
11549                "94.1".to_string()
11550            ]
11551        );
11552        assert_eq!(dashboard.right_rows[3][0], "Parsing-F¹");
11553        assert_eq!(dashboard.right_rows[3][1], "68.0");
11554        assert_eq!(dashboard.right_rows[3][2], "82.65");
11555        assert_eq!(dashboard.right_rows[3][3], "82.65");
11556        assert!(!dashboard.definition_notes.is_empty());
11557        assert!(!dashboard.source_notes.is_empty());
11558    }
11559
11560    #[cfg(not(target_arch = "wasm32"))]
11561    #[test]
11562    fn test_split_layout_line_spans_handles_unicode_boundaries() {
11563        let line = "Title  “Podcast #EP32: SDGs dan Anak Muda”  2024";
11564        let spans = split_layout_line_spans(line);
11565        assert_eq!(spans.len(), 3);
11566        assert_eq!(spans[0].1, "Title");
11567        assert!(spans[1].1.contains("Podcast #EP32: SDGs dan Anak Muda"));
11568        assert!(spans[1].1.ends_with('”'));
11569        assert!(spans[2].1.ends_with("24"));
11570    }
11571
11572    #[cfg(not(target_arch = "wasm32"))]
11573    #[test]
11574    fn test_render_layout_single_caption_chart_document_on_real_pdf() {
11575        let path =
11576            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000037.pdf");
11577        let doc = PdfDocument {
11578            title: None,
11579            source_path: Some(path.to_string_lossy().to_string()),
11580            number_of_pages: 1,
11581            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11582                .unwrap()
11583                .kids,
11584            ..PdfDocument::new("01030000000037.pdf".to_string())
11585        };
11586        let rendered = render_layout_single_caption_chart_document(&doc).unwrap();
11587        assert!(rendered.contains("# 3. Impact on Business Operations"));
11588        assert!(rendered.contains("## 3.1. Status of Business Operations"));
11589        assert!(rendered.contains("As shown in Figure 3.1.1, the number of MSMEs"));
11590        assert!(
11591            rendered.contains("Figure 3.1.1: Status of operations during each survey phase (%)")
11592        );
11593        assert!(
11594            rendered.contains("lockdown period. In the handicraft/textile sector, 30% of MSMEs")
11595        );
11596        assert!(!rendered.contains("| Lockdown Period |"));
11597    }
11598
11599    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11600    #[test]
11601    fn test_to_markdown_captioned_media_document_on_real_pdf_72() {
11602        let path =
11603            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000072.pdf");
11604        let doc = PdfDocument {
11605            title: None,
11606            source_path: Some(path.to_string_lossy().to_string()),
11607            number_of_pages: 1,
11608            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11609                .unwrap()
11610                .kids,
11611            ..PdfDocument::new("01030000000072.pdf".to_string())
11612        };
11613        let md = to_markdown(&doc).unwrap();
11614        assert!(md.contains("## Diagram 5"), "{md}");
11615        assert!(
11616            md.contains("**Distribution of Komnas HAM’s YouTube Content (2019-2020)**"),
11617            "{md}"
11618        );
11619        assert!(
11620            md.contains(
11621                "As of 1 December 2021, the Komnas HAM’s YouTube channel has 2,290 subscribers"
11622            ),
11623            "{md}"
11624        );
11625        assert!(md.contains("**Figure 4**"), "{md}");
11626        assert!(
11627            md.contains("*Komnas HAM’s YouTube channel as of 1 December 2021*"),
11628            "{md}"
11629        );
11630    }
11631
11632    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11633    #[test]
11634    fn test_to_markdown_captioned_media_document_on_real_pdf_73() {
11635        let path =
11636            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000073.pdf");
11637        let doc = PdfDocument {
11638            title: None,
11639            source_path: Some(path.to_string_lossy().to_string()),
11640            number_of_pages: 1,
11641            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11642                .unwrap()
11643                .kids,
11644            ..PdfDocument::new("01030000000073.pdf".to_string())
11645        };
11646        let md = to_markdown(&doc).unwrap();
11647        assert!(
11648            md.starts_with("# In this content, DPN Argentina provides a brief explanation"),
11649            "{md}"
11650        );
11651        assert!(
11652            md.contains("Examples of such greetings are as follows:"),
11653            "{md}"
11654        );
11655        assert!(md.contains("*Image*"), "{md}");
11656        assert!(md.contains("**Figure 6**"), "{md}");
11657        assert!(md.contains("**DPN Argentina**"), "{md}");
11658        assert!(
11659            md.contains("**Content: World Health Day Celebration (7 April 2021).**^98"),
11660            "{md}"
11661        );
11662        assert!(md.contains("**Footnote:**"), "{md}");
11663        assert!(
11664            md.contains("https://twitter.com/DPNArgentina/status/1379765916259483648."),
11665            "{md}"
11666        );
11667    }
11668
11669    #[cfg(not(target_arch = "wasm32"))]
11670    #[test]
11671    fn test_render_layout_captioned_media_document_does_not_fire_on_real_pdf_14() {
11672        let path =
11673            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
11674        let doc = PdfDocument {
11675            title: None,
11676            source_path: Some(path.to_string_lossy().to_string()),
11677            number_of_pages: 1,
11678            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11679                .unwrap()
11680                .kids,
11681            ..PdfDocument::new("01030000000014.pdf".to_string())
11682        };
11683        assert!(render_layout_captioned_media_document(&doc).is_none());
11684    }
11685
11686    #[cfg(not(target_arch = "wasm32"))]
11687    #[test]
11688    fn test_to_markdown_real_pdf_14_preserves_body_paragraphs() {
11689        let path =
11690            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
11691        let doc = PdfDocument {
11692            title: None,
11693            source_path: Some(path.to_string_lossy().to_string()),
11694            number_of_pages: 1,
11695            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11696                .unwrap()
11697                .kids,
11698            ..PdfDocument::new("01030000000014.pdf".to_string())
11699        };
11700        let md = to_markdown(&doc).unwrap();
11701        assert!(
11702            md.contains("These images also show that different areas are used by men and by women"),
11703            "{md}"
11704        );
11705    }
11706
11707    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11708    #[test]
11709    fn test_render_layout_recommendation_infographic_on_real_pdf() {
11710        let path =
11711            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000183.pdf");
11712        let doc = PdfDocument {
11713            title: None,
11714            source_path: Some(path.to_string_lossy().to_string()),
11715            number_of_pages: 1,
11716            kids: Vec::new(),
11717            ..PdfDocument::new("01030000000183.pdf".to_string())
11718        };
11719        let rendered = render_layout_recommendation_infographic_document(&doc).unwrap();
11720        assert!(rendered.contains("# Recommendation Pack: Track Record"));
11721        assert!(rendered.contains("## Comparison with Beauty Commerce Recommendation Models"));
11722        assert!(rendered.contains("| Graph-RecSys | 0.4048 |"));
11723        assert!(rendered.contains("| Current Service Recommendation Algorithm | 0.159 |"));
11724        assert!(rendered.contains("## Education Content Platform PoC Case"));
11725        assert!(rendered.contains("| DKT Model | 0.882 |"));
11726        assert!(rendered.contains("Compared to regular model"));
11727    }
11728
11729    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11730    #[test]
11731    fn test_render_layout_stacked_bar_report_on_real_pdf() {
11732        let path =
11733            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000038.pdf");
11734        let doc = PdfDocument {
11735            title: None,
11736            source_path: Some(path.to_string_lossy().to_string()),
11737            number_of_pages: 1,
11738            kids: Vec::new(),
11739            ..PdfDocument::new("01030000000038.pdf".to_string())
11740        };
11741        let rendered = render_layout_stacked_bar_report_document(&doc);
11742        if rendered.is_none() {
11743            let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
11744            let blocks = collect_bbox_layout_blocks(&lines);
11745            let figures = collect_layout_figure_captions(&blocks);
11746            let narrative = detect_layout_stacked_bar_narrative(&blocks);
11747            eprintln!("page_width={page_width} figures={}", figures.len());
11748            if let Some(first) = figures.first() {
11749                eprintln!("figure1={}", bbox_layout_block_text(first));
11750            }
11751            if let Some(second) = figures.get(1) {
11752                eprintln!("figure2={}", bbox_layout_block_text(second));
11753            }
11754            eprintln!("narrative={}", narrative.is_some());
11755            if let Some(narrative) = &narrative {
11756                eprintln!("heading={}", narrative.heading);
11757                eprintln!("paragraphs={}", narrative.paragraphs.len());
11758                eprintln!("footnote={:?}", narrative.footnote);
11759            }
11760            for block in &blocks {
11761                let text = bbox_layout_block_text(block);
11762                if text.contains("July")
11763                    || text.contains("October")
11764                    || text.contains("January")
11765                    || text.contains("Will ")
11766                    || text.contains("Don’t")
11767                    || text.starts_with("6.2.")
11768                    || text.starts_with("5.")
11769                {
11770                    eprintln!(
11771                        "block top={:.1} bottom={:.1} left={:.1} right={:.1} text={}",
11772                        block.bbox.top_y,
11773                        block.bbox.bottom_y,
11774                        block.bbox.left_x,
11775                        block.bbox.right_x,
11776                        text
11777                    );
11778                }
11779            }
11780            if figures.len() >= 2 {
11781                let first = detect_layout_three_month_stacked_figure(
11782                    &blocks,
11783                    &lines,
11784                    page_width,
11785                    figures[0].clone(),
11786                    figures[1].bbox.top_y,
11787                );
11788                eprintln!("figure_one_ok={}", first.is_some());
11789                if let Some(narrative) = &narrative {
11790                    let second = detect_layout_sector_bar_figure(
11791                        &blocks,
11792                        &lines,
11793                        page_width,
11794                        figures[1].clone(),
11795                        narrative.top_y,
11796                    );
11797                    eprintln!("figure_two_ok={}", second.is_some());
11798                }
11799            }
11800        }
11801        let rendered = rendered.unwrap();
11802        assert!(rendered.contains("# Figure 6.1.1:"));
11803        assert!(rendered.contains("| Will not terminate employment | 51 | 81 | 73 |"));
11804        assert!(rendered.contains("# 6.2. Expectations for Re-Hiring Employees"));
11805    }
11806
11807    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11808    #[test]
11809    fn test_render_layout_multi_figure_chart_document_on_real_pdf() {
11810        let path =
11811            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000076.pdf");
11812        let doc = PdfDocument {
11813            title: None,
11814            source_path: Some(path.to_string_lossy().to_string()),
11815            number_of_pages: 1,
11816            kids: Vec::new(),
11817            ..PdfDocument::new("01030000000076.pdf".to_string())
11818        };
11819        let rendered = render_layout_multi_figure_chart_document(&doc).unwrap();
11820        assert!(rendered.contains("# Figures from the Document"));
11821        assert!(
11822            rendered.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
11823        );
11824        assert!(rendered.contains("| 2016 | 3,230 |"));
11825        assert!(rendered.contains("| 2021 | 2,693 |"));
11826        assert!(
11827            rendered.contains("## Figure 1.8. Singapore foreign workforce stock (in thousands)")
11828        );
11829        assert!(rendered.contains("| 2016 (Dec) | 1,393 |"));
11830        assert!(rendered.contains("| 2021 (Dec) | 1,200 |"));
11831        assert!(rendered.contains(
11832            "Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate."
11833        ));
11834    }
11835
11836    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11837    #[test]
11838    fn test_render_layout_open_plate_document_on_real_pdf() {
11839        let path =
11840            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
11841        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11842        let rendered = render_layout_open_plate_document(&doc).unwrap();
11843        assert!(rendered.contains("# Fish species on IUCN Red List"));
11844        assert!(rendered.contains("| Potosi Pupfish | Cyprinodon alvarezi |"));
11845        assert!(rendered.contains("| Golden Skiffia | Skiffia francesae |"));
11846        assert!(rendered.contains("*Table 6.1: Four fish species on IUCN Red List"));
11847        assert!(rendered.contains("---"));
11848        assert!(rendered.contains("Public aquariums, because of their inhouse expertise"));
11849    }
11850
11851    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11852    #[test]
11853    fn test_to_markdown_open_plate_document_on_real_pdf() {
11854        let path =
11855            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
11856        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11857        let md = to_markdown(&doc).unwrap();
11858
11859        assert!(md.contains("# Fish species on IUCN Red List"), "{md}");
11860        assert!(
11861            md.contains("| Potosi Pupfish | Cyprinodon alvarezi |"),
11862            "{md}"
11863        );
11864        assert!(
11865            md.contains("| Golden Skiffia | Skiffia francesae |"),
11866            "{md}"
11867        );
11868        assert!(
11869            md.contains("*Table 6.1: Four fish species on IUCN Red List"),
11870            "{md}"
11871        );
11872        assert!(
11873            md.contains("The breeding colonies of the Butterfly Splitfin"),
11874            "{md}"
11875        );
11876    }
11877
11878    #[cfg(not(target_arch = "wasm32"))]
11879    #[test]
11880    fn test_to_markdown_does_not_misclassify_open_plate_pdf_36() {
11881        let path =
11882            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000036.pdf");
11883        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11884        let md = to_markdown(&doc).unwrap();
11885
11886        assert!(md.contains("# 2. General Profile of MSMEs"), "{md}");
11887        assert!(
11888            md.contains("In July 2020, the survey established a general profile"),
11889            "{md}"
11890        );
11891        assert!(
11892            md.contains(
11893                "The tourism sub-sectors interviewed included lodging, restaurants and bars"
11894            ),
11895            "{md}"
11896        );
11897        assert!(
11898            !md.starts_with("# Business characteristics. Business size was"),
11899            "{md}"
11900        );
11901    }
11902
11903    #[cfg(not(target_arch = "wasm32"))]
11904    #[test]
11905    fn test_to_markdown_does_not_misclassify_open_plate_pdf_40() {
11906        let path =
11907            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000040.pdf");
11908        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11909        let md = to_markdown(&doc).unwrap();
11910
11911        assert!(
11912            md.contains(
11913                "Thailand, Philippines and Indonesia in particular, identifying known experts"
11914            ),
11915            "{md}"
11916        );
11917        assert!(
11918            md.contains("Figure 1: Age by gender of respondents"),
11919            "{md}"
11920        );
11921        assert!(md.contains("Gender Analysis of Violent Extremism"), "{md}");
11922        assert!(
11923            !md.starts_with("# Thailand, Philippines and Indonesia in"),
11924            "{md}"
11925        );
11926    }
11927
11928    #[cfg(not(target_arch = "wasm32"))]
11929    #[test]
11930    fn test_to_markdown_does_not_misclassify_open_plate_pdf_64() {
11931        let path =
11932            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000064.pdf");
11933        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11934        let md = to_markdown(&doc).unwrap();
11935
11936        assert!(md.contains("estuarine influenced areas."), "{md}");
11937        assert!(md.contains("| MANILA | 2454 | 6,125 |"), "{md}");
11938        assert!(
11939            md.contains("The port of Manila has been documented"),
11940            "{md}"
11941        );
11942        assert!(!md.starts_with("# CAGAYAN DE ORO"), "{md}");
11943    }
11944
11945    #[cfg(not(target_arch = "wasm32"))]
11946    #[test]
11947    fn test_detect_footnote_citation_regions_on_real_pdf() {
11948        let path =
11949            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
11950        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11951        let regions = detect_footnote_citation_regions(&doc);
11952        assert!(!regions.is_empty(), "{regions:?}");
11953        assert!(
11954            regions.iter().any(|region| {
11955                region.rendered.contains("<table>")
11956                    && region.rendered.contains("<td>25</td>")
11957                    && region.rendered.contains("<td>29</td>")
11958            }),
11959            "{regions:#?}"
11960        );
11961        assert!(
11962            regions.iter().any(|region| {
11963                region.rendered.contains("<table>")
11964                    && region.rendered.contains("<td>30</td>")
11965                    && region.rendered.contains("<td>33</td>")
11966            }),
11967            "{regions:#?}"
11968        );
11969    }
11970
11971    #[cfg(not(target_arch = "wasm32"))]
11972    #[test]
11973    fn test_to_markdown_renders_footnote_citation_tables_on_real_pdf() {
11974        let path =
11975            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
11976        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11977        let md = to_markdown(&doc).unwrap();
11978
11979        assert!(md.contains("<table>"), "{md}");
11980        assert!(md.contains("<th>Footnote</th><th>Citation</th>"), "{md}");
11981        assert!(md.contains("<td>25</td><td>Wiliam Beckford"), "{md}");
11982        assert!(
11983            md.contains("<td>29</td><td>Pope, The Rape of the Lock, 69.</td>"),
11984            "{md}"
11985        );
11986        assert!(
11987            md.contains("<td>30</td><td>Beawes, Lex Mercatoria Rediviva, 791.</td>"),
11988            "{md}"
11989        );
11990        assert!(
11991            md.contains("<td>32</td><td>Beawes, Lex Mercatoria Rediviva, 792.</td>"),
11992            "{md}"
11993        );
11994        assert!(
11995            md.contains("<td>33</td><td>M.M., Pharmacopoia Reformata:"),
11996            "{md}"
11997        );
11998    }
11999
12000    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12001    #[test]
12002    fn test_to_markdown_projection_sheet_document_on_real_pdf() {
12003        let path =
12004            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000128.pdf");
12005        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12006        let md = to_markdown(&doc).unwrap();
12007
12008        assert!(md.contains("# Table and Figure from the Document"), "{md}");
12009        assert!(md.contains("| A | B | C | D | E |"), "{md}");
12010        assert!(
12011            md.contains("| 10 | 8 | 19.73214458 | 17.99 | 21.47 |"),
12012            "{md}"
12013        );
12014        assert!(
12015            md.contains("**Figure 13.3. Graph of Projection Estimates**"),
12016            "{md}"
12017        );
12018        assert!(md.contains("[Open Template in Microsoft Excel](#)"), "{md}");
12019        assert!(
12020            md.contains("*298 | Ch. 13. Homogeneous Investment Types*"),
12021            "{md}"
12022        );
12023    }
12024
12025    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12026    #[test]
12027    fn test_to_markdown_appendix_tables_document_on_real_pdf() {
12028        let path =
12029            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000082.pdf");
12030        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12031        let md = to_markdown(&doc).unwrap();
12032
12033        assert!(md.contains("# Appendices"), "{md}");
12034        assert!(
12035            md.contains("## TABLE 28: BREAKDOWN OF IMPRISONMENT CLAUSES IN STATE LAWS"),
12036            "{md}"
12037        );
12038        assert!(md.contains("| Imprisonment terms | Number of clauses | Percentage of all states | Percentage of total |"), "{md}");
12039        assert!(
12040            md.contains("| Less than 3 months | 4,448 | 21.3% | 17.0% |"),
12041            "{md}"
12042        );
12043        assert!(
12044            md.contains("## TABLE 29: STATES WITH MORE THAN 1,000 IMPRISONMENT CLAUSES"),
12045            "{md}"
12046        );
12047        assert!(
12048            md.contains(
12049                "| State | Number of clauses | GSDP (In Rs lakh crore) | GSDP (In $ billion) |"
12050            ),
12051            "{md}"
12052        );
12053        assert!(md.contains("| Gujarat | 1469 | 15.6 | 200.4 |"), "{md}");
12054        assert!(
12055            md.contains("*Sources: TeamLease Regtech, and Reserve Bank of India for GSDPs*"),
12056            "{md}"
12057        );
12058        assert!(md.contains("*Exchange rate: Rs 75 to USD*"), "{md}");
12059    }
12060
12061    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12062    #[test]
12063    fn test_to_markdown_titled_dual_table_document_on_real_pdf() {
12064        let path =
12065            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000084.pdf");
12066        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12067        let md = to_markdown(&doc).unwrap();
12068
12069        assert!(md.starts_with("# Jailed for Doing Business"), "{md}");
12070        assert!(
12071            md.contains("## TABLE 38: THREE CASE STUDIES ON NBFC COMPLIANCES*"),
12072            "{md}"
12073        );
12074        assert!(
12075            md.contains("| Percentage of imprisonment clauses | 20% | 30% | 37% |"),
12076            "{md}"
12077        );
12078        assert!(
12079            md.contains("## TABLE 39: BREAKDOWN OF IMPRISONMENT CLAUSES IN NBFC CASE STUDIES*"),
12080            "{md}"
12081        );
12082        assert!(
12083            md.contains("| 5 years to 10 years | 19 | 19 | 19 |"),
12084            "{md}"
12085        );
12086        assert!(
12087            md.contains("*These are real data from three NBFCs*"),
12088            "{md}"
12089        );
12090    }
12091
12092    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12093    #[test]
12094    fn test_to_markdown_registration_report_document_on_real_pdf() {
12095        let path =
12096            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000047.pdf");
12097        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12098        let md = to_markdown(&doc).unwrap();
12099
12100        assert!(
12101            md.starts_with("# ANFREL Pre-Election Assessment Mission Report"),
12102            "{md}"
12103        );
12104        assert!(
12105            md.contains(
12106                "| 14 | Cambodian Indigeneous Peoples Democracy Party | 19 | 194 | 19 | 202 | +8 |"
12107            ),
12108            "{md}"
12109        );
12110        assert!(
12111            md.contains("|  | Total |  | 84,208 |  | 86,092 | +1,884 |"),
12112            "{md}"
12113        );
12114        assert!(!md.contains("|  | Democracy Party |"), "{md}");
12115    }
12116
12117    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12118    #[test]
12119    fn test_to_markdown_dual_table_article_document_on_real_pdf() {
12120        let path =
12121            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000190.pdf");
12122        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12123        let md = to_markdown(&doc).unwrap();
12124
12125        assert!(
12126            md.starts_with("# Table 6: Performance comparison amongst the merge candidates"),
12127            "{md}"
12128        );
12129        assert!(
12130            md.contains("*Table 6*: Performance comparison amongst the merge candidates."),
12131            "{md}"
12132        );
12133        assert!(md.contains("# Table 7: Ablation studies on the different merge methods used for obtaining the final model"), "{md}");
12134        assert!(!md.contains("*Table 6*: Table 6:"), "{md}");
12135        assert!(!md.contains("| Merge v1"), "{md}");
12136    }
12137
12138    #[test]
12139    fn test_normalize_list_text_strips_redundant_bullets() {
12140        assert_eq!(
12141            normalize_list_text("• Collected via surveys"),
12142            "Collected via surveys"
12143        );
12144        assert!(is_pure_bullet_marker("•"));
12145    }
12146
12147    #[test]
12148    fn test_reference_continuation_detected() {
12149        assert!(should_merge_paragraph_text(
12150            "Scaling laws for transfer.",
12151            "arXiv preprint arXiv:2102.01293."
12152        ));
12153    }
12154
12155    #[test]
12156    fn test_enumerated_markers_are_detected() {
12157        assert!(starts_with_enumerated_marker("iii. Third item"));
12158        assert!(starts_with_enumerated_marker("1) First item"));
12159        assert!(starts_with_enumerated_marker("a. Lettered item"));
12160        assert!(!starts_with_enumerated_marker("Figure 1. Caption"));
12161        assert!(!starts_with_enumerated_marker("Natural dispersal"));
12162    }
12163
12164    fn make_heading(text: &str) -> ContentElement {
12165        let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
12166        let chunk = TextChunk {
12167            value: text.to_string(),
12168            bbox: bbox.clone(),
12169            font_name: "Lato-Bold".to_string(),
12170            font_size: 12.0,
12171            font_weight: 700.0,
12172            italic_angle: 0.0,
12173            font_color: "#000000".to_string(),
12174            contrast_ratio: 21.0,
12175            symbol_ends: vec![],
12176            text_format: TextFormat::Normal,
12177            text_type: TextType::Regular,
12178            pdf_layer: PdfLayer::Main,
12179            ocg_visible: true,
12180            index: None,
12181            page_number: Some(1),
12182            level: None,
12183            mcid: None,
12184        };
12185        let line = TextLine {
12186            bbox: bbox.clone(),
12187            index: None,
12188            level: None,
12189            font_size: 12.0,
12190            base_line: 702.0,
12191            slant_degree: 0.0,
12192            is_hidden_text: false,
12193            text_chunks: vec![chunk],
12194            is_line_start: true,
12195            is_line_end: true,
12196            is_list_line: false,
12197            connected_line_art_label: None,
12198        };
12199        let block = TextBlock {
12200            bbox: bbox.clone(),
12201            index: None,
12202            level: None,
12203            font_size: 12.0,
12204            base_line: 702.0,
12205            slant_degree: 0.0,
12206            is_hidden_text: false,
12207            text_lines: vec![line],
12208            has_start_line: true,
12209            has_end_line: true,
12210            text_alignment: None,
12211        };
12212        let column = TextColumn {
12213            bbox: bbox.clone(),
12214            index: None,
12215            level: None,
12216            font_size: 12.0,
12217            base_line: 702.0,
12218            slant_degree: 0.0,
12219            is_hidden_text: false,
12220            text_blocks: vec![block],
12221        };
12222        ContentElement::Heading(SemanticHeading {
12223            base: SemanticParagraph {
12224                base: SemanticTextNode {
12225                    bbox,
12226                    index: None,
12227                    level: None,
12228                    semantic_type: crate::models::enums::SemanticType::Heading,
12229                    correct_semantic_score: None,
12230                    columns: vec![column],
12231                    font_weight: Some(700.0),
12232                    font_size: Some(12.0),
12233                    text_color: None,
12234                    italic_angle: None,
12235                    font_name: Some("Lato-Bold".to_string()),
12236                    text_format: None,
12237                    max_font_size: Some(12.0),
12238                    background_color: None,
12239                    is_hidden_text: false,
12240                },
12241                enclosed_top: false,
12242                enclosed_bottom: false,
12243                indentation: 0,
12244            },
12245            heading_level: Some(1),
12246        })
12247    }
12248
12249    fn make_heading_at(left: f64, bottom: f64, right: f64, top: f64, text: &str) -> ContentElement {
12250        let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12251        let chunk = TextChunk {
12252            value: text.to_string(),
12253            bbox: bbox.clone(),
12254            font_name: "Lato-Bold".to_string(),
12255            font_size: top - bottom,
12256            font_weight: 700.0,
12257            italic_angle: 0.0,
12258            font_color: "#000000".to_string(),
12259            contrast_ratio: 21.0,
12260            symbol_ends: vec![],
12261            text_format: TextFormat::Normal,
12262            text_type: TextType::Regular,
12263            pdf_layer: PdfLayer::Main,
12264            ocg_visible: true,
12265            index: None,
12266            page_number: Some(1),
12267            level: None,
12268            mcid: None,
12269        };
12270        let line = TextLine {
12271            bbox: bbox.clone(),
12272            index: None,
12273            level: None,
12274            font_size: top - bottom,
12275            base_line: bottom + 2.0,
12276            slant_degree: 0.0,
12277            is_hidden_text: false,
12278            text_chunks: vec![chunk],
12279            is_line_start: true,
12280            is_line_end: true,
12281            is_list_line: false,
12282            connected_line_art_label: None,
12283        };
12284        let block = TextBlock {
12285            bbox: bbox.clone(),
12286            index: None,
12287            level: None,
12288            font_size: top - bottom,
12289            base_line: bottom + 2.0,
12290            slant_degree: 0.0,
12291            is_hidden_text: false,
12292            text_lines: vec![line],
12293            has_start_line: true,
12294            has_end_line: true,
12295            text_alignment: None,
12296        };
12297        let column = TextColumn {
12298            bbox: bbox.clone(),
12299            index: None,
12300            level: None,
12301            font_size: top - bottom,
12302            base_line: bottom + 2.0,
12303            slant_degree: 0.0,
12304            is_hidden_text: false,
12305            text_blocks: vec![block],
12306        };
12307        ContentElement::Heading(SemanticHeading {
12308            base: SemanticParagraph {
12309                base: SemanticTextNode {
12310                    bbox,
12311                    index: None,
12312                    level: None,
12313                    semantic_type: crate::models::enums::SemanticType::Heading,
12314                    correct_semantic_score: None,
12315                    columns: vec![column],
12316                    font_weight: Some(700.0),
12317                    font_size: Some(top - bottom),
12318                    text_color: None,
12319                    italic_angle: None,
12320                    font_name: Some("Lato-Bold".to_string()),
12321                    text_format: None,
12322                    max_font_size: Some(top - bottom),
12323                    background_color: None,
12324                    is_hidden_text: false,
12325                },
12326                enclosed_top: false,
12327                enclosed_bottom: false,
12328                indentation: 0,
12329            },
12330            heading_level: None,
12331        })
12332    }
12333
12334    fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
12335        make_paragraph_at(72.0, bottom, 300.0, top, text)
12336    }
12337
12338    fn make_paragraph_at(
12339        left: f64,
12340        bottom: f64,
12341        right: f64,
12342        top: f64,
12343        text: &str,
12344    ) -> ContentElement {
12345        let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12346        let chunk = TextChunk {
12347            value: text.to_string(),
12348            bbox: bbox.clone(),
12349            font_name: "Lato-Regular".to_string(),
12350            font_size: (top - bottom).max(1.0),
12351            font_weight: 400.0,
12352            italic_angle: 0.0,
12353            font_color: "#000000".to_string(),
12354            contrast_ratio: 21.0,
12355            symbol_ends: vec![],
12356            text_format: TextFormat::Normal,
12357            text_type: TextType::Regular,
12358            pdf_layer: PdfLayer::Main,
12359            ocg_visible: true,
12360            index: None,
12361            page_number: Some(1),
12362            level: None,
12363            mcid: None,
12364        };
12365        let line = TextLine {
12366            bbox: bbox.clone(),
12367            index: None,
12368            level: None,
12369            font_size: chunk.font_size,
12370            base_line: bottom + 2.0,
12371            slant_degree: 0.0,
12372            is_hidden_text: false,
12373            text_chunks: vec![chunk],
12374            is_line_start: true,
12375            is_line_end: true,
12376            is_list_line: false,
12377            connected_line_art_label: None,
12378        };
12379        let block = TextBlock {
12380            bbox: bbox.clone(),
12381            index: None,
12382            level: None,
12383            font_size: line.font_size,
12384            base_line: line.base_line,
12385            slant_degree: 0.0,
12386            is_hidden_text: false,
12387            text_lines: vec![line],
12388            has_start_line: true,
12389            has_end_line: true,
12390            text_alignment: None,
12391        };
12392        let column = TextColumn {
12393            bbox: bbox.clone(),
12394            index: None,
12395            level: None,
12396            font_size: block.font_size,
12397            base_line: block.base_line,
12398            slant_degree: 0.0,
12399            is_hidden_text: false,
12400            text_blocks: vec![block],
12401        };
12402        ContentElement::Paragraph(SemanticParagraph {
12403            base: SemanticTextNode {
12404                bbox,
12405                index: None,
12406                level: None,
12407                semantic_type: crate::models::enums::SemanticType::Paragraph,
12408                correct_semantic_score: None,
12409                columns: vec![column],
12410                font_weight: Some(400.0),
12411                font_size: Some(top - bottom),
12412                text_color: None,
12413                italic_angle: None,
12414                font_name: Some("Lato-Regular".to_string()),
12415                text_format: None,
12416                max_font_size: Some(top - bottom),
12417                background_color: None,
12418                is_hidden_text: false,
12419            },
12420            enclosed_top: false,
12421            enclosed_bottom: false,
12422            indentation: 0,
12423        })
12424    }
12425
12426    fn make_fallback_list(items: &[&str]) -> ContentElement {
12427        let mut list_items = Vec::new();
12428        for (idx, text) in items.iter().enumerate() {
12429            let top = 700.0 - idx as f64 * 18.0;
12430            let bottom = top - 12.0;
12431            let bbox = BoundingBox::new(Some(1), 72.0, bottom, 320.0, top);
12432            list_items.push(ListItem {
12433                bbox: bbox.clone(),
12434                index: None,
12435                level: None,
12436                label: ListLabel {
12437                    bbox: bbox.clone(),
12438                    content: vec![],
12439                    semantic_type: None,
12440                },
12441                body: ListBody {
12442                    bbox: bbox.clone(),
12443                    content: vec![],
12444                    semantic_type: None,
12445                },
12446                label_length: 0,
12447                contents: vec![make_paragraph_at(72.0, bottom, 320.0, top, text)],
12448                semantic_type: None,
12449            });
12450        }
12451
12452        ContentElement::List(PDFList {
12453            bbox: BoundingBox::new(
12454                Some(1),
12455                72.0,
12456                700.0 - items.len() as f64 * 18.0,
12457                320.0,
12458                700.0,
12459            ),
12460            index: None,
12461            level: None,
12462            list_items,
12463            numbering_style: Some("bullets".to_string()),
12464            common_prefix: None,
12465            previous_list_id: None,
12466            next_list_id: None,
12467        })
12468    }
12469
12470    fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
12471        let mut table_rows = Vec::new();
12472        for (ri, (title, page)) in rows.iter().enumerate() {
12473            let top = 680.0 - ri as f64 * 18.0;
12474            let bottom = top - 12.0;
12475            let left_bbox = BoundingBox::new(Some(1), 72.0, bottom, 280.0, top);
12476            let right_bbox = BoundingBox::new(Some(1), 320.0, bottom, 360.0, top);
12477            table_rows.push(TableBorderRow {
12478                bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
12479                index: None,
12480                level: None,
12481                row_number: ri,
12482                cells: vec![
12483                    TableBorderCell {
12484                        bbox: left_bbox.clone(),
12485                        index: None,
12486                        level: None,
12487                        row_number: ri,
12488                        col_number: 0,
12489                        row_span: 1,
12490                        col_span: 1,
12491                        content: vec![TableToken {
12492                            base: TextChunk {
12493                                value: (*title).to_string(),
12494                                bbox: left_bbox,
12495                                font_name: "Lato-Regular".to_string(),
12496                                font_size: 10.0,
12497                                font_weight: 400.0,
12498                                italic_angle: 0.0,
12499                                font_color: "#000000".to_string(),
12500                                contrast_ratio: 21.0,
12501                                symbol_ends: vec![],
12502                                text_format: TextFormat::Normal,
12503                                text_type: TextType::Regular,
12504                                pdf_layer: PdfLayer::Main,
12505                                ocg_visible: true,
12506                                index: None,
12507                                page_number: Some(1),
12508                                level: None,
12509                                mcid: None,
12510                            },
12511                            token_type: TableTokenType::Text,
12512                        }],
12513                        contents: vec![],
12514                        semantic_type: None,
12515                    },
12516                    TableBorderCell {
12517                        bbox: right_bbox.clone(),
12518                        index: None,
12519                        level: None,
12520                        row_number: ri,
12521                        col_number: 1,
12522                        row_span: 1,
12523                        col_span: 1,
12524                        content: vec![TableToken {
12525                            base: TextChunk {
12526                                value: (*page).to_string(),
12527                                bbox: right_bbox,
12528                                font_name: "Lato-Regular".to_string(),
12529                                font_size: 10.0,
12530                                font_weight: 400.0,
12531                                italic_angle: 0.0,
12532                                font_color: "#000000".to_string(),
12533                                contrast_ratio: 21.0,
12534                                symbol_ends: vec![],
12535                                text_format: TextFormat::Normal,
12536                                text_type: TextType::Regular,
12537                                pdf_layer: PdfLayer::Main,
12538                                ocg_visible: true,
12539                                index: None,
12540                                page_number: Some(1),
12541                                level: None,
12542                                mcid: None,
12543                            },
12544                            token_type: TableTokenType::Text,
12545                        }],
12546                        contents: vec![],
12547                        semantic_type: None,
12548                    },
12549                ],
12550                semantic_type: None,
12551            });
12552        }
12553
12554        ContentElement::TableBorder(TableBorder {
12555            bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
12556            index: None,
12557            level: Some("1".to_string()),
12558            x_coordinates: vec![72.0, 320.0, 360.0],
12559            x_widths: vec![0.0, 0.0, 0.0],
12560            y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
12561            y_widths: vec![0.0, 0.0, 0.0, 0.0],
12562            rows: table_rows,
12563            num_rows: rows.len(),
12564            num_columns: 2,
12565            is_bad_table: false,
12566            is_table_transformer: false,
12567            previous_table: None,
12568            next_table: None,
12569        })
12570    }
12571
12572    #[test]
12573    fn test_contents_document_renders_toc_table_rows() {
12574        let mut doc = PdfDocument::new("contents.pdf".to_string());
12575        doc.kids.push(make_heading("CONTENTS"));
12576        doc.kids.push(make_toc_table(&[
12577            ("Experiment #1: Hydrostatic Pressure", "3"),
12578            ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
12579            ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
12580            ("Experiment #4: Energy Loss in Pipes", "33"),
12581            ("Experiment #5: Impact of a Jet", "43"),
12582            ("Experiment #6: Orifice and Free Jet Flow", "50"),
12583            ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
12584            ("References", "101"),
12585        ]));
12586
12587        let md = to_markdown(&doc).unwrap();
12588        assert!(md.starts_with("# CONTENTS\n\n"));
12589        assert!(md.contains("- Experiment #1: Hydrostatic Pressure 3\n"));
12590        assert!(md.contains("- Experiment #2: Bernoulli's Theorem Demonstration 13\n"));
12591        assert!(md.contains("- Experiment #7: Osborne Reynolds' Demonstration 59\n"));
12592        assert!(md.contains("- References 101\n"));
12593    }
12594
12595    #[test]
12596    fn test_toc_semantic_paragraphs_render_without_blank_lines() {
12597        let mut doc = PdfDocument::new("toc-semantic.pdf".to_string());
12598        let mut first = make_paragraph(
12599            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
12600            700.0,
12601            712.0,
12602        );
12603        let mut second = make_paragraph("Section 5.1: The Linear Model 35", 684.0, 696.0);
12604        if let ContentElement::Paragraph(p) = &mut first {
12605            p.base.semantic_type = SemanticType::TableOfContent;
12606        }
12607        if let ContentElement::Paragraph(p) = &mut second {
12608            p.base.semantic_type = SemanticType::TableOfContent;
12609        }
12610        doc.kids.push(first);
12611        doc.kids.push(second);
12612
12613        let md = to_markdown(&doc).unwrap();
12614        assert!(md.contains(
12615            "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n"
12616        ));
12617    }
12618
12619    #[test]
12620    fn test_compact_toc_document_renders_without_blank_lines() {
12621        let mut doc = PdfDocument::new("compact-toc.pdf".to_string());
12622        doc.kids.push(make_paragraph(
12623            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
12624            700.0,
12625            712.0,
12626        ));
12627        doc.kids.push(make_paragraph(
12628            "Section 5.1: The Linear Model 35",
12629            684.0,
12630            696.0,
12631        ));
12632        doc.kids.push(make_paragraph(
12633            "Part VI. Chapter Six - Comparing Three or More Group Means",
12634            668.0,
12635            680.0,
12636        ));
12637        doc.kids.push(make_paragraph(
12638            "Section 6.1: Between Versus Within Group Analyses 49",
12639            652.0,
12640            664.0,
12641        ));
12642        doc.kids.push(make_paragraph(
12643            "Part VII. Chapter Seven - Moderation and Mediation Analyses",
12644            636.0,
12645            648.0,
12646        ));
12647        doc.kids.push(make_paragraph(
12648            "Section 7.1: Mediation and Moderation Models 64",
12649            620.0,
12650            632.0,
12651        ));
12652        doc.kids
12653            .push(make_paragraph("References 101", 604.0, 616.0));
12654        doc.kids.push(make_paragraph(
12655            "Section 8.1: Factor Analysis Definitions 75",
12656            588.0,
12657            600.0,
12658        ));
12659
12660        let md = to_markdown(&doc).unwrap();
12661        assert!(md.contains(
12662            "# Part V. Chapter Five - Comparing Associations Between Multiple Variables\n\n## Section 5.1: The Linear Model"
12663        ));
12664        assert!(md.contains(
12665            "# Part VI. Chapter Six - Comparing Three or More Group Means\n\n## Section 6.1: Between Versus Within Group Analyses"
12666        ));
12667        assert!(md.contains("References 101\n\n## Section 8.1: Factor Analysis Definitions"));
12668    }
12669
12670    #[test]
12671    fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
12672        let mut doc = PdfDocument::new("caption-body.pdf".to_string());
12673        doc.kids.push(make_paragraph(
12674            "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.",
12675            500.0,
12676            540.0,
12677        ));
12678
12679        let md = to_markdown(&doc).unwrap();
12680        assert!(md.contains("USGS)\n\nOur nearest astronomical neighbor"));
12681    }
12682
12683    #[test]
12684    fn test_short_caption_label_merges_with_following_tail_and_body() {
12685        let mut doc = PdfDocument::new("diagram-caption.pdf".to_string());
12686        doc.kids.push(make_paragraph("Diagram 5", 540.0, 552.0));
12687        doc.kids.push(make_paragraph(
12688            "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.",
12689            520.0,
12690            532.0,
12691        ));
12692
12693        let md = to_markdown(&doc).unwrap();
12694        assert!(md.contains(
12695            "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers"
12696        ));
12697    }
12698
12699    #[test]
12700    fn test_short_caption_label_merges_with_tail_and_year() {
12701        let mut doc = PdfDocument::new("figure-caption.pdf".to_string());
12702        doc.kids.push(make_paragraph("Figure 4", 540.0, 552.0));
12703        doc.kids.push(make_paragraph(
12704            "Komnas HAM's YouTube channel as of 1 December",
12705            520.0,
12706            532.0,
12707        ));
12708        doc.kids.push(make_paragraph("2021", 500.0, 512.0));
12709
12710        let md = to_markdown(&doc).unwrap();
12711        assert!(md.contains("Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021"));
12712        assert!(!md.contains("\n\n2021"));
12713    }
12714
12715    #[test]
12716    fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
12717        let mut doc = PdfDocument::new("chart.pdf".to_string());
12718        doc.kids.push(make_paragraph("Figure 1", 760.0, 772.0));
12719        doc.kids.push(make_paragraph("100", 520.0, 528.0));
12720        doc.kids
12721            .push(make_paragraph("Body text continues here.", 400.0, 412.0));
12722        doc.kids.push(make_paragraph("36", 20.0, 28.0));
12723
12724        let md = to_markdown(&doc).unwrap();
12725        assert!(md.contains("100"));
12726        assert!(!md.lines().any(|line| line.trim() == "36"));
12727    }
12728
12729    #[test]
12730    fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
12731        let mut doc = PdfDocument::new("paragraphs.pdf".to_string());
12732        doc.kids.push(make_paragraph(
12733            "First semantic paragraph ends here.",
12734            520.0,
12735            532.0,
12736        ));
12737        doc.kids.push(make_paragraph(
12738            "Second semantic paragraph starts here.",
12739            500.0,
12740            512.0,
12741        ));
12742
12743        let md = to_markdown(&doc).unwrap();
12744        assert!(md.contains(
12745            "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here."
12746        ));
12747    }
12748
12749    #[test]
12750    fn test_lowercase_semantic_paragraph_continuation_is_merged() {
12751        let mut doc = PdfDocument::new("continuation.pdf".to_string());
12752        doc.kids.push(make_paragraph(
12753            "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference",
12754            520.0,
12755            532.0,
12756        ));
12757        doc.kids.push(make_paragraph("of interest.", 500.0, 512.0));
12758
12759        let md = to_markdown(&doc).unwrap();
12760        assert!(md.contains(
12761            "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest."
12762        ));
12763    }
12764
12765    #[test]
12766    fn test_semantic_enumerated_paragraphs_are_not_merged() {
12767        let mut doc = PdfDocument::new("enumerated-paragraphs.pdf".to_string());
12768        doc.kids.push(make_paragraph(
12769            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
12770            520.0,
12771            532.0,
12772        ));
12773        doc.kids.push(make_paragraph(
12774            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
12775            500.0,
12776            512.0,
12777        ));
12778
12779        let md = to_markdown(&doc).unwrap();
12780        assert!(md.contains(
12781            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\n\niv. This business model will be operating cost-oriented not capital cost-oriented."
12782        ));
12783    }
12784
12785    #[test]
12786    fn test_leading_figure_carryover_is_skipped_before_first_numbered_heading() {
12787        let mut doc = PdfDocument::new("leading-figure-carryover.pdf".to_string());
12788        doc.number_of_pages = 1;
12789        doc.kids.push(make_paragraph_at(
12790            72.0,
12791            742.0,
12792            540.0,
12793            756.0,
12794            "Figure 6. Mytella strigata biofouling green mussel farms in Bacoor City, Cavite, Manila Bay",
12795        ));
12796        doc.kids.push(make_heading_at(
12797            72.0,
12798            680.0,
12799            260.0,
12800            696.0,
12801            "5. Natural dispersal",
12802        ));
12803        doc.kids.push(make_paragraph_at(
12804            72.0,
12805            640.0,
12806            540.0,
12807            654.0,
12808            "Dispersal by purely natural means is not included as a pathway of biological invasions.",
12809        ));
12810
12811        let md = to_markdown(&doc).unwrap();
12812        assert!(md.starts_with("# 5. Natural dispersal"));
12813        assert!(!md.contains("Figure 6. Mytella strigata"));
12814    }
12815
12816    #[test]
12817    fn test_list_renderer_strips_duplicate_bullets_and_skips_bullet_only_items() {
12818        let mut doc = PdfDocument::new("bullets.pdf".to_string());
12819        doc.kids.push(make_fallback_list(&[
12820            "• First item",
12821            "•",
12822            "• Second item",
12823            "133",
12824        ]));
12825
12826        let md = to_markdown(&doc).unwrap();
12827        assert!(md.contains("- First item"));
12828        assert!(md.contains("- Second item"));
12829        assert!(!md.contains("- • First item"));
12830        assert!(!md.contains("\n- •\n"));
12831        assert!(!md.contains("\n- 133\n"));
12832    }
12833
12834    #[test]
12835    fn test_list_renderer_merges_wrapped_continuation_items() {
12836        let mut doc = PdfDocument::new("wrapped-list.pdf".to_string());
12837        doc.kids.push(make_fallback_list(&[
12838            "Use a micropipette to add 2 μL of loading dye",
12839            "and down a couple of times to mix the loading dye with the digested DNA.",
12840            "Use a fresh pipet tip for each reaction tube.",
12841        ]));
12842
12843        let md = to_markdown(&doc).unwrap();
12844        assert!(md.contains(
12845            "- Use a micropipette to add 2 μL of loading dye and down a couple of times to mix the loading dye with the digested DNA."
12846        ));
12847        assert!(md.contains("- Use a fresh pipet tip for each reaction tube."));
12848        assert!(!md.contains("\n- and down"));
12849    }
12850
12851    #[test]
12852    fn test_list_renderer_keeps_enumerated_items_separate() {
12853        let mut doc = PdfDocument::new("enumerated-list.pdf".to_string());
12854        doc.kids.push(make_fallback_list(&[
12855            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
12856            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
12857            "v. Assumed selling price of wood pellet is $100 per tonne and appropriate.",
12858        ]));
12859
12860        let md = to_markdown(&doc).unwrap();
12861        assert!(md.contains("iii. Looking at cost items, the cost of raw woods procurement will be highest share.\niv. This business model will be operating cost-oriented not capital cost-oriented.\nv. Assumed selling price of wood pellet is $100 per tonne and appropriate."));
12862        assert!(!md.contains("- iii."));
12863    }
12864
12865    #[test]
12866    fn test_postprocess_drops_isolated_single_char_noise_lines() {
12867        let markdown = "# The Data Journey\n\n1\n\nTo get started.\n\no\n\nNOTE: Keep going.\n";
12868        let cleaned = drop_isolated_noise_lines(markdown);
12869        assert!(!cleaned.contains("\n1\n"));
12870        assert!(!cleaned.contains("\no\n"));
12871        assert!(cleaned.contains("To get started."));
12872        assert!(cleaned.contains("NOTE: Keep going."));
12873    }
12874
12875    fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
12876        let mut table_rows = Vec::new();
12877        for (row_number, (left, right)) in rows.iter().enumerate() {
12878            let top = 656.0 - row_number as f64 * 18.0;
12879            let bottom = top - 16.0;
12880            let mut cells = Vec::new();
12881            for (col_number, (text, left_x, right_x)) in
12882                [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
12883                    .into_iter()
12884                    .enumerate()
12885            {
12886                let content = if text.is_empty() {
12887                    Vec::new()
12888                } else {
12889                    vec![TableToken {
12890                        base: TextChunk {
12891                            value: text.to_string(),
12892                            bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12893                            font_name: "Test".to_string(),
12894                            font_size: 11.0,
12895                            font_weight: 400.0,
12896                            italic_angle: 0.0,
12897                            font_color: "[0.0]".to_string(),
12898                            contrast_ratio: 21.0,
12899                            symbol_ends: Vec::new(),
12900                            text_format: TextFormat::Normal,
12901                            text_type: TextType::Regular,
12902                            pdf_layer: PdfLayer::Main,
12903                            ocg_visible: true,
12904                            index: None,
12905                            page_number: Some(1),
12906                            level: None,
12907                            mcid: None,
12908                        },
12909                        token_type: TableTokenType::Text,
12910                    }]
12911                };
12912                cells.push(TableBorderCell {
12913                    bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12914                    index: None,
12915                    level: None,
12916                    row_number,
12917                    col_number,
12918                    row_span: 1,
12919                    col_span: 1,
12920                    content,
12921                    contents: vec![],
12922                    semantic_type: None,
12923                });
12924            }
12925
12926            table_rows.push(TableBorderRow {
12927                bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
12928                index: None,
12929                level: None,
12930                row_number,
12931                cells,
12932                semantic_type: None,
12933            });
12934        }
12935
12936        ContentElement::TableBorder(TableBorder {
12937            bbox: BoundingBox::new(
12938                Some(1),
12939                72.0,
12940                656.0 - rows.len() as f64 * 18.0 - 16.0,
12941                420.0,
12942                656.0,
12943            ),
12944            index: None,
12945            level: Some("1".to_string()),
12946            x_coordinates: vec![72.0, 220.0, 420.0],
12947            x_widths: vec![0.0; 3],
12948            y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
12949            y_widths: vec![0.0; rows.len() + 1],
12950            rows: table_rows,
12951            num_rows: rows.len(),
12952            num_columns: 2,
12953            is_bad_table: false,
12954            is_table_transformer: false,
12955            previous_table: None,
12956            next_table: None,
12957        })
12958    }
12959
12960    fn make_chunked_paragraph_line(
12961        segments: &[(&str, f64, f64)],
12962        bottom: f64,
12963        top: f64,
12964    ) -> ContentElement {
12965        let bbox = BoundingBox::new(
12966            Some(1),
12967            segments.first().map(|(_, left, _)| *left).unwrap_or(72.0),
12968            bottom,
12969            segments.last().map(|(_, _, right)| *right).unwrap_or(320.0),
12970            top,
12971        );
12972
12973        let chunks = segments
12974            .iter()
12975            .map(|(text, left, right)| TextChunk {
12976                value: (*text).to_string(),
12977                bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
12978                font_name: "Lato-Regular".to_string(),
12979                font_size: top - bottom,
12980                font_weight: 400.0,
12981                italic_angle: 0.0,
12982                font_color: "#000000".to_string(),
12983                contrast_ratio: 21.0,
12984                symbol_ends: vec![],
12985                text_format: TextFormat::Normal,
12986                text_type: TextType::Regular,
12987                pdf_layer: PdfLayer::Main,
12988                ocg_visible: true,
12989                index: None,
12990                page_number: Some(1),
12991                level: None,
12992                mcid: None,
12993            })
12994            .collect::<Vec<_>>();
12995
12996        let line = TextLine {
12997            bbox: bbox.clone(),
12998            index: None,
12999            level: None,
13000            font_size: top - bottom,
13001            base_line: bottom + 2.0,
13002            slant_degree: 0.0,
13003            is_hidden_text: false,
13004            text_chunks: chunks,
13005            is_line_start: true,
13006            is_line_end: true,
13007            is_list_line: false,
13008            connected_line_art_label: None,
13009        };
13010        let block = TextBlock {
13011            bbox: bbox.clone(),
13012            index: None,
13013            level: None,
13014            font_size: line.font_size,
13015            base_line: line.base_line,
13016            slant_degree: 0.0,
13017            is_hidden_text: false,
13018            text_lines: vec![line],
13019            has_start_line: true,
13020            has_end_line: true,
13021            text_alignment: None,
13022        };
13023        let column = TextColumn {
13024            bbox: bbox.clone(),
13025            index: None,
13026            level: None,
13027            font_size: block.font_size,
13028            base_line: block.base_line,
13029            slant_degree: 0.0,
13030            is_hidden_text: false,
13031            text_blocks: vec![block],
13032        };
13033
13034        ContentElement::Paragraph(SemanticParagraph {
13035            base: SemanticTextNode {
13036                bbox,
13037                index: None,
13038                level: None,
13039                semantic_type: SemanticType::Paragraph,
13040                correct_semantic_score: None,
13041                columns: vec![column],
13042                font_weight: Some(400.0),
13043                font_size: Some(top - bottom),
13044                text_color: None,
13045                italic_angle: None,
13046                font_name: Some("Lato-Regular".to_string()),
13047                text_format: None,
13048                max_font_size: Some(top - bottom),
13049                background_color: None,
13050                is_hidden_text: false,
13051            },
13052            enclosed_top: false,
13053            enclosed_bottom: false,
13054            indentation: 0,
13055        })
13056    }
13057
13058    fn make_n_column_table(rows: &[Vec<&str>], column_bounds: &[(f64, f64)]) -> ContentElement {
13059        let mut table_rows = Vec::new();
13060        for (row_number, row_values) in rows.iter().enumerate() {
13061            let top = 656.0 - row_number as f64 * 18.0;
13062            let bottom = top - 16.0;
13063            let mut cells = Vec::new();
13064            for (col_number, (left_x, right_x)) in column_bounds.iter().enumerate() {
13065                let text = row_values.get(col_number).copied().unwrap_or("");
13066                let content = if text.is_empty() {
13067                    Vec::new()
13068                } else {
13069                    vec![TableToken {
13070                        base: TextChunk {
13071                            value: text.to_string(),
13072                            bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
13073                            font_name: "Test".to_string(),
13074                            font_size: 11.0,
13075                            font_weight: 400.0,
13076                            italic_angle: 0.0,
13077                            font_color: "[0.0]".to_string(),
13078                            contrast_ratio: 21.0,
13079                            symbol_ends: Vec::new(),
13080                            text_format: TextFormat::Normal,
13081                            text_type: TextType::Regular,
13082                            pdf_layer: PdfLayer::Main,
13083                            ocg_visible: true,
13084                            index: None,
13085                            page_number: Some(1),
13086                            level: None,
13087                            mcid: None,
13088                        },
13089                        token_type: TableTokenType::Text,
13090                    }]
13091                };
13092                cells.push(TableBorderCell {
13093                    bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
13094                    index: None,
13095                    level: None,
13096                    row_number,
13097                    col_number,
13098                    row_span: 1,
13099                    col_span: 1,
13100                    content,
13101                    contents: vec![],
13102                    semantic_type: None,
13103                });
13104            }
13105
13106            table_rows.push(TableBorderRow {
13107                bbox: BoundingBox::new(
13108                    Some(1),
13109                    column_bounds.first().map(|(left, _)| *left).unwrap_or(72.0),
13110                    bottom,
13111                    column_bounds
13112                        .last()
13113                        .map(|(_, right)| *right)
13114                        .unwrap_or(420.0),
13115                    top,
13116                ),
13117                index: None,
13118                level: None,
13119                row_number,
13120                cells,
13121                semantic_type: None,
13122            });
13123        }
13124
13125        let left = column_bounds
13126            .first()
13127            .map(|(value, _)| *value)
13128            .unwrap_or(72.0);
13129        let right = column_bounds
13130            .last()
13131            .map(|(_, value)| *value)
13132            .unwrap_or(420.0);
13133        let x_coordinates = std::iter::once(left)
13134            .chain(column_bounds.iter().map(|(_, right)| *right))
13135            .collect::<Vec<_>>();
13136
13137        ContentElement::TableBorder(TableBorder {
13138            bbox: BoundingBox::new(
13139                Some(1),
13140                left,
13141                656.0 - rows.len() as f64 * 18.0 - 16.0,
13142                right,
13143                656.0,
13144            ),
13145            index: None,
13146            level: Some("1".to_string()),
13147            x_coordinates,
13148            x_widths: vec![0.0; column_bounds.len() + 1],
13149            y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
13150            y_widths: vec![0.0; rows.len() + 1],
13151            rows: table_rows,
13152            num_rows: rows.len(),
13153            num_columns: column_bounds.len(),
13154            is_bad_table: false,
13155            is_table_transformer: false,
13156            previous_table: None,
13157            next_table: None,
13158        })
13159    }
13160
13161    #[test]
13162    fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
13163        let mut doc = PdfDocument::new("cec-table.pdf".to_string());
13164        doc.number_of_pages = 1;
13165        doc.kids.push(make_two_column_table(&[
13166            ("Mineral or colloid type", "CEC of pure colloid"),
13167            ("", "cmolc/kg"),
13168            ("kaolinite", "10"),
13169            ("illite", "30"),
13170        ]));
13171
13172        let md = to_markdown(&doc).unwrap();
13173        assert!(md.contains("| --- | --- |"));
13174        assert!(md.contains("| kaolinite | 10 |"));
13175    }
13176
13177    #[test]
13178    fn test_blank_right_column_table_is_not_misrendered_as_toc() {
13179        let mut doc = PdfDocument::new("flocculation-table.pdf".to_string());
13180        doc.number_of_pages = 1;
13181        doc.kids.push(make_two_column_table(&[
13182            (
13183                "Added cation",
13184                "Relative Size & Settling Rates of Floccules",
13185            ),
13186            ("K+", ""),
13187            ("Na+", ""),
13188            ("Ca2+", ""),
13189        ]));
13190
13191        let md = to_markdown(&doc).unwrap();
13192        assert!(md.contains("| Added cation | Relative Size & Settling Rates of Floccules |"));
13193        assert!(md.contains("| K+ |  |"));
13194    }
13195
13196    #[test]
13197    fn test_infographic_card_table_renders_as_numbered_item() {
13198        let mut doc = PdfDocument::new("infographic-card.pdf".to_string());
13199        doc.number_of_pages = 1;
13200        doc.kids.push(make_two_column_table(&[
13201            (
13202                "1",
13203                "We're all both consumers and creators of creative work.",
13204            ),
13205            (
13206                "",
13207                "As consumers, we watch movies, listen to music, read books, and more.",
13208            ),
13209        ]));
13210
13211        let md = to_markdown(&doc).unwrap();
13212        assert!(md.contains(
13213            "1. We're all both consumers and creators of creative work. As consumers, we watch movies, listen to music, read books, and more."
13214        ));
13215        assert!(!md.contains("| 1 |"));
13216    }
13217
13218    #[test]
13219    fn test_grouped_header_rows_are_preserved_without_flattening() {
13220        let mut doc = PdfDocument::new("grouped-header.pdf".to_string());
13221        doc.number_of_pages = 1;
13222        doc.kids.push(make_n_column_table(
13223            &[
13224                vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
13225                vec![
13226                    "",
13227                    "Alpaca-GPT4",
13228                    "OpenOrca",
13229                    "Synth. Math-Instruct",
13230                    "Orca DPO Pairs",
13231                    "Ultrafeedback Cleaned",
13232                    "Synth. Math-Alignment",
13233                ],
13234                vec![
13235                    "Total # Samples",
13236                    "52K",
13237                    "2.91M",
13238                    "126K",
13239                    "12.9K",
13240                    "60.8K",
13241                    "126K",
13242                ],
13243            ],
13244            &[
13245                (72.0, 120.0),
13246                (120.0, 170.0),
13247                (170.0, 220.0),
13248                (220.0, 280.0),
13249                (280.0, 340.0),
13250                (340.0, 410.0),
13251                (410.0, 470.0),
13252            ],
13253        ));
13254
13255        let md = to_markdown(&doc).unwrap();
13256        assert!(md.contains(
13257            "| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"
13258        ));
13259        assert!(md.contains(
13260            "|  | Alpaca-GPT4 | OpenOrca | Synth. Math-Instruct | Orca DPO Pairs | Ultrafeedback Cleaned | Synth. Math-Alignment |"
13261        ));
13262        assert!(!md.contains("Instruction OpenOrca"));
13263        assert!(!md.contains("Alignment Ultrafeedback"));
13264    }
13265
13266    #[test]
13267    fn test_top_table_plate_renderer_stops_before_article_body() {
13268        let mut doc = PdfDocument::new("table-plate.pdf".to_string());
13269        doc.number_of_pages = 1;
13270        doc.kids
13271            .push(make_paragraph_at(72.0, 724.0, 200.0, 736.0, "SOLAR 10.7B"));
13272        doc.kids.push(make_paragraph_at(
13273            72.0,
13274            704.0,
13275            220.0,
13276            716.0,
13277            "Training datasets",
13278        ));
13279        doc.kids.push(make_n_column_table(
13280            &[
13281                vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
13282                vec![
13283                    "",
13284                    "Alpaca-GPT4",
13285                    "OpenOrca",
13286                    "Synth. Math-Instruct",
13287                    "Orca DPO Pairs",
13288                    "Ultrafeedback Cleaned",
13289                    "Synth. Math-Alignment",
13290                ],
13291                vec![
13292                    "Total # Samples",
13293                    "52K",
13294                    "2.91M",
13295                    "126K",
13296                    "12.9K",
13297                    "60.8K",
13298                    "126K",
13299                ],
13300                vec![
13301                    "Maximum # Samples Used",
13302                    "52K",
13303                    "100K",
13304                    "52K",
13305                    "12.9K",
13306                    "60.8K",
13307                    "20.1K",
13308                ],
13309                vec!["Open Source", "O", "O", "✗", "O", "O", "✗"],
13310            ],
13311            &[
13312                (78.0, 125.0),
13313                (125.0, 175.0),
13314                (175.0, 225.0),
13315                (225.0, 285.0),
13316                (285.0, 345.0),
13317                (345.0, 415.0),
13318                (415.0, 490.0),
13319            ],
13320        ));
13321        doc.kids.push(make_paragraph_at(
13322            72.0,
13323            500.0,
13324            310.0,
13325            514.0,
13326            "Table 1: Training datasets used for the instruction and alignment tuning stages, respectively.",
13327        ));
13328        doc.kids.push(make_paragraph_at(
13329            286.0,
13330            484.0,
13331            526.0,
13332            498.0,
13333            "Open source indicates whether the dataset is open-sourced.",
13334        ));
13335        doc.kids.push(make_paragraph_at(
13336            72.0,
13337            360.0,
13338            290.0,
13339            388.0,
13340            "Comparison to other up-scaling methods. Unlike Komatsuzaki et al. (2022)...",
13341        ));
13342
13343        let md = to_markdown(&doc).unwrap();
13344        assert!(md.contains("Table 1: Training datasets used for the instruction"));
13345        assert!(md.contains("| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"));
13346        assert!(!md.contains("Comparison to other up-scaling methods"));
13347    }
13348
13349    #[test]
13350    fn test_late_section_boundary_renderer_drops_equation_carryover() {
13351        let mut doc = PdfDocument::new("late-section.pdf".to_string());
13352        doc.number_of_pages = 1;
13353        doc.kids.push(make_paragraph_at(
13354            72.0,
13355            700.0,
13356            540.0,
13357            714.0,
13358            "The horizontal distance traveled by the jet is equal to:",
13359        ));
13360        doc.kids.push(make_paragraph_at(
13361            72.0,
13362            640.0,
13363            540.0,
13364            654.0,
13365            "The vertical position of the jet may be calculated as:",
13366        ));
13367        doc.kids.push(make_paragraph_at(
13368            72.0,
13369            580.0,
13370            260.0,
13371            594.0,
13372            "Rearranging Equation (8) gives:",
13373        ));
13374        doc.kids.push(make_paragraph_at(
13375            72.0,
13376            520.0,
13377            420.0,
13378            534.0,
13379            "Substitution into Equation 7 results in:",
13380        ));
13381        doc.kids.push(make_paragraph_at(
13382            72.0,
13383            460.0,
13384            280.0,
13385            474.0,
13386            "Equations (10) can be rearranged to find Cv:",
13387        ));
13388        doc.kids.push(make_heading_at(
13389            72.0,
13390            350.0,
13391            420.0,
13392            366.0,
13393            "7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE",
13394        ));
13395        doc.kids.push(make_paragraph_at(
13396            72.0,
13397            326.0,
13398            380.0,
13399            340.0,
13400            "If C_d is assumed to be constant, then a graph of Q plotted against",
13401        ));
13402        doc.kids.push(make_paragraph_at(
13403            400.0,
13404            326.0,
13405            540.0,
13406            340.0,
13407            "(Equation 6) will be linear, and",
13408        ));
13409        doc.kids.push(make_paragraph_at(
13410            72.0,
13411            310.0,
13412            240.0,
13413            324.0,
13414            "the slope of this graph will be:",
13415        ));
13416        doc.kids.push(make_paragraph_at(
13417            360.0,
13418            36.0,
13419            550.0,
13420            48.0,
13421            "EXPERIMENT #6: ORIFICE AND FREE JET FLOW 53",
13422        ));
13423
13424        let md = to_markdown(&doc).unwrap();
13425        assert!(md.starts_with("# 7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE"));
13426        assert!(md.contains(
13427            "If C_d is assumed to be constant, then a graph of Q plotted against (Equation 6) will be linear, and the slope of this graph will be:"
13428        ));
13429        assert!(!md.contains("The horizontal distance traveled by the jet"));
13430        assert!(!md.contains("EXPERIMENT #6"));
13431    }
13432
13433    #[test]
13434    fn test_leading_table_carryover_row_is_trimmed_from_general_renderer() {
13435        let mut doc = PdfDocument::new("carryover-table.pdf".to_string());
13436        doc.number_of_pages = 1;
13437        doc.kids.push(make_n_column_table(
13438            &[
13439                vec![
13440                    "Jurisdiction",
13441                    "GATS XVII Reservation (1994)",
13442                    "Foreign Ownership Permitted",
13443                    "Restrictions on Foreign Ownership",
13444                    "Foreign Ownership Reporting Requirements",
13445                ],
13446                vec![
13447                    "",
13448                    "",
13449                    "",
13450                    "right required to acquire desert lands and continue the prior page",
13451                    "",
13452                ],
13453                vec!["Finland", "N", "Y", "Prior approval may be required.", ""],
13454                vec!["France", "N", "Y", "None.", ""],
13455            ],
13456            &[
13457                (72.0, 150.0),
13458                (150.0, 235.0),
13459                (235.0, 330.0),
13460                (330.0, 500.0),
13461                (500.0, 560.0),
13462            ],
13463        ));
13464
13465        let md = to_markdown(&doc).unwrap();
13466        assert!(!md.contains("right required to acquire desert lands"));
13467        assert!(md.contains("| Finland | N | Y | Prior approval may be required. |  |"));
13468    }
13469
13470    #[test]
13471    fn test_single_table_report_renderer_promotes_title_and_skips_footer() {
13472        let mut doc = PdfDocument::new("single-table-report.pdf".to_string());
13473        doc.number_of_pages = 1;
13474        doc.kids.push(make_paragraph_at(
13475            140.0,
13476            674.0,
13477            474.0,
13478            688.0,
13479            "Restrictions on Land Ownership by Foreigners in Selected Jurisdictions",
13480        ));
13481        doc.kids.push(make_n_column_table(
13482            &[
13483                vec![
13484                    "Jurisdiction",
13485                    "GATS XVII Reservation (1994)",
13486                    "Foreign Ownership Permitted",
13487                    "Restrictions on Foreign Ownership",
13488                    "Foreign Ownership Reporting Requirements",
13489                ],
13490                vec![
13491                    "",
13492                    "",
13493                    "",
13494                    "right required to acquire desert lands and continue the prior page",
13495                    "",
13496                ],
13497                vec![
13498                    "Finland",
13499                    "N",
13500                    "Y",
13501                    "Prior approval from the Government of Aland may be required.",
13502                    "",
13503                ],
13504                vec!["France", "N", "Y", "None.", ""],
13505            ],
13506            &[
13507                (72.0, 150.0),
13508                (150.0, 235.0),
13509                (235.0, 330.0),
13510                (330.0, 500.0),
13511                (500.0, 560.0),
13512            ],
13513        ));
13514        doc.kids.push(make_paragraph_at(
13515            350.0,
13516            36.0,
13517            548.0,
13518            48.0,
13519            "The Law Library of Congress 7",
13520        ));
13521
13522        let md = to_markdown(&doc).unwrap();
13523        assert!(md.starts_with(
13524            "# Restrictions on Land Ownership by Foreigners in Selected Jurisdictions"
13525        ));
13526        assert!(!md.contains("right required to acquire desert lands"));
13527        assert!(!md.contains("The Law Library of Congress 7"));
13528        assert!(md.contains(
13529            "| Finland | N | Y | Prior approval from the Government of Aland may be required. |  |"
13530        ));
13531    }
13532
13533    #[test]
13534    fn test_hyphenated_table_title_continuation_renders_as_heading() {
13535        let mut doc = PdfDocument::new("hyphenated-table-title.pdf".to_string());
13536        doc.number_of_pages = 1;
13537        doc.kids.push(make_paragraph_at(
13538            72.0,
13539            724.0,
13540            520.0,
13541            738.0,
13542            "With this in mind, here we have the 7 key competence areas selected to form a part of Eco-",
13543        ));
13544        doc.kids.push(make_paragraph_at(
13545            72.0,
13546            704.0,
13547            260.0,
13548            718.0,
13549            "Circle's Competence Framework:",
13550        ));
13551        doc.kids.push(make_n_column_table(
13552            &[
13553                vec!["Eco-Circle Competence Framework"],
13554                vec!["#1: The 3 Rs: Recycle-Reuse-Reduce"],
13555                vec!["#2: Lifecycle of Circular Economy"],
13556            ],
13557            &[(140.0, 460.0)],
13558        ));
13559
13560        let md = to_markdown(&doc).unwrap();
13561        assert!(md.contains("# Circle's Competence Framework:"), "{md}");
13562    }
13563
13564    #[test]
13565    fn test_duplicate_table_header_heading_is_demoted() {
13566        let mut doc = PdfDocument::new("duplicate-table-header-heading.pdf".to_string());
13567        doc.number_of_pages = 1;
13568        doc.kids
13569            .push(make_heading("MOHAVE COMMUNITY COLLEGE BIO181"));
13570        doc.kids.push(make_n_column_table(
13571            &[
13572                vec![
13573                    "",
13574                    "Saccharometer",
13575                    "DI Water",
13576                    "Glucose Solution",
13577                    "Yeast Suspension",
13578                ],
13579                vec!["1", "", "8 ml", "6 ml", "0 ml"],
13580                vec!["2", "", "12 ml", "0 ml", "2 ml"],
13581                vec!["3", "", "6 ml", "6 ml", "2 ml"],
13582            ],
13583            &[
13584                (72.0, 110.0),
13585                (110.0, 210.0),
13586                (210.0, 300.0),
13587                (300.0, 430.0),
13588                (430.0, 540.0),
13589            ],
13590        ));
13591        doc.kids.push(make_heading_at(
13592            72.0,
13593            92.0,
13594            390.0,
13595            108.0,
13596            "Saccharometer DI Water Glucose Solution Yeast Suspension",
13597        ));
13598        doc.kids
13599            .push(make_paragraph_at(72.0, 72.0, 120.0, 88.0, "below"));
13600        doc.kids
13601            .push(make_paragraph_at(72.0, 56.0, 240.0, 72.0, "1 16 ml 12 ml"));
13602        doc.kids
13603            .push(make_paragraph_at(296.0, 56.0, 340.0, 72.0, "0 ml"));
13604
13605        let md = to_markdown(&doc).unwrap();
13606        assert!(
13607            md.contains("Saccharometer DI Water Glucose Solution Yeast Suspension"),
13608            "{md}"
13609        );
13610        assert!(
13611            !md.contains("# Saccharometer DI Water Glucose Solution Yeast Suspension"),
13612            "{md}"
13613        );
13614    }
13615
13616    #[test]
13617    fn test_geometric_panel_headers_are_promoted_into_table() {
13618        let mut doc = PdfDocument::new("ai-pack-panel.pdf".to_string());
13619        doc.kids.push(make_chunked_paragraph_line(
13620            &[("OCR", 220.0, 250.0)],
13621            720.0,
13622            732.0,
13623        ));
13624        doc.kids.push(make_chunked_paragraph_line(
13625            &[("Recommendation", 430.0, 540.0)],
13626            720.0,
13627            732.0,
13628        ));
13629        doc.kids.push(make_chunked_paragraph_line(
13630            &[("Product semantic search", 660.0, 860.0)],
13631            720.0,
13632            732.0,
13633        ));
13634        doc.kids.push(make_chunked_paragraph_line(
13635            &[("Pack", 72.0, 110.0)],
13636            684.0,
13637            696.0,
13638        ));
13639        doc.kids.push(make_chunked_paragraph_line(
13640            &[("A solution that recognizes characters", 140.0, 340.0)],
13641            684.0,
13642            696.0,
13643        ));
13644        doc.kids.push(make_chunked_paragraph_line(
13645            &[("A solution that recommends the best products", 390.0, 620.0)],
13646            684.0,
13647            696.0,
13648        ));
13649        doc.kids.push(make_chunked_paragraph_line(
13650            &[("A solution that enables semantic search", 650.0, 900.0)],
13651            684.0,
13652            696.0,
13653        ));
13654        doc.kids.push(make_n_column_table(
13655            &[
13656                vec![
13657                    "Achieved 1st place in the OCR World Competition",
13658                    "Team with specialists and technologies",
13659                    "Creation of the first natural language evaluation",
13660                ],
13661                vec![
13662                    "The team includes specialists who have",
13663                    "received Kaggle's Gold Medal recommendation",
13664                    "system in Korean (KLUE)",
13665                ],
13666                vec![
13667                    "presented 14 papers in renowned AI conferences",
13668                    "top-tier recommendation",
13669                    "Shopee subject",
13670                ],
13671            ],
13672            &[(120.0, 360.0), (360.0, 630.0), (630.0, 910.0)],
13673        ));
13674        doc.kids.push(make_chunked_paragraph_line(
13675            &[("models", 430.0, 490.0)],
13676            552.0,
13677            564.0,
13678        ));
13679
13680        let md = to_markdown(&doc).unwrap();
13681        assert!(md.contains("| Pack | OCR | Recommendation | Product semantic search |"));
13682        assert!(md.contains("| A solution that recognizes characters | A solution that recommends the best products | A solution that enables semantic search |"));
13683        assert!(md.contains(
13684            "received Kaggle's Gold Medal recommendation top-tier recommendation models"
13685        ));
13686    }
13687
13688    #[test]
13689    fn test_embedded_stub_header_is_promoted_from_first_table_column() {
13690        let mut doc = PdfDocument::new("embedded-stub-header.pdf".to_string());
13691        doc.kids.push(make_chunked_paragraph_line(
13692            &[("OCR", 220.0, 250.0)],
13693            720.0,
13694            732.0,
13695        ));
13696        doc.kids.push(make_chunked_paragraph_line(
13697            &[("Recommendation", 430.0, 540.0)],
13698            720.0,
13699            732.0,
13700        ));
13701        doc.kids.push(make_chunked_paragraph_line(
13702            &[("Product semantic search", 660.0, 860.0)],
13703            720.0,
13704            732.0,
13705        ));
13706        doc.kids.push(make_n_column_table(
13707            &[
13708                vec![
13709                    "Pack",
13710                    "A solution that recognizes characters in an image and extracts necessary information",
13711                    "A solution that recommends the best products and contents",
13712                    "A solution that enables semantic search and organizes key information",
13713                ],
13714                vec![
13715                    "Application",
13716                    "Applicable to all fields that require text extraction",
13717                    "Applicable to all fields that use any form of recommendation",
13718                    "Applicable to all fields that deal with unstructured data",
13719                ],
13720                vec![
13721                    "Highlight",
13722                    "Achieved 1st place in the OCR World Competition",
13723                    "Received Kaggle's Gold Medal recommendation",
13724                    "Creation of the first natural language evaluation system in Korean",
13725                ],
13726            ],
13727            &[
13728                (72.0, 120.0),
13729                (120.0, 360.0),
13730                (360.0, 630.0),
13731                (630.0, 910.0),
13732            ],
13733        ));
13734
13735        let md = to_markdown(&doc).unwrap();
13736        assert!(md.contains("| Pack | OCR | Recommendation | Product semantic search |"));
13737        assert!(
13738            md.contains("| Application | Applicable to all fields that require text extraction |")
13739        );
13740        assert!(md.contains("| Highlight | Achieved 1st place in the OCR World Competition |"));
13741        assert!(!md.contains("OCR\n\nRecommendation\n\nProduct semantic search"));
13742    }
13743
13744    #[test]
13745    fn test_geometric_chunk_alignment_splits_header_line_into_columns() {
13746        let line = make_chunked_paragraph_line(
13747            &[
13748                ("Properties", 72.0, 145.0),
13749                ("Instruction", 180.0, 255.0),
13750                ("Alignment", 480.0, 545.0),
13751            ],
13752            720.0,
13753            732.0,
13754        );
13755        let chunk_lines = extract_chunk_lines(&line);
13756        let fragments = split_line_into_slot_fragments(
13757            &chunk_lines[0],
13758            &[
13759                (72.0, 170.0),
13760                (170.0, 280.0),
13761                (280.0, 380.0),
13762                (380.0, 480.0),
13763                (480.0, 600.0),
13764                (600.0, 720.0),
13765                (720.0, 850.0),
13766            ],
13767        );
13768
13769        assert_eq!(fragments.len(), 3);
13770        assert_eq!(fragments[0].slot_idx, 0);
13771        assert_eq!(fragments[0].text, "Properties");
13772        assert_eq!(fragments[1].slot_idx, 1);
13773        assert_eq!(fragments[1].text, "Instruction");
13774        assert_eq!(fragments[2].slot_idx, 4);
13775        assert_eq!(fragments[2].text, "Alignment");
13776    }
13777
13778    #[test]
13779    fn test_merge_tables_across_heading() {
13780        let input = "some text\n\n\
13781                      | Area | Competence |\n\
13782                      | --- | --- |\n\
13783                      | Row1 | Val1 |\n\
13784                      | Row2 | Val2 |\n\
13785                      \n\
13786                      # Heading Between\n\
13787                      \n\
13788                      | Row3 | Val3 |\n\
13789                      | --- | --- |\n\
13790                      \n\
13791                      more text\n";
13792        let result = merge_adjacent_pipe_tables(input);
13793        // Heading should be converted to a pipe row
13794        assert!(
13795            result.contains("| Heading Between |"),
13796            "Heading should be in pipe row: {}",
13797            result
13798        );
13799        // Should NOT have # heading marker
13800        assert!(
13801            !result.contains("# Heading Between"),
13802            "Heading marker should be removed: {}",
13803            result
13804        );
13805        // Row3 should still be present
13806        assert!(
13807            result.contains("| Row3 |") || result.contains("Row3"),
13808            "Row3 should exist: {}",
13809            result
13810        );
13811    }
13812
13813    #[test]
13814    fn test_merge_tables_does_not_cross_distinct_headers() {
13815        let input = "| Model | Score |\n\
13816                     | --- | --- |\n\
13817                     | A | 1 |\n\
13818                     \n\
13819                     Table 6: Performance comparison amongst the merge candidates.\n\
13820                     \n\
13821                     | Model | Method | Score |\n\
13822                     | --- | --- | --- |\n\
13823                     | B | Avg | 2 |\n";
13824        let result = merge_adjacent_pipe_tables(input);
13825
13826        assert!(result.contains("Table 6: Performance comparison amongst the merge candidates."));
13827        assert!(result.contains("| Model | Score |"));
13828        assert!(result.contains("| Model | Method | Score |"));
13829        assert!(
13830            !result.contains("| Table 6: Performance comparison amongst the merge candidates. |")
13831        );
13832    }
13833
13834    #[test]
13835    fn test_normalize_chart_like_markdown_extracts_series_tables() {
13836        let input = "Figure 1.7. Non-citizen population in Malaysia (in thousands) 3,323 3,500 3,288 3,230 3,140 2,907 3,000 2,693 2,500 2,000 1,500 1,000 500 0\n\n\
13837                     2016 2017 2018 2019 2020 2021 Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.\n\n\
13838                     ASEAN Migration Outlook 19\n";
13839
13840        let normalized = normalize_chart_like_markdown(input);
13841        assert!(
13842            normalized.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
13843        );
13844        assert!(normalized.contains("| 2016 | 3,323 |"));
13845        assert!(normalized.contains("| 2021 | 2,693 |"));
13846        assert!(normalized.contains(
13847            "*Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.*"
13848        ));
13849        assert!(!normalized.contains("ASEAN Migration Outlook 19"));
13850    }
13851
13852    #[test]
13853    fn test_normalize_chart_like_markdown_promotes_structural_captions() {
13854        let input = "Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or\n\n\
13855                     The Wonderful Lamp.\n\n\
13856                     Body paragraph.\n";
13857
13858        let normalized = normalize_chart_like_markdown(input);
13859        assert!(normalized.contains(
13860            "## Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or The Wonderful Lamp"
13861        ));
13862        assert!(normalized.contains("Body paragraph."));
13863    }
13864
13865    #[test]
13866    fn test_normalize_chart_like_markdown_reconstructs_header_pair_chart_table() {
13867        let input = "Figure 4.8. Domestic Wood Pellets Production\n\n\
13868                     | 8 | 800 200 | 126 2014 | 120 2015 | 120 2016 | 127 2017 | 131 2018 | 147 2019 |\n\
13869                     | --- | --- | --- | --- | --- | --- | --- | --- |\n\n\
13870                     Source: Forestry Agency, Ministry of Agriculture, Forestry and Fishery (MAFF), 2020.\n";
13871
13872        let normalized = normalize_chart_like_markdown(input);
13873        assert!(normalized.contains("# Figure 4.8. Domestic Wood Pellets Production"));
13874        assert!(normalized.contains("| Year | Domestic Wood Pellets Production |"));
13875        assert!(normalized.contains("| 2014 | 126 |"));
13876        assert!(normalized.contains("| 2019 | 147 |"));
13877        assert!(!normalized.contains("| 8 | 800 200 |"));
13878    }
13879
13880    #[test]
13881    fn test_normalize_chart_like_markdown_drops_numeric_axis_artifact_table() {
13882        let input = "| 31 1 0 2 23 2 2 2 0 5 10 15 20 25 30 35 Event Celebration Information Videograph 2019 2020 |\n\
13883                     | --- |\n\n\
13884                     Distribution of Komnas HAM's YouTube Content (2019-2020)\n";
13885
13886        let normalized = normalize_chart_like_markdown(input);
13887        assert!(!normalized.contains("| --- |"));
13888        assert!(normalized.contains("Distribution of Komnas HAM's YouTube Content (2019-2020)"));
13889    }
13890
13891    #[test]
13892    fn test_normalize_chart_like_markdown_drops_url_fragment_table() {
13893        let input = "## Figure 6 DPN Argentina Content: World Health Day Celebration\n\n\
13894                     | na/status/1379765916259483648 |\n\
13895                     | --- |\n\n\
13896                     98 DPN Argentina, accessed on 5 December 2021.\n";
13897
13898        let normalized = normalize_chart_like_markdown(input);
13899        assert!(!normalized.contains("/status/1379765916259483648 |"));
13900        assert!(normalized.contains("98 DPN Argentina, accessed on 5 December 2021."));
13901    }
13902
13903    #[test]
13904    fn test_normalize_chart_like_markdown_drops_sparse_table_before_caption() {
13905        let input = "What’s unique about the growth of Alligator Gars is their fast growth.\n\n\
13906                     | in | cm |  | Length | of | Gar | Fish | Age |\n\
13907                     | --- | --- | --- | --- | --- | --- | --- | --- |\n\
13908                     | 120) | 300 |  |  |  |  |  |  |\n\
13909                     | 100+ | 250 |  |  |  |  |  |  |\n\
13910                     | 80+ | 200 |  |  |  |  |  |  |\n\
13911                     | 20. | 50 | G |  |  |  |  | Vi |\n\
13912                     | 0 | 0 |  |  |  |  |  |  |\n\
13913                     |  | 0 | 10 | 30 |  | 40 | 50 | 60 |\n\n\
13914                     Figure 8.6: Growth in length of Alligator Gar in Texas.\n";
13915
13916        let normalized = normalize_chart_like_markdown(input);
13917        assert!(!normalized.contains("| in | cm |"));
13918        assert!(normalized.contains("Figure 8.6: Growth in length of Alligator Gar in Texas."));
13919    }
13920
13921    #[test]
13922    fn test_normalize_chart_like_markdown_trims_large_top_table_plate() {
13923        let input = "| A | B | C | D | E | F | G | H |\n\
13924                     | --- | --- | --- | --- | --- | --- | --- | --- |\n\
13925                     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13926                     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13927                     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13928                     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13929                     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13930                     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13931                     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13932                     | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\n\
13933                     Table 2: Evaluation results for SOLAR 10.7B and SOLAR 10.7B-Instruct along with other top-performing models in the paper.\n\n\
13934                     # 4.2 Main Results\n\n\
13935                     The surrounding prose should be dropped.\n";
13936
13937        let normalized = normalize_chart_like_markdown(input);
13938        assert!(normalized.starts_with("| A | B | C | D | E | F | G | H |"));
13939        assert!(!normalized.contains("Table 2:"));
13940        assert!(!normalized.contains("4.2 Main Results"));
13941        assert!(!normalized.contains("surrounding prose"));
13942    }
13943}