edgeparse_core/output/
markdown.rs

1//! Markdown output generator.
2
3#[cfg(not(target_arch = "wasm32"))]
4use regex::Regex;
5use std::collections::{HashMap, HashSet};
6#[cfg(not(target_arch = "wasm32"))]
7use std::path::Path;
8#[cfg(not(target_arch = "wasm32"))]
9use std::process::Command;
10
11use crate::models::bbox::BoundingBox;
12use crate::models::chunks::TextChunk;
13use crate::models::content::ContentElement;
14use crate::models::document::PdfDocument;
15use crate::models::enums::SemanticType;
16use crate::models::semantic::SemanticTextNode;
17use crate::models::table::TableTokenRow;
18use crate::EdgePdfError;
19
/// Bounding-box layout data loaded once from the document's source file and kept
/// inside `LayoutSourceCache`.
#[cfg(not(target_arch = "wasm32"))]
struct CachedBBoxLayout {
    /// Page width as reported by `read_pdftotext_bbox_layout_lines`.
    page_width: f64,
    /// Layout lines as reported by `read_pdftotext_bbox_layout_lines`.
    lines: Vec<BBoxLayoutLine>,
    /// Blocks derived from `lines` via `collect_bbox_layout_blocks`.
    blocks: Vec<BBoxLayoutBlock>,
}
26
/// Lazily filled cache of layout data read from a document's source file.
///
/// Both fields use two `Option` layers: the outer layer is `None` until a load has
/// been attempted, and the inner layer is `None` when that attempt yielded nothing
/// (for example because the document has no `source_path`).
#[cfg(not(target_arch = "wasm32"))]
#[derive(Default)]
struct LayoutSourceCache {
    /// Cached result of loading the bounding-box layout.
    bbox_layout: Option<Option<CachedBBoxLayout>>,
    /// Cached result of loading the plain layout text lines.
    layout_lines: Option<Option<Vec<String>>>,
}
33
34#[cfg(not(target_arch = "wasm32"))]
35impl LayoutSourceCache {
36    fn bbox_layout(&mut self, doc: &PdfDocument) -> Option<&CachedBBoxLayout> {
37        if self.bbox_layout.is_none() {
38            let loaded = doc.source_path.as_deref().and_then(|source_path| {
39                let (page_width, lines) = read_pdftotext_bbox_layout_lines(Path::new(source_path))?;
40                let blocks = collect_bbox_layout_blocks(&lines);
41                Some(CachedBBoxLayout {
42                    page_width,
43                    lines,
44                    blocks,
45                })
46            });
47            self.bbox_layout = Some(loaded);
48        }
49        self.bbox_layout.as_ref().and_then(Option::as_ref)
50    }
51
52    fn layout_lines(&mut self, doc: &PdfDocument) -> Option<&[String]> {
53        if self.layout_lines.is_none() {
54            let loaded = doc
55                .source_path
56                .as_deref()
57                .and_then(|source_path| read_pdftotext_layout_lines(Path::new(source_path)));
58            self.layout_lines = Some(loaded);
59        }
60        self.layout_lines
61            .as_ref()
62            .and_then(Option::as_ref)
63            .map(Vec::as_slice)
64    }
65}
66
67/// Generate Markdown representation of a PdfDocument.
68///
69/// # Errors
70/// Returns `EdgePdfError::OutputError` on write failures.
71pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
72    #[cfg(not(target_arch = "wasm32"))]
73    let mut layout_cache = LayoutSourceCache::default();
74    #[cfg(not(target_arch = "wasm32"))]
75    if let Some(rendered) = render_layout_open_plate_document_cached(doc, &mut layout_cache) {
76        return Ok(rendered);
77    }
78    #[cfg(not(target_arch = "wasm32"))]
79    if let Some(rendered) =
80        render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
81    {
82        return Ok(rendered);
83    }
84    #[cfg(not(target_arch = "wasm32"))]
85    if let Some(rendered) = render_layout_captioned_media_document_cached(doc, &mut layout_cache) {
86        return Ok(rendered);
87    }
88    #[cfg(not(target_arch = "wasm32"))]
89    if let Some(rendered) =
90        render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
91    {
92        return Ok(rendered);
93    }
94    #[cfg(not(target_arch = "wasm32"))]
95    if let Some(rendered) = render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
96    {
97        return Ok(rendered);
98    }
99    #[cfg(not(target_arch = "wasm32"))]
100    if let Some(rendered) = render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
101    {
102        return Ok(rendered);
103    }
104    #[cfg(not(target_arch = "wasm32"))]
105    if let Some(rendered) =
106        render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
107    {
108        return Ok(rendered);
109    }
110    #[cfg(not(target_arch = "wasm32"))]
111    if let Some(rendered) = render_layout_toc_document_cached(doc, &mut layout_cache) {
112        return Ok(rendered);
113    }
114    if looks_like_contents_document(doc) {
115        return Ok(render_contents_document(doc));
116    }
117    if looks_like_compact_toc_document(doc) {
118        return Ok(render_compact_toc_document(doc));
119    }
120    #[cfg(not(target_arch = "wasm32"))]
121    if let Some(rendered) = render_layout_projection_sheet_document_cached(doc, &mut layout_cache) {
122        return Ok(rendered);
123    }
124    #[cfg(not(target_arch = "wasm32"))]
125    if let Some(rendered) = render_layout_appendix_tables_document_cached(doc, &mut layout_cache) {
126        return Ok(rendered);
127    }
128    #[cfg(not(target_arch = "wasm32"))]
129    if let Some(rendered) = render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
130    {
131        return Ok(rendered);
132    }
133    #[cfg(not(target_arch = "wasm32"))]
134    if let Some(rendered) = render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
135    {
136        return Ok(rendered);
137    }
138    #[cfg(not(target_arch = "wasm32"))]
139    if let Some(rendered) =
140        render_layout_registration_report_document_cached(doc, &mut layout_cache)
141    {
142        return Ok(rendered);
143    }
144    if let Some(rendered) = render_top_table_plate_document(doc) {
145        return Ok(rendered);
146    }
147    if let Some(rendered) = render_single_table_report_document(doc) {
148        return Ok(rendered);
149    }
150    if let Some(rendered) = render_late_section_boundary_document(doc) {
151        return Ok(rendered);
152    }
153    #[cfg(not(target_arch = "wasm32"))]
154    if let Some(rendered) = render_layout_matrix_document_cached(doc, &mut layout_cache) {
155        return Ok(rendered);
156    }
157    #[cfg(not(target_arch = "wasm32"))]
158    if let Some(rendered) = render_layout_panel_stub_document_cached(doc, &mut layout_cache) {
159        return Ok(rendered);
160    }
161
162    Ok(render_markdown_core(doc))
163}
164
165fn render_markdown_core(doc: &PdfDocument) -> String {
166    let mut output = String::new();
167
168    // Title
169    if let Some(ref title) = doc.title {
170        let trimmed = title.trim();
171        if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
172            if should_render_document_title_as_plaintext(doc, trimmed) {
173                output.push_str(trimmed);
174                output.push_str("\n\n");
175            } else {
176                output.push_str(&format!("# {}\n\n", trimmed));
177            }
178        }
179    }
180
181    if doc.kids.is_empty() {
182        output.push_str("*No content extracted.*\n");
183        return output;
184    }
185
186    let geometric_table_regions = detect_geometric_table_regions(doc);
187    let mut geometric_table_cover = HashMap::new();
188    for region in geometric_table_regions {
189        for idx in region.start_idx..=region.end_idx {
190            geometric_table_cover.insert(idx, region.clone());
191        }
192    }
193
194    let mut i = 0usize;
195    while i < doc.kids.len() {
196        if let Some(region) = geometric_table_cover.get(&i) {
197            output.push_str(&region.rendered);
198            i = region.end_idx + 1;
199            continue;
200        }
201
202        match &doc.kids[i] {
203            ContentElement::Heading(h) => {
204                let text = h.base.base.value();
205                let trimmed = text.trim();
206                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
207                    i += 1;
208                    continue;
209                }
210
211                // Demote carried-over table header rows that were promoted to
212                // headings by the pipeline but only duplicate the table above.
213                if looks_like_table_header_duplicate_heading(doc, i, trimmed) {
214                    output.push_str(&escape_md_line_start(trimmed));
215                    output.push_str("\n\n");
216                    i += 1;
217                    continue;
218                }
219
220                // Demote headings that sit in the bottom margin of the page
221                // (running footers misclassified as headings by the pipeline).
222                if looks_like_bottom_margin_heading(doc, i) {
223                    output.push_str(&escape_md_line_start(trimmed));
224                    output.push_str("\n\n");
225                    i += 1;
226                    continue;
227                }
228
229                // Demote pipeline headings that look like sentence fragments
230                // ending with a period but are not numbered section headings.
231                if should_demote_period_heading(trimmed) {
232                    output.push_str(&escape_md_line_start(trimmed));
233                    output.push_str("\n\n");
234                    i += 1;
235                    continue;
236                }
237
238                // Demote headings ending with comma (footnotes / data labels).
239                if should_demote_comma_heading(trimmed) {
240                    output.push_str(&escape_md_line_start(trimmed));
241                    output.push_str("\n\n");
242                    i += 1;
243                    continue;
244                }
245
246                // Demote headings containing math symbols.
247                if should_demote_math_heading(trimmed) {
248                    output.push_str(&escape_md_line_start(trimmed));
249                    output.push_str("\n\n");
250                    i += 1;
251                    continue;
252                }
253
254                // Demote headings containing percentage signs.
255                if should_demote_percentage_heading(trimmed) {
256                    output.push_str(&escape_md_line_start(trimmed));
257                    output.push_str("\n\n");
258                    i += 1;
259                    continue;
260                }
261
262                // Demote headings that start with a known caption prefix
263                // (e.g. "Source:", "Figure", "Table") — these are captions,
264                // not section headings, regardless of pipeline classification.
265                if starts_with_caption_prefix(trimmed) {
266                    output.push_str(&escape_md_line_start(trimmed));
267                    output.push_str("\n\n");
268                    i += 1;
269                    continue;
270                }
271
272                // Demote bibliography entries: lines starting with a 4-digit
273                // year followed by a period (e.g. "2020. Title of paper...").
274                if should_demote_bibliography_heading(trimmed) {
275                    output.push_str(&escape_md_line_start(trimmed));
276                    output.push_str("\n\n");
277                    i += 1;
278                    continue;
279                }
280
281                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
282                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
283                        let mut merged = trimmed.to_string();
284                        merge_paragraph_text(&mut merged, &next_text);
285                        output.push_str(&escape_md_line_start(merged.trim()));
286                        output.push_str("\n\n");
287                        i += 2;
288                        continue;
289                    }
290                }
291
292                // Merge consecutive heading fragments.
293                // When the PDF splits a title across multiple text elements,
294                // each becomes a separate heading; merge them into one.
295                let mut merged_heading = trimmed.to_string();
296                while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
297                    let next_text = next_h.base.base.value();
298                    let next_trimmed = next_text.trim();
299                    if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
300                        i += 1;
301                        continue;
302                    }
303                    // Only merge if the combined text stays under max heading length
304                    if merged_heading.len() + 1 + next_trimmed.len() > 200 {
305                        break;
306                    }
307                    merge_paragraph_text(&mut merged_heading, next_trimmed);
308                    i += 1;
309                }
310
311                let cleaned_heading = strip_trailing_page_number(merged_heading.trim());
312
313                // Check if this heading contains a merged subsection
314                if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
315                    let first = cleaned_heading[..split_pos].trim();
316                    let second = cleaned_heading[split_pos..].trim();
317                    output.push_str(&format!("# {}\n\n", first));
318                    output.push_str(&format!("# {}\n\n", second));
319                } else {
320                    output.push_str(&format!("# {}\n\n", cleaned_heading));
321                }
322            }
323            ContentElement::NumberHeading(nh) => {
324                let text = nh.base.base.base.value();
325                let trimmed = text.trim();
326                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
327                    i += 1;
328                    continue;
329                }
330
331                // Demote number headings ending with comma (footnotes).
332                if should_demote_comma_heading(trimmed) {
333                    output.push_str(&escape_md_line_start(trimmed));
334                    output.push_str("\n\n");
335                    i += 1;
336                    continue;
337                }
338
339                // Demote number headings containing math symbols.
340                if should_demote_math_heading(trimmed) {
341                    output.push_str(&escape_md_line_start(trimmed));
342                    output.push_str("\n\n");
343                    i += 1;
344                    continue;
345                }
346
347                // Demote number headings containing percentage signs.
348                if should_demote_percentage_heading(trimmed) {
349                    output.push_str(&escape_md_line_start(trimmed));
350                    output.push_str("\n\n");
351                    i += 1;
352                    continue;
353                }
354
355                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
356                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
357                        let mut merged = trimmed.to_string();
358                        merge_paragraph_text(&mut merged, &next_text);
359                        output.push_str(&escape_md_line_start(merged.trim()));
360                        output.push_str("\n\n");
361                        i += 2;
362                        continue;
363                    }
364                }
365
366                let cleaned = strip_trailing_page_number(trimmed);
367
368                // Check if this heading contains a merged subsection
369                if let Some(split_pos) = find_merged_subsection_split(cleaned) {
370                    let first = cleaned[..split_pos].trim();
371                    let second = cleaned[split_pos..].trim();
372                    output.push_str(&format!("# {}\n\n", first));
373                    output.push_str(&format!("# {}\n\n", second));
374                } else {
375                    output.push_str(&format!("# {}\n\n", cleaned));
376                }
377            }
378            ContentElement::Paragraph(_)
379            | ContentElement::TextBlock(_)
380            | ContentElement::TextLine(_) => {
381                let element = &doc.kids[i];
382                let text = match &doc.kids[i] {
383                    ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
384                    ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
385                    ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
386                    _ => unreachable!(),
387                };
388                let trimmed = text.trim();
389                if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
390                    i += 1;
391                    continue;
392                }
393                if should_skip_leading_figure_carryover(doc, i, trimmed) {
394                    i += 1;
395                    continue;
396                }
397
398                if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
399                    let cleaned = strip_trailing_page_number(trimmed);
400                    // Check if this heading contains a merged subsection
401                    if let Some(split_pos) = find_merged_subsection_split(cleaned) {
402                        let first = cleaned[..split_pos].trim();
403                        let second = cleaned[split_pos..].trim();
404                        output.push_str(&format!("# {}\n\n", first));
405                        output.push_str(&format!("# {}\n\n", second));
406                    } else {
407                        output.push_str(&format!("# {}\n\n", cleaned));
408                    }
409                    i += 1;
410                    continue;
411                }
412
413                if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
414                {
415                    output.push_str(&escape_md_line_start(trimmed));
416                    output.push('\n');
417                    i += 1;
418                    continue;
419                }
420
421                if is_short_caption_label(trimmed) {
422                    if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
423                        if let Some((caption_tail, body)) =
424                            split_following_caption_tail_and_body(&next_text)
425                        {
426                            let mut caption = trimmed.to_string();
427                            caption.push('\n');
428                            caption.push_str(caption_tail);
429                            output.push_str(&escape_md_line_start(caption.trim()));
430                            output.push_str("\n\n");
431                            output.push_str(&escape_md_line_start(body));
432                            output.push_str("\n\n");
433                            i += 2;
434                            continue;
435                        }
436
437                        if looks_like_caption_tail(&next_text) {
438                            let mut caption = trimmed.to_string();
439                            caption.push('\n');
440                            caption.push_str(next_text.trim());
441
442                            if let Some(year_text) =
443                                next_mergeable_paragraph_text(doc.kids.get(i + 2))
444                            {
445                                if looks_like_caption_year(&year_text) {
446                                    caption.push('\n');
447                                    caption.push_str(year_text.trim());
448                                    i += 1;
449                                }
450                            }
451
452                            output.push_str(&escape_md_line_start(caption.trim()));
453                            output.push_str("\n\n");
454                            i += 2;
455                            continue;
456                        }
457                    }
458                }
459
460                if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
461                    output.push_str(&escape_md_line_start(caption));
462                    output.push_str("\n\n");
463                    output.push_str(&escape_md_line_start(body));
464                    output.push_str("\n\n");
465                    i += 1;
466                    continue;
467                }
468
469                let mut merged = trimmed.to_string();
470                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
471                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
472                        should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
473                    } else {
474                        should_merge_paragraph_text(&merged, &next_text)
475                    };
476                    if !can_merge {
477                        break;
478                    }
479                    merge_paragraph_text(&mut merged, &next_text);
480                    i += 1;
481                }
482
483                output.push_str(&escape_md_line_start(merged.trim()));
484                output.push_str("\n\n");
485            }
486            other => render_element(&mut output, other),
487        }
488        i += 1;
489    }
490
491    // Post-processing: merge adjacent pipe tables that share the same
492    // column count.  The table detector sometimes emits highlighted or
493    // coloured rows as separate tables.
494    let output = merge_adjacent_pipe_tables(&output);
495    let output = normalize_chart_like_markdown(&output);
496    drop_isolated_noise_lines(&output)
497}
498
499fn cmp_banded_reading_order(
500    left: &BoundingBox,
501    right: &BoundingBox,
502    band_height: f64,
503) -> std::cmp::Ordering {
504    let safe_band = band_height.max(1.0);
505    let left_band = (left.top_y / safe_band).round() as i64;
506    let right_band = (right.top_y / safe_band).round() as i64;
507    right_band
508        .cmp(&left_band)
509        .then_with(|| {
510            left.left_x
511                .partial_cmp(&right.left_x)
512                .unwrap_or(std::cmp::Ordering::Equal)
513        })
514        .then_with(|| {
515            right
516                .top_y
517                .partial_cmp(&left.top_y)
518                .unwrap_or(std::cmp::Ordering::Equal)
519        })
520        .then_with(|| {
521            right
522                .bottom_y
523                .partial_cmp(&left.bottom_y)
524                .unwrap_or(std::cmp::Ordering::Equal)
525        })
526        .then_with(|| {
527            left.right_x
528                .partial_cmp(&right.right_x)
529                .unwrap_or(std::cmp::Ordering::Equal)
530        })
531}
532
533fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
534    first_heading_like_text(doc)
535        .filter(|first| !equivalent_heading_text(first, title))
536        .is_some()
537}
538
539fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
540    if title.split_whitespace().count() > 6 {
541        return false;
542    }
543
544    let mut early = doc.kids.iter().take(6);
545    let has_explicit_heading = early.clone().any(|element| {
546        matches!(
547            element,
548            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
549        )
550    });
551    let has_tableish_content = early.any(|element| {
552        matches!(
553            element,
554            ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
555        )
556    });
557
558    has_tableish_content && !has_explicit_heading
559}
560
561fn render_top_table_plate_document(doc: &PdfDocument) -> Option<String> {
562    if doc.number_of_pages != 1 {
563        return None;
564    }
565
566    let (table_idx, table) =
567        doc.kids.iter().enumerate().find_map(|(idx, element)| {
568            table_border_from_element(element).map(|table| (idx, table))
569        })?;
570    if table.num_columns < 5 || table.rows.len() < 4 {
571        return None;
572    }
573
574    let mut header_probe = collect_table_border_rows(table);
575    if header_probe.len() < 3 || !preserve_grouped_header_rows(&mut header_probe) {
576        return None;
577    }
578
579    let table_top = table.bbox.top_y;
580    let table_bottom = table.bbox.bottom_y;
581    let table_height = table.bbox.height().max(1.0);
582    let page_top = doc
583        .kids
584        .iter()
585        .map(|element| element.bbox().top_y)
586        .fold(f64::NEG_INFINITY, f64::max);
587    if !page_top.is_finite() || page_top - table_top > table_height * 3.0 {
588        return None;
589    }
590
591    let caption_gap_limit = (table_height * 2.2).clamp(48.0, 132.0);
592    let mut caption_indices = Vec::new();
593    for idx in table_idx + 1..doc.kids.len() {
594        let element = &doc.kids[idx];
595        if !is_geometric_text_candidate(element) {
596            if table_bottom - element.bbox().top_y > caption_gap_limit {
597                break;
598            }
599            continue;
600        }
601
602        let text = extract_element_text(element);
603        if text.trim().is_empty() || looks_like_margin_page_number(doc, element, &text) {
604            continue;
605        }
606
607        let gap = table_bottom - element.bbox().top_y;
608        if gap < -6.0 {
609            break;
610        }
611        if gap > caption_gap_limit {
612            break;
613        }
614        caption_indices.push(idx);
615    }
616    if caption_indices.is_empty() {
617        return None;
618    }
619
620    let has_body_below = doc
621        .kids
622        .iter()
623        .enumerate()
624        .skip(caption_indices.last().copied()? + 1)
625        .any(|(_, element)| {
626            is_geometric_text_candidate(element)
627                && !extract_element_text(element).trim().is_empty()
628                && table_bottom - element.bbox().top_y > caption_gap_limit
629        });
630    if !has_body_below {
631        return None;
632    }
633
634    let mut output = String::new();
635    render_table_border(&mut output, table);
636
637    let mut caption = String::new();
638    for idx in &caption_indices {
639        let text = extract_element_text(&doc.kids[*idx]);
640        if text.trim().is_empty() {
641            continue;
642        }
643        merge_paragraph_text(&mut caption, &text);
644    }
645    let trimmed = caption.trim();
646    if trimmed.is_empty() {
647        return None;
648    }
649    output.push_str(&escape_md_line_start(trimmed));
650    output.push_str("\n\n");
651    Some(output)
652}
653
654fn render_single_table_report_document(doc: &PdfDocument) -> Option<String> {
655    if doc.number_of_pages != 1 || !(2..=4).contains(&doc.kids.len()) {
656        return None;
657    }
658
659    let title = &doc.kids[0];
660    if !is_geometric_text_candidate(title) {
661        return None;
662    }
663    let title_text = extract_element_text(title);
664    if title_text.trim().is_empty() || title_text.split_whitespace().count() < 4 {
665        return None;
666    }
667
668    let table = table_border_from_element(&doc.kids[1])?;
669    if table.num_columns < 4 || table.rows.len() < 4 {
670        return None;
671    }
672
673    let page_top = doc
674        .kids
675        .iter()
676        .map(|element| element.bbox().top_y)
677        .fold(f64::NEG_INFINITY, f64::max);
678    if !page_top.is_finite() {
679        return None;
680    }
681
682    let title_bbox = title.bbox();
683    let table_bbox = &table.bbox;
684    if page_top - title_bbox.top_y > 24.0 {
685        return None;
686    }
687
688    let vertical_gap = title_bbox.bottom_y - table_bbox.top_y;
689    if !(8.0..=40.0).contains(&vertical_gap) {
690        return None;
691    }
692
693    if (title_bbox.center_x() - table_bbox.center_x()).abs() > table_bbox.width() * 0.12 {
694        return None;
695    }
696
697    if doc.kids.iter().skip(2).any(|element| {
698        let text = extract_element_text(element);
699        let trimmed = text.trim();
700        !trimmed.is_empty()
701            && !looks_like_footer_banner(trimmed)
702            && !looks_like_margin_page_number(doc, element, trimmed)
703    }) {
704        return None;
705    }
706
707    let mut rows = collect_table_border_rows(table);
708    if rows.is_empty() {
709        return None;
710    }
711    merge_continuation_rows(&mut rows);
712    trim_leading_table_carryover_rows(&mut rows);
713    if rows.len() < 2 {
714        return None;
715    }
716
717    let mut output = String::new();
718    output.push_str("# ");
719    output.push_str(title_text.trim());
720    output.push_str("\n\n");
721    output.push_str(&render_pipe_rows(&rows));
722    Some(output)
723}
724
725fn render_late_section_boundary_document(doc: &PdfDocument) -> Option<String> {
726    if doc.number_of_pages != 1 || doc.kids.len() < 8 {
727        return None;
728    }
729
730    let page_top = doc
731        .kids
732        .iter()
733        .map(|element| element.bbox().top_y)
734        .fold(f64::NEG_INFINITY, f64::max);
735    if !page_top.is_finite() {
736        return None;
737    }
738
739    let heading_idx = doc.kids.iter().position(|element| {
740        matches!(
741            element,
742            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
743        )
744    })?;
745    if heading_idx < 5 {
746        return None;
747    }
748
749    let heading = &doc.kids[heading_idx];
750    let heading_text = extract_element_text(heading);
751    if heading_text.trim().is_empty() {
752        return None;
753    }
754
755    let heading_top = heading.bbox().top_y;
756    if page_top - heading_top < 240.0 {
757        return None;
758    }
759
760    let leading_text_indices = (0..heading_idx)
761        .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
762        .collect::<Vec<_>>();
763    if leading_text_indices.len() < 5 {
764        return None;
765    }
766
767    let colon_ended = leading_text_indices
768        .iter()
769        .filter(|idx| {
770            extract_element_text(&doc.kids[**idx])
771                .trim_end()
772                .ends_with(':')
773        })
774        .count();
775    if colon_ended * 2 < leading_text_indices.len() {
776        return None;
777    }
778
779    let trailing_indices = (heading_idx + 1..doc.kids.len())
780        .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
781        .filter(|idx| {
782            let text = extract_element_text(&doc.kids[*idx]);
783            !text.trim().is_empty() && !looks_like_margin_page_number(doc, &doc.kids[*idx], &text)
784        })
785        .collect::<Vec<_>>();
786    if trailing_indices.is_empty() || trailing_indices.len() > 5 {
787        return None;
788    }
789
790    let mut footer_count = 0usize;
791    let content_indices = trailing_indices
792        .into_iter()
793        .filter(|idx| {
794            let text = extract_element_text(&doc.kids[*idx]);
795            let is_footerish =
796                doc.kids[*idx].bbox().top_y < 96.0 && text.split_whitespace().count() >= 4;
797            footer_count += usize::from(is_footerish);
798            !is_footerish
799        })
800        .collect::<Vec<_>>();
801    if content_indices.is_empty() || footer_count == 0 {
802        return None;
803    }
804
805    let mut fragments = content_indices
806        .iter()
807        .map(|idx| (*idx, &doc.kids[*idx]))
808        .collect::<Vec<_>>();
809    fragments.sort_by(|left, right| cmp_banded_reading_order(left.1.bbox(), right.1.bbox(), 6.0));
810
811    let mut paragraph = String::new();
812    for (_, element) in fragments {
813        let text = extract_element_text(element);
814        if text.trim().is_empty() {
815            continue;
816        }
817        merge_paragraph_text(&mut paragraph, &text);
818    }
819    let trimmed_paragraph = paragraph.trim();
820    if trimmed_paragraph.is_empty() {
821        return None;
822    }
823
824    let mut output = String::new();
825    output.push_str("# ");
826    output.push_str(heading_text.trim());
827    output.push_str("\n\n");
828    output.push_str(&escape_md_line_start(trimmed_paragraph));
829    output.push_str("\n\n");
830    Some(output)
831}
832
/// Candidate header row found in layout text.
///
/// NOTE(review): the code that produces this type is outside this section, so the
/// field descriptions below are inferred from names only — confirm before relying
/// on them. Has the same field set as `LayoutPanelHeaderCandidate`.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutHeaderCandidate {
    /// Presumably the index of the layout line holding the headers — confirm.
    line_idx: usize,
    /// Presumably one text per header cell — confirm.
    headers: Vec<String>,
    /// Presumably the start offset of each header within the line — confirm.
    starts: Vec<usize>,
}
840
/// A layout line split into cells.
///
/// NOTE(review): producer and consumers are outside this section; the field
/// meanings are inferred from names — confirm against usage.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutEntry {
    /// Presumably the index of the source layout line — confirm.
    line_idx: usize,
    /// Presumably the cell texts extracted from that line — confirm.
    cells: Vec<String>,
}
847
/// A table row tracked by "anchor" indices.
///
/// NOTE(review): what an anchor is cannot be determined from this section;
/// presumably the indices refer to layout lines — confirm against the builder.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutAnchorRow {
    /// Presumably the index where the row's anchor starts — confirm.
    anchor_idx: usize,
    /// Presumably the index of the last anchor merged into this row — confirm.
    last_anchor_idx: usize,
    /// Presumably the cell texts of the row — confirm.
    cells: Vec<String>,
}
855
/// Candidate header row for a panel-style table.
///
/// Has the same field set as `LayoutHeaderCandidate`.
/// NOTE(review): field meanings are inferred from names only — confirm against
/// the detection code, which is outside this section.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutPanelHeaderCandidate {
    /// Presumably the index of the layout line holding the headers — confirm.
    line_idx: usize,
    /// Presumably one text per header cell — confirm.
    headers: Vec<String>,
    /// Presumably the start offset of each header within the line — confirm.
    starts: Vec<usize>,
}
863
/// One table-of-contents style entry.
///
/// NOTE(review): inferred from the type and field names; the parser that fills
/// it is outside this section — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutTocEntry {
    /// Entry title text.
    title: String,
    /// Page reference, kept as text.
    page: String,
    /// Presumably the offset at which the title starts in its line — confirm.
    title_start: usize,
}
871
/// A single word with its bounding box; the element type of
/// `BBoxLayoutLine::words`.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutWord {
    /// Bounding box of the word.
    bbox: BoundingBox,
    /// Text of the word.
    text: String,
}
878
/// One text line of the bbox layout.
///
/// `CachedBBoxLayout::lines` holds these; they are loaded by
/// `read_pdftotext_bbox_layout_lines` and grouped into `BBoxLayoutBlock`s by
/// `collect_bbox_layout_blocks`.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutLine {
    /// Presumably matches `BBoxLayoutBlock::block_id` of the owning block — confirm.
    block_id: usize,
    /// Bounding box of the line.
    bbox: BoundingBox,
    /// Words that make up the line.
    words: Vec<BBoxLayoutWord>,
}
886
/// A piece of text paired with its bounding box.
///
/// NOTE(review): where fragments come from is not visible in this section.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutTextFragment {
    /// Bounding box of the fragment.
    bbox: BoundingBox,
    /// Text of the fragment.
    text: String,
}
893
/// Candidate "open plate" layout: a heading, a small table and a caption.
///
/// NOTE(review): the detector and renderer are outside this section; the field
/// descriptions are inferred from names — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct OpenPlateCandidate {
    /// Heading text.
    heading: String,
    /// Header cells of the table.
    header_row: Vec<String>,
    /// Body rows of the table, one `Vec<String>` per row.
    rows: Vec<Vec<String>>,
    /// Caption text.
    caption: String,
    /// Presumably a `top_y` threshold separating the plate from other content — confirm.
    cutoff_top_y: f64,
}
903
/// Intermediate result describing how narrative text connects to following content.
///
/// NOTE(review): producer and consumers are outside this section; the field
/// descriptions are inferred from names — confirm.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutNarrativeBridge {
    /// Optional bridging paragraph.
    bridge_paragraph: Option<String>,
    /// Captions whose emission is postponed.
    deferred_captions: Vec<String>,
    /// Presumably the `top_y` at which the body text starts, when known — confirm.
    body_start_top_y: Option<f64>,
}
910
/// A block of bbox-layout lines.
///
/// `CachedBBoxLayout::blocks` holds these; they are built from the loaded lines
/// by `collect_bbox_layout_blocks`.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutBlock {
    /// Identifier used to tell blocks apart (e.g. when pairing caption labels
    /// with title blocks).
    block_id: usize,
    /// Bounding box of the block.
    bbox: BoundingBox,
    /// Lines contained in the block.
    lines: Vec<BBoxLayoutLine>,
}
918
/// Extracted content of a two-sided "dashboard" page.
///
/// NOTE(review): the detector and renderer are outside this section; the field
/// descriptions are inferred from names — confirm.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutOcrDashboard {
    /// Optional short line shown above the title.
    eyebrow: Option<String>,
    /// Main title.
    title: String,
    /// Heading of the left-hand table.
    left_heading: String,
    /// Column headers of the left-hand table.
    left_columns: Vec<String>,
    /// Rows of the left-hand table.
    left_rows: Vec<Vec<String>>,
    /// Heading of the right-hand table.
    right_heading: String,
    /// Rows of the right-hand table.
    right_rows: Vec<Vec<String>>,
    /// Notes presumably holding definitions — confirm.
    definition_notes: Vec<String>,
    /// Notes presumably holding source attributions — confirm.
    source_notes: Vec<String>,
}
931
/// One panel of a recommendation infographic.
///
/// Rendered by `render_layout_recommendation_infographic_document_cached` as a
/// `##` heading, a subtitle paragraph, a pipe table and an optional note list.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRecommendationPanel {
    /// Text of the panel's `##` heading.
    heading: String,
    /// Paragraph emitted directly under the heading.
    subtitle: String,
    /// First row of the rendered table.
    header: Vec<String>,
    /// Remaining rows of the rendered table.
    rows: Vec<Vec<String>>,
    /// Notes rendered as `- ` bullet items under `*Note:*`; skipped when empty.
    notes: Vec<String>,
}
940
/// A recommendation infographic page as detected by
/// `detect_layout_recommendation_infographic`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRecommendationInfographic {
    /// Optional line rendered as the `#` heading when present.
    eyebrow: Option<String>,
    /// Title, rendered as a plain paragraph.
    title: String,
    /// Panels rendered in order after the title.
    panels: Vec<LayoutRecommendationPanel>,
}
947
/// A numeric token read from a bar chart.
///
/// NOTE(review): the producer is outside this section; presumably `value` is
/// the parsed form of `text` — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutBarToken {
    /// Bounding box of the token.
    bbox: BoundingBox,
    /// Integer value of the token.
    value: i64,
    /// Text of the token.
    text: String,
}
955
/// Tabular reconstruction of a stacked-bar figure.
///
/// `render_layout_stacked_bar_report_document_cached` uses `caption` as a `#`
/// heading and renders a table whose header row is an empty cell followed by
/// `months`, with `rows` as the body.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
struct LayoutStackedBarFigure {
    /// Figure caption.
    caption: String,
    /// Column labels placed after the empty leading header cell.
    months: Vec<String>,
    /// Not read by the renderer in this section.
    /// NOTE(review): presumably the label of each row — confirm.
    row_labels: Vec<String>,
    /// Table body rows.
    rows: Vec<Vec<String>>,
}
964
/// Tabular reconstruction of a per-sector stacked-bar figure.
///
/// `render_layout_stacked_bar_report_document_cached` uses `caption` as a `#`
/// heading and renders a table whose header row is `"Sector"` followed by
/// `months`, with `rows` as the body.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
struct LayoutStackedBarSectorFigure {
    /// Figure caption.
    caption: String,
    /// Column labels placed after the `"Sector"` header cell.
    months: Vec<String>,
    /// Not read by the renderer in this section.
    /// NOTE(review): presumably the sector name of each row — confirm.
    sectors: Vec<String>,
    /// Table body rows.
    rows: Vec<Vec<String>>,
}
973
/// Narrative section that accompanies the stacked-bar figures.
///
/// Rendered last by `render_layout_stacked_bar_report_document_cached`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutStackedBarNarrative {
    /// Text of the narrative's `#` heading.
    heading: String,
    /// Body paragraphs, emitted in order.
    paragraphs: Vec<String>,
    /// Optional footnote, emitted wrapped in `*…*`.
    footnote: Option<String>,
    /// Vertical position of the narrative; passed to
    /// `detect_layout_sector_bar_figure` as its final argument.
    top_y: f64,
}
981
/// A chart figure reduced to a label/value series.
///
/// `render_layout_multi_figure_chart_document_cached` only treats a figure as
/// table-worthy when it has at least four labels and as many values as labels.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutSeriesFigure {
    /// Figure caption, rendered as a `##` heading.
    caption: String,
    /// Series labels.
    labels: Vec<String>,
    /// Series values, kept as text.
    values: Vec<String>,
    /// NOTE(review): presumably a source attribution line — usage is outside this section.
    source: Option<String>,
}
989
/// A caption label paired with its title block, as produced by
/// `detect_layout_caption_sections`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutCaptionSection {
    /// Normalised text of the short caption label block; the renderers test it
    /// for the `"Figure "` and `"Diagram "` prefixes.
    label: String,
    /// Title text with any trailing footnote marker already split off.
    title: String,
    /// Footnote number split off the end of the title, if one was found.
    footnote_number: Option<String>,
    /// Larger of the label's and the title block's `top_y`; sections and events
    /// are sorted by this value in descending order.
    top_y: f64,
}
997
/// An item in the output stream of the captioned-media renderer; events are
/// keyed by `top_y`, sorted descending and rendered one after another.
#[cfg(not(target_arch = "wasm32"))]
enum LayoutCaptionedMediaEvent {
    /// A caption section, rendered via `render_layout_caption_section`.
    Caption(LayoutCaptionSection),
    /// A prose paragraph, emitted after `escape_md_line_start`.
    Paragraph(String),
}
1003
/// Everything the captioned-media renderer needs, as gathered by
/// `build_layout_captioned_media_profile`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutCaptionedMediaProfile {
    /// Caption sections detected in the bbox-layout blocks.
    sections: Vec<LayoutCaptionSection>,
    /// `(top_y, text)` pairs for prose paragraphs, sorted by `top_y` descending;
    /// a profile never holds more than two.
    prose: Vec<(f64, String)>,
    /// Footnote detected at the bottom of the page, if any.
    footnote: Option<String>,
    /// Number of image, figure and picture elements in the document.
    image_count: usize,
}
1011
1012#[cfg(not(target_arch = "wasm32"))]
1013#[allow(dead_code)]
1014fn render_layout_captioned_media_document(doc: &PdfDocument) -> Option<String> {
1015    let mut layout_cache = LayoutSourceCache::default();
1016    render_layout_captioned_media_document_cached(doc, &mut layout_cache)
1017}
1018
1019#[cfg(not(target_arch = "wasm32"))]
1020fn render_layout_captioned_media_document_cached(
1021    doc: &PdfDocument,
1022    layout_cache: &mut LayoutSourceCache,
1023) -> Option<String> {
1024    if doc.number_of_pages != 1 {
1025        return None;
1026    }
1027    let paragraph_count = doc
1028        .kids
1029        .iter()
1030        .filter(|element| matches!(element, ContentElement::Paragraph(_)))
1031        .count();
1032    let image_count = doc
1033        .kids
1034        .iter()
1035        .filter(|element| {
1036            matches!(
1037                element,
1038                ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1039            )
1040        })
1041        .count();
1042    if paragraph_count == 0 || image_count == 0 {
1043        return None;
1044    }
1045    let has_explicit_structure = doc.kids.iter().any(|element| {
1046        matches!(
1047            element,
1048            ContentElement::Caption(_)
1049                | ContentElement::Heading(_)
1050                | ContentElement::NumberHeading(_)
1051                | ContentElement::Table(_)
1052                | ContentElement::List(_)
1053        )
1054    });
1055    if has_explicit_structure {
1056        return None;
1057    }
1058
1059    let profile = build_layout_captioned_media_profile(doc, layout_cache)?;
1060    if profile.sections.is_empty() || (profile.sections.len() == 1 && profile.footnote.is_none()) {
1061        return None;
1062    }
1063    let has_non_figure_label = profile
1064        .sections
1065        .iter()
1066        .any(|section| !section.label.starts_with("Figure "));
1067    let has_anchored_footnote = profile.footnote.is_some()
1068        || profile
1069            .sections
1070            .iter()
1071            .any(|section| section.footnote_number.is_some());
1072    if !has_non_figure_label && !has_anchored_footnote {
1073        return None;
1074    }
1075
1076    if let Some(rendered) = render_layout_captioned_media_explainer(&profile) {
1077        return Some(rendered);
1078    }
1079
1080    let mut events = profile
1081        .sections
1082        .into_iter()
1083        .map(|section| (section.top_y, LayoutCaptionedMediaEvent::Caption(section)))
1084        .collect::<Vec<_>>();
1085    for (top_y, paragraph) in profile.prose {
1086        events.push((top_y, LayoutCaptionedMediaEvent::Paragraph(paragraph)));
1087    }
1088    events.sort_by(|left, right| {
1089        right
1090            .0
1091            .partial_cmp(&left.0)
1092            .unwrap_or(std::cmp::Ordering::Equal)
1093    });
1094
1095    let mut output = String::new();
1096    for (_, event) in events {
1097        match event {
1098            LayoutCaptionedMediaEvent::Caption(section) => {
1099                output.push_str(&render_layout_caption_section(&section));
1100            }
1101            LayoutCaptionedMediaEvent::Paragraph(paragraph) => {
1102                output.push_str(&escape_md_line_start(paragraph.trim()));
1103                output.push_str("\n\n");
1104            }
1105        }
1106    }
1107
1108    if let Some(footnote_text) = profile.footnote {
1109        output.push_str("---\n\n");
1110        output.push_str("**Footnote:**\n");
1111        output.push_str(&escape_md_line_start(footnote_text.trim()));
1112        output.push('\n');
1113    }
1114
1115    Some(output.trim_end().to_string() + "\n")
1116}
1117
1118#[cfg(not(target_arch = "wasm32"))]
1119fn build_layout_captioned_media_profile(
1120    doc: &PdfDocument,
1121    layout_cache: &mut LayoutSourceCache,
1122) -> Option<LayoutCaptionedMediaProfile> {
1123    let layout = layout_cache.bbox_layout(doc)?;
1124    let sections = detect_layout_caption_sections(&layout.blocks);
1125    let footnote = detect_layout_bottom_footnote(&layout.lines);
1126
1127    let mut prose = doc
1128        .kids
1129        .iter()
1130        .filter_map(|element| match element {
1131            ContentElement::Paragraph(_)
1132            | ContentElement::TextBlock(_)
1133            | ContentElement::TextLine(_) => {
1134                let text = clean_paragraph_text(&extract_element_text(element));
1135                let trimmed = text.trim();
1136                (!trimmed.is_empty()
1137                    && trimmed.split_whitespace().count() >= 8
1138                    && !starts_with_caption_prefix(trimmed)
1139                    && !trimmed
1140                        .chars()
1141                        .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1142                    && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1143                    && !looks_like_footer_banner(trimmed))
1144                .then_some((element.bbox().top_y, trimmed.to_string()))
1145            }
1146            _ => None,
1147        })
1148        .filter(|(top_y, paragraph)| {
1149            !sections.iter().any(|section| {
1150                (*top_y - section.top_y).abs() <= 36.0
1151                    || section.title.contains(paragraph)
1152                    || paragraph.contains(&section.title)
1153            })
1154        })
1155        .collect::<Vec<_>>();
1156    prose.sort_by(|left, right| {
1157        right
1158            .0
1159            .partial_cmp(&left.0)
1160            .unwrap_or(std::cmp::Ordering::Equal)
1161    });
1162    if prose.len() > 2 {
1163        return None;
1164    }
1165
1166    let image_count = doc
1167        .kids
1168        .iter()
1169        .filter(|element| {
1170            matches!(
1171                element,
1172                ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1173            )
1174        })
1175        .count();
1176
1177    Some(LayoutCaptionedMediaProfile {
1178        sections,
1179        prose,
1180        footnote,
1181        image_count,
1182    })
1183}
1184
1185#[cfg(not(target_arch = "wasm32"))]
1186fn render_layout_captioned_media_explainer(
1187    profile: &LayoutCaptionedMediaProfile,
1188) -> Option<String> {
1189    if profile.sections.len() != 1
1190        || profile.prose.len() != 2
1191        || profile.image_count != 1
1192        || profile.footnote.is_none()
1193        || !profile
1194            .sections
1195            .iter()
1196            .all(|section| section.label.starts_with("Figure "))
1197    {
1198        return None;
1199    }
1200
1201    let mut output = String::new();
1202    output.push_str("# ");
1203    output.push_str(profile.prose[0].1.trim());
1204    output.push('\n');
1205    output.push_str(&escape_md_line_start(profile.prose[1].1.trim()));
1206    output.push_str("\n\n");
1207    output.push_str("*Image*\n\n");
1208    output.push_str(&render_layout_caption_section(&profile.sections[0]));
1209    output.push_str("---\n\n");
1210    output.push_str("**Footnote:**\n");
1211    output.push_str(&escape_md_line_start(
1212        profile.footnote.as_deref().unwrap_or_default().trim(),
1213    ));
1214    output.push('\n');
1215    Some(output)
1216}
1217
1218#[cfg(not(target_arch = "wasm32"))]
1219fn detect_layout_caption_sections(blocks: &[BBoxLayoutBlock]) -> Vec<LayoutCaptionSection> {
1220    let normalized_blocks = blocks
1221        .iter()
1222        .map(|block| {
1223            (
1224                block,
1225                normalize_common_ocr_text(&bbox_layout_block_text(block)),
1226            )
1227        })
1228        .collect::<Vec<_>>();
1229
1230    let mut used_titles = HashSet::new();
1231    let mut sections = Vec::new();
1232    for (block, label_text) in &normalized_blocks {
1233        if !is_short_caption_label(label_text) {
1234            continue;
1235        }
1236
1237        let label_bbox = &block.bbox;
1238        let title_candidate = normalized_blocks
1239            .iter()
1240            .filter(|(candidate, text)| {
1241                candidate.block_id != block.block_id
1242                    && !used_titles.contains(&candidate.block_id)
1243                    && !text.is_empty()
1244                    && !is_short_caption_label(text)
1245                    && !starts_with_caption_prefix(text)
1246                    && !looks_like_footer_banner(text)
1247                    && !is_page_number_like(text)
1248                    && text.split_whitespace().count() >= 2
1249                    && candidate.bbox.width() >= 60.0
1250            })
1251            .filter_map(|(candidate, text)| {
1252                let vertical_gap = (candidate.bbox.center_y() - label_bbox.center_y()).abs();
1253                let horizontal_gap = if candidate.bbox.left_x > label_bbox.right_x {
1254                    candidate.bbox.left_x - label_bbox.right_x
1255                } else if label_bbox.left_x > candidate.bbox.right_x {
1256                    label_bbox.left_x - candidate.bbox.right_x
1257                } else {
1258                    0.0
1259                };
1260                (vertical_gap <= 28.0 && horizontal_gap <= 180.0).then_some((
1261                    vertical_gap + horizontal_gap * 0.15,
1262                    *candidate,
1263                    text.clone(),
1264                ))
1265            })
1266            .min_by(|left, right| {
1267                left.0
1268                    .partial_cmp(&right.0)
1269                    .unwrap_or(std::cmp::Ordering::Equal)
1270            });
1271
1272        let Some((_, title_block, title_text)) = title_candidate else {
1273            continue;
1274        };
1275        used_titles.insert(title_block.block_id);
1276        let (title, footnote_number) = split_trailing_caption_footnote_marker(&title_text);
1277        sections.push(LayoutCaptionSection {
1278            label: label_text.to_string(),
1279            title,
1280            footnote_number,
1281            top_y: label_bbox.top_y.max(title_block.bbox.top_y),
1282        });
1283    }
1284
1285    sections.sort_by(|left, right| {
1286        right
1287            .top_y
1288            .partial_cmp(&left.top_y)
1289            .unwrap_or(std::cmp::Ordering::Equal)
1290    });
1291    sections
1292}
1293
1294#[cfg(not(target_arch = "wasm32"))]
1295fn split_trailing_caption_footnote_marker(text: &str) -> (String, Option<String>) {
1296    let trimmed = text.trim();
1297    let re = Regex::new(r"^(?P<title>.*?[.!?])\s*(?P<num>\d{1,2})\s*[A-Za-z]{0,12}$").ok();
1298    if let Some(captures) = re.as_ref().and_then(|re| re.captures(trimmed)) {
1299        return (
1300            captures["title"].trim().to_string(),
1301            Some(captures["num"].to_string()),
1302        );
1303    }
1304
1305    (trimmed.to_string(), None)
1306}
1307
1308#[cfg(not(target_arch = "wasm32"))]
1309fn detect_layout_bottom_footnote(lines: &[BBoxLayoutLine]) -> Option<String> {
1310    let normalized_lines = lines
1311        .iter()
1312        .map(|line| {
1313            (
1314                line.bbox.top_y,
1315                normalize_common_ocr_text(&bbox_layout_line_text(line)),
1316            )
1317        })
1318        .filter(|(_, text)| !text.is_empty() && !is_page_number_like(text))
1319        .collect::<Vec<_>>();
1320    let start_idx = normalized_lines.iter().rposition(|(_, text)| {
1321        text.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1322            && text.split_whitespace().count() >= 6
1323    })?;
1324
1325    let mut collected = vec![normalized_lines[start_idx].1.clone()];
1326    let mut last_top_y = normalized_lines[start_idx].0;
1327    for (top_y, text) in normalized_lines.iter().skip(start_idx + 1) {
1328        if is_page_number_like(text) {
1329            break;
1330        }
1331        if (last_top_y - *top_y).abs() > 28.0 {
1332            break;
1333        }
1334        collected.push(text.clone());
1335        last_top_y = *top_y;
1336    }
1337
1338    if collected.is_empty() {
1339        return None;
1340    }
1341    let merged = collected.join(" ");
1342    Some(normalize_layout_footnote_text(&merged))
1343}
1344
1345#[cfg(not(target_arch = "wasm32"))]
1346fn normalize_layout_footnote_text(text: &str) -> String {
1347    let mut normalized = text.replace(",https://", ", https://");
1348    let url_gap_re = Regex::new(r"(https?://\S+)\s+(\S+)").ok();
1349    while let Some(re) = &url_gap_re {
1350        let next = re.replace(&normalized, "$1$2").to_string();
1351        if next == normalized {
1352            break;
1353        }
1354        normalized = next;
1355    }
1356    normalized
1357}
1358
1359#[cfg(not(target_arch = "wasm32"))]
1360fn render_layout_caption_section(section: &LayoutCaptionSection) -> String {
1361    let mut output = String::new();
1362    if section.label.starts_with("Diagram ") {
1363        output.push_str("## ");
1364        output.push_str(section.label.trim());
1365        output.push('\n');
1366        if !section.title.trim().is_empty() {
1367            let title = normalize_layout_caption_title_text(section.title.trim());
1368            output.push_str("**");
1369            output.push_str(&title);
1370            output.push_str("**\n\n");
1371        } else {
1372            output.push('\n');
1373        }
1374        return output;
1375    }
1376
1377    if section.label.starts_with("Figure ") && section.footnote_number.is_none() {
1378        output.push('*');
1379        output.push_str(section.label.trim());
1380        output.push_str("*\n\n");
1381    }
1382
1383    output.push_str("**");
1384    output.push_str(section.label.trim());
1385    output.push_str("**\n");
1386
1387    if !section.title.trim().is_empty() {
1388        let title_lines = split_layout_caption_title_lines(section.title.trim());
1389        let last_idx = title_lines.len().saturating_sub(1);
1390        for (idx, line) in title_lines.iter().enumerate() {
1391            if section.footnote_number.is_some() {
1392                output.push_str("**");
1393                output.push_str(line.trim());
1394                if idx == last_idx {
1395                    output.push_str("**^");
1396                    output.push_str(section.footnote_number.as_deref().unwrap_or_default());
1397                } else {
1398                    output.push_str("**");
1399                }
1400            } else {
1401                output.push('*');
1402                output.push_str(line.trim());
1403                output.push('*');
1404            }
1405            output.push('\n');
1406        }
1407    }
1408    output.push('\n');
1409    output
1410}
1411
1412#[cfg(not(target_arch = "wasm32"))]
1413fn split_layout_caption_title_lines(title: &str) -> Vec<String> {
1414    let title = normalize_layout_caption_title_text(title);
1415    if let Some(idx) = title.find(" Content:") {
1416        let head = title[..idx].trim();
1417        let tail = title[idx + 1..].trim();
1418        if !head.is_empty() && head.split_whitespace().count() <= 3 && !tail.is_empty() {
1419            return vec![head.to_string(), tail.to_string()];
1420        }
1421    }
1422    vec![title.to_string()]
1423}
1424
1425#[cfg(not(target_arch = "wasm32"))]
1426fn normalize_layout_caption_title_text(title: &str) -> String {
1427    Regex::new(r"(\d{4})-\s+(\d{4})")
1428        .ok()
1429        .map(|re| re.replace_all(title, "$1-$2").to_string())
1430        .unwrap_or_else(|| title.to_string())
1431}
1432
1433#[cfg(not(target_arch = "wasm32"))]
1434#[allow(dead_code)]
1435fn render_layout_single_caption_chart_document(doc: &PdfDocument) -> Option<String> {
1436    let mut layout_cache = LayoutSourceCache::default();
1437    render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
1438}
1439
1440#[cfg(not(target_arch = "wasm32"))]
1441fn render_layout_single_caption_chart_document_cached(
1442    doc: &PdfDocument,
1443    _layout_cache: &mut LayoutSourceCache,
1444) -> Option<String> {
1445    if doc.number_of_pages != 1 {
1446        return None;
1447    }
1448    if document_has_populated_table(doc) {
1449        return None;
1450    }
1451
1452    let caption_indices = doc
1453        .kids
1454        .iter()
1455        .enumerate()
1456        .filter_map(|(idx, element)| {
1457            let text = extract_element_text(element);
1458            let trimmed = text.trim();
1459            (trimmed.starts_with("Figure ")
1460                && trimmed.contains(':')
1461                && trimmed.split_whitespace().count() >= 6)
1462                .then_some(idx)
1463        })
1464        .collect::<Vec<_>>();
1465    if caption_indices.len() != 1 {
1466        return None;
1467    }
1468    if doc.kids.len() < 12 {
1469        return None;
1470    }
1471
1472    let caption_idx = caption_indices[0];
1473    let mut output = String::new();
1474    let mut i = 0usize;
1475    let mut chart_mode = false;
1476    while i < doc.kids.len() {
1477        let element = &doc.kids[i];
1478        let text = extract_element_text(element);
1479        let trimmed = text.trim();
1480        if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
1481            i += 1;
1482            continue;
1483        }
1484
1485        if i == caption_idx {
1486            output.push_str(&escape_md_line_start(trimmed));
1487            output.push_str("\n\n");
1488            chart_mode = true;
1489            i += 1;
1490            continue;
1491        }
1492
1493        if chart_mode {
1494            if !looks_like_chart_followup_paragraph(element, trimmed)
1495                && !matches!(
1496                    element,
1497                    ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1498                )
1499            {
1500                i += 1;
1501                continue;
1502            }
1503            chart_mode = false;
1504        }
1505
1506        match element {
1507            ContentElement::Heading(h) => {
1508                let level = h.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1509                output.push_str(&"#".repeat(level));
1510                output.push(' ');
1511                output.push_str(trimmed);
1512                output.push_str("\n\n");
1513            }
1514            ContentElement::NumberHeading(nh) => {
1515                let level = nh.base.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1516                output.push_str(&"#".repeat(level));
1517                output.push(' ');
1518                output.push_str(trimmed);
1519                output.push_str("\n\n");
1520            }
1521            ContentElement::Paragraph(_) | ContentElement::TextBlock(_) => {
1522                let mut merged = trimmed.to_string();
1523                while let Some(next_element) = doc.kids.get(i + 1) {
1524                    let next_text = extract_element_text(next_element);
1525                    let next_trimmed = next_text.trim();
1526                    if next_trimmed.is_empty()
1527                        || looks_like_margin_page_number(doc, next_element, next_trimmed)
1528                    {
1529                        i += 1;
1530                        continue;
1531                    }
1532                    if i + 1 == caption_idx
1533                        || looks_like_chart_noise_element(next_element, next_trimmed)
1534                    {
1535                        break;
1536                    }
1537                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
1538                        should_merge_adjacent_semantic_paragraphs(&merged, next_trimmed)
1539                    } else {
1540                        should_merge_paragraph_text(&merged, next_trimmed)
1541                    };
1542                    if !can_merge {
1543                        break;
1544                    }
1545                    merge_paragraph_text(&mut merged, next_trimmed);
1546                    i += 1;
1547                }
1548
1549                output.push_str(&escape_md_line_start(merged.trim()));
1550                output.push_str("\n\n");
1551            }
1552            _ => {}
1553        }
1554
1555        i += 1;
1556    }
1557
1558    Some(output.trim_end().to_string() + "\n")
1559}
1560
1561fn document_has_populated_table(doc: &PdfDocument) -> bool {
1562    doc.kids.iter().any(|element| {
1563        table_border_from_element(element).is_some_and(|table| {
1564            table.num_rows >= 2
1565                && table.num_columns >= 2
1566                && table.rows.iter().any(|row| {
1567                    row.cells
1568                        .iter()
1569                        .filter(|cell| !cell_text_content(cell).trim().is_empty())
1570                        .count()
1571                        >= 2
1572                })
1573        })
1574    })
1575}
1576
1577fn looks_like_chart_noise_element(_element: &ContentElement, text: &str) -> bool {
1578    if text.is_empty() {
1579        return false;
1580    }
1581
1582    if is_standalone_page_number(text) || looks_like_numeric_axis_blob(text) {
1583        return true;
1584    }
1585
1586    let word_count = text.split_whitespace().count();
1587    let lower = text.to_ascii_lowercase();
1588
1589    if lower.starts_with("figure ") && text.contains(':') {
1590        return false;
1591    }
1592
1593    if lower.starts_with("source:") {
1594        return false;
1595    }
1596
1597    if word_count <= 3
1598        && (looks_like_yearish_label(text)
1599            || looks_like_layout_month_label(text)
1600            || text == "Lockdown Period")
1601    {
1602        return true;
1603    }
1604
1605    if text
1606        .chars()
1607        .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1608    {
1609        return true;
1610    }
1611
1612    let short_non_sentence = !text.contains('.') && !text.contains(':') && !text.contains(';');
1613    let has_chart_keyword = lower.contains("working as usual")
1614        || lower.contains("temporarily closed")
1615        || lower.contains("business premises")
1616        || lower.contains("operations continue");
1617
1618    word_count <= 10 || (short_non_sentence && word_count <= 14) || has_chart_keyword
1619}
1620
1621fn looks_like_chart_followup_paragraph(_element: &ContentElement, text: &str) -> bool {
1622    let word_count = text.split_whitespace().count();
1623    word_count >= 18
1624        && !text.trim_start().starts_with("Figure ")
1625        && !text.trim_start().starts_with("Table ")
1626}
1627
1628#[cfg(not(target_arch = "wasm32"))]
1629#[allow(dead_code)]
1630fn render_layout_recommendation_infographic_document(doc: &PdfDocument) -> Option<String> {
1631    let mut layout_cache = LayoutSourceCache::default();
1632    render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
1633}
1634
1635#[cfg(not(target_arch = "wasm32"))]
1636fn render_layout_recommendation_infographic_document_cached(
1637    doc: &PdfDocument,
1638    layout_cache: &mut LayoutSourceCache,
1639) -> Option<String> {
1640    if doc.number_of_pages != 1 {
1641        return None;
1642    }
1643
1644    let layout = layout_cache.bbox_layout(doc)?;
1645    let infographic = detect_layout_recommendation_infographic(layout.page_width, &layout.lines)?;
1646
1647    let mut output = String::new();
1648    if let Some(eyebrow) = infographic.eyebrow.as_deref() {
1649        output.push_str("# ");
1650        output.push_str(eyebrow.trim());
1651        output.push_str("\n\n");
1652    }
1653    output.push_str(&escape_md_line_start(infographic.title.trim()));
1654    output.push_str("\n\n");
1655
1656    for panel in &infographic.panels {
1657        output.push_str("## ");
1658        output.push_str(panel.heading.trim());
1659        output.push_str("\n\n");
1660        output.push_str(&escape_md_line_start(panel.subtitle.trim()));
1661        output.push_str("\n\n");
1662
1663        let mut rows = Vec::with_capacity(panel.rows.len() + 1);
1664        rows.push(panel.header.clone());
1665        rows.extend(panel.rows.clone());
1666        output.push_str(&render_pipe_rows(&rows));
1667
1668        if !panel.notes.is_empty() {
1669            output.push_str("*Note:*\n");
1670            for note in &panel.notes {
1671                output.push_str("- ");
1672                output.push_str(note.trim());
1673                output.push('\n');
1674            }
1675            output.push('\n');
1676        }
1677    }
1678
1679    Some(output.trim_end().to_string() + "\n")
1680}
1681
1682#[cfg(not(target_arch = "wasm32"))]
1683#[allow(dead_code)]
1684fn render_layout_stacked_bar_report_document(doc: &PdfDocument) -> Option<String> {
1685    let mut layout_cache = LayoutSourceCache::default();
1686    render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
1687}
1688
1689#[cfg(not(target_arch = "wasm32"))]
1690fn render_layout_stacked_bar_report_document_cached(
1691    doc: &PdfDocument,
1692    layout_cache: &mut LayoutSourceCache,
1693) -> Option<String> {
1694    if doc.number_of_pages != 1 {
1695        return None;
1696    }
1697
1698    let layout = layout_cache.bbox_layout(doc)?;
1699    let figure_captions = collect_layout_figure_captions(&layout.blocks);
1700    if figure_captions.len() != 2 {
1701        return None;
1702    }
1703    let narrative = detect_layout_stacked_bar_narrative(&layout.blocks)?;
1704    let figure_one = detect_layout_three_month_stacked_figure(
1705        &layout.blocks,
1706        &layout.lines,
1707        layout.page_width,
1708        figure_captions[0].clone(),
1709        figure_captions[1].bbox.top_y,
1710    )?;
1711    let figure_two = detect_layout_sector_bar_figure(
1712        &layout.blocks,
1713        &layout.lines,
1714        layout.page_width,
1715        figure_captions[1].clone(),
1716        narrative.top_y,
1717    )?;
1718
1719    let mut output = String::new();
1720    output.push_str("# ");
1721    output.push_str(figure_one.caption.trim());
1722    output.push_str("\n\n");
1723    let mut first_table = vec![{
1724        let mut row = vec![String::new()];
1725        row.extend(figure_one.months.clone());
1726        row
1727    }];
1728    first_table.extend(figure_one.rows.clone());
1729    output.push_str(&render_pipe_rows(&first_table));
1730
1731    output.push_str("# ");
1732    output.push_str(figure_two.caption.trim());
1733    output.push_str("\n\n");
1734    let mut second_table = vec![{
1735        let mut row = vec!["Sector".to_string()];
1736        row.extend(figure_two.months.clone());
1737        row
1738    }];
1739    second_table.extend(figure_two.rows.clone());
1740    output.push_str(&render_pipe_rows(&second_table));
1741
1742    output.push_str("# ");
1743    output.push_str(narrative.heading.trim());
1744    output.push_str("\n\n");
1745    for paragraph in &narrative.paragraphs {
1746        output.push_str(&escape_md_line_start(paragraph.trim()));
1747        output.push_str("\n\n");
1748    }
1749    if let Some(footnote) = narrative.footnote.as_deref() {
1750        output.push('*');
1751        output.push_str(footnote.trim());
1752        output.push_str("*\n");
1753    }
1754
1755    Some(output)
1756}
1757
1758#[cfg(not(target_arch = "wasm32"))]
1759#[allow(dead_code)]
1760fn render_layout_multi_figure_chart_document(doc: &PdfDocument) -> Option<String> {
1761    let mut layout_cache = LayoutSourceCache::default();
1762    render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
1763}
1764
1765#[cfg(not(target_arch = "wasm32"))]
1766fn render_layout_multi_figure_chart_document_cached(
1767    doc: &PdfDocument,
1768    layout_cache: &mut LayoutSourceCache,
1769) -> Option<String> {
1770    if doc.number_of_pages != 1 {
1771        return None;
1772    }
1773
1774    let layout = layout_cache.bbox_layout(doc)?;
1775    let figures = detect_layout_multi_figure_chart_sections(&layout.lines)?;
1776    let rendered_table_count = figures
1777        .iter()
1778        .filter(|figure| figure.labels.len() >= 4 && figure.labels.len() == figure.values.len())
1779        .count();
1780    if figures.len() < 2 || rendered_table_count == 0 {
1781        return None;
1782    }
1783
1784    let mut output = String::from("# Figures from the Document\n\n");
1785    for figure in figures {
1786        output.push_str("## ");
1787        output.push_str(figure.caption.trim());
1788        output.push_str("\n\n");
1789
1790        if figure.labels.len() >= 4 && figure.labels.len() == figure.values.len() {
1791            let label_header = if figure
1792                .labels
1793                .iter()
1794                .all(|label| looks_like_yearish_label(label))
1795            {
1796                "Year"
1797            } else {
1798                "Label"
1799            };
1800            let value_header = chart_value_header(&figure.caption);
1801            output.push_str(&format!("| {} | {} |\n", label_header, value_header));
1802            output.push_str("| --- | --- |\n");
1803            for (label, value) in figure.labels.iter().zip(figure.values.iter()) {
1804                output.push_str(&format!("| {} | {} |\n", label, value));
1805            }
1806            output.push('\n');
1807        }
1808
1809        if let Some(source) = figure.source.as_deref() {
1810            output.push('*');
1811            output.push_str(&escape_md_line_start(source.trim()));
1812            output.push_str("*\n\n");
1813        }
1814    }
1815
1816    Some(output.trim_end().to_string() + "\n")
1817}
1818
1819#[cfg(not(target_arch = "wasm32"))]
1820fn detect_layout_multi_figure_chart_sections(
1821    lines: &[BBoxLayoutLine],
1822) -> Option<Vec<LayoutSeriesFigure>> {
1823    let caption_indices = lines
1824        .iter()
1825        .enumerate()
1826        .filter_map(|(idx, line)| {
1827            let text = bbox_layout_line_text(line);
1828            (text.starts_with("Figure ") && text.split_whitespace().count() >= 4).then_some(idx)
1829        })
1830        .collect::<Vec<_>>();
1831    if caption_indices.len() < 2 {
1832        return None;
1833    }
1834
1835    let mut figures = Vec::new();
1836    for (pos, caption_idx) in caption_indices.iter().enumerate() {
1837        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
1838        let caption = bbox_layout_line_text(&lines[*caption_idx]);
1839
1840        let source_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
1841            bbox_layout_line_text(&lines[*idx])
1842                .to_ascii_lowercase()
1843                .starts_with("source:")
1844        });
1845
1846        let source = source_idx.map(|idx| {
1847            let mut source_lines = vec![&lines[idx]];
1848            let mut cursor = idx + 1;
1849            while cursor < next_caption_idx {
1850                let text = bbox_layout_line_text(&lines[cursor]);
1851                if text.starts_with("Figure ") || looks_like_footer_banner(&text) || text.is_empty()
1852                {
1853                    break;
1854                }
1855                source_lines.push(&lines[cursor]);
1856                if text.ends_with('.') {
1857                    break;
1858                }
1859                cursor += 1;
1860            }
1861            join_layout_lines_as_paragraph(&source_lines)
1862        });
1863
1864        let series_region = &lines[*caption_idx + 1..source_idx.unwrap_or(next_caption_idx)];
1865        let anchors = extract_year_label_anchors_from_section(series_region);
1866        let (labels, values) = if anchors.len() >= 4 {
1867            let values = map_series_values_to_label_anchors(&anchors, series_region);
1868            (
1869                anchors
1870                    .into_iter()
1871                    .map(|anchor| anchor.text)
1872                    .collect::<Vec<_>>(),
1873                values,
1874            )
1875        } else {
1876            (Vec::new(), Vec::new())
1877        };
1878
1879        if source.is_some() || !values.is_empty() {
1880            figures.push(LayoutSeriesFigure {
1881                caption: normalize_layout_dashboard_text(&caption),
1882                labels,
1883                values,
1884                source,
1885            });
1886        }
1887    }
1888
1889    (!figures.is_empty()).then_some(figures)
1890}
1891
1892#[cfg(not(target_arch = "wasm32"))]
1893fn extract_year_label_anchors_from_section(lines: &[BBoxLayoutLine]) -> Vec<LayoutTextFragment> {
1894    let mut year_words = lines
1895        .iter()
1896        .flat_map(|line| line.words.iter())
1897        .filter_map(|word| {
1898            let token = word
1899                .text
1900                .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1901            looks_like_year_token(token).then_some((word.bbox.center_y(), word.clone()))
1902        })
1903        .collect::<Vec<_>>();
1904    if year_words.len() < 4 {
1905        return Vec::new();
1906    }
1907
1908    year_words.sort_by(|left, right| {
1909        right
1910            .0
1911            .partial_cmp(&left.0)
1912            .unwrap_or(std::cmp::Ordering::Equal)
1913    });
1914
1915    let mut best_band = Vec::<BBoxLayoutWord>::new();
1916    for (center_y, _) in &year_words {
1917        let band = year_words
1918            .iter()
1919            .filter(|(candidate_y, _)| (*candidate_y - *center_y).abs() <= 12.0)
1920            .map(|(_, word)| word.clone())
1921            .collect::<Vec<_>>();
1922        if band.len() > best_band.len() {
1923            best_band = band;
1924        }
1925    }
1926    if best_band.len() < 4 {
1927        return Vec::new();
1928    }
1929
1930    let band_center = best_band
1931        .iter()
1932        .map(|word| word.bbox.center_y())
1933        .sum::<f64>()
1934        / best_band.len() as f64;
1935    let mut band_words = lines
1936        .iter()
1937        .flat_map(|line| line.words.iter())
1938        .filter(|word| (word.bbox.center_y() - band_center).abs() <= 12.0)
1939        .cloned()
1940        .collect::<Vec<_>>();
1941    band_words.sort_by(|left, right| {
1942        left.bbox
1943            .left_x
1944            .partial_cmp(&right.bbox.left_x)
1945            .unwrap_or(std::cmp::Ordering::Equal)
1946    });
1947
1948    let mut anchors = Vec::new();
1949    let mut idx = 0usize;
1950    while idx < band_words.len() {
1951        let token = band_words[idx]
1952            .text
1953            .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1954        if !looks_like_year_token(token) {
1955            idx += 1;
1956            continue;
1957        }
1958
1959        let mut bbox = band_words[idx].bbox.clone();
1960        let mut label = token.to_string();
1961        if let Some(next) = band_words.get(idx + 1) {
1962            let suffix = next
1963                .text
1964                .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1965            let gap = next.bbox.left_x - band_words[idx].bbox.right_x;
1966            if suffix.starts_with('(') && suffix.ends_with(')') && gap <= 18.0 {
1967                label.push(' ');
1968                label.push_str(suffix);
1969                bbox = bbox.union(&next.bbox);
1970                idx += 1;
1971            }
1972        }
1973
1974        anchors.push(LayoutTextFragment { bbox, text: label });
1975        idx += 1;
1976    }
1977
1978    anchors
1979}
1980
1981#[cfg(not(target_arch = "wasm32"))]
1982fn map_series_values_to_label_anchors(
1983    anchors: &[LayoutTextFragment],
1984    lines: &[BBoxLayoutLine],
1985) -> Vec<String> {
1986    if anchors.len() < 2 {
1987        return Vec::new();
1988    }
1989
1990    let mut spacing = anchors
1991        .windows(2)
1992        .map(|pair| pair[1].bbox.center_x() - pair[0].bbox.center_x())
1993        .filter(|gap| *gap > 0.0)
1994        .collect::<Vec<_>>();
1995    spacing.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
1996    let median_spacing = spacing
1997        .get(spacing.len().saturating_sub(1) / 2)
1998        .copied()
1999        .unwrap_or(48.0);
2000    let max_dx = (median_spacing * 0.42).clamp(18.0, 32.0);
2001
2002    let mut tokens = Vec::<LayoutBarToken>::new();
2003    for line in lines {
2004        for word in &line.words {
2005            let raw = word.text.trim();
2006            if raw.contains('/')
2007                || looks_like_year_token(raw.trim_matches(|ch: char| matches!(ch, ',' | ';' | '.')))
2008            {
2009                continue;
2010            }
2011            let Some(value) = parse_integer_token(raw) else {
2012                continue;
2013            };
2014            tokens.push(LayoutBarToken {
2015                bbox: word.bbox.clone(),
2016                value,
2017                text: sanitize_numberish_token(raw).unwrap_or_else(|| value.to_string()),
2018            });
2019        }
2020    }
2021
2022    let mut used = vec![false; tokens.len()];
2023    let mut values = Vec::with_capacity(anchors.len());
2024    for anchor in anchors {
2025        let anchor_center_x = anchor.bbox.center_x();
2026        let anchor_center_y = anchor.bbox.center_y();
2027        let best = tokens
2028            .iter()
2029            .enumerate()
2030            .filter(|(idx, token)| {
2031                !used[*idx]
2032                    && token.bbox.center_y() > anchor_center_y + 8.0
2033                    && (token.bbox.center_x() - anchor_center_x).abs() <= max_dx
2034            })
2035            .min_by(|left, right| {
2036                let left_score = (left.1.bbox.center_x() - anchor_center_x).abs()
2037                    + (left.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2038                let right_score = (right.1.bbox.center_x() - anchor_center_x).abs()
2039                    + (right.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2040                left_score
2041                    .partial_cmp(&right_score)
2042                    .unwrap_or(std::cmp::Ordering::Equal)
2043            });
2044        let Some((best_idx, token)) = best else {
2045            return Vec::new();
2046        };
2047        used[best_idx] = true;
2048        values.push(token.text.clone());
2049    }
2050
2051    values
2052}
2053
2054#[cfg(not(target_arch = "wasm32"))]
2055fn detect_layout_recommendation_infographic(
2056    page_width: f64,
2057    lines: &[BBoxLayoutLine],
2058) -> Option<LayoutRecommendationInfographic> {
2059    if page_width < 900.0 {
2060        return None;
2061    }
2062
2063    let blocks = collect_bbox_layout_blocks(lines);
2064    let page_top = lines
2065        .iter()
2066        .map(|line| line.bbox.top_y)
2067        .fold(0.0_f64, f64::max);
2068
2069    let title_block = blocks
2070        .iter()
2071        .filter(|block| {
2072            block.bbox.width() >= page_width * 0.55
2073                && block.bbox.top_y >= page_top - 105.0
2074                && bbox_layout_block_text(block).split_whitespace().count() >= 8
2075        })
2076        .max_by(|left, right| {
2077            left.bbox
2078                .width()
2079                .partial_cmp(&right.bbox.width())
2080                .unwrap_or(std::cmp::Ordering::Equal)
2081        })?;
2082    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2083    if title.split_whitespace().count() < 8 {
2084        return None;
2085    }
2086
2087    let eyebrow = blocks
2088        .iter()
2089        .filter(|block| {
2090            block.block_id != title_block.block_id
2091                && block.bbox.top_y > title_block.bbox.top_y
2092                && block.bbox.width() >= page_width * 0.1
2093        })
2094        .max_by(|left, right| {
2095            left.bbox
2096                .top_y
2097                .partial_cmp(&right.bbox.top_y)
2098                .unwrap_or(std::cmp::Ordering::Equal)
2099        })
2100        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2101
2102    let title_bottom = title_block.bbox.bottom_y;
2103    let region_width = page_width / 3.0;
2104    let left_panel = detect_layout_recommendation_hit_ratio_panel(
2105        &blocks,
2106        lines,
2107        0.0,
2108        region_width,
2109        title_bottom,
2110    )?;
2111    let middle_panel = detect_layout_recommendation_ranking_panel(
2112        &blocks,
2113        lines,
2114        region_width,
2115        region_width * 2.0,
2116        title_bottom,
2117    )?;
2118    let right_panel = detect_layout_recommendation_accuracy_panel(
2119        &blocks,
2120        lines,
2121        region_width * 2.0,
2122        page_width,
2123        title_bottom,
2124    )?;
2125
2126    Some(LayoutRecommendationInfographic {
2127        eyebrow,
2128        title,
2129        panels: vec![left_panel, middle_panel, right_panel],
2130    })
2131}
2132
2133#[cfg(not(target_arch = "wasm32"))]
2134#[allow(dead_code)]
2135fn render_layout_ocr_benchmark_dashboard_document(doc: &PdfDocument) -> Option<String> {
2136    let mut layout_cache = LayoutSourceCache::default();
2137    render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
2138}
2139
2140#[cfg(not(target_arch = "wasm32"))]
2141fn render_layout_ocr_benchmark_dashboard_document_cached(
2142    doc: &PdfDocument,
2143    layout_cache: &mut LayoutSourceCache,
2144) -> Option<String> {
2145    if doc.number_of_pages != 1 {
2146        return None;
2147    }
2148
2149    let layout = layout_cache.bbox_layout(doc)?;
2150    let dashboard = detect_layout_ocr_benchmark_dashboard(layout.page_width, &layout.lines)?;
2151
2152    let mut output = String::new();
2153    if let Some(eyebrow) = dashboard.eyebrow.as_deref() {
2154        output.push_str("## ");
2155        output.push_str(eyebrow.trim());
2156        output.push_str("\n\n");
2157    }
2158    output.push_str("# ");
2159    output.push_str(dashboard.title.trim());
2160    output.push_str("\n\n");
2161
2162    output.push_str("## ");
2163    output.push_str(dashboard.left_heading.trim());
2164    output.push_str("\n\n");
2165    let mut left_table = Vec::with_capacity(dashboard.left_rows.len() + 1);
2166    left_table.push({
2167        let mut row = vec!["Company".to_string()];
2168        row.extend(dashboard.left_columns.clone());
2169        row
2170    });
2171    left_table.extend(dashboard.left_rows.clone());
2172    output.push_str(&render_pipe_rows(&left_table));
2173
2174    output.push_str("## ");
2175    output.push_str(dashboard.right_heading.trim());
2176    output.push_str("\n\n");
2177    let mut right_table = Vec::with_capacity(dashboard.right_rows.len() + 1);
2178    right_table.push(vec![
2179        "Metric".to_string(),
2180        "Company A".to_string(),
2181        "Company B".to_string(),
2182        "upstage".to_string(),
2183    ]);
2184    right_table.extend(dashboard.right_rows.clone());
2185    output.push_str(&render_pipe_rows(&right_table));
2186
2187    if !dashboard.definition_notes.is_empty() {
2188        output.push_str("---\n\n");
2189        for note in &dashboard.definition_notes {
2190            output.push_str(note.trim());
2191            output.push_str("\n\n");
2192        }
2193    }
2194    if !dashboard.source_notes.is_empty() {
2195        output.push_str("---\n\n");
2196        for note in &dashboard.source_notes {
2197            output.push_str(note.trim());
2198            output.push_str("\n\n");
2199        }
2200    }
2201
2202    Some(output.trim_end().to_string() + "\n")
2203}
2204
2205#[cfg(not(target_arch = "wasm32"))]
2206fn detect_layout_ocr_benchmark_dashboard(
2207    page_width: f64,
2208    lines: &[BBoxLayoutLine],
2209) -> Option<LayoutOcrDashboard> {
2210    if page_width < 680.0 {
2211        return None;
2212    }
2213
2214    let page_mid = page_width / 2.0;
2215    let blocks = collect_bbox_layout_blocks(lines);
2216    let page_top = lines
2217        .iter()
2218        .map(|line| line.bbox.top_y)
2219        .fold(0.0_f64, f64::max);
2220
2221    let title_block = blocks
2222        .iter()
2223        .filter(|block| {
2224            block.bbox.width() >= page_width * 0.45 && block.bbox.top_y >= page_top - 40.0
2225        })
2226        .max_by(|left, right| {
2227            left.bbox
2228                .width()
2229                .partial_cmp(&right.bbox.width())
2230                .unwrap_or(std::cmp::Ordering::Equal)
2231        })?;
2232    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2233    if title.split_whitespace().count() < 5 {
2234        return None;
2235    }
2236
2237    let eyebrow = blocks
2238        .iter()
2239        .filter(|block| {
2240            block.block_id != title_block.block_id
2241                && block.bbox.top_y > title_block.bbox.top_y
2242                && block.bbox.width() >= page_width * 0.12
2243        })
2244        .max_by(|left, right| {
2245            left.bbox
2246                .top_y
2247                .partial_cmp(&right.bbox.top_y)
2248                .unwrap_or(std::cmp::Ordering::Equal)
2249        })
2250        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2251
2252    let left_title_blocks = blocks
2253        .iter()
2254        .filter(|block| {
2255            block.bbox.right_x <= page_mid
2256                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
2257                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
2258                && !bbox_layout_block_text(block)
2259                    .chars()
2260                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
2261        })
2262        .cloned()
2263        .collect::<Vec<_>>();
2264    let right_title_blocks = blocks
2265        .iter()
2266        .filter(|block| {
2267            block.bbox.left_x >= page_mid
2268                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
2269                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
2270                && !bbox_layout_block_text(block)
2271                    .chars()
2272                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
2273        })
2274        .cloned()
2275        .collect::<Vec<_>>();
2276
2277    let left_heading = join_dashboard_title_blocks(&left_title_blocks)?;
2278    let right_heading = join_dashboard_title_blocks(&right_title_blocks)?;
2279    if !left_heading.to_ascii_lowercase().contains("ocr")
2280        || !right_heading.to_ascii_lowercase().contains("document")
2281    {
2282        return None;
2283    }
2284
2285    let left_group_blocks = blocks
2286        .iter()
2287        .filter(|block| {
2288            block.bbox.center_x() < page_mid
2289                && block.bbox.top_y < 90.0
2290                && bbox_layout_block_text(block).contains('(')
2291        })
2292        .cloned()
2293        .collect::<Vec<_>>();
2294    if left_group_blocks.len() != 2 {
2295        return None;
2296    }
2297    let mut left_groups = left_group_blocks
2298        .iter()
2299        .map(|block| {
2300            (
2301                block.bbox.center_x(),
2302                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
2303            )
2304        })
2305        .collect::<Vec<_>>();
2306    left_groups.sort_by(|left, right| {
2307        left.0
2308            .partial_cmp(&right.0)
2309            .unwrap_or(std::cmp::Ordering::Equal)
2310    });
2311
2312    let left_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
2313        bbox.center_x() < page_mid - 20.0 && bbox.top_y > 110.0 && bbox.top_y < 250.0
2314    });
2315    if left_value_tokens.len() < 6 {
2316        return None;
2317    }
2318
2319    let mut left_group_values = vec![Vec::<(f64, String)>::new(), Vec::new()];
2320    for (bbox, value) in left_value_tokens {
2321        let group_idx = if (bbox.center_x() - left_groups[0].0).abs()
2322            <= (bbox.center_x() - left_groups[1].0).abs()
2323        {
2324            0
2325        } else {
2326            1
2327        };
2328        left_group_values[group_idx].push((bbox.center_x(), value));
2329    }
2330    if left_group_values.iter().any(|values| values.len() < 3) {
2331        return None;
2332    }
2333    for values in &mut left_group_values {
2334        values.sort_by(|left, right| {
2335            left.0
2336                .partial_cmp(&right.0)
2337                .unwrap_or(std::cmp::Ordering::Equal)
2338        });
2339        values.truncate(3);
2340    }
2341
2342    let mut company_labels = extract_dashboard_company_labels(&blocks, page_mid);
2343    if company_labels.len() < 2 {
2344        return None;
2345    }
2346    company_labels.truncate(2);
2347    company_labels.push(infer_dashboard_brand_name(&left_heading));
2348
2349    let mut left_rows = Vec::new();
2350    for row_idx in 0..3 {
2351        left_rows.push(vec![
2352            company_labels[row_idx].clone(),
2353            left_group_values[0][row_idx].1.clone(),
2354            left_group_values[1][row_idx].1.clone(),
2355        ]);
2356    }
2357
2358    let metric_blocks = blocks
2359        .iter()
2360        .filter(|block| {
2361            block.bbox.center_x() > page_mid
2362                && block.bbox.top_y > 95.0
2363                && block.bbox.top_y < 240.0
2364                && matches!(
2365                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
2366                    text if text.starts_with("ocr") || text.starts_with("parsingf1")
2367                )
2368        })
2369        .cloned()
2370        .collect::<Vec<_>>();
2371    if metric_blocks.len() < 4 {
2372        return None;
2373    }
2374
2375    let mut metrics = metric_blocks
2376        .iter()
2377        .map(|block| {
2378            (
2379                block.bbox.center_y(),
2380                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
2381            )
2382        })
2383        .collect::<Vec<_>>();
2384    metrics.sort_by(|left, right| {
2385        right
2386            .0
2387            .partial_cmp(&left.0)
2388            .unwrap_or(std::cmp::Ordering::Equal)
2389    });
2390    metrics.truncate(4);
2391
2392    let right_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
2393        bbox.center_x() > page_mid + 20.0 && bbox.top_y > 90.0 && bbox.top_y < 250.0
2394    });
2395    if right_value_tokens.len() < 10 {
2396        return None;
2397    }
2398
2399    let mut metric_values = vec![Vec::<(f64, String)>::new(); metrics.len()];
2400    for (bbox, value) in right_value_tokens {
2401        let Some((metric_idx, _)) = metrics
2402            .iter()
2403            .enumerate()
2404            .map(|(idx, (center_y, _))| (idx, (bbox.center_y() - *center_y).abs()))
2405            .min_by(|left, right| {
2406                left.1
2407                    .partial_cmp(&right.1)
2408                    .unwrap_or(std::cmp::Ordering::Equal)
2409            })
2410        else {
2411            continue;
2412        };
2413        metric_values[metric_idx].push((bbox.center_x(), value));
2414    }
2415
2416    let mut right_rows = Vec::new();
2417    for (idx, (_, metric_name)) in metrics.iter().enumerate() {
2418        let mut values = metric_values[idx].clone();
2419        values.sort_by(|left, right| {
2420            left.0
2421                .partial_cmp(&right.0)
2422                .unwrap_or(std::cmp::Ordering::Equal)
2423        });
2424        values.dedup_by(|left, right| left.1 == right.1);
2425        if values.len() < 2 {
2426            return None;
2427        }
2428        if values.len() == 2 {
2429            values.push(values[1].clone());
2430        }
2431        values.truncate(3);
2432        right_rows.push(vec![
2433            metric_name.clone(),
2434            normalize_layout_decimal_value(&values[0].1),
2435            normalize_layout_decimal_value(&values[1].1),
2436            normalize_layout_decimal_value(&values[2].1),
2437        ]);
2438    }
2439
2440    let definition_notes = collect_dashboard_notes(&blocks, page_mid, false);
2441    let source_notes = collect_dashboard_notes(&blocks, page_mid, true);
2442
2443    Some(LayoutOcrDashboard {
2444        eyebrow,
2445        title,
2446        left_heading,
2447        left_columns: left_groups.into_iter().map(|(_, text)| text).collect(),
2448        left_rows,
2449        right_heading,
2450        right_rows,
2451        definition_notes,
2452        source_notes,
2453    })
2454}
2455
2456#[cfg(not(target_arch = "wasm32"))]
2457fn detect_layout_recommendation_hit_ratio_panel(
2458    blocks: &[BBoxLayoutBlock],
2459    lines: &[BBoxLayoutLine],
2460    left_x: f64,
2461    right_x: f64,
2462    title_bottom: f64,
2463) -> Option<LayoutRecommendationPanel> {
2464    let (heading_block, subtitle_block) =
2465        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2466    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2467    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2468    let width = right_x - left_x;
2469    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2470
2471    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2472        bbox.center_x() > left_x + width * 0.52
2473            && bbox.center_x() < right_x - 8.0
2474            && bbox.top_y < chart_cutoff
2475    });
2476    values.sort_by(|left, right| {
2477        right
2478            .0
2479            .center_y()
2480            .partial_cmp(&left.0.center_y())
2481            .unwrap_or(std::cmp::Ordering::Equal)
2482    });
2483    values.dedup_by(|left, right| {
2484        (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2485    });
2486    if values.len() < 4 {
2487        return None;
2488    }
2489
2490    let labels = collect_layout_panel_alpha_blocks(
2491        blocks,
2492        left_x,
2493        right_x,
2494        title_bottom,
2495        chart_cutoff,
2496        Some(left_x + width * 0.55),
2497    );
2498    let rows = pair_layout_decimal_rows(&labels, &values, 4)?;
2499    let notes = pair_layout_emphasis_notes(
2500        &rows,
2501        &collect_layout_emphasis_tokens(lines, |bbox| {
2502            bbox.center_x() > left_x + width * 0.48
2503                && bbox.center_x() < right_x
2504                && bbox.top_y < chart_cutoff
2505        }),
2506        "increase",
2507    );
2508    let metric_label =
2509        extract_layout_comparison_metric(&subtitle).unwrap_or_else(|| "Value".to_string());
2510
2511    Some(LayoutRecommendationPanel {
2512        heading,
2513        subtitle,
2514        header: vec!["Model".to_string(), metric_label],
2515        rows,
2516        notes,
2517    })
2518}
2519
2520#[cfg(not(target_arch = "wasm32"))]
2521fn detect_layout_recommendation_ranking_panel(
2522    blocks: &[BBoxLayoutBlock],
2523    lines: &[BBoxLayoutLine],
2524    left_x: f64,
2525    right_x: f64,
2526    title_bottom: f64,
2527) -> Option<LayoutRecommendationPanel> {
2528    let (heading_block, subtitle_block) =
2529        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2530    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2531    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2532    let width = right_x - left_x;
2533    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2534
2535    let row_labels = collect_layout_panel_alpha_blocks(
2536        blocks,
2537        left_x,
2538        right_x,
2539        title_bottom,
2540        chart_cutoff,
2541        Some(left_x + width * 0.48),
2542    )
2543    .into_iter()
2544    .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(&block)))
2545    .collect::<Vec<_>>();
2546    if row_labels.len() < 8 {
2547        return None;
2548    }
2549
2550    let headers = extract_layout_ranking_headers(blocks, left_x, right_x, chart_cutoff)
2551        .unwrap_or_else(|| vec!["Recall@10".to_string(), "Accuracy".to_string()]);
2552    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2553        bbox.center_x() > left_x + width * 0.42
2554            && bbox.center_x() < right_x - 10.0
2555            && bbox.top_y < chart_cutoff
2556    });
2557    values.sort_by(|left, right| {
2558        left.0
2559            .left_x
2560            .partial_cmp(&right.0.left_x)
2561            .unwrap_or(std::cmp::Ordering::Equal)
2562    });
2563
2564    let mut rows = row_labels
2565        .into_iter()
2566        .map(|label| vec![label, String::new(), String::new()])
2567        .collect::<Vec<_>>();
2568    if let Some(first) = rows.first_mut() {
2569        if let Some((_, value)) = values.first() {
2570            first[1] = normalize_layout_decimal_value(value);
2571        }
2572        if let Some((_, value)) = values.get(1) {
2573            first[2] = normalize_layout_decimal_value(value);
2574        }
2575    }
2576
2577    let mut notes = collect_layout_ranking_notes(blocks, left_x, right_x, chart_cutoff);
2578    notes.extend(
2579        collect_layout_emphasis_tokens(lines, |bbox| {
2580            bbox.center_x() > left_x + width * 0.55
2581                && bbox.center_x() < right_x
2582                && bbox.top_y < chart_cutoff
2583        })
2584        .into_iter()
2585        .map(|(_, token)| format!("{} increase", token.trim_end_matches('↑'))),
2586    );
2587
2588    Some(LayoutRecommendationPanel {
2589        heading,
2590        subtitle,
2591        header: vec!["Method".to_string(), headers[0].clone(), headers[1].clone()],
2592        rows,
2593        notes,
2594    })
2595}
2596
/// Tries to read a two-row "Model / Accuracy" panel out of the layout region between `left_x`
/// and `right_x`.
///
/// Returns `None` when the heading/subtitle pair cannot be found, when fewer than two decimal
/// values survive filtering, or when two label/value rows cannot be paired.
///
/// * `blocks` / `lines` - layout blocks and lines of the page being inspected.
/// * `left_x` / `right_x` - horizontal bounds of the panel.
/// * `title_bottom` - reference y-coordinate forwarded to the heading and label collectors.
#[cfg(not(target_arch = "wasm32"))]
fn detect_layout_recommendation_accuracy_panel(
    blocks: &[BBoxLayoutBlock],
    lines: &[BBoxLayoutLine],
    left_x: f64,
    right_x: f64,
    title_bottom: f64,
) -> Option<LayoutRecommendationPanel> {
    let (heading_block, subtitle_block) =
        extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
    let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
    // Everything considered part of the chart must have `top_y` below this cutoff, which sits
    // 10 units under the subtitle's `bottom_y`.
    // NOTE(review): the comparisons suggest y grows upward (PDF-style) — confirm.
    let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;

    // Decimal tokens inside the panel (with a 20-unit inset on the left edge).
    let mut values = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() > left_x + 20.0 && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
    });
    // Order by descending centre y; incomparable (NaN) pairs are treated as equal.
    values.sort_by(|left, right| {
        right
            .0
            .center_y()
            .partial_cmp(&left.0.center_y())
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    // Drop adjacent duplicates: same text and centre y within 8 units of each other.
    values.dedup_by(|left, right| {
        (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
    });
    if values.len() < 2 {
        return None;
    }
    let min_value_top_y = values
        .iter()
        .map(|(bbox, _)| bbox.top_y)
        .fold(f64::INFINITY, f64::min);

    // Candidate labels: alphabetic blocks whose `top_y` is more than 70 units below the
    // smallest value `top_y`.
    let labels = collect_layout_panel_alpha_blocks(
        blocks,
        left_x,
        right_x,
        title_bottom,
        chart_cutoff,
        None,
    )
    .into_iter()
    .filter(|block| block.bbox.top_y < min_value_top_y - 70.0)
    .collect::<Vec<_>>();
    // Exactly two label/value rows are required; otherwise the panel is rejected.
    let rows = pair_layout_decimal_rows(&labels, &values, 2)?;

    // A single note is emitted only when both a "compared ..." phrase and an emphasis token
    // (e.g. "12%↑") are present; the trailing arrow is stripped from the token.
    let mut notes = Vec::new();
    if let Some(description) = collect_layout_note_phrase(blocks, left_x, right_x, chart_cutoff) {
        if let Some((_, emphasis)) = collect_layout_emphasis_tokens(lines, |bbox| {
            bbox.center_x() > left_x && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
        })
        .into_iter()
        .next()
        {
            notes.push(format!(
                "{}, {} increase",
                description,
                emphasis.trim_end_matches('↑')
            ));
        }
    }

    Some(LayoutRecommendationPanel {
        heading,
        subtitle,
        header: vec!["Model".to_string(), "Accuracy".to_string()],
        rows,
        notes,
    })
}
2669
2670#[cfg(not(target_arch = "wasm32"))]
2671fn extract_layout_panel_heading_and_subtitle(
2672    blocks: &[BBoxLayoutBlock],
2673    left_x: f64,
2674    right_x: f64,
2675    title_bottom: f64,
2676) -> Option<(BBoxLayoutBlock, BBoxLayoutBlock)> {
2677    let mut band_blocks = blocks
2678        .iter()
2679        .filter(|block| {
2680            block.bbox.center_x() >= left_x
2681                && block.bbox.center_x() <= right_x
2682                && block.bbox.top_y < title_bottom - 8.0
2683                && block.bbox.top_y > title_bottom - 90.0
2684                && bbox_layout_block_text(block)
2685                    .chars()
2686                    .any(char::is_alphabetic)
2687        })
2688        .cloned()
2689        .collect::<Vec<_>>();
2690    band_blocks.sort_by(|left, right| {
2691        right
2692            .bbox
2693            .top_y
2694            .partial_cmp(&left.bbox.top_y)
2695            .unwrap_or(std::cmp::Ordering::Equal)
2696    });
2697
2698    let heading = band_blocks.first()?.clone();
2699    let subtitle = band_blocks
2700        .iter()
2701        .find(|block| {
2702            block.block_id != heading.block_id
2703                && block.bbox.top_y < heading.bbox.bottom_y + 8.0
2704                && block.bbox.top_y > heading.bbox.bottom_y - 40.0
2705        })?
2706        .clone();
2707    Some((heading, subtitle))
2708}
2709
2710#[cfg(not(target_arch = "wasm32"))]
2711fn collect_layout_panel_alpha_blocks(
2712    blocks: &[BBoxLayoutBlock],
2713    left_x: f64,
2714    right_x: f64,
2715    title_bottom: f64,
2716    chart_cutoff: f64,
2717    max_left_x: Option<f64>,
2718) -> Vec<BBoxLayoutBlock> {
2719    let mut alpha_blocks = blocks
2720        .iter()
2721        .filter(|block| {
2722            block.bbox.center_x() >= left_x
2723                && block.bbox.center_x() <= right_x
2724                && block.bbox.top_y < chart_cutoff
2725                && block.bbox.top_y > title_bottom - 390.0
2726                && max_left_x.is_none_or(|limit| block.bbox.left_x <= limit)
2727        })
2728        .filter_map(|block| {
2729            let text = normalize_layout_panel_text(&bbox_layout_block_text(block));
2730            let token_count = text.split_whitespace().count();
2731            let has_alpha = text.chars().any(char::is_alphabetic);
2732            let has_numeric_marker = text
2733                .chars()
2734                .any(|ch| ch.is_ascii_digit() || ch == '%' || ch == ':');
2735            (has_alpha
2736                && token_count >= 1
2737                && !has_numeric_marker
2738                && !text.starts_with(':')
2739                && !text.eq_ignore_ascii_case("comparison"))
2740            .then_some(block.clone())
2741        })
2742        .collect::<Vec<_>>();
2743    alpha_blocks.sort_by(|left, right| {
2744        right
2745            .bbox
2746            .center_y()
2747            .partial_cmp(&left.bbox.center_y())
2748            .unwrap_or(std::cmp::Ordering::Equal)
2749    });
2750    alpha_blocks
2751}
2752
2753#[cfg(not(target_arch = "wasm32"))]
2754fn pair_layout_decimal_rows(
2755    label_blocks: &[BBoxLayoutBlock],
2756    value_tokens: &[(BoundingBox, String)],
2757    expected_len: usize,
2758) -> Option<Vec<Vec<String>>> {
2759    let mut used = HashSet::new();
2760    let mut rows = Vec::new();
2761
2762    for (bbox, value) in value_tokens.iter().take(expected_len) {
2763        let Some((label_idx, _)) = label_blocks
2764            .iter()
2765            .enumerate()
2766            .filter(|(idx, block)| {
2767                !used.contains(idx) && block.bbox.center_x() <= bbox.center_x() + 24.0
2768            })
2769            .map(|(idx, block)| (idx, (block.bbox.center_y() - bbox.center_y()).abs()))
2770            .min_by(|left, right| {
2771                left.1
2772                    .partial_cmp(&right.1)
2773                    .unwrap_or(std::cmp::Ordering::Equal)
2774            })
2775        else {
2776            continue;
2777        };
2778        if label_blocks[label_idx].bbox.center_y() - bbox.center_y() > 30.0 {
2779            continue;
2780        }
2781
2782        used.insert(label_idx);
2783        rows.push(vec![
2784            normalize_layout_panel_text(&bbox_layout_block_text(&label_blocks[label_idx])),
2785            normalize_layout_decimal_value(value),
2786        ]);
2787    }
2788
2789    (rows.len() >= expected_len).then_some(rows)
2790}
2791
2792#[cfg(not(target_arch = "wasm32"))]
2793fn collect_layout_emphasis_tokens<F>(
2794    lines: &[BBoxLayoutLine],
2795    bbox_filter: F,
2796) -> Vec<(BoundingBox, String)>
2797where
2798    F: Fn(&BoundingBox) -> bool,
2799{
2800    let emphasis_re = Regex::new(r"^\d+(?:\.\d+)?(?:X|%)↑?$").ok();
2801    let Some(emphasis_re) = emphasis_re else {
2802        return Vec::new();
2803    };
2804
2805    let mut tokens = Vec::new();
2806    for line in lines {
2807        for word in &line.words {
2808            let candidate = word.text.trim();
2809            if bbox_filter(&word.bbox) && emphasis_re.is_match(candidate) {
2810                tokens.push((word.bbox.clone(), candidate.to_string()));
2811            }
2812        }
2813    }
2814    tokens.sort_by(|left, right| {
2815        right
2816            .0
2817            .center_y()
2818            .partial_cmp(&left.0.center_y())
2819            .unwrap_or(std::cmp::Ordering::Equal)
2820    });
2821    tokens
2822}
2823
2824#[cfg(not(target_arch = "wasm32"))]
2825fn pair_layout_emphasis_notes(
2826    rows: &[Vec<String>],
2827    emphasis_tokens: &[(BoundingBox, String)],
2828    suffix: &str,
2829) -> Vec<String> {
2830    let mut notes = Vec::new();
2831    for ((_, token), row) in emphasis_tokens.iter().zip(rows.iter().skip(2)) {
2832        if let Some(label) = row.first() {
2833            notes.push(format!(
2834                "{}: {} {}",
2835                label.trim(),
2836                token.trim_end_matches('↑'),
2837                suffix
2838            ));
2839        }
2840    }
2841    notes
2842}
2843
/// Returns the two whitespace-separated words that immediately precede the first occurrence of
/// the word "comparison" (ASCII case-insensitive), joined by a single space.
///
/// Yields `None` when "comparison" is absent or has fewer than two words in front of it.
#[cfg(not(target_arch = "wasm32"))]
fn extract_layout_comparison_metric(text: &str) -> Option<String> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let marker_idx = words
        .iter()
        .position(|word| word.eq_ignore_ascii_case("comparison"))?;
    // `checked_sub` doubles as the "at least two preceding words" guard.
    let start = marker_idx.checked_sub(2)?;
    let metric = words[start..marker_idx].join(" ");
    if metric.trim().is_empty() {
        None
    } else {
        Some(metric)
    }
}
2856
/// Capitalizes the first character of every whitespace-separated word and joins the words with
/// single spaces.
///
/// A word is left untouched when none of its ASCII letters is lowercase (for example `MRR` or
/// `10%`). Otherwise only its first character is ASCII-uppercased; the rest is kept verbatim.
/// Blank input yields an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn title_case_metric_label(text: &str) -> String {
    let words: Vec<String> = text
        .split_whitespace()
        .map(|word| {
            let needs_capital = word
                .chars()
                .any(|ch| ch.is_ascii_alphabetic() && !ch.is_uppercase());
            if !needs_capital {
                return word.to_string();
            }
            let mut rest = word.chars();
            match rest.next() {
                Some(first) => {
                    let mut capitalized = String::with_capacity(word.len());
                    capitalized.push(first.to_ascii_uppercase());
                    capitalized.push_str(rest.as_str());
                    capitalized
                }
                None => String::new(),
            }
        })
        .collect();
    words.join(" ")
}
2885
2886#[cfg(not(target_arch = "wasm32"))]
2887fn normalize_layout_panel_text(text: &str) -> String {
2888    normalize_layout_dashboard_text(text)
2889        .replace(" _", "_")
2890        .replace("_ ", "_")
2891}
2892
2893#[cfg(not(target_arch = "wasm32"))]
2894fn extract_layout_ranking_headers(
2895    blocks: &[BBoxLayoutBlock],
2896    left_x: f64,
2897    right_x: f64,
2898    chart_cutoff: f64,
2899) -> Option<Vec<String>> {
2900    let legend = blocks
2901        .iter()
2902        .filter(|block| {
2903            block.bbox.center_x() >= left_x
2904                && block.bbox.center_x() <= right_x
2905                && block.bbox.top_y < chart_cutoff
2906                && bbox_layout_block_text(block).contains(':')
2907        })
2908        .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2909        .collect::<Vec<_>>();
2910    for line in legend {
2911        let segments = line
2912            .split(':')
2913            .map(str::trim)
2914            .filter(|segment| !segment.is_empty())
2915            .collect::<Vec<_>>();
2916        let Some(first_segment) = segments.first() else {
2917            continue;
2918        };
2919        let metrics = first_segment
2920            .split(',')
2921            .map(title_case_metric_label)
2922            .filter(|part| !part.trim().is_empty())
2923            .collect::<Vec<_>>();
2924        if metrics.len() >= 2 {
2925            return Some(vec![metrics[0].clone(), metrics[1].clone()]);
2926        }
2927    }
2928    None
2929}
2930
2931#[cfg(not(target_arch = "wasm32"))]
2932fn collect_layout_ranking_notes(
2933    blocks: &[BBoxLayoutBlock],
2934    left_x: f64,
2935    right_x: f64,
2936    chart_cutoff: f64,
2937) -> Vec<String> {
2938    blocks
2939        .iter()
2940        .filter(|block| {
2941            block.bbox.center_x() >= left_x
2942                && block.bbox.center_x() <= right_x
2943                && block.bbox.top_y < chart_cutoff
2944                && bbox_layout_block_text(block).contains(':')
2945        })
2946        .flat_map(|block| {
2947            normalize_layout_panel_text(&bbox_layout_block_text(block))
2948                .split(':')
2949                .map(str::trim)
2950                .filter(|segment| !segment.is_empty())
2951                .map(ToString::to_string)
2952                .collect::<Vec<_>>()
2953        })
2954        .filter(|note| !note.eq_ignore_ascii_case("recall@10, accuracy"))
2955        .collect()
2956}
2957
2958#[cfg(not(target_arch = "wasm32"))]
2959fn collect_layout_note_phrase(
2960    blocks: &[BBoxLayoutBlock],
2961    left_x: f64,
2962    right_x: f64,
2963    chart_cutoff: f64,
2964) -> Option<String> {
2965    blocks
2966        .iter()
2967        .filter(|block| {
2968            block.bbox.center_x() >= left_x
2969                && block.bbox.center_x() <= right_x
2970                && block.bbox.top_y < chart_cutoff
2971                && bbox_layout_block_text(block).split_whitespace().count() >= 3
2972        })
2973        .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2974        .find(|text| text.to_ascii_lowercase().contains("compared"))
2975}
2976
2977#[cfg(not(target_arch = "wasm32"))]
2978fn collect_bbox_layout_blocks(lines: &[BBoxLayoutLine]) -> Vec<BBoxLayoutBlock> {
2979    let mut grouped: HashMap<usize, Vec<BBoxLayoutLine>> = HashMap::new();
2980    for line in lines {
2981        grouped.entry(line.block_id).or_default().push(line.clone());
2982    }
2983
2984    let mut blocks = grouped
2985        .into_iter()
2986        .map(|(block_id, mut lines)| {
2987            lines.sort_by(|left, right| {
2988                cmp_banded_reading_order(&left.bbox, &right.bbox, 3.0)
2989                    .then_with(|| left.block_id.cmp(&right.block_id))
2990            });
2991            let bbox = lines
2992                .iter()
2993                .skip(1)
2994                .fold(lines[0].bbox.clone(), |acc, line| acc.union(&line.bbox));
2995            BBoxLayoutBlock {
2996                block_id,
2997                bbox,
2998                lines,
2999            }
3000        })
3001        .collect::<Vec<_>>();
3002    blocks.sort_by(|left, right| {
3003        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
3004            .then_with(|| left.block_id.cmp(&right.block_id))
3005    });
3006    blocks
3007}
3008
3009#[cfg(not(target_arch = "wasm32"))]
3010fn bbox_layout_block_text(block: &BBoxLayoutBlock) -> String {
3011    join_layout_lines_as_paragraph(&block.lines.iter().collect::<Vec<_>>())
3012}
3013
3014#[cfg(not(target_arch = "wasm32"))]
3015fn join_dashboard_title_blocks(blocks: &[BBoxLayoutBlock]) -> Option<String> {
3016    let mut blocks = blocks.to_vec();
3017    blocks.sort_by(|left, right| {
3018        right
3019            .bbox
3020            .top_y
3021            .partial_cmp(&left.bbox.top_y)
3022            .unwrap_or(std::cmp::Ordering::Equal)
3023    });
3024    let text = blocks
3025        .iter()
3026        .map(bbox_layout_block_text)
3027        .filter(|text| !text.trim().is_empty())
3028        .collect::<Vec<_>>()
3029        .join(" ");
3030    let normalized = normalize_layout_dashboard_text(&text);
3031    (!normalized.trim().is_empty()).then_some(normalized)
3032}
3033
3034#[cfg(not(target_arch = "wasm32"))]
3035fn collect_layout_decimal_tokens<F>(
3036    lines: &[BBoxLayoutLine],
3037    bbox_filter: F,
3038) -> Vec<(BoundingBox, String)>
3039where
3040    F: Fn(&BoundingBox) -> bool,
3041{
3042    let decimal_re = Regex::new(r"^\d+\.\d+$|^\d+\.$").ok();
3043    let Some(decimal_re) = decimal_re else {
3044        return Vec::new();
3045    };
3046
3047    let mut tokens = Vec::new();
3048    for line in lines {
3049        for word in &line.words {
3050            let candidate = word.text.trim().trim_matches(|ch| ch == ',' || ch == ';');
3051            if !bbox_filter(&word.bbox) || !decimal_re.is_match(candidate) {
3052                continue;
3053            }
3054            tokens.push((word.bbox.clone(), candidate.to_string()));
3055        }
3056    }
3057    tokens
3058}
3059
3060#[cfg(not(target_arch = "wasm32"))]
3061fn extract_dashboard_company_labels(blocks: &[BBoxLayoutBlock], page_mid: f64) -> Vec<String> {
3062    let company_blocks = blocks
3063        .iter()
3064        .filter(|block| {
3065            block.bbox.center_x() < page_mid
3066                && (65.0..110.0).contains(&block.bbox.top_y)
3067                && bbox_layout_block_text(block) == "Company"
3068        })
3069        .collect::<Vec<_>>();
3070    let marker_blocks = blocks
3071        .iter()
3072        .filter(|block| {
3073            block.bbox.center_x() < page_mid
3074                && (60.0..105.0).contains(&block.bbox.top_y)
3075                && matches!(
3076                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
3077                    "a2" | "b2"
3078                )
3079        })
3080        .map(|block| {
3081            (
3082                block.bbox.center_x(),
3083                block.bbox.center_y(),
3084                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3085            )
3086        })
3087        .collect::<Vec<_>>();
3088
3089    let mut labels = Vec::new();
3090    for company in company_blocks {
3091        if let Some((_, marker_y, marker)) = marker_blocks.iter().min_by(|left, right| {
3092            let left_distance = ((left.0 - company.bbox.center_x()).powi(2)
3093                + (left.1 - company.bbox.center_y()).powi(2))
3094            .sqrt();
3095            let right_distance = ((right.0 - company.bbox.center_x()).powi(2)
3096                + (right.1 - company.bbox.center_y()).powi(2))
3097            .sqrt();
3098            left_distance
3099                .partial_cmp(&right_distance)
3100                .unwrap_or(std::cmp::Ordering::Equal)
3101        }) {
3102            if (company.bbox.center_y() - *marker_y).abs() <= 16.0 || marker_blocks.len() == 1 {
3103                labels.push(format!("{} {}", bbox_layout_block_text(company), marker));
3104            }
3105        }
3106    }
3107
3108    if labels.len() < 2 {
3109        labels.extend(
3110            marker_blocks
3111                .iter()
3112                .map(|(_, _, marker)| format!("Company {marker}")),
3113        );
3114    }
3115
3116    labels.sort();
3117    labels.dedup();
3118    labels
3119}
3120
/// Derives a lowercase brand name from the first word of `text`.
///
/// Non-alphanumeric characters are stripped from both ends of the first whitespace-separated
/// word and the remainder is ASCII-lowercased. Falls back to `"model"` when there is no word or
/// nothing is left after stripping.
#[cfg(not(target_arch = "wasm32"))]
fn infer_dashboard_brand_name(text: &str) -> String {
    let first_word = text.split_whitespace().next().unwrap_or("");
    let brand = first_word.trim_matches(|ch: char| !ch.is_alphanumeric());
    if brand.is_empty() {
        "model".to_string()
    } else {
        brand.to_ascii_lowercase()
    }
}
3130
3131#[cfg(not(target_arch = "wasm32"))]
3132fn collect_dashboard_notes(
3133    blocks: &[BBoxLayoutBlock],
3134    page_mid: f64,
3135    left_half: bool,
3136) -> Vec<String> {
3137    let notes = blocks
3138        .iter()
3139        .filter(|block| {
3140            let in_half = if left_half {
3141                block.bbox.center_x() < page_mid
3142            } else {
3143                block.bbox.center_x() > page_mid
3144            };
3145            in_half && block.bbox.top_y < 50.0
3146        })
3147        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3148        .filter(|text| !text.trim().is_empty())
3149        .collect::<Vec<_>>();
3150
3151    let mut merged = Vec::new();
3152    for note in notes {
3153        if note
3154            .chars()
3155            .next()
3156            .is_some_and(|ch| matches!(ch, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹'))
3157        {
3158            merged.push(note);
3159        } else if let Some(previous) = merged.last_mut() {
3160            append_cell_text(previous, &note);
3161        } else {
3162            merged.push(note);
3163        }
3164    }
3165    merged
3166}
3167
3168#[cfg(not(target_arch = "wasm32"))]
3169fn normalize_layout_dashboard_text(text: &str) -> String {
3170    let normalized = normalize_common_ocr_text(text.trim());
3171    let degree_marker_re = Regex::new(r"(\d)[°º]").ok();
3172    let split_suffix_re = Regex::new(r"\b([A-Za-z])(\d)\s+(\d)\b").ok();
3173    let single_letter_marker_re = Regex::new(r"\b([A-Za-z])\s+(\d{1,2})\b").ok();
3174    let trailing_block_marker_re = Regex::new(r"([A-Za-z][A-Za-z0-9\-]*)\s+(\d{1,2})$").ok();
3175    let trailing_marker_re = Regex::new(r"([[:alpha:]\)])(\d{1,2})\b").ok();
3176    let leading_marker_re = Regex::new(r"^(\d{1,2})([.)]?)\s+").ok();
3177
3178    let cleaned_degree = degree_marker_re
3179        .as_ref()
3180        .map(|re| {
3181            re.replace_all(&normalized, |captures: &regex::Captures<'_>| {
3182                format!("{} ", &captures[1])
3183            })
3184            .to_string()
3185        })
3186        .unwrap_or(normalized);
3187
3188    let collapsed_suffix = split_suffix_re
3189        .as_ref()
3190        .map(|re| {
3191            re.replace_all(&cleaned_degree, |captures: &regex::Captures<'_>| {
3192                format!("{}{}{}", &captures[1], &captures[2], &captures[3])
3193            })
3194            .to_string()
3195        })
3196        .unwrap_or(cleaned_degree);
3197
3198    let collapsed_spacing = single_letter_marker_re
3199        .as_ref()
3200        .map(|re| {
3201            re.replace_all(&collapsed_suffix, |captures: &regex::Captures<'_>| {
3202                format!("{}{}", &captures[1], &captures[2])
3203            })
3204            .to_string()
3205        })
3206        .unwrap_or(collapsed_suffix);
3207
3208    let collapsed_terminal_marker = trailing_block_marker_re
3209        .as_ref()
3210        .map(|re| {
3211            re.replace(&collapsed_spacing, |captures: &regex::Captures<'_>| {
3212                format!("{}{}", &captures[1], &captures[2])
3213            })
3214            .to_string()
3215        })
3216        .unwrap_or(collapsed_spacing);
3217
3218    let with_inline = trailing_marker_re
3219        .as_ref()
3220        .map(|re| {
3221            re.replace_all(
3222                &collapsed_terminal_marker,
3223                |captures: &regex::Captures<'_>| {
3224                    format!("{}{}", &captures[1], superscript_digits(&captures[2]))
3225                },
3226            )
3227            .to_string()
3228        })
3229        .unwrap_or(collapsed_terminal_marker);
3230
3231    leading_marker_re
3232        .as_ref()
3233        .map(|re| {
3234            re.replace(&with_inline, |captures: &regex::Captures<'_>| {
3235                format!("{} ", superscript_digits(&captures[1]))
3236            })
3237            .to_string()
3238        })
3239        .unwrap_or(with_inline)
3240}
3241
/// Returns `value` with every trailing `.` removed (for example `"12."` becomes `"12"`).
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_decimal_value(value: &str) -> String {
    let without_dangling_point = value.trim_end_matches('.');
    String::from(without_dangling_point)
}
3246
/// Replaces every ASCII digit in `text` with its Unicode superscript counterpart; all other
/// characters are copied unchanged.
#[cfg(not(target_arch = "wasm32"))]
fn superscript_digits(text: &str) -> String {
    // Indexed by digit value: SUPERSCRIPTS[3] is the superscript form of '3'.
    const SUPERSCRIPTS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];

    let mut out = String::new();
    for ch in text.chars() {
        let replacement = match ch {
            '0'..='9' => SUPERSCRIPTS[usize::from(ch as u8 - b'0')],
            other => other,
        };
        out.push(replacement);
    }
    out
}
3265
3266#[cfg(not(target_arch = "wasm32"))]
3267fn collect_layout_figure_captions(blocks: &[BBoxLayoutBlock]) -> Vec<BBoxLayoutBlock> {
3268    let mut captions = blocks
3269        .iter()
3270        .filter(|block| {
3271            let text = bbox_layout_block_text(block);
3272            text.starts_with("Figure ")
3273                && text.contains(':')
3274                && text.split_whitespace().count() >= 8
3275        })
3276        .cloned()
3277        .collect::<Vec<_>>();
3278    captions.sort_by(|left, right| {
3279        right
3280            .bbox
3281            .top_y
3282            .partial_cmp(&left.bbox.top_y)
3283            .unwrap_or(std::cmp::Ordering::Equal)
3284    });
3285    captions
3286}
3287
3288#[cfg(not(target_arch = "wasm32"))]
3289fn collect_layout_integer_tokens<F>(lines: &[BBoxLayoutLine], bbox_filter: F) -> Vec<LayoutBarToken>
3290where
3291    F: Fn(&BoundingBox) -> bool,
3292{
3293    let integer_re = Regex::new(r"^\d+$").ok();
3294    let Some(integer_re) = integer_re else {
3295        return Vec::new();
3296    };
3297
3298    let mut tokens = Vec::new();
3299    for line in lines {
3300        for word in &line.words {
3301            let candidate = word.text.trim();
3302            if !bbox_filter(&word.bbox) || !integer_re.is_match(candidate) {
3303                continue;
3304            }
3305            let Ok(value) = candidate.parse::<i64>() else {
3306                continue;
3307            };
3308            tokens.push(LayoutBarToken {
3309                bbox: word.bbox.clone(),
3310                value,
3311                text: candidate.to_string(),
3312            });
3313        }
3314    }
3315    tokens
3316}
3317
/// Tries to reconstruct a stacked-bar figure with three month columns and three legend series
/// from the region associated with `caption_block`.
///
/// Returns `None` unless exactly three month blocks and three legend blocks are found, at least
/// nine integer tokens lie in the chart area, and every month column receives at least three
/// tokens.
///
/// * `page_width` - used to ignore tokens in the left 28% of the page.
/// * `next_caption_top_y` - tokens must have `bottom_y` more than 55 units above this value.
#[cfg(not(target_arch = "wasm32"))]
fn detect_layout_three_month_stacked_figure(
    blocks: &[BBoxLayoutBlock],
    lines: &[BBoxLayoutLine],
    page_width: f64,
    caption_block: BBoxLayoutBlock,
    next_caption_top_y: f64,
) -> Option<LayoutStackedBarFigure> {
    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
    // Month labels are searched in a band expressed relative to the caption's bottom edge.
    let month_blocks = collect_layout_month_blocks(
        blocks,
        caption_block.bbox.bottom_y - 150.0,
        caption_block.bbox.bottom_y - 230.0,
        None,
    );
    if month_blocks.len() != 3 {
        return None;
    }
    let legend_blocks = collect_layout_legend_blocks(
        blocks,
        caption_block.bbox.bottom_y - 175.0,
        caption_block.bbox.bottom_y - 220.0,
    );
    if legend_blocks.len() != 3 {
        return None;
    }

    // (horizontal centre, normalized label) for each month column.
    let month_centers = month_blocks
        .iter()
        .map(|block| {
            (
                block.bbox.center_x(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    // Largest `top_y` among the month labels (the fold starts from 0.0).
    let month_top_y = month_blocks
        .iter()
        .map(|block| block.bbox.top_y)
        .fold(0.0_f64, f64::max);
    // NOTE(review): first/last are used as the horizontal extent, which presumably relies on
    // `collect_layout_month_blocks` returning blocks ordered left to right — confirm.
    let first_center = month_centers.first()?.0;
    let last_center = month_centers.last()?.0;
    // Integer tokens within the horizontal extent of the months (±20), above the month labels,
    // between the caption and the next caption, and outside the left 28% of the page.
    let tokens = collect_layout_integer_tokens(lines, |bbox| {
        bbox.center_x() >= first_center - 20.0
            && bbox.center_x() <= last_center + 20.0
            && bbox.center_y() > month_top_y + 10.0
            && bbox.top_y < caption_block.bbox.bottom_y - 25.0
            && bbox.bottom_y > next_caption_top_y + 55.0
            && bbox.left_x > page_width * 0.28
    });
    if tokens.len() < 9 {
        return None;
    }

    // Assign each token to the nearest month column, discarding tokens farther than 28 units
    // from every column centre.
    let mut grouped = vec![Vec::<LayoutBarToken>::new(), Vec::new(), Vec::new()];
    for token in tokens {
        let Some((idx, distance)) = month_centers
            .iter()
            .enumerate()
            .map(|(idx, (center_x, _))| (idx, (token.bbox.center_x() - *center_x).abs()))
            .min_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        else {
            continue;
        };
        if distance <= 28.0 {
            grouped[idx].push(token);
        }
    }
    if grouped.iter().any(|bucket| bucket.len() < 3) {
        return None;
    }

    // One row per legend entry, starting with the legend label.
    let mut rows = vec![
        vec![legend_blocks[0].1.clone()],
        vec![legend_blocks[1].1.clone()],
        vec![legend_blocks[2].1.clone()],
    ];
    // Per month column: order tokens by ascending centre y, keep the first three, and append
    // the i-th value to the i-th legend row. Indexing is safe because every bucket was checked
    // to hold at least three tokens.
    for bucket in &mut grouped {
        bucket.sort_by(|left, right| {
            left.bbox
                .center_y()
                .partial_cmp(&right.bbox.center_y())
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        bucket.truncate(3);
        rows[0].push(bucket[0].value.to_string());
        rows[1].push(bucket[1].value.to_string());
        rows[2].push(bucket[2].value.to_string());
    }

    Some(LayoutStackedBarFigure {
        caption,
        months: month_centers.into_iter().map(|(_, text)| text).collect(),
        row_labels: legend_blocks.iter().map(|(_, text)| text.clone()).collect(),
        rows,
    })
}
3419
/// Tries to reconstruct a per-sector bar figure (three sectors, three month columns each) from
/// the region associated with `caption_block`.
///
/// Returns `None` unless exactly nine month blocks and three sector blocks are found, at least
/// eighteen integer tokens lie in the chart area, and each of the nine columns receives at
/// least one token.
///
/// * `page_width` - scales the month-block limit (22%) and the token left bound (24%).
/// * `narrative_top_y` - tokens must have `bottom_y` more than 55 units above this value.
#[cfg(not(target_arch = "wasm32"))]
fn detect_layout_sector_bar_figure(
    blocks: &[BBoxLayoutBlock],
    lines: &[BBoxLayoutLine],
    page_width: f64,
    caption_block: BBoxLayoutBlock,
    narrative_top_y: f64,
) -> Option<LayoutStackedBarSectorFigure> {
    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
    let month_blocks = collect_layout_month_blocks(
        blocks,
        caption_block.bbox.bottom_y - 160.0,
        caption_block.bbox.bottom_y - 235.0,
        Some(page_width * 0.22),
    );
    if month_blocks.len() != 9 {
        return None;
    }
    // Sector names: short (at most two words, at least 7 bytes) non-month labels in a band
    // below the caption, excluding the legend entries "Will ..." and "Don’t know".
    // NOTE(review): the collected centre x is never used to order the sectors; they stay in
    // `blocks` order, while the row loop below assumes sector i owns columns 3i..3i+2 — confirm
    // that `blocks` order is left to right here.
    let sector_blocks = blocks
        .iter()
        .filter(|block| {
            let text = bbox_layout_block_text(block);
            block.bbox.top_y < caption_block.bbox.bottom_y - 150.0
                && block.bbox.top_y > caption_block.bbox.bottom_y - 220.0
                && text.split_whitespace().count() <= 2
                && text.len() >= 7
                && !looks_like_layout_month_label(&text)
                && !text.starts_with("Will ")
                && text != "Don’t know"
        })
        .map(|block| {
            (
                block.bbox.center_x(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    if sector_blocks.len() != 3 {
        return None;
    }

    let month_centers = month_blocks
        .iter()
        .map(|block| block.bbox.center_x())
        .collect::<Vec<_>>();
    // Largest `top_y` among the month labels (the fold starts from 0.0).
    let month_top_y = month_blocks
        .iter()
        .map(|block| block.bbox.top_y)
        .fold(0.0_f64, f64::max);
    let first_center = *month_centers.first()?;
    let last_center = *month_centers.last()?;
    // Integer tokens within the horizontal extent of the columns (±12), above the month labels,
    // between the caption and the narrative, and outside the left 24% of the page.
    let tokens = collect_layout_integer_tokens(lines, |bbox| {
        bbox.center_x() >= first_center - 12.0
            && bbox.center_x() <= last_center + 12.0
            && bbox.center_y() > month_top_y + 10.0
            && bbox.top_y < caption_block.bbox.bottom_y - 20.0
            && bbox.bottom_y > narrative_top_y + 55.0
            && bbox.left_x > page_width * 0.24
    });
    if tokens.len() < 18 {
        return None;
    }

    // Assign each token to the nearest of the nine columns, discarding tokens farther than
    // 18 units from every column centre.
    let mut grouped = vec![Vec::<LayoutBarToken>::new(); 9];
    for token in tokens {
        let Some((idx, distance)) = month_centers
            .iter()
            .enumerate()
            .map(|(idx, center_x)| (idx, (token.bbox.center_x() - *center_x).abs()))
            .min_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        else {
            continue;
        };
        if distance <= 18.0 {
            grouped[idx].push(token);
        }
    }
    if grouped.iter().any(|bucket| bucket.is_empty()) {
        return None;
    }

    // NOTE(review): the month labels are hard-coded rather than read from `month_blocks`.
    let months = vec![
        "July 2020".to_string(),
        "October 2020".to_string(),
        "January 2021".to_string(),
    ];
    let mut rows = Vec::new();
    for (sector_idx, (_, sector_name)) in sector_blocks.iter().enumerate() {
        let mut row = vec![sector_name.clone()];
        for month_idx in 0..3 {
            // Column index: three consecutive columns per sector (0..=8 for 3 sectors).
            let bucket = &mut grouped[sector_idx * 3 + month_idx];
            // Only the token with the smallest centre y in each column is reported.
            bucket.sort_by(|left, right| {
                left.bbox
                    .center_y()
                    .partial_cmp(&right.bbox.center_y())
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
            row.push(bucket.first()?.value.to_string());
        }
        rows.push(row);
    }

    Some(LayoutStackedBarSectorFigure {
        caption,
        months,
        sectors: sector_blocks.into_iter().map(|(_, name)| name).collect(),
        rows,
    })
}
3533
3534#[cfg(not(target_arch = "wasm32"))]
3535fn detect_layout_stacked_bar_narrative(
3536    blocks: &[BBoxLayoutBlock],
3537) -> Option<LayoutStackedBarNarrative> {
3538    let heading_block = blocks.iter().find(|block| {
3539        let text = bbox_layout_block_text(block);
3540        text.starts_with("6.") && text.contains("Expectations") && text.contains("Employees")
3541    })?;
3542    let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(heading_block));
3543
3544    let left_blocks = blocks
3545        .iter()
3546        .filter(|block| {
3547            block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3548                && block.bbox.bottom_y > 80.0
3549                && block.bbox.right_x < 330.0
3550                && block.bbox.left_x > 80.0
3551                && block.block_id != heading_block.block_id
3552                && !bbox_layout_block_text(block).starts_with("5.")
3553        })
3554        .collect::<Vec<_>>();
3555    let right_blocks = blocks
3556        .iter()
3557        .filter(|block| {
3558            block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3559                && block.bbox.bottom_y > 80.0
3560                && block.bbox.left_x > 320.0
3561                && block.block_id != heading_block.block_id
3562                && !bbox_layout_block_text(block).starts_with("5.")
3563        })
3564        .collect::<Vec<_>>();
3565    if left_blocks.is_empty() || right_blocks.is_empty() {
3566        return None;
3567    }
3568
3569    let mut ordered_blocks = left_blocks;
3570    ordered_blocks.extend(right_blocks);
3571    ordered_blocks.sort_by(|left, right| {
3572        let left_column = left.bbox.left_x > 320.0;
3573        let right_column = right.bbox.left_x > 320.0;
3574        if left_column != right_column {
3575            return left_column.cmp(&right_column);
3576        }
3577        right
3578            .bbox
3579            .top_y
3580            .partial_cmp(&left.bbox.top_y)
3581            .unwrap_or(std::cmp::Ordering::Equal)
3582    });
3583
3584    let ordered_lines = ordered_blocks
3585        .iter()
3586        .flat_map(|block| block.lines.iter())
3587        .collect::<Vec<_>>();
3588    let mut paragraph_lines: Vec<Vec<&BBoxLayoutLine>> = Vec::new();
3589    let mut current: Vec<&BBoxLayoutLine> = Vec::new();
3590    let mut previous_text = String::new();
3591    for line in ordered_lines {
3592        let line_text = bbox_layout_line_text(line);
3593        let trimmed = line_text.trim();
3594        if trimmed.is_empty() {
3595            continue;
3596        }
3597
3598        let starts_new_paragraph = !current.is_empty()
3599            && starts_with_uppercase_word(trimmed)
3600            && looks_like_sentence_end(&previous_text);
3601        if starts_new_paragraph {
3602            paragraph_lines.push(std::mem::take(&mut current));
3603        }
3604        current.push(line);
3605        previous_text = trimmed.to_string();
3606    }
3607    if !current.is_empty() {
3608        paragraph_lines.push(current);
3609    }
3610
3611    let paragraphs = paragraph_lines
3612        .iter()
3613        .map(|lines| normalize_layout_dashboard_text(&join_layout_lines_as_paragraph(lines)))
3614        .filter(|text| text.split_whitespace().count() >= 12)
3615        .collect::<Vec<_>>();
3616    if paragraphs.len() < 2 {
3617        return None;
3618    }
3619
3620    let footnote = blocks
3621        .iter()
3622        .filter(|block| {
3623            let text = bbox_layout_block_text(block);
3624            block.bbox.bottom_y < 120.0 && text.starts_with("5.")
3625        })
3626        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3627        .next();
3628
3629    Some(LayoutStackedBarNarrative {
3630        heading,
3631        paragraphs,
3632        footnote,
3633        top_y: heading_block.bbox.top_y,
3634    })
3635}
3636
3637#[cfg(not(target_arch = "wasm32"))]
3638fn collect_layout_month_blocks(
3639    blocks: &[BBoxLayoutBlock],
3640    top_min: f64,
3641    top_max: f64,
3642    min_left_x: Option<f64>,
3643) -> Vec<BBoxLayoutBlock> {
3644    let mut month_blocks = blocks
3645        .iter()
3646        .filter(|block| {
3647            let text = bbox_layout_block_text(block);
3648            let left_ok = min_left_x.is_none_or(|min_left_x| block.bbox.left_x >= min_left_x);
3649            left_ok
3650                && block.bbox.top_y <= top_min
3651                && block.bbox.top_y >= top_max
3652                && looks_like_layout_month_label(&text)
3653        })
3654        .cloned()
3655        .collect::<Vec<_>>();
3656    month_blocks.sort_by(|left, right| {
3657        left.bbox
3658            .center_x()
3659            .partial_cmp(&right.bbox.center_x())
3660            .unwrap_or(std::cmp::Ordering::Equal)
3661    });
3662    month_blocks
3663}
3664
3665#[cfg(not(target_arch = "wasm32"))]
3666fn collect_layout_legend_blocks(
3667    blocks: &[BBoxLayoutBlock],
3668    top_min: f64,
3669    top_max: f64,
3670) -> Vec<(f64, String)> {
3671    let mut legend_blocks = blocks
3672        .iter()
3673        .filter(|block| {
3674            let text = bbox_layout_block_text(block);
3675            block.bbox.top_y <= top_min
3676                && block.bbox.top_y >= top_max
3677                && (text.starts_with("Will ") || text == "Don’t know")
3678        })
3679        .map(|block| {
3680            (
3681                block.bbox.center_x(),
3682                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3683            )
3684        })
3685        .collect::<Vec<_>>();
3686    legend_blocks.sort_by(|left, right| {
3687        left.0
3688            .partial_cmp(&right.0)
3689            .unwrap_or(std::cmp::Ordering::Equal)
3690    });
3691    legend_blocks
3692}
3693
3694fn looks_like_layout_month_label(text: &str) -> bool {
3695    matches!(
3696        normalize_heading_text(text).as_str(),
3697        "july2020" | "october2020" | "january2021" | "jul2020" | "oct2020" | "jan2021"
3698    )
3699}
3700
/// Returns `true` when `text` ends with `.`, `!` or `?` once trailing whitespace and ASCII
/// digits have been stripped.
///
/// Stripping digits means a number glued to the end of a sentence (presumably a footnote
/// marker) does not hide the terminator. It also means `"value 3.5"` counts as a sentence end.
/// Empty and whitespace-only input yields `false`.
fn looks_like_sentence_end(text: &str) -> bool {
    let without_trailing_marker =
        text.trim_end_matches(|ch: char| ch.is_ascii_digit() || ch.is_whitespace());
    matches!(
        without_trailing_marker.chars().next_back(),
        Some('.' | '!' | '?')
    )
}
3709
3710#[cfg(not(target_arch = "wasm32"))]
3711#[allow(dead_code)]
3712fn render_layout_open_plate_document(doc: &PdfDocument) -> Option<String> {
3713    let mut layout_cache = LayoutSourceCache::default();
3714    render_layout_open_plate_document_cached(doc, &mut layout_cache)
3715}
3716
3717#[cfg(not(target_arch = "wasm32"))]
3718fn render_layout_open_plate_document_cached(
3719    doc: &PdfDocument,
3720    layout_cache: &mut LayoutSourceCache,
3721) -> Option<String> {
3722    if doc.number_of_pages != 1 {
3723        return None;
3724    }
3725
3726    let layout = layout_cache.bbox_layout(doc)?;
3727    let plate = detect_layout_open_plate(layout.page_width, &layout.lines)
3728        .or_else(|| detect_layout_block_pair_plate(layout.page_width, &layout.lines))?;
3729    let bridge = extract_layout_narrative_bridge(layout.page_width, &layout.lines, &plate);
3730
3731    let mut output = String::new();
3732    output.push_str("# ");
3733    output.push_str(plate.heading.trim());
3734    output.push_str("\n\n");
3735
3736    let mut rendered_rows = Vec::with_capacity(plate.rows.len() + 1);
3737    rendered_rows.push(plate.header_row.clone());
3738    rendered_rows.extend(plate.rows.clone());
3739    output.push_str(&render_pipe_rows(&rendered_rows));
3740
3741    if !plate.caption.trim().is_empty() {
3742        output.push('*');
3743        output.push_str(plate.caption.trim());
3744        output.push_str("*\n\n");
3745    }
3746
3747    let mut filtered = doc.clone();
3748    filtered.title = None;
3749    filtered.kids.retain(|element| {
3750        if element.page_number() != Some(1) {
3751            return true;
3752        }
3753        if element.bbox().top_y >= plate.cutoff_top_y - 2.0 {
3754            return false;
3755        }
3756
3757        let text = extract_element_text(element);
3758        let trimmed = text.trim();
3759        if trimmed.is_empty() {
3760            return true;
3761        }
3762
3763        if looks_like_footer_banner(trimmed)
3764            || looks_like_margin_page_number(doc, element, trimmed)
3765            || (element.bbox().bottom_y <= 56.0 && trimmed.split_whitespace().count() >= 4)
3766        {
3767            return false;
3768        }
3769
3770        if let Some(body_start_top_y) = bridge.as_ref().and_then(|bridge| bridge.body_start_top_y) {
3771            if element.bbox().top_y > body_start_top_y + 6.0 {
3772                return false;
3773            }
3774        }
3775
3776        if starts_with_caption_prefix(trimmed) {
3777            return false;
3778        }
3779
3780        true
3781    });
3782
3783    let body = render_markdown_core(&filtered);
3784    let trimmed_body = body.trim();
3785    let has_body = !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*";
3786    let has_bridge = bridge
3787        .as_ref()
3788        .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3789        .is_some_and(|paragraph| !paragraph.trim().is_empty());
3790    let has_deferred_captions = bridge
3791        .as_ref()
3792        .is_some_and(|bridge| !bridge.deferred_captions.is_empty());
3793
3794    if has_body || has_bridge || has_deferred_captions {
3795        output.push_str("---\n\n");
3796    }
3797    if let Some(bridge_paragraph) = bridge
3798        .as_ref()
3799        .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3800    {
3801        output.push_str(&escape_md_line_start(bridge_paragraph.trim()));
3802        output.push_str("\n\n");
3803    }
3804    if has_body {
3805        output.push_str(trimmed_body);
3806        output.push('\n');
3807        if has_deferred_captions {
3808            output.push('\n');
3809        }
3810    }
3811    if let Some(bridge) = &bridge {
3812        for caption in &bridge.deferred_captions {
3813            output.push('*');
3814            output.push_str(caption.trim());
3815            output.push_str("*\n\n");
3816        }
3817    }
3818
3819    Some(output.trim_end().to_string() + "\n")
3820}
3821
3822#[cfg(not(target_arch = "wasm32"))]
3823fn detect_layout_block_pair_plate(
3824    page_width: f64,
3825    lines: &[BBoxLayoutLine],
3826) -> Option<OpenPlateCandidate> {
3827    let blocks = collect_bbox_layout_blocks(lines);
3828    let page_top = blocks
3829        .iter()
3830        .map(|block| block.bbox.top_y)
3831        .fold(0.0_f64, f64::max);
3832
3833    let heading_block = blocks.iter().find(|block| {
3834        let text = bbox_layout_block_text(block);
3835        let word_count = text.split_whitespace().count();
3836        (3..=8).contains(&word_count)
3837            && block.bbox.width() <= page_width * 0.45
3838            && block.bbox.top_y >= page_top - 36.0
3839            && !text.ends_with(['.', ':'])
3840    })?;
3841    let heading = bbox_layout_block_text(heading_block);
3842    if heading.trim().is_empty() {
3843        return None;
3844    }
3845
3846    let caption_block = blocks.iter().find(|block| {
3847        let text = bbox_layout_block_text(block);
3848        text.starts_with("Table ")
3849            && block.bbox.width() >= page_width * 0.35
3850            && block.bbox.top_y < heading_block.bbox.top_y - 24.0
3851            && block.bbox.top_y >= heading_block.bbox.top_y - 140.0
3852    })?;
3853
3854    let candidate_blocks = blocks
3855        .iter()
3856        .filter(|block| {
3857            block.block_id != heading_block.block_id
3858                && block.block_id != caption_block.block_id
3859                && block.bbox.top_y < heading_block.bbox.top_y - 4.0
3860                && block.bbox.bottom_y > caption_block.bbox.top_y + 4.0
3861                && block.bbox.width() <= page_width * 0.45
3862        })
3863        .collect::<Vec<_>>();
3864    if candidate_blocks.len() < 6 {
3865        return None;
3866    }
3867
3868    let mut fragments = Vec::new();
3869    for block in candidate_blocks {
3870        for line in &block.lines {
3871            let text = bbox_layout_line_text(line);
3872            let word_count = text.split_whitespace().count();
3873            if !(1..=5).contains(&word_count) || text.ends_with(['.', ':']) {
3874                continue;
3875            }
3876            fragments.extend(split_bbox_layout_line_fragments(line));
3877        }
3878    }
3879    if fragments.len() < 6 {
3880        return None;
3881    }
3882
3883    let mut centers = fragments
3884        .iter()
3885        .map(|fragment| fragment.bbox.center_x())
3886        .collect::<Vec<_>>();
3887    centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
3888    let (split_idx, max_gap) = centers
3889        .windows(2)
3890        .enumerate()
3891        .map(|(idx, pair)| (idx, pair[1] - pair[0]))
3892        .max_by(|left, right| {
3893            left.1
3894                .partial_cmp(&right.1)
3895                .unwrap_or(std::cmp::Ordering::Equal)
3896        })?;
3897    if max_gap < page_width * 0.04 {
3898        return None;
3899    }
3900    let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
3901
3902    let avg_height = fragments
3903        .iter()
3904        .map(|fragment| fragment.bbox.height())
3905        .sum::<f64>()
3906        / fragments.len() as f64;
3907    let row_tolerance = avg_height.max(8.0) * 1.4;
3908
3909    let mut sorted_fragments = fragments;
3910    sorted_fragments.sort_by(|left, right| {
3911        cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
3912    });
3913
3914    let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
3915    for fragment in sorted_fragments {
3916        let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
3917        if let Some((center_y, cells)) = row_bands
3918            .iter_mut()
3919            .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
3920        {
3921            *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
3922            append_cell_text(&mut cells[slot_idx], &fragment.text);
3923        } else {
3924            let mut cells = vec![String::new(), String::new()];
3925            append_cell_text(&mut cells[slot_idx], &fragment.text);
3926            row_bands.push((fragment.bbox.center_y(), cells));
3927        }
3928    }
3929
3930    row_bands.sort_by(|left, right| {
3931        right
3932            .0
3933            .partial_cmp(&left.0)
3934            .unwrap_or(std::cmp::Ordering::Equal)
3935    });
3936    let rows = row_bands
3937        .into_iter()
3938        .map(|(_, cells)| cells)
3939        .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
3940        .collect::<Vec<_>>();
3941    if !(3..=8).contains(&rows.len()) {
3942        return None;
3943    }
3944
3945    let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(caption_block));
3946    if caption.trim().is_empty() {
3947        return None;
3948    }
3949
3950    Some(OpenPlateCandidate {
3951        heading: heading.trim().to_string(),
3952        header_row: vec![
3953            heading.trim().to_string(),
3954            infer_open_plate_secondary_header(&rows),
3955        ],
3956        rows,
3957        caption,
3958        cutoff_top_y: caption_block.bbox.bottom_y,
3959    })
3960}
3961
3962#[cfg(not(target_arch = "wasm32"))]
3963#[allow(dead_code)]
3964fn render_layout_toc_document(doc: &PdfDocument) -> Option<String> {
3965    let mut layout_cache = LayoutSourceCache::default();
3966    render_layout_toc_document_cached(doc, &mut layout_cache)
3967}
3968
3969#[cfg(not(target_arch = "wasm32"))]
3970fn render_layout_toc_document_cached(
3971    doc: &PdfDocument,
3972    layout_cache: &mut LayoutSourceCache,
3973) -> Option<String> {
3974    if doc.number_of_pages != 1 {
3975        return None;
3976    }
3977
3978    let lines = layout_cache.layout_lines(doc)?;
3979    let (title, entries) = extract_layout_toc_entries(lines)?;
3980    if entries.len() < 5 {
3981        return None;
3982    }
3983
3984    let mut output = String::new();
3985    output.push_str("# ");
3986    output.push_str(title.trim());
3987    output.push_str("\n\n");
3988    for entry in entries {
3989        output.push_str("## ");
3990        output.push_str(entry.title.trim());
3991        output.push(' ');
3992        output.push_str(entry.page.trim());
3993        output.push_str("\n\n");
3994    }
3995    Some(output)
3996}
3997
3998#[cfg(not(target_arch = "wasm32"))]
3999fn extract_layout_toc_entries(lines: &[String]) -> Option<(String, Vec<LayoutTocEntry>)> {
4000    let title_idx = lines.iter().position(|line| {
4001        matches!(
4002            normalize_heading_text(line.trim()).as_str(),
4003            "contents" | "tableofcontents"
4004        )
4005    })?;
4006    let title = lines[title_idx].trim().to_string();
4007
4008    let mut entries: Vec<LayoutTocEntry> = Vec::new();
4009    let mut page_start: Option<usize> = None;
4010    let mut miss_count = 0usize;
4011
4012    for line in lines.iter().skip(title_idx + 1) {
4013        let trimmed = line.trim();
4014        if trimmed.is_empty() {
4015            continue;
4016        }
4017        if trimmed.chars().all(|ch| ch.is_ascii_digit()) {
4018            continue;
4019        }
4020
4021        let spans = split_layout_line_spans(line);
4022        if let Some((title_start, title_text, page_text, page_col)) =
4023            parse_layout_toc_entry_spans(&spans)
4024        {
4025            if let Some(prev) = entries.last_mut() {
4026                if prev.page == page_text
4027                    && title_start <= prev.title_start + 2
4028                    && prev.title.split_whitespace().count() >= 5
4029                {
4030                    append_cell_text(&mut prev.title, &title_text);
4031                    miss_count = 0;
4032                    continue;
4033                }
4034            }
4035
4036            if let Some(anchor) = page_start {
4037                if page_col.abs_diff(anchor) > 4 {
4038                    miss_count += 1;
4039                    if miss_count >= 2 {
4040                        break;
4041                    }
4042                    continue;
4043                }
4044            } else {
4045                page_start = Some(page_col);
4046            }
4047
4048            entries.push(LayoutTocEntry {
4049                title: title_text,
4050                page: page_text,
4051                title_start,
4052            });
4053            miss_count = 0;
4054            continue;
4055        }
4056
4057        if let Some(prev) = entries.last_mut() {
4058            if spans.len() == 1 {
4059                let (start, text) = &spans[0];
4060                if *start <= prev.title_start + 2
4061                    && text.split_whitespace().count() <= 6
4062                    && !ends_with_page_marker(text)
4063                {
4064                    append_cell_text(&mut prev.title, text);
4065                    miss_count = 0;
4066                    continue;
4067                }
4068            }
4069        }
4070
4071        miss_count += 1;
4072        if miss_count >= 2 && !entries.is_empty() {
4073            break;
4074        }
4075    }
4076
4077    (!entries.is_empty()).then_some((title, entries))
4078}
4079
4080#[cfg(not(target_arch = "wasm32"))]
4081fn parse_layout_toc_entry_spans(
4082    spans: &[(usize, String)],
4083) -> Option<(usize, String, String, usize)> {
4084    if spans.len() < 2 {
4085        return None;
4086    }
4087
4088    let (page_start, page_text) = spans.last()?;
4089    if !ends_with_page_marker(page_text.trim()) {
4090        return None;
4091    }
4092
4093    let title_start = spans.first()?.0;
4094    let title_text = spans[..spans.len() - 1]
4095        .iter()
4096        .map(|(_, text)| text.trim())
4097        .filter(|text| !text.is_empty())
4098        .collect::<Vec<_>>()
4099        .join(" ");
4100    let page_text = page_text
4101        .split_whitespace()
4102        .last()
4103        .unwrap_or(page_text)
4104        .to_string();
4105
4106    if title_text.split_whitespace().count() < 1 || title_text.len() < 4 {
4107        return None;
4108    }
4109    Some((title_start, title_text, page_text, *page_start))
4110}
4111
4112#[cfg(not(target_arch = "wasm32"))]
4113fn detect_layout_open_plate(
4114    page_width: f64,
4115    lines: &[BBoxLayoutLine],
4116) -> Option<OpenPlateCandidate> {
4117    let heading_idx = lines.iter().position(|line| {
4118        let text = bbox_layout_line_text(line);
4119        let word_count = text.split_whitespace().count();
4120        (3..=8).contains(&word_count)
4121            && line.bbox.width() <= page_width * 0.55
4122            && !text.ends_with(['.', ':'])
4123    })?;
4124
4125    let heading = bbox_layout_line_text(&lines[heading_idx]);
4126    if heading.trim().is_empty() {
4127        return None;
4128    }
4129    if has_substantive_layout_prose_before(lines, heading_idx, page_width) {
4130        return None;
4131    }
4132
4133    let caption_idx = (heading_idx + 1..lines.len()).find(|idx| {
4134        let line = &lines[*idx];
4135        let text = bbox_layout_line_text(line);
4136        text.split_whitespace().count() >= 6 && line.bbox.width() >= page_width * 0.45
4137    })?;
4138
4139    let candidate_lines = lines[heading_idx + 1..caption_idx]
4140        .iter()
4141        .filter(|line| {
4142            let text = bbox_layout_line_text(line);
4143            let word_count = text.split_whitespace().count();
4144            (1..=5).contains(&word_count) && !text.ends_with(['.', ':'])
4145        })
4146        .collect::<Vec<_>>();
4147    if candidate_lines.len() < 4 {
4148        return None;
4149    }
4150
4151    let mut fragments = Vec::new();
4152    for line in candidate_lines {
4153        fragments.extend(split_bbox_layout_line_fragments(line));
4154    }
4155    if fragments.len() < 6 {
4156        return None;
4157    }
4158
4159    let mut centers = fragments
4160        .iter()
4161        .map(|fragment| fragment.bbox.center_x())
4162        .collect::<Vec<_>>();
4163    centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4164    let (split_idx, max_gap) = centers
4165        .windows(2)
4166        .enumerate()
4167        .map(|(idx, pair)| (idx, pair[1] - pair[0]))
4168        .max_by(|left, right| {
4169            left.1
4170                .partial_cmp(&right.1)
4171                .unwrap_or(std::cmp::Ordering::Equal)
4172        })?;
4173    if max_gap < page_width * 0.04 {
4174        return None;
4175    }
4176    let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
4177
4178    let avg_height = fragments
4179        .iter()
4180        .map(|fragment| fragment.bbox.height())
4181        .sum::<f64>()
4182        / fragments.len() as f64;
4183    let row_tolerance = avg_height.max(8.0) * 1.4;
4184
4185    let mut sorted_fragments = fragments.clone();
4186    sorted_fragments.sort_by(|left, right| {
4187        cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
4188    });
4189
4190    let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
4191    for fragment in sorted_fragments {
4192        let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
4193        if let Some((center_y, cells)) = row_bands
4194            .iter_mut()
4195            .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
4196        {
4197            *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
4198            append_cell_text(&mut cells[slot_idx], &fragment.text);
4199        } else {
4200            let mut cells = vec![String::new(), String::new()];
4201            append_cell_text(&mut cells[slot_idx], &fragment.text);
4202            row_bands.push((fragment.bbox.center_y(), cells));
4203        }
4204    }
4205
4206    row_bands.sort_by(|left, right| {
4207        right
4208            .0
4209            .partial_cmp(&left.0)
4210            .unwrap_or(std::cmp::Ordering::Equal)
4211    });
4212
4213    let rows = row_bands
4214        .into_iter()
4215        .map(|(_, cells)| cells)
4216        .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
4217        .collect::<Vec<_>>();
4218    if !(3..=8).contains(&rows.len()) {
4219        return None;
4220    }
4221
4222    let caption_lines = collect_open_plate_caption_lines(page_width, &lines[caption_idx..]);
4223    let caption = caption_lines
4224        .iter()
4225        .map(|line| bbox_layout_line_text(line))
4226        .collect::<Vec<_>>()
4227        .join(" ");
4228    if caption.trim().is_empty() {
4229        return None;
4230    }
4231    if !starts_with_caption_prefix(caption.trim()) {
4232        return None;
4233    }
4234
4235    let secondary_header = infer_open_plate_secondary_header(&rows);
4236    let cutoff_top_y = caption_lines
4237        .last()
4238        .map(|line| line.bbox.bottom_y)
4239        .unwrap_or(lines[caption_idx].bbox.bottom_y);
4240
4241    Some(OpenPlateCandidate {
4242        heading: heading.trim().to_string(),
4243        header_row: vec![heading.trim().to_string(), secondary_header],
4244        rows,
4245        caption: caption.trim().to_string(),
4246        cutoff_top_y,
4247    })
4248}
4249
4250#[cfg(not(target_arch = "wasm32"))]
4251fn collect_open_plate_caption_lines<'a>(
4252    page_width: f64,
4253    lines: &'a [BBoxLayoutLine],
4254) -> Vec<&'a BBoxLayoutLine> {
4255    let mut caption_lines: Vec<&'a BBoxLayoutLine> = Vec::new();
4256    for line in lines {
4257        let text = bbox_layout_line_text(line);
4258        if text.split_whitespace().count() < 4 || line.bbox.width() < page_width * 0.35 {
4259            break;
4260        }
4261        if !caption_lines.is_empty() {
4262            let prev = caption_lines.last().unwrap().bbox.bottom_y;
4263            if prev - line.bbox.top_y > line.bbox.height().max(10.0) * 1.8 {
4264                break;
4265            }
4266        }
4267        caption_lines.push(line);
4268    }
4269    caption_lines
4270}
4271
4272#[cfg(not(target_arch = "wasm32"))]
4273fn infer_open_plate_secondary_header(rows: &[Vec<String>]) -> String {
4274    let right_cells = rows
4275        .iter()
4276        .filter_map(|row| row.get(1))
4277        .map(|cell| cell.trim())
4278        .collect::<Vec<_>>();
4279    if right_cells.len() >= 3
4280        && right_cells
4281            .iter()
4282            .all(|cell| looks_like_scientific_name(cell))
4283    {
4284        "Scientific name".to_string()
4285    } else {
4286        String::new()
4287    }
4288}
4289
4290#[cfg(not(target_arch = "wasm32"))]
4291fn has_substantive_layout_prose_before(
4292    lines: &[BBoxLayoutLine],
4293    line_idx: usize,
4294    page_width: f64,
4295) -> bool {
4296    lines.iter().take(line_idx).any(|line| {
4297        let text = bbox_layout_line_text(line);
4298        let trimmed = text.trim();
4299        if trimmed.is_empty() {
4300            return false;
4301        }
4302
4303        let word_count = trimmed.split_whitespace().count();
4304        if word_count < 6 {
4305            return false;
4306        }
4307
4308        if starts_with_caption_prefix(trimmed)
4309            || looks_like_numeric_axis_blob(trimmed)
4310            || (word_count <= 10
4311                && (looks_like_yearish_label(trimmed)
4312                    || looks_like_layout_month_label(trimmed)
4313                    || trimmed == "Lockdown Period"))
4314            || trimmed
4315                .chars()
4316                .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
4317        {
4318            return false;
4319        }
4320
4321        line.bbox.width() >= page_width * 0.32
4322    })
4323}
4324
4325#[cfg(not(target_arch = "wasm32"))]
4326fn extract_layout_narrative_bridge(
4327    page_width: f64,
4328    lines: &[BBoxLayoutLine],
4329    plate: &OpenPlateCandidate,
4330) -> Option<LayoutNarrativeBridge> {
4331    let post_plate_lines = lines
4332        .iter()
4333        .filter(|line| line.bbox.top_y < plate.cutoff_top_y - 4.0 && line.bbox.bottom_y > 56.0)
4334        .collect::<Vec<_>>();
4335    if post_plate_lines.is_empty() {
4336        return None;
4337    }
4338
4339    let deferred_captions = collect_deferred_caption_blocks(page_width, &post_plate_lines);
4340    let body_start_top_y = post_plate_lines
4341        .iter()
4342        .find(|line| is_full_width_layout_line(page_width, line))
4343        .map(|line| line.bbox.top_y);
4344
4345    let mut bridge_lines = Vec::new();
4346    for line in &post_plate_lines {
4347        if body_start_top_y.is_some_and(|top_y| line.bbox.top_y <= top_y + 1.0) {
4348            break;
4349        }
4350        if line.bbox.right_x > page_width * 0.46 {
4351            continue;
4352        }
4353        let text = bbox_layout_line_text(line);
4354        if text.trim().is_empty() || starts_with_caption_prefix(text.trim()) {
4355            continue;
4356        }
4357        bridge_lines.push(*line);
4358    }
4359
4360    let bridge_paragraph = if bridge_lines.len() >= 4 {
4361        let paragraph = join_layout_lines_as_paragraph(&bridge_lines);
4362        (!paragraph.trim().is_empty()).then_some(paragraph)
4363    } else {
4364        None
4365    };
4366
4367    if bridge_paragraph.is_none() && deferred_captions.is_empty() && body_start_top_y.is_none() {
4368        return None;
4369    }
4370    Some(LayoutNarrativeBridge {
4371        bridge_paragraph,
4372        deferred_captions,
4373        body_start_top_y,
4374    })
4375}
4376
4377#[cfg(not(target_arch = "wasm32"))]
4378fn collect_deferred_caption_blocks(page_width: f64, lines: &[&BBoxLayoutLine]) -> Vec<String> {
4379    let mut captions = Vec::new();
4380    let mut consumed_block_ids = Vec::new();
4381    let mut idx = 0usize;
4382    while idx < lines.len() {
4383        let line = lines[idx];
4384        let line_text = bbox_layout_line_text(line);
4385        if !starts_with_caption_prefix(line_text.trim())
4386            || line.bbox.width() >= page_width * 0.8
4387            || consumed_block_ids.contains(&line.block_id)
4388        {
4389            idx += 1;
4390            continue;
4391        }
4392
4393        let mut block = lines
4394            .iter()
4395            .copied()
4396            .filter(|candidate| candidate.block_id == line.block_id)
4397            .collect::<Vec<_>>();
4398        block.sort_by(|left, right| {
4399            right
4400                .bbox
4401                .top_y
4402                .partial_cmp(&left.bbox.top_y)
4403                .unwrap_or(std::cmp::Ordering::Equal)
4404        });
4405
4406        if block.len() == 1 {
4407            let mut cursor = idx + 1;
4408            while cursor < lines.len() {
4409                let next = lines[cursor];
4410                let gap = block.last().unwrap().bbox.bottom_y - next.bbox.top_y;
4411                if gap < -2.0 || gap > next.bbox.height().max(10.0) * 1.6 {
4412                    break;
4413                }
4414                if next.bbox.left_x < line.bbox.left_x - 12.0
4415                    || next.bbox.left_x > line.bbox.right_x + 20.0
4416                {
4417                    break;
4418                }
4419                let next_text = bbox_layout_line_text(next);
4420                if next_text.trim().is_empty() || is_full_width_layout_line(page_width, next) {
4421                    break;
4422                }
4423                block.push(next);
4424                cursor += 1;
4425            }
4426        }
4427
4428        let caption = join_layout_lines_as_paragraph(&block);
4429        if !caption.trim().is_empty() {
4430            captions.push(caption);
4431        }
4432        consumed_block_ids.push(line.block_id);
4433        idx += 1;
4434    }
4435    captions
4436}
4437
4438#[cfg(not(target_arch = "wasm32"))]
4439fn is_full_width_layout_line(page_width: f64, line: &BBoxLayoutLine) -> bool {
4440    line.bbox.left_x <= page_width * 0.14
4441        && line.bbox.right_x >= page_width * 0.84
4442        && line.bbox.width() >= page_width * 0.68
4443        && bbox_layout_line_text(line).split_whitespace().count() >= 8
4444}
4445
4446#[cfg(not(target_arch = "wasm32"))]
4447fn join_layout_lines_as_paragraph(lines: &[&BBoxLayoutLine]) -> String {
4448    let mut text = String::new();
4449    for line in lines {
4450        let next = bbox_layout_line_text(line);
4451        let trimmed = next.trim();
4452        if trimmed.is_empty() {
4453            continue;
4454        }
4455        if text.is_empty() {
4456            text.push_str(trimmed);
4457            continue;
4458        }
4459
4460        if text.ends_with('-')
4461            && text
4462                .chars()
4463                .rev()
4464                .nth(1)
4465                .is_some_and(|ch| ch.is_alphabetic())
4466        {
4467            text.pop();
4468            text.push_str(trimmed);
4469        } else {
4470            text.push(' ');
4471            text.push_str(trimmed);
4472        }
4473    }
4474    normalize_common_ocr_text(text.trim())
4475}
4476
/// Reports whether `text` is shaped like a two-word binomial name.
///
/// Each whitespace-separated word is stripped of leading and trailing
/// characters that are neither alphabetic nor `-`; words that become empty are
/// ignored. Exactly two words must remain: the first starting with an
/// uppercase character followed only by lowercase characters or hyphens, the
/// second made up solely of lowercase characters or hyphens.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_scientific_name(text: &str) -> bool {
    let lower_or_hyphen = |ch: char| ch.is_lowercase() || ch == '-';

    let mut words = text
        .split_whitespace()
        .map(|word| word.trim_matches(|ch: char| !ch.is_alphabetic() && ch != '-'))
        .filter(|word| !word.is_empty());

    let (Some(genus), Some(species), None) = (words.next(), words.next(), words.next()) else {
        return false;
    };

    let mut genus_chars = genus.chars();
    genus_chars.next().is_some_and(char::is_uppercase)
        && genus_chars.all(lower_or_hyphen)
        && species.chars().all(lower_or_hyphen)
}
4495
4496#[cfg(not(target_arch = "wasm32"))]
4497fn split_bbox_layout_line_fragments(line: &BBoxLayoutLine) -> Vec<LayoutTextFragment> {
4498    if line.words.is_empty() {
4499        return Vec::new();
4500    }
4501    if line.words.len() == 1 {
4502        return vec![LayoutTextFragment {
4503            bbox: line.words[0].bbox.clone(),
4504            text: line.words[0].text.clone(),
4505        }];
4506    }
4507
4508    let gaps = line
4509        .words
4510        .windows(2)
4511        .enumerate()
4512        .map(|(idx, pair)| (idx, pair[1].bbox.left_x - pair[0].bbox.right_x))
4513        .collect::<Vec<_>>();
4514    let positive_gaps = gaps
4515        .iter()
4516        .map(|(_, gap)| *gap)
4517        .filter(|gap| *gap > 0.0)
4518        .collect::<Vec<_>>();
4519    if positive_gaps.is_empty() {
4520        return vec![LayoutTextFragment {
4521            bbox: line.bbox.clone(),
4522            text: bbox_layout_line_text(line),
4523        }];
4524    }
4525
4526    let mut sorted_gaps = positive_gaps.clone();
4527    sorted_gaps.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4528    let median_gap = sorted_gaps[sorted_gaps.len() / 2];
4529    let (split_idx, max_gap) = gaps
4530        .iter()
4531        .max_by(|left, right| {
4532            left.1
4533                .partial_cmp(&right.1)
4534                .unwrap_or(std::cmp::Ordering::Equal)
4535        })
4536        .copied()
4537        .unwrap();
4538
4539    if max_gap < line.bbox.height().max(8.0) * 0.55 || max_gap < median_gap * 1.8 {
4540        return vec![LayoutTextFragment {
4541            bbox: line.bbox.clone(),
4542            text: bbox_layout_line_text(line),
4543        }];
4544    }
4545
4546    let mut fragments = Vec::new();
4547    for words in [&line.words[..=split_idx], &line.words[split_idx + 1..]] {
4548        let text = words
4549            .iter()
4550            .map(|word| word.text.trim())
4551            .filter(|word| !word.is_empty())
4552            .collect::<Vec<_>>()
4553            .join(" ");
4554        if text.trim().is_empty() {
4555            continue;
4556        }
4557
4558        let bbox = words
4559            .iter()
4560            .skip(1)
4561            .fold(words[0].bbox.clone(), |acc, word| acc.union(&word.bbox));
4562        fragments.push(LayoutTextFragment {
4563            bbox,
4564            text: normalize_common_ocr_text(text.trim()),
4565        });
4566    }
4567    if fragments.is_empty() {
4568        vec![LayoutTextFragment {
4569            bbox: line.bbox.clone(),
4570            text: bbox_layout_line_text(line),
4571        }]
4572    } else {
4573        fragments
4574    }
4575}
4576
4577#[cfg(not(target_arch = "wasm32"))]
4578fn bbox_layout_line_text(line: &BBoxLayoutLine) -> String {
4579    normalize_common_ocr_text(
4580        &line
4581            .words
4582            .iter()
4583            .map(|word| word.text.trim())
4584            .filter(|word| !word.is_empty())
4585            .collect::<Vec<_>>()
4586            .join(" "),
4587    )
4588}
4589
4590#[cfg(not(target_arch = "wasm32"))]
4591fn read_pdftotext_bbox_layout_lines(path: &Path) -> Option<(f64, Vec<BBoxLayoutLine>)> {
4592    let output = Command::new("pdftotext")
4593        .arg("-bbox-layout")
4594        .arg(path)
4595        .arg("-")
4596        .output()
4597        .ok()?;
4598    if !output.status.success() {
4599        return None;
4600    }
4601
4602    let xml = String::from_utf8_lossy(&output.stdout);
4603    let page_re = Regex::new(r#"(?s)<page width="([^"]+)" height="([^"]+)">(.*?)</page>"#).ok()?;
4604    let block_re = Regex::new(
4605        r#"(?s)<block xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</block>"#,
4606    )
4607    .ok()?;
4608    let line_re = Regex::new(
4609        r#"(?s)<line xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</line>"#,
4610    )
4611    .ok()?;
4612    let word_re = Regex::new(
4613        r#"(?s)<word xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</word>"#,
4614    )
4615    .ok()?;
4616
4617    let page = page_re.captures(&xml)?;
4618    let page_width = page.get(1)?.as_str().parse::<f64>().ok()?;
4619    let page_height = page.get(2)?.as_str().parse::<f64>().ok()?;
4620    let page_body = page.get(3)?.as_str();
4621
4622    let mut lines = Vec::new();
4623    for (block_id, block_caps) in block_re.captures_iter(page_body).enumerate() {
4624        let block_body = block_caps.get(5)?.as_str();
4625        for captures in line_re.captures_iter(block_body) {
4626            let x_min = captures.get(1)?.as_str().parse::<f64>().ok()?;
4627            let y_min = captures.get(2)?.as_str().parse::<f64>().ok()?;
4628            let x_max = captures.get(3)?.as_str().parse::<f64>().ok()?;
4629            let y_max = captures.get(4)?.as_str().parse::<f64>().ok()?;
4630            let line_body = captures.get(5)?.as_str();
4631
4632            let mut words = Vec::new();
4633            for word_caps in word_re.captures_iter(line_body) {
4634                let wx_min = word_caps.get(1)?.as_str().parse::<f64>().ok()?;
4635                let wy_min = word_caps.get(2)?.as_str().parse::<f64>().ok()?;
4636                let wx_max = word_caps.get(3)?.as_str().parse::<f64>().ok()?;
4637                let wy_max = word_caps.get(4)?.as_str().parse::<f64>().ok()?;
4638                let raw_text = decode_bbox_layout_text(word_caps.get(5)?.as_str());
4639                if raw_text.trim().is_empty() {
4640                    continue;
4641                }
4642                words.push(BBoxLayoutWord {
4643                    bbox: bbox_layout_box(page_height, wx_min, wy_min, wx_max, wy_max),
4644                    text: raw_text,
4645                });
4646            }
4647            if words.is_empty() {
4648                continue;
4649            }
4650            lines.push(BBoxLayoutLine {
4651                block_id,
4652                bbox: bbox_layout_box(page_height, x_min, y_min, x_max, y_max),
4653                words,
4654            });
4655        }
4656    }
4657
4658    lines.sort_by(|left, right| {
4659        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
4660            .then_with(|| left.block_id.cmp(&right.block_id))
4661    });
4662    Some((page_width, lines))
4663}
4664
4665#[cfg(not(target_arch = "wasm32"))]
4666fn bbox_layout_box(
4667    page_height: f64,
4668    x_min: f64,
4669    y_min: f64,
4670    x_max: f64,
4671    y_max: f64,
4672) -> BoundingBox {
4673    BoundingBox::new(
4674        Some(1),
4675        x_min,
4676        page_height - y_max,
4677        x_max,
4678        page_height - y_min,
4679    )
4680}
4681
/// Decodes the character entities found in `pdftotext -bbox-layout` word text.
///
/// Recognised entities are `&quot;`, `&apos;`, `&#39;`, `&amp;`, `&lt;` and
/// `&gt;`. Any other `&` is copied through unchanged.
///
/// The input is decoded in a single left-to-right pass so that each entity is
/// resolved exactly once. Chained `replace` calls would decode `&amp;lt;`
/// twice and yield `<` instead of the literal text `&lt;`.
#[cfg(not(target_arch = "wasm32"))]
fn decode_bbox_layout_text(text: &str) -> String {
    const ENTITIES: [(&str, char); 6] = [
        ("&quot;", '"'),
        ("&apos;", '\''),
        ("&#39;", '\''),
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp_idx) = rest.find('&') {
        decoded.push_str(&rest[..amp_idx]);
        let tail = &rest[amp_idx..];
        match ENTITIES
            .iter()
            .find(|(entity, _)| tail.starts_with(*entity))
        {
            Some((entity, replacement)) => {
                decoded.push(*replacement);
                rest = &tail[entity.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand and move past it.
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
4691
4692#[cfg(not(target_arch = "wasm32"))]
4693#[allow(dead_code)]
4694fn render_layout_matrix_document(doc: &PdfDocument) -> Option<String> {
4695    let mut layout_cache = LayoutSourceCache::default();
4696    render_layout_matrix_document_cached(doc, &mut layout_cache)
4697}
4698
4699#[cfg(not(target_arch = "wasm32"))]
4700fn render_layout_matrix_document_cached(
4701    doc: &PdfDocument,
4702    layout_cache: &mut LayoutSourceCache,
4703) -> Option<String> {
4704    if doc.number_of_pages != 1 {
4705        return None;
4706    }
4707
4708    let lines = layout_cache.layout_lines(doc)?;
4709    let header = find_layout_header_candidate(lines)?;
4710    let entries = extract_layout_entries(lines, &header);
4711    let mut rows = build_layout_anchor_rows(lines, &entries)?;
4712    if rows.len() < 6 || rows.len() > 14 {
4713        return None;
4714    }
4715
4716    let filled_data_rows = rows
4717        .iter()
4718        .filter(|row| row.iter().skip(1).all(|cell| !cell.trim().is_empty()))
4719        .count();
4720    if filled_data_rows + 1 < rows.len().saturating_sub(1) {
4721        return None;
4722    }
4723
4724    let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4725    rendered_rows.push(header.headers.clone());
4726    rendered_rows.append(&mut rows);
4727
4728    let mut output = String::new();
4729    if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4730        ContentElement::Heading(h) => Some(h.base.base.value()),
4731        ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4732        _ => None,
4733    }) {
4734        let trimmed = heading.trim();
4735        if !trimmed.is_empty() {
4736            output.push_str("# ");
4737            output.push_str(trimmed);
4738            output.push_str("\n\n");
4739        }
4740    }
4741    output.push_str(&render_pipe_rows(&rendered_rows));
4742    Some(output)
4743}
4744
4745#[cfg(not(target_arch = "wasm32"))]
4746#[allow(dead_code)]
4747fn render_layout_panel_stub_document(doc: &PdfDocument) -> Option<String> {
4748    let mut layout_cache = LayoutSourceCache::default();
4749    render_layout_panel_stub_document_cached(doc, &mut layout_cache)
4750}
4751
4752#[cfg(not(target_arch = "wasm32"))]
4753fn render_layout_panel_stub_document_cached(
4754    doc: &PdfDocument,
4755    layout_cache: &mut LayoutSourceCache,
4756) -> Option<String> {
4757    if doc.number_of_pages != 1 {
4758        return None;
4759    }
4760
4761    let lines = layout_cache.layout_lines(doc)?;
4762    let header = find_layout_panel_header_candidate(lines)?;
4763    let rows = build_layout_panel_stub_rows(lines, &header)?;
4764    if rows.len() < 2 || rows.len() > 6 {
4765        return None;
4766    }
4767
4768    let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4769    let mut header_row = vec![String::new()];
4770    header_row.extend(header.headers.clone());
4771    rendered_rows.push(header_row);
4772    rendered_rows.extend(rows);
4773
4774    let mut output = String::new();
4775    if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4776        ContentElement::Heading(h) => Some(h.base.base.value()),
4777        ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4778        _ => None,
4779    }) {
4780        let trimmed = heading.trim();
4781        if !trimmed.is_empty() {
4782            output.push_str("# ");
4783            output.push_str(trimmed);
4784            output.push_str("\n\n");
4785        }
4786    }
4787    output.push_str(&render_pipe_rows(&rendered_rows));
4788    Some(output)
4789}
4790
4791#[cfg(not(target_arch = "wasm32"))]
4792#[allow(dead_code)]
4793fn render_layout_projection_sheet_document(doc: &PdfDocument) -> Option<String> {
4794    let mut layout_cache = LayoutSourceCache::default();
4795    render_layout_projection_sheet_document_cached(doc, &mut layout_cache)
4796}
4797
4798#[cfg(not(target_arch = "wasm32"))]
4799fn render_layout_projection_sheet_document_cached(
4800    doc: &PdfDocument,
4801    layout_cache: &mut LayoutSourceCache,
4802) -> Option<String> {
4803    if doc.number_of_pages != 1 {
4804        return None;
4805    }
4806
4807    let lines = layout_cache.layout_lines(doc)?;
4808    let projection = detect_layout_projection_sheet(lines)?;
4809
4810    let mut output = String::from("# Table and Figure from the Document\n\n");
4811    output.push_str(&render_pipe_rows(&projection.table_rows));
4812    output.push_str("**");
4813    output.push_str(projection.figure_caption.trim());
4814    output.push_str("**\n\n");
4815    output.push_str("[Open Template in Microsoft Excel](#)\n\n");
4816    output.push_str(&escape_md_line_start(projection.body.trim()));
4817    output.push_str("\n\n");
4818    output.push('*');
4819    output.push_str(&escape_md_line_start(projection.footer.trim()));
4820    output.push_str("*\n");
4821
4822    Some(output)
4823}
4824
/// Content extracted by `detect_layout_projection_sheet` for a single-page
/// projection worksheet.
///
/// * `table_rows`: five-column table rows, beginning with the `A`..`E` header row.
/// * `figure_caption`: caption text; the renderer wraps it in `**`.
/// * `body`: the non-blank lines between the template link and the footer,
///   joined with single spaces.
/// * `footer`: the trimmed footer line; the renderer wraps it in `*`.
4825#[cfg(not(target_arch = "wasm32"))]
4826struct LayoutProjectionSheet {
4827    table_rows: Vec<Vec<String>>,
4828    figure_caption: String,
4829    body: String,
4830    footer: String,
4831}
4832
/// One titled table inside a `LayoutAppendixTablesDocument`.
///
/// * `heading`: rendered as a `## ` heading.
/// * `rows`: table rows passed to `render_pipe_rows`.
/// * `notes`: each note is rendered as its own `*...*` line after the table.
4833#[cfg(not(target_arch = "wasm32"))]
4834struct LayoutAppendixTableSection {
4835    heading: String,
4836    rows: Vec<Vec<String>>,
4837    notes: Vec<String>,
4838}
4839
/// A document made of a title followed by appendix table sections.
///
/// * `title`: rendered as the `# ` heading.
/// * `sections`: rendered in order, one `## ` section each.
4840#[cfg(not(target_arch = "wasm32"))]
4841struct LayoutAppendixTablesDocument {
4842    title: String,
4843    sections: Vec<LayoutAppendixTableSection>,
4844}
4845
/// Pieces extracted by `detect_layout_dual_table_article` around a "Table 6"
/// and a "Table 7" caption.
///
/// * `first_title`: the first caption paragraph up to its first `". "`
///   (the whole paragraph when there is none), trimmed.
/// * `first_intro`: the rest of the first caption paragraph after that title.
/// * `first_caption`: the first caption paragraph as returned by
///   `collect_layout_caption_paragraph`.
/// * `first_rows`: rows parsed between the first table header line and the
///   "Table 6:" line, header row first.
/// * `second_title` / `second_intro`: the same title/remainder split applied
///   to the "Table 7:" paragraph.
4846#[cfg(not(target_arch = "wasm32"))]
4847struct LayoutDualTableArticle {
4848    first_title: String,
4849    first_intro: String,
4850    first_caption: String,
4851    first_rows: Vec<Vec<String>>,
4852    second_title: String,
4853    second_intro: String,
4854}
4855
/// One titled table inside a `LayoutTitledDualTableDocument`.
///
/// * `heading`: rendered as a `## ` heading.
/// * `rows`: table rows passed to `render_pipe_rows`.
/// * `note`: optional note, rendered as a `*...*` line after the table.
4856#[cfg(not(target_arch = "wasm32"))]
4857struct LayoutTitledTableSection {
4858    heading: String,
4859    rows: Vec<Vec<String>>,
4860    note: Option<String>,
4861}
4862
/// A titled document whose body is a list of titled table sections.
///
/// * `title`: rendered as the `# ` heading.
/// * `sections`: rendered in order, one `## ` section each.
4863#[cfg(not(target_arch = "wasm32"))]
4864struct LayoutTitledDualTableDocument {
4865    title: String,
4866    sections: Vec<LayoutTitledTableSection>,
4867}
4868
/// A title plus a grid of string cells for a registration-report layout.
///
/// NOTE(review): the detector that fills this struct and the renderer that
/// consumes it are outside this excerpt — presumably `title` becomes the
/// document heading and `rows` a pipe table; confirm against those functions.
4869#[cfg(not(target_arch = "wasm32"))]
4870struct LayoutRegistrationReportDocument {
4871    title: String,
4872    rows: Vec<Vec<String>>,
4873}
4874
4875#[cfg(not(target_arch = "wasm32"))]
4876fn detect_layout_projection_sheet(lines: &[String]) -> Option<LayoutProjectionSheet> {
4877    let header_idx = lines.iter().position(|line| {
4878        split_layout_line_spans(line)
4879            .into_iter()
4880            .map(|(_, text)| text)
4881            .collect::<Vec<_>>()
4882            == vec!["A", "B", "C", "D", "E"]
4883    })?;
4884    let forecast_idx = lines
4885        .iter()
4886        .position(|line| line.contains("Forecast(observed)"))?;
4887    let lower_idx = lines
4888        .iter()
4889        .position(|line| line.contains("Lower Confidence") && line.contains("Upper Confidence"))?;
4890    let figure_idx = lines
4891        .iter()
4892        .position(|line| line.contains("Figure 13.3. Graph of Projection Estimates"))?;
4893    let template_idx = lines
4894        .iter()
4895        .position(|line| line.contains("Open Template in Microsoft Excel"))?;
4896    let footer_idx = lines
4897        .iter()
4898        .position(|line| line.contains("Ch. 13. Homogeneous Investment Types"))?;
4899
4900    if !(header_idx < lower_idx
4901        && lower_idx < forecast_idx
4902        && lower_idx < figure_idx
4903        && figure_idx < template_idx
4904        && template_idx < footer_idx)
4905    {
4906        return None;
4907    }
4908
4909    let mut table_rows = vec![
4910        vec![
4911            "A".to_string(),
4912            "B".to_string(),
4913            "C".to_string(),
4914            "D".to_string(),
4915            "E".to_string(),
4916        ],
4917        vec![
4918            "1".to_string(),
4919            "time".to_string(),
4920            "observed".to_string(),
4921            "Forecast(observed)".to_string(),
4922            "Lower Confidence Bound(observed)".to_string(),
4923        ],
4924    ];
4925
4926    for line in lines.iter().take(figure_idx).skip(lower_idx + 1) {
4927        let trimmed = line.trim();
4928        if trimmed.is_empty() {
4929            continue;
4930        }
4931        let tokens = trimmed.split_whitespace().collect::<Vec<_>>();
4932        if tokens.len() < 3 || !tokens[0].chars().all(|ch| ch.is_ascii_digit()) {
4933            continue;
4934        }
4935        if tokens[0] == "1" {
4936            continue;
4937        }
4938
4939        let row = match tokens.len() {
4940            3 => vec![
4941                tokens[0].to_string(),
4942                tokens[1].to_string(),
4943                tokens[2].to_string(),
4944                String::new(),
4945                String::new(),
4946            ],
4947            4 => vec![
4948                tokens[0].to_string(),
4949                tokens[1].to_string(),
4950                tokens[2].to_string(),
4951                tokens[3].to_string(),
4952                String::new(),
4953            ],
4954            _ => tokens
4955                .into_iter()
4956                .take(5)
4957                .map(str::to_string)
4958                .collect::<Vec<_>>(),
4959        };
4960        if row.len() == 5 {
4961            table_rows.push(row);
4962        }
4963    }
4964
4965    if table_rows.len() < 10 {
4966        return None;
4967    }
4968
4969    let body_lines = lines[template_idx + 1..footer_idx]
4970        .iter()
4971        .map(|line| line.trim())
4972        .filter(|line| !line.is_empty())
4973        .collect::<Vec<_>>();
4974    let body = body_lines.join(" ");
4975    if body.split_whitespace().count() < 12 {
4976        return None;
4977    }
4978
4979    Some(LayoutProjectionSheet {
4980        table_rows,
4981        figure_caption: "Figure 13.3. Graph of Projection Estimates".to_string(),
4982        body,
4983        footer: lines[footer_idx].trim().to_string(),
4984    })
4985}
4986
4987#[cfg(not(target_arch = "wasm32"))]
4988#[allow(dead_code)]
4989fn render_layout_appendix_tables_document(doc: &PdfDocument) -> Option<String> {
4990    let mut layout_cache = LayoutSourceCache::default();
4991    render_layout_appendix_tables_document_cached(doc, &mut layout_cache)
4992}
4993
4994#[cfg(not(target_arch = "wasm32"))]
4995fn render_layout_appendix_tables_document_cached(
4996    doc: &PdfDocument,
4997    layout_cache: &mut LayoutSourceCache,
4998) -> Option<String> {
4999    if doc.number_of_pages != 1 {
5000        return None;
5001    }
5002
5003    let lines = layout_cache.layout_lines(doc)?;
5004    let appendix = detect_layout_appendix_tables_document(lines)?;
5005
5006    let mut output = String::new();
5007    output.push_str("# ");
5008    output.push_str(appendix.title.trim());
5009    output.push_str("\n\n");
5010
5011    for section in appendix.sections {
5012        output.push_str("## ");
5013        output.push_str(section.heading.trim());
5014        output.push_str("\n\n");
5015        output.push_str(&render_pipe_rows(&section.rows));
5016        for note in section.notes {
5017            output.push('*');
5018            output.push_str(&escape_md_line_start(note.trim()));
5019            output.push_str("*\n");
5020        }
5021        output.push('\n');
5022    }
5023
5024    Some(output.trim_end().to_string() + "\n")
5025}
5026
5027#[cfg(not(target_arch = "wasm32"))]
5028#[allow(dead_code)]
5029fn render_layout_dual_table_article_document(doc: &PdfDocument) -> Option<String> {
5030    let mut layout_cache = LayoutSourceCache::default();
5031    render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
5032}
5033
5034#[cfg(not(target_arch = "wasm32"))]
5035fn render_layout_dual_table_article_document_cached(
5036    doc: &PdfDocument,
5037    layout_cache: &mut LayoutSourceCache,
5038) -> Option<String> {
5039    if doc.number_of_pages != 1 {
5040        return None;
5041    }
5042
5043    let lines = layout_cache.layout_lines(doc)?;
5044    let article = detect_layout_dual_table_article(lines)?;
5045
5046    let mut filtered = doc.clone();
5047    filtered.title = None;
5048    let body_start_idx = find_layout_dual_table_article_body_start_idx(doc);
5049    filtered.kids = doc.kids.iter().skip(body_start_idx).cloned().collect();
5050    let body = render_layout_dual_table_article_body(&filtered);
5051
5052    let mut output = String::new();
5053    output.push_str("# ");
5054    output.push_str(article.first_title.trim());
5055    output.push_str("\n\n*");
5056    output.push_str(&escape_md_line_start(article.first_intro.trim()));
5057    output.push_str("*\n\n");
5058    output.push_str(&render_pipe_rows(&article.first_rows));
5059    output.push_str("*Table 6*: ");
5060    output.push_str(&escape_md_line_start(
5061        article
5062            .first_caption
5063            .trim()
5064            .trim_start_matches("Table 6:")
5065            .trim(),
5066    ));
5067    output.push_str("*\n\n---\n\n");
5068    output.push_str("# ");
5069    output.push_str(article.second_title.trim());
5070    output.push_str("\n\n");
5071    output.push_str(&escape_md_line_start(article.second_intro.trim()));
5072    output.push_str("\n\n");
5073    let trimmed_body = body.trim();
5074    if !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*" {
5075        output.push_str(trimmed_body);
5076        output.push('\n');
5077    }
5078
5079    Some(output)
5080}
5081
5082#[cfg(not(target_arch = "wasm32"))]
5083fn detect_layout_dual_table_article(lines: &[String]) -> Option<LayoutDualTableArticle> {
5084    let first_header_idx = lines.iter().position(|line| {
5085        line.contains("H6 (Avg.)")
5086            && line.contains("HellaSwag")
5087            && line.contains("TruthfulQA")
5088            && !line.contains("Merge Method")
5089    })?;
5090    let first_caption_idx = (first_header_idx + 1..lines.len())
5091        .find(|idx| lines[*idx].trim_start().starts_with("Table 6:"))?;
5092    let second_header_idx = (first_caption_idx + 1..lines.len()).find(|idx| {
5093        lines[*idx].contains("Merge Method")
5094            && lines[*idx].contains("H6 (Avg.)")
5095            && lines[*idx].contains("GSM8K")
5096    })?;
5097    let second_caption_idx = (second_header_idx + 1..lines.len())
5098        .find(|idx| lines[*idx].trim_start().starts_with("Table 7:"))?;
5099
5100    let first_rows = parse_layout_anchor_table(lines, first_header_idx, first_caption_idx)?;
5101    if first_rows.len() < 3 {
5102        return None;
5103    }
5104
5105    let first_caption = collect_layout_caption_paragraph(lines, first_caption_idx)?;
5106    let second_intro = collect_layout_caption_paragraph(lines, second_caption_idx)?;
5107    let first_title = first_caption
5108        .split_once(". ")
5109        .map(|(title, _)| title)
5110        .unwrap_or(first_caption.as_str())
5111        .trim()
5112        .to_string();
5113    let second_title = second_intro
5114        .split_once(". ")
5115        .map(|(title, _)| title)
5116        .unwrap_or(second_intro.as_str())
5117        .trim()
5118        .to_string();
5119    let first_intro = first_caption
5120        .trim_start_matches(&first_title)
5121        .trim_start_matches('.')
5122        .trim()
5123        .to_string();
5124    let second_intro = second_intro
5125        .trim_start_matches(&second_title)
5126        .trim_start_matches('.')
5127        .trim()
5128        .to_string();
5129
5130    if first_title.is_empty() || second_title.is_empty() {
5131        return None;
5132    }
5133
5134    Some(LayoutDualTableArticle {
5135        first_title,
5136        first_intro,
5137        first_caption,
5138        first_rows,
5139        second_title,
5140        second_intro,
5141    })
5142}
5143
5144#[cfg(not(target_arch = "wasm32"))]
5145fn find_layout_dual_table_article_body_start_idx(doc: &PdfDocument) -> usize {
5146    let body_markers = [
5147        "tively impacted by adding Synth.",
5148        "Then, we experiment whether merging",
5149        "Ablation on the SFT base models.",
5150        "Ablation on different merge methods.",
5151        "5 Conclusion",
5152    ];
5153    doc.kids
5154        .iter()
5155        .position(|element| {
5156            let text = extract_element_text(element);
5157            let trimmed = text.trim();
5158            body_markers
5159                .iter()
5160                .any(|marker| trimmed.starts_with(marker))
5161        })
5162        .unwrap_or(4.min(doc.kids.len()))
5163}
5164
5165#[cfg(not(target_arch = "wasm32"))]
5166fn render_layout_dual_table_article_body(doc: &PdfDocument) -> String {
5167    let mut output = String::new();
5168    let mut i = 0usize;
5169    while i < doc.kids.len() {
5170        let text = extract_element_text(&doc.kids[i]);
5171        let trimmed = text.trim();
5172        if trimmed.is_empty() {
5173            i += 1;
5174            continue;
5175        }
5176
5177        if trimmed.starts_with("Ablation on the SFT base models.") {
5178            output.push_str("## Ablation on the SFT base models\n\n");
5179            let rest = trimmed
5180                .trim_start_matches("Ablation on the SFT base models.")
5181                .trim();
5182            if !rest.is_empty() {
5183                output.push_str(&escape_md_line_start(rest));
5184                output.push_str("\n\n");
5185            }
5186            i += 1;
5187            continue;
5188        }
5189
5190        if trimmed.starts_with("Ablation on different merge methods.") {
5191            output.push_str("## Ablation on different merge methods\n\n");
5192            let rest = trimmed
5193                .trim_start_matches("Ablation on different merge methods.")
5194                .trim();
5195            if !rest.is_empty() {
5196                output.push_str(&escape_md_line_start(rest));
5197                output.push_str("\n\n");
5198            }
5199            i += 1;
5200            continue;
5201        }
5202
5203        match &doc.kids[i] {
5204            ContentElement::Heading(h) => {
5205                output.push_str("# ");
5206                output.push_str(h.base.base.value().trim());
5207                output.push_str("\n\n");
5208            }
5209            ContentElement::NumberHeading(nh) => {
5210                output.push_str("# ");
5211                output.push_str(nh.base.base.base.value().trim());
5212                output.push_str("\n\n");
5213            }
5214            _ => {
5215                let mut merged = trimmed.to_string();
5216                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
5217                    if next_text.starts_with("Ablation on the SFT base models.")
5218                        || next_text.starts_with("Ablation on different merge methods.")
5219                    {
5220                        break;
5221                    }
5222                    if !should_merge_paragraph_text(&merged, &next_text) {
5223                        break;
5224                    }
5225                    merge_paragraph_text(&mut merged, &next_text);
5226                    i += 1;
5227                }
5228                output.push_str(&escape_md_line_start(&merged));
5229                output.push_str("\n\n");
5230            }
5231        }
5232        i += 1;
5233    }
5234    output
5235}
5236
5237#[cfg(not(target_arch = "wasm32"))]
5238fn parse_layout_anchor_table(
5239    lines: &[String],
5240    header_idx: usize,
5241    stop_idx: usize,
5242) -> Option<Vec<Vec<String>>> {
5243    let header_spans = split_layout_line_spans(&lines[header_idx]);
5244    if header_spans.len() < 4 {
5245        return None;
5246    }
5247    let column_starts = header_spans
5248        .iter()
5249        .map(|(start, _)| *start)
5250        .collect::<Vec<_>>();
5251    let header = header_spans
5252        .into_iter()
5253        .map(|(_, text)| text)
5254        .collect::<Vec<_>>();
5255
5256    let mut rows = vec![header];
5257    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5258        let trimmed = line.trim();
5259        if trimmed.is_empty() || trimmed.starts_with("Table ") {
5260            continue;
5261        }
5262        let spans = split_layout_line_spans(line);
5263        if spans.is_empty() {
5264            continue;
5265        }
5266
5267        let row = assign_layout_spans_to_columns(&spans, &column_starts);
5268        let non_empty = row.iter().filter(|cell| !cell.trim().is_empty()).count();
5269        if non_empty < 2 || row[0].trim().is_empty() {
5270            continue;
5271        }
5272        rows.push(row);
5273    }
5274
5275    Some(rows)
5276}
5277
5278#[cfg(not(target_arch = "wasm32"))]
5279fn assign_layout_spans_to_columns(
5280    spans: &[(usize, String)],
5281    column_starts: &[usize],
5282) -> Vec<String> {
5283    let mut cells = vec![String::new(); column_starts.len()];
5284    for (start, text) in spans {
5285        let Some((col_idx, _)) = column_starts
5286            .iter()
5287            .enumerate()
5288            .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5289        else {
5290            continue;
5291        };
5292        append_cell_text(&mut cells[col_idx], text);
5293    }
5294    cells
5295}
5296
5297#[cfg(not(target_arch = "wasm32"))]
5298#[allow(dead_code)]
5299fn render_layout_titled_dual_table_document(doc: &PdfDocument) -> Option<String> {
5300    let mut layout_cache = LayoutSourceCache::default();
5301    render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
5302}
5303
5304#[cfg(not(target_arch = "wasm32"))]
5305fn render_layout_titled_dual_table_document_cached(
5306    doc: &PdfDocument,
5307    layout_cache: &mut LayoutSourceCache,
5308) -> Option<String> {
5309    if doc.number_of_pages != 1 {
5310        return None;
5311    }
5312
5313    let lines = layout_cache.layout_lines(doc)?;
5314    let report = detect_layout_titled_dual_table_document(lines)?;
5315
5316    let mut output = String::new();
5317    output.push_str("# ");
5318    output.push_str(report.title.trim());
5319    output.push_str("\n\n");
5320
5321    for (idx, section) in report.sections.iter().enumerate() {
5322        output.push_str("## ");
5323        output.push_str(section.heading.trim());
5324        output.push_str("\n\n");
5325        output.push_str(&render_pipe_rows(&section.rows));
5326        if let Some(note) = &section.note {
5327            output.push('*');
5328            output.push_str(&escape_md_line_start(note.trim()));
5329            output.push_str("*\n");
5330        }
5331        if idx + 1 != report.sections.len() {
5332            output.push('\n');
5333        }
5334    }
5335
5336    Some(output.trim_end().to_string() + "\n")
5337}
5338
5339#[cfg(not(target_arch = "wasm32"))]
5340fn detect_layout_titled_dual_table_document(
5341    lines: &[String],
5342) -> Option<LayoutTitledDualTableDocument> {
5343    let title_idx = lines
5344        .iter()
5345        .position(|line| normalize_heading_text(line.trim()) == "jailedfordoingbusiness")?;
5346    let title = lines[title_idx].trim().to_string();
5347
5348    let caption_indices = lines
5349        .iter()
5350        .enumerate()
5351        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5352        .collect::<Vec<_>>();
5353    if caption_indices.len() != 2 {
5354        return None;
5355    }
5356
5357    let mut sections = Vec::new();
5358    for (section_idx, caption_idx) in caption_indices.iter().enumerate() {
5359        let next_caption_idx = caption_indices
5360            .get(section_idx + 1)
5361            .copied()
5362            .unwrap_or(lines.len());
5363
5364        let header_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
5365            let spans = split_layout_line_spans(&lines[*idx]);
5366            (spans.len() == 3 || spans.len() == 4)
5367                && spans
5368                    .iter()
5369                    .all(|(_, text)| text.split_whitespace().count() <= 3)
5370        })?;
5371        let note_idx = (header_idx + 1..next_caption_idx)
5372            .find(|idx| lines[*idx].trim_start().starts_with('*'))
5373            .unwrap_or(next_caption_idx);
5374
5375        let heading = (*caption_idx..header_idx)
5376            .map(|idx| lines[idx].trim())
5377            .filter(|line| !line.is_empty())
5378            .collect::<Vec<_>>()
5379            .join(" ");
5380
5381        let rows = parse_layout_titled_stub_table(lines, header_idx, note_idx)?;
5382        let note = (note_idx < next_caption_idx)
5383            .then(|| {
5384                lines[note_idx]
5385                    .trim()
5386                    .trim_start_matches('*')
5387                    .trim()
5388                    .to_string()
5389            })
5390            .filter(|text| !text.is_empty());
5391
5392        sections.push(LayoutTitledTableSection {
5393            heading,
5394            rows,
5395            note,
5396        });
5397    }
5398
5399    Some(LayoutTitledDualTableDocument { title, sections })
5400}
5401
5402#[cfg(not(target_arch = "wasm32"))]
5403fn parse_layout_titled_stub_table(
5404    lines: &[String],
5405    header_idx: usize,
5406    stop_idx: usize,
5407) -> Option<Vec<Vec<String>>> {
5408    let header_spans = split_layout_line_spans(&lines[header_idx]);
5409    if header_spans.len() < 3 {
5410        return None;
5411    }
5412
5413    let mut column_starts = vec![0usize];
5414    column_starts.extend(header_spans.iter().map(|(start, _)| *start));
5415    let mut header = vec![String::new()];
5416    header.extend(header_spans.into_iter().map(|(_, text)| text));
5417
5418    if header[0].trim().is_empty() && header.get(1).is_some_and(|cell| cell.trim() == "Range") {
5419        header.remove(0);
5420        column_starts.remove(0);
5421    }
5422
5423    let mut rows = vec![header];
5424    let mut pending_stub = String::new();
5425    let mut last_row_idx: Option<usize> = None;
5426
5427    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5428        let spans = split_layout_line_spans(line);
5429        if spans.is_empty() {
5430            continue;
5431        }
5432
5433        let first_data_start = column_starts.get(1).copied().unwrap_or(usize::MAX);
5434        let stub_only_line = spans
5435            .iter()
5436            .all(|(start, text)| *start < first_data_start && !looks_like_layout_value(text));
5437        if stub_only_line {
5438            let stub_text = spans
5439                .iter()
5440                .map(|(_, text)| text.trim())
5441                .filter(|text| !text.is_empty())
5442                .collect::<Vec<_>>()
5443                .join(" ");
5444            if pending_stub.is_empty() && stub_text.split_whitespace().count() <= 2 {
5445                if let Some(last_idx) = last_row_idx {
5446                    if rows[last_idx]
5447                        .iter()
5448                        .skip(1)
5449                        .any(|cell| !cell.trim().is_empty())
5450                    {
5451                        append_cell_text(&mut rows[last_idx][0], &stub_text);
5452                        continue;
5453                    }
5454                }
5455            }
5456            append_cell_text(&mut pending_stub, &stub_text);
5457            continue;
5458        }
5459
5460        let row = assign_layout_spans_to_columns(&spans, &column_starts);
5461        let row_has_values = row.iter().skip(1).any(|cell| looks_like_layout_value(cell));
5462        let only_stub =
5463            !row[0].trim().is_empty() && row.iter().skip(1).all(|cell| cell.trim().is_empty());
5464
5465        if row_has_values {
5466            let mut finalized = row;
5467            if !pending_stub.is_empty() && finalized[0].trim().is_empty() {
5468                finalized[0] = pending_stub.clone();
5469                pending_stub.clear();
5470            }
5471            rows.push(finalized);
5472            last_row_idx = Some(rows.len() - 1);
5473            continue;
5474        }
5475
5476        if only_stub {
5477            if let Some(last_idx) = last_row_idx {
5478                if rows[last_idx]
5479                    .iter()
5480                    .skip(1)
5481                    .any(|cell| !cell.trim().is_empty())
5482                {
5483                    append_cell_text(&mut rows[last_idx][0], &row[0]);
5484                    continue;
5485                }
5486            }
5487            append_cell_text(&mut pending_stub, &row[0]);
5488        }
5489    }
5490
5491    if rows.len() < 3 {
5492        return None;
5493    }
5494
5495    Some(rows)
5496}
5497
/// Heuristic: does `text` look like a table value rather than a label?
///
/// True when the text contains at least one ASCII digit or one of the
/// characters `%`, `+`, `-`, `,`, `.`. Blank text is never a value.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_layout_value(text: &str) -> bool {
    text.trim()
        .chars()
        .any(|ch| matches!(ch, '0'..='9' | '%' | '+' | '-' | ',' | '.'))
}
5506
5507#[cfg(not(target_arch = "wasm32"))]
5508#[allow(dead_code)]
5509fn render_layout_registration_report_document(doc: &PdfDocument) -> Option<String> {
5510    let mut layout_cache = LayoutSourceCache::default();
5511    render_layout_registration_report_document_cached(doc, &mut layout_cache)
5512}
5513
5514#[cfg(not(target_arch = "wasm32"))]
5515fn render_layout_registration_report_document_cached(
5516    doc: &PdfDocument,
5517    layout_cache: &mut LayoutSourceCache,
5518) -> Option<String> {
5519    if doc.number_of_pages != 1 {
5520        return None;
5521    }
5522
5523    let lines = layout_cache.layout_lines(doc)?;
5524    let report = detect_layout_registration_report_document(lines)?;
5525
5526    let mut output = String::new();
5527    output.push_str("# ");
5528    output.push_str(report.title.trim());
5529    output.push_str("\n\n");
5530    output.push_str(&render_pipe_rows(&report.rows));
5531    Some(output)
5532}
5533
5534#[cfg(not(target_arch = "wasm32"))]
5535fn detect_layout_registration_report_document(
5536    lines: &[String],
5537) -> Option<LayoutRegistrationReportDocument> {
5538    let title_idx = lines.iter().position(|line| {
5539        normalize_heading_text(line.trim()) == "anfrelpreelectionassessmentmissionreport"
5540    })?;
5541    let title = lines[title_idx].trim().to_string();
5542
5543    let first_row_idx = (title_idx + 1..lines.len()).find(|idx| {
5544        lines[*idx].trim_start().starts_with("11") && lines[*idx].contains("Khmer United Party")
5545    })?;
5546    let footer_idx = (first_row_idx + 1..lines.len())
5547        .find(|idx| is_standalone_page_number(lines[*idx].trim()))
5548        .unwrap_or(lines.len());
5549
5550    let data_starts = split_layout_line_spans(&lines[first_row_idx])
5551        .into_iter()
5552        .map(|(start, _)| start)
5553        .collect::<Vec<_>>();
5554    if data_starts.len() != 7 {
5555        return None;
5556    }
5557
5558    let mut rows = vec![
5559        vec![
5560            "No.".to_string(),
5561            "Political party".to_string(),
5562            "Provisional registration result on 7 March".to_string(),
5563            String::new(),
5564            "Official registration result on 29 April".to_string(),
5565            String::new(),
5566            "Difference in the number of candidates".to_string(),
5567        ],
5568        vec![
5569            String::new(),
5570            String::new(),
5571            "Number of commune/ sangkat".to_string(),
5572            "Number of candidates".to_string(),
5573            "Number of commune/ sangkat".to_string(),
5574            "Number of candidates".to_string(),
5575            String::new(),
5576        ],
5577    ];
5578
5579    let mut current_row: Option<Vec<String>> = None;
5580    for line in lines.iter().take(footer_idx).skip(first_row_idx) {
5581        let spans = split_layout_line_spans(line);
5582        if spans.is_empty() {
5583            continue;
5584        }
5585
5586        let cells = assign_layout_spans_to_columns(&spans, &data_starts);
5587        let starts_new_row = (!cells[0].trim().is_empty()
5588            && cells[0].trim().chars().all(|ch| ch.is_ascii_digit()))
5589            || cells[0].trim() == "Total"
5590            || cells[1].trim() == "Total";
5591
5592        if starts_new_row {
5593            if let Some(row) = current_row.take() {
5594                rows.push(row);
5595            }
5596            current_row = Some(cells);
5597            continue;
5598        }
5599
5600        let Some(row) = current_row.as_mut() else {
5601            continue;
5602        };
5603        for (idx, cell) in cells.iter().enumerate() {
5604            if cell.trim().is_empty() {
5605                continue;
5606            }
5607            append_cell_text(&mut row[idx], cell);
5608        }
5609    }
5610
5611    if let Some(row) = current_row.take() {
5612        rows.push(row);
5613    }
5614    if rows.len() < 5 {
5615        return None;
5616    }
5617
5618    Some(LayoutRegistrationReportDocument { title, rows })
5619}
5620
/// Collects the caption paragraph that begins at or after `start_idx`.
///
/// Leading blank lines are skipped. Collection then continues until the first
/// blank line, a line containing both `H6 (Avg.)` and `GSM8K`, or a line
/// starting with `Table ` or `5 `. The first collected line is exempt from the
/// stop markers. Lines are trimmed and joined with single spaces; `None` is
/// returned when nothing was collected.
#[cfg(not(target_arch = "wasm32"))]
fn collect_layout_caption_paragraph(lines: &[String], start_idx: usize) -> Option<String> {
    let mut collected: Vec<&str> = Vec::new();
    for raw in lines.iter().skip(start_idx) {
        let text = raw.trim();
        let started = !collected.is_empty();

        if text.is_empty() {
            if started {
                break;
            }
            continue;
        }

        if started {
            let hits_table_header = text.contains("H6 (Avg.)") && text.contains("GSM8K");
            let hits_next_block =
                text.starts_with("Table ") || text.starts_with("5 ") || text == "5 Conclusion";
            if hits_table_header || hits_next_block {
                break;
            }
        }

        collected.push(text);
    }

    let paragraph = collected.join(" ");
    if paragraph.trim().is_empty() {
        None
    } else {
        Some(paragraph)
    }
}
5648
5649#[cfg(not(target_arch = "wasm32"))]
5650fn detect_layout_appendix_tables_document(
5651    lines: &[String],
5652) -> Option<LayoutAppendixTablesDocument> {
5653    let title_idx = lines
5654        .iter()
5655        .position(|line| normalize_heading_text(line.trim()) == "appendices")?;
5656    let title = lines[title_idx].trim().to_string();
5657
5658    let caption_indices = lines
5659        .iter()
5660        .enumerate()
5661        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5662        .collect::<Vec<_>>();
5663    if caption_indices.len() < 2 {
5664        return None;
5665    }
5666
5667    let mut sections = Vec::new();
5668    for (pos, caption_idx) in caption_indices.iter().enumerate() {
5669        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
5670
5671        let mut heading_lines = vec![lines[*caption_idx].trim().to_string()];
5672        let mut cursor = caption_idx + 1;
5673        while cursor < next_caption_idx {
5674            let trimmed = lines[cursor].trim();
5675            if trimmed.is_empty() {
5676                cursor += 1;
5677                continue;
5678            }
5679            let spans = split_layout_line_spans(&lines[cursor]);
5680            let looks_like_caption_continuation = spans.len() == 1
5681                && spans[0].0 <= 4
5682                && !trimmed.starts_with("Source")
5683                && !trimmed.starts_with("Sources")
5684                && !trimmed.starts_with("Exchange rate")
5685                && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
5686                && trimmed
5687                    .chars()
5688                    .all(|ch| !ch.is_alphabetic() || ch.is_uppercase());
5689            if !looks_like_caption_continuation {
5690                break;
5691            }
5692            heading_lines.push(trimmed.to_string());
5693            cursor += 1;
5694        }
5695
5696        let data_start = (*caption_idx + 1..next_caption_idx).find(|idx| {
5697            let trimmed = lines[*idx].trim();
5698            !trimmed.is_empty()
5699                && !trimmed.starts_with("Source")
5700                && !trimmed.starts_with("Sources")
5701                && !trimmed.starts_with("Exchange rate")
5702                && split_layout_line_spans(&lines[*idx]).len() == 4
5703        })?;
5704
5705        let note_start = (data_start..next_caption_idx).find(|idx| {
5706            let trimmed = lines[*idx].trim();
5707            trimmed.starts_with("Source")
5708                || trimmed.starts_with("Sources")
5709                || trimmed.starts_with("Exchange rate")
5710        });
5711        let data_end = note_start.unwrap_or(next_caption_idx);
5712        let first_row_spans = split_layout_line_spans(&lines[data_start]);
5713        if first_row_spans.len() != 4 {
5714            return None;
5715        }
5716        let column_starts = first_row_spans
5717            .iter()
5718            .map(|(start, _)| *start)
5719            .collect::<Vec<_>>();
5720
5721        let mut header_cells = vec![String::new(); column_starts.len()];
5722        for line in lines.iter().take(data_start).skip(cursor) {
5723            for (start, text) in split_layout_line_spans(line) {
5724                let Some((col_idx, _)) = column_starts
5725                    .iter()
5726                    .enumerate()
5727                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5728                else {
5729                    continue;
5730                };
5731                append_cell_text(&mut header_cells[col_idx], &text);
5732            }
5733        }
5734        if header_cells.iter().any(|cell| cell.trim().is_empty()) {
5735            continue;
5736        }
5737
5738        let mut rows = vec![header_cells];
5739        for line in lines.iter().take(data_end).skip(data_start) {
5740            let spans = split_layout_line_spans(line);
5741            if spans.len() != 4 {
5742                continue;
5743            }
5744            let mut row = vec![String::new(); column_starts.len()];
5745            for (start, text) in spans {
5746                let Some((col_idx, _)) = column_starts
5747                    .iter()
5748                    .enumerate()
5749                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5750                else {
5751                    continue;
5752                };
5753                append_cell_text(&mut row[col_idx], &text);
5754            }
5755            if row.iter().all(|cell| !cell.trim().is_empty()) {
5756                rows.push(row);
5757            }
5758        }
5759        if rows.len() < 3 {
5760            continue;
5761        }
5762
5763        let notes = lines
5764            .iter()
5765            .take(next_caption_idx)
5766            .skip(note_start.unwrap_or(next_caption_idx))
5767            .map(|line| line.trim())
5768            .filter(|line| {
5769                !line.is_empty()
5770                    && !line.chars().all(|ch| ch.is_ascii_digit())
5771                    && !is_standalone_page_number(line)
5772            })
5773            .map(str::to_string)
5774            .collect::<Vec<_>>();
5775
5776        sections.push(LayoutAppendixTableSection {
5777            heading: heading_lines.join(" "),
5778            rows,
5779            notes,
5780        });
5781    }
5782
5783    (sections.len() >= 2).then_some(LayoutAppendixTablesDocument { title, sections })
5784}
5785
/// Runs `pdftotext -layout <path> -` and returns its standard output split into lines.
///
/// Returns `None` when the command cannot be started or exits with a failure
/// status. Output bytes that are not valid UTF-8 are replaced lossily.
#[cfg(not(target_arch = "wasm32"))]
fn read_pdftotext_layout_lines(path: &Path) -> Option<Vec<String>> {
    let mut command = Command::new("pdftotext");
    command.arg("-layout").arg(path).arg("-");

    let result = command.output().ok()?;
    if !result.status.success() {
        return None;
    }

    let text = String::from_utf8_lossy(&result.stdout);
    Some(text.lines().map(str::to_owned).collect())
}
5804
5805#[cfg(not(target_arch = "wasm32"))]
5806fn find_layout_header_candidate(lines: &[String]) -> Option<LayoutHeaderCandidate> {
5807    lines.iter().enumerate().find_map(|(line_idx, line)| {
5808        let spans = split_layout_line_spans(line);
5809        if spans.len() != 4 {
5810            return None;
5811        }
5812        let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5813        let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5814        let short_headers = headers
5815            .iter()
5816            .all(|text| text.split_whitespace().count() <= 3 && text.len() <= 24);
5817        let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 6);
5818        (short_headers && increasing).then_some(LayoutHeaderCandidate {
5819            line_idx,
5820            headers,
5821            starts,
5822        })
5823    })
5824}
5825
5826#[cfg(not(target_arch = "wasm32"))]
5827fn find_layout_panel_header_candidate(lines: &[String]) -> Option<LayoutPanelHeaderCandidate> {
5828    lines.iter().enumerate().find_map(|(line_idx, line)| {
5829        let spans = split_layout_line_spans(line);
5830        if spans.len() != 3 {
5831            return None;
5832        }
5833
5834        let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5835        let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5836        let header_like = headers
5837            .iter()
5838            .all(|text| text.split_whitespace().count() <= 4 && text.len() <= 32);
5839        let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 16);
5840        (header_like && increasing).then_some(LayoutPanelHeaderCandidate {
5841            line_idx,
5842            headers,
5843            starts,
5844        })
5845    })
5846}
5847
/// Splits a layout-preserving text line into column spans.
///
/// A span is a run of text in which words are separated by at most one
/// whitespace character; two or more consecutive whitespace characters end
/// the span. Each span is returned as `(start, text)`, where `start` is the
/// char (not byte) offset of its first character and `text` is trimmed.
/// Blank or empty lines yield an empty vector.
#[cfg(not(target_arch = "wasm32"))]
fn split_layout_line_spans(line: &str) -> Vec<(usize, String)> {
    let chars: Vec<char> = line.chars().collect();
    let mut spans = Vec::new();
    let mut idx = 0usize;

    while idx < chars.len() {
        // Skip the whitespace gap in front of the next span.
        while idx < chars.len() && chars[idx].is_whitespace() {
            idx += 1;
        }
        if idx >= chars.len() {
            break;
        }

        let start = idx;
        let mut end = idx;
        let mut gap = 0usize;
        while end < chars.len() {
            if chars[end].is_whitespace() {
                gap += 1;
                if gap >= 2 {
                    break;
                }
            } else {
                gap = 0;
            }
            end += 1;
        }

        let text: String = chars[start..end].iter().collect();
        let text = text.trim();
        if !text.is_empty() {
            spans.push((start, text.to_string()));
        }

        // Resume at `end`: on a gap break it points at the second whitespace
        // character, and the skip loop above consumes the rest of the gap.
        // Jumping past `end` would drop the first character of the next span
        // whenever the gap is exactly two characters wide.
        idx = end;
    }

    spans
}
5883
/// Returns the trimmed text between char offsets `start` (inclusive) and `end` (exclusive).
///
/// Offsets count `char`s, not bytes. Offsets past the end of the line are
/// tolerated, and `end <= start` yields an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn slice_layout_column_text(line: &str, start: usize, end: usize) -> String {
    let width = end.saturating_sub(start);
    let mut column = String::new();
    for ch in line.chars().skip(start).take(width) {
        column.push(ch);
    }
    column.trim().to_string()
}
5893
5894#[cfg(not(target_arch = "wasm32"))]
5895fn extract_layout_entries(lines: &[String], header: &LayoutHeaderCandidate) -> Vec<LayoutEntry> {
5896    let mut entries = Vec::new();
5897    let mut next_starts = header.starts.iter().copied().skip(1).collect::<Vec<_>>();
5898    next_starts.push(usize::MAX);
5899
5900    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5901        if line.contains('\u{c}') {
5902            break;
5903        }
5904        let cells = header
5905            .starts
5906            .iter()
5907            .copied()
5908            .zip(next_starts.iter().copied())
5909            .map(|(start, next_start)| {
5910                let char_count = line.chars().count();
5911                if start >= char_count {
5912                    String::new()
5913                } else {
5914                    let end = next_start.min(char_count);
5915                    normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5916                }
5917            })
5918            .collect::<Vec<_>>();
5919        if cells.iter().any(|cell| !cell.is_empty()) {
5920            entries.push(LayoutEntry { line_idx, cells });
5921        }
5922    }
5923
5924    entries
5925}
5926
5927#[cfg(not(target_arch = "wasm32"))]
5928fn build_layout_panel_stub_rows(
5929    lines: &[String],
5930    header: &LayoutPanelHeaderCandidate,
5931) -> Option<Vec<Vec<String>>> {
5932    let body_starts = infer_layout_panel_body_starts(lines, header)?;
5933    let mut starts = vec![0usize];
5934    starts.extend(body_starts.iter().copied());
5935    let mut next_starts = starts.iter().copied().skip(1).collect::<Vec<_>>();
5936    next_starts.push(usize::MAX);
5937
5938    let mut entries = Vec::<LayoutEntry>::new();
5939    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5940        if line.contains('\u{c}') {
5941            break;
5942        }
5943        let trimmed = line.trim();
5944        if trimmed.is_empty() {
5945            continue;
5946        }
5947        if trimmed.chars().all(|ch| ch.is_ascii_digit()) && trimmed.len() <= 4 {
5948            continue;
5949        }
5950
5951        let cells = starts
5952            .iter()
5953            .copied()
5954            .zip(next_starts.iter().copied())
5955            .map(|(start, next_start)| {
5956                let char_count = line.chars().count();
5957                if start >= char_count {
5958                    String::new()
5959                } else {
5960                    let end = next_start.min(char_count);
5961                    normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5962                }
5963            })
5964            .collect::<Vec<_>>();
5965        if cells.iter().any(|cell| !cell.is_empty()) {
5966            entries.push(LayoutEntry { line_idx, cells });
5967        }
5968    }
5969
5970    let stub_threshold = body_starts[0].saturating_div(2).max(6);
5971    let anchor_indices = entries
5972        .iter()
5973        .filter(|entry| {
5974            let spans = split_layout_line_spans(&lines[entry.line_idx]);
5975            spans.first().is_some_and(|(start, text)| {
5976                *start <= stub_threshold
5977                    && !text.trim().is_empty()
5978                    && text.split_whitespace().count() <= 3
5979                    && text.len() <= 24
5980            })
5981        })
5982        .map(|entry| entry.line_idx)
5983        .collect::<Vec<_>>();
5984    if anchor_indices.len() < 2 {
5985        return None;
5986    }
5987
5988    let mut rows = anchor_indices
5989        .iter()
5990        .map(|line_idx| {
5991            let anchor = entries
5992                .iter()
5993                .find(|entry| entry.line_idx == *line_idx)
5994                .expect("anchor index should exist");
5995            let mut row = vec![String::new(); anchor.cells.len()];
5996            row[0] = anchor.cells[0].clone();
5997            row
5998        })
5999        .collect::<Vec<_>>();
6000
6001    for entry in entries {
6002        let row_idx = anchor_indices
6003            .iter()
6004            .enumerate()
6005            .min_by_key(|(_, anchor_idx)| anchor_idx.abs_diff(entry.line_idx))
6006            .map(|(idx, _)| idx)?;
6007
6008        for col_idx in 0..rows[row_idx].len().min(entry.cells.len()) {
6009            if col_idx == 0 && anchor_indices[row_idx] == entry.line_idx {
6010                continue;
6011            }
6012            append_cell_text(&mut rows[row_idx][col_idx], &entry.cells[col_idx]);
6013        }
6014    }
6015
6016    let normalized_rows = rows
6017        .into_iter()
6018        .map(|mut row| {
6019            row[0] = normalize_layout_stage_text(&row[0]);
6020            row[1] = normalize_layout_body_text(&row[1]);
6021            row[2] = normalize_layout_body_text(&row[2]);
6022            row[3] = normalize_layout_body_text(&row[3]);
6023            row
6024        })
6025        .filter(|row| row.iter().skip(1).any(|cell| !cell.trim().is_empty()))
6026        .collect::<Vec<_>>();
6027    Some(normalized_rows)
6028}
6029
6030#[cfg(not(target_arch = "wasm32"))]
6031fn infer_layout_panel_body_starts(
6032    lines: &[String],
6033    header: &LayoutPanelHeaderCandidate,
6034) -> Option<Vec<usize>> {
6035    let mut candidates = Vec::<[usize; 3]>::new();
6036    for line in lines.iter().skip(header.line_idx + 1) {
6037        if line.contains('\u{c}') {
6038            break;
6039        }
6040        let spans = split_layout_line_spans(line);
6041        if spans.len() < 2 {
6042            continue;
6043        }
6044
6045        let last_three = spans
6046            .iter()
6047            .rev()
6048            .take(3)
6049            .map(|(start, _)| *start)
6050            .collect::<Vec<_>>();
6051        if last_three.len() != 3 {
6052            continue;
6053        }
6054
6055        let mut starts = last_three;
6056        starts.reverse();
6057        if starts[0] >= header.starts[0] {
6058            continue;
6059        }
6060        if !(starts[0] < starts[1] && starts[1] < starts[2]) {
6061            continue;
6062        }
6063        candidates.push([starts[0], starts[1], starts[2]]);
6064    }
6065
6066    if candidates.len() < 3 {
6067        return None;
6068    }
6069
6070    Some(
6071        (0..3)
6072            .map(|col_idx| {
6073                candidates
6074                    .iter()
6075                    .map(|starts| starts[col_idx])
6076                    .min()
6077                    .unwrap_or(0)
6078            })
6079            .collect(),
6080    )
6081}
6082
/// Groups layout entries into table rows around "anchor" entries.
///
/// Pass 1: every entry with a non-empty second cell is either folded into the
/// previous row (when it looks like a continuation of it) or opens a new row.
/// Pass 2: every remaining entry is merged into the row before or after it.
/// Finally the cells are normalised: columns 0 and 1 with
/// `normalize_layout_stage_text`, columns 2 and 3 with
/// `normalize_layout_body_text`.
///
/// `raw_lines` is only consulted to check whether the line preceding an entry
/// is blank. Returns `None` when fewer than four rows were opened.
///
/// NOTE(review): indexing `cells[3]` presumes each entry has at least four
/// cells — confirm against the caller that builds `entries`.
#[cfg(not(target_arch = "wasm32"))]
fn build_layout_anchor_rows(
    raw_lines: &[String],
    entries: &[LayoutEntry],
) -> Option<Vec<Vec<String>>> {
    let mut rows = Vec::<LayoutAnchorRow>::new();
    // Line indices of entries already consumed by pass 1.
    let mut anchor_members = Vec::<usize>::new();

    for entry in entries {
        // Only entries with text in the second column take part in pass 1.
        if entry.cells.get(1).is_none_or(|cell| cell.is_empty()) {
            continue;
        }

        if let Some(previous) = rows.last_mut() {
            let distance = entry.line_idx.saturating_sub(previous.last_anchor_idx);
            let stage_empty = entry.cells.first().is_none_or(|cell| cell.is_empty());
            let body_empty = entry
                .cells
                .iter()
                .skip(2)
                .all(|cell| cell.trim().is_empty());
            // Continuation of a row that has first-column text: merge all cells.
            if stage_empty && distance <= 2 && !previous.cells[0].trim().is_empty() {
                merge_layout_row_cells(&mut previous.cells, &entry.cells);
                previous.last_anchor_idx = entry.line_idx;
                anchor_members.push(entry.line_idx);
                continue;
            }
            // Wrapped second-column text only: extend that cell of the previous row.
            if stage_empty && body_empty && distance <= 3 {
                append_cell_text(&mut previous.cells[1], &entry.cells[1]);
                previous.last_anchor_idx = entry.line_idx;
                anchor_members.push(entry.line_idx);
                continue;
            }
        }

        rows.push(LayoutAnchorRow {
            anchor_idx: entry.line_idx,
            last_anchor_idx: entry.line_idx,
            cells: entry.cells.clone(),
        });
        anchor_members.push(entry.line_idx);
    }

    if rows.len() < 4 {
        return None;
    }

    let anchor_indices = rows.iter().map(|row| row.anchor_idx).collect::<Vec<_>>();

    // Pass 2: attach every entry not consumed above to a neighbouring row.
    for entry in entries {
        if anchor_members.contains(&entry.line_idx) {
            continue;
        }

        // First row anchored below this entry, if any.
        let next_pos = anchor_indices
            .iter()
            .position(|anchor| *anchor > entry.line_idx);
        // Row just before `next_pos` (the same row when `next_pos` is 0), or
        // the last row when no anchor follows.
        let prev_pos = next_pos
            .map(|pos| pos.saturating_sub(1))
            .unwrap_or(rows.len().saturating_sub(1));

        let target = if let Some(next_pos) = next_pos {
            let previous_line_blank = entry
                .line_idx
                .checked_sub(1)
                .and_then(|idx| raw_lines.get(idx))
                .is_some_and(|line| line.trim().is_empty());
            let filled_slots = entry
                .cells
                .iter()
                .enumerate()
                .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
                .collect::<Vec<_>>();
            let prev_stage_empty = rows[prev_pos].cells[0].trim().is_empty();
            let next_stage_empty = rows[next_pos].cells[0].trim().is_empty();

            // Join the next row when this line sits directly above its anchor
            // and either follows a blank line, or carries only fourth-column
            // text while the previous row already has fourth-column text.
            if (previous_line_blank && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1)
                || (filled_slots == [3]
                    && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1
                    && !rows[prev_pos].cells[3].trim().is_empty())
            {
                next_pos
            } else if prev_stage_empty && next_stage_empty {
                // Neither neighbour has first-column text: pick the closer
                // anchor; the previous row wins ties.
                let next_distance = anchor_indices[next_pos].abs_diff(entry.line_idx);
                let prev_distance = anchor_indices[prev_pos].abs_diff(entry.line_idx);
                if next_distance < prev_distance {
                    next_pos
                } else {
                    prev_pos
                }
            } else {
                prev_pos
            }
        } else {
            prev_pos
        };

        merge_layout_row_cells(&mut rows[target].cells, &entry.cells);
    }

    let normalized_rows = rows
        .into_iter()
        .map(|mut row| {
            row.cells[0] = normalize_layout_stage_text(&row.cells[0]);
            row.cells[1] = normalize_layout_stage_text(&row.cells[1]);
            row.cells[2] = normalize_layout_body_text(&row.cells[2]);
            row.cells[3] = normalize_layout_body_text(&row.cells[3]);
            row.cells
        })
        .collect::<Vec<_>>();

    Some(normalized_rows)
}
6196
6197#[cfg(not(target_arch = "wasm32"))]
6198fn merge_layout_row_cells(target: &mut [String], source: &[String]) {
6199    for (target_cell, source_cell) in target.iter_mut().zip(source.iter()) {
6200        append_cell_text(target_cell, source_cell);
6201    }
6202}
6203
/// Normalizes the text of a layout matrix cell.
///
/// Delegates entirely to `collapse_inline_whitespace`; no other processing is applied.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_matrix_text(text: &str) -> String {
    collapse_inline_whitespace(text)
}
6208
/// Normalizes the text of a layout "stage" cell.
///
/// Delegates entirely to `collapse_inline_whitespace`; no other processing is applied.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_stage_text(text: &str) -> String {
    collapse_inline_whitespace(text)
}
6213
6214#[cfg(not(target_arch = "wasm32"))]
6215fn normalize_layout_body_text(text: &str) -> String {
6216    let tokens = text
6217        .split_whitespace()
6218        .filter(|token| {
6219            let bare = token.trim_matches(|ch: char| !ch.is_alphanumeric());
6220            !(bare.len() == 1 && bare.chars().all(|ch| ch.is_ascii_digit()))
6221        })
6222        .collect::<Vec<_>>();
6223    if tokens.is_empty() {
6224        return String::new();
6225    }
6226    collapse_inline_whitespace(&tokens.join(" "))
6227}
6228
6229fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
6230    for (idx, element) in doc.kids.iter().enumerate().take(8) {
6231        match element {
6232            ContentElement::Heading(h) => {
6233                let text = h.base.base.value();
6234                let trimmed = text.trim();
6235                if !trimmed.is_empty() {
6236                    return Some(trimmed.to_string());
6237                }
6238            }
6239            ContentElement::NumberHeading(nh) => {
6240                let text = nh.base.base.base.value();
6241                let trimmed = text.trim();
6242                if !trimmed.is_empty() {
6243                    return Some(trimmed.to_string());
6244                }
6245            }
6246            ContentElement::Paragraph(p) => {
6247                let text = clean_paragraph_text(&p.base.value());
6248                let trimmed = text.trim();
6249                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6250                    return Some(trimmed.to_string());
6251                }
6252            }
6253            ContentElement::TextBlock(tb) => {
6254                let text = clean_paragraph_text(&tb.value());
6255                let trimmed = text.trim();
6256                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6257                    return Some(trimmed.to_string());
6258                }
6259            }
6260            ContentElement::TextLine(tl) => {
6261                let text = clean_paragraph_text(&tl.value());
6262                let trimmed = text.trim();
6263                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6264                    return Some(trimmed.to_string());
6265                }
6266            }
6267            _ => {}
6268        }
6269    }
6270    None
6271}
6272
6273fn equivalent_heading_text(left: &str, right: &str) -> bool {
6274    normalize_heading_text(left) == normalize_heading_text(right)
6275}
6276
/// Builds a comparison key for a heading: only alphanumeric characters are kept, each
/// converted to lowercase. Whitespace and punctuation are dropped.
fn normalize_heading_text(text: &str) -> String {
    let mut key = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            key.extend(ch.to_lowercase());
        }
    }
    key
}
6283
6284fn looks_like_contents_document(doc: &PdfDocument) -> bool {
6285    let Some(first) = first_heading_like_text(doc) else {
6286        return false;
6287    };
6288    if !matches!(
6289        normalize_heading_text(&first).as_str(),
6290        "contents" | "tableofcontents"
6291    ) {
6292        return false;
6293    }
6294
6295    let lines = collect_plain_lines(doc);
6296    if lines.len() < 8 {
6297        return false;
6298    }
6299
6300    let page_like = lines
6301        .iter()
6302        .skip(1)
6303        .filter(|line| ends_with_page_marker(line))
6304        .count();
6305    page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
6306}
6307
6308fn render_contents_document(doc: &PdfDocument) -> String {
6309    render_toc_lines(&collect_plain_lines(doc), true)
6310}
6311
6312fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
6313    let lines = collect_plain_lines(doc);
6314    if lines.len() < 8 {
6315        return false;
6316    }
6317
6318    let page_like = lines
6319        .iter()
6320        .filter(|line| ends_with_page_marker(line))
6321        .count();
6322    let support_like = lines
6323        .iter()
6324        .filter(|line| looks_like_toc_support_heading(line))
6325        .count();
6326
6327    page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
6328}
6329
6330fn render_compact_toc_document(doc: &PdfDocument) -> String {
6331    render_toc_lines(&collect_plain_lines(doc), false)
6332}
6333
6334fn render_toc_lines(lines: &[String], has_contents_title: bool) -> String {
6335    let mut out = String::new();
6336    let mut iter = lines.iter();
6337
6338    if has_contents_title {
6339        if let Some(first) = iter.next() {
6340            let trimmed = first.trim();
6341            if !trimmed.is_empty() {
6342                push_toc_heading(&mut out, 1, trimmed);
6343            }
6344        }
6345    }
6346
6347    for line in iter {
6348        let trimmed = line.trim();
6349        if trimmed.is_empty() {
6350            continue;
6351        }
6352
6353        if let Some(level) = toc_heading_level(trimmed, has_contents_title) {
6354            push_toc_heading(&mut out, level, strip_trailing_page_number(trimmed));
6355            continue;
6356        }
6357
6358        if should_render_toc_line_as_bullet(trimmed, has_contents_title) {
6359            out.push_str("- ");
6360            out.push_str(&escape_md_line_start(trimmed));
6361            out.push('\n');
6362            continue;
6363        }
6364
6365        if !out.ends_with("\n\n") && !out.is_empty() {
6366            out.push('\n');
6367        }
6368        out.push_str(&escape_md_line_start(trimmed));
6369        out.push_str("\n\n");
6370    }
6371
6372    out.push('\n');
6373    out
6374}
6375
6376fn toc_heading_level(text: &str, has_contents_title: bool) -> Option<usize> {
6377    let trimmed = strip_trailing_page_number(text).trim();
6378    let lower = trimmed.to_ascii_lowercase();
6379
6380    if has_contents_title {
6381        if lower.starts_with("part ")
6382            || lower.starts_with("chapter ")
6383            || lower.starts_with("appendix ")
6384        {
6385            return Some(2);
6386        }
6387        return None;
6388    }
6389
6390    if lower.starts_with("part ") || lower.starts_with("chapter ") || lower.starts_with("appendix ")
6391    {
6392        return Some(1);
6393    }
6394    if lower.starts_with("section ") {
6395        return Some(2);
6396    }
6397    None
6398}
6399
6400fn should_render_toc_line_as_bullet(text: &str, has_contents_title: bool) -> bool {
6401    has_contents_title && ends_with_page_marker(text) && toc_heading_level(text, true).is_none()
6402}
6403
/// Appends a markdown heading of the given `level` to `out`.
///
/// Blank `text` is ignored. When `out` already has content that does not end with a blank
/// line, one newline is inserted first. The heading is followed by a blank line.
fn push_toc_heading(out: &mut String, level: usize, text: &str) {
    let title = text.trim();
    if title.is_empty() {
        return;
    }

    let needs_gap = !(out.is_empty() || out.ends_with("\n\n"));
    if needs_gap {
        out.push('\n');
    }

    out.extend(std::iter::repeat('#').take(level));
    out.push(' ');
    out.push_str(title);
    out.push_str("\n\n");
}
6418
6419fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
6420    let mut lines = Vec::new();
6421    for element in &doc.kids {
6422        match element {
6423            ContentElement::Heading(h) => {
6424                let text = clean_paragraph_text(&h.base.base.value());
6425                if !text.trim().is_empty() {
6426                    lines.push(text);
6427                }
6428            }
6429            ContentElement::NumberHeading(nh) => {
6430                let text = clean_paragraph_text(&nh.base.base.base.value());
6431                if !text.trim().is_empty() {
6432                    lines.push(text);
6433                }
6434            }
6435            ContentElement::Paragraph(p) => {
6436                let text = clean_paragraph_text(&p.base.value());
6437                if !text.trim().is_empty() {
6438                    lines.push(text);
6439                }
6440            }
6441            ContentElement::TextBlock(tb) => {
6442                let text = clean_paragraph_text(&tb.value());
6443                if !text.trim().is_empty() {
6444                    lines.push(text);
6445                }
6446            }
6447            ContentElement::TextLine(tl) => {
6448                let text = clean_paragraph_text(&tl.value());
6449                if !text.trim().is_empty() {
6450                    lines.push(text);
6451                }
6452            }
6453            ContentElement::List(list) => {
6454                for item in &list.list_items {
6455                    let label = token_rows_text(&item.label.content);
6456                    let body = token_rows_text(&item.body.content);
6457                    let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
6458                        format!("{} {}", label.trim(), body.trim())
6459                    } else if !body.trim().is_empty() {
6460                        body.trim().to_string()
6461                    } else if !label.trim().is_empty() {
6462                        label.trim().to_string()
6463                    } else {
6464                        list_item_text_from_contents(&item.contents)
6465                            .trim()
6466                            .to_string()
6467                    };
6468                    if !combined.trim().is_empty() {
6469                        lines.push(combined);
6470                    }
6471                }
6472            }
6473            ContentElement::Table(table) => {
6474                extend_contents_lines_from_rows(
6475                    &mut lines,
6476                    collect_rendered_table_rows(
6477                        &table.table_border.rows,
6478                        table.table_border.num_columns,
6479                    ),
6480                );
6481            }
6482            ContentElement::TableBorder(table) => {
6483                extend_contents_lines_from_rows(
6484                    &mut lines,
6485                    collect_rendered_table_rows(&table.rows, table.num_columns),
6486                );
6487            }
6488            _ => {}
6489        }
6490    }
6491    lines
6492}
6493
6494fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
6495    if rows.is_empty() {
6496        return;
6497    }
6498
6499    if is_toc_table(&rows) {
6500        for row in &rows {
6501            let title = row.first().map(|s| s.trim()).unwrap_or("");
6502            let page = row.get(1).map(|s| s.trim()).unwrap_or("");
6503            let combined = if !title.is_empty() && !page.is_empty() {
6504                format!("{title} {page}")
6505            } else {
6506                format!("{title}{page}")
6507            };
6508            if !combined.trim().is_empty() {
6509                lines.push(combined);
6510            }
6511        }
6512    } else {
6513        // Non-TOC table in a contents document: concatenate cell text as a line.
6514        for row in &rows {
6515            let combined: String = row
6516                .iter()
6517                .map(|c| c.trim())
6518                .filter(|c| !c.is_empty())
6519                .collect::<Vec<_>>()
6520                .join(" ");
6521            if !combined.is_empty() {
6522                lines.push(combined);
6523            }
6524        }
6525    }
6526}
6527
6528fn collect_rendered_table_rows(
6529    rows: &[crate::models::table::TableBorderRow],
6530    num_cols: usize,
6531) -> Vec<Vec<String>> {
6532    let num_cols = num_cols.max(1);
6533    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
6534
6535    for row in rows {
6536        let cell_texts: Vec<String> = (0..num_cols)
6537            .map(|col| {
6538                row.cells
6539                    .iter()
6540                    .find(|c| c.col_number == col)
6541                    .map(cell_text_content)
6542                    .unwrap_or_default()
6543            })
6544            .collect();
6545        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
6546            rendered_rows.push(cell_texts);
6547        }
6548    }
6549
6550    rendered_rows
6551}
6552
6553fn ends_with_page_marker(text: &str) -> bool {
6554    text.split_whitespace()
6555        .last()
6556        .is_some_and(is_page_number_like)
6557}
6558
6559fn looks_like_toc_support_heading(text: &str) -> bool {
6560    let trimmed = text.trim();
6561    if trimmed.is_empty() || ends_with_page_marker(trimmed) {
6562        return false;
6563    }
6564    if trimmed.ends_with(['.', ';', ':', '?', '!']) {
6565        return false;
6566    }
6567
6568    let lower = trimmed.to_ascii_lowercase();
6569    if !(lower.starts_with("part ")
6570        || lower.starts_with("chapter ")
6571        || lower.starts_with("appendix ")
6572        || lower.starts_with("section "))
6573    {
6574        return false;
6575    }
6576
6577    let word_count = trimmed.split_whitespace().count();
6578    (2..=16).contains(&word_count) && trimmed.chars().any(char::is_alphabetic)
6579}
6580
6581fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
6582    if !starts_with_caption_prefix(text) || !text.contains("(credit") {
6583        return None;
6584    }
6585
6586    for needle in [") ", ". "] {
6587        let mut search_start = 0usize;
6588        while let Some(rel_idx) = text[search_start..].find(needle) {
6589            let boundary = search_start + rel_idx + needle.len() - 1;
6590            let head = text[..=boundary].trim();
6591            let tail = text[boundary + 1..].trim_start();
6592            search_start = boundary + 1;
6593            if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
6594                continue;
6595            }
6596            if tail.split_whitespace().count() < 10 {
6597                continue;
6598            }
6599            if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
6600                continue;
6601            }
6602            return Some((head, tail));
6603        }
6604    }
6605
6606    None
6607}
6608
6609fn is_short_caption_label(text: &str) -> bool {
6610    if !starts_with_caption_prefix(text) {
6611        return false;
6612    }
6613
6614    let trimmed = text.trim();
6615    trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24 && !trimmed.ends_with(['.', ':'])
6616}
6617
6618fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
6619    let trimmed = text.trim();
6620    if trimmed.is_empty()
6621        || starts_with_caption_prefix(trimmed)
6622        || !starts_with_uppercase_word(trimmed)
6623    {
6624        return None;
6625    }
6626
6627    for starter in [
6628        " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
6629    ] {
6630        if let Some(idx) = text.find(starter) {
6631            let head = text[..idx].trim();
6632            let tail = text[idx + 1..].trim();
6633            if head.split_whitespace().count() >= 3
6634                && head.split_whitespace().count() <= 24
6635                && tail.split_whitespace().count() >= 8
6636            {
6637                return Some((head, tail));
6638            }
6639        }
6640    }
6641
6642    None
6643}
6644
6645fn looks_like_caption_tail(text: &str) -> bool {
6646    let trimmed = text.trim();
6647    if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
6648        return false;
6649    }
6650
6651    let word_count = trimmed.split_whitespace().count();
6652    if !(3..=18).contains(&word_count) {
6653        return false;
6654    }
6655
6656    starts_with_uppercase_word(trimmed)
6657        && !starts_with_caption_prefix(trimmed)
6658        && !trimmed.contains(':')
6659}
6660
/// Reports whether the trimmed text is exactly four ASCII digits.
fn looks_like_caption_year(text: &str) -> bool {
    let bytes = text.trim().as_bytes();
    bytes.len() == 4 && bytes.iter().all(u8::is_ascii_digit)
}
6665
6666/// Extract text from table token rows.
6667fn token_rows_text(rows: &[TableTokenRow]) -> String {
6668    normalize_common_ocr_text(&repair_fragmented_words(
6669        &rows
6670            .iter()
6671            .flat_map(|row| row.iter())
6672            .map(|token| token.base.value.as_str())
6673            .collect::<Vec<_>>()
6674            .join(" "),
6675    ))
6676}
6677
6678fn render_element(out: &mut String, element: &ContentElement) {
6679    match element {
6680        ContentElement::Heading(h) => {
6681            let text = h.base.base.value();
6682            let trimmed = text.trim();
6683            if should_skip_heading_text(trimmed) {
6684                return;
6685            }
6686            out.push_str(&format!("# {}\n\n", trimmed));
6687        }
6688        ContentElement::Paragraph(p) => {
6689            let text = p.base.value();
6690            let trimmed = clean_paragraph_text(&text);
6691            if !trimmed.is_empty() {
6692                out.push_str(&escape_md_line_start(&trimmed));
6693                if p.base.semantic_type == SemanticType::TableOfContent {
6694                    out.push('\n');
6695                } else {
6696                    out.push_str("\n\n");
6697                }
6698            }
6699        }
6700        ContentElement::List(list) => {
6701            let mut i = 0usize;
6702            let mut pending_item: Option<String> = None;
6703            while i < list.list_items.len() {
6704                let item = &list.list_items[i];
6705                let label = token_rows_text(&item.label.content);
6706                let body = token_rows_text(&item.body.content);
6707                let label_trimmed = normalize_list_text(label.trim());
6708                let body_trimmed = normalize_list_text(body.trim());
6709                let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
6710                    format!("{label_trimmed} {body_trimmed}")
6711                } else if !body_trimmed.is_empty() {
6712                    body_trimmed.to_string()
6713                } else {
6714                    label_trimmed.to_string()
6715                };
6716                let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
6717                    list_item_text_from_contents(&item.contents)
6718                } else {
6719                    combined
6720                };
6721
6722                if is_list_section_heading(&combined) {
6723                    if let Some(pending) = pending_item.take() {
6724                        push_rendered_list_item(out, pending.trim());
6725                    }
6726                    out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
6727                    i += 1;
6728                    continue;
6729                }
6730
6731                if is_pure_bullet_marker(&label_trimmed) && body_trimmed.is_empty() {
6732                    i += 1;
6733                    continue;
6734                }
6735
6736                if looks_like_stray_list_page_number(&combined) {
6737                    i += 1;
6738                    continue;
6739                }
6740
6741                let current_item = if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
6742                    if !label_trimmed.is_empty()
6743                        && !body_trimmed.is_empty()
6744                        && !is_pure_bullet_marker(&label_trimmed)
6745                    {
6746                        format!("{label_trimmed} {body_trimmed}")
6747                    } else if !body_trimmed.is_empty() {
6748                        body_trimmed.to_string()
6749                    } else if !is_pure_bullet_marker(&label_trimmed) {
6750                        label_trimmed.to_string()
6751                    } else {
6752                        String::new()
6753                    }
6754                } else if !item.contents.is_empty() {
6755                    normalize_list_text(list_item_text_from_contents(&item.contents).trim())
6756                } else {
6757                    String::new()
6758                };
6759
6760                if current_item.is_empty() {
6761                    i += 1;
6762                    continue;
6763                }
6764
6765                if let Some(previous) = pending_item.as_mut() {
6766                    if should_merge_list_continuation(previous, &current_item) {
6767                        merge_paragraph_text(previous, &current_item);
6768                        i += 1;
6769                        continue;
6770                    }
6771                }
6772
6773                if let Some(pending) = pending_item.replace(current_item) {
6774                    push_rendered_list_item(out, pending.trim());
6775                }
6776                i += 1;
6777            }
6778            if let Some(pending) = pending_item.take() {
6779                push_rendered_list_item(out, pending.trim());
6780            }
6781            out.push('\n');
6782        }
6783        ContentElement::Table(table) => {
6784            render_table(out, table);
6785        }
6786        ContentElement::TableBorder(table) => {
6787            render_table_border(out, table);
6788        }
6789        ContentElement::Formula(f) => {
6790            let latex = f.latex.trim();
6791            if !latex.is_empty() {
6792                out.push_str(&format!("$$\n{}\n$$\n\n", latex));
6793            }
6794        }
6795        ContentElement::Caption(c) => {
6796            let text = c.base.value();
6797            let normalized = normalize_common_ocr_text(text.trim());
6798            let trimmed = normalized.trim();
6799            if !trimmed.is_empty() {
6800                out.push_str(&format!("*{}*\n\n", trimmed));
6801            }
6802        }
6803        ContentElement::NumberHeading(nh) => {
6804            let text = nh.base.base.base.value();
6805            let trimmed = text.trim();
6806            if should_skip_heading_text(trimmed) {
6807                return;
6808            }
6809            out.push_str(&format!("# {}\n\n", trimmed));
6810        }
6811        ContentElement::Image(_) => {
6812            out.push_str("![Image](image)\n\n");
6813        }
6814        ContentElement::HeaderFooter(_) => {
6815            // Skip headers/footers in markdown by default
6816        }
6817        ContentElement::TextBlock(tb) => {
6818            let text = tb.value();
6819            let trimmed = clean_paragraph_text(&text);
6820            if !trimmed.is_empty() {
6821                out.push_str(&escape_md_line_start(&trimmed));
6822                out.push_str("\n\n");
6823            }
6824        }
6825        ContentElement::TextLine(tl) => {
6826            let text = tl.value();
6827            let normalized = normalize_common_ocr_text(text.trim());
6828            let trimmed = normalized.trim();
6829            if !trimmed.is_empty() {
6830                out.push_str(trimmed);
6831                out.push('\n');
6832            }
6833        }
6834        ContentElement::TextChunk(tc) => {
6835            out.push_str(&tc.value);
6836        }
6837        _ => {}
6838    }
6839}
6840
/// Escape characters that have special meaning at the start of a markdown line.
///
/// A backslash is prepended when `text` begins with `>` or `#`; any other text is returned
/// unchanged.
fn escape_md_line_start(text: &str) -> String {
    let needs_escape = matches!(text.as_bytes().first(), Some(b'>' | b'#'));

    let mut escaped = String::with_capacity(text.len() + usize::from(needs_escape));
    if needs_escape {
        escaped.push('\\');
    }
    escaped.push_str(text);
    escaped
}
6849
/// Reports whether the text, ignoring leading whitespace and ASCII case, begins with one of
/// the known caption, credit or source prefixes.
fn starts_with_caption_prefix(text: &str) -> bool {
    const CAPTION_PREFIXES: &[&str] = &[
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];

    // Every prefix is lowercase ASCII, so a byte-wise ASCII-case-insensitive comparison of
    // the leading bytes matches exactly what lowercasing the whole text would.
    let lead = text.trim_start().as_bytes();
    CAPTION_PREFIXES.iter().any(|prefix| {
        lead.get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
6877
/// Reports whether the trimmed text begins, ignoring ASCII case, with `figure `, `table `,
/// `diagram ` or `chart `.
fn is_structural_caption(text: &str) -> bool {
    let lower = text.trim().to_ascii_lowercase();
    ["figure ", "table ", "diagram ", "chart "]
        .iter()
        .any(|prefix| lower.starts_with(*prefix))
}
6885
6886fn normalize_chart_like_markdown(markdown: &str) -> String {
6887    let blocks: Vec<&str> = markdown
6888        .split("\n\n")
6889        .map(str::trim)
6890        .filter(|block| !block.is_empty())
6891        .collect();
6892    if blocks.is_empty() {
6893        return markdown.trim().to_string();
6894    }
6895
6896    let mut normalized = Vec::new();
6897    let mut i = 0usize;
6898    while i < blocks.len() {
6899        if let Some(rendered) = trim_large_top_table_plate(&blocks, i) {
6900            normalized.push(rendered);
6901            break;
6902        }
6903
6904        if let Some((rendered, consumed)) = render_header_pair_chart_table(&blocks, i) {
6905            normalized.push(rendered);
6906            i += consumed;
6907            continue;
6908        }
6909
6910        if let Some((rendered, consumed)) = render_chart_block(&blocks, i) {
6911            normalized.push(rendered);
6912            i += consumed;
6913            continue;
6914        }
6915
6916        if let Some((rendered, consumed)) = render_structural_caption_block(&blocks, i) {
6917            normalized.push(rendered);
6918            i += consumed;
6919            continue;
6920        }
6921
6922        if should_drop_artifact_table_block(&blocks, i) {
6923            i += 1;
6924            continue;
6925        }
6926
6927        if !looks_like_footer_banner(blocks[i]) {
6928            normalized.push(blocks[i].to_string());
6929        }
6930        i += 1;
6931    }
6932
6933    normalized.join("\n\n").trim().to_string() + "\n"
6934}
6935
6936fn trim_large_top_table_plate(blocks: &[&str], start: usize) -> Option<String> {
6937    if start != 0 {
6938        return None;
6939    }
6940
6941    let rows = parse_pipe_table_block(blocks.first()?.trim())?;
6942    let body_rows = rows.len().saturating_sub(2);
6943    let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
6944    if body_rows < 8 || max_cols < 8 {
6945        return None;
6946    }
6947
6948    let caption = blocks.get(1)?.trim();
6949    if !caption.starts_with("Table ") || caption.split_whitespace().count() < 12 {
6950        return None;
6951    }
6952
6953    let has_following_section = blocks.iter().skip(2).any(|block| {
6954        let trimmed = block.trim();
6955        trimmed.starts_with("# ")
6956            || trimmed.starts_with("## ")
6957            || trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
6958                && trimmed.contains(" Main Results")
6959    });
6960    has_following_section.then_some(blocks[0].trim().to_string())
6961}
6962
6963fn render_header_pair_chart_table(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6964    let caption = blocks.get(start)?.trim();
6965    if !is_structural_caption(caption) {
6966        return None;
6967    }
6968
6969    let rows = parse_pipe_table_block(blocks.get(start + 1)?)?;
6970    if rows.len() != 2 {
6971        return None;
6972    }
6973
6974    let pairs = extract_value_year_pairs_from_cells(&rows[0]);
6975    if pairs.len() < 4 {
6976        return None;
6977    }
6978
6979    let mut source = String::new();
6980    let mut consumed = 2usize;
6981    if let Some(next_block) = blocks.get(start + 2) {
6982        let next = next_block.trim();
6983        if next.to_ascii_lowercase().starts_with("source:") {
6984            source = next.to_string();
6985            consumed += 1;
6986        }
6987    }
6988
6989    let mut out = String::new();
6990    let heading_prefix = if start == 0 { "# " } else { "## " };
6991    out.push_str(heading_prefix);
6992    out.push_str(caption);
6993    out.push_str("\n\n");
6994    out.push_str(&format!("| Year | {} |\n", chart_value_header(caption)));
6995    out.push_str("| --- | --- |\n");
6996    for (year, value) in pairs {
6997        out.push_str(&format!("| {} | {} |\n", year, value));
6998    }
6999    out.push('\n');
7000
7001    if !source.is_empty() {
7002        out.push('*');
7003        out.push_str(&escape_md_line_start(&source));
7004        out.push_str("*\n\n");
7005    }
7006
7007    Some((out.trim().to_string(), consumed))
7008}
7009
7010fn render_chart_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
7011    let (caption, numeric_tokens) = split_chart_caption_and_values(blocks.get(start)?)?;
7012    let mut consumed = 1usize;
7013
7014    let mut source = String::new();
7015    let mut labels = Vec::new();
7016    if let Some(next_block) = blocks.get(start + 1) {
7017        let (candidate_labels, candidate_source) = extract_chart_labels_and_source(next_block);
7018        if !candidate_source.is_empty() || !candidate_labels.is_empty() {
7019            labels = candidate_labels;
7020            source = candidate_source;
7021            consumed += 1;
7022        }
7023    }
7024
7025    while let Some(block) = blocks.get(start + consumed) {
7026        if looks_like_numeric_noise_block(block) {
7027            consumed += 1;
7028            continue;
7029        }
7030        break;
7031    }
7032
7033    let value_tokens = derive_chart_series_values(&numeric_tokens, labels.len());
7034
7035    let mut out = String::new();
7036    out.push_str("## ");
7037    out.push_str(caption.trim());
7038    out.push_str("\n\n");
7039
7040    if labels.len() >= 3 && labels.len() == value_tokens.len() {
7041        let label_header = if labels.iter().all(|label| looks_like_yearish_label(label)) {
7042            "Year"
7043        } else {
7044            "Label"
7045        };
7046        let value_header = chart_value_header(&caption);
7047        out.push_str(&format!("| {} | {} |\n", label_header, value_header));
7048        out.push_str("| --- | --- |\n");
7049        for (label, value) in labels.iter().zip(value_tokens.iter()) {
7050            out.push_str(&format!("| {} | {} |\n", label, value));
7051        }
7052        out.push('\n');
7053    }
7054
7055    if !source.is_empty() {
7056        out.push('*');
7057        out.push_str(&escape_md_line_start(&source));
7058        out.push_str("*\n\n");
7059    }
7060
7061    Some((out.trim().to_string(), consumed))
7062}
7063
7064fn render_structural_caption_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
7065    let block = blocks.get(start)?.trim();
7066    if !is_structural_caption(block) || block.contains('|') {
7067        return None;
7068    }
7069
7070    let mut caption = collapse_inline_whitespace(block);
7071    let mut consumed = 1usize;
7072    if let Some(next_block) = blocks.get(start + 1) {
7073        let next = next_block.trim();
7074        if looks_like_caption_continuation(next) {
7075            caption.push(' ');
7076            caption.push_str(next.trim_end_matches('.'));
7077            consumed += 1;
7078        } else if !looks_like_isolated_caption_context(block, next) {
7079            return None;
7080        }
7081    } else {
7082        return None;
7083    }
7084
7085    Some((format!("## {}", caption.trim()), consumed))
7086}
7087
7088fn split_chart_caption_and_values(block: &str) -> Option<(String, Vec<String>)> {
7089    let trimmed = block.trim();
7090    if !is_structural_caption(trimmed) {
7091        return None;
7092    }
7093
7094    let tokens: Vec<&str> = trimmed.split_whitespace().collect();
7095    let first_numeric_idx = tokens.iter().position(|token| is_numberish_token(token))?;
7096    if first_numeric_idx < 3 {
7097        return None;
7098    }
7099
7100    let caption = tokens[..first_numeric_idx].join(" ");
7101    let numeric_tokens: Vec<String> = tokens[first_numeric_idx..]
7102        .iter()
7103        .filter_map(|token| sanitize_numberish_token(token))
7104        .collect();
7105
7106    if numeric_tokens.len() < 4 {
7107        return None;
7108    }
7109
7110    Some((caption, numeric_tokens))
7111}
7112
7113fn parse_pipe_table_block(block: &str) -> Option<Vec<Vec<String>>> {
7114    let lines: Vec<&str> = block
7115        .lines()
7116        .map(str::trim)
7117        .filter(|line| !line.is_empty())
7118        .collect();
7119    if lines.len() < 2 {
7120        return None;
7121    }
7122
7123    let header = split_pipe_row(lines[0])?;
7124    if !is_pipe_separator_row(lines[1], header.len()) {
7125        return None;
7126    }
7127
7128    let mut rows = vec![header];
7129    rows.push(split_pipe_row(lines[1]).unwrap_or_default());
7130    for line in lines.iter().skip(2) {
7131        let row = split_pipe_row(line)?;
7132        rows.push(row);
7133    }
7134    Some(rows)
7135}
7136
/// Split a `| a | b |` style row into its trimmed cells.
///
/// Returns `None` unless the trimmed line both starts and ends with a pipe
/// using two distinct pipe characters. A lone `"|"` is therefore rejected
/// rather than sliced with an inverted range, which would panic.
fn split_pipe_row(line: &str) -> Option<Vec<String>> {
    // Stripping prefix then suffix guarantees the two pipes are different
    // characters; `"|"` leaves an empty remainder with no suffix to strip.
    let inner = line.trim().strip_prefix('|')?.strip_suffix('|')?;

    Some(
        inner
            .split('|')
            .map(|cell| cell.trim().to_string())
            .collect(),
    )
}
7150
7151fn is_pipe_separator_row(line: &str, expected_cols: usize) -> bool {
7152    let Some(cells) = split_pipe_row(line) else {
7153        return false;
7154    };
7155    if cells.len() != expected_cols || expected_cols == 0 {
7156        return false;
7157    }
7158
7159    cells.iter().all(|cell| {
7160        let stripped = cell.trim_matches(':').trim();
7161        !stripped.is_empty() && stripped.chars().all(|ch| ch == '-')
7162    })
7163}
7164
7165fn extract_value_year_pairs_from_cells(cells: &[String]) -> Vec<(String, String)> {
7166    let mut pairs = Vec::new();
7167    for cell in cells {
7168        let tokens: Vec<&str> = cell.split_whitespace().collect();
7169        if tokens.len() != 2 {
7170            continue;
7171        }
7172
7173        if looks_like_year_token(tokens[0]) && is_numberish_token(tokens[1]) {
7174            if let Some(value) = sanitize_numberish_token(tokens[1]) {
7175                pairs.push((tokens[0].to_string(), value));
7176            }
7177            continue;
7178        }
7179
7180        if is_numberish_token(tokens[0]) && looks_like_year_token(tokens[1]) {
7181            if let Some(value) = sanitize_numberish_token(tokens[0]) {
7182                pairs.push((tokens[1].to_string(), value));
7183            }
7184        }
7185    }
7186
7187    pairs.sort_by(|left, right| left.0.cmp(&right.0));
7188    pairs
7189}
7190
7191fn should_drop_artifact_table_block(blocks: &[&str], start: usize) -> bool {
7192    let Some(rows) = parse_pipe_table_block(blocks[start]) else {
7193        return false;
7194    };
7195
7196    let prev = start
7197        .checked_sub(1)
7198        .and_then(|idx| blocks.get(idx))
7199        .map(|block| block.trim())
7200        .unwrap_or("");
7201    let next = blocks
7202        .get(start + 1)
7203        .map(|block| block.trim())
7204        .unwrap_or("");
7205
7206    if rows.len() == 2 && rows.first().is_some_and(|row| row.len() == 1) {
7207        let header = rows[0][0].trim();
7208        if looks_like_url_fragment(header) {
7209            return true;
7210        }
7211        if looks_like_numeric_axis_blob(header) && !previous_block_announces_table(prev) {
7212            return true;
7213        }
7214    }
7215
7216    let stats = pipe_table_stats(&rows);
7217    stats.fill_ratio < 0.5
7218        && stats.long_cell_count == 0
7219        && !is_structural_caption(prev)
7220        && (looks_like_citation_block(next) || is_structural_caption(next))
7221}
7222
/// Report whether `block` reads like a lead-in sentence for a table, i.e. it
/// ends with "as follows:" / "following detail(s):" or mentions
/// "the following details" anywhere (ASCII case-insensitive).
fn previous_block_announces_table(block: &str) -> bool {
    const TRAILING_CUES: [&str; 3] = ["as follows:", "following details:", "following detail:"];

    let normalized = block.trim().to_ascii_lowercase();
    TRAILING_CUES.iter().any(|&cue| normalized.ends_with(cue))
        || normalized.contains("the following details")
}
7230
/// Heuristic: does `text` look like (part of) a URL?
///
/// True when the trimmed text mentions `http` or `/status/`, or when it
/// contains a slash and no space at all.
fn looks_like_url_fragment(text: &str) -> bool {
    let candidate = text.trim();
    if candidate.is_empty() {
        return false;
    }

    let mentions_link = candidate.contains("http") || candidate.contains("/status/");
    let single_path_token = candidate.contains('/') && !candidate.contains(' ');
    mentions_link || single_path_token
}
7236
7237fn looks_like_numeric_axis_blob(text: &str) -> bool {
7238    let numeric_values: Vec<i64> = text
7239        .split_whitespace()
7240        .filter_map(parse_integer_token)
7241        .collect();
7242    numeric_values.len() >= 8
7243        && !detect_axis_progression(&numeric_values).is_empty()
7244        && text.chars().any(char::is_alphabetic)
7245}
7246
/// Heuristic: a short, fully parenthesized block (at most eight
/// whitespace-separated words) is treated as a citation.
fn looks_like_citation_block(block: &str) -> bool {
    let text = block.trim();
    let parenthesized = text.starts_with('(') && text.ends_with(')');
    // `nth(8)` is the ninth word; its absence means eight words or fewer.
    parenthesized && text.split_whitespace().nth(8).is_none()
}
7251
/// Summary statistics over the body rows of a parsed pipe table, as produced
/// by `pipe_table_stats`.
struct PipeTableStats {
    /// Share of non-empty body cells relative to `body rows × widest row`;
    /// `0.0` when the table has no body rows.
    fill_ratio: f64,
    /// Number of non-empty body cells holding three or more
    /// whitespace-separated words.
    long_cell_count: usize,
}
7256
7257fn pipe_table_stats(rows: &[Vec<String>]) -> PipeTableStats {
7258    let cols = rows.iter().map(Vec::len).max().unwrap_or(0).max(1);
7259    let body = rows.len().saturating_sub(2);
7260    let mut nonempty = 0usize;
7261    let mut long_cell_count = 0usize;
7262
7263    for row in rows.iter().skip(2) {
7264        for cell in row {
7265            if !cell.trim().is_empty() {
7266                nonempty += 1;
7267                if cell.split_whitespace().count() >= 3 {
7268                    long_cell_count += 1;
7269                }
7270            }
7271        }
7272    }
7273
7274    let fill_ratio = if body == 0 {
7275        0.0
7276    } else {
7277        nonempty as f64 / (body * cols) as f64
7278    };
7279
7280    PipeTableStats {
7281        fill_ratio,
7282        long_cell_count,
7283    }
7284}
7285
7286fn extract_chart_labels_and_source(block: &str) -> (Vec<String>, String) {
7287    let trimmed = block.trim();
7288    let lower = trimmed.to_ascii_lowercase();
7289    let source_idx = lower.find("source:");
7290
7291    let label_region = source_idx.map_or(trimmed, |idx| trimmed[..idx].trim());
7292    let source = source_idx
7293        .map(|idx| trimmed[idx..].trim().to_string())
7294        .unwrap_or_default();
7295
7296    let labels = parse_chart_labels(label_region);
7297    (labels, source)
7298}
7299
7300fn parse_chart_labels(text: &str) -> Vec<String> {
7301    let tokens: Vec<&str> = text.split_whitespace().collect();
7302    let mut labels = Vec::new();
7303    let mut i = 0usize;
7304    while i < tokens.len() {
7305        let token = tokens[i].trim_matches(|c: char| c == ',' || c == ';');
7306        if looks_like_year_token(token) {
7307            let mut label = token.to_string();
7308            if let Some(next) = tokens.get(i + 1) {
7309                let next_trimmed = next.trim_matches(|c: char| c == ',' || c == ';');
7310                if next_trimmed.starts_with('(') && next_trimmed.ends_with(')') {
7311                    label.push(' ');
7312                    label.push_str(next_trimmed);
7313                    i += 1;
7314                }
7315            }
7316            labels.push(label);
7317        } else if looks_like_category_label(token) {
7318            labels.push(token.to_string());
7319        }
7320        i += 1;
7321    }
7322    labels
7323}
7324
7325fn derive_chart_series_values(tokens: &[String], expected_count: usize) -> Vec<String> {
7326    if expected_count == 0 {
7327        return Vec::new();
7328    }
7329
7330    if tokens.len() == expected_count {
7331        return tokens.to_vec();
7332    }
7333
7334    let numeric_values: Vec<i64> = tokens
7335        .iter()
7336        .filter_map(|token| parse_integer_token(token))
7337        .collect();
7338    if numeric_values.len() != tokens.len() {
7339        return Vec::new();
7340    }
7341
7342    let axis_series = detect_axis_progression(&numeric_values);
7343    if axis_series.is_empty() {
7344        return Vec::new();
7345    }
7346
7347    let mut remaining = Vec::new();
7348    let mut removable = axis_series;
7349    for token in tokens {
7350        let Some(value) = parse_integer_token(token) else {
7351            continue;
7352        };
7353        if let Some(pos) = removable.iter().position(|candidate| *candidate == value) {
7354            removable.remove(pos);
7355        } else {
7356            remaining.push(token.clone());
7357        }
7358    }
7359
7360    if remaining.len() == expected_count {
7361        remaining
7362    } else {
7363        Vec::new()
7364    }
7365}
7366
/// Find the longest arithmetic progression of at least six distinct values
/// that looks like a chart axis (e.g. `0, 10, 20, 30, 40, 50`).
///
/// Candidate steps are the gaps between neighbouring values in sorted order,
/// and each candidate run starts at the lower neighbour. Returns the run in
/// ascending order, or an empty vector when no run of six or more exists.
///
/// Arithmetic is overflow-checked: a gap or next tick that does not fit in
/// `i64` simply ends that candidate instead of panicking or wrapping.
fn detect_axis_progression(values: &[i64]) -> Vec<i64> {
    const MIN_AXIS_TICKS: usize = 6;

    if values.len() < MIN_AXIS_TICKS {
        return Vec::new();
    }

    let mut sorted = values.to_vec();
    sorted.sort_unstable();
    sorted.dedup();
    if sorted.len() < MIN_AXIS_TICKS {
        return Vec::new();
    }

    let mut best: Vec<i64> = Vec::new();
    for window in sorted.windows(2) {
        // Extreme inputs (e.g. i64::MIN next to a positive value) can make
        // the gap itself unrepresentable; skip such pairs.
        let Some(step) = window[1].checked_sub(window[0]) else {
            continue;
        };
        if step <= 0 {
            continue;
        }

        let mut series = vec![window[0]];
        let mut current = window[0];
        // Stop extending as soon as the next tick is missing or would overflow.
        while let Some(next) = current.checked_add(step) {
            if sorted.binary_search(&next).is_err() {
                break;
            }
            series.push(next);
            current = next;
        }

        if series.len() > best.len() {
            best = series;
        }
    }

    if best.len() >= MIN_AXIS_TICKS {
        best
    } else {
        Vec::new()
    }
}
7409
7410fn chart_value_header(caption: &str) -> String {
7411    let trimmed = caption.trim();
7412    let title = strip_structural_caption_prefix(trimmed);
7413
7414    let mut base = title.to_string();
7415    if let Some(idx) = base.rfind(" in ") {
7416        let tail = base[idx + 4..].trim();
7417        if tail.split_whitespace().count() <= 2
7418            && tail.chars().next().is_some_and(char::is_uppercase)
7419        {
7420            base.truncate(idx);
7421        }
7422    }
7423
7424    if let Some(start) = title.rfind('(') {
7425        if title.ends_with(')') {
7426            let unit = title[start + 1..title.len() - 1].trim();
7427            if let Some(idx) = base.rfind('(') {
7428                base.truncate(idx);
7429            }
7430            let normalized_unit = unit.strip_prefix("in ").unwrap_or(unit).trim();
7431            return format!("{} ({})", base.trim(), normalized_unit);
7432        }
7433    }
7434
7435    let trimmed = base.trim();
7436    if trimmed.is_empty() {
7437        "Value".to_string()
7438    } else {
7439        trimmed.to_string()
7440    }
7441}
7442
/// Remove a leading "Figure 3." / "Table 12:" style prefix from a caption.
///
/// The prefix is recognised when the first space-separated word is one of
/// figure/table/diagram/chart (ASCII case-insensitive) and the second consists
/// only of digits, dots and colons. Otherwise the trimmed input is returned
/// unchanged.
fn strip_structural_caption_prefix(text: &str) -> &str {
    let whole = text.trim();
    let mut pieces = whole.splitn(3, ' ');
    let (Some(kind), Some(number), Some(remainder)) =
        (pieces.next(), pieces.next(), pieces.next())
    else {
        return whole;
    };

    let is_caption_kind = ["figure", "table", "diagram", "chart"]
        .iter()
        .any(|known| kind.eq_ignore_ascii_case(known));
    let is_number = number
        .chars()
        .all(|ch| ch.is_ascii_digit() || ch == '.' || ch == ':');

    if is_caption_kind && is_number {
        remainder.trim()
    } else {
        whole
    }
}
7469
/// Heuristic for a running footer such as "Annual Report of the Board 12":
/// a single line of 2–6 words (at least 8 bytes) that ends in an all-digit
/// token and whose other words are capitalised or small connectives.
fn looks_like_footer_banner(block: &str) -> bool {
    let text = block.trim();
    if text.len() < 8 || text.contains('\n') {
        return false;
    }

    let words: Vec<&str> = text.split_whitespace().collect();
    let Some((page_number, title_words)) = words.split_last() else {
        return false;
    };
    if words.len() < 2 || words.len() > 6 {
        return false;
    }
    if !page_number.bytes().all(|b| b.is_ascii_digit()) {
        return false;
    }

    title_words.iter().all(|word| {
        let is_connective = ["of", "and", "the", "for", "in", "on"]
            .iter()
            .any(|small| word.eq_ignore_ascii_case(small));
        is_connective || word.chars().next().is_some_and(char::is_uppercase)
    })
}
7495
/// Heuristic: a short block (at most eight words) that starts with an
/// uppercase character and contains no colon is treated as the continuation
/// of the preceding caption.
fn looks_like_caption_continuation(block: &str) -> bool {
    let text = block.trim();
    let Some(first) = text.chars().next() else {
        return false;
    };

    first.is_uppercase() && !text.contains(':') && text.split_whitespace().nth(8).is_none()
}
7503
/// Join the whitespace-separated words of `text` with single spaces, which
/// also removes leading/trailing whitespace and line breaks.
fn collapse_inline_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
7507
7508fn drop_isolated_noise_lines(markdown: &str) -> String {
7509    let lines: Vec<&str> = markdown.lines().collect();
7510    let mut kept = Vec::with_capacity(lines.len());
7511
7512    for (idx, line) in lines.iter().enumerate() {
7513        if should_drop_isolated_noise_line(&lines, idx) {
7514            continue;
7515        }
7516        kept.push(*line);
7517    }
7518
7519    let mut result = kept.join("\n");
7520    if markdown.ends_with('\n') {
7521        result.push('\n');
7522    }
7523    result
7524}
7525
7526fn should_drop_isolated_noise_line(lines: &[&str], idx: usize) -> bool {
7527    let trimmed = lines[idx].trim();
7528    if trimmed.len() != 1 {
7529        return false;
7530    }
7531
7532    let ch = trimmed.chars().next().unwrap_or_default();
7533    if !(ch.is_ascii_lowercase() || ch.is_ascii_digit()) {
7534        return false;
7535    }
7536
7537    let prev = previous_nonempty_line(lines, idx);
7538    let next = next_nonempty_line(lines, idx);
7539    let (Some(prev), Some(next)) = (prev, next) else {
7540        return false;
7541    };
7542
7543    is_substantive_markdown_line(prev) && is_substantive_markdown_line(next)
7544}
7545
/// Return the closest line before `idx` that is not blank after trimming.
fn previous_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    lines[..idx]
        .iter()
        .copied()
        .rfind(|line| !line.trim().is_empty())
}
7553
/// Return the closest line after `idx` that is not blank after trimming.
fn next_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    for &line in &lines[idx + 1..] {
        if !line.trim().is_empty() {
            return Some(line);
        }
    }
    None
}
7560
/// A line counts as substantive when it is a table row, a `- ` list item, a
/// heading, or contains at least two whitespace-separated words.
fn is_substantive_markdown_line(line: &str) -> bool {
    let text = line.trim();
    if text.is_empty() {
        return false;
    }

    let is_structural = text.starts_with(['|', '#']) || text.starts_with("- ");
    is_structural || text.split_whitespace().nth(1).is_some()
}
7573
7574fn normalize_common_ocr_text(text: &str) -> String {
7575    if text.is_empty() {
7576        return String::new();
7577    }
7578
7579    let mut normalized = text
7580        .replace("ߤL", "μL")
7581        .replace(" oC", "°C")
7582        .replace("37 C", "37°C")
7583        .replace("-20 oC", "-20°C")
7584        .replace("1- 20-μL", "1-20-μL")
7585        .replace("1- 20 μL", "1-20 μL")
7586        .replace("1- 2 0  μL", "1-20 μL")
7587        .replace("1- 2 0 μL", "1-20 μL");
7588
7589    normalized = normalize_degree_spacing(&normalized);
7590    collapse_inline_whitespace(&normalized)
7591}
7592
/// Rewrite `<digit> C` / `<digit> F` as `<digit>°C` / `<digit>°F`.
///
/// The unit letter must stand alone: it is converted only when it is followed
/// by a non-ASCII-alphabetic character or by the end of the text. The
/// end-of-text case ("store at 4 C") is handled the same way as "4 C.".
fn normalize_degree_spacing(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut out = String::with_capacity(text.len());
    let mut i = 0usize;

    while i < chars.len() {
        let ch = chars[i];
        if ch == ' ' && i > 0 && chars[i - 1].is_ascii_digit() {
            if let Some(unit @ ('C' | 'F')) = chars.get(i + 1).copied() {
                // A following letter means this is the start of a word
                // ("5 Cats"), not a temperature unit.
                let unit_is_standalone = !chars
                    .get(i + 2)
                    .is_some_and(|next| next.is_ascii_alphabetic());
                if unit_is_standalone {
                    out.push('°');
                    out.push(unit);
                    // Skip the space and the unit letter just emitted.
                    i += 2;
                    continue;
                }
            }
        }
        out.push(ch);
        i += 1;
    }

    out
}
7616
7617fn normalize_list_text(text: &str) -> String {
7618    let normalized = normalize_common_ocr_text(text);
7619    let trimmed = normalized
7620        .trim_start_matches(|ch: char| is_bullet_like(ch))
7621        .trim();
7622    trimmed.to_string()
7623}
7624
7625fn push_rendered_list_item(out: &mut String, item: &str) {
7626    if starts_with_enumerated_marker(item) {
7627        out.push_str(item);
7628        out.push('\n');
7629    } else {
7630        out.push_str(&format!("- {}\n", item));
7631    }
7632}
7633
7634fn should_merge_list_continuation(previous: &str, current: &str) -> bool {
7635    let trimmed = current.trim();
7636    if trimmed.is_empty()
7637        || looks_like_stray_list_page_number(trimmed)
7638        || is_list_section_heading(trimmed)
7639        || looks_like_numbered_section(trimmed)
7640        || starts_with_enumerated_marker(trimmed)
7641    {
7642        return false;
7643    }
7644
7645    if previous.ends_with('-')
7646        && previous
7647            .chars()
7648            .rev()
7649            .nth(1)
7650            .is_some_and(|c| c.is_alphabetic())
7651        && trimmed.chars().next().is_some_and(char::is_lowercase)
7652    {
7653        return true;
7654    }
7655
7656    trimmed
7657        .chars()
7658        .next()
7659        .is_some_and(|ch| ch.is_ascii_lowercase() || matches!(ch, ',' | ';' | ')' | ']' | '%'))
7660}
7661
7662fn is_pure_bullet_marker(text: &str) -> bool {
7663    let trimmed = text.trim();
7664    !trimmed.is_empty() && trimmed.chars().all(is_bullet_like)
7665}
7666
/// True when the trimmed text is one to four ASCII digits, i.e. probably a
/// page number that leaked into a list.
fn looks_like_stray_list_page_number(text: &str) -> bool {
    let digits = text.trim();
    matches!(digits.len(), 1..=4) && digits.bytes().all(|b| b.is_ascii_digit())
}
7671
/// True for the glyphs this module treats as list bullets, including the
/// plain ASCII hyphen.
fn is_bullet_like(ch: char) -> bool {
    const BULLET_GLYPHS: [char; 14] = [
        '•', '◦', '▪', '▸', '▹', '►', '▻', '●', '○', '■', '□', '◆', '◇', '-',
    ];
    BULLET_GLYPHS.contains(&ch)
}
7690
7691fn looks_like_isolated_caption_context(caption: &str, next_block: &str) -> bool {
7692    let next = next_block.trim();
7693    if next.is_empty() {
7694        return false;
7695    }
7696
7697    let next_lower = next.to_ascii_lowercase();
7698    if next_lower.starts_with("source:")
7699        || next_lower.starts_with("note:")
7700        || next_lower.starts_with("*source:")
7701        || next_lower.starts_with("*note:")
7702    {
7703        return true;
7704    }
7705
7706    caption.split_whitespace().count() <= 14
7707        && next.split_whitespace().count() <= 45
7708        && (next.contains(':') || next.contains('='))
7709}
7710
7711fn looks_like_numeric_noise_block(block: &str) -> bool {
7712    let trimmed = block.trim();
7713    !trimmed.is_empty()
7714        && trimmed.split_whitespace().all(|token| {
7715            sanitize_numberish_token(token)
7716                .as_deref()
7717                .is_some_and(|sanitized| sanitized.chars().all(|ch| ch.is_ascii_digit()))
7718        })
7719}
7720
/// True when the label begins with an ASCII digit.
fn looks_like_yearish_label(label: &str) -> bool {
    matches!(label.as_bytes().first(), Some(first) if first.is_ascii_digit())
}
7724
/// True when the token is exactly four ASCII digits.
fn looks_like_year_token(token: &str) -> bool {
    matches!(
        token.as_bytes(),
        [a, b, c, d] if [a, b, c, d].iter().all(|digit| digit.is_ascii_digit())
    )
}
7728
/// True when the token contains only ASCII letters, digits, `-`, `/` or `%`
/// and at least one ASCII letter.
fn looks_like_category_label(token: &str) -> bool {
    let mut saw_letter = false;
    for ch in token.chars() {
        if ch.is_ascii_alphabetic() {
            saw_letter = true;
        } else if !(ch.is_ascii_digit() || matches!(ch, '-' | '/' | '%')) {
            return false;
        }
    }
    saw_letter
}
7735
7736fn is_numberish_token(token: &str) -> bool {
7737    sanitize_numberish_token(token).is_some()
7738}
7739
/// Normalise a number-like token, returning `None` for anything else.
///
/// Leading/trailing `,`, `;`, `:` and `.` are stripped. What remains must be
/// ASCII digits with optional thousands commas and optional trailing `%`
/// signs, and must contain at least one digit. The returned string keeps
/// inner commas and the trailing `%`.
///
/// A bare `%` (or `%%`) is rejected: it has no digits, so it is not a number.
fn sanitize_numberish_token(token: &str) -> Option<String> {
    let core = token.trim_matches(|c: char| matches!(c, ',' | ';' | ':' | '.'));

    let digits = core.trim_end_matches('%');
    // `any` guards against the vacuous-truth case where nothing but `%`
    // (or nothing at all) is left to inspect.
    let is_number = digits.chars().any(|ch| ch.is_ascii_digit())
        && digits.chars().all(|ch| ch.is_ascii_digit() || ch == ',');

    // `core` can no longer end in `,`, `;` or `:` after the trim above, so it
    // is already the final form.
    is_number.then(|| core.to_string())
}
7753
7754fn parse_integer_token(token: &str) -> Option<i64> {
7755    sanitize_numberish_token(token)?
7756        .replace(',', "")
7757        .parse::<i64>()
7758        .ok()
7759}
7760
/// True when the first character after leading whitespace and any opening
/// quotes/brackets (`"`, `'`, `(`, `[`) is an uppercase letter.
fn starts_with_uppercase_word(text: &str) -> bool {
    let first_significant = text
        .trim_start()
        .chars()
        .find(|&ch| !matches!(ch, '"' | '\'' | '(' | '['));

    first_significant.is_some_and(|ch| ch.is_alphabetic() && ch.is_uppercase())
}
7772
7773/// Clean paragraph text: trim trailing whitespace from each line,
7774/// collapse multiple spaces, and normalize whitespace.
7775fn clean_paragraph_text(text: &str) -> String {
7776    let trimmed = text.trim();
7777    if trimmed.is_empty() {
7778        return String::new();
7779    }
7780    // Collapse runs of spaces (but not newlines) to single space
7781    let mut result = String::with_capacity(trimmed.len());
7782    let mut prev_space = false;
7783    for ch in trimmed.chars() {
7784        if ch == ' ' || ch == '\t' {
7785            if !prev_space {
7786                result.push(' ');
7787                prev_space = true;
7788            }
7789        } else {
7790            result.push(ch);
7791            prev_space = false;
7792        }
7793    }
7794    normalize_common_ocr_text(&result)
7795}
7796
7797fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
7798    match element {
7799        Some(ContentElement::Paragraph(p)) => {
7800            let text = clean_paragraph_text(&p.base.value());
7801            let trimmed = text.trim();
7802            if trimmed.is_empty()
7803                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7804            {
7805                None
7806            } else {
7807                Some(trimmed.to_string())
7808            }
7809        }
7810        Some(ContentElement::TextBlock(tb)) => {
7811            let text = clean_paragraph_text(&tb.value());
7812            let trimmed = text.trim();
7813            if trimmed.is_empty()
7814                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7815            {
7816                None
7817            } else {
7818                Some(trimmed.to_string())
7819            }
7820        }
7821        Some(ContentElement::TextLine(tl)) => {
7822            let text = clean_paragraph_text(&tl.value());
7823            let trimmed = text.trim();
7824            if trimmed.is_empty()
7825                || should_render_element_as_heading(element.unwrap(), trimmed, None)
7826            {
7827                None
7828            } else {
7829                Some(trimmed.to_string())
7830            }
7831        }
7832        _ => None,
7833    }
7834}
7835
7836fn should_render_paragraph_as_heading(
7837    doc: &PdfDocument,
7838    idx: usize,
7839    text: &str,
7840    next: Option<&ContentElement>,
7841) -> bool {
7842    if looks_like_top_margin_running_header(doc, idx, text) {
7843        return false;
7844    }
7845    if looks_like_hyphenated_table_title_continuation(doc, idx, text, next) {
7846        return true;
7847    }
7848    if should_render_element_as_heading(&doc.kids[idx], text, next) {
7849        return true;
7850    }
7851
7852    // Font-size guard: skip rescue if the candidate text is significantly
7853    // smaller than the document's body text (chart axis labels, footnotes).
7854    let body_font_size = compute_body_font_size(doc);
7855    if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
7856        return false;
7857    }
7858
7859    // Rescue pass tier 1: when the pipeline found zero headings, use broad rescue.
7860    if !doc_has_explicit_headings(doc) {
7861        if should_rescue_as_heading(doc, idx, text) {
7862            return true;
7863        }
7864        // Also check numbered sections and ALL CAPS even with zero headings,
7865        // since Tier 1 broad rescue has strict word/char limits that miss
7866        // longer keyword-numbered headings (e.g. "Activity 4. Title text").
7867        if should_rescue_allcaps_heading(doc, idx, text) {
7868            return true;
7869        }
7870        if should_rescue_numbered_heading(doc, idx, text) {
7871            return true;
7872        }
7873        return false;
7874    }
7875    // Rescue pass tier 2: when heading density is very low (< 10%), only
7876    // rescue ALL CAPS short text followed by substantial body content.
7877    if heading_density(doc) < 0.10 {
7878        if should_rescue_allcaps_heading(doc, idx, text) {
7879            return true;
7880        }
7881        // Rescue pass tier 3: numbered section headings (e.g. "01 - Title").
7882        // When a document has very few detected headings, numbered patterns
7883        // are a strong structural signal that the font-based detector missed.
7884        if should_rescue_numbered_heading(doc, idx, text) {
7885            return true;
7886        }
7887        // Font-size-gated title-case rescue: when the paragraph is rendered
7888        // in a noticeably larger font than body text, apply the same
7889        // title-case rescue used in tier 1.  A 15 % size increase is a
7890        // reliable visual heading signal straight from the PDF font metrics.
7891        if body_font_size > 0.0 {
7892            if let ContentElement::Paragraph(p) = &doc.kids[idx] {
7893                if let Some(fs) = p.base.font_size {
7894                    if fs >= 1.15 * body_font_size
7895                        && is_heading_rescue_candidate(doc, idx, text)
7896                        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7897                    {
7898                        return true;
7899                    }
7900                }
7901            }
7902        }
7903    }
7904    false
7905}
7906
7907/// Check whether any element in the document is an explicit heading from the pipeline.
7908fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
7909    doc.kids.iter().any(|e| {
7910        matches!(
7911            e,
7912            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7913        )
7914    })
7915}
7916
7917/// Compute the dominant body font size from paragraphs with substantial text
7918/// (> 10 words).  Uses the median of qualifying paragraphs to avoid being
7919/// skewed by short chart labels or footnote markers.
7920/// Returns 0.0 if no qualifying paragraph is found.
7921fn compute_body_font_size(doc: &PdfDocument) -> f64 {
7922    let mut font_sizes: Vec<f64> = doc
7923        .kids
7924        .iter()
7925        .filter_map(|e| {
7926            if let ContentElement::Paragraph(p) = e {
7927                let word_count = p.base.value().split_whitespace().count();
7928                if word_count > 10 {
7929                    p.base.font_size
7930                } else {
7931                    None
7932                }
7933            } else {
7934                None
7935            }
7936        })
7937        .collect();
7938    if font_sizes.is_empty() {
7939        return 0.0;
7940    }
7941    font_sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
7942    font_sizes[font_sizes.len() / 2]
7943}
7944
7945/// Check whether a paragraph's font size is too small relative to the document
7946/// body font to be a heading.  Returns true if the element should be skipped.
7947/// A heading should not be noticeably smaller than body text — font size ≥ 95%
7948/// of the dominant body size is required.
7949fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
7950    if body_font_size <= 0.0 {
7951        return false;
7952    }
7953    if let ContentElement::Paragraph(p) = &doc_kids[idx] {
7954        if let Some(fs) = p.base.font_size {
7955            return fs < 0.95 * body_font_size;
7956        }
7957    }
7958    false
7959}
7960
7961/// Count the ratio of pipeline headings to total content elements.
7962fn heading_density(doc: &PdfDocument) -> f64 {
7963    let total = doc.kids.len();
7964    if total == 0 {
7965        return 0.0;
7966    }
7967    let heading_count = doc
7968        .kids
7969        .iter()
7970        .filter(|e| {
7971            matches!(
7972                e,
7973                ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7974            )
7975        })
7976        .count();
7977    heading_count as f64 / total as f64
7978}
7979
7980/// Rescue headings: identify short standalone paragraphs that likely serve
7981/// as section headings.  Only runs when the pipeline produced zero headings.
7982fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7983    is_heading_rescue_candidate(doc, idx, text)
7984        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7985}
7986
7987/// Pure text-criteria check for title-case heading rescue.
7988/// Returns true when the text looks like a heading based on casing,
7989/// length, and character composition — without any lookahead.
7990fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7991    let trimmed = text.trim();
7992    if trimmed.is_empty() {
7993        return false;
7994    }
7995
7996    let has_alpha = trimmed.chars().any(char::is_alphabetic);
7997
7998    // Must have alphabetic chars and not end with sentence/continuation punctuation
7999    if !has_alpha || trimmed.ends_with(['.', '!', '?', ';', ',']) {
8000        return false;
8001    }
8002
8003    // Reject text containing math/special symbols or percentage signs.
8004    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8005        return false;
8006    }
8007
8008    // Must not be fully parenthesized (citations)
8009    if trimmed.starts_with('(') && trimmed.ends_with(')') {
8010        return false;
8011    }
8012
8013    // Must not look like a caption or chart label
8014    if starts_with_caption_prefix(trimmed)
8015        || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
8016    {
8017        return false;
8018    }
8019
8020    // Must be short: ≤ 6 words, ≤ 60 chars
8021    let word_count = trimmed.split_whitespace().count();
8022    if word_count > 6 || trimmed.len() > 60 {
8023        return false;
8024    }
8025
8026    // Must not be a purely numeric string
8027    if trimmed
8028        .chars()
8029        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8030    {
8031        return false;
8032    }
8033
8034    // First alphabetic character should be uppercase
8035    if let Some(first_alpha) = trimmed.chars().find(|c| c.is_alphabetic()) {
8036        if first_alpha.is_lowercase() {
8037            return false;
8038        }
8039    }
8040
8041    true
8042}
8043
8044/// Check the next `max_lookahead` elements for substantive body content.
8045/// Returns true when at least one element is a long paragraph (≥ word_count*3
8046/// or > 15 words) or a structural element (list, table, image, figure).
8047fn has_substantive_follow_up(
8048    doc: &PdfDocument,
8049    idx: usize,
8050    word_count: usize,
8051    max_lookahead: usize,
8052) -> bool {
8053    for offset in 1..=max_lookahead {
8054        let lookahead_idx = idx + offset;
8055        if lookahead_idx >= doc.kids.len() {
8056            break;
8057        }
8058        let look_elem = &doc.kids[lookahead_idx];
8059        match look_elem {
8060            ContentElement::Paragraph(p) => {
8061                let next_text = p.base.value();
8062                let nw = next_text.split_whitespace().count();
8063                if nw >= word_count * 3 || nw > 15 {
8064                    return true;
8065                }
8066            }
8067            ContentElement::TextBlock(tb) => {
8068                let next_text = tb.value();
8069                let nw = next_text.split_whitespace().count();
8070                if nw >= word_count * 3 || nw > 15 {
8071                    return true;
8072                }
8073            }
8074            ContentElement::TextLine(tl) => {
8075                let next_text = tl.value();
8076                let nw = next_text.split_whitespace().count();
8077                if nw >= word_count * 3 || nw > 15 {
8078                    return true;
8079                }
8080            }
8081            ContentElement::List(_)
8082            | ContentElement::Table(_)
8083            | ContentElement::TableBorder(_)
8084            | ContentElement::Image(_)
8085            | ContentElement::Figure(_) => {
8086                return true;
8087            }
8088            _ => continue,
8089        }
8090    }
8091
8092    false
8093}
8094
8095/// Rescue numbered section headings like "01 - Find Open Educational Resources"
8096/// or "4.2 Main Results" when heading density is low.
8097fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8098    let trimmed = text.trim();
8099    if trimmed.is_empty() || trimmed.len() > 100 {
8100        return false;
8101    }
8102
8103    // Must match numbered section pattern: digits (with optional dots)
8104    // followed by separator and title text.
8105    if !looks_like_numbered_section(trimmed) {
8106        return false;
8107    }
8108
8109    // Must not end with sentence punctuation — EXCEPT when the text matches
8110    // a keyword+number pattern (e.g. "Activity 4. Determining CEC…") where
8111    // the trailing period is part of the heading format, not sentence ending.
8112    if trimmed.ends_with(['!', '?', ';', ',']) {
8113        return false;
8114    }
8115    if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
8116        return false;
8117    }
8118    // Reject numbered headings containing math symbols or percentage signs.
8119    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8120        return false;
8121    }
8122
8123    // Look ahead for substantive content
8124    for offset in 1..=3 {
8125        let lookahead_idx = idx + offset;
8126        if lookahead_idx >= doc.kids.len() {
8127            break;
8128        }
8129        match &doc.kids[lookahead_idx] {
8130            ContentElement::Paragraph(p) => {
8131                let nw = p.base.value().split_whitespace().count();
8132                if nw > 10 {
8133                    return true;
8134                }
8135            }
8136            ContentElement::TextBlock(tb) => {
8137                let nw = tb.value().split_whitespace().count();
8138                if nw > 10 {
8139                    return true;
8140                }
8141            }
8142            ContentElement::TextLine(tl) => {
8143                let nw = tl.value().split_whitespace().count();
8144                if nw > 10 {
8145                    return true;
8146                }
8147            }
8148            ContentElement::List(_)
8149            | ContentElement::Table(_)
8150            | ContentElement::TableBorder(_)
8151            | ContentElement::Image(_)
8152            | ContentElement::Figure(_) => {
8153                return true;
8154            }
8155            _ => continue,
8156        }
8157    }
8158
8159    false
8160}
8161
8162/// Check if text starts with a numbered section prefix (e.g. "01 -", "4.2 ", "III.")
8163/// or a keyword+number pattern (e.g. "Activity 4.", "Experiment #1:", "Chapter 3").
8164fn looks_like_numbered_section(text: &str) -> bool {
8165    let bytes = text.as_bytes();
8166    if bytes.is_empty() {
8167        return false;
8168    }
8169
8170    // Branch 1: digit-based prefix: "1 ", "01 ", "4.2 ", "1. ", "01 - "
8171    let mut idx = 0;
8172    if bytes[0].is_ascii_digit() {
8173        while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8174            idx += 1;
8175        }
8176        if idx >= bytes.len() {
8177            return false;
8178        }
8179        // dot-separated subsections: "4.2", "1.3.1"
8180        while idx < bytes.len() && bytes[idx] == b'.' {
8181            idx += 1;
8182            let start = idx;
8183            while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8184                idx += 1;
8185            }
8186            if idx == start {
8187                // "4." followed by space → "4. Title"
8188                break;
8189            }
8190        }
8191        // Must be followed by whitespace or "-"
8192        if idx >= bytes.len() {
8193            return false;
8194        }
8195        // Skip separator: "- " or " - " or just " "
8196        if bytes[idx] == b' ' || bytes[idx] == b'\t' {
8197            idx += 1;
8198            // Skip optional "- " separator
8199            if idx < bytes.len() && bytes[idx] == b'-' {
8200                idx += 1;
8201                if idx < bytes.len() && bytes[idx] == b' ' {
8202                    idx += 1;
8203                }
8204            }
8205        } else if bytes[idx] == b'-' {
8206            idx += 1;
8207            if idx < bytes.len() && bytes[idx] == b' ' {
8208                idx += 1;
8209            }
8210        } else {
8211            return false;
8212        }
8213        // Must have title text after prefix
8214        let rest = &text[idx..].trim();
8215        if rest.is_empty() {
8216            return false;
8217        }
8218        // First alpha char must be uppercase
8219        if let Some(c) = rest.chars().find(|c| c.is_alphabetic()) {
8220            return c.is_uppercase();
8221        }
8222        return false;
8223    }
8224
8225    // Branch 2: keyword+number prefix: "Activity 4.", "Experiment #1:", "Chapter 3"
8226    if looks_like_keyword_numbered_section(text) {
8227        return true;
8228    }
8229
8230    false
8231}
8232
/// Structural keywords that commonly precede a number to form a heading.
/// Entries are lowercase; matching against them is ASCII case-insensitive.
const SECTION_KEYWORDS: &[&str] = &[
    "activity",
    "appendix",
    "case",
    "chapter",
    "exercise",
    "experiment",
    "lab",
    "lesson",
    "module",
    "part",
    "phase",
    "problem",
    "question",
    "section",
    "stage",
    "step",
    "task",
    "topic",
    "unit",
];

/// Check if text matches a "Keyword N…" heading pattern such as
/// "Activity 4. Determining CEC", "Experiment #1: Setup", "Chapter 3" or
/// "Part IV".
///
/// The first space-delimited word (after trimming) must be one of
/// [`SECTION_KEYWORDS`]. What follows, after an optional `#`, must be either
/// * an ASCII digit, or
/// * an uppercase roman numeral: it starts with `I`, `V`, `X` or `L`,
///   continues with roman-numeral letters only, and is terminated by the end
///   of the text or a non-alphanumeric character.
///
/// The terminator requirement keeps ordinary words that merely begin with a
/// numeral letter ("Case In Point", "Step Into…", "Lab Values") from being
/// mistaken for numbered sections.
fn looks_like_keyword_numbered_section(text: &str) -> bool {
    let trimmed = text.trim();

    // Split off the leading keyword at the first space.
    let Some((keyword, remainder)) = trimmed.split_once(' ') else {
        return false;
    };
    if !SECTION_KEYWORDS
        .iter()
        .any(|known| keyword.eq_ignore_ascii_case(known))
    {
        return false;
    }

    // After keyword + space, expect a number (optionally preceded by '#').
    let remainder = remainder.trim_start();
    let number = remainder.strip_prefix('#').unwrap_or(remainder);
    let Some(first) = number.chars().next() else {
        return false;
    };

    if first.is_ascii_digit() {
        return true;
    }
    if !matches!(first, 'I' | 'V' | 'X' | 'L') {
        return false;
    }

    // Roman numeral: take the whole run of numeral letters and make sure the
    // token ends there. All matched letters are ASCII, so the char count is
    // also a valid byte offset.
    let numeral_len = number
        .chars()
        .take_while(|ch| matches!(ch, 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M'))
        .count();
    !number[numeral_len..]
        .chars()
        .next()
        .is_some_and(char::is_alphanumeric)
}
8284
8285/// Strict rescue for docs with some headings but low density: only promote
8286/// ALL CAPS text that is clearly a section heading.
8287fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8288    let trimmed = text.trim();
8289    if trimmed.is_empty() {
8290        return false;
8291    }
8292
8293    let word_count = trimmed.split_whitespace().count();
8294
8295    // Must be short: ≤ 8 words, ≤ 80 chars
8296    if word_count > 8 || trimmed.len() > 80 {
8297        return false;
8298    }
8299
8300    // Must be ALL CAPS (all alphabetic chars are uppercase)
8301    let alpha_chars: Vec<char> = trimmed.chars().filter(|c| c.is_alphabetic()).collect();
8302    if alpha_chars.len() < 2 || !alpha_chars.iter().all(|c| c.is_uppercase()) {
8303        return false;
8304    }
8305
8306    // Must not end with sentence punctuation
8307    if trimmed.ends_with(['.', ';', ',']) {
8308        return false;
8309    }
8310
8311    // Reject all-caps headings containing math symbols or percentage signs.
8312    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8313        return false;
8314    }
8315
8316    // Must not look like a caption
8317    if starts_with_caption_prefix(trimmed) {
8318        return false;
8319    }
8320
8321    // Must not be purely numeric or a page number
8322    if trimmed
8323        .chars()
8324        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8325    {
8326        return false;
8327    }
8328
8329    // Look ahead for substantive content — accept any non-trivial text
8330    // (>6 words) or structured content within the next 4 elements.
8331    for offset in 1..=4 {
8332        let lookahead_idx = idx + offset;
8333        if lookahead_idx >= doc.kids.len() {
8334            break;
8335        }
8336        let look_elem = &doc.kids[lookahead_idx];
8337        match look_elem {
8338            ContentElement::Paragraph(p) => {
8339                let nw = p.base.value().split_whitespace().count();
8340                if nw > 6 {
8341                    return true;
8342                }
8343            }
8344            ContentElement::TextBlock(tb) => {
8345                let nw = tb.value().split_whitespace().count();
8346                if nw > 6 {
8347                    return true;
8348                }
8349            }
8350            ContentElement::TextLine(tl) => {
8351                let nw = tl.value().split_whitespace().count();
8352                if nw > 6 {
8353                    return true;
8354                }
8355            }
8356            ContentElement::List(_)
8357            | ContentElement::Table(_)
8358            | ContentElement::TableBorder(_)
8359            | ContentElement::Image(_)
8360            | ContentElement::Figure(_) => {
8361                return true;
8362            }
8363            _ => continue,
8364        }
8365    }
8366
8367    false
8368}
8369
8370fn should_render_element_as_heading(
8371    element: &ContentElement,
8372    text: &str,
8373    next: Option<&ContentElement>,
8374) -> bool {
8375    let trimmed = text.trim();
8376    if trimmed.is_empty() {
8377        return false;
8378    }
8379
8380    let lower = trimmed.to_ascii_lowercase();
8381    if matches!(lower.as_str(), "contents" | "table of contents")
8382        && trimmed.starts_with(|c: char| c.is_uppercase())
8383    {
8384        return true;
8385    }
8386
8387    let word_count = trimmed.split_whitespace().count();
8388    let has_alpha = trimmed.chars().any(char::is_alphabetic);
8389    let title_like = has_alpha
8390        && word_count <= 4
8391        && trimmed.len() <= 40
8392        && !trimmed.ends_with(['.', '!', '?', ';', ':']);
8393
8394    // Reject attribution prefixes that are clearly not section headings
8395    // (more targeted than starts_with_caption_prefix to avoid false demotions
8396    // of legitimate headings starting with common words like "Graph", "Table").
8397    let is_attribution = {
8398        let lower = trimmed.to_ascii_lowercase();
8399        lower.starts_with("source:")
8400            || lower.starts_with("credit:")
8401            || lower.starts_with("photo by ")
8402            || lower.starts_with("photo credit")
8403            || lower.starts_with("image by ")
8404            || lower.starts_with("image credit")
8405    };
8406
8407    title_like
8408        && matches!(next, Some(ContentElement::List(_)))
8409        && !looks_like_chart_label_heading(element, trimmed)
8410        && !is_attribution
8411}
8412
8413fn looks_like_hyphenated_table_title_continuation(
8414    doc: &PdfDocument,
8415    idx: usize,
8416    text: &str,
8417    next: Option<&ContentElement>,
8418) -> bool {
8419    if !matches!(
8420        next,
8421        Some(ContentElement::Table(_)) | Some(ContentElement::TableBorder(_))
8422    ) {
8423        return false;
8424    }
8425
8426    let trimmed = text.trim();
8427    if trimmed.is_empty()
8428        || starts_with_caption_prefix(trimmed)
8429        || looks_like_numbered_section(trimmed)
8430        || looks_like_keyword_numbered_section(trimmed)
8431        || !trimmed.ends_with(':')
8432    {
8433        return false;
8434    }
8435
8436    let word_count = trimmed.split_whitespace().count();
8437    if !(3..=5).contains(&word_count) || trimmed.len() > 60 {
8438        return false;
8439    }
8440
8441    let Some(first_alpha) = trimmed.chars().find(|ch| ch.is_alphabetic()) else {
8442        return false;
8443    };
8444    if first_alpha.is_lowercase() {
8445        return false;
8446    }
8447
8448    let Some(prev_idx) = idx.checked_sub(1) else {
8449        return false;
8450    };
8451    let prev_text = extract_element_text(&doc.kids[prev_idx]);
8452    let prev_trimmed = prev_text.trim();
8453    !prev_trimmed.is_empty() && prev_trimmed.ends_with('-')
8454}
8455
8456fn looks_like_table_header_duplicate_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8457    let trimmed = text.trim();
8458    if trimmed.is_empty()
8459        || starts_with_caption_prefix(trimmed)
8460        || looks_like_numbered_section(trimmed)
8461        || looks_like_keyword_numbered_section(trimmed)
8462    {
8463        return false;
8464    }
8465
8466    let word_count = trimmed.split_whitespace().count();
8467    if !(3..=10).contains(&word_count) || trimmed.len() > 96 {
8468        return false;
8469    }
8470
8471    let Some(prev_idx) = idx.checked_sub(1) else {
8472        return false;
8473    };
8474    let Some(previous_table) = table_border_from_element(&doc.kids[prev_idx]) else {
8475        return false;
8476    };
8477    if previous_table.num_columns < 3 || previous_table.rows.len() < 3 {
8478        return false;
8479    }
8480
8481    let mut rendered_rows = collect_table_border_rows(previous_table);
8482    if rendered_rows.is_empty() {
8483        return false;
8484    }
8485    merge_continuation_rows(&mut rendered_rows);
8486    trim_leading_table_carryover_rows(&mut rendered_rows);
8487
8488    let Some(header_row) = rendered_rows.first() else {
8489        return false;
8490    };
8491    let header_text = header_row
8492        .iter()
8493        .map(|cell| cell.trim())
8494        .filter(|cell| !cell.is_empty())
8495        .collect::<Vec<_>>()
8496        .join(" ");
8497    if !equivalent_heading_text(trimmed, &header_text) {
8498        return false;
8499    }
8500
8501    let page_number = doc.kids[idx].page_number();
8502    let mut short_fragments = 0usize;
8503    let mut numeric_fragments = 0usize;
8504
8505    for candidate in doc.kids.iter().skip(idx + 1) {
8506        if candidate.page_number() != page_number {
8507            break;
8508        }
8509        if matches!(
8510            candidate,
8511            ContentElement::Table(_) | ContentElement::TableBorder(_)
8512        ) {
8513            break;
8514        }
8515
8516        let fragment = extract_element_text(candidate);
8517        let fragment_trimmed = fragment.trim();
8518        if fragment_trimmed.is_empty()
8519            || looks_like_margin_page_number(doc, candidate, fragment_trimmed)
8520        {
8521            continue;
8522        }
8523
8524        let fragment_words = fragment_trimmed.split_whitespace().count();
8525        if fragment_words > 6 {
8526            return false;
8527        }
8528
8529        short_fragments += 1;
8530        if fragment_trimmed.chars().any(|ch| ch.is_ascii_digit()) {
8531            numeric_fragments += 1;
8532        }
8533
8534        if short_fragments >= 3 {
8535            break;
8536        }
8537    }
8538
8539    short_fragments >= 2 && numeric_fragments >= 1
8540}
8541
8542fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8543    let trimmed = text.trim();
8544    if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
8545        return false;
8546    }
8547
8548    let element = &doc.kids[idx];
8549    let bbox = element.bbox();
8550    if bbox.height() > 24.0 {
8551        return false;
8552    }
8553
8554    let Some(page) = element.page_number() else {
8555        return false;
8556    };
8557
8558    // Compute top Y for every page (single pass).
8559    let mut page_tops = std::collections::HashMap::<u32, f64>::new();
8560    for candidate in &doc.kids {
8561        if let Some(p) = candidate.page_number() {
8562            let top = page_tops.entry(p).or_insert(f64::MIN);
8563            *top = top.max(candidate.bbox().top_y);
8564        }
8565    }
8566
8567    let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
8568    if bbox.top_y < page_top - 24.0 {
8569        return false;
8570    }
8571
8572    // A running header repeats across pages.  If the same text does NOT
8573    // appear at the top margin of any other page, this is a unique heading
8574    // (e.g. a document title), not a running header.
8575    let trimmed_lower = trimmed.to_lowercase();
8576    for other_elem in &doc.kids {
8577        let Some(other_page) = other_elem.page_number() else {
8578            continue;
8579        };
8580        if other_page == page {
8581            continue;
8582        }
8583        let other_bbox = other_elem.bbox();
8584        if other_bbox.height() > 24.0 {
8585            continue;
8586        }
8587        let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
8588        if other_bbox.top_y < other_top - 24.0 {
8589            continue;
8590        }
8591        let other_text = match other_elem {
8592            ContentElement::Paragraph(p) => p.base.value(),
8593            ContentElement::TextBlock(tb) => tb.value(),
8594            ContentElement::TextLine(tl) => tl.value(),
8595            ContentElement::Heading(h) => h.base.base.value(),
8596            _ => continue,
8597        };
8598        if other_text.trim().to_lowercase() == trimmed_lower {
8599            return true;
8600        }
8601    }
8602
8603    false
8604}
8605
8606fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
8607    let trimmed = text.trim();
8608    let upper_words = trimmed
8609        .split_whitespace()
8610        .filter(|word| word.chars().any(char::is_alphabetic))
8611        .all(|word| {
8612            word.chars()
8613                .filter(|ch| ch.is_alphabetic())
8614                .all(|ch| ch.is_uppercase())
8615        });
8616
8617    (trimmed.contains('%') || upper_words) && element.bbox().height() <= 40.0
8618}
8619
8620fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
8621    let next_trimmed = next.trim();
8622    if !next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8623        return false;
8624    }
8625
8626    let normalized = normalize_heading_text(text);
8627    if matches!(
8628        normalized.as_str(),
8629        "contents" | "tableofcontents" | "introduction" | "conclusion"
8630    ) {
8631        return false;
8632    }
8633
8634    let words: Vec<&str> = text.split_whitespace().collect();
8635    if words.len() < 3 {
8636        return false;
8637    }
8638
8639    words
8640        .last()
8641        .is_some_and(|word| is_sentence_fragment_tail(word))
8642}
8643
/// Returns `true` when `word` is an article, conjunction or preposition that
/// cannot plausibly end a heading ("the", "of", "with", …).
///
/// Non-alphanumeric characters at both ends of `word` are ignored and the
/// comparison is ASCII case-insensitive.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const TAIL_WORDS: &[&str] = &[
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or",
        "that", "the", "to", "with",
    ];

    let core = word.trim_matches(|ch: char| !ch.is_alphanumeric());
    TAIL_WORDS
        .iter()
        .any(|tail| core.eq_ignore_ascii_case(tail))
}
8667
/// Returns `true` for a short label that introduces a list, e.g.
/// "Key features:".
///
/// The trimmed text must end with `:`, be at most 80 bytes and 8 words,
/// contain at least one alphabetic character, and must not start with an
/// ASCII digit or a bullet/dash marker.
fn is_list_section_heading(text: &str) -> bool {
    const BULLET_MARKERS: &str = "•‣◦●○◆◇▪▫–—-";

    let label = text.trim();
    if !label.ends_with(':') || label.len() > 80 {
        return false;
    }
    if label.split_whitespace().count() > 8 {
        return false;
    }
    if !label.chars().any(char::is_alphabetic) {
        return false;
    }

    // `label` ends with ':' and is therefore non-empty; a leading digit or
    // bullet means this is a list item rather than a list heading.
    label
        .chars()
        .next()
        .is_some_and(|first| !first.is_ascii_digit() && !BULLET_MARKERS.contains(first))
}
8677
8678fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
8679    let next_trimmed = next.trim();
8680    if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8681        return false;
8682    }
8683
8684    if starts_with_enumerated_marker(next_trimmed) {
8685        return false;
8686    }
8687
8688    if prev.ends_with('-')
8689        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8690        && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8691    {
8692        return true;
8693    }
8694
8695    if next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8696        return true;
8697    }
8698
8699    let lower = next_trimmed.to_ascii_lowercase();
8700    if lower.starts_with("http://")
8701        || lower.starts_with("https://")
8702        || lower.starts_with("arxiv")
8703        || lower.starts_with("doi:")
8704    {
8705        return true;
8706    }
8707
8708    if matches!(
8709        next_trimmed.split_whitespace().next(),
8710        Some("In" | "Proceedings" | "Advances" | "Learning")
8711    ) {
8712        return true;
8713    }
8714
8715    !prev.ends_with(['.', '!', '?', ':'])
8716}
8717
8718fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
8719    let next_trimmed = next.trim();
8720    if next_trimmed.is_empty() {
8721        return false;
8722    }
8723
8724    if starts_with_enumerated_marker(next_trimmed) {
8725        return false;
8726    }
8727
8728    if prev.ends_with('-')
8729        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8730        && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8731    {
8732        return true;
8733    }
8734
8735    next_trimmed.chars().next().is_some_and(char::is_lowercase)
8736}
8737
/// Returns `true` when the first whitespace-delimited token of `text` looks
/// like a list/enumeration marker such as "1.", "(a)", "B:" or "iv)".
///
/// Leading `(` / `[` are ignored. The token must end with `.`, `)` or `:`;
/// what remains once those trailing characters are removed must be non-empty
/// and be all ASCII digits, a single ASCII letter, or up to eight characters
/// drawn from the roman-numeral letters (either case).
fn starts_with_enumerated_marker(text: &str) -> bool {
    const CLOSERS: [char; 3] = ['.', ')', ':'];

    let Some(token) = text.split_whitespace().next() else {
        return false;
    };
    let token = token.trim_start_matches(['(', '[']);
    if !token.ends_with(CLOSERS) {
        return false;
    }

    let marker = token.trim_end_matches(CLOSERS);
    if marker.is_empty() {
        return false;
    }

    let is_number = marker.bytes().all(|b| b.is_ascii_digit());
    let is_single_letter = marker.len() == 1 && marker.bytes().all(|b| b.is_ascii_alphabetic());
    let is_roman = marker.len() <= 8
        && marker
            .chars()
            .all(|ch| "ivxlcdm".contains(ch.to_ascii_lowercase()));

    is_number || is_single_letter || is_roman
}
8763
8764fn should_skip_leading_figure_carryover(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8765    let trimmed = text.trim();
8766    if !trimmed.starts_with("Figure ") || trimmed.split_whitespace().count() < 4 {
8767        return false;
8768    }
8769
8770    let element = &doc.kids[idx];
8771    let Some(page) = element.page_number() else {
8772        return false;
8773    };
8774
8775    let mut page_top = f64::MIN;
8776    for candidate in &doc.kids {
8777        if candidate.page_number() == Some(page)
8778            && matches!(
8779                candidate,
8780                ContentElement::Paragraph(_)
8781                    | ContentElement::TextBlock(_)
8782                    | ContentElement::TextLine(_)
8783                    | ContentElement::Heading(_)
8784                    | ContentElement::NumberHeading(_)
8785                    | ContentElement::Caption(_)
8786            )
8787        {
8788            page_top = page_top.max(candidate.bbox().top_y);
8789        }
8790    }
8791    if !page_top.is_finite() || element.bbox().top_y < page_top - 72.0 {
8792        return false;
8793    }
8794
8795    for prior_idx in 0..idx {
8796        let prior = &doc.kids[prior_idx];
8797        let prior_text = extract_element_text(prior);
8798        let prior_trimmed = prior_text.trim();
8799        if prior_trimmed.is_empty()
8800            || is_standalone_page_number(prior_trimmed)
8801            || looks_like_footer_banner(prior_trimmed)
8802        {
8803            continue;
8804        }
8805        match prior {
8806            ContentElement::Paragraph(_)
8807            | ContentElement::TextBlock(_)
8808            | ContentElement::TextLine(_) => {
8809                if !starts_with_caption_prefix(prior_trimmed)
8810                    && !looks_like_top_margin_running_header(doc, prior_idx, prior_trimmed)
8811                {
8812                    return false;
8813                }
8814            }
8815            ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8816                if !should_skip_heading_text(prior_trimmed) {
8817                    return false;
8818                }
8819            }
8820            _ => return false,
8821        }
8822    }
8823
8824    for lookahead_idx in idx + 1..doc.kids.len().min(idx + 8) {
8825        let next = &doc.kids[lookahead_idx];
8826        if next.page_number() != Some(page) {
8827            break;
8828        }
8829        let next_text = extract_element_text(next);
8830        let next_trimmed = next_text.trim();
8831        if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8832            continue;
8833        }
8834
8835        let is_numbered_heading = match next {
8836            ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8837                looks_like_numbered_section(next_trimmed)
8838                    || looks_like_keyword_numbered_section(next_trimmed)
8839            }
8840            ContentElement::Paragraph(_)
8841            | ContentElement::TextBlock(_)
8842            | ContentElement::TextLine(_) => {
8843                should_render_paragraph_as_heading(
8844                    doc,
8845                    lookahead_idx,
8846                    next_trimmed,
8847                    doc.kids.get(lookahead_idx + 1),
8848                ) && (looks_like_numbered_section(next_trimmed)
8849                    || looks_like_keyword_numbered_section(next_trimmed))
8850            }
8851            _ => false,
8852        };
8853
8854        if is_numbered_heading {
8855            return true;
8856        }
8857
8858        if !starts_with_caption_prefix(next_trimmed) && next_trimmed.split_whitespace().count() >= 5
8859        {
8860            return false;
8861        }
8862    }
8863
8864    false
8865}
8866
/// Appends the trimmed `next` to `target`.
///
/// When `target` ends with a hyphen that directly follows a letter and
/// `next` starts with a lowercase character, the hyphen is removed and the
/// two halves of the word are joined. Otherwise a single space is inserted
/// (unless `target` already ends with one) before appending.
fn merge_paragraph_text(target: &mut String, next: &str) {
    let addition = next.trim();

    let joins_hyphenated_word = target.ends_with('-')
        && target.chars().nth_back(1).is_some_and(char::is_alphabetic)
        && addition.starts_with(char::is_lowercase);

    if joins_hyphenated_word {
        // Drop the line-break hyphen.
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(addition);
}
8886
/// Returns `true` when the trimmed text consists of one to four ASCII digits
/// and nothing else.
fn is_standalone_page_number(text: &str) -> bool {
    let digits = text.trim();
    matches!(digits.len(), 1..=4) && digits.bytes().all(|b| b.is_ascii_digit())
}
8891
8892fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
8893    if !is_standalone_page_number(text) {
8894        return false;
8895    }
8896
8897    let bbox = element.bbox();
8898    if bbox.height() > 24.0 {
8899        return false;
8900    }
8901
8902    let Some(page) = element.page_number() else {
8903        return false;
8904    };
8905
8906    let mut page_top = f64::MIN;
8907    let mut page_bottom = f64::MAX;
8908    for candidate in &doc.kids {
8909        if candidate.page_number() == Some(page) {
8910            let candidate_bbox = candidate.bbox();
8911            page_top = page_top.max(candidate_bbox.top_y);
8912            page_bottom = page_bottom.min(candidate_bbox.bottom_y);
8913        }
8914    }
8915
8916    if !page_top.is_finite() || !page_bottom.is_finite() {
8917        return false;
8918    }
8919
8920    bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
8921}
8922
8923/// Check whether a pipeline heading sits in the bottom margin of its page.
8924/// Running footers (e.g. "Report Title 21") are sometimes classified as
8925/// headings by the pipeline.  A heading at the page bottom is very unlikely
8926/// to be a real section heading.
8927fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
8928    let element = &doc.kids[idx];
8929    let bbox = element.bbox();
8930    if bbox.height() > 30.0 {
8931        return false;
8932    }
8933
8934    let Some(page) = element.page_number() else {
8935        return false;
8936    };
8937
8938    let mut page_bottom = f64::MAX;
8939    for candidate in &doc.kids {
8940        if candidate.page_number() == Some(page) {
8941            page_bottom = page_bottom.min(candidate.bbox().bottom_y);
8942        }
8943    }
8944
8945    if !page_bottom.is_finite() {
8946        return false;
8947    }
8948
8949    // If this heading is at the very bottom of the page content, skip it.
8950    bbox.bottom_y <= page_bottom + 24.0
8951}
8952
8953/// Demote a pipeline heading that ends with a period when it doesn't look like
8954/// a genuine section heading (e.g. "United Kingdom." or "New Investment (a Challenger).").
8955/// Returns true when the heading should be rendered as a paragraph instead.
8956fn should_demote_period_heading(text: &str) -> bool {
8957    let trimmed = text.trim();
8958    if !trimmed.ends_with('.') {
8959        return false;
8960    }
8961    // Keep numbered section headings: "I. Introduction", "4.2. Results",
8962    // "Activity 4. Determining CEC…"
8963    if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
8964        return false;
8965    }
8966    // Keep headings whose text without the trailing period still looks like a
8967    // proper title — at least 3 words, first word uppercase, and the period
8968    // is clearly sentence-ending rather than part of a title pattern.
8969    let without_dot = trimmed.trim_end_matches('.');
8970    let word_count = without_dot.split_whitespace().count();
8971    // Very short fragments ending with '.' (like "Kingdom.") are almost
8972    // certainly not headings.
8973    if word_count <= 2 {
8974        return true;
8975    }
8976    false
8977}
8978
/// Report whether heading text ends with a comma once surrounding whitespace
/// is ignored.
///
/// Such text is never a real heading; typical offenders are footnote
/// references like "29 Pope," or "32 Beawes, 33 M.M.,".
fn should_demote_comma_heading(text: &str) -> bool {
    text.trim_end().chars().next_back() == Some(',')
}
8984
/// Report whether heading text contains a mathematical/special symbol that
/// does not occur in real section headings (e.g. "HL ¼", "P ≪ P", "LH þ HL:").
fn should_demote_math_heading(text: &str) -> bool {
    // Symbols whose presence marks the text as a formula fragment.
    const MATH_SYMBOLS: [char; 15] = [
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];

    text.chars().any(|symbol| MATH_SYMBOLS.contains(&symbol))
}
9008
/// Report whether heading text contains a percent sign anywhere.
///
/// Such text is typically a data label rather than a section heading
/// (e.g. "56% AGREE").
fn should_demote_percentage_heading(text: &str) -> bool {
    text.chars().any(|c| c == '%')
}
9014
/// Report whether heading text looks like a bibliography entry: after
/// trimming, four ASCII digits followed by a period and a space
/// (e.g. "2020. Measuring massive multitask...").
///
/// Text shorter than six bytes after trimming never matches, so a bare
/// "2020." is not demoted by this check.
fn should_demote_bibliography_heading(text: &str) -> bool {
    // Work on bytes: the pattern is pure ASCII, so this never splits a
    // multi-byte character.
    let Some(prefix) = text.trim().as_bytes().get(..6) else {
        return false;
    };
    prefix[..4].iter().all(u8::is_ascii_digit) && prefix[4] == b'.' && prefix[5] == b' '
}
9027
/// Remove a trailing standalone page number from heading text.
///
/// "Chapter 3. Numerical differentiation 35" becomes
/// "Chapter 3. Numerical differentiation". The last space-separated token is
/// dropped only when it consists of 1–4 ASCII digits and at least three words
/// remain in front of it; otherwise the trimmed input is returned unchanged.
fn strip_trailing_page_number(text: &str) -> &str {
    let heading = text.trim();
    let Some((body, last_token)) = heading.rsplit_once(' ') else {
        return heading;
    };

    let is_page_number =
        (1..=4).contains(&last_token.len()) && last_token.bytes().all(|b| b.is_ascii_digit());
    if is_page_number && body.split_whitespace().count() >= 3 {
        body.trim()
    } else {
        heading
    }
}
9046
/// Locate the point at which a heading with a merged subsection number should
/// be split.
///
/// For example, "4 Results 4.1 Experimental Details" should become the two
/// headings "4 Results" and "4.1 Experimental Details".
///
/// Returns the byte offset of the first subsection number found, or `None`
/// when no split is needed. A subsection number must be preceded by a space
/// and have one of two shapes:
/// * a run of ASCII digits immediately followed by `.` and a digit ("4.1");
/// * one uppercase ASCII letter immediately followed by `.` and a digit ("B.1").
///
/// The first three bytes are never considered so that the heading's own
/// leading number is skipped.
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    /// True when `rest` begins with "N.N" (digits, dot, digit) or "L.N"
    /// (uppercase letter, dot, digit).
    fn starts_subsection_number(rest: &[u8]) -> bool {
        if let [letter, b'.', digit, ..] = rest {
            if letter.is_ascii_uppercase() && digit.is_ascii_digit() {
                return true;
            }
        }
        // The dot must directly follow the digit run; a dot further along the
        // heading (e.g. in "v1.5") must not turn a plain count into a match.
        let digit_run = rest.iter().take_while(|b| b.is_ascii_digit()).count();
        digit_run > 0
            && matches!(rest.get(digit_run..), Some([b'.', next, ..]) if next.is_ascii_digit())
    }

    let bytes = text.as_bytes();
    // A preceding ASCII space guarantees `i` is a character boundary, so the
    // returned offset is always safe to slice at.
    (3..bytes.len()).find(|&i| bytes[i - 1] == b' ' && starts_subsection_number(&bytes[i..]))
}
9082
9083fn should_skip_heading_text(text: &str) -> bool {
9084    let trimmed = text.trim();
9085    if trimmed.is_empty() || is_standalone_page_number(trimmed) {
9086        return true;
9087    }
9088
9089    let lower = trimmed.to_ascii_lowercase();
9090    if (lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit()))
9091        && trimmed.contains('|')
9092    {
9093        return true;
9094    }
9095
9096    let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
9097    let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
9098    alpha_count == 0 || (alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':'))
9099}
9100
/// Glue together adjacent whitespace-separated tokens that look like two
/// halves of one word.
///
/// Two neighbouring tokens are joined when, after stripping non-alphabetic
/// characters from both ends, each is non-empty and purely alphabetic, at
/// least one is at most 4 bytes long, together they are at least 6 bytes
/// long, the right one does not start with an uppercase letter, and neither
/// is a common stopword. After a join the merged token is compared with its
/// new right neighbour again.
///
/// Input with fewer than two tokens is returned unchanged (whitespace
/// included); otherwise the tokens are re-joined with single spaces.
fn repair_fragmented_words(text: &str) -> String {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];

    /// True when `left` and `right` should be fused into a single word.
    fn is_fragment_pair(left: &str, right: &str) -> bool {
        let left = left.trim_matches(|c: char| !c.is_alphabetic());
        let right = right.trim_matches(|c: char| !c.is_alphabetic());

        if left.is_empty() || right.is_empty() {
            return false;
        }
        if !left.chars().all(char::is_alphabetic) || !right.chars().all(char::is_alphabetic) {
            return false;
        }
        // At least one side must be short, yet the pair must be long enough
        // to plausibly form a word.
        if (left.len() > 4 && right.len() > 4) || left.len() + right.len() < 6 {
            return false;
        }
        if right.chars().next().is_some_and(char::is_uppercase) {
            return false;
        }

        let is_stopword = |word: &str| STOPWORDS.contains(&word.to_ascii_lowercase().as_str());
        !is_stopword(left) && !is_stopword(right)
    }

    let mut words: Vec<String> = text.split_whitespace().map(String::from).collect();
    if words.len() < 2 {
        return text.to_string();
    }

    let mut pos = 0usize;
    while pos + 1 < words.len() {
        if is_fragment_pair(&words[pos], &words[pos + 1]) {
            let tail = words.remove(pos + 1);
            words[pos].push_str(&tail);
        } else {
            pos += 1;
        }
    }

    words.join(" ")
}
9141
9142/// Extract text from list item contents (fallback when label/body tokens are empty).
9143fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
9144    let mut text = String::new();
9145    for elem in contents {
9146        let part = match elem {
9147            ContentElement::Paragraph(p) => p.base.value(),
9148            ContentElement::TextBlock(tb) => tb.value(),
9149            ContentElement::TextLine(tl) => tl.value(),
9150            ContentElement::TextChunk(tc) => tc.value.clone(),
9151            _ => String::new(),
9152        };
9153        if !text.is_empty() && !part.is_empty() {
9154            text.push(' ');
9155        }
9156        text.push_str(&part);
9157    }
9158    text
9159}
9160
/// Report whether a row has at least one blank cell sandwiched between two
/// filled cells.
///
/// A cell is blank when it is empty after trimming. Leading and trailing
/// blank cells do not count as gaps.
fn has_internal_header_gap(row: &[String]) -> bool {
    let mut previous_filled: Option<usize> = None;
    for (idx, cell) in row.iter().enumerate() {
        if cell.trim().is_empty() {
            continue;
        }
        // A jump of more than one column since the last filled cell means
        // blank cells lie in between.
        if previous_filled.is_some_and(|prev| idx - prev > 1) {
            return true;
        }
        previous_filled = Some(idx);
    }
    false
}
9178
/// Fill the blank cells of a grouped header row with the label of the
/// nearest group.
///
/// `parent` is the grouped header row, whose non-blank cells act as anchors.
/// `child` is the row beneath it. For every column where `parent` is blank
/// and `child` is not, the result receives the trimmed text of the closest
/// anchor column; when two anchors are equally close, the one further right
/// wins. All other cells are copied from `parent` unchanged.
///
/// The result always has `parent.len()` cells. Columns of `child` beyond the
/// end of `parent` are ignored. If `parent` has no anchors it is returned as
/// is.
fn expand_grouped_header_row(parent: &[String], child: &[String]) -> Vec<String> {
    let anchor_cols: Vec<usize> = parent
        .iter()
        .enumerate()
        .filter_map(|(idx, cell)| (!cell.trim().is_empty()).then_some(idx))
        .collect();

    let mut expanded = parent.to_vec();
    if anchor_cols.is_empty() {
        return expanded;
    }

    // Zipping bounds the walk by the shorter row, so a child row wider than
    // the parent cannot index past the end of `expanded`.
    for (col_idx, (slot, child_cell)) in expanded.iter_mut().zip(child).enumerate() {
        if !slot.trim().is_empty() || child_cell.trim().is_empty() {
            continue;
        }

        // `min_by_key` keeps the first minimum; scanning the anchors from the
        // right therefore resolves distance ties in favour of the larger index.
        let nearest = anchor_cols
            .iter()
            .rev()
            .copied()
            .min_by_key(|&anchor| anchor.abs_diff(col_idx));
        if let Some(anchor) = nearest {
            *slot = parent[anchor].trim().to_string();
        }
    }

    expanded
}
9209
9210fn preserve_grouped_header_rows(rows: &mut [Vec<String>]) -> bool {
9211    if rows.len() < 2 || rows[0].is_empty() || rows[1].is_empty() {
9212        return false;
9213    }
9214    if rows[0].first().is_none_or(|cell| cell.trim().is_empty()) {
9215        return false;
9216    }
9217    if rows[1].first().is_some_and(|cell| !cell.trim().is_empty()) {
9218        return false;
9219    }
9220
9221    let first_filled = rows[0]
9222        .iter()
9223        .filter(|cell| !cell.trim().is_empty())
9224        .count();
9225    let second_filled = rows[1]
9226        .iter()
9227        .filter(|cell| !cell.trim().is_empty())
9228        .count();
9229    if first_filled < 2 || second_filled <= first_filled || !has_internal_header_gap(&rows[0]) {
9230        return false;
9231    }
9232
9233    rows[0] = expand_grouped_header_row(&rows[0], &rows[1]);
9234    true
9235}
9236
9237/// Merge header continuation rows in a rendered table.
9238///
9239/// When a PDF table has multi-line column headers, each wrapped line often
9240/// produces a separate row in the grid.  These continuation rows have an
9241/// empty first cell while the header row above them has content.  This
9242/// function detects such rows at the start of the table and merges their
9243/// text into the first row, producing a single combined header.
9244///
9245/// Only rows whose non-empty cells are all ≤ 30 characters are merged, to
9246/// avoid accidentally collapsing data rows that happen to have an empty key.
9247fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
9248    if rows.len() < 2 {
9249        return;
9250    }
9251    if preserve_grouped_header_rows(rows) {
9252        return;
9253    }
9254    // The first row must have a non-empty first cell (the header anchor).
9255    if rows[0].first().is_none_or(|c| c.trim().is_empty()) {
9256        return;
9257    }
9258
9259    let mut merge_count = 0usize;
9260    for (i, row_i) in rows.iter().enumerate().skip(1) {
9261        let first_empty = row_i.first().is_none_or(|c| c.trim().is_empty());
9262        if !first_empty {
9263            break; // hit a data row
9264        }
9265        // All non-empty cells must be short (header-like fragments).
9266        let all_short = row_i
9267            .iter()
9268            .all(|c| c.trim().is_empty() || c.trim().len() <= 30);
9269        if !all_short {
9270            break;
9271        }
9272        merge_count = i;
9273    }
9274
9275    // Require at least 2 consecutive continuation rows to avoid merging
9276    // legitimate sub-header or unit rows (e.g. a single row with "cmolc/kg").
9277    if merge_count == 0 {
9278        return;
9279    }
9280
9281    // Merge rows 1..=merge_count into row 0.
9282    for i in 1..=merge_count {
9283        let (head, tail) = rows.split_at_mut(i);
9284        let ncols = head[0].len().min(tail[0].len());
9285        for (target, src) in head[0]
9286            .iter_mut()
9287            .take(ncols)
9288            .zip(tail[0].iter().take(ncols))
9289        {
9290            let fragment = src.trim().to_string();
9291            if !fragment.is_empty() {
9292                let target_str = target.trim().to_string();
9293                *target = if target_str.is_empty() {
9294                    fragment
9295                } else {
9296                    format!("{} {}", target_str, fragment)
9297                };
9298            }
9299        }
9300    }
9301
9302    // Remove the merged rows.
9303    rows.drain(1..=merge_count);
9304}
9305
9306fn trim_leading_table_carryover_rows(rows: &mut Vec<Vec<String>>) {
9307    while first_body_row_looks_like_carryover(rows) {
9308        rows.remove(1);
9309    }
9310}
9311
9312fn first_body_row_looks_like_carryover(rows: &[Vec<String>]) -> bool {
9313    if rows.len() < 3 {
9314        return false;
9315    }
9316
9317    let key_col_count = infer_leading_key_column_count(&rows[1..]);
9318    if key_col_count == 0 {
9319        return false;
9320    }
9321
9322    let candidate = &rows[1];
9323    if candidate
9324        .iter()
9325        .take(key_col_count)
9326        .any(|cell| !cell.trim().is_empty())
9327    {
9328        return false;
9329    }
9330
9331    let non_empty_cols = candidate
9332        .iter()
9333        .enumerate()
9334        .filter(|(_, cell)| !cell.trim().is_empty())
9335        .map(|(idx, _)| idx)
9336        .collect::<Vec<_>>();
9337    if non_empty_cols.len() != 1 {
9338        return false;
9339    }
9340
9341    let only_col = non_empty_cols[0];
9342    if only_col < key_col_count {
9343        return false;
9344    }
9345
9346    if candidate[only_col].split_whitespace().count() < 4 {
9347        return false;
9348    }
9349
9350    rows[2]
9351        .iter()
9352        .take(key_col_count)
9353        .all(|cell| !cell.trim().is_empty())
9354}
9355
/// Count how many leading columns behave like key columns.
///
/// Columns are examined left to right. A column is a key column when at least
/// 60% of the rows have a non-blank cell there and the median word count of
/// those cells (upper median for even counts) is at most three. Counting
/// stops at the first column that fails the test or is blank in every row.
///
/// Returns 0 when fewer than two rows are given.
fn infer_leading_key_column_count(rows: &[Vec<String>]) -> usize {
    if rows.len() < 2 {
        return 0;
    }

    let widest = rows.iter().map(Vec::len).max().unwrap_or(0);
    (0..widest)
        .take_while(|&col| {
            // Word counts of the filled cells in this column; rows that are
            // too short count as blank.
            let mut words_per_cell: Vec<usize> = rows
                .iter()
                .filter_map(|row| row.get(col))
                .map(|cell| cell.trim())
                .filter(|cell| !cell.is_empty())
                .map(|cell| cell.split_whitespace().count())
                .collect();
            if words_per_cell.is_empty() {
                return false;
            }

            words_per_cell.sort_unstable();
            let median_words = words_per_cell[words_per_cell.len() / 2];
            let occupancy_ratio = words_per_cell.len() as f64 / rows.len() as f64;
            occupancy_ratio >= 0.6 && median_words <= 3
        })
        .count()
}
9393
9394/// Render a SemanticTable as a markdown table.
9395fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
9396    // Delegate to render_table_border which handles cross-page linking.
9397    render_table_border(out, &table.table_border);
9398}
9399
/// A contiguous run of top-level document elements together with the output
/// that was pre-rendered for it.
#[derive(Clone, Debug)]
struct GeometricTableRegion {
    /// Index into `doc.kids` of the first element in the region (inclusive).
    start_idx: usize,
    /// Index into `doc.kids` of the last element in the region (inclusive).
    end_idx: usize,
    /// Rendered text for the whole region.
    rendered: String,
}
9406
/// A group of text chunks together with a bounding box.
// NOTE(review): presumably one visual line of text whose `bbox` encloses all
// `chunks`; the constructing code is outside this view, so confirm there.
#[derive(Clone)]
struct ChunkLine {
    /// Bounding box associated with the group.
    bbox: BoundingBox,
    /// The text chunks belonging to the group.
    chunks: Vec<TextChunk>,
}
9412
/// A piece of text tagged with a slot index and a bounding box.
// NOTE(review): `slot_idx` presumably identifies a table column/slot the text
// was assigned to; the code that produces these is outside this view.
#[derive(Clone)]
struct SlotFragment {
    /// Index of the slot this fragment belongs to.
    slot_idx: usize,
    /// Bounding box associated with the fragment.
    bbox: BoundingBox,
    /// The fragment's text.
    text: String,
}
9419
9420fn detect_geometric_table_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9421    let mut regions = Vec::new();
9422    let mut occupied_until = 0usize;
9423
9424    for (idx, element) in doc.kids.iter().enumerate() {
9425        if idx < occupied_until {
9426            continue;
9427        }
9428
9429        let Some(table) = table_border_from_element(element) else {
9430            continue;
9431        };
9432        let Some(region) = build_geometric_table_region(doc, idx, table) else {
9433            continue;
9434        };
9435        occupied_until = region.end_idx.saturating_add(1);
9436        regions.push(region);
9437    }
9438
9439    let mut occupied = regions
9440        .iter()
9441        .flat_map(|region| region.start_idx..=region.end_idx)
9442        .collect::<HashSet<_>>();
9443    for region in detect_footnote_citation_regions(doc) {
9444        if (region.start_idx..=region.end_idx).any(|idx| occupied.contains(&idx)) {
9445            continue;
9446        }
9447        occupied.extend(region.start_idx..=region.end_idx);
9448        regions.push(region);
9449    }
9450
9451    regions.sort_by_key(|region| region.start_idx);
9452    regions
9453}
9454
9455fn detect_footnote_citation_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9456    let body_font_size = compute_running_body_font_size(doc);
9457    if body_font_size <= 0.0 {
9458        return Vec::new();
9459    }
9460
9461    let mut regions = Vec::new();
9462    let mut idx = 0usize;
9463    while idx < doc.kids.len() {
9464        let Some(region) = build_footnote_citation_region(doc, idx, body_font_size) else {
9465            idx += 1;
9466            continue;
9467        };
9468        idx = region.end_idx.saturating_add(1);
9469        regions.push(region);
9470    }
9471
9472    regions
9473}
9474
9475fn compute_running_body_font_size(doc: &PdfDocument) -> f64 {
9476    doc.kids
9477        .iter()
9478        .filter_map(|element| {
9479            let ContentElement::Paragraph(paragraph) = element else {
9480                return None;
9481            };
9482            let text = paragraph.base.value();
9483            (text.split_whitespace().count() > 10).then_some(paragraph.base.font_size?)
9484        })
9485        .fold(0.0_f64, f64::max)
9486}
9487
/// Try to build a footnote citation table starting at `doc.kids[start_idx]`.
///
/// A region is a run of small-font text elements on one page and in one
/// column whose text parses into numbered footnote rows. It is rendered as a
/// two-column HTML table ("Footnote" / "Citation"), optionally preceded by
/// the body text that led into the first footnote.
///
/// `body_font_size` is the reference size of running body text; an element is
/// "small" when its font size is at most `min(0.92 * body, body - 0.8)`
/// (never below zero).
///
/// Returns `None` when the start element does not qualify, when fewer than
/// three rows are parsed, when any marker is not a number, or when more than
/// one step between consecutive markers is not `+1`.
fn build_footnote_citation_region(
    doc: &PdfDocument,
    start_idx: usize,
    body_font_size: f64,
) -> Option<GeometricTableRegion> {
    let element = doc.kids.get(start_idx)?;
    if !is_geometric_text_candidate(element) {
        return None;
    }

    let start_text = extract_element_text(element);
    let trimmed_start = start_text.trim();
    if trimmed_start.is_empty() {
        return None;
    }

    let small_font_threshold = (body_font_size * 0.92).min(body_font_size - 0.8).max(0.0);
    // Body text preceding the first footnote marker, if any was found.
    let mut lead_prefix = None;
    // Raw text pieces, in reading order, that will be parsed into rows.
    let mut fragments = Vec::new();
    let page_number = element.page_number()?;
    // Running union of the boxes accepted so far; used for the column test.
    let mut column_bbox = element.bbox().clone();
    let mut region_start_idx = start_idx;
    let mut end_idx = start_idx;

    if element_font_size(element).is_some_and(|font_size| font_size <= small_font_threshold)
        && starts_with_footnote_marker(trimmed_start)
    {
        // Case 1: the start element is itself a small-font footnote line.
        // Look backwards for an earlier element whose tail begins the
        // footnotes, and extend the region to it when found.
        if let Some((attach_idx, prefix, leading_fragments)) = leading_footnote_attachment(
            doc,
            start_idx,
            page_number,
            &column_bbox,
            small_font_threshold,
        ) {
            lead_prefix = Some(prefix);
            fragments.extend(leading_fragments);
            region_start_idx = attach_idx;
        }
        fragments.push(footnote_fragment_text(element));
    } else {
        // Case 2: the start element ends with the first footnote marker. It
        // must be followed directly by a small-font element on the same page
        // and in the same column.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed_start)?;
        let next = doc.kids.get(start_idx + 1)?;
        if !is_geometric_text_candidate(next)
            || next.page_number() != Some(page_number)
            || !element_font_size(next).is_some_and(|font_size| font_size <= small_font_threshold)
        {
            return None;
        }
        if !same_column_region(&column_bbox, next.bbox()) {
            return None;
        }
        lead_prefix = Some(prefix);
        fragments.push(first_tail);
    }

    // Absorb the following small-font elements until the run is broken.
    let mut consecutive_small = 0usize;
    for idx in start_idx + 1..doc.kids.len() {
        let candidate = &doc.kids[idx];
        if !is_geometric_text_candidate(candidate) || candidate.page_number() != Some(page_number) {
            break;
        }

        let candidate_text = extract_element_text(candidate);
        let trimmed = candidate_text.trim();
        if trimmed.is_empty() || starts_with_caption_prefix(trimmed) {
            break;
        }

        let Some(font_size) = element_font_size(candidate) else {
            break;
        };
        if font_size > small_font_threshold {
            break;
        }
        if !same_column_region(&column_bbox, candidate.bbox()) {
            break;
        }

        column_bbox = column_bbox.union(candidate.bbox());
        fragments.push(footnote_fragment_text(candidate));
        consecutive_small += 1;
        end_idx = idx;
    }

    // A lead-in without any following footnote element is not a region.
    if consecutive_small == 0 && lead_prefix.is_some() {
        return None;
    }

    let rows = parse_footnote_citation_rows(&fragments);
    if rows.len() < 3 {
        return None;
    }

    // Every marker must be numeric ...
    let numeric_markers = rows
        .iter()
        .filter_map(|(marker, _)| marker.parse::<u32>().ok())
        .collect::<Vec<_>>();
    if numeric_markers.len() != rows.len() {
        return None;
    }
    // ... and the sequence may contain at most one step that is not +1.
    let sequential_steps = numeric_markers
        .windows(2)
        .filter(|pair| pair[1] == pair[0] + 1)
        .count();
    if sequential_steps + 1 < rows.len().saturating_sub(1) {
        return None;
    }

    let mut rendered_rows = vec![vec!["Footnote".to_string(), "Citation".to_string()]];
    rendered_rows.extend(
        rows.into_iter()
            .map(|(marker, citation)| vec![marker, citation]),
    );

    // The lead-in text, when present, is emitted as its own paragraph above
    // the table.
    let mut rendered = String::new();
    if let Some(prefix) = lead_prefix {
        rendered.push_str(&escape_md_line_start(prefix.trim()));
        rendered.push_str("\n\n");
    }
    rendered.push_str(&render_html_table(&rendered_rows));

    Some(GeometricTableRegion {
        start_idx: region_start_idx,
        end_idx,
        rendered,
    })
}
9615
9616fn leading_footnote_attachment(
9617    doc: &PdfDocument,
9618    start_idx: usize,
9619    page_number: u32,
9620    column_bbox: &BoundingBox,
9621    small_font_threshold: f64,
9622) -> Option<(usize, String, Vec<String>)> {
9623    let mut idx = start_idx.checked_sub(1)?;
9624    let mut leading_fragments = Vec::new();
9625    let mut scanned = 0usize;
9626
9627    loop {
9628        let candidate = doc.kids.get(idx)?;
9629        scanned += 1;
9630        if scanned > 6 || candidate.page_number() != Some(page_number) {
9631            return None;
9632        }
9633
9634        if !is_geometric_text_candidate(candidate) {
9635            if idx == 0 {
9636                return None;
9637            }
9638            idx -= 1;
9639            continue;
9640        }
9641
9642        let text = extract_element_text(candidate);
9643        let trimmed = text.trim();
9644        if trimmed.is_empty() {
9645            if idx == 0 {
9646                return None;
9647            }
9648            idx -= 1;
9649            continue;
9650        }
9651        if !same_column_region(candidate.bbox(), column_bbox) {
9652            return None;
9653        }
9654
9655        if element_font_size(candidate).is_some_and(|font_size| font_size <= small_font_threshold) {
9656            leading_fragments.push(footnote_fragment_text(candidate));
9657            if idx == 0 {
9658                return None;
9659            }
9660            idx -= 1;
9661            continue;
9662        }
9663
9664        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed)?;
9665        leading_fragments.push(first_tail);
9666        leading_fragments.reverse();
9667        return Some((idx, prefix, leading_fragments));
9668    }
9669}
9670
9671fn parse_footnote_citation_rows(fragments: &[String]) -> Vec<(String, String)> {
9672    let mut rows = Vec::new();
9673    let mut current_marker = None::<String>;
9674    let mut current_citation = String::new();
9675
9676    for fragment in fragments {
9677        let markers = find_footnote_marker_positions(fragment);
9678        if markers.is_empty() {
9679            if current_marker.is_some() {
9680                merge_paragraph_text(&mut current_citation, fragment.trim());
9681            }
9682            continue;
9683        }
9684
9685        let mut cursor = 0usize;
9686        for (pos, marker, skip_len) in markers {
9687            let prefix = fragment[cursor..pos].trim();
9688            if current_marker.is_some() && !prefix.is_empty() {
9689                merge_paragraph_text(&mut current_citation, prefix);
9690            }
9691            if let Some(marker_value) = current_marker.take() {
9692                let trimmed = current_citation.trim();
9693                if !trimmed.is_empty() {
9694                    rows.push((marker_value, trimmed.to_string()));
9695                }
9696                current_citation.clear();
9697            }
9698            current_marker = Some(marker);
9699            cursor = pos + skip_len;
9700        }
9701
9702        let tail = fragment[cursor..].trim();
9703        if current_marker.is_some() && !tail.is_empty() {
9704            merge_paragraph_text(&mut current_citation, tail);
9705        }
9706    }
9707
9708    if let Some(marker_value) = current_marker {
9709        let trimmed = current_citation.trim();
9710        if !trimmed.is_empty() {
9711            rows.push((marker_value, trimmed.to_string()));
9712        }
9713    }
9714
9715    rebalance_adjacent_footnote_citations(&mut rows);
9716    rows
9717}
9718
9719fn rebalance_adjacent_footnote_citations(rows: &mut [(String, String)]) {
9720    for idx in 0..rows.len().saturating_sub(1) {
9721        if !rows[idx].1.trim_end().ends_with(',') {
9722            continue;
9723        }
9724
9725        let next = rows[idx + 1].1.trim().to_string();
9726        let Some((stub, remainder)) = split_leading_citation_stub(&next) else {
9727            continue;
9728        };
9729        let Some((first_sentence, trailing)) = split_first_sentence(remainder) else {
9730            continue;
9731        };
9732        if first_sentence.split_whitespace().count() < 2 {
9733            continue;
9734        }
9735
9736        merge_paragraph_text(&mut rows[idx].1, first_sentence);
9737        rows[idx + 1].1 = if trailing.is_empty() {
9738            stub.to_string()
9739        } else {
9740            format!("{stub} {trailing}")
9741        };
9742    }
9743}
9744
/// Split text at its first comma into a short leading stub (comma included)
/// and the remainder.
///
/// Returns `None` when there is no comma, when the comma lies more than eight
/// bytes into the text, or when the remainder is blank. Both parts are
/// trimmed.
fn split_leading_citation_stub(text: &str) -> Option<(&str, &str)> {
    let (before_comma, after_comma) = text.split_once(',')?;
    if before_comma.len() > 8 {
        return None;
    }

    // Keep the comma on the stub: slice one byte past `before_comma`.
    let stub = text[..=before_comma.len()].trim();
    let remainder = after_comma.trim();
    if stub.is_empty() || remainder.is_empty() {
        None
    } else {
        Some((stub, remainder))
    }
}
9754
/// Split text after its first ". " into the first sentence (period included)
/// and the trimmed rest.
///
/// Returns `None` when the text contains no period followed by a space.
fn split_first_sentence(text: &str) -> Option<(&str, &str)> {
    let (before_period, rest) = text.split_once(". ")?;

    // Re-slice the input so the period stays attached to the sentence.
    let sentence = text[..=before_period.len()].trim();
    if sentence.is_empty() {
        return None;
    }
    Some((sentence, rest.trim()))
}
9761
/// Find all footnote markers in a piece of text.
///
/// A marker is a run of one or two ASCII digits that
/// * sits at the start of the text or directly after whitespace or one of
///   `. , ; : ) ] " ' ”`;
/// * is followed by at least one whitespace character; and
/// * is then followed by an ASCII uppercase letter or one of `(`, `[`, `*`.
///
/// Each result is `(byte offset of the marker, marker digits, number of bytes
/// from the marker start to the first character of the citation text)`.
/// Results are in text order.
fn find_footnote_marker_positions(text: &str) -> Vec<(usize, String, usize)> {
    let chars: Vec<(usize, char)> = text.char_indices().collect();

    // Try to read a marker beginning at character index `start`. On success
    // yields the character index at which scanning should resume together
    // with the marker tuple.
    let marker_at = |start: usize| -> Option<(usize, (usize, String, usize))> {
        let (start_byte, first) = chars[start];
        if !first.is_ascii_digit() {
            return None;
        }
        if start > 0 {
            let previous = chars[start - 1].1;
            let at_boundary = previous.is_whitespace()
                || matches!(
                    previous,
                    '.' | ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”'
                );
            if !at_boundary {
                return None;
            }
        }

        let digit_count = chars[start..]
            .iter()
            .take_while(|(_, c)| c.is_ascii_digit())
            .count();
        if digit_count > 2 {
            return None;
        }
        // The digits must be followed by whitespace, not by the end of text.
        let digits_end = start + digit_count;
        let &(digits_end_byte, after_digits) = chars.get(digits_end)?;
        if !after_digits.is_whitespace() {
            return None;
        }

        let blank_count = chars[digits_end..]
            .iter()
            .take_while(|(_, c)| c.is_whitespace())
            .count();
        let resume = digits_end + blank_count;
        let &(resume_byte, citation_start) = chars.get(resume)?;
        if !(citation_start.is_ascii_uppercase() || matches!(citation_start, '(' | '[' | '*')) {
            return None;
        }

        let digits = text[start_byte..digits_end_byte].to_string();
        Some((resume, (start_byte, digits, resume_byte - start_byte)))
    };

    let mut markers = Vec::new();
    let mut idx = 0usize;
    while idx < chars.len() {
        match marker_at(idx) {
            Some((resume, marker)) => {
                markers.push(marker);
                idx = resume;
            }
            None => idx += 1,
        }
    }

    markers
}
9822
9823fn split_trailing_footnote_lead(text: &str) -> Option<(String, String)> {
9824    let markers = find_footnote_marker_positions(text);
9825    let (pos, marker, skip_len) = markers.last()?.clone();
9826    let prefix = text[..pos].trim();
9827    let tail = text[pos + skip_len..].trim();
9828    if prefix.split_whitespace().count() < 6 || tail.split_whitespace().count() > 6 {
9829        return None;
9830    }
9831    Some((prefix.to_string(), format!("{marker} {tail}")))
9832}
9833
9834fn starts_with_footnote_marker(text: &str) -> bool {
9835    find_footnote_marker_positions(text)
9836        .first()
9837        .is_some_and(|(pos, _, _)| *pos == 0)
9838}
9839
9840fn same_column_region(left: &BoundingBox, right: &BoundingBox) -> bool {
9841    let overlap = (left.right_x.min(right.right_x) - left.left_x.max(right.left_x)).max(0.0);
9842    let min_width = left.width().min(right.width()).max(1.0);
9843    overlap / min_width >= 0.35 || (left.left_x - right.left_x).abs() <= 28.0
9844}
9845
9846fn footnote_fragment_text(element: &ContentElement) -> String {
9847    let text = extract_element_text(element);
9848    if element_font_name(element)
9849        .as_deref()
9850        .is_some_and(|name| name.to_ascii_lowercase().contains("italic"))
9851    {
9852        format!("*{}*", text.trim())
9853    } else {
9854        text
9855    }
9856}
9857
9858fn element_font_size(element: &ContentElement) -> Option<f64> {
9859    match element {
9860        ContentElement::Paragraph(p) => p.base.font_size,
9861        ContentElement::Heading(h) => h.base.base.font_size,
9862        ContentElement::NumberHeading(nh) => nh.base.base.base.font_size,
9863        ContentElement::TextBlock(tb) => Some(tb.font_size),
9864        ContentElement::TextLine(tl) => Some(tl.font_size),
9865        _ => None,
9866    }
9867}
9868
9869fn element_font_name(element: &ContentElement) -> Option<String> {
9870    match element {
9871        ContentElement::Paragraph(p) => p.base.font_name.clone(),
9872        ContentElement::Heading(h) => h.base.base.font_name.clone(),
9873        ContentElement::NumberHeading(nh) => nh.base.base.base.font_name.clone(),
9874        _ => None,
9875    }
9876}
9877
9878fn table_border_from_element(
9879    element: &ContentElement,
9880) -> Option<&crate::models::table::TableBorder> {
9881    match element {
9882        ContentElement::TableBorder(table) => Some(table),
9883        ContentElement::Table(table) => Some(&table.table_border),
9884        _ => None,
9885    }
9886}
9887
9888fn build_geometric_table_region(
9889    doc: &PdfDocument,
9890    table_idx: usize,
9891    table: &crate::models::table::TableBorder,
9892) -> Option<GeometricTableRegion> {
9893    let mut table_rows = collect_table_border_rows(table);
9894    if table_rows.is_empty() || table.num_columns < 3 {
9895        return None;
9896    }
9897    merge_continuation_rows(&mut table_rows);
9898
9899    let column_ranges = table_column_ranges(table)?;
9900    let candidate_indices = collect_table_header_candidate_indices(doc, table_idx, table);
9901    if candidate_indices.is_empty() {
9902        return None;
9903    }
9904
9905    let needs_external_stub =
9906        infer_left_stub_requirement(doc, &candidate_indices, &table_rows, &column_ranges);
9907    let supports_embedded_stub_header =
9908        supports_embedded_stub_header(&table_rows, &column_ranges, doc, &candidate_indices);
9909    if !needs_external_stub && !supports_embedded_stub_header {
9910        return None;
9911    }
9912    let slot_ranges = if needs_external_stub {
9913        slot_ranges(&column_ranges, doc, &candidate_indices, true)?
9914    } else {
9915        column_ranges.clone()
9916    };
9917    let mut header_rows = reconstruct_aligned_rows(doc, &candidate_indices, &slot_ranges, true, 2);
9918    if header_rows.is_empty() {
9919        return None;
9920    }
9921    if needs_external_stub {
9922        normalize_leading_stub_header(&mut header_rows);
9923    } else {
9924        promote_embedded_stub_header(&mut header_rows, &table_rows);
9925    }
9926
9927    let slot_count = slot_ranges.len();
9928    let dense_header_rows = header_rows
9929        .iter()
9930        .filter(|row| {
9931            row.iter().filter(|cell| !cell.trim().is_empty()).count()
9932                >= slot_count.saturating_sub(1).max(2)
9933        })
9934        .count();
9935    if dense_header_rows == 0 {
9936        return None;
9937    }
9938
9939    let mut combined_rows = Vec::new();
9940    combined_rows.extend(header_rows);
9941
9942    let following_indices = collect_table_footer_candidate_indices(doc, table_idx, table);
9943    let body_rows = if needs_external_stub && should_merge_panel_body_rows(&table_rows) {
9944        let trailing_rows =
9945            reconstruct_aligned_rows(doc, &following_indices, &slot_ranges, false, 1);
9946        vec![merge_panel_body_row(
9947            &table_rows,
9948            &trailing_rows,
9949            slot_count,
9950        )]
9951    } else if needs_external_stub {
9952        table_rows
9953            .iter()
9954            .map(|row| {
9955                let mut shifted = vec![String::new()];
9956                shifted.extend(row.iter().cloned());
9957                shifted
9958            })
9959            .collect()
9960    } else {
9961        table_rows
9962    };
9963
9964    if body_rows.is_empty() {
9965        return None;
9966    }
9967    combined_rows.extend(body_rows);
9968
9969    let rendered = render_pipe_rows(&combined_rows);
9970    Some(GeometricTableRegion {
9971        start_idx: candidate_indices[0],
9972        end_idx: following_indices.last().copied().unwrap_or(table_idx),
9973        rendered,
9974    })
9975}
9976
9977fn table_column_ranges(table: &crate::models::table::TableBorder) -> Option<Vec<(f64, f64)>> {
9978    if table.num_columns == 0 {
9979        return None;
9980    }
9981
9982    let mut ranges = vec![(f64::INFINITY, f64::NEG_INFINITY); table.num_columns];
9983    for row in &table.rows {
9984        for cell in &row.cells {
9985            if cell.col_number >= table.num_columns {
9986                continue;
9987            }
9988            let range = &mut ranges[cell.col_number];
9989            range.0 = range.0.min(cell.bbox.left_x);
9990            range.1 = range.1.max(cell.bbox.right_x);
9991        }
9992    }
9993
9994    if ranges
9995        .iter()
9996        .any(|(left, right)| !left.is_finite() || !right.is_finite() || right <= left)
9997    {
9998        return None;
9999    }
10000
10001    Some(ranges)
10002}
10003
10004fn collect_table_header_candidate_indices(
10005    doc: &PdfDocument,
10006    table_idx: usize,
10007    table: &crate::models::table::TableBorder,
10008) -> Vec<usize> {
10009    let mut indices = Vec::new();
10010    let table_page = table.bbox.page_number;
10011    let table_top = table.bbox.top_y;
10012    let mut cursor = table_idx;
10013
10014    while let Some(prev_idx) = cursor.checked_sub(1) {
10015        let element = &doc.kids[prev_idx];
10016        if element.page_number() != table_page {
10017            break;
10018        }
10019        if !is_geometric_text_candidate(element) {
10020            break;
10021        }
10022
10023        let bbox = element.bbox();
10024        let vertical_gap = bbox.bottom_y - table_top;
10025        if !(-6.0..=260.0).contains(&vertical_gap) {
10026            break;
10027        }
10028
10029        indices.push(prev_idx);
10030        cursor = prev_idx;
10031        if indices.len() >= 10 {
10032            break;
10033        }
10034    }
10035
10036    indices.reverse();
10037    indices
10038}
10039
10040fn collect_table_footer_candidate_indices(
10041    doc: &PdfDocument,
10042    table_idx: usize,
10043    table: &crate::models::table::TableBorder,
10044) -> Vec<usize> {
10045    let mut indices = Vec::new();
10046    let table_page = table.bbox.page_number;
10047    let table_bottom = table.bbox.bottom_y;
10048
10049    for idx in table_idx + 1..doc.kids.len() {
10050        let element = &doc.kids[idx];
10051        if element.page_number() != table_page {
10052            break;
10053        }
10054        if !is_geometric_text_candidate(element) {
10055            break;
10056        }
10057        if looks_like_margin_page_number(doc, element, &extract_element_text(element)) {
10058            break;
10059        }
10060
10061        let bbox = element.bbox();
10062        let gap = table_bottom - bbox.top_y;
10063        if !(-6.0..=28.0).contains(&gap) {
10064            break;
10065        }
10066        indices.push(idx);
10067        if indices.len() >= 4 {
10068            break;
10069        }
10070    }
10071
10072    indices
10073}
10074
10075fn is_geometric_text_candidate(element: &ContentElement) -> bool {
10076    matches!(
10077        element,
10078        ContentElement::Paragraph(_)
10079            | ContentElement::Heading(_)
10080            | ContentElement::NumberHeading(_)
10081            | ContentElement::TextBlock(_)
10082            | ContentElement::TextLine(_)
10083    )
10084}
10085
10086fn infer_left_stub_requirement(
10087    doc: &PdfDocument,
10088    candidate_indices: &[usize],
10089    table_rows: &[Vec<String>],
10090    column_ranges: &[(f64, f64)],
10091) -> bool {
10092    if column_ranges.is_empty() {
10093        return false;
10094    }
10095
10096    let first_width = (column_ranges[0].1 - column_ranges[0].0).max(1.0);
10097    let has_left_label = candidate_indices.iter().any(|idx| {
10098        let bbox = doc.kids[*idx].bbox();
10099        bbox.right_x <= column_ranges[0].0 + first_width * 0.12
10100            && bbox.width() <= first_width * 0.45
10101    });
10102    if !has_left_label {
10103        return false;
10104    }
10105
10106    let mut first_col_word_counts: Vec<usize> = table_rows
10107        .iter()
10108        .filter_map(|row| row.first())
10109        .map(|cell| cell.split_whitespace().count())
10110        .collect();
10111    if first_col_word_counts.is_empty() {
10112        return false;
10113    }
10114    first_col_word_counts.sort_unstable();
10115    let median = first_col_word_counts[first_col_word_counts.len() / 2];
10116    median >= 5
10117}
10118
10119fn supports_embedded_stub_header(
10120    table_rows: &[Vec<String>],
10121    column_ranges: &[(f64, f64)],
10122    doc: &PdfDocument,
10123    candidate_indices: &[usize],
10124) -> bool {
10125    if table_rows.len() < 2 || column_ranges.len() < 3 {
10126        return false;
10127    }
10128
10129    let first_row = &table_rows[0];
10130    if first_row.len() != column_ranges.len() || first_row[0].trim().is_empty() {
10131        return false;
10132    }
10133    if first_row[0].split_whitespace().count() > 3 || first_row[0].trim().len() > 24 {
10134        return false;
10135    }
10136
10137    let data_fill = first_row
10138        .iter()
10139        .skip(1)
10140        .filter(|cell| !cell.trim().is_empty())
10141        .count();
10142    if data_fill + 1 < column_ranges.len() {
10143        return false;
10144    }
10145
10146    let labeled_rows = table_rows
10147        .iter()
10148        .skip(1)
10149        .filter(|row| row.first().is_some_and(|cell| !cell.trim().is_empty()))
10150        .count();
10151    if labeled_rows == 0 {
10152        return false;
10153    }
10154
10155    let slot_ranges = column_ranges.to_vec();
10156    let header_rows = reconstruct_aligned_rows(doc, candidate_indices, &slot_ranges, true, 2);
10157    header_rows.iter().any(|row| {
10158        row.first().is_none_or(|cell| cell.trim().is_empty())
10159            && row
10160                .iter()
10161                .skip(1)
10162                .filter(|cell| !cell.trim().is_empty())
10163                .count()
10164                >= column_ranges.len().saturating_sub(1)
10165    })
10166}
10167
10168fn slot_ranges(
10169    column_ranges: &[(f64, f64)],
10170    doc: &PdfDocument,
10171    candidate_indices: &[usize],
10172    needs_stub: bool,
10173) -> Option<Vec<(f64, f64)>> {
10174    let mut slots = Vec::new();
10175    if needs_stub {
10176        let first_left = column_ranges.first()?.0;
10177        let left_stub_start = candidate_indices
10178            .iter()
10179            .map(|idx| doc.kids[*idx].bbox().left_x)
10180            .fold(first_left, f64::min);
10181        let stub_right = first_left - 1.0;
10182        if stub_right <= left_stub_start {
10183            return None;
10184        }
10185        slots.push((left_stub_start, stub_right));
10186    }
10187    slots.extend(column_ranges.iter().copied());
10188    Some(slots)
10189}
10190
10191fn reconstruct_aligned_rows(
10192    doc: &PdfDocument,
10193    candidate_indices: &[usize],
10194    slot_ranges: &[(f64, f64)],
10195    drop_wide_singletons: bool,
10196    min_filled_slots: usize,
10197) -> Vec<Vec<String>> {
10198    if candidate_indices.is_empty() || slot_ranges.is_empty() {
10199        return Vec::new();
10200    }
10201
10202    let mut row_bands: Vec<(BoundingBox, Vec<String>)> = Vec::new();
10203
10204    for idx in candidate_indices {
10205        for line in extract_chunk_lines(&doc.kids[*idx]) {
10206            let fragments = split_line_into_slot_fragments(&line, slot_ranges);
10207            if fragments.is_empty() {
10208                continue;
10209            }
10210
10211            if drop_wide_singletons && fragments.len() == 1 {
10212                let only = &fragments[0];
10213                let span_width = only.bbox.width();
10214                let table_width =
10215                    slot_ranges.last().map(|(_, right)| *right).unwrap_or(0.0) - slot_ranges[0].0;
10216                if span_width >= table_width * 0.55 {
10217                    continue;
10218                }
10219            }
10220
10221            let line_center = line.bbox.center_y();
10222            let tolerance = line
10223                .chunks
10224                .iter()
10225                .map(|chunk| chunk.font_size)
10226                .fold(8.0, f64::max)
10227                * 0.8;
10228
10229            let mut target_row = None;
10230            for (row_idx, (bbox, _)) in row_bands.iter().enumerate() {
10231                if (bbox.center_y() - line_center).abs() <= tolerance {
10232                    target_row = Some(row_idx);
10233                    break;
10234                }
10235            }
10236
10237            if let Some(row_idx) = target_row {
10238                let (bbox, cells) = &mut row_bands[row_idx];
10239                *bbox = bbox.union(&line.bbox);
10240                for fragment in fragments {
10241                    append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10242                }
10243            } else {
10244                let mut cells = vec![String::new(); slot_ranges.len()];
10245                for fragment in fragments {
10246                    append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10247                }
10248                row_bands.push((line.bbox.clone(), cells));
10249            }
10250        }
10251    }
10252
10253    row_bands.sort_by(|left, right| {
10254        right
10255            .0
10256            .top_y
10257            .partial_cmp(&left.0.top_y)
10258            .unwrap_or(std::cmp::Ordering::Equal)
10259    });
10260
10261    row_bands
10262        .into_iter()
10263        .map(|(_, cells)| cells)
10264        .filter(|cells| {
10265            let filled = cells.iter().filter(|cell| !cell.trim().is_empty()).count();
10266            filled >= min_filled_slots
10267        })
10268        .collect()
10269}
10270
10271fn extract_chunk_lines(element: &ContentElement) -> Vec<ChunkLine> {
10272    match element {
10273        ContentElement::Paragraph(p) => chunk_lines_from_semantic_node(&p.base),
10274        ContentElement::Heading(h) => chunk_lines_from_semantic_node(&h.base.base),
10275        ContentElement::NumberHeading(nh) => chunk_lines_from_semantic_node(&nh.base.base.base),
10276        ContentElement::TextBlock(tb) => tb
10277            .text_lines
10278            .iter()
10279            .map(|line| ChunkLine {
10280                bbox: line.bbox.clone(),
10281                chunks: line.text_chunks.clone(),
10282            })
10283            .collect(),
10284        ContentElement::TextLine(tl) => vec![ChunkLine {
10285            bbox: tl.bbox.clone(),
10286            chunks: tl.text_chunks.clone(),
10287        }],
10288        _ => Vec::new(),
10289    }
10290}
10291
10292fn chunk_lines_from_semantic_node(node: &SemanticTextNode) -> Vec<ChunkLine> {
10293    let mut lines = Vec::new();
10294    for column in &node.columns {
10295        for block in &column.text_blocks {
10296            for line in &block.text_lines {
10297                lines.push(ChunkLine {
10298                    bbox: line.bbox.clone(),
10299                    chunks: line.text_chunks.clone(),
10300                });
10301            }
10302        }
10303    }
10304    lines
10305}
10306
10307fn split_line_into_slot_fragments(
10308    line: &ChunkLine,
10309    slot_ranges: &[(f64, f64)],
10310) -> Vec<SlotFragment> {
10311    let mut groups: Vec<(usize, Vec<TextChunk>, BoundingBox)> = Vec::new();
10312
10313    for chunk in line
10314        .chunks
10315        .iter()
10316        .filter(|chunk| !chunk.value.trim().is_empty())
10317        .cloned()
10318    {
10319        let slot_idx = assign_chunk_to_slot(&chunk.bbox, slot_ranges);
10320        if let Some((prev_slot, prev_chunks, prev_bbox)) = groups.last_mut() {
10321            let gap = chunk.bbox.left_x - prev_bbox.right_x;
10322            if *prev_slot == slot_idx && gap <= chunk.font_size.max(6.0) * 2.4 {
10323                *prev_bbox = prev_bbox.union(&chunk.bbox);
10324                prev_chunks.push(chunk);
10325                continue;
10326            }
10327        }
10328        groups.push((slot_idx, vec![chunk.clone()], chunk.bbox.clone()));
10329    }
10330
10331    groups
10332        .into_iter()
10333        .filter_map(|(slot_idx, chunks, bbox)| {
10334            let text = normalize_common_ocr_text(
10335                &crate::models::text::TextLine::concatenate_chunks(&chunks),
10336            );
10337            if text.trim().is_empty() {
10338                None
10339            } else {
10340                Some(SlotFragment {
10341                    slot_idx,
10342                    bbox,
10343                    text,
10344                })
10345            }
10346        })
10347        .collect()
10348}
10349
10350fn assign_chunk_to_slot(bbox: &BoundingBox, slot_ranges: &[(f64, f64)]) -> usize {
10351    let mut best_idx = 0usize;
10352    let mut best_overlap = f64::NEG_INFINITY;
10353    let center_x = bbox.center_x();
10354
10355    for (idx, (left, right)) in slot_ranges.iter().enumerate() {
10356        let overlap = (bbox.right_x.min(*right) - bbox.left_x.max(*left)).max(0.0);
10357        let score = if overlap > 0.0 {
10358            overlap / bbox.width().max(1.0)
10359        } else {
10360            -((center_x - ((*left + *right) / 2.0)).abs())
10361        };
10362        if score > best_overlap {
10363            best_overlap = score;
10364            best_idx = idx;
10365        }
10366    }
10367
10368    best_idx
10369}
10370
/// Appends the trimmed `fragment` to `cell`, inserting a single space first
/// when the cell already has content. Blank fragments leave the cell untouched.
fn append_cell_text(cell: &mut String, fragment: &str) {
    match (cell.is_empty(), fragment.trim()) {
        (_, "") => {}
        (true, piece) => cell.push_str(piece),
        (false, piece) => {
            cell.push(' ');
            cell.push_str(piece);
        }
    }
}
10381
/// Moves a stub-column label from the second header row up into the first.
///
/// Applies only when there are at least two non-empty rows, the first row's
/// leading cell is blank, the second row's leading cell is not, and both rows
/// have at least two filled cells after the leading one. The label is stored
/// trimmed in the first row and cleared from the second.
fn normalize_leading_stub_header(rows: &mut [Vec<String>]) {
    /// Number of non-blank cells after the leading one.
    fn filled_data_cells(row: &[String]) -> usize {
        row.iter()
            .skip(1)
            .filter(|cell| !cell.trim().is_empty())
            .count()
    }

    let [top, second, ..] = rows else {
        return;
    };
    if top.is_empty() || second.is_empty() {
        return;
    }

    let top_has_label = !top[0].trim().is_empty();
    let second_lacks_label = second[0].trim().is_empty();
    if top_has_label || second_lacks_label {
        return;
    }

    if filled_data_cells(top.as_slice()) < 2 || filled_data_cells(second.as_slice()) < 2 {
        return;
    }

    top[0] = second[0].trim().to_string();
    second[0].clear();
}
10408
/// Copies the stub-column label from the first body row into the blank leading
/// cell of the first header row.
///
/// Nothing happens unless all of the following hold:
/// * both a first header row and a first body row exist and are non-empty;
/// * the header row's leading cell is blank;
/// * the body row's leading cell is a short label: non-empty, at most three
///   words and at most 24 characters;
/// * every cell after the leading one is filled in both rows.
///
/// The label length is measured in Unicode scalar values, not UTF-8 bytes, so
/// non-ASCII labels are not rejected merely for their encoded size. The body
/// rows themselves are never modified.
fn promote_embedded_stub_header(header_rows: &mut [Vec<String>], table_rows: &[Vec<String>]) {
    let (Some(header_row), Some(first_body_row)) = (header_rows.first_mut(), table_rows.first())
    else {
        return;
    };
    if header_row.is_empty() || first_body_row.is_empty() {
        return;
    }
    if !header_row[0].trim().is_empty() {
        return;
    }

    let promoted = first_body_row[0].trim();
    if promoted.is_empty()
        || promoted.split_whitespace().count() > 3
        || promoted.chars().count() > 24
    {
        return;
    }

    let header_fill = header_row
        .iter()
        .skip(1)
        .filter(|cell| !cell.trim().is_empty())
        .count();
    let body_fill = first_body_row
        .iter()
        .skip(1)
        .filter(|cell| !cell.trim().is_empty())
        .count();
    if header_fill < header_row.len().saturating_sub(1)
        || body_fill < first_body_row.len().saturating_sub(1)
    {
        return;
    }

    header_row[0] = promoted.to_string();
}
10446
/// Returns `true` when there are at least three rows and every row is
/// non-empty with no blank cell.
fn should_merge_panel_body_rows(rows: &[Vec<String>]) -> bool {
    if rows.len() < 3 {
        return false;
    }
    for row in rows {
        let has_blank_cell = row.iter().any(|cell| cell.trim().is_empty());
        if row.is_empty() || has_blank_cell {
            return false;
        }
    }
    true
}
10453
10454fn merge_panel_body_row(
10455    table_rows: &[Vec<String>],
10456    trailing_rows: &[Vec<String>],
10457    slot_count: usize,
10458) -> Vec<String> {
10459    let mut merged = vec![String::new(); slot_count];
10460    for row in table_rows {
10461        for (col_idx, cell) in row.iter().enumerate() {
10462            if col_idx + 1 >= slot_count {
10463                break;
10464            }
10465            append_cell_text(&mut merged[col_idx + 1], cell);
10466        }
10467    }
10468    for row in trailing_rows {
10469        for (col_idx, cell) in row.iter().enumerate() {
10470            if col_idx >= slot_count {
10471                break;
10472            }
10473            append_cell_text(&mut merged[col_idx], cell);
10474        }
10475    }
10476    merged
10477}
10478
/// Renders rows as a Markdown pipe table followed by a blank line.
///
/// The first row is treated as the header and is followed by a `---`
/// separator row. Rows shorter than the widest row are padded with empty
/// cells, and every cell is trimmed. Returns an empty string when there are
/// no rows or no columns.
fn render_pipe_rows(rows: &[Vec<String>]) -> String {
    let column_count = rows.iter().map(Vec::len).max().unwrap_or(0);
    if column_count == 0 {
        return String::new();
    }

    let mut markdown = String::new();
    for (row_idx, row) in rows.iter().enumerate() {
        markdown.push('|');
        for col_idx in 0..column_count {
            markdown.push(' ');
            markdown.push_str(row.get(col_idx).map_or("", |cell| cell.trim()));
            markdown.push_str(" |");
        }
        markdown.push('\n');

        if row_idx == 0 {
            markdown.push('|');
            markdown.push_str(&" --- |".repeat(column_count));
            markdown.push('\n');
        }
    }
    markdown.push('\n');
    markdown
}
10509
10510fn render_html_table(rows: &[Vec<String>]) -> String {
10511    if rows.is_empty() {
10512        return String::new();
10513    }
10514
10515    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
10516    if num_cols == 0 {
10517        return String::new();
10518    }
10519
10520    let mut out = String::from("<table>\n");
10521    for (row_idx, row) in rows.iter().enumerate() {
10522        out.push_str("<tr>");
10523        for col_idx in 0..num_cols {
10524            let cell = escape_html_text(row.get(col_idx).map(String::as_str).unwrap_or("").trim());
10525            if row_idx == 0 {
10526                out.push_str("<th>");
10527                out.push_str(&cell);
10528                out.push_str("</th>");
10529            } else {
10530                out.push_str("<td>");
10531                out.push_str(&cell);
10532                out.push_str("</td>");
10533            }
10534        }
10535        out.push_str("</tr>\n");
10536    }
10537    out.push_str("</table>\n\n");
10538    out
10539}
10540
/// Escapes the five HTML-significant characters `&`, `<`, `>`, `"` and `'`
/// so the text can be embedded in element content or attribute values.
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
10548
/// Extracts the ASCII digits of `text`, in order, and returns them when there
/// are exactly one or two; any other digit count yields `None`.
fn normalized_numeric_marker(text: &str) -> Option<String> {
    let mut digits = String::new();
    for ch in text.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }
    match digits.len() {
        1 | 2 => Some(digits),
        _ => None,
    }
}
10556
10557fn render_infographic_card_rows(rows: &[Vec<String>]) -> Option<String> {
10558    if rows.is_empty() || !rows.iter().all(|row| row.len() == 2) {
10559        return None;
10560    }
10561
10562    let marker = normalized_numeric_marker(rows[0][0].trim())?;
10563    if rows[0][1].split_whitespace().count() < 4 {
10564        return None;
10565    }
10566    if rows
10567        .iter()
10568        .skip(1)
10569        .any(|row| normalized_numeric_marker(row[0].trim()).is_some())
10570    {
10571        return None;
10572    }
10573    if rows
10574        .iter()
10575        .skip(1)
10576        .any(|row| !row[0].trim().is_empty() && row[0].trim().len() > 2)
10577    {
10578        return None;
10579    }
10580
10581    let body = rows
10582        .iter()
10583        .filter_map(|row| row.get(1))
10584        .map(|cell| cell.trim())
10585        .filter(|cell| !cell.is_empty())
10586        .collect::<Vec<_>>()
10587        .join(" ");
10588    if body.split_whitespace().count() < 8 {
10589        return None;
10590    }
10591
10592    Some(format!("{marker}. {body}\n\n"))
10593}
10594
10595fn extract_element_text(element: &ContentElement) -> String {
10596    match element {
10597        ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
10598        ContentElement::Heading(h) => clean_paragraph_text(&h.base.base.value()),
10599        ContentElement::NumberHeading(nh) => clean_paragraph_text(&nh.base.base.base.value()),
10600        ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
10601        ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
10602        _ => String::new(),
10603    }
10604}
10605
10606/// Collect rendered rows from a single TableBorder (no cross-page chaining).
10607fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
10608    let num_cols = table.num_columns.max(1);
10609    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
10610    for row in &table.rows {
10611        let cell_texts: Vec<String> = (0..num_cols)
10612            .map(|col| {
10613                row.cells
10614                    .iter()
10615                    .find(|c| c.col_number == col)
10616                    .map(cell_text_content)
10617                    .unwrap_or_default()
10618            })
10619            .collect();
10620        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
10621            rendered_rows.push(cell_texts);
10622        }
10623    }
10624    rendered_rows
10625}
10626
10627/// Render a TableBorder directly as a markdown table.
10628///
10629/// When the table has a `next_table` link (cross-page continuation), the
10630/// continuation rows are appended so the entire logical table is emitted
10631/// as a single pipe table.
10632fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
10633    if table.rows.is_empty() {
10634        return;
10635    }
10636
10637    // Collect rows from this table.
10638    let mut rendered_rows = collect_table_border_rows(table);
10639
10640    if rendered_rows.is_empty() {
10641        return;
10642    }
10643
10644    if let Some(rendered) = render_infographic_card_rows(&rendered_rows) {
10645        out.push_str(&rendered);
10646        return;
10647    }
10648
10649    // Merge multi-line header rows into a single header row.
10650    merge_continuation_rows(&mut rendered_rows);
10651    trim_leading_table_carryover_rows(&mut rendered_rows);
10652
10653    // ToC detection: render table-of-contents as plain text pairs, not a markdown table.
10654    if is_toc_table(&rendered_rows) {
10655        render_toc_rows(out, &rendered_rows);
10656        return;
10657    }
10658
10659    out.push_str(&render_pipe_rows(&rendered_rows));
10660}
10661
/// Returns true if `text` looks like a page number: Arabic digits or a Roman
/// numeral (either case).
///
/// * Arabic: one to five ASCII digits (pages 1–99999).
/// * Roman: at most ten letters forming a structurally valid numeral —
///   thousands, hundreds, tens and units in that order, with the usual
///   subtractive pairs (`iv`, `ix`, `xl`, `xc`, `cd`, `cm`). Up to four
///   repeats of a symbol are tolerated so additive forms such as `iiii` still
///   match. Ordinary words that merely consist of the letters `ivxlcdm`
///   ("civil", "mild", "dim") are rejected.
fn is_page_number_like(text: &str) -> bool {
    /// Consumes one decimal place (hundreds, tens or units) from the front of
    /// `digits` and returns the remainder.
    fn consume_place(digits: &[u8], one: u8, five: u8, ten: u8) -> &[u8] {
        // Subtractive pair: one-before-five or one-before-ten.
        if let [first, second, rest @ ..] = digits {
            if *first == one && (*second == five || *second == ten) {
                return rest;
            }
        }
        // Additive form: optional "five" followed by up to four "ones".
        let digits = match digits.first() {
            Some(&symbol) if symbol == five => &digits[1..],
            _ => digits,
        };
        let ones = digits
            .iter()
            .take(4)
            .take_while(|&&symbol| symbol == one)
            .count();
        &digits[ones..]
    }

    /// Checks that lowercase `numeral` is a well-formed Roman numeral.
    fn is_roman_numeral(numeral: &[u8]) -> bool {
        if numeral.is_empty() {
            return false;
        }
        let thousands = numeral
            .iter()
            .take(4)
            .take_while(|&&symbol| symbol == b'm')
            .count();
        let mut rest = &numeral[thousands..];
        for (one, five, ten) in [(b'c', b'd', b'm'), (b'x', b'l', b'c'), (b'i', b'v', b'x')] {
            rest = consume_place(rest, one, five, ten);
        }
        // Anything left over means the symbols were out of order.
        rest.is_empty()
    }

    let t = text.trim();
    if t.is_empty() {
        return false;
    }
    // All ASCII digits, length ≤ 5 (handles pages 1–99999)
    if t.len() <= 5 && t.bytes().all(|b| b.is_ascii_digit()) {
        return true;
    }
    // Roman numerals, case-insensitive (i, ii, iii, iv, v, vi, vii, viii, ix, x …)
    t.len() <= 10 && is_roman_numeral(t.to_ascii_lowercase().as_bytes())
}
10679
10680/// Returns true if the rendered rows look like a table-of-contents:
10681/// exactly 2 columns where the majority of right-column cells are page numbers.
10682fn is_toc_table(rows: &[Vec<String>]) -> bool {
10683    if rows.is_empty() {
10684        return false;
10685    }
10686    // Need at least 2 rows to qualify as a ToC
10687    if rows.len() < 2 {
10688        return false;
10689    }
10690    // First, every row must have exactly 2 cells
10691    if !rows.iter().all(|r| r.len() == 2) {
10692        return false;
10693    }
10694
10695    let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
10696    if non_empty_right < 2 {
10697        return false;
10698    }
10699
10700    let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
10701    page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
10702}
10703
/// Renders table-of-contents rows as plain text lines instead of a Markdown
/// table.
///
/// Each row must have at least two cells: a title and a page. Both are
/// trimmed; when both are present they are written as "title page", when only
/// one is present it is written alone, and rows where both are blank are
/// skipped. A blank line terminates the block.
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let (title, page) = (row[0].trim(), row[1].trim());
        match (title.is_empty(), page.is_empty()) {
            (true, true) => continue,
            (false, false) => {
                out.push_str(title);
                out.push(' ');
                out.push_str(page);
            }
            (false, true) => out.push_str(title),
            (true, false) => out.push_str(page),
        }
        out.push('\n');
    }
    out.push('\n');
}
10724
10725/// Extract text content from a table cell.
10726fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
10727    // First try the content tokens — use gap-based concatenation instead of
10728    // naive space-joining so that letter-spaced text ("O w n e r s h i p")
10729    // is collapsed correctly.
10730    if !cell.content.is_empty() {
10731        let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
10732        return normalize_common_ocr_text(&crate::models::text::TextLine::concatenate_chunks(
10733            &chunks,
10734        ));
10735    }
10736    // Fall back to processed contents
10737    let mut text = String::new();
10738    for elem in &cell.contents {
10739        match elem {
10740            ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
10741            ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
10742            ContentElement::TextLine(tl) => text.push_str(&tl.value()),
10743            ContentElement::TextChunk(tc) => text.push_str(&tc.value),
10744            _ => {}
10745        }
10746    }
10747    normalize_common_ocr_text(&repair_fragmented_words(&text))
10748}
10749
/// Merge adjacent pipe tables that are fragments of one visual table.
///
/// PDF table detection sometimes splits one visual table into several
/// fragments that are emitted as successive pipe tables.  Two consecutive
/// tables are merged when the lines between them are
///
/// * blank only, or
/// * one or two heading lines (`# …`, under 100 bytes each), or
/// * a single short plain-text fragment (under 30 bytes, not starting with
///   `#`, `-` or `*`, and containing neither `:` nor `TABLE`),
///
/// where the last two cases additionally require both sides to have at
/// least two columns.  A merge is refused when both tables start with a
/// header-looking row and the two headers clearly differ.
///
/// Column counts do **not** have to match: every row of a merged group is
/// padded with empty cells up to the widest fragment of the group.
///
/// On merge the second table's separator row is dropped and its header row
/// becomes an ordinary body row.  Non-blank gap lines are folded into the
/// table as well:
///
/// * heading / plain text becomes a row whose first cell holds the text,
/// * a line that already is a pipe row is kept as a row (padded),
/// * a stray separator row or a bare heading marker (`##`) is dropped,
///   because emitting either would corrupt the merged table.
///
/// Returns the input unchanged when it has fewer than four lines or fewer
/// than two pipe tables.  Otherwise the text is re-emitted line by line,
/// every line terminated by `\n`, and every table separator is normalised
/// to `| --- | … |`.
fn merge_adjacent_pipe_tables(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    if lines.len() < 4 {
        return markdown.to_string();
    }

    // Cell count of a `| a | b |` row; 0 when the line is not pipe-delimited.
    fn count_pipe_cols(line: &str) -> usize {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return 0;
        }
        t.split('|').count().saturating_sub(2)
    }

    // True for a delimiter row such as `| --- | :-: |`: every cell is a
    // non-empty run of `-` / `:`.
    fn is_separator(line: &str) -> bool {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return false;
        }
        let cells: Vec<&str> = t.split('|').collect();
        if cells.len() < 3 {
            return false;
        }
        cells[1..cells.len() - 1].iter().all(|c| {
            let s = c.trim();
            !s.is_empty() && s.chars().all(|ch| ch == '-' || ch == ':')
        })
    }

    // True for any `|…|` line with something between the outer pipes
    // (separator rows included).
    fn is_pipe_row(line: &str) -> bool {
        let t = line.trim();
        t.starts_with('|') && t.ends_with('|') && t.len() > 2
    }

    // Trimmed cell texts of a pipe row; empty for non-rows.
    fn pipe_cells(line: &str) -> Vec<String> {
        let t = line.trim();
        if !is_pipe_row(t) {
            return Vec::new();
        }
        let parts = t.split('|').collect::<Vec<_>>();
        parts[1..parts.len() - 1]
            .iter()
            .map(|cell| cell.trim().to_string())
            .collect()
    }

    // Lower-cased alphanumeric skeleton of a cell, used to compare headers
    // regardless of punctuation, spacing and case.
    fn normalize_header_cell(cell: &str) -> String {
        cell.chars()
            .filter(|ch| ch.is_alphanumeric())
            .flat_map(|ch| ch.to_lowercase())
            .collect()
    }

    // Normalised skeletons of every cell in a row.
    fn normalized_header_cells(line: &str) -> Vec<String> {
        pipe_cells(line)
            .iter()
            .map(|cell| normalize_header_cell(cell))
            .collect()
    }

    // Heuristic: at least two non-empty cells, each a short label
    // (contains a letter, at most 4 words, at most 28 bytes).
    fn looks_like_header_row(line: &str) -> bool {
        let cells = pipe_cells(line);
        let non_empty: Vec<&str> = cells
            .iter()
            .map(|cell| cell.trim())
            .filter(|cell| !cell.is_empty())
            .collect();
        non_empty.len() >= 2
            && non_empty.iter().all(|cell| {
                let word_count = cell.split_whitespace().count();
                let has_alpha = cell.chars().any(|ch| ch.is_alphabetic());
                has_alpha && word_count <= 4 && cell.len() <= 28
            })
    }

    // Share of positions (over the narrower row) whose normalised cells are
    // non-empty and equal.
    fn header_overlap_ratio(left: &str, right: &str) -> f64 {
        let left_cells = normalized_header_cells(left);
        let right_cells = normalized_header_cells(right);
        let width = left_cells.len().min(right_cells.len());
        if width == 0 {
            return 0.0;
        }
        let matches = left_cells
            .iter()
            .zip(&right_cells)
            .filter(|(l, r)| !l.is_empty() && l == r)
            .count();
        matches as f64 / width as f64
    }

    // Same width, no conflicting cell, and at least two positions where
    // both sides are non-empty (and therefore equal).
    fn header_schema_matches(left: &str, right: &str) -> bool {
        let left_cells = normalized_header_cells(left);
        let right_cells = normalized_header_cells(right);
        if left_cells.len() != right_cells.len() || left_cells.len() < 2 {
            return false;
        }

        let mut aligned_non_empty = 0usize;
        for (l, r) in left_cells.iter().zip(&right_cells) {
            if l.is_empty() || r.is_empty() {
                continue;
            }
            if l != r {
                return false;
            }
            aligned_non_empty += 1;
        }

        aligned_non_empty >= 2
    }

    // Trim the row and append empty cells after the trailing `|` until it
    // has `target_cols` cells.  Rows that are already wide enough are only
    // trimmed.
    fn pad_pipe_row(line: &str, target_cols: usize) -> String {
        let t = line.trim();
        let missing = target_cols.saturating_sub(count_pipe_cols(t));
        let mut result = String::with_capacity(t.len() + 3 * missing);
        result.push_str(t);
        for _ in 0..missing {
            result.push_str("  |");
        }
        result
    }

    // One detected pipe table.
    struct Block {
        start: usize, // header row
        sep: usize,   // separator row (always start + 1)
        end: usize,   // inclusive last line
        cols: usize,  // cell count of the header row
    }

    // A table starts at a pipe row directly followed by a separator and
    // extends over the following non-separator pipe rows.
    let mut blocks: Vec<Block> = Vec::new();
    let mut i = 0;
    while i < lines.len() {
        if i + 1 < lines.len() && is_pipe_row(lines[i]) && is_separator(lines[i + 1]) {
            let cols = count_pipe_cols(lines[i]);
            let sep = i + 1;
            let mut end = sep;
            let mut j = sep + 1;
            while j < lines.len() && is_pipe_row(lines[j]) && !is_separator(lines[j]) {
                end = j;
                j += 1;
            }
            blocks.push(Block {
                start: i,
                sep,
                end,
                cols,
            });
            i = end + 1;
        } else {
            i += 1;
        }
    }

    if blocks.len() < 2 {
        return markdown.to_string();
    }

    // Decide, pair by pair, which block joins the group of its predecessor.
    // `merge_leader[bi]` is the first block of the group `bi` was merged
    // into; `group_cols[leader]` tracks the widest fragment of that group.
    let mut merge_leader: Vec<Option<usize>> = vec![None; blocks.len()];
    let mut group_cols: Vec<usize> = blocks.iter().map(|b| b.cols).collect();
    for bi in 1..blocks.len() {
        let prev = &blocks[bi - 1];
        let curr = &blocks[bi];

        // Trimmed non-blank lines between the two tables.
        let non_blank: Vec<&str> = lines[prev.end + 1..curr.start]
            .iter()
            .map(|l| l.trim())
            .filter(|t| !t.is_empty())
            .collect();
        let gap_all_blank = non_blank.is_empty();

        // Use the group's max cols (not the individual block) so chains
        // like [2-col] → blank → [1-col] → heading → [2-col] still merge
        // once the 1-col intermediary has joined the 2-col leader.
        let leader_idx = merge_leader[bi - 1].unwrap_or(bi - 1);
        let both_multi_col = group_cols[leader_idx] >= 2 && curr.cols >= 2;

        // Headings in the gap are taken to be table cells that the
        // pipeline misclassified; tolerate at most two of them.
        let gap_heading_only = !gap_all_blank
            && both_multi_col
            && non_blank.len() <= 2
            && non_blank
                .iter()
                .all(|t| t.starts_with('#') && t.len() < 100);

        // A single short plain-text line between two multi-column tables is
        // almost certainly a cell value displaced out of the table grid.
        let gap_short_fragment = !gap_all_blank
            && !gap_heading_only
            && both_multi_col
            && non_blank.len() == 1
            && {
                let t = non_blank[0];
                t.len() < 30
                    && !t.starts_with('#')
                    && !t.starts_with('-')
                    && !t.starts_with('*')
                    && !t.contains(':')
                    && !t.contains("TABLE")
            };

        // The second table only counts as having its own header when it has
        // at least two body rows below it.
        let prev_has_header = looks_like_header_row(lines[prev.start]);
        let curr_has_header = curr.end >= curr.sep + 2 && looks_like_header_row(lines[curr.start]);
        let curr_has_distinct_header = prev_has_header
            && curr_has_header
            && !header_schema_matches(lines[prev.start], lines[curr.start])
            && (curr.cols != prev.cols
                || header_overlap_ratio(lines[prev.start], lines[curr.start]) < 1.0);

        if (gap_all_blank || gap_heading_only || gap_short_fragment)
            && prev.cols > 0
            && curr.cols > 0
            && !curr_has_distinct_header
        {
            merge_leader[bi] = Some(leader_idx);
            if curr.cols > group_cols[leader_idx] {
                group_cols[leader_idx] = curr.cols;
            }
        }
    }

    // Per-line plan:
    //   skip[li]         – drop the line (blank gap line / merged separator)
    //   gap_target[li]   – non-blank gap line to fold into the table,
    //                      with the column count of the merged group
    //   block_target[li] – line inside a table, with its target width
    let mut skip = vec![false; lines.len()];
    let mut gap_target: Vec<Option<usize>> = vec![None; lines.len()];
    let mut block_target: Vec<Option<usize>> = vec![None; lines.len()];
    for (bi, block) in blocks.iter().enumerate() {
        let target = group_cols[merge_leader[bi].unwrap_or(bi)];
        block_target[block.start..=block.end].fill(Some(target));

        if merge_leader[bi].is_none() {
            continue;
        }
        // `bi >= 1` here: the first block never has a leader.
        let prev_end = blocks[bi - 1].end;
        for li in (prev_end + 1)..block.start {
            if lines[li].trim().is_empty() {
                skip[li] = true;
            } else {
                gap_target[li] = Some(target);
            }
        }
        // Only the separator goes away; the header row becomes a data row.
        skip[block.sep] = true;
    }

    let mut result = String::with_capacity(markdown.len() + lines.len());
    for (li, line) in lines.iter().enumerate() {
        if skip[li] {
            continue;
        }

        if let Some(target) = gap_target[li] {
            if is_separator(line) {
                // A stray delimiter row has no content; inside the merged
                // body it would only show up as a row of dashes.
                continue;
            }
            if is_pipe_row(line) {
                // Already a table row: keep its cells instead of wrapping
                // the whole `| a | b |` text into a single cell.
                result.push_str(&pad_pipe_row(line, target));
                result.push('\n');
                continue;
            }
            let text = line.trim().trim_start_matches('#').trim();
            if text.is_empty() {
                // Bare heading marker (`##`): nothing to put in a cell, and
                // emitting it raw would split the merged table.
                continue;
            }
            // Text goes into the first cell; remaining cells stay empty.
            result.push_str("| ");
            result.push_str(text);
            result.push(' ');
            for _ in 1..target {
                result.push_str("|  ");
            }
            result.push_str("|\n");
            continue;
        }

        match block_target[li] {
            Some(target) if target > 0 && is_separator(line) => {
                // Regenerate the separator at the group's width.
                result.push('|');
                for _ in 0..target {
                    result.push_str(" --- |");
                }
            }
            Some(target) if target > 0 && is_pipe_row(line) => {
                result.push_str(&pad_pipe_row(line, target));
            }
            _ => result.push_str(line),
        }
        result.push('\n');
    }

    result
}
11103
11104#[cfg(test)]
11105mod tests {
11106    use super::*;
11107    use crate::models::bbox::BoundingBox;
11108    use crate::models::chunks::TextChunk;
11109    use crate::models::content::ContentElement;
11110    use crate::models::enums::{PdfLayer, TextFormat, TextType};
11111    use crate::models::list::{ListBody, ListItem, ListLabel, PDFList};
11112    use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
11113    use crate::models::table::{
11114        TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
11115    };
11116    use crate::models::text::{TextBlock, TextColumn, TextLine};
11117
11118    #[test]
11119    fn test_empty_doc() {
11120        let doc = PdfDocument::new("test.pdf".to_string());
11121        let md = to_markdown(&doc).unwrap();
11122        assert!(md.contains("No content extracted"));
11123    }
11124
11125    #[test]
11126    fn test_with_title() {
11127        let mut doc = PdfDocument::new("test.pdf".to_string());
11128        doc.title = Some("My Title".to_string());
11129        let md = to_markdown(&doc).unwrap();
11130        assert!(md.starts_with("# My Title\n"));
11131    }
11132
11133    #[test]
11134    fn test_empty_title_not_rendered() {
11135        let mut doc = PdfDocument::new("test.pdf".to_string());
11136        doc.title = Some("  ".to_string());
11137        let md = to_markdown(&doc).unwrap();
11138        assert!(
11139            !md.contains("# "),
11140            "Empty/whitespace title should not produce a heading"
11141        );
11142    }
11143
11144    #[test]
11145    fn test_repair_fragmented_words() {
11146        assert_eq!(
11147            repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts"),
11148            "Jurisdiction Foreign Requirements"
11149        );
11150    }
11151
11152    #[test]
11153    fn test_normalize_common_ocr_text_repairs_units() {
11154        assert_eq!(
11155            normalize_common_ocr_text("10 ߤL at 37 C and -20 oC"),
11156            "10 μL at 37°C and -20°C"
11157        );
11158    }
11159
    // Checks that a four-column table can be rebuilt from column-aligned
    // plain-text lines: `find_layout_header_candidate` must yield the four
    // header labels, and `build_layout_anchor_rows` must yield four rows whose
    // wrapped cells are joined back together.
    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_build_layout_anchor_rows_reconstructs_four_column_matrix() {
        // Fixture: the horizontal position of each text run is significant,
        // so the spacing inside these literals must not be altered.
        let lines = vec![
            "Key Functions by Main Service Flow".to_string(),
            "".to_string(),
            " Service Stage                   Function Name                Explanation                                                                                Expected Benefit".to_string(),
            "".to_string(),
            " 1. Project creation             Project creation and         Select document type to automatically run project creation, Pipeline configuration with    The intuitive UI environment allows the the person in charge to quickly proceed with".to_string(),
            "".to_string(),
            "                                 management                   recommended Modelset and Endpoint deployment                                               the entire process from project creation to deployment, improving work efficiency".to_string(),
            "".to_string(),
            "                                                                                                                                                         Conveniently manage raw data to be used for OCR Pack and actual date from live".to_string(),
            " 2. Data labeling and            Data storage management      Provides convenient functions for uploading raw data, viewer, and data management".to_string(),
            "                                                              (search using image metadata, sorting, filtering, hashtags settings on image data)         service".to_string(),
            " fine-tuning".to_string(),
            "                                                              Image data bookmark for Qualitative Evaluation".to_string(),
            "".to_string(),
            "                                 Create and manage Labeling   Creating a Labeling Space to manage raw data annotation, managing labeling resources       Labeling work can be outsourced within the pack. Labeled data is continuously".to_string(),
            "                                                              (Ontology, Characters to be Recognized), data set dump, data set version management        supplied from which data sets can be created with ease. The Auto Labeling function".to_string(),
            "                                 Space".to_string(),
            "                                                                                                     3                                                   increases both efficiency and convenience.".to_string(),
            "                                                              Various basic models for each selected 5".to_string(),
            "                                                                                                    document, information comparison between".to_string(),
            "                                 Model training                                                                                                          Providing a foundation for customers to implement, manage, and upgrade their own".to_string(),
            "                                                              models, basic model training, training pause function, re-training, cancel function, and   OCR model specialized to the customers’ needs".to_string(),
            "                                                              configuration support for Characters to be Recognized and Ontology that is frequently".to_string(),
            "                                                              modified while developing specialized models".to_string(),
        ];

        let header = find_layout_header_candidate(&lines).unwrap();
        let rows =
            build_layout_anchor_rows(&lines, &extract_layout_entries(&lines, &header)).unwrap();

        // The header line is split into its four column labels.
        assert_eq!(
            header.headers,
            vec![
                "Service Stage".to_string(),
                "Function Name".to_string(),
                "Explanation".to_string(),
                "Expected Benefit".to_string()
            ]
        );
        // Four rows, with cell text that wrapped over several fixture lines
        // merged into one value ("Project creation and" + "management", ...).
        assert_eq!(rows.len(), 4);
        assert_eq!(rows[0][0], "1. Project creation");
        assert_eq!(rows[0][1], "Project creation and management");
        assert!(rows[1][0].contains("fine-tuning"));
        assert_eq!(rows[2][1], "Create and manage Labeling Space");
        assert_eq!(rows[3][1], "Model training");
        // The stray "3" / "5" tokens in the fixture must not end up inside
        // this phrase.
        assert!(rows[3][2].contains("Various basic models for each selected document"));
    }
11211
    // Checks that a table with a left-hand stub column (row labels "Pack",
    // "Application", "Highlight") and three headed panels can be rebuilt from
    // column-aligned plain-text lines.
    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_build_layout_panel_stub_rows_reconstructs_left_stub_table() {
        // Fixture: the horizontal position of each text run is significant,
        // so the spacing inside these literals must not be altered.
        let lines = vec![
            "AI Pack".to_string(),
            "Upstage offers 3 AI packs that process unstructured information and data".to_string(),
            "".to_string(),
            "                                     OCR                                                Recommendation                                    Product semantic search".to_string(),
            "".to_string(),
            "              A solution that recognizes characters in an                A solution that recommends the best products and   A solution that enables semantic search, analyzes and".to_string(),
            "              image and extracts necessary information                   contents                                           organizes key information in unstructured text data".to_string(),
            "   Pack".to_string(),
            "                                                                                                                            into a standardized form (DB)".to_string(),
            "".to_string(),
            "              Applicable to all fields that require text extraction      Applicable to all fields that use any form of      Applicable to all fields that deal with various types of".to_string(),
            "              from standardized documents, such as receipts,             recommendation including alternative products,     unstructured data containing text information that".to_string(),
            "Application   bills, credit cards, ID cards, certificates, and medical   products and contents that are likely to be        require semantic search and conversion into a DB".to_string(),
            "              receipts                                                   purchased next".to_string(),
            "".to_string(),
            "              Achieved 1st place in the OCR World Competition            Team with specialists and technologies that        Creation of the first natural language evaluation".to_string(),
            "              The team includes specialists who have                     received Kaggle’s Gold Medal recommendation        system in Korean (KLUE)".to_string(),
            "              presented 14 papers in the world’s most                    (Education platform)                               World’s No.1 in Kaggle text embedding competition in".to_string(),
            " Highlight".to_string(),
            "              renowned AI conferences                                    Proven superior performance of more than 170%      E-commerce subject (Shopee)".to_string(),
            "                                                                         compared to other global top-tier recommendation".to_string(),
            "                                                                         models".to_string(),
        ];

        let header = find_layout_panel_header_candidate(&lines).unwrap();
        let rows = build_layout_panel_stub_rows(&lines, &header).unwrap();

        // Only the three panel titles are headers; the stub column has none.
        assert_eq!(
            header.headers,
            vec![
                "OCR".to_string(),
                "Recommendation".to_string(),
                "Product semantic search".to_string()
            ]
        );
        // Each row is indexed as [stub label, panel 1, panel 2, panel 3].
        assert_eq!(rows.len(), 3);
        assert_eq!(rows[0][0], "Pack");
        assert!(rows[0][1].contains("image and extracts necessary information"));
        assert_eq!(rows[1][0], "Application");
        assert!(rows[1][3].contains("require semantic search and conversion into a DB"));
        assert_eq!(rows[2][0], "Highlight");
        // Text wrapped across fixture lines is joined with a single space.
        assert!(rows[2][2].contains("top-tier recommendation models"));
    }
11259
    // Checks that `extract_layout_toc_entries` returns the TOC title plus one
    // entry per title/page pair, and that an entry whose title wraps onto the
    // following line ("... and Election" / "Campaign") is merged into one.
    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_extract_layout_toc_entries_merges_wrapped_entry() {
        // Fixture: ten entry lines, one of which ("Campaign") is the wrapped
        // tail of the preceding title and carries no page number.
        let lines = vec![
            "Table of Contents".to_string(),
            "".to_string(),
            "Executive Summary                                          4".to_string(),
            "Legal Framework                                            6".to_string(),
            "Election Administration                                   11".to_string(),
            "Civil Society Engagement                                  15".to_string(),
            "Political Parties, Candidates Registration and Election   18".to_string(),
            "Campaign".to_string(),
            "Media Freedom and Access to Information                   25".to_string(),
            "Voter Education and Awareness                             29".to_string(),
            "Participation of Marginalized Sectors                     31".to_string(),
            "Recommendations                                           39".to_string(),
        ];

        let (title, entries) = extract_layout_toc_entries(&lines).unwrap();
        assert_eq!(title, "Table of Contents");
        // Ten fixture lines collapse to nine entries after the merge.
        assert_eq!(entries.len(), 9);
        assert_eq!(entries[0].title, "Executive Summary");
        assert_eq!(entries[0].page, "4");
        // The wrapped entry keeps the page number from its first line.
        assert_eq!(
            entries[4].title,
            "Political Parties, Candidates Registration and Election Campaign"
        );
        assert_eq!(entries[4].page, "18");
    }
11289
11290    #[cfg(not(target_arch = "wasm32"))]
11291    fn make_bbox_layout_line(words: &[(&str, f64, f64)], bottom: f64, top: f64) -> BBoxLayoutLine {
11292        make_bbox_layout_line_in_block(0, words, bottom, top)
11293    }
11294
11295    #[cfg(not(target_arch = "wasm32"))]
11296    fn make_bbox_layout_line_in_block(
11297        block_id: usize,
11298        words: &[(&str, f64, f64)],
11299        bottom: f64,
11300        top: f64,
11301    ) -> BBoxLayoutLine {
11302        BBoxLayoutLine {
11303            block_id,
11304            bbox: BoundingBox::new(
11305                Some(1),
11306                words.first().map(|(_, left, _)| *left).unwrap_or(72.0),
11307                bottom,
11308                words.last().map(|(_, _, right)| *right).unwrap_or(320.0),
11309                top,
11310            ),
11311            words: words
11312                .iter()
11313                .map(|(text, left, right)| BBoxLayoutWord {
11314                    bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
11315                    text: (*text).to_string(),
11316                })
11317                .collect(),
11318        }
11319    }
11320
    // Checks that `detect_layout_open_plate` recovers a borderless two-column
    // table (common name / scientific name) from positioned layout lines,
    // together with its heading and caption.
    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_detect_layout_open_plate_recovers_two_column_species_rows() {
        // Fixture: each tuple is (text, left, right); the two trailing numbers
        // of every line are its bottom and top.  Lines that share the same
        // bottom/top pair are the left and right halves of one table row.
        let lines = vec![
            // Heading line above the rows.
            make_bbox_layout_line(
                &[
                    ("Fish", 60.0, 76.0),
                    ("species", 78.0, 107.0),
                    ("on", 109.0, 119.0),
                    ("IUCN", 121.0, 142.0),
                    ("Red", 144.0, 159.0),
                    ("List", 161.0, 176.0),
                ],
                649.0,
                660.0,
            ),
            make_bbox_layout_line(
                &[("Potosi", 60.0, 84.0), ("Pupfish", 86.0, 114.0)],
                632.0,
                643.0,
            ),
            make_bbox_layout_line(
                &[("Cyprinodon", 132.0, 176.0), ("alvarezi", 178.0, 207.0)],
                632.0,
                643.0,
            ),
            // This row arrives as a single line containing both columns.
            make_bbox_layout_line(
                &[
                    ("La", 60.0, 69.0),
                    ("Palma", 71.0, 94.0),
                    ("Pupfish", 96.0, 124.0),
                    ("Cyprinodon", 132.0, 176.0),
                    ("longidorsalis", 178.0, 224.0),
                ],
                616.0,
                627.0,
            ),
            make_bbox_layout_line(
                &[("Butterfly", 60.0, 94.0), ("Splitfin", 96.0, 123.0)],
                600.0,
                611.0,
            ),
            make_bbox_layout_line(
                &[("Ameca", 132.0, 156.0), ("splendens", 158.0, 194.0)],
                600.0,
                611.0,
            ),
            make_bbox_layout_line(
                &[("Golden", 60.0, 88.0), ("Skiffia", 90.0, 113.0)],
                584.0,
                595.0,
            ),
            make_bbox_layout_line(
                &[("Skiffia", 132.0, 155.0), ("francesae", 158.0, 193.0)],
                584.0,
                595.0,
            ),
            // Caption line below the rows.
            make_bbox_layout_line(
                &[
                    ("Table", 56.0, 74.0),
                    ("6.1:", 76.0, 87.0),
                    ("Four", 89.0, 105.0),
                    ("fish", 107.0, 119.0),
                    ("species", 121.0, 145.0),
                    ("on", 147.0, 155.0),
                    ("IUCN", 157.0, 176.0),
                    ("Red", 178.0, 190.0),
                    ("List", 192.0, 205.0),
                    ("held", 279.0, 293.0),
                    ("in", 295.0, 302.0),
                    ("public", 304.0, 325.0),
                    ("aquariums.", 327.0, 365.0),
                ],
                556.0,
                566.0,
            ),
        ];

        // NOTE(review): 576.0 is presumably the page width — confirm against
        // the signature of `detect_layout_open_plate`.
        let plate = detect_layout_open_plate(576.0, &lines).unwrap();
        assert_eq!(plate.heading, "Fish species on IUCN Red List");
        // The second header label does not occur anywhere in the fixture, so
        // it has to be supplied by the detector itself.
        assert_eq!(
            plate.header_row,
            vec![
                "Fish species on IUCN Red List".to_string(),
                "Scientific name".to_string()
            ]
        );
        assert_eq!(plate.rows.len(), 4);
        assert_eq!(
            plate.rows[1],
            vec![
                "La Palma Pupfish".to_string(),
                "Cyprinodon longidorsalis".to_string()
            ]
        );
        assert!(plate
            .caption
            .starts_with("Table 6.1: Four fish species on IUCN Red List"));
    }
11420
11421    #[cfg(not(target_arch = "wasm32"))]
11422    #[test]
11423    fn test_extract_layout_narrative_bridge_recovers_left_prose_and_defers_captions() {
11424        let plate = OpenPlateCandidate {
11425            heading: "Fish species on IUCN Red List".to_string(),
11426            header_row: vec![
11427                "Fish species on IUCN Red List".to_string(),
11428                "Scientific name".to_string(),
11429            ],
11430            rows: vec![],
11431            caption: "Table 6.1".to_string(),
11432            cutoff_top_y: 560.0,
11433        };
11434        let lines = vec![
11435            make_bbox_layout_line(
11436                &[
11437                    ("Public", 56.0, 83.0),
11438                    ("aquariums,", 88.0, 135.0),
11439                    ("because", 140.0, 174.0),
11440                ],
11441                509.0,
11442                521.0,
11443            ),
11444            make_bbox_layout_line(
11445                &[
11446                    ("of", 180.0, 188.0),
11447                    ("their", 194.0, 214.0),
11448                    ("in-", 220.0, 233.0),
11449                ],
11450                509.0,
11451                521.0,
11452            ),
11453            make_bbox_layout_line(
11454                &[
11455                    ("house", 56.0, 82.0),
11456                    ("expertise,", 84.0, 125.0),
11457                    ("can", 128.0, 143.0),
11458                ],
11459                495.0,
11460                507.0,
11461            ),
11462            make_bbox_layout_line(
11463                &[("act", 146.0, 159.0), ("quickly", 161.0, 191.0)],
11464                495.0,
11465                507.0,
11466            ),
11467            make_bbox_layout_line_in_block(
11468                1,
11469                &[
11470                    ("Figure", 242.0, 265.0),
11471                    ("6.3:", 267.0, 280.0),
11472                    ("Photo", 282.0, 303.0),
11473                ],
11474                355.0,
11475                366.0,
11476            ),
11477            make_bbox_layout_line_in_block(
11478                1,
11479                &[
11480                    ("of", 305.0, 312.0),
11481                    ("the", 314.0, 325.0),
11482                    ("species.", 327.0, 360.0),
11483                ],
11484                355.0,
11485                366.0,
11486            ),
11487            make_bbox_layout_line(
11488                &[
11489                    ("The", 56.0, 73.0),
11490                    ("breeding", 77.0, 114.0),
11491                    ("colonies", 118.0, 153.0),
11492                ],
11493                330.0,
11494                342.0,
11495            ),
11496            make_bbox_layout_line(
11497                &[
11498                    ("of", 157.0, 165.0),
11499                    ("the", 169.0, 183.0),
11500                    ("Butterfly", 187.0, 224.0),
11501                    ("Splitfin", 228.0, 258.0),
11502                    ("at", 314.0, 323.0),
11503                    ("the", 327.0, 341.0),
11504                    ("London", 345.0, 377.0),
11505                    ("Zoo", 381.0, 397.0),
11506                    ("and", 401.0, 416.0),
11507                    ("elsewhere", 420.0, 463.0),
11508                    ("serve", 467.0, 489.0),
11509                    ("as", 493.0, 502.0),
11510                    ("ark", 506.0, 519.0),
11511                ],
11512                330.0,
11513                342.0,
11514            ),
11515            make_bbox_layout_line(
11516                &[
11517                    ("Figure", 56.0, 79.0),
11518                    ("6.4:", 81.0, 94.0),
11519                    ("Lake", 96.0, 116.0),
11520                    ("Sturgeon", 118.0, 158.0),
11521                ],
11522                104.0,
11523                116.0,
11524            ),
11525        ];
11526
11527        let bridge = extract_layout_narrative_bridge(576.0, &lines, &plate).unwrap();
11528        assert!(bridge
11529            .bridge_paragraph
11530            .as_deref()
11531            .is_some_and(|text| text.contains("Public aquariums") && text.contains("expertise")));
11532        assert_eq!(bridge.deferred_captions.len(), 2);
11533        assert!(bridge.deferred_captions[0].contains("Figure 6.3:"));
11534        assert!(bridge.deferred_captions[0].contains("species."));
11535    }
11536
11537    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11538    #[test]
11539    fn test_detect_layout_ocr_benchmark_dashboard_on_real_pdf() {
11540        let path =
11541            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000199.pdf");
11542        let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
11543        let dashboard = detect_layout_ocr_benchmark_dashboard(page_width, &lines).unwrap();
11544
11545        assert_eq!(
11546            dashboard.title,
11547            "Base Model Performance Evaluation of Upstage OCR Pack"
11548        );
11549        assert_eq!(dashboard.left_columns.len(), 2);
11550        assert_eq!(
11551            dashboard.left_columns[0],
11552            "Scene (Photographed document image)"
11553        );
11554        assert_eq!(
11555            dashboard.left_rows[0],
11556            vec![
11557                "Company A²".to_string(),
11558                "70.23".to_string(),
11559                "80.41".to_string()
11560            ]
11561        );
11562        assert_eq!(
11563            dashboard.right_rows[0],
11564            vec![
11565                "OCR-Recall³".to_string(),
11566                "73.2".to_string(),
11567                "94.2".to_string(),
11568                "94.1".to_string()
11569            ]
11570        );
11571        assert_eq!(dashboard.right_rows[3][0], "Parsing-F¹");
11572        assert_eq!(dashboard.right_rows[3][1], "68.0");
11573        assert_eq!(dashboard.right_rows[3][2], "82.65");
11574        assert_eq!(dashboard.right_rows[3][3], "82.65");
11575        assert!(!dashboard.definition_notes.is_empty());
11576        assert!(!dashboard.source_notes.is_empty());
11577    }
11578
11579    #[cfg(not(target_arch = "wasm32"))]
11580    #[test]
11581    fn test_split_layout_line_spans_handles_unicode_boundaries() {
11582        let line = "Title  “Podcast #EP32: SDGs dan Anak Muda”  2024";
11583        let spans = split_layout_line_spans(line);
11584        assert_eq!(spans.len(), 3);
11585        assert_eq!(spans[0].1, "Title");
11586        assert!(spans[1].1.contains("Podcast #EP32: SDGs dan Anak Muda"));
11587        assert!(spans[1].1.ends_with('”'));
11588        assert!(spans[2].1.ends_with("24"));
11589    }
11590
11591    #[cfg(not(target_arch = "wasm32"))]
11592    #[test]
11593    fn test_render_layout_single_caption_chart_document_on_real_pdf() {
11594        let path =
11595            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000037.pdf");
11596        let doc = PdfDocument {
11597            title: None,
11598            source_path: Some(path.to_string_lossy().to_string()),
11599            number_of_pages: 1,
11600            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11601                .unwrap()
11602                .kids,
11603            ..PdfDocument::new("01030000000037.pdf".to_string())
11604        };
11605        let rendered = render_layout_single_caption_chart_document(&doc).unwrap();
11606        assert!(rendered.contains("# 3. Impact on Business Operations"));
11607        assert!(rendered.contains("## 3.1. Status of Business Operations"));
11608        assert!(rendered.contains("As shown in Figure 3.1.1, the number of MSMEs"));
11609        assert!(
11610            rendered.contains("Figure 3.1.1: Status of operations during each survey phase (%)")
11611        );
11612        assert!(
11613            rendered.contains("lockdown period. In the handicraft/textile sector, 30% of MSMEs")
11614        );
11615        assert!(!rendered.contains("| Lockdown Period |"));
11616    }
11617
11618    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11619    #[test]
11620    fn test_to_markdown_captioned_media_document_on_real_pdf_72() {
11621        let path =
11622            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000072.pdf");
11623        let doc = PdfDocument {
11624            title: None,
11625            source_path: Some(path.to_string_lossy().to_string()),
11626            number_of_pages: 1,
11627            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11628                .unwrap()
11629                .kids,
11630            ..PdfDocument::new("01030000000072.pdf".to_string())
11631        };
11632        let md = to_markdown(&doc).unwrap();
11633        assert!(md.contains("## Diagram 5"), "{md}");
11634        assert!(
11635            md.contains("**Distribution of Komnas HAM’s YouTube Content (2019-2020)**"),
11636            "{md}"
11637        );
11638        assert!(
11639            md.contains(
11640                "As of 1 December 2021, the Komnas HAM’s YouTube channel has 2,290 subscribers"
11641            ),
11642            "{md}"
11643        );
11644        assert!(md.contains("**Figure 4**"), "{md}");
11645        assert!(
11646            md.contains("*Komnas HAM’s YouTube channel as of 1 December 2021*"),
11647            "{md}"
11648        );
11649    }
11650
11651    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11652    #[test]
11653    fn test_to_markdown_captioned_media_document_on_real_pdf_73() {
11654        let path =
11655            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000073.pdf");
11656        let doc = PdfDocument {
11657            title: None,
11658            source_path: Some(path.to_string_lossy().to_string()),
11659            number_of_pages: 1,
11660            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11661                .unwrap()
11662                .kids,
11663            ..PdfDocument::new("01030000000073.pdf".to_string())
11664        };
11665        let md = to_markdown(&doc).unwrap();
11666        assert!(
11667            md.starts_with("# In this content, DPN Argentina provides a brief explanation"),
11668            "{md}"
11669        );
11670        assert!(
11671            md.contains("Examples of such greetings are as follows:"),
11672            "{md}"
11673        );
11674        assert!(md.contains("*Image*"), "{md}");
11675        assert!(md.contains("**Figure 6**"), "{md}");
11676        assert!(md.contains("**DPN Argentina**"), "{md}");
11677        assert!(
11678            md.contains("**Content: World Health Day Celebration (7 April 2021).**^98"),
11679            "{md}"
11680        );
11681        assert!(md.contains("**Footnote:**"), "{md}");
11682        assert!(
11683            md.contains("https://twitter.com/DPNArgentina/status/1379765916259483648."),
11684            "{md}"
11685        );
11686    }
11687
11688    #[cfg(not(target_arch = "wasm32"))]
11689    #[test]
11690    fn test_render_layout_captioned_media_document_does_not_fire_on_real_pdf_14() {
11691        let path =
11692            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
11693        let doc = PdfDocument {
11694            title: None,
11695            source_path: Some(path.to_string_lossy().to_string()),
11696            number_of_pages: 1,
11697            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11698                .unwrap()
11699                .kids,
11700            ..PdfDocument::new("01030000000014.pdf".to_string())
11701        };
11702        assert!(render_layout_captioned_media_document(&doc).is_none());
11703    }
11704
11705    #[cfg(not(target_arch = "wasm32"))]
11706    #[test]
11707    fn test_to_markdown_real_pdf_14_preserves_body_paragraphs() {
11708        let path =
11709            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
11710        let doc = PdfDocument {
11711            title: None,
11712            source_path: Some(path.to_string_lossy().to_string()),
11713            number_of_pages: 1,
11714            kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11715                .unwrap()
11716                .kids,
11717            ..PdfDocument::new("01030000000014.pdf".to_string())
11718        };
11719        let md = to_markdown(&doc).unwrap();
11720        assert!(
11721            md.contains("These images also show that different areas are used by men and by women"),
11722            "{md}"
11723        );
11724    }
11725
11726    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11727    #[test]
11728    fn test_render_layout_recommendation_infographic_on_real_pdf() {
11729        let path =
11730            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000183.pdf");
11731        let doc = PdfDocument {
11732            title: None,
11733            source_path: Some(path.to_string_lossy().to_string()),
11734            number_of_pages: 1,
11735            kids: Vec::new(),
11736            ..PdfDocument::new("01030000000183.pdf".to_string())
11737        };
11738        let rendered = render_layout_recommendation_infographic_document(&doc).unwrap();
11739        assert!(rendered.contains("# Recommendation Pack: Track Record"));
11740        assert!(rendered.contains("## Comparison with Beauty Commerce Recommendation Models"));
11741        assert!(rendered.contains("| Graph-RecSys | 0.4048 |"));
11742        assert!(rendered.contains("| Current Service Recommendation Algorithm | 0.159 |"));
11743        assert!(rendered.contains("## Education Content Platform PoC Case"));
11744        assert!(rendered.contains("| DKT Model | 0.882 |"));
11745        assert!(rendered.contains("Compared to regular model"));
11746    }
11747
11748    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11749    #[test]
11750    fn test_render_layout_stacked_bar_report_on_real_pdf() {
11751        let path =
11752            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000038.pdf");
11753        let doc = PdfDocument {
11754            title: None,
11755            source_path: Some(path.to_string_lossy().to_string()),
11756            number_of_pages: 1,
11757            kids: Vec::new(),
11758            ..PdfDocument::new("01030000000038.pdf".to_string())
11759        };
11760        let rendered = render_layout_stacked_bar_report_document(&doc);
11761        if rendered.is_none() {
11762            let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
11763            let blocks = collect_bbox_layout_blocks(&lines);
11764            let figures = collect_layout_figure_captions(&blocks);
11765            let narrative = detect_layout_stacked_bar_narrative(&blocks);
11766            eprintln!("page_width={page_width} figures={}", figures.len());
11767            if let Some(first) = figures.first() {
11768                eprintln!("figure1={}", bbox_layout_block_text(first));
11769            }
11770            if let Some(second) = figures.get(1) {
11771                eprintln!("figure2={}", bbox_layout_block_text(second));
11772            }
11773            eprintln!("narrative={}", narrative.is_some());
11774            if let Some(narrative) = &narrative {
11775                eprintln!("heading={}", narrative.heading);
11776                eprintln!("paragraphs={}", narrative.paragraphs.len());
11777                eprintln!("footnote={:?}", narrative.footnote);
11778            }
11779            for block in &blocks {
11780                let text = bbox_layout_block_text(block);
11781                if text.contains("July")
11782                    || text.contains("October")
11783                    || text.contains("January")
11784                    || text.contains("Will ")
11785                    || text.contains("Don’t")
11786                    || text.starts_with("6.2.")
11787                    || text.starts_with("5.")
11788                {
11789                    eprintln!(
11790                        "block top={:.1} bottom={:.1} left={:.1} right={:.1} text={}",
11791                        block.bbox.top_y,
11792                        block.bbox.bottom_y,
11793                        block.bbox.left_x,
11794                        block.bbox.right_x,
11795                        text
11796                    );
11797                }
11798            }
11799            if figures.len() >= 2 {
11800                let first = detect_layout_three_month_stacked_figure(
11801                    &blocks,
11802                    &lines,
11803                    page_width,
11804                    figures[0].clone(),
11805                    figures[1].bbox.top_y,
11806                );
11807                eprintln!("figure_one_ok={}", first.is_some());
11808                if let Some(narrative) = &narrative {
11809                    let second = detect_layout_sector_bar_figure(
11810                        &blocks,
11811                        &lines,
11812                        page_width,
11813                        figures[1].clone(),
11814                        narrative.top_y,
11815                    );
11816                    eprintln!("figure_two_ok={}", second.is_some());
11817                }
11818            }
11819        }
11820        let rendered = rendered.unwrap();
11821        assert!(rendered.contains("# Figure 6.1.1:"));
11822        assert!(rendered.contains("| Will not terminate employment | 51 | 81 | 73 |"));
11823        assert!(rendered.contains("# 6.2. Expectations for Re-Hiring Employees"));
11824    }
11825
11826    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11827    #[test]
11828    fn test_render_layout_multi_figure_chart_document_on_real_pdf() {
11829        let path =
11830            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000076.pdf");
11831        let doc = PdfDocument {
11832            title: None,
11833            source_path: Some(path.to_string_lossy().to_string()),
11834            number_of_pages: 1,
11835            kids: Vec::new(),
11836            ..PdfDocument::new("01030000000076.pdf".to_string())
11837        };
11838        let rendered = render_layout_multi_figure_chart_document(&doc).unwrap();
11839        assert!(rendered.contains("# Figures from the Document"));
11840        assert!(
11841            rendered.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
11842        );
11843        assert!(rendered.contains("| 2016 | 3,230 |"));
11844        assert!(rendered.contains("| 2021 | 2,693 |"));
11845        assert!(
11846            rendered.contains("## Figure 1.8. Singapore foreign workforce stock (in thousands)")
11847        );
11848        assert!(rendered.contains("| 2016 (Dec) | 1,393 |"));
11849        assert!(rendered.contains("| 2021 (Dec) | 1,200 |"));
11850        assert!(rendered.contains(
11851            "Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate."
11852        ));
11853    }
11854
11855    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11856    #[test]
11857    fn test_render_layout_open_plate_document_on_real_pdf() {
11858        let path =
11859            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
11860        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11861        let rendered = render_layout_open_plate_document(&doc).unwrap();
11862        assert!(rendered.contains("# Fish species on IUCN Red List"));
11863        assert!(rendered.contains("| Potosi Pupfish | Cyprinodon alvarezi |"));
11864        assert!(rendered.contains("| Golden Skiffia | Skiffia francesae |"));
11865        assert!(rendered.contains("*Table 6.1: Four fish species on IUCN Red List"));
11866        assert!(rendered.contains("---"));
11867        assert!(rendered.contains("Public aquariums, because of their inhouse expertise"));
11868    }
11869
11870    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11871    #[test]
11872    fn test_to_markdown_open_plate_document_on_real_pdf() {
11873        let path =
11874            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
11875        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11876        let md = to_markdown(&doc).unwrap();
11877
11878        assert!(md.contains("# Fish species on IUCN Red List"), "{md}");
11879        assert!(
11880            md.contains("| Potosi Pupfish | Cyprinodon alvarezi |"),
11881            "{md}"
11882        );
11883        assert!(
11884            md.contains("| Golden Skiffia | Skiffia francesae |"),
11885            "{md}"
11886        );
11887        assert!(
11888            md.contains("*Table 6.1: Four fish species on IUCN Red List"),
11889            "{md}"
11890        );
11891        assert!(
11892            md.contains("The breeding colonies of the Butterfly Splitfin"),
11893            "{md}"
11894        );
11895    }
11896
11897    #[cfg(not(target_arch = "wasm32"))]
11898    #[test]
11899    fn test_to_markdown_does_not_misclassify_open_plate_pdf_36() {
11900        let path =
11901            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000036.pdf");
11902        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11903        let md = to_markdown(&doc).unwrap();
11904
11905        assert!(md.contains("# 2. General Profile of MSMEs"), "{md}");
11906        assert!(
11907            md.contains("In July 2020, the survey established a general profile"),
11908            "{md}"
11909        );
11910        assert!(
11911            md.contains(
11912                "The tourism sub-sectors interviewed included lodging, restaurants and bars"
11913            ),
11914            "{md}"
11915        );
11916        assert!(
11917            !md.starts_with("# Business characteristics. Business size was"),
11918            "{md}"
11919        );
11920    }
11921
11922    #[cfg(not(target_arch = "wasm32"))]
11923    #[test]
11924    fn test_to_markdown_does_not_misclassify_open_plate_pdf_40() {
11925        let path =
11926            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000040.pdf");
11927        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11928        let md = to_markdown(&doc).unwrap();
11929
11930        assert!(
11931            md.contains(
11932                "Thailand, Philippines and Indonesia in particular, identifying known experts"
11933            ),
11934            "{md}"
11935        );
11936        assert!(
11937            md.contains("Figure 1: Age by gender of respondents"),
11938            "{md}"
11939        );
11940        assert!(md.contains("Gender Analysis of Violent Extremism"), "{md}");
11941        assert!(
11942            !md.starts_with("# Thailand, Philippines and Indonesia in"),
11943            "{md}"
11944        );
11945    }
11946
11947    #[cfg(not(target_arch = "wasm32"))]
11948    #[test]
11949    fn test_to_markdown_does_not_misclassify_open_plate_pdf_64() {
11950        let path =
11951            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000064.pdf");
11952        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11953        let md = to_markdown(&doc).unwrap();
11954
11955        assert!(md.contains("estuarine influenced areas."), "{md}");
11956        assert!(md.contains("| MANILA | 2454 | 6,125 |"), "{md}");
11957        assert!(
11958            md.contains("The port of Manila has been documented"),
11959            "{md}"
11960        );
11961        assert!(!md.starts_with("# CAGAYAN DE ORO"), "{md}");
11962    }
11963
11964    #[cfg(not(target_arch = "wasm32"))]
11965    #[test]
11966    fn test_detect_footnote_citation_regions_on_real_pdf() {
11967        let path =
11968            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
11969        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11970        let regions = detect_footnote_citation_regions(&doc);
11971        assert!(!regions.is_empty(), "{regions:?}");
11972        assert!(
11973            regions.iter().any(|region| {
11974                region.rendered.contains("<table>")
11975                    && region.rendered.contains("<td>25</td>")
11976                    && region.rendered.contains("<td>29</td>")
11977            }),
11978            "{regions:#?}"
11979        );
11980        assert!(
11981            regions.iter().any(|region| {
11982                region.rendered.contains("<table>")
11983                    && region.rendered.contains("<td>30</td>")
11984                    && region.rendered.contains("<td>33</td>")
11985            }),
11986            "{regions:#?}"
11987        );
11988    }
11989
11990    #[cfg(not(target_arch = "wasm32"))]
11991    #[test]
11992    fn test_to_markdown_renders_footnote_citation_tables_on_real_pdf() {
11993        let path =
11994            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
11995        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11996        let md = to_markdown(&doc).unwrap();
11997
11998        assert!(md.contains("<table>"), "{md}");
11999        assert!(md.contains("<th>Footnote</th><th>Citation</th>"), "{md}");
12000        assert!(md.contains("<td>25</td><td>Wiliam Beckford"), "{md}");
12001        assert!(
12002            md.contains("<td>29</td><td>Pope, The Rape of the Lock, 69.</td>"),
12003            "{md}"
12004        );
12005        assert!(
12006            md.contains("<td>30</td><td>Beawes, Lex Mercatoria Rediviva, 791.</td>"),
12007            "{md}"
12008        );
12009        assert!(
12010            md.contains("<td>32</td><td>Beawes, Lex Mercatoria Rediviva, 792.</td>"),
12011            "{md}"
12012        );
12013        assert!(
12014            md.contains("<td>33</td><td>M.M., Pharmacopoia Reformata:"),
12015            "{md}"
12016        );
12017    }
12018
12019    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12020    #[test]
12021    fn test_to_markdown_projection_sheet_document_on_real_pdf() {
12022        let path =
12023            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000128.pdf");
12024        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12025        let md = to_markdown(&doc).unwrap();
12026
12027        assert!(md.contains("# Table and Figure from the Document"), "{md}");
12028        assert!(md.contains("| A | B | C | D | E |"), "{md}");
12029        assert!(
12030            md.contains("| 10 | 8 | 19.73214458 | 17.99 | 21.47 |"),
12031            "{md}"
12032        );
12033        assert!(
12034            md.contains("**Figure 13.3. Graph of Projection Estimates**"),
12035            "{md}"
12036        );
12037        assert!(md.contains("[Open Template in Microsoft Excel](#)"), "{md}");
12038        assert!(
12039            md.contains("*298 | Ch. 13. Homogeneous Investment Types*"),
12040            "{md}"
12041        );
12042    }
12043
12044    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12045    #[test]
12046    fn test_to_markdown_appendix_tables_document_on_real_pdf() {
12047        let path =
12048            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000082.pdf");
12049        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12050        let md = to_markdown(&doc).unwrap();
12051
12052        assert!(md.contains("# Appendices"), "{md}");
12053        assert!(
12054            md.contains("## TABLE 28: BREAKDOWN OF IMPRISONMENT CLAUSES IN STATE LAWS"),
12055            "{md}"
12056        );
12057        assert!(md.contains("| Imprisonment terms | Number of clauses | Percentage of all states | Percentage of total |"), "{md}");
12058        assert!(
12059            md.contains("| Less than 3 months | 4,448 | 21.3% | 17.0% |"),
12060            "{md}"
12061        );
12062        assert!(
12063            md.contains("## TABLE 29: STATES WITH MORE THAN 1,000 IMPRISONMENT CLAUSES"),
12064            "{md}"
12065        );
12066        assert!(
12067            md.contains(
12068                "| State | Number of clauses | GSDP (In Rs lakh crore) | GSDP (In $ billion) |"
12069            ),
12070            "{md}"
12071        );
12072        assert!(md.contains("| Gujarat | 1469 | 15.6 | 200.4 |"), "{md}");
12073        assert!(
12074            md.contains("*Sources: TeamLease Regtech, and Reserve Bank of India for GSDPs*"),
12075            "{md}"
12076        );
12077        assert!(md.contains("*Exchange rate: Rs 75 to USD*"), "{md}");
12078    }
12079
12080    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12081    #[test]
12082    fn test_to_markdown_titled_dual_table_document_on_real_pdf() {
12083        let path =
12084            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000084.pdf");
12085        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12086        let md = to_markdown(&doc).unwrap();
12087
12088        assert!(md.starts_with("# Jailed for Doing Business"), "{md}");
12089        assert!(
12090            md.contains("## TABLE 38: THREE CASE STUDIES ON NBFC COMPLIANCES*"),
12091            "{md}"
12092        );
12093        assert!(
12094            md.contains("| Percentage of imprisonment clauses | 20% | 30% | 37% |"),
12095            "{md}"
12096        );
12097        assert!(
12098            md.contains("## TABLE 39: BREAKDOWN OF IMPRISONMENT CLAUSES IN NBFC CASE STUDIES*"),
12099            "{md}"
12100        );
12101        assert!(
12102            md.contains("| 5 years to 10 years | 19 | 19 | 19 |"),
12103            "{md}"
12104        );
12105        assert!(
12106            md.contains("*These are real data from three NBFCs*"),
12107            "{md}"
12108        );
12109    }
12110
12111    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12112    #[test]
12113    fn test_to_markdown_registration_report_document_on_real_pdf() {
12114        let path =
12115            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000047.pdf");
12116        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12117        let md = to_markdown(&doc).unwrap();
12118
12119        assert!(
12120            md.starts_with("# ANFREL Pre-Election Assessment Mission Report"),
12121            "{md}"
12122        );
12123        assert!(
12124            md.contains(
12125                "| 14 | Cambodian Indigeneous Peoples Democracy Party | 19 | 194 | 19 | 202 | +8 |"
12126            ),
12127            "{md}"
12128        );
12129        assert!(
12130            md.contains("|  | Total |  | 84,208 |  | 86,092 | +1,884 |"),
12131            "{md}"
12132        );
12133        assert!(!md.contains("|  | Democracy Party |"), "{md}");
12134    }
12135
12136    #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
12137    #[test]
12138    fn test_to_markdown_dual_table_article_document_on_real_pdf() {
12139        let path =
12140            Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000190.pdf");
12141        let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
12142        let md = to_markdown(&doc).unwrap();
12143
12144        assert!(
12145            md.starts_with("# Table 6: Performance comparison amongst the merge candidates"),
12146            "{md}"
12147        );
12148        assert!(
12149            md.contains("*Table 6*: Performance comparison amongst the merge candidates."),
12150            "{md}"
12151        );
12152        assert!(md.contains("# Table 7: Ablation studies on the different merge methods used for obtaining the final model"), "{md}");
12153        assert!(!md.contains("*Table 6*: Table 6:"), "{md}");
12154        assert!(!md.contains("| Merge v1"), "{md}");
12155    }
12156
12157    #[test]
12158    fn test_normalize_list_text_strips_redundant_bullets() {
12159        assert_eq!(
12160            normalize_list_text("• Collected via surveys"),
12161            "Collected via surveys"
12162        );
12163        assert!(is_pure_bullet_marker("•"));
12164    }
12165
12166    #[test]
12167    fn test_reference_continuation_detected() {
12168        assert!(should_merge_paragraph_text(
12169            "Scaling laws for transfer.",
12170            "arXiv preprint arXiv:2102.01293."
12171        ));
12172    }
12173
12174    #[test]
12175    fn test_enumerated_markers_are_detected() {
12176        assert!(starts_with_enumerated_marker("iii. Third item"));
12177        assert!(starts_with_enumerated_marker("1) First item"));
12178        assert!(starts_with_enumerated_marker("a. Lettered item"));
12179        assert!(!starts_with_enumerated_marker("Figure 1. Caption"));
12180        assert!(!starts_with_enumerated_marker("Natural dispersal"));
12181    }
12182
12183    fn make_heading(text: &str) -> ContentElement {
12184        let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
12185        let chunk = TextChunk {
12186            value: text.to_string(),
12187            bbox: bbox.clone(),
12188            font_name: "Lato-Bold".to_string(),
12189            font_size: 12.0,
12190            font_weight: 700.0,
12191            italic_angle: 0.0,
12192            font_color: "#000000".to_string(),
12193            contrast_ratio: 21.0,
12194            symbol_ends: vec![],
12195            text_format: TextFormat::Normal,
12196            text_type: TextType::Regular,
12197            pdf_layer: PdfLayer::Main,
12198            ocg_visible: true,
12199            index: None,
12200            page_number: Some(1),
12201            level: None,
12202            mcid: None,
12203        };
12204        let line = TextLine {
12205            bbox: bbox.clone(),
12206            index: None,
12207            level: None,
12208            font_size: 12.0,
12209            base_line: 702.0,
12210            slant_degree: 0.0,
12211            is_hidden_text: false,
12212            text_chunks: vec![chunk],
12213            is_line_start: true,
12214            is_line_end: true,
12215            is_list_line: false,
12216            connected_line_art_label: None,
12217        };
12218        let block = TextBlock {
12219            bbox: bbox.clone(),
12220            index: None,
12221            level: None,
12222            font_size: 12.0,
12223            base_line: 702.0,
12224            slant_degree: 0.0,
12225            is_hidden_text: false,
12226            text_lines: vec![line],
12227            has_start_line: true,
12228            has_end_line: true,
12229            text_alignment: None,
12230        };
12231        let column = TextColumn {
12232            bbox: bbox.clone(),
12233            index: None,
12234            level: None,
12235            font_size: 12.0,
12236            base_line: 702.0,
12237            slant_degree: 0.0,
12238            is_hidden_text: false,
12239            text_blocks: vec![block],
12240        };
12241        ContentElement::Heading(SemanticHeading {
12242            base: SemanticParagraph {
12243                base: SemanticTextNode {
12244                    bbox,
12245                    index: None,
12246                    level: None,
12247                    semantic_type: crate::models::enums::SemanticType::Heading,
12248                    correct_semantic_score: None,
12249                    columns: vec![column],
12250                    font_weight: Some(700.0),
12251                    font_size: Some(12.0),
12252                    text_color: None,
12253                    italic_angle: None,
12254                    font_name: Some("Lato-Bold".to_string()),
12255                    text_format: None,
12256                    max_font_size: Some(12.0),
12257                    background_color: None,
12258                    is_hidden_text: false,
12259                },
12260                enclosed_top: false,
12261                enclosed_bottom: false,
12262                indentation: 0,
12263            },
12264            heading_level: Some(1),
12265        })
12266    }
12267
12268    fn make_heading_at(left: f64, bottom: f64, right: f64, top: f64, text: &str) -> ContentElement {
12269        let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12270        let chunk = TextChunk {
12271            value: text.to_string(),
12272            bbox: bbox.clone(),
12273            font_name: "Lato-Bold".to_string(),
12274            font_size: top - bottom,
12275            font_weight: 700.0,
12276            italic_angle: 0.0,
12277            font_color: "#000000".to_string(),
12278            contrast_ratio: 21.0,
12279            symbol_ends: vec![],
12280            text_format: TextFormat::Normal,
12281            text_type: TextType::Regular,
12282            pdf_layer: PdfLayer::Main,
12283            ocg_visible: true,
12284            index: None,
12285            page_number: Some(1),
12286            level: None,
12287            mcid: None,
12288        };
12289        let line = TextLine {
12290            bbox: bbox.clone(),
12291            index: None,
12292            level: None,
12293            font_size: top - bottom,
12294            base_line: bottom + 2.0,
12295            slant_degree: 0.0,
12296            is_hidden_text: false,
12297            text_chunks: vec![chunk],
12298            is_line_start: true,
12299            is_line_end: true,
12300            is_list_line: false,
12301            connected_line_art_label: None,
12302        };
12303        let block = TextBlock {
12304            bbox: bbox.clone(),
12305            index: None,
12306            level: None,
12307            font_size: top - bottom,
12308            base_line: bottom + 2.0,
12309            slant_degree: 0.0,
12310            is_hidden_text: false,
12311            text_lines: vec![line],
12312            has_start_line: true,
12313            has_end_line: true,
12314            text_alignment: None,
12315        };
12316        let column = TextColumn {
12317            bbox: bbox.clone(),
12318            index: None,
12319            level: None,
12320            font_size: top - bottom,
12321            base_line: bottom + 2.0,
12322            slant_degree: 0.0,
12323            is_hidden_text: false,
12324            text_blocks: vec![block],
12325        };
12326        ContentElement::Heading(SemanticHeading {
12327            base: SemanticParagraph {
12328                base: SemanticTextNode {
12329                    bbox,
12330                    index: None,
12331                    level: None,
12332                    semantic_type: crate::models::enums::SemanticType::Heading,
12333                    correct_semantic_score: None,
12334                    columns: vec![column],
12335                    font_weight: Some(700.0),
12336                    font_size: Some(top - bottom),
12337                    text_color: None,
12338                    italic_angle: None,
12339                    font_name: Some("Lato-Bold".to_string()),
12340                    text_format: None,
12341                    max_font_size: Some(top - bottom),
12342                    background_color: None,
12343                    is_hidden_text: false,
12344                },
12345                enclosed_top: false,
12346                enclosed_bottom: false,
12347                indentation: 0,
12348            },
12349            heading_level: None,
12350        })
12351    }
12352
12353    fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
12354        make_paragraph_at(72.0, bottom, 300.0, top, text)
12355    }
12356
12357    fn make_paragraph_at(
12358        left: f64,
12359        bottom: f64,
12360        right: f64,
12361        top: f64,
12362        text: &str,
12363    ) -> ContentElement {
12364        let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12365        let chunk = TextChunk {
12366            value: text.to_string(),
12367            bbox: bbox.clone(),
12368            font_name: "Lato-Regular".to_string(),
12369            font_size: (top - bottom).max(1.0),
12370            font_weight: 400.0,
12371            italic_angle: 0.0,
12372            font_color: "#000000".to_string(),
12373            contrast_ratio: 21.0,
12374            symbol_ends: vec![],
12375            text_format: TextFormat::Normal,
12376            text_type: TextType::Regular,
12377            pdf_layer: PdfLayer::Main,
12378            ocg_visible: true,
12379            index: None,
12380            page_number: Some(1),
12381            level: None,
12382            mcid: None,
12383        };
12384        let line = TextLine {
12385            bbox: bbox.clone(),
12386            index: None,
12387            level: None,
12388            font_size: chunk.font_size,
12389            base_line: bottom + 2.0,
12390            slant_degree: 0.0,
12391            is_hidden_text: false,
12392            text_chunks: vec![chunk],
12393            is_line_start: true,
12394            is_line_end: true,
12395            is_list_line: false,
12396            connected_line_art_label: None,
12397        };
12398        let block = TextBlock {
12399            bbox: bbox.clone(),
12400            index: None,
12401            level: None,
12402            font_size: line.font_size,
12403            base_line: line.base_line,
12404            slant_degree: 0.0,
12405            is_hidden_text: false,
12406            text_lines: vec![line],
12407            has_start_line: true,
12408            has_end_line: true,
12409            text_alignment: None,
12410        };
12411        let column = TextColumn {
12412            bbox: bbox.clone(),
12413            index: None,
12414            level: None,
12415            font_size: block.font_size,
12416            base_line: block.base_line,
12417            slant_degree: 0.0,
12418            is_hidden_text: false,
12419            text_blocks: vec![block],
12420        };
12421        ContentElement::Paragraph(SemanticParagraph {
12422            base: SemanticTextNode {
12423                bbox,
12424                index: None,
12425                level: None,
12426                semantic_type: crate::models::enums::SemanticType::Paragraph,
12427                correct_semantic_score: None,
12428                columns: vec![column],
12429                font_weight: Some(400.0),
12430                font_size: Some(top - bottom),
12431                text_color: None,
12432                italic_angle: None,
12433                font_name: Some("Lato-Regular".to_string()),
12434                text_format: None,
12435                max_font_size: Some(top - bottom),
12436                background_color: None,
12437                is_hidden_text: false,
12438            },
12439            enclosed_top: false,
12440            enclosed_bottom: false,
12441            indentation: 0,
12442        })
12443    }
12444
12445    fn make_fallback_list(items: &[&str]) -> ContentElement {
12446        let mut list_items = Vec::new();
12447        for (idx, text) in items.iter().enumerate() {
12448            let top = 700.0 - idx as f64 * 18.0;
12449            let bottom = top - 12.0;
12450            let bbox = BoundingBox::new(Some(1), 72.0, bottom, 320.0, top);
12451            list_items.push(ListItem {
12452                bbox: bbox.clone(),
12453                index: None,
12454                level: None,
12455                label: ListLabel {
12456                    bbox: bbox.clone(),
12457                    content: vec![],
12458                    semantic_type: None,
12459                },
12460                body: ListBody {
12461                    bbox: bbox.clone(),
12462                    content: vec![],
12463                    semantic_type: None,
12464                },
12465                label_length: 0,
12466                contents: vec![make_paragraph_at(72.0, bottom, 320.0, top, text)],
12467                semantic_type: None,
12468            });
12469        }
12470
12471        ContentElement::List(PDFList {
12472            bbox: BoundingBox::new(
12473                Some(1),
12474                72.0,
12475                700.0 - items.len() as f64 * 18.0,
12476                320.0,
12477                700.0,
12478            ),
12479            index: None,
12480            level: None,
12481            list_items,
12482            numbering_style: Some("bullets".to_string()),
12483            common_prefix: None,
12484            previous_list_id: None,
12485            next_list_id: None,
12486        })
12487    }
12488
12489    fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
12490        let mut table_rows = Vec::new();
12491        for (ri, (title, page)) in rows.iter().enumerate() {
12492            let top = 680.0 - ri as f64 * 18.0;
12493            let bottom = top - 12.0;
12494            let left_bbox = BoundingBox::new(Some(1), 72.0, bottom, 280.0, top);
12495            let right_bbox = BoundingBox::new(Some(1), 320.0, bottom, 360.0, top);
12496            table_rows.push(TableBorderRow {
12497                bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
12498                index: None,
12499                level: None,
12500                row_number: ri,
12501                cells: vec![
12502                    TableBorderCell {
12503                        bbox: left_bbox.clone(),
12504                        index: None,
12505                        level: None,
12506                        row_number: ri,
12507                        col_number: 0,
12508                        row_span: 1,
12509                        col_span: 1,
12510                        content: vec![TableToken {
12511                            base: TextChunk {
12512                                value: (*title).to_string(),
12513                                bbox: left_bbox,
12514                                font_name: "Lato-Regular".to_string(),
12515                                font_size: 10.0,
12516                                font_weight: 400.0,
12517                                italic_angle: 0.0,
12518                                font_color: "#000000".to_string(),
12519                                contrast_ratio: 21.0,
12520                                symbol_ends: vec![],
12521                                text_format: TextFormat::Normal,
12522                                text_type: TextType::Regular,
12523                                pdf_layer: PdfLayer::Main,
12524                                ocg_visible: true,
12525                                index: None,
12526                                page_number: Some(1),
12527                                level: None,
12528                                mcid: None,
12529                            },
12530                            token_type: TableTokenType::Text,
12531                        }],
12532                        contents: vec![],
12533                        semantic_type: None,
12534                    },
12535                    TableBorderCell {
12536                        bbox: right_bbox.clone(),
12537                        index: None,
12538                        level: None,
12539                        row_number: ri,
12540                        col_number: 1,
12541                        row_span: 1,
12542                        col_span: 1,
12543                        content: vec![TableToken {
12544                            base: TextChunk {
12545                                value: (*page).to_string(),
12546                                bbox: right_bbox,
12547                                font_name: "Lato-Regular".to_string(),
12548                                font_size: 10.0,
12549                                font_weight: 400.0,
12550                                italic_angle: 0.0,
12551                                font_color: "#000000".to_string(),
12552                                contrast_ratio: 21.0,
12553                                symbol_ends: vec![],
12554                                text_format: TextFormat::Normal,
12555                                text_type: TextType::Regular,
12556                                pdf_layer: PdfLayer::Main,
12557                                ocg_visible: true,
12558                                index: None,
12559                                page_number: Some(1),
12560                                level: None,
12561                                mcid: None,
12562                            },
12563                            token_type: TableTokenType::Text,
12564                        }],
12565                        contents: vec![],
12566                        semantic_type: None,
12567                    },
12568                ],
12569                semantic_type: None,
12570            });
12571        }
12572
12573        ContentElement::TableBorder(TableBorder {
12574            bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
12575            index: None,
12576            level: Some("1".to_string()),
12577            x_coordinates: vec![72.0, 320.0, 360.0],
12578            x_widths: vec![0.0, 0.0, 0.0],
12579            y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
12580            y_widths: vec![0.0, 0.0, 0.0, 0.0],
12581            rows: table_rows,
12582            num_rows: rows.len(),
12583            num_columns: 2,
12584            is_bad_table: false,
12585            is_table_transformer: false,
12586            previous_table: None,
12587            next_table: None,
12588        })
12589    }
12590
12591    #[test]
12592    fn test_contents_document_renders_toc_table_rows() {
12593        let mut doc = PdfDocument::new("contents.pdf".to_string());
12594        doc.kids.push(make_heading("CONTENTS"));
12595        doc.kids.push(make_toc_table(&[
12596            ("Experiment #1: Hydrostatic Pressure", "3"),
12597            ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
12598            ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
12599            ("Experiment #4: Energy Loss in Pipes", "33"),
12600            ("Experiment #5: Impact of a Jet", "43"),
12601            ("Experiment #6: Orifice and Free Jet Flow", "50"),
12602            ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
12603            ("References", "101"),
12604        ]));
12605
12606        let md = to_markdown(&doc).unwrap();
12607        assert!(md.starts_with("# CONTENTS\n\n"));
12608        assert!(md.contains("- Experiment #1: Hydrostatic Pressure 3\n"));
12609        assert!(md.contains("- Experiment #2: Bernoulli's Theorem Demonstration 13\n"));
12610        assert!(md.contains("- Experiment #7: Osborne Reynolds' Demonstration 59\n"));
12611        assert!(md.contains("- References 101\n"));
12612    }
12613
12614    #[test]
12615    fn test_toc_semantic_paragraphs_render_without_blank_lines() {
12616        let mut doc = PdfDocument::new("toc-semantic.pdf".to_string());
12617        let mut first = make_paragraph(
12618            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
12619            700.0,
12620            712.0,
12621        );
12622        let mut second = make_paragraph("Section 5.1: The Linear Model 35", 684.0, 696.0);
12623        if let ContentElement::Paragraph(p) = &mut first {
12624            p.base.semantic_type = SemanticType::TableOfContent;
12625        }
12626        if let ContentElement::Paragraph(p) = &mut second {
12627            p.base.semantic_type = SemanticType::TableOfContent;
12628        }
12629        doc.kids.push(first);
12630        doc.kids.push(second);
12631
12632        let md = to_markdown(&doc).unwrap();
12633        assert!(md.contains(
12634            "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n"
12635        ));
12636    }
12637
12638    #[test]
12639    fn test_compact_toc_document_renders_without_blank_lines() {
12640        let mut doc = PdfDocument::new("compact-toc.pdf".to_string());
12641        doc.kids.push(make_paragraph(
12642            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
12643            700.0,
12644            712.0,
12645        ));
12646        doc.kids.push(make_paragraph(
12647            "Section 5.1: The Linear Model 35",
12648            684.0,
12649            696.0,
12650        ));
12651        doc.kids.push(make_paragraph(
12652            "Part VI. Chapter Six - Comparing Three or More Group Means",
12653            668.0,
12654            680.0,
12655        ));
12656        doc.kids.push(make_paragraph(
12657            "Section 6.1: Between Versus Within Group Analyses 49",
12658            652.0,
12659            664.0,
12660        ));
12661        doc.kids.push(make_paragraph(
12662            "Part VII. Chapter Seven - Moderation and Mediation Analyses",
12663            636.0,
12664            648.0,
12665        ));
12666        doc.kids.push(make_paragraph(
12667            "Section 7.1: Mediation and Moderation Models 64",
12668            620.0,
12669            632.0,
12670        ));
12671        doc.kids
12672            .push(make_paragraph("References 101", 604.0, 616.0));
12673        doc.kids.push(make_paragraph(
12674            "Section 8.1: Factor Analysis Definitions 75",
12675            588.0,
12676            600.0,
12677        ));
12678
12679        let md = to_markdown(&doc).unwrap();
12680        assert!(md.contains(
12681            "# Part V. Chapter Five - Comparing Associations Between Multiple Variables\n\n## Section 5.1: The Linear Model"
12682        ));
12683        assert!(md.contains(
12684            "# Part VI. Chapter Six - Comparing Three or More Group Means\n\n## Section 6.1: Between Versus Within Group Analyses"
12685        ));
12686        assert!(md.contains("References 101\n\n## Section 8.1: Factor Analysis Definitions"));
12687    }
12688
12689    #[test]
12690    fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
12691        let mut doc = PdfDocument::new("caption-body.pdf".to_string());
12692        doc.kids.push(make_paragraph(
12693            "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.",
12694            500.0,
12695            540.0,
12696        ));
12697
12698        let md = to_markdown(&doc).unwrap();
12699        assert!(md.contains("USGS)\n\nOur nearest astronomical neighbor"));
12700    }
12701
12702    #[test]
12703    fn test_short_caption_label_merges_with_following_tail_and_body() {
12704        let mut doc = PdfDocument::new("diagram-caption.pdf".to_string());
12705        doc.kids.push(make_paragraph("Diagram 5", 540.0, 552.0));
12706        doc.kids.push(make_paragraph(
12707            "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.",
12708            520.0,
12709            532.0,
12710        ));
12711
12712        let md = to_markdown(&doc).unwrap();
12713        assert!(md.contains(
12714            "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers"
12715        ));
12716    }
12717
12718    #[test]
12719    fn test_short_caption_label_merges_with_tail_and_year() {
12720        let mut doc = PdfDocument::new("figure-caption.pdf".to_string());
12721        doc.kids.push(make_paragraph("Figure 4", 540.0, 552.0));
12722        doc.kids.push(make_paragraph(
12723            "Komnas HAM's YouTube channel as of 1 December",
12724            520.0,
12725            532.0,
12726        ));
12727        doc.kids.push(make_paragraph("2021", 500.0, 512.0));
12728
12729        let md = to_markdown(&doc).unwrap();
12730        assert!(md.contains("Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021"));
12731        assert!(!md.contains("\n\n2021"));
12732    }
12733
12734    #[test]
12735    fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
12736        let mut doc = PdfDocument::new("chart.pdf".to_string());
12737        doc.kids.push(make_paragraph("Figure 1", 760.0, 772.0));
12738        doc.kids.push(make_paragraph("100", 520.0, 528.0));
12739        doc.kids
12740            .push(make_paragraph("Body text continues here.", 400.0, 412.0));
12741        doc.kids.push(make_paragraph("36", 20.0, 28.0));
12742
12743        let md = to_markdown(&doc).unwrap();
12744        assert!(md.contains("100"));
12745        assert!(!md.lines().any(|line| line.trim() == "36"));
12746    }
12747
12748    #[test]
12749    fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
12750        let mut doc = PdfDocument::new("paragraphs.pdf".to_string());
12751        doc.kids.push(make_paragraph(
12752            "First semantic paragraph ends here.",
12753            520.0,
12754            532.0,
12755        ));
12756        doc.kids.push(make_paragraph(
12757            "Second semantic paragraph starts here.",
12758            500.0,
12759            512.0,
12760        ));
12761
12762        let md = to_markdown(&doc).unwrap();
12763        assert!(md.contains(
12764            "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here."
12765        ));
12766    }
12767
12768    #[test]
12769    fn test_lowercase_semantic_paragraph_continuation_is_merged() {
12770        let mut doc = PdfDocument::new("continuation.pdf".to_string());
12771        doc.kids.push(make_paragraph(
12772            "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference",
12773            520.0,
12774            532.0,
12775        ));
12776        doc.kids.push(make_paragraph("of interest.", 500.0, 512.0));
12777
12778        let md = to_markdown(&doc).unwrap();
12779        assert!(md.contains(
12780            "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest."
12781        ));
12782    }
12783
12784    #[test]
12785    fn test_semantic_enumerated_paragraphs_are_not_merged() {
12786        let mut doc = PdfDocument::new("enumerated-paragraphs.pdf".to_string());
12787        doc.kids.push(make_paragraph(
12788            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
12789            520.0,
12790            532.0,
12791        ));
12792        doc.kids.push(make_paragraph(
12793            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
12794            500.0,
12795            512.0,
12796        ));
12797
12798        let md = to_markdown(&doc).unwrap();
12799        assert!(md.contains(
12800            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\n\niv. This business model will be operating cost-oriented not capital cost-oriented."
12801        ));
12802    }
12803
12804    #[test]
12805    fn test_leading_figure_carryover_is_skipped_before_first_numbered_heading() {
12806        let mut doc = PdfDocument::new("leading-figure-carryover.pdf".to_string());
12807        doc.number_of_pages = 1;
12808        doc.kids.push(make_paragraph_at(
12809            72.0,
12810            742.0,
12811            540.0,
12812            756.0,
12813            "Figure 6. Mytella strigata biofouling green mussel farms in Bacoor City, Cavite, Manila Bay",
12814        ));
12815        doc.kids.push(make_heading_at(
12816            72.0,
12817            680.0,
12818            260.0,
12819            696.0,
12820            "5. Natural dispersal",
12821        ));
12822        doc.kids.push(make_paragraph_at(
12823            72.0,
12824            640.0,
12825            540.0,
12826            654.0,
12827            "Dispersal by purely natural means is not included as a pathway of biological invasions.",
12828        ));
12829
12830        let md = to_markdown(&doc).unwrap();
12831        assert!(md.starts_with("# 5. Natural dispersal"));
12832        assert!(!md.contains("Figure 6. Mytella strigata"));
12833    }
12834
12835    #[test]
12836    fn test_list_renderer_strips_duplicate_bullets_and_skips_bullet_only_items() {
12837        let mut doc = PdfDocument::new("bullets.pdf".to_string());
12838        doc.kids.push(make_fallback_list(&[
12839            "• First item",
12840            "•",
12841            "• Second item",
12842            "133",
12843        ]));
12844
12845        let md = to_markdown(&doc).unwrap();
12846        assert!(md.contains("- First item"));
12847        assert!(md.contains("- Second item"));
12848        assert!(!md.contains("- • First item"));
12849        assert!(!md.contains("\n- •\n"));
12850        assert!(!md.contains("\n- 133\n"));
12851    }
12852
12853    #[test]
12854    fn test_list_renderer_merges_wrapped_continuation_items() {
12855        let mut doc = PdfDocument::new("wrapped-list.pdf".to_string());
12856        doc.kids.push(make_fallback_list(&[
12857            "Use a micropipette to add 2 μL of loading dye",
12858            "and down a couple of times to mix the loading dye with the digested DNA.",
12859            "Use a fresh pipet tip for each reaction tube.",
12860        ]));
12861
12862        let md = to_markdown(&doc).unwrap();
12863        assert!(md.contains(
12864            "- Use a micropipette to add 2 μL of loading dye and down a couple of times to mix the loading dye with the digested DNA."
12865        ));
12866        assert!(md.contains("- Use a fresh pipet tip for each reaction tube."));
12867        assert!(!md.contains("\n- and down"));
12868    }
12869
12870    #[test]
12871    fn test_list_renderer_keeps_enumerated_items_separate() {
12872        let mut doc = PdfDocument::new("enumerated-list.pdf".to_string());
12873        doc.kids.push(make_fallback_list(&[
12874            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
12875            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
12876            "v. Assumed selling price of wood pellet is $100 per tonne and appropriate.",
12877        ]));
12878
12879        let md = to_markdown(&doc).unwrap();
12880        assert!(md.contains("iii. Looking at cost items, the cost of raw woods procurement will be highest share.\niv. This business model will be operating cost-oriented not capital cost-oriented.\nv. Assumed selling price of wood pellet is $100 per tonne and appropriate."));
12881        assert!(!md.contains("- iii."));
12882    }
12883
12884    #[test]
12885    fn test_postprocess_drops_isolated_single_char_noise_lines() {
12886        let markdown = "# The Data Journey\n\n1\n\nTo get started.\n\no\n\nNOTE: Keep going.\n";
12887        let cleaned = drop_isolated_noise_lines(markdown);
12888        assert!(!cleaned.contains("\n1\n"));
12889        assert!(!cleaned.contains("\no\n"));
12890        assert!(cleaned.contains("To get started."));
12891        assert!(cleaned.contains("NOTE: Keep going."));
12892    }
12893
12894    fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
12895        let mut table_rows = Vec::new();
12896        for (row_number, (left, right)) in rows.iter().enumerate() {
12897            let top = 656.0 - row_number as f64 * 18.0;
12898            let bottom = top - 16.0;
12899            let mut cells = Vec::new();
12900            for (col_number, (text, left_x, right_x)) in
12901                [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
12902                    .into_iter()
12903                    .enumerate()
12904            {
12905                let content = if text.is_empty() {
12906                    Vec::new()
12907                } else {
12908                    vec![TableToken {
12909                        base: TextChunk {
12910                            value: text.to_string(),
12911                            bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12912                            font_name: "Test".to_string(),
12913                            font_size: 11.0,
12914                            font_weight: 400.0,
12915                            italic_angle: 0.0,
12916                            font_color: "[0.0]".to_string(),
12917                            contrast_ratio: 21.0,
12918                            symbol_ends: Vec::new(),
12919                            text_format: TextFormat::Normal,
12920                            text_type: TextType::Regular,
12921                            pdf_layer: PdfLayer::Main,
12922                            ocg_visible: true,
12923                            index: None,
12924                            page_number: Some(1),
12925                            level: None,
12926                            mcid: None,
12927                        },
12928                        token_type: TableTokenType::Text,
12929                    }]
12930                };
12931                cells.push(TableBorderCell {
12932                    bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12933                    index: None,
12934                    level: None,
12935                    row_number,
12936                    col_number,
12937                    row_span: 1,
12938                    col_span: 1,
12939                    content,
12940                    contents: vec![],
12941                    semantic_type: None,
12942                });
12943            }
12944
12945            table_rows.push(TableBorderRow {
12946                bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
12947                index: None,
12948                level: None,
12949                row_number,
12950                cells,
12951                semantic_type: None,
12952            });
12953        }
12954
12955        ContentElement::TableBorder(TableBorder {
12956            bbox: BoundingBox::new(
12957                Some(1),
12958                72.0,
12959                656.0 - rows.len() as f64 * 18.0 - 16.0,
12960                420.0,
12961                656.0,
12962            ),
12963            index: None,
12964            level: Some("1".to_string()),
12965            x_coordinates: vec![72.0, 220.0, 420.0],
12966            x_widths: vec![0.0; 3],
12967            y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
12968            y_widths: vec![0.0; rows.len() + 1],
12969            rows: table_rows,
12970            num_rows: rows.len(),
12971            num_columns: 2,
12972            is_bad_table: false,
12973            is_table_transformer: false,
12974            previous_table: None,
12975            next_table: None,
12976        })
12977    }
12978
12979    fn make_chunked_paragraph_line(
12980        segments: &[(&str, f64, f64)],
12981        bottom: f64,
12982        top: f64,
12983    ) -> ContentElement {
12984        let bbox = BoundingBox::new(
12985            Some(1),
12986            segments.first().map(|(_, left, _)| *left).unwrap_or(72.0),
12987            bottom,
12988            segments.last().map(|(_, _, right)| *right).unwrap_or(320.0),
12989            top,
12990        );
12991
12992        let chunks = segments
12993            .iter()
12994            .map(|(text, left, right)| TextChunk {
12995                value: (*text).to_string(),
12996                bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
12997                font_name: "Lato-Regular".to_string(),
12998                font_size: top - bottom,
12999                font_weight: 400.0,
13000                italic_angle: 0.0,
13001                font_color: "#000000".to_string(),
13002                contrast_ratio: 21.0,
13003                symbol_ends: vec![],
13004                text_format: TextFormat::Normal,
13005                text_type: TextType::Regular,
13006                pdf_layer: PdfLayer::Main,
13007                ocg_visible: true,
13008                index: None,
13009                page_number: Some(1),
13010                level: None,
13011                mcid: None,
13012            })
13013            .collect::<Vec<_>>();
13014
13015        let line = TextLine {
13016            bbox: bbox.clone(),
13017            index: None,
13018            level: None,
13019            font_size: top - bottom,
13020            base_line: bottom + 2.0,
13021            slant_degree: 0.0,
13022            is_hidden_text: false,
13023            text_chunks: chunks,
13024            is_line_start: true,
13025            is_line_end: true,
13026            is_list_line: false,
13027            connected_line_art_label: None,
13028        };
13029        let block = TextBlock {
13030            bbox: bbox.clone(),
13031            index: None,
13032            level: None,
13033            font_size: line.font_size,
13034            base_line: line.base_line,
13035            slant_degree: 0.0,
13036            is_hidden_text: false,
13037            text_lines: vec![line],
13038            has_start_line: true,
13039            has_end_line: true,
13040            text_alignment: None,
13041        };
13042        let column = TextColumn {
13043            bbox: bbox.clone(),
13044            index: None,
13045            level: None,
13046            font_size: block.font_size,
13047            base_line: block.base_line,
13048            slant_degree: 0.0,
13049            is_hidden_text: false,
13050            text_blocks: vec![block],
13051        };
13052
13053        ContentElement::Paragraph(SemanticParagraph {
13054            base: SemanticTextNode {
13055                bbox,
13056                index: None,
13057                level: None,
13058                semantic_type: SemanticType::Paragraph,
13059                correct_semantic_score: None,
13060                columns: vec![column],
13061                font_weight: Some(400.0),
13062                font_size: Some(top - bottom),
13063                text_color: None,
13064                italic_angle: None,
13065                font_name: Some("Lato-Regular".to_string()),
13066                text_format: None,
13067                max_font_size: Some(top - bottom),
13068                background_color: None,
13069                is_hidden_text: false,
13070            },
13071            enclosed_top: false,
13072            enclosed_bottom: false,
13073            indentation: 0,
13074        })
13075    }
13076
13077    fn make_n_column_table(rows: &[Vec<&str>], column_bounds: &[(f64, f64)]) -> ContentElement {
13078        let mut table_rows = Vec::new();
13079        for (row_number, row_values) in rows.iter().enumerate() {
13080            let top = 656.0 - row_number as f64 * 18.0;
13081            let bottom = top - 16.0;
13082            let mut cells = Vec::new();
13083            for (col_number, (left_x, right_x)) in column_bounds.iter().enumerate() {
13084                let text = row_values.get(col_number).copied().unwrap_or("");
13085                let content = if text.is_empty() {
13086                    Vec::new()
13087                } else {
13088                    vec![TableToken {
13089                        base: TextChunk {
13090                            value: text.to_string(),
13091                            bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
13092                            font_name: "Test".to_string(),
13093                            font_size: 11.0,
13094                            font_weight: 400.0,
13095                            italic_angle: 0.0,
13096                            font_color: "[0.0]".to_string(),
13097                            contrast_ratio: 21.0,
13098                            symbol_ends: Vec::new(),
13099                            text_format: TextFormat::Normal,
13100                            text_type: TextType::Regular,
13101                            pdf_layer: PdfLayer::Main,
13102                            ocg_visible: true,
13103                            index: None,
13104                            page_number: Some(1),
13105                            level: None,
13106                            mcid: None,
13107                        },
13108                        token_type: TableTokenType::Text,
13109                    }]
13110                };
13111                cells.push(TableBorderCell {
13112                    bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
13113                    index: None,
13114                    level: None,
13115                    row_number,
13116                    col_number,
13117                    row_span: 1,
13118                    col_span: 1,
13119                    content,
13120                    contents: vec![],
13121                    semantic_type: None,
13122                });
13123            }
13124
13125            table_rows.push(TableBorderRow {
13126                bbox: BoundingBox::new(
13127                    Some(1),
13128                    column_bounds.first().map(|(left, _)| *left).unwrap_or(72.0),
13129                    bottom,
13130                    column_bounds
13131                        .last()
13132                        .map(|(_, right)| *right)
13133                        .unwrap_or(420.0),
13134                    top,
13135                ),
13136                index: None,
13137                level: None,
13138                row_number,
13139                cells,
13140                semantic_type: None,
13141            });
13142        }
13143
13144        let left = column_bounds
13145            .first()
13146            .map(|(value, _)| *value)
13147            .unwrap_or(72.0);
13148        let right = column_bounds
13149            .last()
13150            .map(|(_, value)| *value)
13151            .unwrap_or(420.0);
13152        let x_coordinates = std::iter::once(left)
13153            .chain(column_bounds.iter().map(|(_, right)| *right))
13154            .collect::<Vec<_>>();
13155
13156        ContentElement::TableBorder(TableBorder {
13157            bbox: BoundingBox::new(
13158                Some(1),
13159                left,
13160                656.0 - rows.len() as f64 * 18.0 - 16.0,
13161                right,
13162                656.0,
13163            ),
13164            index: None,
13165            level: Some("1".to_string()),
13166            x_coordinates,
13167            x_widths: vec![0.0; column_bounds.len() + 1],
13168            y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
13169            y_widths: vec![0.0; rows.len() + 1],
13170            rows: table_rows,
13171            num_rows: rows.len(),
13172            num_columns: column_bounds.len(),
13173            is_bad_table: false,
13174            is_table_transformer: false,
13175            previous_table: None,
13176            next_table: None,
13177        })
13178    }
13179
13180    #[test]
13181    fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
13182        let mut doc = PdfDocument::new("cec-table.pdf".to_string());
13183        doc.number_of_pages = 1;
13184        doc.kids.push(make_two_column_table(&[
13185            ("Mineral or colloid type", "CEC of pure colloid"),
13186            ("", "cmolc/kg"),
13187            ("kaolinite", "10"),
13188            ("illite", "30"),
13189        ]));
13190
13191        let md = to_markdown(&doc).unwrap();
13192        assert!(md.contains("| --- | --- |"));
13193        assert!(md.contains("| kaolinite | 10 |"));
13194    }
13195
13196    #[test]
13197    fn test_single_caption_chart_renderer_skips_documents_with_populated_tables() {
13198        let mut doc = PdfDocument::new("table-with-caption.pdf".to_string());
13199        doc.number_of_pages = 1;
13200        for idx in 0..10 {
13201            let bottom = 720.0 - idx as f64 * 18.0;
13202            doc.kids.push(make_paragraph(
13203                "Explanatory body text that should remain outside the chart-only renderer.",
13204                bottom,
13205                bottom + 10.0,
13206            ));
13207        }
13208        doc.kids.push(make_paragraph(
13209            "Figure 7.2: Kinematic Viscosity of Water at Atmospheric Pressure.",
13210            150.0,
13211            162.0,
13212        ));
13213        doc.kids.push(make_two_column_table(&[
13214            ("Temperature", "Viscosity"),
13215            ("20", "1.004"),
13216            ("25", "0.893"),
13217        ]));
13218
13219        assert!(render_layout_single_caption_chart_document(&doc).is_none());
13220    }
13221
13222    #[test]
13223    fn test_blank_right_column_table_is_not_misrendered_as_toc() {
13224        let mut doc = PdfDocument::new("flocculation-table.pdf".to_string());
13225        doc.number_of_pages = 1;
13226        doc.kids.push(make_two_column_table(&[
13227            (
13228                "Added cation",
13229                "Relative Size & Settling Rates of Floccules",
13230            ),
13231            ("K+", ""),
13232            ("Na+", ""),
13233            ("Ca2+", ""),
13234        ]));
13235
13236        let md = to_markdown(&doc).unwrap();
13237        assert!(md.contains("| Added cation | Relative Size & Settling Rates of Floccules |"));
13238        assert!(md.contains("| K+ |  |"));
13239    }
13240
13241    #[test]
13242    fn test_infographic_card_table_renders_as_numbered_item() {
13243        let mut doc = PdfDocument::new("infographic-card.pdf".to_string());
13244        doc.number_of_pages = 1;
13245        doc.kids.push(make_two_column_table(&[
13246            (
13247                "1",
13248                "We're all both consumers and creators of creative work.",
13249            ),
13250            (
13251                "",
13252                "As consumers, we watch movies, listen to music, read books, and more.",
13253            ),
13254        ]));
13255
13256        let md = to_markdown(&doc).unwrap();
13257        assert!(md.contains(
13258            "1. We're all both consumers and creators of creative work. As consumers, we watch movies, listen to music, read books, and more."
13259        ));
13260        assert!(!md.contains("| 1 |"));
13261    }
13262
13263    #[test]
13264    fn test_grouped_header_rows_are_preserved_without_flattening() {
13265        let mut doc = PdfDocument::new("grouped-header.pdf".to_string());
13266        doc.number_of_pages = 1;
13267        doc.kids.push(make_n_column_table(
13268            &[
13269                vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
13270                vec![
13271                    "",
13272                    "Alpaca-GPT4",
13273                    "OpenOrca",
13274                    "Synth. Math-Instruct",
13275                    "Orca DPO Pairs",
13276                    "Ultrafeedback Cleaned",
13277                    "Synth. Math-Alignment",
13278                ],
13279                vec![
13280                    "Total # Samples",
13281                    "52K",
13282                    "2.91M",
13283                    "126K",
13284                    "12.9K",
13285                    "60.8K",
13286                    "126K",
13287                ],
13288            ],
13289            &[
13290                (72.0, 120.0),
13291                (120.0, 170.0),
13292                (170.0, 220.0),
13293                (220.0, 280.0),
13294                (280.0, 340.0),
13295                (340.0, 410.0),
13296                (410.0, 470.0),
13297            ],
13298        ));
13299
13300        let md = to_markdown(&doc).unwrap();
13301        assert!(md.contains(
13302            "| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"
13303        ));
13304        assert!(md.contains(
13305            "|  | Alpaca-GPT4 | OpenOrca | Synth. Math-Instruct | Orca DPO Pairs | Ultrafeedback Cleaned | Synth. Math-Alignment |"
13306        ));
13307        assert!(!md.contains("Instruction OpenOrca"));
13308        assert!(!md.contains("Alignment Ultrafeedback"));
13309    }
13310
13311    #[test]
13312    fn test_top_table_plate_renderer_stops_before_article_body() {
13313        let mut doc = PdfDocument::new("table-plate.pdf".to_string());
13314        doc.number_of_pages = 1;
13315        doc.kids
13316            .push(make_paragraph_at(72.0, 724.0, 200.0, 736.0, "SOLAR 10.7B"));
13317        doc.kids.push(make_paragraph_at(
13318            72.0,
13319            704.0,
13320            220.0,
13321            716.0,
13322            "Training datasets",
13323        ));
13324        doc.kids.push(make_n_column_table(
13325            &[
13326                vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
13327                vec![
13328                    "",
13329                    "Alpaca-GPT4",
13330                    "OpenOrca",
13331                    "Synth. Math-Instruct",
13332                    "Orca DPO Pairs",
13333                    "Ultrafeedback Cleaned",
13334                    "Synth. Math-Alignment",
13335                ],
13336                vec![
13337                    "Total # Samples",
13338                    "52K",
13339                    "2.91M",
13340                    "126K",
13341                    "12.9K",
13342                    "60.8K",
13343                    "126K",
13344                ],
13345                vec![
13346                    "Maximum # Samples Used",
13347                    "52K",
13348                    "100K",
13349                    "52K",
13350                    "12.9K",
13351                    "60.8K",
13352                    "20.1K",
13353                ],
13354                vec!["Open Source", "O", "O", "✗", "O", "O", "✗"],
13355            ],
13356            &[
13357                (78.0, 125.0),
13358                (125.0, 175.0),
13359                (175.0, 225.0),
13360                (225.0, 285.0),
13361                (285.0, 345.0),
13362                (345.0, 415.0),
13363                (415.0, 490.0),
13364            ],
13365        ));
13366        doc.kids.push(make_paragraph_at(
13367            72.0,
13368            500.0,
13369            310.0,
13370            514.0,
13371            "Table 1: Training datasets used for the instruction and alignment tuning stages, respectively.",
13372        ));
13373        doc.kids.push(make_paragraph_at(
13374            286.0,
13375            484.0,
13376            526.0,
13377            498.0,
13378            "Open source indicates whether the dataset is open-sourced.",
13379        ));
13380        doc.kids.push(make_paragraph_at(
13381            72.0,
13382            360.0,
13383            290.0,
13384            388.0,
13385            "Comparison to other up-scaling methods. Unlike Komatsuzaki et al. (2022)...",
13386        ));
13387
13388        let md = to_markdown(&doc).unwrap();
13389        assert!(md.contains("Table 1: Training datasets used for the instruction"));
13390        assert!(md.contains("| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"));
13391        assert!(!md.contains("Comparison to other up-scaling methods"));
13392    }
13393
13394    #[test]
13395    fn test_late_section_boundary_renderer_drops_equation_carryover() {
13396        let mut doc = PdfDocument::new("late-section.pdf".to_string());
13397        doc.number_of_pages = 1;
13398        doc.kids.push(make_paragraph_at(
13399            72.0,
13400            700.0,
13401            540.0,
13402            714.0,
13403            "The horizontal distance traveled by the jet is equal to:",
13404        ));
13405        doc.kids.push(make_paragraph_at(
13406            72.0,
13407            640.0,
13408            540.0,
13409            654.0,
13410            "The vertical position of the jet may be calculated as:",
13411        ));
13412        doc.kids.push(make_paragraph_at(
13413            72.0,
13414            580.0,
13415            260.0,
13416            594.0,
13417            "Rearranging Equation (8) gives:",
13418        ));
13419        doc.kids.push(make_paragraph_at(
13420            72.0,
13421            520.0,
13422            420.0,
13423            534.0,
13424            "Substitution into Equation 7 results in:",
13425        ));
13426        doc.kids.push(make_paragraph_at(
13427            72.0,
13428            460.0,
13429            280.0,
13430            474.0,
13431            "Equations (10) can be rearranged to find Cv:",
13432        ));
13433        doc.kids.push(make_heading_at(
13434            72.0,
13435            350.0,
13436            420.0,
13437            366.0,
13438            "7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE",
13439        ));
13440        doc.kids.push(make_paragraph_at(
13441            72.0,
13442            326.0,
13443            380.0,
13444            340.0,
13445            "If C_d is assumed to be constant, then a graph of Q plotted against",
13446        ));
13447        doc.kids.push(make_paragraph_at(
13448            400.0,
13449            326.0,
13450            540.0,
13451            340.0,
13452            "(Equation 6) will be linear, and",
13453        ));
13454        doc.kids.push(make_paragraph_at(
13455            72.0,
13456            310.0,
13457            240.0,
13458            324.0,
13459            "the slope of this graph will be:",
13460        ));
13461        doc.kids.push(make_paragraph_at(
13462            360.0,
13463            36.0,
13464            550.0,
13465            48.0,
13466            "EXPERIMENT #6: ORIFICE AND FREE JET FLOW 53",
13467        ));
13468
13469        let md = to_markdown(&doc).unwrap();
13470        assert!(md.starts_with("# 7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE"));
13471        assert!(md.contains(
13472            "If C_d is assumed to be constant, then a graph of Q plotted against (Equation 6) will be linear, and the slope of this graph will be:"
13473        ));
13474        assert!(!md.contains("The horizontal distance traveled by the jet"));
13475        assert!(!md.contains("EXPERIMENT #6"));
13476    }
13477
13478    #[test]
13479    fn test_leading_table_carryover_row_is_trimmed_from_general_renderer() {
13480        let mut doc = PdfDocument::new("carryover-table.pdf".to_string());
13481        doc.number_of_pages = 1;
13482        doc.kids.push(make_n_column_table(
13483            &[
13484                vec![
13485                    "Jurisdiction",
13486                    "GATS XVII Reservation (1994)",
13487                    "Foreign Ownership Permitted",
13488                    "Restrictions on Foreign Ownership",
13489                    "Foreign Ownership Reporting Requirements",
13490                ],
13491                vec![
13492                    "",
13493                    "",
13494                    "",
13495                    "right required to acquire desert lands and continue the prior page",
13496                    "",
13497                ],
13498                vec!["Finland", "N", "Y", "Prior approval may be required.", ""],
13499                vec!["France", "N", "Y", "None.", ""],
13500            ],
13501            &[
13502                (72.0, 150.0),
13503                (150.0, 235.0),
13504                (235.0, 330.0),
13505                (330.0, 500.0),
13506                (500.0, 560.0),
13507            ],
13508        ));
13509
13510        let md = to_markdown(&doc).unwrap();
13511        assert!(!md.contains("right required to acquire desert lands"));
13512        assert!(md.contains("| Finland | N | Y | Prior approval may be required. |  |"));
13513    }
13514
13515    #[test]
13516    fn test_single_table_report_renderer_promotes_title_and_skips_footer() {
13517        let mut doc = PdfDocument::new("single-table-report.pdf".to_string());
13518        doc.number_of_pages = 1;
13519        doc.kids.push(make_paragraph_at(
13520            140.0,
13521            674.0,
13522            474.0,
13523            688.0,
13524            "Restrictions on Land Ownership by Foreigners in Selected Jurisdictions",
13525        ));
13526        doc.kids.push(make_n_column_table(
13527            &[
13528                vec![
13529                    "Jurisdiction",
13530                    "GATS XVII Reservation (1994)",
13531                    "Foreign Ownership Permitted",
13532                    "Restrictions on Foreign Ownership",
13533                    "Foreign Ownership Reporting Requirements",
13534                ],
13535                vec![
13536                    "",
13537                    "",
13538                    "",
13539                    "right required to acquire desert lands and continue the prior page",
13540                    "",
13541                ],
13542                vec![
13543                    "Finland",
13544                    "N",
13545                    "Y",
13546                    "Prior approval from the Government of Aland may be required.",
13547                    "",
13548                ],
13549                vec!["France", "N", "Y", "None.", ""],
13550            ],
13551            &[
13552                (72.0, 150.0),
13553                (150.0, 235.0),
13554                (235.0, 330.0),
13555                (330.0, 500.0),
13556                (500.0, 560.0),
13557            ],
13558        ));
13559        doc.kids.push(make_paragraph_at(
13560            350.0,
13561            36.0,
13562            548.0,
13563            48.0,
13564            "The Law Library of Congress 7",
13565        ));
13566
13567        let md = to_markdown(&doc).unwrap();
13568        assert!(md.starts_with(
13569            "# Restrictions on Land Ownership by Foreigners in Selected Jurisdictions"
13570        ));
13571        assert!(!md.contains("right required to acquire desert lands"));
13572        assert!(!md.contains("The Law Library of Congress 7"));
13573        assert!(md.contains(
13574            "| Finland | N | Y | Prior approval from the Government of Aland may be required. |  |"
13575        ));
13576    }
13577
13578    #[test]
13579    fn test_hyphenated_table_title_continuation_renders_as_heading() {
13580        let mut doc = PdfDocument::new("hyphenated-table-title.pdf".to_string());
13581        doc.number_of_pages = 1;
13582        doc.kids.push(make_paragraph_at(
13583            72.0,
13584            724.0,
13585            520.0,
13586            738.0,
13587            "With this in mind, here we have the 7 key competence areas selected to form a part of Eco-",
13588        ));
13589        doc.kids.push(make_paragraph_at(
13590            72.0,
13591            704.0,
13592            260.0,
13593            718.0,
13594            "Circle's Competence Framework:",
13595        ));
13596        doc.kids.push(make_n_column_table(
13597            &[
13598                vec!["Eco-Circle Competence Framework"],
13599                vec!["#1: The 3 Rs: Recycle-Reuse-Reduce"],
13600                vec!["#2: Lifecycle of Circular Economy"],
13601            ],
13602            &[(140.0, 460.0)],
13603        ));
13604
13605        let md = to_markdown(&doc).unwrap();
13606        assert!(md.contains("# Circle's Competence Framework:"), "{md}");
13607    }
13608
13609    #[test]
13610    fn test_duplicate_table_header_heading_is_demoted() {
13611        let mut doc = PdfDocument::new("duplicate-table-header-heading.pdf".to_string());
13612        doc.number_of_pages = 1;
13613        doc.kids
13614            .push(make_heading("MOHAVE COMMUNITY COLLEGE BIO181"));
13615        doc.kids.push(make_n_column_table(
13616            &[
13617                vec![
13618                    "",
13619                    "Saccharometer",
13620                    "DI Water",
13621                    "Glucose Solution",
13622                    "Yeast Suspension",
13623                ],
13624                vec!["1", "", "8 ml", "6 ml", "0 ml"],
13625                vec!["2", "", "12 ml", "0 ml", "2 ml"],
13626                vec!["3", "", "6 ml", "6 ml", "2 ml"],
13627            ],
13628            &[
13629                (72.0, 110.0),
13630                (110.0, 210.0),
13631                (210.0, 300.0),
13632                (300.0, 430.0),
13633                (430.0, 540.0),
13634            ],
13635        ));
13636        doc.kids.push(make_heading_at(
13637            72.0,
13638            92.0,
13639            390.0,
13640            108.0,
13641            "Saccharometer DI Water Glucose Solution Yeast Suspension",
13642        ));
13643        doc.kids
13644            .push(make_paragraph_at(72.0, 72.0, 120.0, 88.0, "below"));
13645        doc.kids
13646            .push(make_paragraph_at(72.0, 56.0, 240.0, 72.0, "1 16 ml 12 ml"));
13647        doc.kids
13648            .push(make_paragraph_at(296.0, 56.0, 340.0, 72.0, "0 ml"));
13649
13650        let md = to_markdown(&doc).unwrap();
13651        assert!(
13652            md.contains("Saccharometer DI Water Glucose Solution Yeast Suspension"),
13653            "{md}"
13654        );
13655        assert!(
13656            !md.contains("# Saccharometer DI Water Glucose Solution Yeast Suspension"),
13657            "{md}"
13658        );
13659    }
13660
#[test]
fn test_geometric_panel_headers_are_promoted_into_table() {
    // Single-chunk lines placed ahead of the table, in push order. Each entry is
    // (text, chunk extent start, chunk extent end, first line coordinate, second line coordinate).
    // The first three entries share one coordinate pair, the remaining four share another.
    // NOTE(review): the extents look like horizontal positions and the trailing pair like a
    // vertical band — confirm against `make_chunked_paragraph_line`.
    let loose_lines = [
        ("OCR", 220.0, 250.0, 720.0, 732.0),
        ("Recommendation", 430.0, 540.0, 720.0, 732.0),
        ("Product semantic search", 660.0, 860.0, 720.0, 732.0),
        ("Pack", 72.0, 110.0, 684.0, 696.0),
        (
            "A solution that recognizes characters",
            140.0,
            340.0,
            684.0,
            696.0,
        ),
        (
            "A solution that recommends the best products",
            390.0,
            620.0,
            684.0,
            696.0,
        ),
        (
            "A solution that enables semantic search",
            650.0,
            900.0,
            684.0,
            696.0,
        ),
    ];

    // Three-column table body that follows the loose lines.
    let table_rows = [
        vec![
            "Achieved 1st place in the OCR World Competition",
            "Team with specialists and technologies",
            "Creation of the first natural language evaluation",
        ],
        vec![
            "The team includes specialists who have",
            "received Kaggle's Gold Medal recommendation",
            "system in Korean (KLUE)",
        ],
        vec![
            "presented 14 papers in renowned AI conferences",
            "top-tier recommendation",
            "Shopee subject",
        ],
    ];
    let column_ranges = [(120.0, 360.0), (360.0, 630.0), (630.0, 910.0)];

    let mut doc = PdfDocument::new("ai-pack-panel.pdf".to_string());
    for (text, start, end, band_lo, band_hi) in loose_lines {
        doc.kids.push(make_chunked_paragraph_line(&[(text, start, end)], band_lo, band_hi));
    }
    doc.kids
        .push(make_n_column_table(&table_rows, &column_ranges));
    // A stray trailing word after the table; the last assertion expects it to be
    // joined onto the middle column's text.
    doc.kids.push(make_chunked_paragraph_line(
        &[("models", 430.0, 490.0)],
        552.0,
        564.0,
    ));

    let rendered = to_markdown(&doc).unwrap();

    // Header labels become a table row with the stub label leading.
    assert!(rendered.contains("| Pack | OCR | Recommendation | Product semantic search |"));
    // The description line is rendered as a row as well.
    assert!(rendered.contains("| A solution that recognizes characters | A solution that recommends the best products | A solution that enables semantic search |"));
    // Middle-column fragments and the trailing word end up in one run of text.
    assert!(rendered.contains(
        "received Kaggle's Gold Medal recommendation top-tier recommendation models"
    ));
}
13732
#[test]
fn test_embedded_stub_header_is_promoted_from_first_table_column() {
    // Header labels that sit outside the table, all built with the same trailing
    // coordinate pair (720.0, 732.0).
    let header_labels = [
        ("OCR", 220.0, 250.0),
        ("Recommendation", 430.0, 540.0),
        ("Product semantic search", 660.0, 860.0),
    ];

    // Four-column table whose first cell in every row is a short stub label.
    let table_rows = [
        vec![
            "Pack",
            "A solution that recognizes characters in an image and extracts necessary information",
            "A solution that recommends the best products and contents",
            "A solution that enables semantic search and organizes key information",
        ],
        vec![
            "Application",
            "Applicable to all fields that require text extraction",
            "Applicable to all fields that use any form of recommendation",
            "Applicable to all fields that deal with unstructured data",
        ],
        vec![
            "Highlight",
            "Achieved 1st place in the OCR World Competition",
            "Received Kaggle's Gold Medal recommendation",
            "Creation of the first natural language evaluation system in Korean",
        ],
    ];
    let column_ranges = [
        (72.0, 120.0),
        (120.0, 360.0),
        (360.0, 630.0),
        (630.0, 910.0),
    ];

    let mut doc = PdfDocument::new("embedded-stub-header.pdf".to_string());
    for (label, start, end) in header_labels {
        doc.kids
            .push(make_chunked_paragraph_line(&[(label, start, end)], 720.0, 732.0));
    }
    doc.kids
        .push(make_n_column_table(&table_rows, &column_ranges));

    let rendered = to_markdown(&doc).unwrap();

    // The stub label from the table's first column heads the promoted header row.
    assert!(rendered.contains("| Pack | OCR | Recommendation | Product semantic search |"));
    // Remaining rows keep their stub label as the first cell.
    assert!(
        rendered.contains("| Application | Applicable to all fields that require text extraction |")
    );
    assert!(rendered.contains("| Highlight | Achieved 1st place in the OCR World Competition |"));
    // The labels must not survive as free-standing paragraphs.
    assert!(!rendered.contains("OCR\n\nRecommendation\n\nProduct semantic search"));
}
13788
#[test]
fn test_geometric_chunk_alignment_splits_header_line_into_columns() {
    // Seven back-to-back candidate slots (each starts where the previous one ends).
    let slots = [
        (72.0, 170.0),
        (170.0, 280.0),
        (280.0, 380.0),
        (380.0, 480.0),
        (480.0, 600.0),
        (600.0, 720.0),
        (720.0, 850.0),
    ];
    // Slot index and text every produced fragment must carry, in order.
    let expected = [(0, "Properties"), (1, "Instruction"), (4, "Alignment")];

    let header_line = make_chunked_paragraph_line(
        &[
            ("Properties", 72.0, 145.0),
            ("Instruction", 180.0, 255.0),
            ("Alignment", 480.0, 545.0),
        ],
        720.0,
        732.0,
    );
    let chunk_lines = extract_chunk_lines(&header_line);
    let fragments = split_line_into_slot_fragments(&chunk_lines[0], &slots);

    // Exactly one fragment per chunk; the four slots without a chunk yield nothing.
    assert_eq!(fragments.len(), 3);
    for (fragment, (slot_idx, text)) in fragments.iter().zip(expected) {
        assert_eq!(fragment.slot_idx, slot_idx);
        assert_eq!(fragment.text, text);
    }
}
13822
#[test]
fn test_merge_tables_across_heading() {
    // Two two-column pipe tables separated only by a `#` heading.
    let input = concat!(
        "some text\n\n",
        "| Area | Competence |\n",
        "| --- | --- |\n",
        "| Row1 | Val1 |\n",
        "| Row2 | Val2 |\n",
        "\n",
        "# Heading Between\n",
        "\n",
        "| Row3 | Val3 |\n",
        "| --- | --- |\n",
        "\n",
        "more text\n",
    );

    let merged = merge_adjacent_pipe_tables(input);

    // The heading text is expected to reappear as a table row...
    assert!(
        merged.contains("| Heading Between |"),
        "Heading should be in pipe row: {}",
        merged
    );
    // ...and the Markdown heading marker must be gone.
    assert!(
        !merged.contains("# Heading Between"),
        "Heading marker should be removed: {}",
        merged
    );
    // Content of the second table must survive in some form. A bare substring
    // check is used because it also covers the pipe-delimited `| Row3 |` form.
    assert!(merged.contains("Row3"), "Row3 should exist: {}", merged);
}
13857
#[test]
fn test_merge_tables_does_not_cross_distinct_headers() {
    // A two-column table and a three-column table with a caption line between them.
    let input = concat!(
        "| Model | Score |\n",
        "| --- | --- |\n",
        "| A | 1 |\n",
        "\n",
        "Table 6: Performance comparison amongst the merge candidates.\n",
        "\n",
        "| Model | Method | Score |\n",
        "| --- | --- | --- |\n",
        "| B | Avg | 2 |\n",
    );

    let merged = merge_adjacent_pipe_tables(input);

    // The caption stays as plain text and both header rows remain intact.
    assert!(merged.contains("Table 6: Performance comparison amongst the merge candidates."));
    assert!(merged.contains("| Model | Score |"));
    assert!(merged.contains("| Model | Method | Score |"));
    // The caption must not have been turned into a table row.
    assert!(
        !merged.contains("| Table 6: Performance comparison amongst the merge candidates. |")
    );
}
13878
#[test]
fn test_normalize_chart_like_markdown_extracts_series_tables() {
    // A figure caption fused with data values and axis ticks, then a line mixing
    // year labels with the source note, then a trailing "ASEAN Migration Outlook 19" line.
    let input = concat!(
        "Figure 1.7. Non-citizen population in Malaysia (in thousands) 3,323 3,500 3,288 3,230 3,140 2,907 3,000 2,693 2,500 2,000 1,500 1,000 500 0\n\n",
        "2016 2017 2018 2019 2020 2021 Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.\n\n",
        "ASEAN Migration Outlook 19\n",
    );

    let output = normalize_chart_like_markdown(input);

    // Caption is promoted to a level-two heading.
    assert!(
        output.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
    );
    // First and last year are paired with their values as table rows.
    assert!(output.contains("| 2016 | 3,323 |"));
    assert!(output.contains("| 2021 | 2,693 |"));
    // Source note is kept, wrapped in emphasis markers.
    assert!(output.contains(
        "*Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.*"
    ));
    // The trailing line is removed.
    assert!(!output.contains("ASEAN Migration Outlook 19"));
}
13896
#[test]
fn test_normalize_chart_like_markdown_promotes_structural_captions() {
    // A figure caption broken across two paragraphs, followed by ordinary body text.
    let input = concat!(
        "Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or\n\n",
        "The Wonderful Lamp.\n\n",
        "Body paragraph.\n",
    );

    let output = normalize_chart_like_markdown(input);

    // Both caption pieces are joined into a single level-two heading.
    assert!(output.contains(
        "## Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or The Wonderful Lamp"
    ));
    // The body text is left in place.
    assert!(output.contains("Body paragraph."));
}
13909
#[test]
fn test_normalize_chart_like_markdown_reconstructs_header_pair_chart_table() {
    // A caption, then a header-only pipe table whose cells pair a value with a year
    // (plus two leading cells that carry no such pair), then a source line.
    let input = concat!(
        "Figure 4.8. Domestic Wood Pellets Production\n\n",
        "| 8 | 800 200 | 126 2014 | 120 2015 | 120 2016 | 127 2017 | 131 2018 | 147 2019 |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n\n",
        "Source: Forestry Agency, Ministry of Agriculture, Forestry and Fishery (MAFF), 2020.\n",
    );

    let output = normalize_chart_like_markdown(input);

    // Caption becomes a heading (this substring matches any heading depth).
    assert!(output.contains("# Figure 4.8. Domestic Wood Pellets Production"));
    // A year/value table is rebuilt, with the series name taken from the caption.
    assert!(output.contains("| Year | Domestic Wood Pellets Production |"));
    assert!(output.contains("| 2014 | 126 |"));
    assert!(output.contains("| 2019 | 147 |"));
    // The original garbled header row is gone.
    assert!(!output.contains("| 8 | 800 200 |"));
}
13924
#[test]
fn test_normalize_chart_like_markdown_drops_numeric_axis_artifact_table() {
    // A one-column pipe table holding nothing but flattened chart numbers and
    // labels, followed by the chart's title line.
    let input = concat!(
        "| 31 1 0 2 23 2 2 2 0 5 10 15 20 25 30 35 Event Celebration Information Videograph 2019 2020 |\n",
        "| --- |\n\n",
        "Distribution of Komnas HAM's YouTube Content (2019-2020)\n",
    );

    let output = normalize_chart_like_markdown(input);

    // No table separator may remain, while the title line is preserved.
    assert!(!output.contains("| --- |"));
    assert!(output.contains("Distribution of Komnas HAM's YouTube Content (2019-2020)"));
}
13935
#[test]
fn test_normalize_chart_like_markdown_drops_url_fragment_table() {
    // A figure heading, a one-cell pipe table containing only a URL tail, and a
    // footnote-style line.
    let input = concat!(
        "## Figure 6 DPN Argentina Content: World Health Day Celebration\n\n",
        "| na/status/1379765916259483648 |\n",
        "| --- |\n\n",
        "98 DPN Argentina, accessed on 5 December 2021.\n",
    );

    let output = normalize_chart_like_markdown(input);

    // The URL fragment row is removed; the footnote line stays.
    assert!(!output.contains("/status/1379765916259483648 |"));
    assert!(output.contains("98 DPN Argentina, accessed on 5 December 2021."));
}
13947
#[test]
fn test_normalize_chart_like_markdown_drops_sparse_table_before_caption() {
    // A prose sentence, a mostly-empty eight-column pipe table, and a figure
    // caption directly after it.
    let input = concat!(
        "What’s unique about the growth of Alligator Gars is their fast growth.\n\n",
        "| in | cm |  | Length | of | Gar | Fish | Age |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
        "| 120) | 300 |  |  |  |  |  |  |\n",
        "| 100+ | 250 |  |  |  |  |  |  |\n",
        "| 80+ | 200 |  |  |  |  |  |  |\n",
        "| 20. | 50 | G |  |  |  |  | Vi |\n",
        "| 0 | 0 |  |  |  |  |  |  |\n",
        "|  | 0 | 10 | 30 |  | 40 | 50 | 60 |\n\n",
        "Figure 8.6: Growth in length of Alligator Gar in Texas.\n",
    );

    let output = normalize_chart_like_markdown(input);

    // The sparse table's header row disappears while the caption is kept.
    assert!(!output.contains("| in | cm |"));
    assert!(output.contains("Figure 8.6: Growth in length of Alligator Gar in Texas."));
}
13965
#[test]
fn test_normalize_chart_like_markdown_trims_large_top_table_plate() {
    // The input opens with an eight-column table (header, separator, eight
    // identical data rows) and continues with a caption, a heading and prose.
    let input = concat!(
        "| A | B | C | D | E | F | G | H |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\n",
        "Table 2: Evaluation results for SOLAR 10.7B and SOLAR 10.7B-Instruct along with other top-performing models in the paper.\n\n",
        "# 4.2 Main Results\n\n",
        "The surrounding prose should be dropped.\n",
    );

    let output = normalize_chart_like_markdown(input);

    // The table still leads the output...
    assert!(output.starts_with("| A | B | C | D | E | F | G | H |"));
    // ...and the caption, heading and prose that followed it are all removed.
    assert!(!output.contains("Table 2:"));
    assert!(!output.contains("4.2 Main Results"));
    assert!(!output.contains("surrounding prose"));
}
13988}
edgeparse_core/output/markdown.rs

edgeparse_core/output/
markdown.rs