Skip to main content

edgeparse_core/output/
markdown.rs

//! Markdown output generator.
2
3use crate::models::content::ContentElement;
4use crate::models::document::PdfDocument;
5use crate::models::enums::SemanticType;
6use crate::models::table::TableTokenRow;
7use crate::EdgePdfError;
8
9/// Generate Markdown representation of a PdfDocument.
10///
11/// # Errors
12/// Returns `EdgePdfError::OutputError` on write failures.
13pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
14    if looks_like_contents_document(doc) {
15        return Ok(render_contents_document(doc));
16    }
17    if looks_like_compact_toc_document(doc) {
18        return Ok(render_compact_toc_document(doc));
19    }
20
21    let mut output = String::new();
22
23    // Title
24    if let Some(ref title) = doc.title {
25        let trimmed = title.trim();
26        if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
27            if should_render_document_title_as_plaintext(doc, trimmed) {
28                output.push_str(trimmed);
29                output.push_str("\n\n");
30            } else {
31                output.push_str(&format!("# {}\n\n", trimmed));
32            }
33        }
34    }
35
36    if doc.kids.is_empty() {
37        output.push_str("*No content extracted.*\n");
38        return Ok(output);
39    }
40
41    let mut i = 0usize;
42    while i < doc.kids.len() {
43        match &doc.kids[i] {
44            ContentElement::Heading(h) => {
45                let text = h.base.base.value();
46                let trimmed = text.trim();
47                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
48                    i += 1;
49                    continue;
50                }
51
52                // Demote headings that sit in the bottom margin of the page
53                // (running footers misclassified as headings by the pipeline).
54                if looks_like_bottom_margin_heading(doc, i) {
55                    output.push_str(&escape_md_line_start(trimmed));
56                    output.push_str("\n\n");
57                    i += 1;
58                    continue;
59                }
60
61                // Demote pipeline headings that look like sentence fragments
62                // ending with a period but are not numbered section headings.
63                if should_demote_period_heading(trimmed) {
64                    output.push_str(&escape_md_line_start(trimmed));
65                    output.push_str("\n\n");
66                    i += 1;
67                    continue;
68                }
69
70                // Demote headings ending with comma (footnotes / data labels).
71                if should_demote_comma_heading(trimmed) {
72                    output.push_str(&escape_md_line_start(trimmed));
73                    output.push_str("\n\n");
74                    i += 1;
75                    continue;
76                }
77
78                // Demote headings containing math symbols.
79                if should_demote_math_heading(trimmed) {
80                    output.push_str(&escape_md_line_start(trimmed));
81                    output.push_str("\n\n");
82                    i += 1;
83                    continue;
84                }
85
86                // Demote headings containing percentage signs.
87                if should_demote_percentage_heading(trimmed) {
88                    output.push_str(&escape_md_line_start(trimmed));
89                    output.push_str("\n\n");
90                    i += 1;
91                    continue;
92                }
93
94                // Demote headings that start with a known caption prefix
95                // (e.g. "Source:", "Figure", "Table") — these are captions,
96                // not section headings, regardless of pipeline classification.
97                if starts_with_caption_prefix(trimmed) {
98                    output.push_str(&escape_md_line_start(trimmed));
99                    output.push_str("\n\n");
100                    i += 1;
101                    continue;
102                }
103
104                // Demote bibliography entries: lines starting with a 4-digit
105                // year followed by a period (e.g. "2020. Title of paper...").
106                if should_demote_bibliography_heading(trimmed) {
107                    output.push_str(&escape_md_line_start(trimmed));
108                    output.push_str("\n\n");
109                    i += 1;
110                    continue;
111                }
112
113                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
114                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
115                        let mut merged = trimmed.to_string();
116                        merge_paragraph_text(&mut merged, &next_text);
117                        output.push_str(&escape_md_line_start(merged.trim()));
118                        output.push_str("\n\n");
119                        i += 2;
120                        continue;
121                    }
122                }
123
124                // Merge consecutive heading fragments.
125                // When the PDF splits a title across multiple text elements,
126                // each becomes a separate heading; merge them into one.
127                let mut merged_heading = trimmed.to_string();
128                while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
129                    let next_text = next_h.base.base.value();
130                    let next_trimmed = next_text.trim();
131                    if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
132                        i += 1;
133                        continue;
134                    }
135                    // Only merge if the combined text stays under max heading length
136                    if merged_heading.len() + 1 + next_trimmed.len() > 200 {
137                        break;
138                    }
139                    merge_paragraph_text(&mut merged_heading, next_trimmed);
140                    i += 1;
141                }
142
143                let cleaned_heading = strip_trailing_page_number(merged_heading.trim());
144
145                // Check if this heading contains a merged subsection
146                if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
147                    let first = cleaned_heading[..split_pos].trim();
148                    let second = cleaned_heading[split_pos..].trim();
149                    output.push_str(&format!("# {}\n\n", first));
150                    output.push_str(&format!("# {}\n\n", second));
151                } else {
152                    output.push_str(&format!("# {}\n\n", cleaned_heading));
153                }
154            }
155            ContentElement::NumberHeading(nh) => {
156                let text = nh.base.base.base.value();
157                let trimmed = text.trim();
158                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
159                    i += 1;
160                    continue;
161                }
162
163                // Demote number headings ending with comma (footnotes).
164                if should_demote_comma_heading(trimmed) {
165                    output.push_str(&escape_md_line_start(trimmed));
166                    output.push_str("\n\n");
167                    i += 1;
168                    continue;
169                }
170
171                // Demote number headings containing math symbols.
172                if should_demote_math_heading(trimmed) {
173                    output.push_str(&escape_md_line_start(trimmed));
174                    output.push_str("\n\n");
175                    i += 1;
176                    continue;
177                }
178
179                // Demote number headings containing percentage signs.
180                if should_demote_percentage_heading(trimmed) {
181                    output.push_str(&escape_md_line_start(trimmed));
182                    output.push_str("\n\n");
183                    i += 1;
184                    continue;
185                }
186
187                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
188                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
189                        let mut merged = trimmed.to_string();
190                        merge_paragraph_text(&mut merged, &next_text);
191                        output.push_str(&escape_md_line_start(merged.trim()));
192                        output.push_str("\n\n");
193                        i += 2;
194                        continue;
195                    }
196                }
197
198                let cleaned = strip_trailing_page_number(trimmed);
199
200                // Check if this heading contains a merged subsection
201                if let Some(split_pos) = find_merged_subsection_split(cleaned) {
202                    let first = cleaned[..split_pos].trim();
203                    let second = cleaned[split_pos..].trim();
204                    output.push_str(&format!("# {}\n\n", first));
205                    output.push_str(&format!("# {}\n\n", second));
206                } else {
207                    output.push_str(&format!("# {}\n\n", cleaned));
208                }
209            }
210            ContentElement::Paragraph(_)
211            | ContentElement::TextBlock(_)
212            | ContentElement::TextLine(_) => {
213                let element = &doc.kids[i];
214                let text = match &doc.kids[i] {
215                    ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
216                    ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
217                    ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
218                    _ => unreachable!(),
219                };
220                let trimmed = text.trim();
221                if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
222                    i += 1;
223                    continue;
224                }
225
226                if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
227                    let cleaned = strip_trailing_page_number(trimmed);
228                    // Check if this heading contains a merged subsection
229                    if let Some(split_pos) = find_merged_subsection_split(cleaned) {
230                        let first = cleaned[..split_pos].trim();
231                        let second = cleaned[split_pos..].trim();
232                        output.push_str(&format!("# {}\n\n", first));
233                        output.push_str(&format!("# {}\n\n", second));
234                    } else {
235                        output.push_str(&format!("# {}\n\n", cleaned));
236                    }
237                    i += 1;
238                    continue;
239                }
240
241                if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
242                {
243                    output.push_str(&escape_md_line_start(trimmed));
244                    output.push('\n');
245                    i += 1;
246                    continue;
247                }
248
249                if is_short_caption_label(trimmed) {
250                    if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
251                        if let Some((caption_tail, body)) =
252                            split_following_caption_tail_and_body(&next_text)
253                        {
254                            let mut caption = trimmed.to_string();
255                            caption.push('\n');
256                            caption.push_str(caption_tail);
257                            output.push_str(&escape_md_line_start(caption.trim()));
258                            output.push_str("\n\n");
259                            output.push_str(&escape_md_line_start(body));
260                            output.push_str("\n\n");
261                            i += 2;
262                            continue;
263                        }
264
265                        if looks_like_caption_tail(&next_text) {
266                            let mut caption = trimmed.to_string();
267                            caption.push('\n');
268                            caption.push_str(next_text.trim());
269
270                            if let Some(year_text) =
271                                next_mergeable_paragraph_text(doc.kids.get(i + 2))
272                            {
273                                if looks_like_caption_year(&year_text) {
274                                    caption.push('\n');
275                                    caption.push_str(year_text.trim());
276                                    i += 1;
277                                }
278                            }
279
280                            output.push_str(&escape_md_line_start(caption.trim()));
281                            output.push_str("\n\n");
282                            i += 2;
283                            continue;
284                        }
285                    }
286                }
287
288                if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
289                    output.push_str(&escape_md_line_start(caption));
290                    output.push_str("\n\n");
291                    output.push_str(&escape_md_line_start(body));
292                    output.push_str("\n\n");
293                    i += 1;
294                    continue;
295                }
296
297                let mut merged = trimmed.to_string();
298                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
299                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
300                        should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
301                    } else {
302                        should_merge_paragraph_text(&merged, &next_text)
303                    };
304                    if !can_merge {
305                        break;
306                    }
307                    merge_paragraph_text(&mut merged, &next_text);
308                    i += 1;
309                }
310
311                output.push_str(&escape_md_line_start(merged.trim()));
312                output.push_str("\n\n");
313            }
314            other => render_element(&mut output, other),
315        }
316        i += 1;
317    }
318
319    // Post-processing: merge adjacent pipe tables that share the same
320    // column count.  The table detector sometimes emits highlighted or
321    // coloured rows as separate tables.
322    let output = merge_adjacent_pipe_tables(&output);
323
324    Ok(output)
325}
326
327fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
328    first_heading_like_text(doc)
329        .filter(|first| !equivalent_heading_text(first, title))
330        .is_some()
331}
332
333fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
334    if title.split_whitespace().count() > 6 {
335        return false;
336    }
337
338    let mut early = doc.kids.iter().take(6);
339    let has_explicit_heading = early.clone().any(|element| {
340        matches!(
341            element,
342            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
343        )
344    });
345    let has_tableish_content = early.any(|element| {
346        matches!(
347            element,
348            ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
349        )
350    });
351
352    has_tableish_content && !has_explicit_heading
353}
354
355fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
356    for (idx, element) in doc.kids.iter().enumerate().take(8) {
357        match element {
358            ContentElement::Heading(h) => {
359                let text = h.base.base.value();
360                let trimmed = text.trim();
361                if !trimmed.is_empty() {
362                    return Some(trimmed.to_string());
363                }
364            }
365            ContentElement::NumberHeading(nh) => {
366                let text = nh.base.base.base.value();
367                let trimmed = text.trim();
368                if !trimmed.is_empty() {
369                    return Some(trimmed.to_string());
370                }
371            }
372            ContentElement::Paragraph(p) => {
373                let text = clean_paragraph_text(&p.base.value());
374                let trimmed = text.trim();
375                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
376                    return Some(trimmed.to_string());
377                }
378            }
379            ContentElement::TextBlock(tb) => {
380                let text = clean_paragraph_text(&tb.value());
381                let trimmed = text.trim();
382                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
383                    return Some(trimmed.to_string());
384                }
385            }
386            ContentElement::TextLine(tl) => {
387                let text = clean_paragraph_text(&tl.value());
388                let trimmed = text.trim();
389                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
390                    return Some(trimmed.to_string());
391                }
392            }
393            _ => {}
394        }
395    }
396    None
397}
398
399fn equivalent_heading_text(left: &str, right: &str) -> bool {
400    normalize_heading_text(left) == normalize_heading_text(right)
401}
402
/// Reduces `text` to its alphanumeric characters, lowercased, dropping
/// whitespace, punctuation and every other character.
fn normalize_heading_text(text: &str) -> String {
    let mut normalized = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // `to_lowercase` may expand to more than one char.
            normalized.extend(ch.to_lowercase());
        }
    }
    normalized
}
409
410fn looks_like_contents_document(doc: &PdfDocument) -> bool {
411    let Some(first) = first_heading_like_text(doc) else {
412        return false;
413    };
414    if !matches!(
415        normalize_heading_text(&first).as_str(),
416        "contents" | "tableofcontents"
417    ) {
418        return false;
419    }
420
421    let lines = collect_plain_lines(doc);
422    if lines.len() < 8 {
423        return false;
424    }
425
426    let page_like = lines
427        .iter()
428        .skip(1)
429        .filter(|line| ends_with_page_marker(line))
430        .count();
431    page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
432}
433
434fn render_contents_document(doc: &PdfDocument) -> String {
435    let lines = collect_plain_lines(doc);
436    let mut out = String::new();
437
438    let mut iter = lines.into_iter();
439    if let Some(first) = iter.next() {
440        out.push_str("# ");
441        out.push_str(first.trim());
442        out.push_str("\n\n");
443    }
444    for line in iter {
445        let trimmed = line.trim();
446        if trimmed.is_empty() {
447            continue;
448        }
449        out.push_str(trimmed);
450        out.push('\n');
451    }
452    out.push('\n');
453    out
454}
455
456fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
457    let lines = collect_plain_lines(doc);
458    if lines.len() < 8 {
459        return false;
460    }
461
462    let page_like = lines
463        .iter()
464        .filter(|line| ends_with_page_marker(line))
465        .count();
466    let support_like = lines
467        .iter()
468        .filter(|line| looks_like_toc_support_heading(line))
469        .count();
470
471    page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
472}
473
474fn render_compact_toc_document(doc: &PdfDocument) -> String {
475    let mut out = String::new();
476    for line in collect_plain_lines(doc) {
477        let trimmed = line.trim();
478        if trimmed.is_empty() {
479            continue;
480        }
481        out.push_str(trimmed);
482        out.push('\n');
483    }
484    out.push('\n');
485    out
486}
487
488fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
489    let mut lines = Vec::new();
490    for element in &doc.kids {
491        match element {
492            ContentElement::Heading(h) => {
493                let text = clean_paragraph_text(&h.base.base.value());
494                if !text.trim().is_empty() {
495                    lines.push(text);
496                }
497            }
498            ContentElement::NumberHeading(nh) => {
499                let text = clean_paragraph_text(&nh.base.base.base.value());
500                if !text.trim().is_empty() {
501                    lines.push(text);
502                }
503            }
504            ContentElement::Paragraph(p) => {
505                let text = clean_paragraph_text(&p.base.value());
506                if !text.trim().is_empty() {
507                    lines.push(text);
508                }
509            }
510            ContentElement::TextBlock(tb) => {
511                let text = clean_paragraph_text(&tb.value());
512                if !text.trim().is_empty() {
513                    lines.push(text);
514                }
515            }
516            ContentElement::TextLine(tl) => {
517                let text = clean_paragraph_text(&tl.value());
518                if !text.trim().is_empty() {
519                    lines.push(text);
520                }
521            }
522            ContentElement::List(list) => {
523                for item in &list.list_items {
524                    let label = token_rows_text(&item.label.content);
525                    let body = token_rows_text(&item.body.content);
526                    let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
527                        format!("{} {}", label.trim(), body.trim())
528                    } else if !body.trim().is_empty() {
529                        body.trim().to_string()
530                    } else if !label.trim().is_empty() {
531                        label.trim().to_string()
532                    } else {
533                        list_item_text_from_contents(&item.contents)
534                            .trim()
535                            .to_string()
536                    };
537                    if !combined.trim().is_empty() {
538                        lines.push(combined);
539                    }
540                }
541            }
542            ContentElement::Table(table) => {
543                extend_contents_lines_from_rows(
544                    &mut lines,
545                    collect_rendered_table_rows(
546                        &table.table_border.rows,
547                        table.table_border.num_columns,
548                    ),
549                );
550            }
551            ContentElement::TableBorder(table) => {
552                extend_contents_lines_from_rows(
553                    &mut lines,
554                    collect_rendered_table_rows(&table.rows, table.num_columns),
555                );
556            }
557            _ => {}
558        }
559    }
560    lines
561}
562
563fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
564    if rows.is_empty() {
565        return;
566    }
567
568    if is_toc_table(&rows) {
569        for row in &rows {
570            let title = row.first().map(|s| s.trim()).unwrap_or("");
571            let page = row.get(1).map(|s| s.trim()).unwrap_or("");
572            let combined = if !title.is_empty() && !page.is_empty() {
573                format!("{title} {page}")
574            } else {
575                format!("{title}{page}")
576            };
577            if !combined.trim().is_empty() {
578                lines.push(combined);
579            }
580        }
581    } else {
582        // Non-TOC table in a contents document: concatenate cell text as a line.
583        for row in &rows {
584            let combined: String = row
585                .iter()
586                .map(|c| c.trim())
587                .filter(|c| !c.is_empty())
588                .collect::<Vec<_>>()
589                .join(" ");
590            if !combined.is_empty() {
591                lines.push(combined);
592            }
593        }
594    }
595}
596
597fn collect_rendered_table_rows(
598    rows: &[crate::models::table::TableBorderRow],
599    num_cols: usize,
600) -> Vec<Vec<String>> {
601    let num_cols = num_cols.max(1);
602    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
603
604    for row in rows {
605        let cell_texts: Vec<String> = (0..num_cols)
606            .map(|col| {
607                row.cells
608                    .iter()
609                    .find(|c| c.col_number == col)
610                    .map(cell_text_content)
611                    .unwrap_or_default()
612            })
613            .collect();
614        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
615            rendered_rows.push(cell_texts);
616        }
617    }
618
619    rendered_rows
620}
621
622fn ends_with_page_marker(text: &str) -> bool {
623    text.split_whitespace()
624        .last()
625        .is_some_and(is_page_number_like)
626}
627
628fn looks_like_toc_support_heading(text: &str) -> bool {
629    let trimmed = text.trim();
630    if trimmed.is_empty() || ends_with_page_marker(trimmed) {
631        return false;
632    }
633    if trimmed.ends_with(['.', ';', ':', '?', '!']) {
634        return false;
635    }
636
637    let lower = trimmed.to_ascii_lowercase();
638    if !(lower.starts_with("part ")
639        || lower.starts_with("chapter ")
640        || lower.starts_with("appendix ")
641        || lower.starts_with("section "))
642    {
643        return false;
644    }
645
646    let word_count = trimmed.split_whitespace().count();
647    (2..=16).contains(&word_count) && trimmed.chars().any(char::is_alphabetic)
648}
649
650fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
651    if !starts_with_caption_prefix(text) || !text.contains("(credit") {
652        return None;
653    }
654
655    for needle in [") ", ". "] {
656        let mut search_start = 0usize;
657        while let Some(rel_idx) = text[search_start..].find(needle) {
658            let boundary = search_start + rel_idx + needle.len() - 1;
659            let head = text[..=boundary].trim();
660            let tail = text[boundary + 1..].trim_start();
661            search_start = boundary + 1;
662            if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
663                continue;
664            }
665            if tail.split_whitespace().count() < 10 {
666                continue;
667            }
668            if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
669                continue;
670            }
671            return Some((head, tail));
672        }
673    }
674
675    None
676}
677
678fn is_short_caption_label(text: &str) -> bool {
679    if !starts_with_caption_prefix(text) {
680        return false;
681    }
682
683    let trimmed = text.trim();
684    trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24 && !trimmed.ends_with(['.', ':'])
685}
686
687fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
688    let trimmed = text.trim();
689    if trimmed.is_empty()
690        || starts_with_caption_prefix(trimmed)
691        || !starts_with_uppercase_word(trimmed)
692    {
693        return None;
694    }
695
696    for starter in [
697        " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
698    ] {
699        if let Some(idx) = text.find(starter) {
700            let head = text[..idx].trim();
701            let tail = text[idx + 1..].trim();
702            if head.split_whitespace().count() >= 3
703                && head.split_whitespace().count() <= 24
704                && tail.split_whitespace().count() >= 8
705            {
706                return Some((head, tail));
707            }
708        }
709    }
710
711    None
712}
713
714fn looks_like_caption_tail(text: &str) -> bool {
715    let trimmed = text.trim();
716    if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
717        return false;
718    }
719
720    let word_count = trimmed.split_whitespace().count();
721    if !(3..=18).contains(&word_count) {
722        return false;
723    }
724
725    starts_with_uppercase_word(trimmed)
726        && !starts_with_caption_prefix(trimmed)
727        && !trimmed.contains(':')
728}
729
/// Returns `true` when the trimmed text is exactly four ASCII digits.
fn looks_like_caption_year(text: &str) -> bool {
    let candidate = text.trim().as_bytes();
    if candidate.len() != 4 {
        return false;
    }
    candidate.iter().all(|byte| byte.is_ascii_digit())
}
734
735/// Extract text from table token rows.
736fn token_rows_text(rows: &[TableTokenRow]) -> String {
737    repair_fragmented_words(
738        &rows
739            .iter()
740            .flat_map(|row| row.iter())
741            .map(|token| token.base.value.as_str())
742            .collect::<Vec<_>>()
743            .join(" "),
744    )
745}
746
747fn render_element(out: &mut String, element: &ContentElement) {
748    match element {
749        ContentElement::Heading(h) => {
750            let text = h.base.base.value();
751            let trimmed = text.trim();
752            if should_skip_heading_text(trimmed) {
753                return;
754            }
755            out.push_str(&format!("# {}\n\n", trimmed));
756        }
757        ContentElement::Paragraph(p) => {
758            let text = p.base.value();
759            let trimmed = clean_paragraph_text(&text);
760            if !trimmed.is_empty() {
761                out.push_str(&escape_md_line_start(&trimmed));
762                if p.base.semantic_type == SemanticType::TableOfContent {
763                    out.push('\n');
764                } else {
765                    out.push_str("\n\n");
766                }
767            }
768        }
769        ContentElement::List(list) => {
770            let mut i = 0usize;
771            while i < list.list_items.len() {
772                let item = &list.list_items[i];
773                let label = token_rows_text(&item.label.content);
774                let body = token_rows_text(&item.body.content);
775                let label_trimmed = label.trim();
776                let body_trimmed = body.trim();
777                let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
778                    format!("{label_trimmed} {body_trimmed}")
779                } else if !body_trimmed.is_empty() {
780                    body_trimmed.to_string()
781                } else {
782                    label_trimmed.to_string()
783                };
784                let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
785                    list_item_text_from_contents(&item.contents)
786                } else {
787                    combined
788                };
789
790                if is_list_section_heading(&combined) {
791                    out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
792                    i += 1;
793                    continue;
794                }
795
796                if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
797                    if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
798                        out.push_str(&format!("- {} {}\n", label_trimmed, body_trimmed));
799                    } else if !body_trimmed.is_empty() {
800                        out.push_str(&format!("- {}\n", body_trimmed));
801                    } else {
802                        out.push_str(&format!("- {}\n", label_trimmed));
803                    }
804                } else if !item.contents.is_empty() {
805                    // Fallback: extract text from contents (used by list_pass2)
806                    let text = list_item_text_from_contents(&item.contents);
807                    let trimmed = text.trim();
808                    if !trimmed.is_empty() {
809                        out.push_str(&format!("- {}\n", trimmed));
810                    }
811                }
812                i += 1;
813            }
814            out.push('\n');
815        }
816        ContentElement::Table(table) => {
817            render_table(out, table);
818        }
819        ContentElement::TableBorder(table) => {
820            render_table_border(out, table);
821        }
822        ContentElement::Formula(f) => {
823            let latex = f.latex.trim();
824            if !latex.is_empty() {
825                out.push_str(&format!("$$\n{}\n$$\n\n", latex));
826            }
827        }
828        ContentElement::Caption(c) => {
829            let text = c.base.value();
830            let trimmed = text.trim();
831            if !trimmed.is_empty() {
832                out.push_str(&format!("*{}*\n\n", trimmed));
833            }
834        }
835        ContentElement::NumberHeading(nh) => {
836            let text = nh.base.base.base.value();
837            let trimmed = text.trim();
838            if should_skip_heading_text(trimmed) {
839                return;
840            }
841            out.push_str(&format!("# {}\n\n", trimmed));
842        }
843        ContentElement::Image(_) => {
844            out.push_str("![Image](image)\n\n");
845        }
846        ContentElement::HeaderFooter(_) => {
847            // Skip headers/footers in markdown by default
848        }
849        ContentElement::TextBlock(tb) => {
850            let text = tb.value();
851            let trimmed = clean_paragraph_text(&text);
852            if !trimmed.is_empty() {
853                out.push_str(&escape_md_line_start(&trimmed));
854                out.push_str("\n\n");
855            }
856        }
857        ContentElement::TextLine(tl) => {
858            let text = tl.value();
859            let trimmed = text.trim();
860            if !trimmed.is_empty() {
861                out.push_str(trimmed);
862                out.push('\n');
863            }
864        }
865        ContentElement::TextChunk(tc) => {
866            out.push_str(&tc.value);
867        }
868        _ => {}
869    }
870}
871
/// Neutralise a leading `>` or `#` so Markdown does not read the line as a
/// blockquote or an ATX heading.
///
/// Only the first character of `text` is inspected.  When it is `>` or `#`
/// a single backslash is prepended; otherwise the text is copied unchanged.
fn escape_md_line_start(text: &str) -> String {
    let needs_escape = matches!(text.chars().next(), Some('>' | '#'));
    let mut escaped = String::with_capacity(text.len() + usize::from(needs_escape));
    if needs_escape {
        escaped.push('\\');
    }
    escaped.push_str(text);
    escaped
}
880
/// Report whether `text` opens with one of the known caption / attribution
/// lead-ins (e.g. "Figure ", "Table ", "Source: ").
///
/// Leading whitespace is ignored and the comparison is ASCII
/// case-insensitive.  Most prefixes include their trailing space, so
/// "Figures" does not match "figure ".
fn starts_with_caption_prefix(text: &str) -> bool {
    const CAPTION_PREFIXES: &[&str] = &[
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];

    let lowered = text.trim_start().to_ascii_lowercase();
    for prefix in CAPTION_PREFIXES {
        if lowered.starts_with(*prefix) {
            return true;
        }
    }
    false
}
908
/// Report whether the first word of `text` begins with an uppercase letter.
///
/// Leading whitespace and any run of opening punctuation (`"`, `'`, `(`,
/// `[`) is skipped.  The first character after that run decides the result:
/// it must be alphabetic and uppercase.  Anything else (digit, dash, end of
/// input, ...) yields `false`.
fn starts_with_uppercase_word(text: &str) -> bool {
    text.trim_start()
        .chars()
        .find(|ch| !matches!(ch, '"' | '\'' | '(' | '['))
        .is_some_and(|ch| ch.is_alphabetic() && ch.is_uppercase())
}
920
/// Normalise horizontal whitespace in a paragraph.
///
/// The text is trimmed at both ends, then every run of spaces and/or tabs
/// is collapsed into a single space.  Newlines and all other characters are
/// copied through untouched, so line structure is preserved.  Blank input
/// yields an empty string.
fn clean_paragraph_text(text: &str) -> String {
    let body = text.trim();
    let mut cleaned = String::with_capacity(body.len());
    let mut in_blank_run = false;
    for ch in body.chars() {
        let is_blank = matches!(ch, ' ' | '\t');
        if !is_blank {
            cleaned.push(ch);
        } else if !in_blank_run {
            // First blank of a run becomes the single separating space.
            cleaned.push(' ');
        }
        in_blank_run = is_blank;
    }
    cleaned
}
944
945fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
946    match element {
947        Some(ContentElement::Paragraph(p)) => {
948            let text = clean_paragraph_text(&p.base.value());
949            let trimmed = text.trim();
950            if trimmed.is_empty()
951                || should_render_element_as_heading(element.unwrap(), trimmed, None)
952            {
953                None
954            } else {
955                Some(trimmed.to_string())
956            }
957        }
958        Some(ContentElement::TextBlock(tb)) => {
959            let text = clean_paragraph_text(&tb.value());
960            let trimmed = text.trim();
961            if trimmed.is_empty()
962                || should_render_element_as_heading(element.unwrap(), trimmed, None)
963            {
964                None
965            } else {
966                Some(trimmed.to_string())
967            }
968        }
969        Some(ContentElement::TextLine(tl)) => {
970            let text = clean_paragraph_text(&tl.value());
971            let trimmed = text.trim();
972            if trimmed.is_empty()
973                || should_render_element_as_heading(element.unwrap(), trimmed, None)
974            {
975                None
976            } else {
977                Some(trimmed.to_string())
978            }
979        }
980        _ => None,
981    }
982}
983
984fn should_render_paragraph_as_heading(
985    doc: &PdfDocument,
986    idx: usize,
987    text: &str,
988    next: Option<&ContentElement>,
989) -> bool {
990    if looks_like_top_margin_running_header(doc, idx, text) {
991        return false;
992    }
993    if should_render_element_as_heading(&doc.kids[idx], text, next) {
994        return true;
995    }
996
997    // Font-size guard: skip rescue if the candidate text is significantly
998    // smaller than the document's body text (chart axis labels, footnotes).
999    let body_font_size = compute_body_font_size(doc);
1000    if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
1001        return false;
1002    }
1003
1004    // Rescue pass tier 1: when the pipeline found zero headings, use broad rescue.
1005    if !doc_has_explicit_headings(doc) {
1006        if should_rescue_as_heading(doc, idx, text) {
1007            return true;
1008        }
1009        // Also check numbered sections and ALL CAPS even with zero headings,
1010        // since Tier 1 broad rescue has strict word/char limits that miss
1011        // longer keyword-numbered headings (e.g. "Activity 4. Title text").
1012        if should_rescue_allcaps_heading(doc, idx, text) {
1013            return true;
1014        }
1015        if should_rescue_numbered_heading(doc, idx, text) {
1016            return true;
1017        }
1018        return false;
1019    }
1020    // Rescue pass tier 2: when heading density is very low (< 10%), only
1021    // rescue ALL CAPS short text followed by substantial body content.
1022    if heading_density(doc) < 0.10 {
1023        if should_rescue_allcaps_heading(doc, idx, text) {
1024            return true;
1025        }
1026        // Rescue pass tier 3: numbered section headings (e.g. "01 - Title").
1027        // When a document has very few detected headings, numbered patterns
1028        // are a strong structural signal that the font-based detector missed.
1029        if should_rescue_numbered_heading(doc, idx, text) {
1030            return true;
1031        }
1032        // Font-size-gated title-case rescue: when the paragraph is rendered
1033        // in a noticeably larger font than body text, apply the same
1034        // title-case rescue used in tier 1.  A 15 % size increase is a
1035        // reliable visual heading signal straight from the PDF font metrics.
1036        if body_font_size > 0.0 {
1037            if let ContentElement::Paragraph(p) = &doc.kids[idx] {
1038                if let Some(fs) = p.base.font_size {
1039                    if fs >= 1.15 * body_font_size
1040                        && is_heading_rescue_candidate(doc, idx, text)
1041                        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
1042                    {
1043                        return true;
1044                    }
1045                }
1046            }
1047        }
1048    }
1049    false
1050}
1051
1052/// Check whether any element in the document is an explicit heading from the pipeline.
1053fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
1054    doc.kids.iter().any(|e| {
1055        matches!(
1056            e,
1057            ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1058        )
1059    })
1060}
1061
1062/// Compute the dominant body font size from paragraphs with substantial text
1063/// (> 10 words).  Uses the median of qualifying paragraphs to avoid being
1064/// skewed by short chart labels or footnote markers.
1065/// Returns 0.0 if no qualifying paragraph is found.
1066fn compute_body_font_size(doc: &PdfDocument) -> f64 {
1067    let mut font_sizes: Vec<f64> = doc
1068        .kids
1069        .iter()
1070        .filter_map(|e| {
1071            if let ContentElement::Paragraph(p) = e {
1072                let word_count = p.base.value().split_whitespace().count();
1073                if word_count > 10 {
1074                    p.base.font_size
1075                } else {
1076                    None
1077                }
1078            } else {
1079                None
1080            }
1081        })
1082        .collect();
1083    if font_sizes.is_empty() {
1084        return 0.0;
1085    }
1086    font_sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1087    font_sizes[font_sizes.len() / 2]
1088}
1089
1090/// Check whether a paragraph's font size is too small relative to the document
1091/// body font to be a heading.  Returns true if the element should be skipped.
1092/// A heading should not be noticeably smaller than body text — font size ≥ 95%
1093/// of the dominant body size is required.
1094fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
1095    if body_font_size <= 0.0 {
1096        return false;
1097    }
1098    if let ContentElement::Paragraph(p) = &doc_kids[idx] {
1099        if let Some(fs) = p.base.font_size {
1100            return fs < 0.95 * body_font_size;
1101        }
1102    }
1103    false
1104}
1105
1106/// Count the ratio of pipeline headings to total content elements.
1107fn heading_density(doc: &PdfDocument) -> f64 {
1108    let total = doc.kids.len();
1109    if total == 0 {
1110        return 0.0;
1111    }
1112    let heading_count = doc
1113        .kids
1114        .iter()
1115        .filter(|e| {
1116            matches!(
1117                e,
1118                ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1119            )
1120        })
1121        .count();
1122    heading_count as f64 / total as f64
1123}
1124
1125/// Rescue headings: identify short standalone paragraphs that likely serve
1126/// as section headings.  Only runs when the pipeline produced zero headings.
1127fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
1128    is_heading_rescue_candidate(doc, idx, text)
1129        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
1130}
1131
1132/// Pure text-criteria check for title-case heading rescue.
1133/// Returns true when the text looks like a heading based on casing,
1134/// length, and character composition — without any lookahead.
1135fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
1136    let trimmed = text.trim();
1137    if trimmed.is_empty() {
1138        return false;
1139    }
1140
1141    let has_alpha = trimmed.chars().any(char::is_alphabetic);
1142
1143    // Must have alphabetic chars and not end with sentence/continuation punctuation
1144    if !has_alpha || trimmed.ends_with(['.', '!', '?', ';', ',']) {
1145        return false;
1146    }
1147
1148    // Reject text containing math/special symbols or percentage signs.
1149    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
1150        return false;
1151    }
1152
1153    // Must not be fully parenthesized (citations)
1154    if trimmed.starts_with('(') && trimmed.ends_with(')') {
1155        return false;
1156    }
1157
1158    // Must not look like a caption or chart label
1159    if starts_with_caption_prefix(trimmed)
1160        || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
1161    {
1162        return false;
1163    }
1164
1165    // Must be short: ≤ 6 words, ≤ 60 chars
1166    let word_count = trimmed.split_whitespace().count();
1167    if word_count > 6 || trimmed.len() > 60 {
1168        return false;
1169    }
1170
1171    // Must not be a purely numeric string
1172    if trimmed
1173        .chars()
1174        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
1175    {
1176        return false;
1177    }
1178
1179    // First alphabetic character should be uppercase
1180    if let Some(first_alpha) = trimmed.chars().find(|c| c.is_alphabetic()) {
1181        if first_alpha.is_lowercase() {
1182            return false;
1183        }
1184    }
1185
1186    true
1187}
1188
1189/// Check the next `max_lookahead` elements for substantive body content.
1190/// Returns true when at least one element is a long paragraph (≥ word_count*3
1191/// or > 15 words) or a structural element (list, table, image, figure).
1192fn has_substantive_follow_up(
1193    doc: &PdfDocument,
1194    idx: usize,
1195    word_count: usize,
1196    max_lookahead: usize,
1197) -> bool {
1198    for offset in 1..=max_lookahead {
1199        let lookahead_idx = idx + offset;
1200        if lookahead_idx >= doc.kids.len() {
1201            break;
1202        }
1203        let look_elem = &doc.kids[lookahead_idx];
1204        match look_elem {
1205            ContentElement::Paragraph(p) => {
1206                let next_text = p.base.value();
1207                let nw = next_text.split_whitespace().count();
1208                if nw >= word_count * 3 || nw > 15 {
1209                    return true;
1210                }
1211            }
1212            ContentElement::TextBlock(tb) => {
1213                let next_text = tb.value();
1214                let nw = next_text.split_whitespace().count();
1215                if nw >= word_count * 3 || nw > 15 {
1216                    return true;
1217                }
1218            }
1219            ContentElement::TextLine(tl) => {
1220                let next_text = tl.value();
1221                let nw = next_text.split_whitespace().count();
1222                if nw >= word_count * 3 || nw > 15 {
1223                    return true;
1224                }
1225            }
1226            ContentElement::List(_)
1227            | ContentElement::Table(_)
1228            | ContentElement::TableBorder(_)
1229            | ContentElement::Image(_)
1230            | ContentElement::Figure(_) => {
1231                return true;
1232            }
1233            _ => continue,
1234        }
1235    }
1236
1237    false
1238}
1239
1240/// Rescue numbered section headings like "01 - Find Open Educational Resources"
1241/// or "4.2 Main Results" when heading density is low.
1242fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
1243    let trimmed = text.trim();
1244    if trimmed.is_empty() || trimmed.len() > 100 {
1245        return false;
1246    }
1247
1248    // Must match numbered section pattern: digits (with optional dots)
1249    // followed by separator and title text.
1250    if !looks_like_numbered_section(trimmed) {
1251        return false;
1252    }
1253
1254    // Must not end with sentence punctuation — EXCEPT when the text matches
1255    // a keyword+number pattern (e.g. "Activity 4. Determining CEC…") where
1256    // the trailing period is part of the heading format, not sentence ending.
1257    if trimmed.ends_with(['!', '?', ';', ',']) {
1258        return false;
1259    }
1260    if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
1261        return false;
1262    }
1263    // Reject numbered headings containing math symbols or percentage signs.
1264    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
1265        return false;
1266    }
1267
1268    // Look ahead for substantive content
1269    for offset in 1..=3 {
1270        let lookahead_idx = idx + offset;
1271        if lookahead_idx >= doc.kids.len() {
1272            break;
1273        }
1274        match &doc.kids[lookahead_idx] {
1275            ContentElement::Paragraph(p) => {
1276                let nw = p.base.value().split_whitespace().count();
1277                if nw > 10 {
1278                    return true;
1279                }
1280            }
1281            ContentElement::TextBlock(tb) => {
1282                let nw = tb.value().split_whitespace().count();
1283                if nw > 10 {
1284                    return true;
1285                }
1286            }
1287            ContentElement::TextLine(tl) => {
1288                let nw = tl.value().split_whitespace().count();
1289                if nw > 10 {
1290                    return true;
1291                }
1292            }
1293            ContentElement::List(_)
1294            | ContentElement::Table(_)
1295            | ContentElement::TableBorder(_)
1296            | ContentElement::Image(_)
1297            | ContentElement::Figure(_) => {
1298                return true;
1299            }
1300            _ => continue,
1301        }
1302    }
1303
1304    false
1305}
1306
1307/// Check if text starts with a numbered section prefix (e.g. "01 -", "4.2 ", "III.")
1308/// or a keyword+number pattern (e.g. "Activity 4.", "Experiment #1:", "Chapter 3").
1309fn looks_like_numbered_section(text: &str) -> bool {
1310    let bytes = text.as_bytes();
1311    if bytes.is_empty() {
1312        return false;
1313    }
1314
1315    // Branch 1: digit-based prefix: "1 ", "01 ", "4.2 ", "1. ", "01 - "
1316    let mut idx = 0;
1317    if bytes[0].is_ascii_digit() {
1318        while idx < bytes.len() && bytes[idx].is_ascii_digit() {
1319            idx += 1;
1320        }
1321        if idx >= bytes.len() {
1322            return false;
1323        }
1324        // dot-separated subsections: "4.2", "1.3.1"
1325        while idx < bytes.len() && bytes[idx] == b'.' {
1326            idx += 1;
1327            let start = idx;
1328            while idx < bytes.len() && bytes[idx].is_ascii_digit() {
1329                idx += 1;
1330            }
1331            if idx == start {
1332                // "4." followed by space → "4. Title"
1333                break;
1334            }
1335        }
1336        // Must be followed by whitespace or "-"
1337        if idx >= bytes.len() {
1338            return false;
1339        }
1340        // Skip separator: "- " or " - " or just " "
1341        if bytes[idx] == b' ' || bytes[idx] == b'\t' {
1342            idx += 1;
1343            // Skip optional "- " separator
1344            if idx < bytes.len() && bytes[idx] == b'-' {
1345                idx += 1;
1346                if idx < bytes.len() && bytes[idx] == b' ' {
1347                    idx += 1;
1348                }
1349            }
1350        } else if bytes[idx] == b'-' {
1351            idx += 1;
1352            if idx < bytes.len() && bytes[idx] == b' ' {
1353                idx += 1;
1354            }
1355        } else {
1356            return false;
1357        }
1358        // Must have title text after prefix
1359        let rest = &text[idx..].trim();
1360        if rest.is_empty() {
1361            return false;
1362        }
1363        // First alpha char must be uppercase
1364        if let Some(c) = rest.chars().find(|c| c.is_alphabetic()) {
1365            return c.is_uppercase();
1366        }
1367        return false;
1368    }
1369
1370    // Branch 2: keyword+number prefix: "Activity 4.", "Experiment #1:", "Chapter 3"
1371    if looks_like_keyword_numbered_section(text) {
1372        return true;
1373    }
1374
1375    false
1376}
1377
/// Structural keywords that commonly precede a number to form a heading.
const SECTION_KEYWORDS: &[&str] = &[
    "activity",
    "appendix",
    "case",
    "chapter",
    "exercise",
    "experiment",
    "lab",
    "lesson",
    "module",
    "part",
    "phase",
    "problem",
    "question",
    "section",
    "stage",
    "step",
    "task",
    "topic",
    "unit",
];

/// Check if text matches a "Keyword N. Title" or "Keyword #N: Title" pattern.
///
/// The first space-delimited word must be one of `SECTION_KEYWORDS`
/// (ASCII case-insensitive).  What follows, after an optional `#`, must
/// start with either
/// - an ASCII digit ("Activity 4.", "Experiment #1:"), or
/// - a roman numeral: a leading alphanumeric token that begins with `I`,
///   `V`, `X` or `L` and consists solely of the uppercase letters
///   `I V X L C D M` ("Part IV:", "Appendix I").
///
/// Checking the whole token (not just its first letter) keeps ordinary
/// words such as "Case Law" or "Lab Instructions" from being mistaken for
/// numbered sections.
fn looks_like_keyword_numbered_section(text: &str) -> bool {
    let trimmed = text.trim();
    let Some((keyword, rest)) = trimmed.split_once(' ') else {
        return false;
    };
    if !SECTION_KEYWORDS
        .iter()
        .any(|k| keyword.eq_ignore_ascii_case(k))
    {
        return false;
    }

    // After keyword+space, expect a number (optionally preceded by '#').
    let rest = rest.trim_start();
    let rest = rest.strip_prefix('#').unwrap_or(rest);
    let Some(first_char) = rest.chars().next() else {
        return false;
    };
    if first_char.is_ascii_digit() {
        return true;
    }
    if !matches!(first_char, 'I' | 'V' | 'X' | 'L') {
        return false;
    }

    // Roman numeral: every character of the leading token must be a roman
    // digit, otherwise this is a regular word that merely starts with one.
    let token_end = rest
        .find(|c: char| !c.is_alphanumeric())
        .unwrap_or(rest.len());
    rest[..token_end]
        .chars()
        .all(|c| matches!(c, 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M'))
}
1429
1430/// Strict rescue for docs with some headings but low density: only promote
1431/// ALL CAPS text that is clearly a section heading.
1432fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
1433    let trimmed = text.trim();
1434    if trimmed.is_empty() {
1435        return false;
1436    }
1437
1438    let word_count = trimmed.split_whitespace().count();
1439
1440    // Must be short: ≤ 8 words, ≤ 80 chars
1441    if word_count > 8 || trimmed.len() > 80 {
1442        return false;
1443    }
1444
1445    // Must be ALL CAPS (all alphabetic chars are uppercase)
1446    let alpha_chars: Vec<char> = trimmed.chars().filter(|c| c.is_alphabetic()).collect();
1447    if alpha_chars.len() < 2 || !alpha_chars.iter().all(|c| c.is_uppercase()) {
1448        return false;
1449    }
1450
1451    // Must not end with sentence punctuation
1452    if trimmed.ends_with(['.', ';', ',']) {
1453        return false;
1454    }
1455
1456    // Reject all-caps headings containing math symbols or percentage signs.
1457    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
1458        return false;
1459    }
1460
1461    // Must not look like a caption
1462    if starts_with_caption_prefix(trimmed) {
1463        return false;
1464    }
1465
1466    // Must not be purely numeric or a page number
1467    if trimmed
1468        .chars()
1469        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
1470    {
1471        return false;
1472    }
1473
1474    // Look ahead for substantive content — accept any non-trivial text
1475    // (>6 words) or structured content within the next 4 elements.
1476    for offset in 1..=4 {
1477        let lookahead_idx = idx + offset;
1478        if lookahead_idx >= doc.kids.len() {
1479            break;
1480        }
1481        let look_elem = &doc.kids[lookahead_idx];
1482        match look_elem {
1483            ContentElement::Paragraph(p) => {
1484                let nw = p.base.value().split_whitespace().count();
1485                if nw > 6 {
1486                    return true;
1487                }
1488            }
1489            ContentElement::TextBlock(tb) => {
1490                let nw = tb.value().split_whitespace().count();
1491                if nw > 6 {
1492                    return true;
1493                }
1494            }
1495            ContentElement::TextLine(tl) => {
1496                let nw = tl.value().split_whitespace().count();
1497                if nw > 6 {
1498                    return true;
1499                }
1500            }
1501            ContentElement::List(_)
1502            | ContentElement::Table(_)
1503            | ContentElement::TableBorder(_)
1504            | ContentElement::Image(_)
1505            | ContentElement::Figure(_) => {
1506                return true;
1507            }
1508            _ => continue,
1509        }
1510    }
1511
1512    false
1513}
1514
1515fn should_render_element_as_heading(
1516    element: &ContentElement,
1517    text: &str,
1518    next: Option<&ContentElement>,
1519) -> bool {
1520    let trimmed = text.trim();
1521    if trimmed.is_empty() {
1522        return false;
1523    }
1524
1525    let lower = trimmed.to_ascii_lowercase();
1526    if matches!(lower.as_str(), "contents" | "table of contents")
1527        && trimmed.starts_with(|c: char| c.is_uppercase())
1528    {
1529        return true;
1530    }
1531
1532    let word_count = trimmed.split_whitespace().count();
1533    let has_alpha = trimmed.chars().any(char::is_alphabetic);
1534    let title_like = has_alpha
1535        && word_count <= 4
1536        && trimmed.len() <= 40
1537        && !trimmed.ends_with(['.', '!', '?', ';', ':']);
1538
1539    // Reject attribution prefixes that are clearly not section headings
1540    // (more targeted than starts_with_caption_prefix to avoid false demotions
1541    // of legitimate headings starting with common words like "Graph", "Table").
1542    let is_attribution = {
1543        let lower = trimmed.to_ascii_lowercase();
1544        lower.starts_with("source:")
1545            || lower.starts_with("credit:")
1546            || lower.starts_with("photo by ")
1547            || lower.starts_with("photo credit")
1548            || lower.starts_with("image by ")
1549            || lower.starts_with("image credit")
1550    };
1551
1552    title_like
1553        && matches!(next, Some(ContentElement::List(_)))
1554        && !looks_like_chart_label_heading(element, trimmed)
1555        && !is_attribution
1556}
1557
1558fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
1559    let trimmed = text.trim();
1560    if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
1561        return false;
1562    }
1563
1564    let element = &doc.kids[idx];
1565    let bbox = element.bbox();
1566    if bbox.height() > 24.0 {
1567        return false;
1568    }
1569
1570    let Some(page) = element.page_number() else {
1571        return false;
1572    };
1573
1574    // Compute top Y for every page (single pass).
1575    let mut page_tops = std::collections::HashMap::<u32, f64>::new();
1576    for candidate in &doc.kids {
1577        if let Some(p) = candidate.page_number() {
1578            let top = page_tops.entry(p).or_insert(f64::MIN);
1579            *top = top.max(candidate.bbox().top_y);
1580        }
1581    }
1582
1583    let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
1584    if bbox.top_y < page_top - 24.0 {
1585        return false;
1586    }
1587
1588    // A running header repeats across pages.  If the same text does NOT
1589    // appear at the top margin of any other page, this is a unique heading
1590    // (e.g. a document title), not a running header.
1591    let trimmed_lower = trimmed.to_lowercase();
1592    for other_elem in &doc.kids {
1593        let Some(other_page) = other_elem.page_number() else {
1594            continue;
1595        };
1596        if other_page == page {
1597            continue;
1598        }
1599        let other_bbox = other_elem.bbox();
1600        if other_bbox.height() > 24.0 {
1601            continue;
1602        }
1603        let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
1604        if other_bbox.top_y < other_top - 24.0 {
1605            continue;
1606        }
1607        let other_text = match other_elem {
1608            ContentElement::Paragraph(p) => p.base.value(),
1609            ContentElement::TextBlock(tb) => tb.value(),
1610            ContentElement::TextLine(tl) => tl.value(),
1611            ContentElement::Heading(h) => h.base.base.value(),
1612            _ => continue,
1613        };
1614        if other_text.trim().to_lowercase() == trimmed_lower {
1615            return true;
1616        }
1617    }
1618
1619    false
1620}
1621
1622fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
1623    let trimmed = text.trim();
1624    let upper_words = trimmed
1625        .split_whitespace()
1626        .filter(|word| word.chars().any(char::is_alphabetic))
1627        .all(|word| {
1628            word.chars()
1629                .filter(|ch| ch.is_alphabetic())
1630                .all(|ch| ch.is_uppercase())
1631        });
1632
1633    (trimmed.contains('%') || upper_words) && element.bbox().height() <= 40.0
1634}
1635
1636fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
1637    let next_trimmed = next.trim();
1638    if !next_trimmed.chars().next().is_some_and(char::is_lowercase) {
1639        return false;
1640    }
1641
1642    let normalized = normalize_heading_text(text);
1643    if matches!(
1644        normalized.as_str(),
1645        "contents" | "tableofcontents" | "introduction" | "conclusion"
1646    ) {
1647        return false;
1648    }
1649
1650    let words: Vec<&str> = text.split_whitespace().collect();
1651    if words.len() < 3 {
1652        return false;
1653    }
1654
1655    words
1656        .last()
1657        .is_some_and(|word| is_sentence_fragment_tail(word))
1658}
1659
/// Report whether `word` is an article, conjunction or preposition that
/// normally cannot end a heading (e.g. "the", "of", "with").
///
/// Surrounding non-alphanumeric characters are stripped and the comparison
/// is ASCII case-insensitive, so "The," and "(of)" both match.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const FRAGMENT_TAILS: &[&str] = &[
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or",
        "that", "the", "to", "with",
    ];

    let core = word
        .trim_matches(|c: char| !c.is_alphanumeric())
        .to_ascii_lowercase();
    FRAGMENT_TAILS.contains(&core.as_str())
}
1683
/// Report whether a list item's text is really a section label that
/// introduces the items after it (e.g. "Key features:").
///
/// The trimmed text must end with `:`, be at most 80 characters and 8 words
/// long, contain at least one alphabetic character, and must not start with
/// an ASCII digit or a bullet / dash glyph.
///
/// The length limit counts Unicode scalar values rather than UTF-8 bytes,
/// so labels in non-Latin scripts get the same 80-character budget.
fn is_list_section_heading(text: &str) -> bool {
    const BULLET_GLYPHS: &str = "•‣◦●○◆◇▪▫–—-";

    let trimmed = text.trim();
    if !trimmed.ends_with(':') {
        return false;
    }
    if trimmed.chars().count() > 80 || trimmed.split_whitespace().count() > 8 {
        return false;
    }
    if !trimmed.chars().any(char::is_alphabetic) {
        return false;
    }
    // The first character exists here because the text ends with ':'.
    !trimmed.starts_with(|c: char| c.is_ascii_digit() || BULLET_GLYPHS.contains(c))
}
1693
1694fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
1695    let next_trimmed = next.trim();
1696    if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
1697        return false;
1698    }
1699
1700    if prev.ends_with('-')
1701        && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
1702        && next_trimmed.chars().next().is_some_and(char::is_lowercase)
1703    {
1704        return true;
1705    }
1706
1707    if next_trimmed.chars().next().is_some_and(char::is_lowercase) {
1708        return true;
1709    }
1710
1711    let lower = next_trimmed.to_ascii_lowercase();
1712    if lower.starts_with("http://")
1713        || lower.starts_with("https://")
1714        || lower.starts_with("arxiv")
1715        || lower.starts_with("doi:")
1716    {
1717        return true;
1718    }
1719
1720    if matches!(
1721        next_trimmed.split_whitespace().next(),
1722        Some("In" | "Proceedings" | "Advances" | "Learning")
1723    ) {
1724        return true;
1725    }
1726
1727    !prev.ends_with(['.', '!', '?', ':'])
1728}
1729
/// Decide whether two adjacent semantic paragraphs belong together.
///
/// Returns `true` exactly when the trimmed `next` starts with a lowercase
/// letter, i.e. it continues a sentence (or the second half of a hyphenated
/// word) begun in the previous paragraph. Blank `next` yields `false`.
///
/// `prev` is kept for signature compatibility. It cannot change the verdict:
/// a hyphenated carry-over also requires a lowercase start on `next`, so the
/// lowercase test alone decides every case.
fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
    let _ = prev;
    next.trim()
        .chars()
        .next()
        .is_some_and(char::is_lowercase)
}
1745
/// Append the trimmed `next` to `target`.
///
/// When `target` ends with a letter followed by '-' and `next` starts with a
/// lowercase letter, the hyphen is removed and the two halves are glued
/// together. In every other case a single space separates them (unless
/// `target` already ends with one).
fn merge_paragraph_text(target: &mut String, next: &str) {
    let addition = next.trim();

    let mut from_end = target.chars().rev();
    let rejoins_hyphenated_word = from_end.next() == Some('-')
        && from_end.next().is_some_and(char::is_alphabetic)
        && addition.chars().next().is_some_and(char::is_lowercase);

    if rejoins_hyphenated_word {
        // Drop the line-break hyphen.
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(addition);
}
1765
/// Returns true when the trimmed `text` consists of one to four ASCII digits.
fn is_standalone_page_number(text: &str) -> bool {
    let digits = text.trim();
    (1..=4).contains(&digits.len()) && digits.bytes().all(|b| b.is_ascii_digit())
}
1770
1771fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
1772    if !is_standalone_page_number(text) {
1773        return false;
1774    }
1775
1776    let bbox = element.bbox();
1777    if bbox.height() > 24.0 {
1778        return false;
1779    }
1780
1781    let Some(page) = element.page_number() else {
1782        return false;
1783    };
1784
1785    let mut page_top = f64::MIN;
1786    let mut page_bottom = f64::MAX;
1787    for candidate in &doc.kids {
1788        if candidate.page_number() == Some(page) {
1789            let candidate_bbox = candidate.bbox();
1790            page_top = page_top.max(candidate_bbox.top_y);
1791            page_bottom = page_bottom.min(candidate_bbox.bottom_y);
1792        }
1793    }
1794
1795    if !page_top.is_finite() || !page_bottom.is_finite() {
1796        return false;
1797    }
1798
1799    bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
1800}
1801
1802/// Check whether a pipeline heading sits in the bottom margin of its page.
1803/// Running footers (e.g. "Report Title 21") are sometimes classified as
1804/// headings by the pipeline.  A heading at the page bottom is very unlikely
1805/// to be a real section heading.
1806fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
1807    let element = &doc.kids[idx];
1808    let bbox = element.bbox();
1809    if bbox.height() > 30.0 {
1810        return false;
1811    }
1812
1813    let Some(page) = element.page_number() else {
1814        return false;
1815    };
1816
1817    let mut page_bottom = f64::MAX;
1818    for candidate in &doc.kids {
1819        if candidate.page_number() == Some(page) {
1820            page_bottom = page_bottom.min(candidate.bbox().bottom_y);
1821        }
1822    }
1823
1824    if !page_bottom.is_finite() {
1825        return false;
1826    }
1827
1828    // If this heading is at the very bottom of the page content, skip it.
1829    bbox.bottom_y <= page_bottom + 24.0
1830}
1831
1832/// Demote a pipeline heading that ends with a period when it doesn't look like
1833/// a genuine section heading (e.g. "United Kingdom." or "New Investment (a Challenger).").
1834/// Returns true when the heading should be rendered as a paragraph instead.
1835fn should_demote_period_heading(text: &str) -> bool {
1836    let trimmed = text.trim();
1837    if !trimmed.ends_with('.') {
1838        return false;
1839    }
1840    // Keep numbered section headings: "I. Introduction", "4.2. Results",
1841    // "Activity 4. Determining CEC…"
1842    if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
1843        return false;
1844    }
1845    // Keep headings whose text without the trailing period still looks like a
1846    // proper title — at least 3 words, first word uppercase, and the period
1847    // is clearly sentence-ending rather than part of a title pattern.
1848    let without_dot = trimmed.trim_end_matches('.');
1849    let word_count = without_dot.split_whitespace().count();
1850    // Very short fragments ending with '.' (like "Kingdom.") are almost
1851    // certainly not headings.
1852    if word_count <= 2 {
1853        return true;
1854    }
1855    false
1856}
1857
/// Returns true for headings whose last non-whitespace character is a comma.
/// Such text is a fragment (e.g. footnote references like "29 Pope," or
/// "32 Beawes, 33 M.M.,"), not a section heading.
fn should_demote_comma_heading(text: &str) -> bool {
    matches!(text.trim().chars().next_back(), Some(','))
}
1863
/// Returns true when `text` contains any of a fixed set of mathematical or
/// special symbols that do not occur in real section headings
/// (e.g. "HL ¼", "P ≪ P", "LH þ HL:").
fn should_demote_math_heading(text: &str) -> bool {
    const MATH_MARKERS: &[char] = &[
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];
    text.chars().any(|c| MATH_MARKERS.contains(&c))
}
1887
/// Returns true when `text` contains a percent sign anywhere. Such headings
/// are typically data labels (e.g. "56% AGREE") rather than section titles.
fn should_demote_percentage_heading(text: &str) -> bool {
    text.chars().any(|c| c == '%')
}
1893
/// Demote bibliography entries that start with a 4-digit year and a period.
///
/// The trimmed text must begin with four ASCII digits and a '.', and the
/// period must be followed either by a space (e.g.
/// "2020. Measuring massive multitask...") or by nothing at all (a bare
/// "2020."). Anything else, such as "2020.5", is not treated as a
/// bibliography entry.
fn should_demote_bibliography_heading(text: &str) -> bool {
    let bytes = text.trim().as_bytes();
    // Four digits plus the period is the shortest possible match.
    if bytes.len() < 5 {
        return false;
    }
    let (year, rest) = bytes.split_at(4);
    year.iter().all(u8::is_ascii_digit)
        && rest[0] == b'.'
        && matches!(rest.get(1), None | Some(b' '))
}
1906
/// Remove a trailing standalone page number from heading text.
///
/// E.g. "Chapter 3. Numerical differentiation 35" becomes
/// "Chapter 3. Numerical differentiation". The last space-separated token is
/// dropped only when it consists of 1-4 ASCII digits and at least three words
/// remain in front of it; otherwise the trimmed input is returned unchanged.
fn strip_trailing_page_number(text: &str) -> &str {
    let trimmed = text.trim();
    match trimmed.rsplit_once(' ') {
        Some((heading, page))
            if (1..=4).contains(&page.len())
                && page.bytes().all(|b| b.is_ascii_digit())
                && heading.split_whitespace().count() >= 3 =>
        {
            heading.trim()
        }
        _ => trimmed,
    }
}
1925
/// Try to split a heading that contains a merged subsection number.
///
/// For example, "4 Results 4.1 Experimental Details" should become two
/// headings: "4 Results" and "4.1 Experimental Details".
///
/// Returns the byte offset at which the second heading starts, or `None` when
/// no split is needed. A split point must be preceded by a space, lie at
/// byte offset 3 or later (so the heading's own leading number is skipped),
/// and begin with either `<digits>.<digit>` (e.g. "4.1", "10.2") or
/// `<uppercase letter>.<digit>` (e.g. "B.1").
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    /// True when `rest` begins with a subsection number such as "4.1" or "B.1".
    fn starts_subsection_number(rest: &[u8]) -> bool {
        // Letter form: exactly one uppercase letter, a dot, then a digit.
        if let [letter, b'.', digit, ..] = rest {
            if letter.is_ascii_uppercase() && digit.is_ascii_digit() {
                return true;
            }
        }
        // Numeric form: the dot must directly follow the digit run. A dot
        // that appears somewhere later in the heading (e.g. the "1.5" in
        // "3 models on v1.5") must not make a plain number look like "N.N".
        let int_len = rest.iter().take_while(|b| b.is_ascii_digit()).count();
        int_len > 0
            && rest.get(int_len) == Some(&b'.')
            && rest.get(int_len + 1).is_some_and(u8::is_ascii_digit)
    }

    let bytes = text.as_bytes();
    // The byte before a candidate is an ASCII space, so every returned offset
    // is a valid char boundary of `text`.
    (3..bytes.len()).find(|&i| bytes[i - 1] == b' ' && starts_subsection_number(&bytes[i..]))
}
1961
1962fn should_skip_heading_text(text: &str) -> bool {
1963    let trimmed = text.trim();
1964    if trimmed.is_empty() || is_standalone_page_number(trimmed) {
1965        return true;
1966    }
1967
1968    let lower = trimmed.to_ascii_lowercase();
1969    if (lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit()))
1970        && trimmed.contains('|')
1971    {
1972        return true;
1973    }
1974
1975    let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
1976    let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
1977    alpha_count == 0 || (alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':'))
1978}
1979
/// Re-join words that were split into fragments by the PDF extraction
/// (e.g. "Jurisdic tion Fore ign" becomes "Jurisdiction Foreign").
///
/// The text is tokenised on whitespace. A token is glued onto the word being
/// built when, looking only at the alphabetic core of each side (surrounding
/// non-letters ignored):
/// - both cores are non-empty and purely alphabetic,
/// - at least one core is at most 4 bytes long,
/// - together they are at least 6 bytes long,
/// - the right core does not start with an uppercase letter, and
/// - neither core is a common stopword.
///
/// Tokens are re-joined with single spaces. Input with fewer than two tokens
/// is returned unchanged, including its original whitespace.
fn repair_fragmented_words(text: &str) -> String {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];

    /// The token without leading/trailing non-alphabetic characters.
    fn alphabetic_core(token: &str) -> &str {
        token.trim_matches(|c: char| !c.is_alphabetic())
    }

    let is_stopword = |core: &str| STOPWORDS.iter().any(|w| w.eq_ignore_ascii_case(core));

    let should_join = |left: &str, right: &str| -> bool {
        let left_core = alphabetic_core(left);
        let right_core = alphabetic_core(right);
        !left_core.is_empty()
            && !right_core.is_empty()
            && left_core.chars().all(char::is_alphabetic)
            && right_core.chars().all(char::is_alphabetic)
            && (left_core.len() <= 4 || right_core.len() <= 4)
            && left_core.len() + right_core.len() >= 6
            && !right_core.chars().next().is_some_and(char::is_uppercase)
            && !is_stopword(left_core)
            && !is_stopword(right_core)
    };

    let mut tokens = text.split_whitespace();
    let (Some(first), Some(second)) = (tokens.next(), tokens.next()) else {
        return text.to_string();
    };

    // Single pass: `current` is the word being assembled; each new token is
    // either glued onto it or starts the next word. This avoids cloning both
    // neighbours on every step and shifting the vector with `remove`.
    let mut repaired: Vec<String> = Vec::new();
    let mut current = first.to_string();
    for token in std::iter::once(second).chain(tokens) {
        if should_join(&current, token) {
            current.push_str(token);
        } else {
            repaired.push(std::mem::replace(&mut current, token.to_string()));
        }
    }
    repaired.push(current);

    repaired.join(" ")
}
2020
2021/// Extract text from list item contents (fallback when label/body tokens are empty).
2022fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
2023    let mut text = String::new();
2024    for elem in contents {
2025        let part = match elem {
2026            ContentElement::Paragraph(p) => p.base.value(),
2027            ContentElement::TextBlock(tb) => tb.value(),
2028            ContentElement::TextLine(tl) => tl.value(),
2029            ContentElement::TextChunk(tc) => tc.value.clone(),
2030            _ => String::new(),
2031        };
2032        if !text.is_empty() && !part.is_empty() {
2033            text.push(' ');
2034        }
2035        text.push_str(&part);
2036    }
2037    text
2038}
2039
/// Merge header continuation rows in a rendered table.
///
/// When a PDF table has multi-line column headers, each wrapped line often
/// produces a separate row in the grid. Such continuation rows have an empty
/// first cell while the header row above them has content. Starting right
/// after row 0, every consecutive row whose first cell is blank and whose
/// cells are all at most 30 bytes long (after trimming) is folded into row 0:
/// each non-empty fragment is appended, space-separated, to the header cell in
/// the same column. The folded rows are then removed.
///
/// The length limit avoids collapsing data rows that merely have an empty
/// key. Nothing happens when the table has fewer than two rows or when the
/// first cell of row 0 is blank.
///
/// NOTE(review): a single continuation row is enough to trigger the merge, so
/// a lone sub-header or unit row (e.g. "cmolc/kg") is folded into the header
/// too. Confirm that this is intended.
fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
    const MAX_FRAGMENT_LEN: usize = 30;

    /// True when the row has no first cell or its first cell is blank.
    fn leads_with_blank(row: &[String]) -> bool {
        !row.first().is_some_and(|cell| !cell.trim().is_empty())
    }

    // The first row must have a non-empty first cell (the header anchor).
    if rows.len() < 2 || leads_with_blank(&rows[0]) {
        return;
    }

    // Count the continuation rows directly below the header; the first data
    // row or over-long cell ends the run.
    let continuation_count = rows
        .iter()
        .skip(1)
        .take_while(|row| {
            leads_with_blank(row) && row.iter().all(|cell| cell.trim().len() <= MAX_FRAGMENT_LEN)
        })
        .count();
    if continuation_count == 0 {
        return;
    }

    // Take the continuation rows out of the table, then fold them into the
    // header column by column. Cells beyond the shorter row are ignored.
    let continuations: Vec<Vec<String>> = rows.drain(1..=continuation_count).collect();
    let header = &mut rows[0];
    for row in &continuations {
        for (target, src) in header.iter_mut().zip(row) {
            let fragment = src.trim();
            if fragment.is_empty() {
                continue;
            }
            let merged = match target.trim() {
                "" => fragment.to_string(),
                existing => format!("{} {}", existing, fragment),
            };
            *target = merged;
        }
    }
}
2105
2106/// Render a SemanticTable as a markdown table.
2107fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
2108    // Delegate to render_table_border which handles cross-page linking.
2109    render_table_border(out, &table.table_border);
2110}
2111
2112/// Collect rendered rows from a single TableBorder (no cross-page chaining).
2113fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
2114    let num_cols = table.num_columns.max(1);
2115    let mut rendered_rows: Vec<Vec<String>> = Vec::new();
2116    for row in &table.rows {
2117        let cell_texts: Vec<String> = (0..num_cols)
2118            .map(|col| {
2119                row.cells
2120                    .iter()
2121                    .find(|c| c.col_number == col)
2122                    .map(cell_text_content)
2123                    .unwrap_or_default()
2124            })
2125            .collect();
2126        if !cell_texts.iter().all(|t| t.trim().is_empty()) {
2127            rendered_rows.push(cell_texts);
2128        }
2129    }
2130    rendered_rows
2131}
2132
2133/// Render a TableBorder directly as a markdown table.
2134///
2135/// When the table has a `next_table` link (cross-page continuation), the
2136/// continuation rows are appended so the entire logical table is emitted
2137/// as a single pipe table.
2138fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
2139    if table.rows.is_empty() {
2140        return;
2141    }
2142
2143    let num_cols = table.num_columns.max(1);
2144
2145    // Collect rows from this table.
2146    let mut rendered_rows = collect_table_border_rows(table);
2147
2148    if rendered_rows.is_empty() {
2149        return;
2150    }
2151
2152    // Merge multi-line header rows into a single header row.
2153    merge_continuation_rows(&mut rendered_rows);
2154
2155    // ToC detection: render table-of-contents as plain text pairs, not a markdown table.
2156    if is_toc_table(&rendered_rows) {
2157        render_toc_rows(out, &rendered_rows);
2158        return;
2159    }
2160
2161    for (row_idx, cell_texts) in rendered_rows.iter().enumerate() {
2162        out.push('|');
2163        for cell_text in cell_texts {
2164            out.push_str(&format!(" {} |", cell_text.trim()));
2165        }
2166        out.push('\n');
2167
2168        // Add separator after first row (header)
2169        if row_idx == 0 {
2170            out.push('|');
2171            for _ in 0..num_cols {
2172                out.push_str(" --- |");
2173            }
2174            out.push('\n');
2175        }
2176    }
2177    out.push('\n');
2178}
2179
/// Returns true if the trimmed `text` looks like a page number.
///
/// Accepted forms:
/// - up to 5 ASCII digits, or
/// - up to 10 characters drawn only from the Roman numeral letters
///   `i v x l c d m`, in either case. The letters are not checked for forming
///   a valid numeral, so e.g. "mild" is accepted as well.
fn is_page_number_like(text: &str) -> bool {
    let token = text.trim();
    if token.is_empty() {
        return false;
    }

    let arabic = token.len() <= 5 && token.bytes().all(|b| b.is_ascii_digit());
    let roman = token.len() <= 10
        && token.chars().all(|c| {
            matches!(
                c.to_ascii_lowercase(),
                'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm'
            )
        });
    arabic || roman
}
2197
2198/// Returns true if the rendered rows look like a table-of-contents:
2199/// exactly 2 columns where the majority of right-column cells are page numbers.
2200fn is_toc_table(rows: &[Vec<String>]) -> bool {
2201    if rows.is_empty() {
2202        return false;
2203    }
2204    // Need at least 2 rows to qualify as a ToC
2205    if rows.len() < 2 {
2206        return false;
2207    }
2208    // First, every row must have exactly 2 cells
2209    if !rows.iter().all(|r| r.len() == 2) {
2210        return false;
2211    }
2212
2213    let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
2214    if non_empty_right < 2 {
2215        return false;
2216    }
2217
2218    let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
2219    page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
2220}
2221
/// Render ToC-style rows as plain text lines rather than a Markdown table.
///
/// Each row contributes one line: "title page" when both cells have text,
/// otherwise whichever cell is non-empty. Rows with two blank cells are
/// skipped. A blank line is appended after the last row.
///
/// Every row is expected to have at least two cells; shorter rows panic on
/// indexing.
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let (title, page) = (row[0].trim(), row[1].trim());
        match (title.is_empty(), page.is_empty()) {
            (true, true) => continue,
            (false, false) => {
                out.push_str(title);
                out.push(' ');
                out.push_str(page);
            }
            (false, true) => out.push_str(title),
            (true, false) => out.push_str(page),
        }
        out.push('\n');
    }
    out.push('\n');
}
2242
2243/// Extract text content from a table cell.
2244fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
2245    // First try the content tokens — use gap-based concatenation instead of
2246    // naive space-joining so that letter-spaced text ("O w n e r s h i p")
2247    // is collapsed correctly.
2248    if !cell.content.is_empty() {
2249        let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
2250        return crate::models::text::TextLine::concatenate_chunks(&chunks);
2251    }
2252    // Fall back to processed contents
2253    let mut text = String::new();
2254    for elem in &cell.contents {
2255        match elem {
2256            ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
2257            ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
2258            ContentElement::TextLine(tl) => text.push_str(&tl.value()),
2259            ContentElement::TextChunk(tc) => text.push_str(&tc.value),
2260            _ => {}
2261        }
2262    }
2263    repair_fragmented_words(&text)
2264}
2265
/// Merge neighbouring pipe tables in rendered Markdown into a single table.
///
/// PDF table detection sometimes splits one visual table into several
/// fragments that are emitted as successive pipe tables. A table block is a
/// pipe row followed by a separator line and any number of further pipe rows.
/// Two consecutive blocks are merged when the lines between them are
/// - all blank (or there are none), or
/// - for tables that both have at least two columns: one or two heading lines
///   (starting with '#', shorter than 100 bytes), or a single short plain
///   fragment (shorter than 30 bytes, not starting with '#', '-' or '*',
///   without ':' and without "TABLE").
///
/// On a merge the blank gap lines and the later table's separator are
/// dropped, its header row becomes an ordinary body row, and non-blank gap
/// lines are turned into table rows (leading '#' markers removed). Column
/// counts may differ: all rows of a merged group are padded with empty cells
/// to the widest table in the group and the separator is regenerated to match.
///
/// Input with fewer than four lines or fewer than two table blocks is
/// returned unchanged. Otherwise the text is rebuilt line by line, each line
/// terminated by '\n', with table rows trimmed of surrounding whitespace.
fn merge_adjacent_pipe_tables(markdown: &str) -> String {
    /// True when the trimmed line both starts and ends with '|'.
    fn is_pipe_framed(trimmed: &str) -> bool {
        trimmed.starts_with('|') && trimmed.ends_with('|')
    }

    /// A `|`-framed line with at least one character between the pipes.
    fn is_pipe_row(line: &str) -> bool {
        let t = line.trim();
        t.len() > 2 && is_pipe_framed(t)
    }

    /// Number of cells in a `|`-framed line; 0 when the line is not framed.
    fn count_pipe_cols(line: &str) -> usize {
        let t = line.trim();
        if is_pipe_framed(t) {
            t.split('|').count().saturating_sub(2)
        } else {
            0
        }
    }

    /// A framed line whose cells consist solely of '-' and ':' characters.
    fn is_separator(line: &str) -> bool {
        let t = line.trim();
        if !is_pipe_framed(t) {
            return false;
        }
        let cells: Vec<&str> = t.split('|').collect();
        cells.len() >= 3
            && cells[1..cells.len() - 1].iter().all(|cell| {
                let s = cell.trim();
                !s.is_empty() && s.chars().all(|ch| matches!(ch, '-' | ':'))
            })
    }

    /// The trimmed row, extended with empty cells up to `target_cols`.
    fn pad_pipe_row(line: &str, target_cols: usize) -> String {
        let t = line.trim();
        let missing = target_cols.saturating_sub(count_pipe_cols(t));
        let mut padded = String::with_capacity(t.len() + 3 * missing);
        padded.push_str(t);
        for _ in 0..missing {
            padded.push_str("  |");
        }
        padded
    }

    /// One detected pipe table: header at `start`, separator at `sep`, last
    /// row at `end` (inclusive), `cols` cells in the header.
    struct Block {
        start: usize,
        sep: usize,
        end: usize,
        cols: usize,
    }

    /// What happens to a line when the output is rebuilt.
    #[derive(Clone, Copy)]
    enum Fate {
        Keep,
        Drop,
        GapRow,
    }

    let lines: Vec<&str> = markdown.lines().collect();
    if lines.len() < 4 {
        return markdown.to_string();
    }

    // Pass 1: locate table blocks (header row + separator + body rows).
    let mut blocks: Vec<Block> = Vec::new();
    let mut cursor = 0;
    while cursor + 1 < lines.len() {
        if !(is_pipe_row(lines[cursor]) && is_separator(lines[cursor + 1])) {
            cursor += 1;
            continue;
        }
        let sep = cursor + 1;
        let body_rows = lines[sep + 1..]
            .iter()
            .take_while(|line| is_pipe_row(line) && !is_separator(line))
            .count();
        let end = sep + body_rows;
        blocks.push(Block {
            start: cursor,
            sep,
            end,
            cols: count_pipe_cols(lines[cursor]),
        });
        cursor = end + 1;
    }

    if blocks.len() < 2 {
        return markdown.to_string();
    }

    // Pass 2: decide which blocks continue the group of their predecessor.
    // `leader_of[bi]` is the first block of the group `bi` was merged into;
    // `widest[leader]` tracks the largest column count seen in that group so
    // that chains like [2-col] -> blank -> [1-col] -> heading -> [2-col] are
    // still judged as multi-column.
    let mut leader_of: Vec<Option<usize>> = vec![None; blocks.len()];
    let mut widest: Vec<usize> = blocks.iter().map(|block| block.cols).collect();
    for bi in 1..blocks.len() {
        let (prev, curr) = (&blocks[bi - 1], &blocks[bi]);
        let leader = leader_of[bi - 1].unwrap_or(bi - 1);
        let both_multi_column = widest[leader] >= 2 && curr.cols >= 2;

        let gap_text: Vec<&str> = lines[prev.end + 1..curr.start]
            .iter()
            .map(|line| line.trim())
            .filter(|t| !t.is_empty())
            .collect();

        let bridgeable = if gap_text.is_empty() {
            true
        } else if !both_multi_column {
            false
        } else {
            // Table cells misclassified as headings by the pipeline.
            let heading_only = gap_text.len() <= 2
                && gap_text
                    .iter()
                    .all(|t| t.starts_with('#') && t.len() < 100);
            // A single short plain-text line between two multi-column tables
            // is almost certainly a cell value displaced out of the grid.
            let short_fragment = gap_text.len() == 1 && {
                let t = gap_text[0];
                t.len() < 30
                    && !t.starts_with(['#', '-', '*'])
                    && !t.contains(':')
                    && !t.contains("TABLE")
            };
            heading_only || short_fragment
        };

        if bridgeable && prev.cols > 0 && curr.cols > 0 {
            leader_of[bi] = Some(leader);
            widest[leader] = widest[leader].max(curr.cols);
        }
    }

    let pad_target: Vec<usize> = (0..blocks.len())
        .map(|bi| widest[leader_of[bi].unwrap_or(bi)])
        .collect();

    // Pass 3: classify every line. Blank gap lines and the separator of a
    // merged block disappear; non-blank gap lines become rows of the
    // preceding block. The merged block's header row stays as a data row.
    let mut fate = vec![Fate::Keep; lines.len()];
    let mut owner: Vec<Option<usize>> = vec![None; lines.len()];
    for (bi, block) in blocks.iter().enumerate() {
        owner[block.start..=block.end].fill(Some(bi));
        if leader_of[bi].is_none() {
            continue;
        }
        for li in blocks[bi - 1].end + 1..block.start {
            if lines[li].trim().is_empty() {
                fate[li] = Fate::Drop;
            } else {
                fate[li] = Fate::GapRow;
                owner[li] = Some(bi - 1);
            }
        }
        fate[block.sep] = Fate::Drop;
    }

    // Pass 4: rebuild the document.
    let mut merged = String::new();
    for (li, line) in lines.iter().enumerate() {
        // Column count to pad to; 0 for lines outside any table.
        let target = owner[li].map_or(0, |bi| pad_target[bi]);
        match fate[li] {
            Fate::Drop => continue,
            Fate::GapRow => {
                let text = line.trim().trim_start_matches('#').trim();
                if target > 0 && !text.is_empty() {
                    merged.push_str("| ");
                    merged.push_str(text);
                    merged.push(' ');
                    for _ in 1..target {
                        merged.push_str("|  ");
                    }
                    merged.push('|');
                } else {
                    // No usable table context: emit the line untouched.
                    merged.push_str(line);
                }
            }
            Fate::Keep => {
                if target == 0 {
                    merged.push_str(line);
                } else if is_separator(line) {
                    merged.push('|');
                    for _ in 0..target {
                        merged.push_str(" --- |");
                    }
                } else if is_pipe_row(line) {
                    merged.push_str(&pad_pipe_row(line, target));
                } else {
                    merged.push_str(line);
                }
            }
        }
        merged.push('\n');
    }

    merged
}
2517
#[cfg(test)]
mod tests {
    //! Unit tests for the Markdown output generator.
    //!
    //! The fixture builders below assemble minimal single-page documents
    //! (headings, paragraphs and two-column bordered tables) so that the
    //! rendering heuristics can be exercised without parsing a real PDF.

    use super::*;
    use crate::models::bbox::BoundingBox;
    use crate::models::chunks::TextChunk;
    use crate::models::content::ContentElement;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};
    use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
    use crate::models::table::{
        TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
    };
    use crate::models::text::{TextBlock, TextColumn, TextLine};

    // ------------------------------------------------------------------
    // Shared fixture helpers
    // ------------------------------------------------------------------

    /// Builds a page-1 `TextChunk` holding `value` inside `bbox`.
    ///
    /// `bold` selects a font weight of 700.0 (true) or 400.0 (false); all
    /// remaining fields use the same fixed values for every fixture.
    fn fixture_chunk(
        value: &str,
        bbox: BoundingBox,
        font_name: &str,
        font_size: f64,
        bold: bool,
        font_color: &str,
    ) -> TextChunk {
        TextChunk {
            value: value.to_string(),
            bbox,
            font_name: font_name.to_string(),
            font_size,
            font_weight: if bold { 700.0 } else { 400.0 },
            italic_angle: 0.0,
            font_color: font_color.to_string(),
            contrast_ratio: 21.0,
            symbol_ends: Vec::new(),
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        }
    }

    /// Wraps `chunk` in a one-line, one-block `TextColumn`.
    ///
    /// The line, block and column all share `bbox`, `base_line` and the
    /// chunk's font size.
    fn single_line_column(chunk: TextChunk, bbox: &BoundingBox, base_line: f64) -> TextColumn {
        let font_size = chunk.font_size;
        let line = TextLine {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size,
            base_line,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_chunks: vec![chunk],
            is_line_start: true,
            is_line_end: true,
            is_list_line: false,
            connected_line_art_label: None,
        };
        let block = TextBlock {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size,
            base_line,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_lines: vec![line],
            has_start_line: true,
            has_end_line: true,
            text_alignment: None,
        };
        TextColumn {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size,
            base_line,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_blocks: vec![block],
        }
    }

    /// Builds the `SemanticTextNode` shared by heading and paragraph fixtures.
    ///
    /// `font_size` is used for both `font_size` and `max_font_size`; `bold`
    /// selects a font weight of 700.0 (true) or 400.0 (false).
    fn fixture_text_node(
        bbox: BoundingBox,
        semantic_type: SemanticType,
        column: TextColumn,
        font_name: &str,
        font_size: f64,
        bold: bool,
    ) -> SemanticTextNode {
        SemanticTextNode {
            bbox,
            index: None,
            level: None,
            semantic_type,
            correct_semantic_score: None,
            columns: vec![column],
            font_weight: Some(if bold { 700.0 } else { 400.0 }),
            font_size: Some(font_size),
            text_color: None,
            italic_angle: None,
            font_name: Some(font_name.to_string()),
            text_format: None,
            max_font_size: Some(font_size),
            background_color: None,
            is_hidden_text: false,
        }
    }

    /// Wraps `chunk` in a plain-text `TableToken`.
    fn text_token(chunk: TextChunk) -> TableToken {
        TableToken {
            base: chunk,
            token_type: TableTokenType::Text,
        }
    }

    /// Builds an unspanned (1x1) table cell at the given grid position.
    fn fixture_cell(
        bbox: BoundingBox,
        row_number: usize,
        col_number: usize,
        content: Vec<TableToken>,
    ) -> TableBorderCell {
        TableBorderCell {
            bbox,
            index: None,
            level: None,
            row_number,
            col_number,
            row_span: 1,
            col_span: 1,
            content,
            contents: vec![],
            semantic_type: None,
        }
    }

    // ------------------------------------------------------------------
    // Document element builders
    // ------------------------------------------------------------------

    /// Builds a level-1 bold heading with a fixed bounding box on page 1.
    fn make_heading(text: &str) -> ContentElement {
        let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
        let chunk = fixture_chunk(text, bbox.clone(), "Lato-Bold", 12.0, true, "#000000");
        let column = single_line_column(chunk, &bbox, 702.0);
        ContentElement::Heading(SemanticHeading {
            base: SemanticParagraph {
                base: fixture_text_node(
                    bbox,
                    SemanticType::Heading,
                    column,
                    "Lato-Bold",
                    12.0,
                    true,
                ),
                enclosed_top: false,
                enclosed_bottom: false,
                indentation: 0,
            },
            heading_level: Some(1),
        })
    }

    /// Builds a single-line regular-weight paragraph on page 1 spanning
    /// `bottom..top` vertically.
    ///
    /// The chunk/line font size is the box height clamped to at least 1.0,
    /// while the semantic node records the unclamped height.
    fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
        let bbox = BoundingBox::new(Some(1), 72.0, bottom, 300.0, top);
        let height = top - bottom;
        let chunk = fixture_chunk(
            text,
            bbox.clone(),
            "Lato-Regular",
            height.max(1.0),
            false,
            "#000000",
        );
        let column = single_line_column(chunk, &bbox, bottom + 2.0);
        ContentElement::Paragraph(SemanticParagraph {
            base: fixture_text_node(
                bbox,
                SemanticType::Paragraph,
                column,
                "Lato-Regular",
                height,
                false,
            ),
            enclosed_top: false,
            enclosed_bottom: false,
            indentation: 0,
        })
    }

    /// Builds a two-column bordered table of `(title, page)` rows, laid out
    /// like a table of contents (wide title column, narrow page column).
    ///
    /// NOTE(review): the table-level `bbox`, `y_coordinates` and `y_widths`
    /// are fixed values and do not scale with `rows.len()`; they are kept
    /// as-is so the fixture stays identical for the existing tests.
    fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
        let table_rows: Vec<TableBorderRow> = rows
            .iter()
            .enumerate()
            .map(|(ri, (title, page))| {
                // Rows are 12 units tall and stacked every 18 units from y = 680.
                let top = 680.0 - ri as f64 * 18.0;
                let bottom = top - 12.0;
                let cells: Vec<TableBorderCell> = [(*title, 72.0, 280.0), (*page, 320.0, 360.0)]
                    .into_iter()
                    .enumerate()
                    .map(|(col_number, (text, left, right))| {
                        let cell_bbox = BoundingBox::new(Some(1), left, bottom, right, top);
                        let chunk = fixture_chunk(
                            text,
                            cell_bbox.clone(),
                            "Lato-Regular",
                            10.0,
                            false,
                            "#000000",
                        );
                        fixture_cell(cell_bbox, ri, col_number, vec![text_token(chunk)])
                    })
                    .collect();
                TableBorderRow {
                    bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
                    index: None,
                    level: None,
                    row_number: ri,
                    cells,
                    semantic_type: None,
                }
            })
            .collect();

        ContentElement::TableBorder(TableBorder {
            bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
            index: None,
            level: Some("1".to_string()),
            x_coordinates: vec![72.0, 320.0, 360.0],
            x_widths: vec![0.0, 0.0, 0.0],
            y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
            y_widths: vec![0.0, 0.0, 0.0, 0.0],
            rows: table_rows,
            num_rows: rows.len(),
            num_columns: 2,
            is_bad_table: false,
            is_table_transformer: false,
            previous_table: None,
            next_table: None,
        })
    }

    /// Builds a generic two-column bordered table from `(left, right)` rows.
    ///
    /// An empty string produces a cell with no tokens at all, which lets
    /// tests model genuinely blank cells.
    fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
        let table_rows: Vec<TableBorderRow> = rows
            .iter()
            .enumerate()
            .map(|(row_number, (left, right))| {
                // Rows are 16 units tall and stacked every 18 units from y = 656.
                let top = 656.0 - row_number as f64 * 18.0;
                let bottom = top - 16.0;
                let cells: Vec<TableBorderCell> = [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
                    .into_iter()
                    .enumerate()
                    .map(|(col_number, (text, left_x, right_x))| {
                        let cell_bbox = BoundingBox::new(Some(1), left_x, bottom, right_x, top);
                        let content = if text.is_empty() {
                            Vec::new()
                        } else {
                            let chunk =
                                fixture_chunk(text, cell_bbox.clone(), "Test", 11.0, false, "[0.0]");
                            vec![text_token(chunk)]
                        };
                        fixture_cell(cell_bbox, row_number, col_number, content)
                    })
                    .collect();
                TableBorderRow {
                    bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
                    index: None,
                    level: None,
                    row_number,
                    cells,
                    semantic_type: None,
                }
            })
            .collect();

        ContentElement::TableBorder(TableBorder {
            bbox: BoundingBox::new(
                Some(1),
                72.0,
                656.0 - rows.len() as f64 * 18.0 - 16.0,
                420.0,
                656.0,
            ),
            index: None,
            level: Some("1".to_string()),
            x_coordinates: vec![72.0, 220.0, 420.0],
            x_widths: vec![0.0; 3],
            y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
            y_widths: vec![0.0; rows.len() + 1],
            rows: table_rows,
            num_rows: rows.len(),
            num_columns: 2,
            is_bad_table: false,
            is_table_transformer: false,
            previous_table: None,
            next_table: None,
        })
    }

    // ------------------------------------------------------------------
    // Title handling
    // ------------------------------------------------------------------

    /// A document without any content emits the placeholder line.
    #[test]
    fn test_empty_doc() {
        let doc = PdfDocument::new("test.pdf".to_string());
        let md = to_markdown(&doc).unwrap();
        assert!(md.contains("No content extracted"));
    }

    /// A non-empty title is rendered as a level-1 heading at the top.
    #[test]
    fn test_with_title() {
        let mut doc = PdfDocument::new("test.pdf".to_string());
        doc.title = Some("My Title".to_string());
        let md = to_markdown(&doc).unwrap();
        assert!(md.starts_with("# My Title\n"));
    }

    /// A whitespace-only title must not produce a heading marker.
    #[test]
    fn test_empty_title_not_rendered() {
        let mut doc = PdfDocument::new("test.pdf".to_string());
        doc.title = Some("  ".to_string());
        let md = to_markdown(&doc).unwrap();
        assert!(
            !md.contains("# "),
            "Empty/whitespace title should not produce a heading"
        );
    }

    // ------------------------------------------------------------------
    // Text repair and merge heuristics
    // ------------------------------------------------------------------

    /// Words split by stray spaces are glued back together.
    #[test]
    fn test_repair_fragmented_words() {
        assert_eq!(
            repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts"),
            "Jurisdiction Foreign Requirements"
        );
    }

    /// A bibliography entry continued on the next paragraph is merged.
    #[test]
    fn test_reference_continuation_detected() {
        assert!(should_merge_paragraph_text(
            "Scaling laws for transfer.",
            "arXiv preprint arXiv:2102.01293."
        ));
    }

    // ------------------------------------------------------------------
    // Table-of-contents rendering
    // ------------------------------------------------------------------

    /// A "CONTENTS" heading followed by a title/page table renders each row
    /// as "title page" text.
    #[test]
    fn test_contents_document_renders_toc_table_rows() {
        let mut doc = PdfDocument::new("contents.pdf".to_string());
        doc.kids.push(make_heading("CONTENTS"));
        doc.kids.push(make_toc_table(&[
            ("Experiment #1: Hydrostatic Pressure", "3"),
            ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
            ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
            ("Experiment #4: Energy Loss in Pipes", "33"),
            ("Experiment #5: Impact of a Jet", "43"),
            ("Experiment #6: Orifice and Free Jet Flow", "50"),
            ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
            ("References", "101"),
        ]));

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains("Experiment #1: Hydrostatic Pressure 3"));
        assert!(md.contains("Experiment #2: Bernoulli's Theorem Demonstration 13"));
        assert!(md.contains("Experiment #7: Osborne Reynolds' Demonstration 59"));
        assert!(md.contains("References 101"));
    }

    /// Consecutive paragraphs tagged `TableOfContent` are separated by a
    /// single newline rather than a blank line.
    #[test]
    fn test_toc_semantic_paragraphs_render_without_blank_lines() {
        let mut doc = PdfDocument::new("toc-semantic.pdf".to_string());
        let entries = [
            (
                "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
                700.0,
                712.0,
            ),
            ("Section 5.1: The Linear Model 35", 684.0, 696.0),
        ];
        for (text, bottom, top) in entries {
            let mut entry = make_paragraph(text, bottom, top);
            if let ContentElement::Paragraph(p) = &mut entry {
                p.base.semantic_type = SemanticType::TableOfContent;
            }
            doc.kids.push(entry);
        }

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains(
            "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n"
        ));
    }

    /// A document made only of TOC-looking paragraphs is rendered compactly,
    /// without blank lines between entries.
    #[test]
    fn test_compact_toc_document_renders_without_blank_lines() {
        let mut doc = PdfDocument::new("compact-toc.pdf".to_string());
        // (text, bottom); every entry is 12 units tall.
        let entries = [
            (
                "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
                700.0,
            ),
            ("Section 5.1: The Linear Model 35", 684.0),
            (
                "Part VI. Chapter Six - Comparing Three or More Group Means",
                668.0,
            ),
            ("Section 6.1: Between Versus Within Group Analyses 49", 652.0),
            (
                "Part VII. Chapter Seven - Moderation and Mediation Analyses",
                636.0,
            ),
            ("Section 7.1: Mediation and Moderation Models 64", 620.0),
            ("References 101", 604.0),
            ("Section 8.1: Factor Analysis Definitions 75", 588.0),
        ];
        for (text, bottom) in entries {
            doc.kids.push(make_paragraph(text, bottom, bottom + 12.0));
        }

        let md = to_markdown(&doc).unwrap();
        assert!(!md.contains("\n\nSection 5.1: The Linear Model 35"));
        assert!(md.contains("Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35"));
    }

    // ------------------------------------------------------------------
    // Caption handling
    // ------------------------------------------------------------------

    /// A caption fused with the following body text is split into two
    /// paragraphs after the caption's closing credit.
    #[test]
    fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
        let mut doc = PdfDocument::new("caption-body.pdf".to_string());
        doc.kids.push(make_paragraph(
            "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.",
            500.0,
            540.0,
        ));

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains("USGS)\n\nOur nearest astronomical neighbor"));
    }

    /// A bare "Diagram N" label is joined to its caption tail, and the body
    /// text that follows starts a new paragraph.
    #[test]
    fn test_short_caption_label_merges_with_following_tail_and_body() {
        let mut doc = PdfDocument::new("diagram-caption.pdf".to_string());
        doc.kids.push(make_paragraph("Diagram 5", 540.0, 552.0));
        doc.kids.push(make_paragraph(
            "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.",
            520.0,
            532.0,
        ));

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains(
            "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers"
        ));
    }

    /// A bare "Figure N" label is joined to its caption tail and a trailing
    /// standalone year without blank lines in between.
    #[test]
    fn test_short_caption_label_merges_with_tail_and_year() {
        let mut doc = PdfDocument::new("figure-caption.pdf".to_string());
        doc.kids.push(make_paragraph("Figure 4", 540.0, 552.0));
        doc.kids.push(make_paragraph(
            "Komnas HAM's YouTube channel as of 1 December",
            520.0,
            532.0,
        ));
        doc.kids.push(make_paragraph("2021", 500.0, 512.0));

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains("Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021"));
        assert!(!md.contains("\n\n2021"));
    }

    // ------------------------------------------------------------------
    // Paragraph handling
    // ------------------------------------------------------------------

    /// A numeric label in the middle of the page is kept, while a lone
    /// number near the bottom edge is dropped as a page number.
    #[test]
    fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
        let mut doc = PdfDocument::new("chart.pdf".to_string());
        let paragraphs = [
            ("Figure 1", 760.0, 772.0),
            ("100", 520.0, 528.0),
            ("Body text continues here.", 400.0, 412.0),
            ("36", 20.0, 28.0),
        ];
        for (text, bottom, top) in paragraphs {
            doc.kids.push(make_paragraph(text, bottom, top));
        }

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains("100"));
        assert!(!md.lines().any(|line| line.trim() == "36"));
    }

    /// Two complete sentences in separate paragraphs stay separated by a
    /// blank line.
    #[test]
    fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
        let mut doc = PdfDocument::new("paragraphs.pdf".to_string());
        doc.kids.push(make_paragraph(
            "First semantic paragraph ends here.",
            520.0,
            532.0,
        ));
        doc.kids.push(make_paragraph(
            "Second semantic paragraph starts here.",
            500.0,
            512.0,
        ));

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains(
            "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here."
        ));
    }

    /// A paragraph without terminal punctuation followed by a lowercase
    /// continuation is merged into one sentence.
    #[test]
    fn test_lowercase_semantic_paragraph_continuation_is_merged() {
        let mut doc = PdfDocument::new("continuation.pdf".to_string());
        doc.kids.push(make_paragraph(
            "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference",
            520.0,
            532.0,
        ));
        doc.kids.push(make_paragraph("of interest.", 500.0, 512.0));

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains(
            "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest."
        ));
    }

    // ------------------------------------------------------------------
    // Table rendering
    // ------------------------------------------------------------------

    /// A two-column table with numeric right-hand values is rendered as a
    /// pipe table, not as a table of contents.
    #[test]
    fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
        let mut doc = PdfDocument::new("cec-table.pdf".to_string());
        doc.number_of_pages = 1;
        doc.kids.push(make_two_column_table(&[
            ("Mineral or colloid type", "CEC of pure colloid"),
            ("", "cmolc/kg"),
            ("kaolinite", "10"),
            ("illite", "30"),
        ]));

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains("| --- | --- |"));
        assert!(md.contains("| kaolinite | 10 |"));
    }

    /// A two-column table whose right column is blank below the header is
    /// still rendered as a pipe table with empty cells.
    #[test]
    fn test_blank_right_column_table_is_not_misrendered_as_toc() {
        let mut doc = PdfDocument::new("flocculation-table.pdf".to_string());
        doc.number_of_pages = 1;
        doc.kids.push(make_two_column_table(&[
            (
                "Added cation",
                "Relative Size & Settling Rates of Floccules",
            ),
            ("K+", ""),
            ("Na+", ""),
            ("Ca2+", ""),
        ]));

        let md = to_markdown(&doc).unwrap();
        assert!(md.contains("| Added cation | Relative Size & Settling Rates of Floccules |"));
        assert!(md.contains("| K+ |  |"));
    }

    /// A heading sandwiched between two pipe tables is folded into the
    /// merged table as a row instead of staying a `#` heading.
    #[test]
    fn test_merge_tables_across_heading() {
        let input = "some text\n\n\
                      | Area | Competence |\n\
                      | --- | --- |\n\
                      | Row1 | Val1 |\n\
                      | Row2 | Val2 |\n\
                      \n\
                      # Heading Between\n\
                      \n\
                      | Row3 | Val3 |\n\
                      | --- | --- |\n\
                      \n\
                      more text\n";
        let result = merge_adjacent_pipe_tables(input);
        // Heading should be converted to a pipe row
        assert!(
            result.contains("| Heading Between |"),
            "Heading should be in pipe row: {}",
            result
        );
        // Should NOT have # heading marker
        assert!(
            !result.contains("# Heading Between"),
            "Heading marker should be removed: {}",
            result
        );
        // Row3 should still be present (in whatever cell layout the merge chose)
        assert!(result.contains("Row3"), "Row3 should exist: {}", result);
    }
}