1use crate::models::content::ContentElement;
4use crate::models::document::PdfDocument;
5use crate::models::enums::SemanticType;
6use crate::models::table::TableTokenRow;
7use crate::EdgePdfError;
8
/// Convert a parsed [`PdfDocument`] into a Markdown string.
///
/// Walks `doc.kids` in order, applying heuristics that demote mis-tagged
/// headings to plain paragraphs, merge fragmented paragraphs and captions,
/// and re-split heading titles fused together during extraction.
/// Whole-document table-of-contents layouts are detected up front and
/// rendered by dedicated helpers.
pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
    // Whole-document special cases: TOC-like documents get a plain-line
    // rendering instead of the element-by-element pass below.
    if looks_like_contents_document(doc) {
        return Ok(render_contents_document(doc));
    }
    if looks_like_compact_toc_document(doc) {
        return Ok(render_compact_toc_document(doc));
    }

    let mut output = String::new();

    // Document title: skipped when the body opens with a different heading;
    // rendered as plain text for short titles on table-like documents.
    if let Some(ref title) = doc.title {
        let trimmed = title.trim();
        if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
            if should_render_document_title_as_plaintext(doc, trimmed) {
                output.push_str(trimmed);
                output.push_str("\n\n");
            } else {
                output.push_str(&format!("# {}\n\n", trimmed));
            }
        }
    }

    if doc.kids.is_empty() {
        output.push_str("*No content extracted.*\n");
        return Ok(output);
    }

    // Manual index loop: several branches consume more than one element
    // (look-ahead merges), so `i` advances by varying amounts.
    let mut i = 0usize;
    while i < doc.kids.len() {
        match &doc.kids[i] {
            ContentElement::Heading(h) => {
                let text = h.base.base.value();
                let trimmed = text.trim();
                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
                    i += 1;
                    continue;
                }

                // Headings that are really page furniture or body text are
                // demoted to plain paragraph output.
                if looks_like_bottom_margin_heading(doc, i) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if should_demote_period_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if should_demote_comma_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if should_demote_math_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if should_demote_percentage_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if starts_with_caption_prefix(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if should_demote_bibliography_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                // Heading that is actually the first line of the following
                // paragraph: merge both and emit as body text (skips two
                // elements).
                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
                        let mut merged = trimmed.to_string();
                        merge_paragraph_text(&mut merged, &next_text);
                        output.push_str(&escape_md_line_start(merged.trim()));
                        output.push_str("\n\n");
                        i += 2;
                        continue;
                    }
                }

                // Re-join a heading split across consecutive Heading
                // elements, capped at 200 chars to avoid swallowing body
                // text; empty/skippable fragments are stepped over.
                let mut merged_heading = trimmed.to_string();
                while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
                    let next_text = next_h.base.base.value();
                    let next_trimmed = next_text.trim();
                    if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
                        i += 1;
                        continue;
                    }
                    if merged_heading.len() + 1 + next_trimmed.len() > 200 {
                        break;
                    }
                    merge_paragraph_text(&mut merged_heading, next_trimmed);
                    i += 1;
                }

                let cleaned_heading = strip_trailing_page_number(merged_heading.trim());

                // Two subsection titles fused into one line are re-split.
                if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
                    let first = cleaned_heading[..split_pos].trim();
                    let second = cleaned_heading[split_pos..].trim();
                    output.push_str(&format!("# {}\n\n", first));
                    output.push_str(&format!("# {}\n\n", second));
                } else {
                    output.push_str(&format!("# {}\n\n", cleaned_heading));
                }
            }
            ContentElement::NumberHeading(nh) => {
                let text = nh.base.base.base.value();
                let trimmed = text.trim();
                if trimmed.is_empty() || should_skip_heading_text(trimmed) {
                    i += 1;
                    continue;
                }

                if should_demote_comma_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if should_demote_math_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if should_demote_percentage_heading(trimmed) {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                    if should_demote_heading_to_paragraph(trimmed, &next_text) {
                        let mut merged = trimmed.to_string();
                        merge_paragraph_text(&mut merged, &next_text);
                        output.push_str(&escape_md_line_start(merged.trim()));
                        output.push_str("\n\n");
                        i += 2;
                        continue;
                    }
                }

                let cleaned = strip_trailing_page_number(trimmed);

                if let Some(split_pos) = find_merged_subsection_split(cleaned) {
                    let first = cleaned[..split_pos].trim();
                    let second = cleaned[split_pos..].trim();
                    output.push_str(&format!("# {}\n\n", first));
                    output.push_str(&format!("# {}\n\n", second));
                } else {
                    output.push_str(&format!("# {}\n\n", cleaned));
                }
            }
            ContentElement::Paragraph(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_) => {
                let element = &doc.kids[i];
                let text = match &doc.kids[i] {
                    ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
                    ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
                    ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
                    // Guarded by the outer match arm pattern.
                    _ => unreachable!(),
                };
                let trimmed = text.trim();
                if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
                    i += 1;
                    continue;
                }

                // Paragraph mis-tagged during extraction that should be a
                // heading: promote it, with the same fused-title re-split.
                if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
                    let cleaned = strip_trailing_page_number(trimmed);
                    if let Some(split_pos) = find_merged_subsection_split(cleaned) {
                        let first = cleaned[..split_pos].trim();
                        let second = cleaned[split_pos..].trim();
                        output.push_str(&format!("# {}\n\n", first));
                        output.push_str(&format!("# {}\n\n", second));
                    } else {
                        output.push_str(&format!("# {}\n\n", cleaned));
                    }
                    i += 1;
                    continue;
                }

                // TOC entries stay on adjacent lines (single newline).
                if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
                {
                    output.push_str(&escape_md_line_start(trimmed));
                    output.push('\n');
                    i += 1;
                    continue;
                }

                // Short caption label ("Figure 3") followed by the caption's
                // text — possibly fused with body text, possibly followed by
                // a standalone 4-digit year line.
                if is_short_caption_label(trimmed) {
                    if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                        if let Some((caption_tail, body)) =
                            split_following_caption_tail_and_body(&next_text)
                        {
                            let mut caption = trimmed.to_string();
                            caption.push('\n');
                            caption.push_str(caption_tail);
                            output.push_str(&escape_md_line_start(caption.trim()));
                            output.push_str("\n\n");
                            output.push_str(&escape_md_line_start(body));
                            output.push_str("\n\n");
                            i += 2;
                            continue;
                        }

                        if looks_like_caption_tail(&next_text) {
                            let mut caption = trimmed.to_string();
                            caption.push('\n');
                            caption.push_str(next_text.trim());

                            // A bare year after the tail is folded in as a
                            // third caption line; the extra `i += 1` here
                            // plus the `i += 2` below skips all three.
                            if let Some(year_text) =
                                next_mergeable_paragraph_text(doc.kids.get(i + 2))
                            {
                                if looks_like_caption_year(&year_text) {
                                    caption.push('\n');
                                    caption.push_str(year_text.trim());
                                    i += 1;
                                }
                            }

                            output.push_str(&escape_md_line_start(caption.trim()));
                            output.push_str("\n\n");
                            i += 2;
                            continue;
                        }
                    }
                }

                // Caption fused onto the front of a body paragraph.
                if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
                    output.push_str(&escape_md_line_start(caption));
                    output.push_str("\n\n");
                    output.push_str(&escape_md_line_start(body));
                    output.push_str("\n\n");
                    i += 1;
                    continue;
                }

                // Greedily merge continuation fragments into one paragraph.
                // Paragraph elements use the stricter semantic merge test.
                let mut merged = trimmed.to_string();
                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
                        should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
                    } else {
                        should_merge_paragraph_text(&merged, &next_text)
                    };
                    if !can_merge {
                        break;
                    }
                    merge_paragraph_text(&mut merged, &next_text);
                    i += 1;
                }

                output.push_str(&escape_md_line_start(merged.trim()));
                output.push_str("\n\n");
            }
            other => render_element(&mut output, other),
        }
        i += 1;
    }

    // Post-pass: pipe tables split across page breaks are stitched together.
    let output = merge_adjacent_pipe_tables(&output);

    Ok(output)
}
326
327fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
328 first_heading_like_text(doc)
329 .filter(|first| !equivalent_heading_text(first, title))
330 .is_some()
331}
332
333fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
334 if title.split_whitespace().count() > 6 {
335 return false;
336 }
337
338 let mut early = doc.kids.iter().take(6);
339 let has_explicit_heading = early.clone().any(|element| {
340 matches!(
341 element,
342 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
343 )
344 });
345 let has_tableish_content = early.any(|element| {
346 matches!(
347 element,
348 ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
349 )
350 });
351
352 has_tableish_content && !has_explicit_heading
353}
354
/// Return the text of the first element (among the first 8 kids) that is, or
/// would be rendered as, a heading.
///
/// Explicit `Heading`/`NumberHeading` elements count whenever non-empty;
/// paragraph-like elements count only when the promotion heuristics would
/// turn them into a heading at render time.
fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
    for (idx, element) in doc.kids.iter().enumerate().take(8) {
        match element {
            ContentElement::Heading(h) => {
                let text = h.base.base.value();
                let trimmed = text.trim();
                if !trimmed.is_empty() {
                    return Some(trimmed.to_string());
                }
            }
            ContentElement::NumberHeading(nh) => {
                let text = nh.base.base.base.value();
                let trimmed = text.trim();
                if !trimmed.is_empty() {
                    return Some(trimmed.to_string());
                }
            }
            ContentElement::Paragraph(p) => {
                let text = clean_paragraph_text(&p.base.value());
                let trimmed = text.trim();
                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
                    return Some(trimmed.to_string());
                }
            }
            ContentElement::TextBlock(tb) => {
                let text = clean_paragraph_text(&tb.value());
                let trimmed = text.trim();
                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
                    return Some(trimmed.to_string());
                }
            }
            ContentElement::TextLine(tl) => {
                let text = clean_paragraph_text(&tl.value());
                let trimmed = text.trim();
                if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
                    return Some(trimmed.to_string());
                }
            }
            _ => {}
        }
    }
    None
}
398
399fn equivalent_heading_text(left: &str, right: &str) -> bool {
400 normalize_heading_text(left) == normalize_heading_text(right)
401}
402
/// Reduce heading text to its lowercase alphanumeric characters only,
/// discarding spaces and punctuation (e.g. "Table of Contents!" →
/// "tableofcontents").
fn normalize_heading_text(text: &str) -> String {
    let mut normalized = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // `to_lowercase` may yield multiple chars for some scripts.
            normalized.extend(ch.to_lowercase());
        }
    }
    normalized
}
409
/// Detect a document that is entirely a "Contents" page.
///
/// True when the first heading-like text normalizes to "contents" or
/// "table of contents", there are at least 8 plain lines, and at least 60%
/// of the lines after the title end with a page-number marker.
fn looks_like_contents_document(doc: &PdfDocument) -> bool {
    let Some(first) = first_heading_like_text(doc) else {
        return false;
    };
    if !matches!(
        normalize_heading_text(&first).as_str(),
        "contents" | "tableofcontents"
    ) {
        return false;
    }

    let lines = collect_plain_lines(doc);
    if lines.len() < 8 {
        return false;
    }

    // Entries after the title line that end in a page number.
    let page_like = lines
        .iter()
        .skip(1)
        .filter(|line| ends_with_page_marker(line))
        .count();
    // Integer form of: page_like / (lines - 1) >= 0.6.
    page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
}
433
434fn render_contents_document(doc: &PdfDocument) -> String {
435 let lines = collect_plain_lines(doc);
436 let mut out = String::new();
437
438 let mut iter = lines.into_iter();
439 if let Some(first) = iter.next() {
440 out.push_str("# ");
441 out.push_str(first.trim());
442 out.push_str("\n\n");
443 }
444 for line in iter {
445 let trimmed = line.trim();
446 if trimmed.is_empty() {
447 continue;
448 }
449 out.push_str(trimmed);
450 out.push('\n');
451 }
452 out.push('\n');
453 out
454}
455
456fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
457 let lines = collect_plain_lines(doc);
458 if lines.len() < 8 {
459 return false;
460 }
461
462 let page_like = lines
463 .iter()
464 .filter(|line| ends_with_page_marker(line))
465 .count();
466 let support_like = lines
467 .iter()
468 .filter(|line| looks_like_toc_support_heading(line))
469 .count();
470
471 page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
472}
473
474fn render_compact_toc_document(doc: &PdfDocument) -> String {
475 let mut out = String::new();
476 for line in collect_plain_lines(doc) {
477 let trimmed = line.trim();
478 if trimmed.is_empty() {
479 continue;
480 }
481 out.push_str(trimmed);
482 out.push('\n');
483 }
484 out.push('\n');
485 out
486}
487
/// Flatten the document into plain text lines — one per text element, per
/// list item, or per table row — skipping anything empty after trimming.
///
/// Feeds both the TOC detectors and the TOC renderers.
fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
    let mut lines = Vec::new();
    for element in &doc.kids {
        match element {
            ContentElement::Heading(h) => {
                let text = clean_paragraph_text(&h.base.base.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::NumberHeading(nh) => {
                let text = clean_paragraph_text(&nh.base.base.base.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::Paragraph(p) => {
                let text = clean_paragraph_text(&p.base.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::TextBlock(tb) => {
                let text = clean_paragraph_text(&tb.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::TextLine(tl) => {
                let text = clean_paragraph_text(&tl.value());
                if !text.trim().is_empty() {
                    lines.push(text);
                }
            }
            ContentElement::List(list) => {
                // One line per item: "label body", body alone, label alone,
                // or the raw item contents as a last resort.
                for item in &list.list_items {
                    let label = token_rows_text(&item.label.content);
                    let body = token_rows_text(&item.body.content);
                    let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
                        format!("{} {}", label.trim(), body.trim())
                    } else if !body.trim().is_empty() {
                        body.trim().to_string()
                    } else if !label.trim().is_empty() {
                        label.trim().to_string()
                    } else {
                        list_item_text_from_contents(&item.contents)
                            .trim()
                            .to_string()
                    };
                    if !combined.trim().is_empty() {
                        lines.push(combined);
                    }
                }
            }
            ContentElement::Table(table) => {
                extend_contents_lines_from_rows(
                    &mut lines,
                    collect_rendered_table_rows(
                        &table.table_border.rows,
                        table.table_border.num_columns,
                    ),
                );
            }
            ContentElement::TableBorder(table) => {
                extend_contents_lines_from_rows(
                    &mut lines,
                    collect_rendered_table_rows(&table.rows, table.num_columns),
                );
            }
            _ => {}
        }
    }
    lines
}
562
563fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
564 if rows.is_empty() {
565 return;
566 }
567
568 if is_toc_table(&rows) {
569 for row in &rows {
570 let title = row.first().map(|s| s.trim()).unwrap_or("");
571 let page = row.get(1).map(|s| s.trim()).unwrap_or("");
572 let combined = if !title.is_empty() && !page.is_empty() {
573 format!("{title} {page}")
574 } else {
575 format!("{title}{page}")
576 };
577 if !combined.trim().is_empty() {
578 lines.push(combined);
579 }
580 }
581 } else {
582 for row in &rows {
584 let combined: String = row
585 .iter()
586 .map(|c| c.trim())
587 .filter(|c| !c.is_empty())
588 .collect::<Vec<_>>()
589 .join(" ");
590 if !combined.is_empty() {
591 lines.push(combined);
592 }
593 }
594 }
595}
596
597fn collect_rendered_table_rows(
598 rows: &[crate::models::table::TableBorderRow],
599 num_cols: usize,
600) -> Vec<Vec<String>> {
601 let num_cols = num_cols.max(1);
602 let mut rendered_rows: Vec<Vec<String>> = Vec::new();
603
604 for row in rows {
605 let cell_texts: Vec<String> = (0..num_cols)
606 .map(|col| {
607 row.cells
608 .iter()
609 .find(|c| c.col_number == col)
610 .map(cell_text_content)
611 .unwrap_or_default()
612 })
613 .collect();
614 if !cell_texts.iter().all(|t| t.trim().is_empty()) {
615 rendered_rows.push(cell_texts);
616 }
617 }
618
619 rendered_rows
620}
621
622fn ends_with_page_marker(text: &str) -> bool {
623 text.split_whitespace()
624 .last()
625 .is_some_and(is_page_number_like)
626}
627
628fn looks_like_toc_support_heading(text: &str) -> bool {
629 let trimmed = text.trim();
630 if trimmed.is_empty() || ends_with_page_marker(trimmed) {
631 return false;
632 }
633 if trimmed.ends_with(['.', ';', ':', '?', '!']) {
634 return false;
635 }
636
637 let lower = trimmed.to_ascii_lowercase();
638 if !(lower.starts_with("part ")
639 || lower.starts_with("chapter ")
640 || lower.starts_with("appendix ")
641 || lower.starts_with("section "))
642 {
643 return false;
644 }
645
646 let word_count = trimmed.split_whitespace().count();
647 (2..=16).contains(&word_count) && trimmed.chars().any(char::is_alphabetic)
648}
649
/// Split a paragraph that begins with a figure caption containing a
/// "(credit" attribution into `(caption, body)`.
///
/// Tries every ") " then every ". " boundary in order; accepts the first
/// split where the caption is 10..=80 words and the body is >= 10 words,
/// starts with an uppercase word, and is not itself another caption.
/// Returns `None` when no acceptable boundary exists.
fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
    if !starts_with_caption_prefix(text) || !text.contains("(credit") {
        return None;
    }

    for needle in [") ", ". "] {
        let mut search_start = 0usize;
        while let Some(rel_idx) = text[search_start..].find(needle) {
            // `boundary` is the byte index of the needle's trailing space
            // (ASCII, so slicing is always on a char boundary); the head
            // keeps the punctuation, the tail starts after the space.
            let boundary = search_start + rel_idx + needle.len() - 1;
            let head = text[..=boundary].trim();
            let tail = text[boundary + 1..].trim_start();
            search_start = boundary + 1;
            if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
                continue;
            }
            if tail.split_whitespace().count() < 10 {
                continue;
            }
            if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
                continue;
            }
            return Some((head, tail));
        }
    }

    None
}
677
678fn is_short_caption_label(text: &str) -> bool {
679 if !starts_with_caption_prefix(text) {
680 return false;
681 }
682
683 let trimmed = text.trim();
684 trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24 && !trimmed.ends_with(['.', ':'])
685}
686
/// Given the paragraph that follows a short caption label, split it into the
/// caption's remaining text and the start of the real body paragraph.
///
/// The body is assumed to begin at the first sentence-starter word (" As ",
/// " In ", " The ", ...). Accepts the split when the caption tail is 3..=24
/// words and the body at least 8 words; otherwise returns `None`.
fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
    let trimmed = text.trim();
    if trimmed.is_empty()
        || starts_with_caption_prefix(trimmed)
        || !starts_with_uppercase_word(trimmed)
    {
        return None;
    }

    for starter in [
        " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
    ] {
        if let Some(idx) = text.find(starter) {
            // `idx` is the starter's leading space; the tail begins at the
            // capitalized word itself.
            let head = text[..idx].trim();
            let tail = text[idx + 1..].trim();
            if head.split_whitespace().count() >= 3
                && head.split_whitespace().count() <= 24
                && tail.split_whitespace().count() >= 8
            {
                return Some((head, tail));
            }
        }
    }

    None
}
713
714fn looks_like_caption_tail(text: &str) -> bool {
715 let trimmed = text.trim();
716 if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
717 return false;
718 }
719
720 let word_count = trimmed.split_whitespace().count();
721 if !(3..=18).contains(&word_count) {
722 return false;
723 }
724
725 starts_with_uppercase_word(trimmed)
726 && !starts_with_caption_prefix(trimmed)
727 && !trimmed.contains(':')
728}
729
/// True when `text` trims to exactly four ASCII digits (a bare year line
/// under a caption).
fn looks_like_caption_year(text: &str) -> bool {
    let candidate = text.trim();
    candidate.chars().all(|ch| ch.is_ascii_digit()) && candidate.len() == 4
}
734
/// Join every token in the given table-token rows with single spaces, then
/// repair words the extractor split into fragments.
fn token_rows_text(rows: &[TableTokenRow]) -> String {
    repair_fragmented_words(
        &rows
            .iter()
            .flat_map(|row| row.iter())
            .map(|token| token.base.value.as_str())
            .collect::<Vec<_>>()
            .join(" "),
    )
}
746
/// Render a single non-text element (list, table, formula, caption, ...)
/// straight to Markdown, without the look-ahead merging done in
/// `to_markdown`.
fn render_element(out: &mut String, element: &ContentElement) {
    match element {
        ContentElement::Heading(h) => {
            let text = h.base.base.value();
            let trimmed = text.trim();
            if should_skip_heading_text(trimmed) {
                return;
            }
            out.push_str(&format!("# {}\n\n", trimmed));
        }
        ContentElement::Paragraph(p) => {
            let text = p.base.value();
            let trimmed = clean_paragraph_text(&text);
            if !trimmed.is_empty() {
                out.push_str(&escape_md_line_start(&trimmed));
                // TOC paragraphs stay on adjacent lines; normal paragraphs
                // are followed by a blank line.
                if p.base.semantic_type == SemanticType::TableOfContent {
                    out.push('\n');
                } else {
                    out.push_str("\n\n");
                }
            }
        }
        ContentElement::List(list) => {
            let mut i = 0usize;
            while i < list.list_items.len() {
                let item = &list.list_items[i];
                let label = token_rows_text(&item.label.content);
                let body = token_rows_text(&item.body.content);
                let label_trimmed = label.trim();
                let body_trimmed = body.trim();
                // Prefer "label body", then body alone, then label alone.
                let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
                    format!("{label_trimmed} {body_trimmed}")
                } else if !body_trimmed.is_empty() {
                    body_trimmed.to_string()
                } else {
                    label_trimmed.to_string()
                };
                // Fall back to raw item contents when label/body are empty.
                let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
                    list_item_text_from_contents(&item.contents)
                } else {
                    combined
                };

                // A list item that reads like "Section name:" becomes a
                // heading rather than a bullet.
                if is_list_section_heading(&combined) {
                    out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
                    i += 1;
                    continue;
                }

                if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
                    if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
                        out.push_str(&format!("- {} {}\n", label_trimmed, body_trimmed));
                    } else if !body_trimmed.is_empty() {
                        out.push_str(&format!("- {}\n", body_trimmed));
                    } else {
                        out.push_str(&format!("- {}\n", label_trimmed));
                    }
                } else if !item.contents.is_empty() {
                    let text = list_item_text_from_contents(&item.contents);
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        out.push_str(&format!("- {}\n", trimmed));
                    }
                }
                i += 1;
            }
            out.push('\n');
        }
        ContentElement::Table(table) => {
            render_table(out, table);
        }
        ContentElement::TableBorder(table) => {
            render_table_border(out, table);
        }
        ContentElement::Formula(f) => {
            let latex = f.latex.trim();
            if !latex.is_empty() {
                out.push_str(&format!("$$\n{}\n$$\n\n", latex));
            }
        }
        ContentElement::Caption(c) => {
            let text = c.base.value();
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                out.push_str(&format!("*{}*\n\n", trimmed));
            }
        }
        ContentElement::NumberHeading(nh) => {
            let text = nh.base.base.base.value();
            let trimmed = text.trim();
            if should_skip_heading_text(trimmed) {
                return;
            }
            out.push_str(&format!("# {}\n\n", trimmed));
        }
        ContentElement::Image(_) => {
            // Images carry no extractable text; keep paragraph spacing.
            out.push_str("\n\n");
        }
        ContentElement::HeaderFooter(_) => {
            // Running headers/footers are dropped from the output.
        }
        ContentElement::TextBlock(tb) => {
            let text = tb.value();
            let trimmed = clean_paragraph_text(&text);
            if !trimmed.is_empty() {
                out.push_str(&escape_md_line_start(&trimmed));
                out.push_str("\n\n");
            }
        }
        ContentElement::TextLine(tl) => {
            let text = tl.value();
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                out.push_str(trimmed);
                out.push('\n');
            }
        }
        ContentElement::TextChunk(tc) => {
            out.push_str(&tc.value);
        }
        _ => {}
    }
}
871
/// Backslash-escape a leading '#' or '>' so plain text is not parsed as a
/// Markdown heading or blockquote; all other text is returned unchanged.
fn escape_md_line_start(text: &str) -> String {
    match text.chars().next() {
        Some('#' | '>') => format!("\\{text}"),
        _ => text.to_string(),
    }
}
880
/// True when `text` (ignoring leading whitespace, case-insensitively) starts
/// with a figure/table/credit caption marker such as "Figure ", "Table ",
/// "Source: ", or "Photo credit".
fn starts_with_caption_prefix(text: &str) -> bool {
    // Prefixes ending in a space require a following word; the credit forms
    // match anywhere a credit line starts.
    const CAPTION_PREFIXES: &[&str] = &[
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];

    let lower = text.trim_start().to_ascii_lowercase();
    CAPTION_PREFIXES
        .iter()
        .any(|prefix| lower.starts_with(prefix))
}
908
/// True when the first alphabetic character of `text` is uppercase, looking
/// through leading quotes/brackets ('"', '\'', '(', '['); any other leading
/// character (including digits) makes the answer false.
fn starts_with_uppercase_word(text: &str) -> bool {
    let mut chars = text.trim_start().chars();
    loop {
        match chars.next() {
            Some(ch) if ch.is_alphabetic() => return ch.is_uppercase(),
            // Skip opening punctuation that may wrap the first word.
            Some('"' | '\'' | '(' | '[') => continue,
            _ => return false,
        }
    }
}
920
/// Trim the text and collapse runs of spaces/tabs to a single space.
/// Newlines are preserved as-is (only ' ' and '\t' are collapsed).
fn clean_paragraph_text(text: &str) -> String {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return String::new();
    }

    trimmed
        .chars()
        .fold(String::with_capacity(trimmed.len()), |mut acc, ch| {
            if ch == ' ' || ch == '\t' {
                // Collapse: only push a space if the previous output char
                // wasn't already one.
                if !acc.ends_with(' ') {
                    acc.push(' ');
                }
            } else {
                acc.push(ch);
            }
            acc
        })
}
944
945fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
946 match element {
947 Some(ContentElement::Paragraph(p)) => {
948 let text = clean_paragraph_text(&p.base.value());
949 let trimmed = text.trim();
950 if trimmed.is_empty()
951 || should_render_element_as_heading(element.unwrap(), trimmed, None)
952 {
953 None
954 } else {
955 Some(trimmed.to_string())
956 }
957 }
958 Some(ContentElement::TextBlock(tb)) => {
959 let text = clean_paragraph_text(&tb.value());
960 let trimmed = text.trim();
961 if trimmed.is_empty()
962 || should_render_element_as_heading(element.unwrap(), trimmed, None)
963 {
964 None
965 } else {
966 Some(trimmed.to_string())
967 }
968 }
969 Some(ContentElement::TextLine(tl)) => {
970 let text = clean_paragraph_text(&tl.value());
971 let trimmed = text.trim();
972 if trimmed.is_empty()
973 || should_render_element_as_heading(element.unwrap(), trimmed, None)
974 {
975 None
976 } else {
977 Some(trimmed.to_string())
978 }
979 }
980 _ => None,
981 }
982}
983
/// Decide whether the paragraph-like element at `idx` should be promoted to
/// a Markdown heading.
///
/// Checks in order:
/// 1. top-margin running headers are never headings;
/// 2. the element-level heuristic may promote directly;
/// 3. text smaller than the body font is never promoted;
/// 4. otherwise "rescue" heuristics apply — aggressively when the document
///    has no explicit headings at all, more conservatively (all-caps,
///    numbered, or >= 115% body font size) when explicit headings exist but
///    are sparse (< 10% of elements).
fn should_render_paragraph_as_heading(
    doc: &PdfDocument,
    idx: usize,
    text: &str,
    next: Option<&ContentElement>,
) -> bool {
    if looks_like_top_margin_running_header(doc, idx, text) {
        return false;
    }
    if should_render_element_as_heading(&doc.kids[idx], text, next) {
        return true;
    }

    let body_font_size = compute_body_font_size(doc);
    if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
        return false;
    }

    if !doc_has_explicit_headings(doc) {
        if should_rescue_as_heading(doc, idx, text) {
            return true;
        }
        if should_rescue_allcaps_heading(doc, idx, text) {
            return true;
        }
        if should_rescue_numbered_heading(doc, idx, text) {
            return true;
        }
        return false;
    }
    if heading_density(doc) < 0.10 {
        if should_rescue_allcaps_heading(doc, idx, text) {
            return true;
        }
        if should_rescue_numbered_heading(doc, idx, text) {
            return true;
        }
        // Larger-than-body font is accepted when the text also has heading
        // shape and substantive content follows within 4 elements.
        if body_font_size > 0.0 {
            if let ContentElement::Paragraph(p) = &doc.kids[idx] {
                if let Some(fs) = p.base.font_size {
                    if fs >= 1.15 * body_font_size
                        && is_heading_rescue_candidate(doc, idx, text)
                        && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
                    {
                        return true;
                    }
                }
            }
        }
    }
    false
}
1051
1052fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
1054 doc.kids.iter().any(|e| {
1055 matches!(
1056 e,
1057 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1058 )
1059 })
1060}
1061
1062fn compute_body_font_size(doc: &PdfDocument) -> f64 {
1067 let mut font_sizes: Vec<f64> = doc
1068 .kids
1069 .iter()
1070 .filter_map(|e| {
1071 if let ContentElement::Paragraph(p) = e {
1072 let word_count = p.base.value().split_whitespace().count();
1073 if word_count > 10 {
1074 p.base.font_size
1075 } else {
1076 None
1077 }
1078 } else {
1079 None
1080 }
1081 })
1082 .collect();
1083 if font_sizes.is_empty() {
1084 return 0.0;
1085 }
1086 font_sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1087 font_sizes[font_sizes.len() / 2]
1088}
1089
1090fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
1095 if body_font_size <= 0.0 {
1096 return false;
1097 }
1098 if let ContentElement::Paragraph(p) = &doc_kids[idx] {
1099 if let Some(fs) = p.base.font_size {
1100 return fs < 0.95 * body_font_size;
1101 }
1102 }
1103 false
1104}
1105
1106fn heading_density(doc: &PdfDocument) -> f64 {
1108 let total = doc.kids.len();
1109 if total == 0 {
1110 return 0.0;
1111 }
1112 let heading_count = doc
1113 .kids
1114 .iter()
1115 .filter(|e| {
1116 matches!(
1117 e,
1118 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1119 )
1120 })
1121 .count();
1122 heading_count as f64 / total as f64
1123}
1124
1125fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
1128 is_heading_rescue_candidate(doc, idx, text)
1129 && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
1130}
1131
/// Shape test for promoting a paragraph to a heading in sparse-heading
/// documents: short (<= 6 words, <= 60 bytes), contains letters, first
/// letter uppercase, and not a caption, math/percentage line, fully
/// parenthesized aside, chart label, or bare section number.
fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }

    let has_alpha = trimmed.chars().any(char::is_alphabetic);

    // Sentence-final punctuation signals body text, not a heading.
    if !has_alpha || trimmed.ends_with(['.', '!', '?', ';', ',']) {
        return false;
    }

    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
        return false;
    }

    // Fully parenthesized lines are asides, not headings.
    if trimmed.starts_with('(') && trimmed.ends_with(')') {
        return false;
    }

    if starts_with_caption_prefix(trimmed)
        || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
    {
        return false;
    }

    let word_count = trimmed.split_whitespace().count();
    if word_count > 6 || trimmed.len() > 60 {
        return false;
    }

    // Pure "1.2.3"-style numbers are not headings on their own.
    if trimmed
        .chars()
        .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
    {
        return false;
    }

    // First alphabetic character must be uppercase.
    if let Some(first_alpha) = trimmed.chars().find(|c| c.is_alphabetic()) {
        if first_alpha.is_lowercase() {
            return false;
        }
    }

    true
}
1188
1189fn has_substantive_follow_up(
1193 doc: &PdfDocument,
1194 idx: usize,
1195 word_count: usize,
1196 max_lookahead: usize,
1197) -> bool {
1198 for offset in 1..=max_lookahead {
1199 let lookahead_idx = idx + offset;
1200 if lookahead_idx >= doc.kids.len() {
1201 break;
1202 }
1203 let look_elem = &doc.kids[lookahead_idx];
1204 match look_elem {
1205 ContentElement::Paragraph(p) => {
1206 let next_text = p.base.value();
1207 let nw = next_text.split_whitespace().count();
1208 if nw >= word_count * 3 || nw > 15 {
1209 return true;
1210 }
1211 }
1212 ContentElement::TextBlock(tb) => {
1213 let next_text = tb.value();
1214 let nw = next_text.split_whitespace().count();
1215 if nw >= word_count * 3 || nw > 15 {
1216 return true;
1217 }
1218 }
1219 ContentElement::TextLine(tl) => {
1220 let next_text = tl.value();
1221 let nw = next_text.split_whitespace().count();
1222 if nw >= word_count * 3 || nw > 15 {
1223 return true;
1224 }
1225 }
1226 ContentElement::List(_)
1227 | ContentElement::Table(_)
1228 | ContentElement::TableBorder(_)
1229 | ContentElement::Image(_)
1230 | ContentElement::Figure(_) => {
1231 return true;
1232 }
1233 _ => continue,
1234 }
1235 }
1236
1237 false
1238}
1239
/// Promote "1.2 Title"-style lines to headings when real content follows.
///
/// Rejects lines over 100 bytes, lines without a numbered-section shape,
/// lines ending in sentence punctuation (a trailing '.' is allowed only for
/// keyword sections such as "Chapter 1."), and math/percentage lines. Then
/// requires, within the next 3 elements, either a text element of more than
/// 10 words or any list/table/image/figure.
fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() || trimmed.len() > 100 {
        return false;
    }

    if !looks_like_numbered_section(trimmed) {
        return false;
    }

    if trimmed.ends_with(['!', '?', ';', ',']) {
        return false;
    }
    if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
        return false;
    }
    if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
        return false;
    }

    // Look ahead up to 3 elements for substantive content.
    for offset in 1..=3 {
        let lookahead_idx = idx + offset;
        if lookahead_idx >= doc.kids.len() {
            break;
        }
        match &doc.kids[lookahead_idx] {
            ContentElement::Paragraph(p) => {
                let nw = p.base.value().split_whitespace().count();
                if nw > 10 {
                    return true;
                }
            }
            ContentElement::TextBlock(tb) => {
                let nw = tb.value().split_whitespace().count();
                if nw > 10 {
                    return true;
                }
            }
            ContentElement::TextLine(tl) => {
                let nw = tl.value().split_whitespace().count();
                if nw > 10 {
                    return true;
                }
            }
            ContentElement::List(_)
            | ContentElement::Table(_)
            | ContentElement::TableBorder(_)
            | ContentElement::Image(_)
            | ContentElement::Figure(_) => {
                return true;
            }
            _ => continue,
        }
    }

    false
}
1306
1307fn looks_like_numbered_section(text: &str) -> bool {
1310 let bytes = text.as_bytes();
1311 if bytes.is_empty() {
1312 return false;
1313 }
1314
1315 let mut idx = 0;
1317 if bytes[0].is_ascii_digit() {
1318 while idx < bytes.len() && bytes[idx].is_ascii_digit() {
1319 idx += 1;
1320 }
1321 if idx >= bytes.len() {
1322 return false;
1323 }
1324 while idx < bytes.len() && bytes[idx] == b'.' {
1326 idx += 1;
1327 let start = idx;
1328 while idx < bytes.len() && bytes[idx].is_ascii_digit() {
1329 idx += 1;
1330 }
1331 if idx == start {
1332 break;
1334 }
1335 }
1336 if idx >= bytes.len() {
1338 return false;
1339 }
1340 if bytes[idx] == b' ' || bytes[idx] == b'\t' {
1342 idx += 1;
1343 if idx < bytes.len() && bytes[idx] == b'-' {
1345 idx += 1;
1346 if idx < bytes.len() && bytes[idx] == b' ' {
1347 idx += 1;
1348 }
1349 }
1350 } else if bytes[idx] == b'-' {
1351 idx += 1;
1352 if idx < bytes.len() && bytes[idx] == b' ' {
1353 idx += 1;
1354 }
1355 } else {
1356 return false;
1357 }
1358 let rest = &text[idx..].trim();
1360 if rest.is_empty() {
1361 return false;
1362 }
1363 if let Some(c) = rest.chars().find(|c| c.is_alphabetic()) {
1365 return c.is_uppercase();
1366 }
1367 return false;
1368 }
1369
1370 if looks_like_keyword_numbered_section(text) {
1372 return true;
1373 }
1374
1375 false
1376}
1377
/// Keywords that commonly introduce a numbered section heading (e.g.
/// "Chapter 3", "Part II", "Exercise 4"). Matched case-insensitively against
/// the first word of a candidate heading by
/// `looks_like_keyword_numbered_section`; keep the list alphabetized.
const SECTION_KEYWORDS: &[&str] = &[
    "activity",
    "appendix",
    "case",
    "chapter",
    "exercise",
    "experiment",
    "lab",
    "lesson",
    "module",
    "part",
    "phase",
    "problem",
    "question",
    "section",
    "stage",
    "step",
    "task",
    "topic",
    "unit",
];
1400
1401fn looks_like_keyword_numbered_section(text: &str) -> bool {
1403 let trimmed = text.trim();
1404 let space_pos = match trimmed.find(' ') {
1406 Some(p) => p,
1407 None => return false,
1408 };
1409 let keyword = &trimmed[..space_pos];
1410 if !SECTION_KEYWORDS
1411 .iter()
1412 .any(|k| keyword.eq_ignore_ascii_case(k))
1413 {
1414 return false;
1415 }
1416 let rest = trimmed[space_pos + 1..].trim_start();
1418 if rest.is_empty() {
1419 return false;
1420 }
1421 let rest = rest.strip_prefix('#').unwrap_or(rest);
1422 let first_char = rest.chars().next().unwrap_or(' ');
1424 if !first_char.is_ascii_digit() && !matches!(first_char, 'I' | 'V' | 'X' | 'L') {
1425 return false;
1426 }
1427 true
1428}
1429
1430fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
1433 let trimmed = text.trim();
1434 if trimmed.is_empty() {
1435 return false;
1436 }
1437
1438 let word_count = trimmed.split_whitespace().count();
1439
1440 if word_count > 8 || trimmed.len() > 80 {
1442 return false;
1443 }
1444
1445 let alpha_chars: Vec<char> = trimmed.chars().filter(|c| c.is_alphabetic()).collect();
1447 if alpha_chars.len() < 2 || !alpha_chars.iter().all(|c| c.is_uppercase()) {
1448 return false;
1449 }
1450
1451 if trimmed.ends_with(['.', ';', ',']) {
1453 return false;
1454 }
1455
1456 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
1458 return false;
1459 }
1460
1461 if starts_with_caption_prefix(trimmed) {
1463 return false;
1464 }
1465
1466 if trimmed
1468 .chars()
1469 .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
1470 {
1471 return false;
1472 }
1473
1474 for offset in 1..=4 {
1477 let lookahead_idx = idx + offset;
1478 if lookahead_idx >= doc.kids.len() {
1479 break;
1480 }
1481 let look_elem = &doc.kids[lookahead_idx];
1482 match look_elem {
1483 ContentElement::Paragraph(p) => {
1484 let nw = p.base.value().split_whitespace().count();
1485 if nw > 6 {
1486 return true;
1487 }
1488 }
1489 ContentElement::TextBlock(tb) => {
1490 let nw = tb.value().split_whitespace().count();
1491 if nw > 6 {
1492 return true;
1493 }
1494 }
1495 ContentElement::TextLine(tl) => {
1496 let nw = tl.value().split_whitespace().count();
1497 if nw > 6 {
1498 return true;
1499 }
1500 }
1501 ContentElement::List(_)
1502 | ContentElement::Table(_)
1503 | ContentElement::TableBorder(_)
1504 | ContentElement::Image(_)
1505 | ContentElement::Figure(_) => {
1506 return true;
1507 }
1508 _ => continue,
1509 }
1510 }
1511
1512 false
1513}
1514
1515fn should_render_element_as_heading(
1516 element: &ContentElement,
1517 text: &str,
1518 next: Option<&ContentElement>,
1519) -> bool {
1520 let trimmed = text.trim();
1521 if trimmed.is_empty() {
1522 return false;
1523 }
1524
1525 let lower = trimmed.to_ascii_lowercase();
1526 if matches!(lower.as_str(), "contents" | "table of contents")
1527 && trimmed.starts_with(|c: char| c.is_uppercase())
1528 {
1529 return true;
1530 }
1531
1532 let word_count = trimmed.split_whitespace().count();
1533 let has_alpha = trimmed.chars().any(char::is_alphabetic);
1534 let title_like = has_alpha
1535 && word_count <= 4
1536 && trimmed.len() <= 40
1537 && !trimmed.ends_with(['.', '!', '?', ';', ':']);
1538
1539 let is_attribution = {
1543 let lower = trimmed.to_ascii_lowercase();
1544 lower.starts_with("source:")
1545 || lower.starts_with("credit:")
1546 || lower.starts_with("photo by ")
1547 || lower.starts_with("photo credit")
1548 || lower.starts_with("image by ")
1549 || lower.starts_with("image credit")
1550 };
1551
1552 title_like
1553 && matches!(next, Some(ContentElement::List(_)))
1554 && !looks_like_chart_label_heading(element, trimmed)
1555 && !is_attribution
1556}
1557
/// Heuristic: is the element at `idx` a running header — a short line at the
/// very top of its page whose text repeats near the top of a *different* page?
///
/// The element qualifies only if it is short (≤6 words, box ≤24 units tall),
/// sits within 24 units of the topmost content on its page, and an equally
/// positioned element on another page has the same text (case-insensitive).
/// NOTE(review): the max-of-`top_y` arithmetic assumes a y-up coordinate
/// system — confirm against `BoundingBox`.
fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
        return false;
    }

    let element = &doc.kids[idx];
    let bbox = element.bbox();
    if bbox.height() > 24.0 {
        return false;
    }

    let Some(page) = element.page_number() else {
        return false;
    };

    // Highest top_y of any element, per page.
    let mut page_tops = std::collections::HashMap::<u32, f64>::new();
    for candidate in &doc.kids {
        if let Some(p) = candidate.page_number() {
            let top = page_tops.entry(p).or_insert(f64::MIN);
            *top = top.max(candidate.bbox().top_y);
        }
    }

    // Must sit within 24 units of its own page's topmost content.
    let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
    if bbox.top_y < page_top - 24.0 {
        return false;
    }

    // Look for the same text near the top of any other page.
    let trimmed_lower = trimmed.to_lowercase();
    for other_elem in &doc.kids {
        let Some(other_page) = other_elem.page_number() else {
            continue;
        };
        if other_page == page {
            continue;
        }
        let other_bbox = other_elem.bbox();
        if other_bbox.height() > 24.0 {
            continue;
        }
        // The candidate must also be in its own page's top margin.
        let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
        if other_bbox.top_y < other_top - 24.0 {
            continue;
        }
        let other_text = match other_elem {
            ContentElement::Paragraph(p) => p.base.value(),
            ContentElement::TextBlock(tb) => tb.value(),
            ContentElement::TextLine(tl) => tl.value(),
            ContentElement::Heading(h) => h.base.base.value(),
            _ => continue,
        };
        if other_text.trim().to_lowercase() == trimmed_lower {
            return true;
        }
    }

    false
}
1621
1622fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
1623 let trimmed = text.trim();
1624 let upper_words = trimmed
1625 .split_whitespace()
1626 .filter(|word| word.chars().any(char::is_alphabetic))
1627 .all(|word| {
1628 word.chars()
1629 .filter(|ch| ch.is_alphabetic())
1630 .all(|ch| ch.is_uppercase())
1631 });
1632
1633 (trimmed.contains('%') || upper_words) && element.bbox().height() <= 40.0
1634}
1635
1636fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
1637 let next_trimmed = next.trim();
1638 if !next_trimmed.chars().next().is_some_and(char::is_lowercase) {
1639 return false;
1640 }
1641
1642 let normalized = normalize_heading_text(text);
1643 if matches!(
1644 normalized.as_str(),
1645 "contents" | "tableofcontents" | "introduction" | "conclusion"
1646 ) {
1647 return false;
1648 }
1649
1650 let words: Vec<&str> = text.split_whitespace().collect();
1651 if words.len() < 3 {
1652 return false;
1653 }
1654
1655 words
1656 .last()
1657 .is_some_and(|word| is_sentence_fragment_tail(word))
1658}
1659
/// True when `word` (ignoring surrounding punctuation, case-insensitively) is
/// a connective that no heading would end with — a sign the "heading" is a
/// wrapped sentence fragment.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const TAIL_WORDS: &[&str] = &[
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or",
        "that", "the", "to", "with",
    ];
    let cleaned = word
        .trim_matches(|c: char| !c.is_alphanumeric())
        .to_ascii_lowercase();
    TAIL_WORDS.contains(&cleaned.as_str())
}
1683
/// True for a short colon-terminated phrase that introduces a list, e.g.
/// "Key benefits:". Lines that start with a digit or a bullet glyph are list
/// items themselves, not section intros.
fn is_list_section_heading(text: &str) -> bool {
    let trimmed = text.trim();
    if !trimmed.ends_with(':') || trimmed.len() > 80 {
        return false;
    }
    if trimmed.split_whitespace().count() > 8 || !trimmed.chars().any(char::is_alphabetic) {
        return false;
    }
    !trimmed
        .chars()
        .next()
        .is_some_and(|c| c.is_ascii_digit() || "•‣◦●○◆◇▪▫–—-".contains(c))
}
1693
1694fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
1695 let next_trimmed = next.trim();
1696 if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
1697 return false;
1698 }
1699
1700 if prev.ends_with('-')
1701 && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
1702 && next_trimmed.chars().next().is_some_and(char::is_lowercase)
1703 {
1704 return true;
1705 }
1706
1707 if next_trimmed.chars().next().is_some_and(char::is_lowercase) {
1708 return true;
1709 }
1710
1711 let lower = next_trimmed.to_ascii_lowercase();
1712 if lower.starts_with("http://")
1713 || lower.starts_with("https://")
1714 || lower.starts_with("arxiv")
1715 || lower.starts_with("doi:")
1716 {
1717 return true;
1718 }
1719
1720 if matches!(
1721 next_trimmed.split_whitespace().next(),
1722 Some("In" | "Proceedings" | "Advances" | "Learning")
1723 ) {
1724 return true;
1725 }
1726
1727 !prev.ends_with(['.', '!', '?', ':'])
1728}
1729
/// Decide whether two adjacent semantic paragraphs should merge. The sole
/// signal is that `next` begins mid-sentence (lowercase first character);
/// this also covers hyphenated line breaks, since the continuation of a split
/// word starts lowercase too.
///
/// The previous explicit hyphen branch was dead code — it additionally
/// required the lowercase start, which returns `true` by itself. `prev` is
/// kept in the signature for call-site compatibility.
fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
    let _ = prev; // retained for interface compatibility; see doc comment
    next.trim().chars().next().is_some_and(char::is_lowercase)
}
1745
/// Append `next` onto `target`, re-joining a word that was split by
/// end-of-line hyphenation, otherwise separating with a single space.
fn merge_paragraph_text(target: &mut String, next: &str) {
    let fragment = next.trim();
    let starts_lower = fragment.chars().next().is_some_and(char::is_lowercase);
    let hyphen_break = target.ends_with('-')
        && target
            .chars()
            .rev()
            .nth(1)
            .is_some_and(|c| c.is_alphabetic());

    if hyphen_break && starts_lower {
        // Drop the soft hyphen and glue the word halves back together.
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(fragment);
}
1765
/// True when the trimmed text is nothing but 1–4 ASCII digits — a bare page
/// number.
fn is_standalone_page_number(text: &str) -> bool {
    let digits = text.trim();
    (1..=4).contains(&digits.len()) && digits.bytes().all(|b| b.is_ascii_digit())
}
1770
/// Heuristic: is this element a bare page number sitting in the top or bottom
/// margin of its page?
///
/// `page_top`/`page_bottom` are the extreme `top_y`/`bottom_y` over every
/// element on the same page; the element qualifies when its own box lies
/// within 24 units of either extreme. NOTE(review): the max-of-top /
/// min-of-bottom arithmetic assumes a y-up coordinate system — confirm
/// against `BoundingBox`.
fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
    if !is_standalone_page_number(text) {
        return false;
    }

    // Margin artifacts are short; tall boxes are real content.
    let bbox = element.bbox();
    if bbox.height() > 24.0 {
        return false;
    }

    let Some(page) = element.page_number() else {
        return false;
    };

    // Vertical extent of all content on this page.
    let mut page_top = f64::MIN;
    let mut page_bottom = f64::MAX;
    for candidate in &doc.kids {
        if candidate.page_number() == Some(page) {
            let candidate_bbox = candidate.bbox();
            page_top = page_top.max(candidate_bbox.top_y);
            page_bottom = page_bottom.min(candidate_bbox.bottom_y);
        }
    }

    // Extents stay infinite only if nothing on the page contributed
    // (shouldn't happen when `element` itself is in `doc.kids`).
    if !page_top.is_finite() || !page_bottom.is_finite() {
        return false;
    }

    // Within 24 units of the topmost or bottommost content edge.
    bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
}
1801
1802fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
1807 let element = &doc.kids[idx];
1808 let bbox = element.bbox();
1809 if bbox.height() > 30.0 {
1810 return false;
1811 }
1812
1813 let Some(page) = element.page_number() else {
1814 return false;
1815 };
1816
1817 let mut page_bottom = f64::MAX;
1818 for candidate in &doc.kids {
1819 if candidate.page_number() == Some(page) {
1820 page_bottom = page_bottom.min(candidate.bbox().bottom_y);
1821 }
1822 }
1823
1824 if !page_bottom.is_finite() {
1825 return false;
1826 }
1827
1828 bbox.bottom_y <= page_bottom + 24.0
1830}
1831
1832fn should_demote_period_heading(text: &str) -> bool {
1836 let trimmed = text.trim();
1837 if !trimmed.ends_with('.') {
1838 return false;
1839 }
1840 if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
1843 return false;
1844 }
1845 let without_dot = trimmed.trim_end_matches('.');
1849 let word_count = without_dot.split_whitespace().count();
1850 if word_count <= 2 {
1853 return true;
1854 }
1855 false
1856}
1857
/// A heading ending in a comma is a wrapped sentence fragment — demote it.
fn should_demote_comma_heading(text: &str) -> bool {
    text.trim_end().ends_with(',')
}
1863
/// Demote any "heading" containing characters typical of inline formulas or
/// of symbol-font extraction fallout ('þ'/'ð' often stand in for +/− glyphs).
fn should_demote_math_heading(text: &str) -> bool {
    const MATH_MARKERS: &[char] = &[
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];
    text.contains(MATH_MARKERS)
}
1887
/// Demote any "heading" containing a percent sign — almost always a chart
/// label, not a section title.
fn should_demote_percentage_heading(text: &str) -> bool {
    text.as_bytes().contains(&b'%')
}
1893
/// Demote reference-list artifacts of the form "NNNN. …" — four digits
/// followed by a period and either a space or end of string (e.g.
/// "1984. Orwell, G." or a bare "1984.").
///
/// Fix: the old guard rejected `t.len() < 6`, which made its own
/// `t.len() == 5` arm (the bare "NNNN." case) unreachable; the minimum
/// length is now 5 and the byte after the period is checked safely.
fn should_demote_bibliography_heading(text: &str) -> bool {
    let t = text.trim();
    let bytes = t.as_bytes();
    if bytes.len() < 5 {
        return false;
    }
    bytes[..4].iter().all(|b| b.is_ascii_digit())
        && bytes[4] == b'.'
        && matches!(bytes.get(5), None | Some(&b' '))
}
1906
/// Remove a trailing page number from a TOC-style line ("Heading title 23" →
/// "Heading title"). The title part must have at least three words so short
/// labels like "Chapter 4" survive intact.
fn strip_trailing_page_number(text: &str) -> &str {
    let trimmed = text.trim();
    if let Some((title, page)) = trimmed.rsplit_once(' ') {
        let page_like = !page.is_empty()
            && page.len() <= 4
            && page.bytes().all(|b| b.is_ascii_digit());
        if page_like && title.split_whitespace().count() >= 3 {
            return title.trim();
        }
    }
    trimmed
}
1925
/// Find the byte offset where a second subsection label ("4.2 …" or "A.3 …")
/// was fused onto the end of a heading, so the caller can split the two.
/// Returns the index of the character right after a space that begins the
/// fused label, or `None` if no such pattern is found.
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    let bytes = text.as_bytes();
    // Start at 3: the first label needs at least "N. " before a split point.
    for i in 3..bytes.len() {
        if bytes[i - 1] != b' ' {
            continue;
        }
        // Digit form: "… 4.2 …" — a digit followed (eventually) by ".digit".
        if bytes[i].is_ascii_digit() {
            if let Some(dot_offset) = text[i..].find('.') {
                let after_dot = i + dot_offset + 1;
                if bytes.get(after_dot).is_some_and(|b| b.is_ascii_digit()) {
                    return Some(i);
                }
            }
        }
        // Letter form: "… A.3 …" — uppercase letter, dot, digit.
        if bytes[i].is_ascii_uppercase()
            && bytes.get(i + 1) == Some(&b'.')
            && bytes.get(i + 2).is_some_and(|b| b.is_ascii_digit())
        {
            return Some(i);
        }
    }
    None
}
1961
1962fn should_skip_heading_text(text: &str) -> bool {
1963 let trimmed = text.trim();
1964 if trimmed.is_empty() || is_standalone_page_number(trimmed) {
1965 return true;
1966 }
1967
1968 let lower = trimmed.to_ascii_lowercase();
1969 if (lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit()))
1970 && trimmed.contains('|')
1971 {
1972 return true;
1973 }
1974
1975 let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
1976 let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
1977 alpha_count == 0 || (alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':'))
1978}
1979
/// Re-join words that the PDF extractor split mid-word ("Jurisdic tion" →
/// "Jurisdiction"). Two adjacent all-alphabetic tokens are glued when one of
/// them is short, their combined length is word-like, the second half starts
/// lowercase, and neither is a common stopword.
///
/// Fix: the old loop cloned both `String`s on every pair just to satisfy the
/// borrow checker; the join decision now runs on scoped borrows and only the
/// actual merge mutates `parts`.
fn repair_fragmented_words(text: &str) -> String {
    // Short words that legitimately stand alone and must never be glued.
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];

    let mut parts: Vec<String> = text.split_whitespace().map(str::to_string).collect();
    if parts.len() < 2 {
        return text.to_string();
    }

    let mut i = 0usize;
    while i + 1 < parts.len() {
        // Decide via borrows only; mutation happens after the borrows end.
        let should_join = {
            let left = parts[i].trim_matches(|c: char| !c.is_alphabetic());
            let right = parts[i + 1].trim_matches(|c: char| !c.is_alphabetic());
            !left.is_empty()
                && !right.is_empty()
                && left.chars().all(char::is_alphabetic)
                && right.chars().all(char::is_alphabetic)
                && (left.len() <= 4 || right.len() <= 4)
                && left.len() + right.len() >= 6
                && !right.chars().next().is_some_and(char::is_uppercase)
                && !STOPWORDS.iter().any(|s| left.eq_ignore_ascii_case(s))
                && !STOPWORDS.iter().any(|s| right.eq_ignore_ascii_case(s))
        };

        if should_join {
            let next = parts.remove(i + 1);
            parts[i].push_str(&next);
            // Stay at `i`: the merged token may absorb the next fragment too.
        } else {
            i += 1;
        }
    }

    parts.join(" ")
}
2020
2021fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
2023 let mut text = String::new();
2024 for elem in contents {
2025 let part = match elem {
2026 ContentElement::Paragraph(p) => p.base.value(),
2027 ContentElement::TextBlock(tb) => tb.value(),
2028 ContentElement::TextLine(tl) => tl.value(),
2029 ContentElement::TextChunk(tc) => tc.value.clone(),
2030 _ => String::new(),
2031 };
2032 if !text.is_empty() && !part.is_empty() {
2033 text.push(' ');
2034 }
2035 text.push_str(&part);
2036 }
2037 text
2038}
2039
/// Fold wrapped header fragments back into the first table row.
///
/// A continuation row has an empty first cell and only short cells; every
/// consecutive continuation row after row 0 is merged cell-by-cell into
/// row 0 and then removed.
fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
    // Need a real header row (non-empty first cell) plus at least one more row.
    if rows.len() < 2 || rows[0].first().map_or(true, |c| c.trim().is_empty()) {
        return;
    }

    // Count consecutive continuation rows directly after the header.
    let mut merge_count = 0usize;
    for (i, row) in rows.iter().enumerate().skip(1) {
        let is_continuation = row.first().map_or(true, |c| c.trim().is_empty())
            && row.iter().all(|c| c.trim().is_empty() || c.trim().len() <= 30);
        if !is_continuation {
            break;
        }
        merge_count = i;
    }
    if merge_count == 0 {
        return;
    }

    // Merge each continuation row's cells into the header row.
    for i in 1..=merge_count {
        let continuation = std::mem::take(&mut rows[i]);
        let ncols = rows[0].len().min(continuation.len());
        for (cell, fragment) in rows[0].iter_mut().take(ncols).zip(continuation) {
            let fragment = fragment.trim();
            if fragment.is_empty() {
                continue;
            }
            let merged = {
                let existing = cell.trim();
                if existing.is_empty() {
                    fragment.to_string()
                } else {
                    format!("{} {}", existing, fragment)
                }
            };
            *cell = merged;
        }
    }

    rows.drain(1..=merge_count);
}
2105
/// Render a semantic table by delegating to its underlying bordered-table
/// representation.
fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
    render_table_border(out, &table.table_border);
}
2111
2112fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
2114 let num_cols = table.num_columns.max(1);
2115 let mut rendered_rows: Vec<Vec<String>> = Vec::new();
2116 for row in &table.rows {
2117 let cell_texts: Vec<String> = (0..num_cols)
2118 .map(|col| {
2119 row.cells
2120 .iter()
2121 .find(|c| c.col_number == col)
2122 .map(cell_text_content)
2123 .unwrap_or_default()
2124 })
2125 .collect();
2126 if !cell_texts.iter().all(|t| t.trim().is_empty()) {
2127 rendered_rows.push(cell_texts);
2128 }
2129 }
2130 rendered_rows
2131}
2132
2133fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
2139 if table.rows.is_empty() {
2140 return;
2141 }
2142
2143 let num_cols = table.num_columns.max(1);
2144
2145 let mut rendered_rows = collect_table_border_rows(table);
2147
2148 if rendered_rows.is_empty() {
2149 return;
2150 }
2151
2152 merge_continuation_rows(&mut rendered_rows);
2154
2155 if is_toc_table(&rendered_rows) {
2157 render_toc_rows(out, &rendered_rows);
2158 return;
2159 }
2160
2161 for (row_idx, cell_texts) in rendered_rows.iter().enumerate() {
2162 out.push('|');
2163 for cell_text in cell_texts {
2164 out.push_str(&format!(" {} |", cell_text.trim()));
2165 }
2166 out.push('\n');
2167
2168 if row_idx == 0 {
2170 out.push('|');
2171 for _ in 0..num_cols {
2172 out.push_str(" --- |");
2173 }
2174 out.push('\n');
2175 }
2176 }
2177 out.push('\n');
2178}
2179
/// True when the text could be a page reference: up to five ASCII digits, or
/// a short Roman numeral (front-matter pages), case-insensitively.
fn is_page_number_like(text: &str) -> bool {
    let t = text.trim();
    if t.is_empty() {
        return false;
    }
    if t.len() <= 5 && t.bytes().all(|b| b.is_ascii_digit()) {
        return true;
    }
    t.len() <= 10
        && t.chars()
            .all(|c| "ivxlcdm".contains(c.to_ascii_lowercase()))
}
2197
2198fn is_toc_table(rows: &[Vec<String>]) -> bool {
2201 if rows.is_empty() {
2202 return false;
2203 }
2204 if rows.len() < 2 {
2206 return false;
2207 }
2208 if !rows.iter().all(|r| r.len() == 2) {
2210 return false;
2211 }
2212
2213 let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
2214 if non_empty_right < 2 {
2215 return false;
2216 }
2217
2218 let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
2219 page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
2220}
2221
/// Render TOC rows as plain "Title Page" lines (no pipe-table markup),
/// skipping rows where both cells are empty.
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let title = row[0].trim();
        let page = row[1].trim();
        if title.is_empty() && page.is_empty() {
            continue;
        }
        out.push_str(title);
        // Separate with a space only when both halves are present.
        if !title.is_empty() && !page.is_empty() {
            out.push(' ');
        }
        out.push_str(page);
        out.push('\n');
    }
    out.push('\n');
}
2242
/// Extract the plain text of a bordered-table cell.
///
/// Prefers the token-level `content` (joined via
/// `TextLine::concatenate_chunks`); otherwise falls back to concatenating any
/// textual child elements in order and repairing intra-word splits.
fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
    if !cell.content.is_empty() {
        let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
        return crate::models::text::TextLine::concatenate_chunks(&chunks);
    }
    // Fallback: flatten nested content elements; non-textual kids contribute
    // nothing.
    let mut text = String::new();
    for elem in &cell.contents {
        match elem {
            ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
            ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
            ContentElement::TextLine(tl) => text.push_str(&tl.value()),
            ContentElement::TextChunk(tc) => text.push_str(&tc.value),
            _ => {}
        }
    }
    repair_fragmented_words(&text)
}
2265
/// Post-process rendered Markdown: merge consecutive pipe tables that were
/// split across pages into one table.
///
/// Two adjacent tables merge when the gap between them is all blank lines, at
/// most two short heading lines, or a single short text fragment. Merged
/// followers lose their separator row; every row in a merged group is padded
/// to the group's widest column count; absorbed gap text becomes a one-cell
/// row.
fn merge_adjacent_pipe_tables(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    // A table needs header + separator; two tables need at least four lines.
    if lines.len() < 4 {
        return markdown.to_string();
    }

    // Number of cells in a pipe row ("| a | b |" -> 2); 0 if not a pipe row.
    fn count_pipe_cols(line: &str) -> usize {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return 0;
        }
        t.split('|').count().saturating_sub(2)
    }

    // A Markdown alignment row: every cell is only '-' and ':' characters.
    fn is_separator(line: &str) -> bool {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return false;
        }
        let cells: Vec<&str> = t.split('|').collect();
        if cells.len() < 3 {
            return false;
        }
        cells[1..cells.len() - 1].iter().all(|c| {
            let s = c.trim();
            !s.is_empty() && s.chars().all(|ch| ch == '-' || ch == ':')
        })
    }

    fn is_pipe_row(line: &str) -> bool {
        let t = line.trim();
        t.starts_with('|') && t.ends_with('|') && t.len() > 2
    }

    // Append empty trailing cells until the row has `target_cols` columns.
    fn pad_pipe_row(line: &str, target_cols: usize) -> String {
        let t = line.trim();
        let current_cols = count_pipe_cols(t);
        if current_cols >= target_cols {
            return t.to_string();
        }
        let mut result = t.to_string();
        for _ in current_cols..target_cols {
            result.push_str(" |");
        }
        result
    }

    // One detected pipe table: header line, separator line, last body line,
    // and header column count.
    struct Block {
        start: usize,
        sep: usize,
        end: usize, cols: usize,
    }

    // Pass 1: locate every pipe-table block (header immediately followed by a
    // separator, then a run of non-separator pipe rows).
    let mut blocks: Vec<Block> = Vec::new();
    let mut i = 0;
    while i < lines.len() {
        if i + 1 < lines.len() && is_pipe_row(lines[i]) && is_separator(lines[i + 1]) {
            let cols = count_pipe_cols(lines[i]);
            let sep = i + 1;
            let mut end = sep;
            let mut j = sep + 1;
            while j < lines.len() && is_pipe_row(lines[j]) && !is_separator(lines[j]) {
                end = j;
                j += 1;
            }
            blocks.push(Block {
                start: i,
                sep,
                end,
                cols,
            });
            i = end + 1;
        } else {
            i += 1;
        }
    }

    if blocks.len() < 2 {
        return markdown.to_string();
    }

    // Pass 2: decide which blocks merge into the preceding one.
    // merge_leader[bi] = Some(leader) links block bi into the chain headed by
    // `leader`; group_cols[leader] tracks the widest column count in a chain.
    let mut merge_leader: Vec<Option<usize>> = vec![None; blocks.len()];
    let mut group_cols: Vec<usize> = blocks.iter().map(|b| b.cols).collect();
    for bi in 1..blocks.len() {
        let prev = &blocks[bi - 1];
        let curr = &blocks[bi];
        let gap_range = prev.end + 1..curr.start;
        let gap_all_blank = gap_range.clone().all(|li| lines[li].trim().is_empty());
        // Follow the chain so a third table joins the first table's group.
        let leader_idx = merge_leader[bi - 1].unwrap_or(bi - 1);
        let effective_prev_cols = group_cols[leader_idx];
        // Gap of one or two short '#' heading lines between two wide tables.
        let gap_heading_only = if !gap_all_blank && effective_prev_cols >= 2 && curr.cols >= 2 {
            let non_blank: Vec<usize> = gap_range
                .clone()
                .filter(|li| !lines[*li].trim().is_empty())
                .collect();
            !non_blank.is_empty()
                && non_blank.len() <= 2
                && non_blank.iter().all(|li| {
                    let t = lines[*li].trim();
                    t.starts_with('#') && t.len() < 100
                })
        } else {
            false
        };
        // Gap of exactly one short plain-text fragment (not a heading, list,
        // label, or "TABLE" caption).
        let gap_short_fragment =
            if !gap_all_blank && !gap_heading_only && effective_prev_cols >= 2 && curr.cols >= 2 {
                let non_blank: Vec<usize> = gap_range
                    .clone()
                    .filter(|li| !lines[*li].trim().is_empty())
                    .collect();
                non_blank.len() == 1 && {
                    let t = lines[non_blank[0]].trim();
                    t.len() < 30
                        && !t.starts_with('#')
                        && !t.starts_with('-')
                        && !t.starts_with('*')
                        && !t.contains(':')
                        && !t.contains("TABLE")
                }
            } else {
                false
            };
        if (gap_all_blank || gap_heading_only || gap_short_fragment)
            && prev.cols > 0
            && curr.cols > 0
        {
            merge_leader[bi] = Some(leader_idx);
            if curr.cols > group_cols[leader_idx] {
                group_cols[leader_idx] = curr.cols;
            }
        }
    }

    // Every block pads to its group's widest column count.
    let mut pad_target: Vec<usize> = vec![0; blocks.len()];
    for bi in 0..blocks.len() {
        let leader = merge_leader[bi].unwrap_or(bi);
        pad_target[bi] = group_cols[leader];
    }

    // Pass 3: mark gap lines — blanks are dropped, text is converted into a
    // one-cell table row; a merged follower's separator row is dropped too.
    let mut skip = vec![false; lines.len()];
    let mut convert_to_pipe_row = vec![false; lines.len()];
    for (bi, leader) in merge_leader.iter().enumerate() {
        if leader.is_none() {
            continue;
        }
        let prev_end = blocks[bi - 1].end;
        let curr = &blocks[bi];
        for li in (prev_end + 1)..curr.start {
            if lines[li].trim().is_empty() {
                skip[li] = true;
            } else {
                convert_to_pipe_row[li] = true;
            }
        }
        skip[curr.sep] = true;
    }

    // Map each line to the block it belongs to (converted gap lines attach to
    // the preceding block so they pick up its pad target).
    let mut line_to_block: Vec<Option<usize>> = vec![None; lines.len()];
    for (bi, block) in blocks.iter().enumerate() {
        line_to_block[block.start..=block.end].fill(Some(bi));
    }
    for (bi, leader) in merge_leader.iter().enumerate() {
        if leader.is_none() {
            continue;
        }
        let prev_end = blocks[bi - 1].end;
        let curr = &blocks[bi];
        for li in (prev_end + 1)..curr.start {
            if convert_to_pipe_row[li] {
                line_to_block[li] = Some(bi - 1);
            }
        }
    }

    // Pass 4: emit the rewritten document.
    let mut result = String::new();
    for (li, line) in lines.iter().enumerate() {
        if skip[li] {
            continue;
        }
        if convert_to_pipe_row[li] {
            // Absorbed gap text becomes "| text | | … |" at the group width.
            let text = line.trim().trim_start_matches('#').trim();
            if let Some(bi) = line_to_block[li] {
                let target = pad_target[bi];
                if target > 0 && !text.is_empty() {
                    result.push_str(&format!("| {} ", text));
                    for _ in 1..target {
                        result.push_str("| ");
                    }
                    result.push_str("|\n");
                    continue;
                }
            }
            result.push_str(line);
            result.push('\n');
            continue;
        }
        if let Some(bi) = line_to_block[li] {
            let target = pad_target[bi];
            if target > 0 && is_pipe_row(line) && !is_separator(line) {
                result.push_str(&pad_pipe_row(line, target));
                result.push('\n');
            } else if target > 0 && is_separator(line) {
                // Rebuild the (kept) leader separator at the group width.
                result.push('|');
                for _ in 0..target {
                    result.push_str(" --- |");
                }
                result.push('\n');
            } else {
                result.push_str(line);
                result.push('\n');
            }
        } else {
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
2517
2518#[cfg(test)]
2519mod tests {
2520 use super::*;
2521 use crate::models::bbox::BoundingBox;
2522 use crate::models::chunks::TextChunk;
2523 use crate::models::content::ContentElement;
2524 use crate::models::enums::{PdfLayer, TextFormat, TextType};
2525 use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
2526 use crate::models::table::{
2527 TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
2528 };
2529 use crate::models::text::{TextBlock, TextColumn, TextLine};
2530
    #[test]
    fn test_empty_doc() {
        // A document with no content elements falls back to the
        // "*No content extracted.*" placeholder instead of emitting nothing.
        let doc = PdfDocument::new("test.pdf".to_string());
        let md = to_markdown(&doc).unwrap();
        assert!(md.contains("No content extracted"));
    }
2537
    #[test]
    fn test_with_title() {
        // A non-empty document title is rendered as a leading H1.
        let mut doc = PdfDocument::new("test.pdf".to_string());
        doc.title = Some("My Title".to_string());
        let md = to_markdown(&doc).unwrap();
        assert!(md.starts_with("# My Title\n"));
    }
2545
    #[test]
    fn test_empty_title_not_rendered() {
        // A whitespace-only title must not produce a "# " heading line.
        let mut doc = PdfDocument::new("test.pdf".to_string());
        doc.title = Some(" ".to_string());
        let md = to_markdown(&doc).unwrap();
        assert!(
            !md.contains("# "),
            "Empty/whitespace title should not produce a heading"
        );
    }
2556
    #[test]
    fn test_repair_fragmented_words() {
        // Extraction splits like "Jurisdic tion" must be glued back together.
        assert_eq!(
            repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts"),
            "Jurisdiction Foreign Requirements"
        );
    }
2564
    #[test]
    fn test_reference_continuation_detected() {
        // A fragment starting with "arXiv…" continues a bibliography entry
        // even though the previous text ends with a period.
        assert!(should_merge_paragraph_text(
            "Scaling laws for transfer.",
            "arXiv preprint arXiv:2102.01293."
        ));
    }
2572
    // Build a minimal level-1 Heading element on page 1 containing a single
    // bold 12pt text chunk, for use as a test fixture. The chunk/line/block/
    // column nesting mirrors the real extraction hierarchy.
    fn make_heading(text: &str) -> ContentElement {
        let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
        // Leaf chunk: carries the actual text and font attributes.
        let chunk = TextChunk {
            value: text.to_string(),
            bbox: bbox.clone(),
            font_name: "Lato-Bold".to_string(),
            font_size: 12.0,
            font_weight: 700.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        };
        // One-chunk line marked as both line start and line end.
        let line = TextLine {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 702.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_chunks: vec![chunk],
            is_line_start: true,
            is_line_end: true,
            is_list_line: false,
            connected_line_art_label: None,
        };
        // Single-line block.
        let block = TextBlock {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 702.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_lines: vec![line],
            has_start_line: true,
            has_end_line: true,
            text_alignment: None,
        };
        // Single-block column.
        let column = TextColumn {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 702.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_blocks: vec![block],
        };
        // Wrap the column in the semantic heading hierarchy (level 1).
        ContentElement::Heading(SemanticHeading {
            base: SemanticParagraph {
                base: SemanticTextNode {
                    bbox,
                    index: None,
                    level: None,
                    semantic_type: crate::models::enums::SemanticType::Heading,
                    correct_semantic_score: None,
                    columns: vec![column],
                    font_weight: Some(700.0),
                    font_size: Some(12.0),
                    text_color: None,
                    italic_angle: None,
                    font_name: Some("Lato-Bold".to_string()),
                    text_format: None,
                    max_font_size: Some(12.0),
                    background_color: None,
                    is_hidden_text: false,
                },
                enclosed_top: false,
                enclosed_bottom: false,
                indentation: 0,
            },
            heading_level: Some(1),
        })
    }
2657
2658 fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
2659 let bbox = BoundingBox::new(Some(1), 72.0, bottom, 300.0, top);
2660 let chunk = TextChunk {
2661 value: text.to_string(),
2662 bbox: bbox.clone(),
2663 font_name: "Lato-Regular".to_string(),
2664 font_size: (top - bottom).max(1.0),
2665 font_weight: 400.0,
2666 italic_angle: 0.0,
2667 font_color: "#000000".to_string(),
2668 contrast_ratio: 21.0,
2669 symbol_ends: vec![],
2670 text_format: TextFormat::Normal,
2671 text_type: TextType::Regular,
2672 pdf_layer: PdfLayer::Main,
2673 ocg_visible: true,
2674 index: None,
2675 page_number: Some(1),
2676 level: None,
2677 mcid: None,
2678 };
2679 let line = TextLine {
2680 bbox: bbox.clone(),
2681 index: None,
2682 level: None,
2683 font_size: chunk.font_size,
2684 base_line: bottom + 2.0,
2685 slant_degree: 0.0,
2686 is_hidden_text: false,
2687 text_chunks: vec![chunk],
2688 is_line_start: true,
2689 is_line_end: true,
2690 is_list_line: false,
2691 connected_line_art_label: None,
2692 };
2693 let block = TextBlock {
2694 bbox: bbox.clone(),
2695 index: None,
2696 level: None,
2697 font_size: line.font_size,
2698 base_line: line.base_line,
2699 slant_degree: 0.0,
2700 is_hidden_text: false,
2701 text_lines: vec![line],
2702 has_start_line: true,
2703 has_end_line: true,
2704 text_alignment: None,
2705 };
2706 let column = TextColumn {
2707 bbox: bbox.clone(),
2708 index: None,
2709 level: None,
2710 font_size: block.font_size,
2711 base_line: block.base_line,
2712 slant_degree: 0.0,
2713 is_hidden_text: false,
2714 text_blocks: vec![block],
2715 };
2716 ContentElement::Paragraph(SemanticParagraph {
2717 base: SemanticTextNode {
2718 bbox,
2719 index: None,
2720 level: None,
2721 semantic_type: crate::models::enums::SemanticType::Paragraph,
2722 correct_semantic_score: None,
2723 columns: vec![column],
2724 font_weight: Some(400.0),
2725 font_size: Some(top - bottom),
2726 text_color: None,
2727 italic_angle: None,
2728 font_name: Some("Lato-Regular".to_string()),
2729 text_format: None,
2730 max_font_size: Some(top - bottom),
2731 background_color: None,
2732 is_hidden_text: false,
2733 },
2734 enclosed_top: false,
2735 enclosed_bottom: false,
2736 indentation: 0,
2737 })
2738 }
2739
2740 fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
2741 let mut table_rows = Vec::new();
2742 for (ri, (title, page)) in rows.iter().enumerate() {
2743 let top = 680.0 - ri as f64 * 18.0;
2744 let bottom = top - 12.0;
2745 let left_bbox = BoundingBox::new(Some(1), 72.0, bottom, 280.0, top);
2746 let right_bbox = BoundingBox::new(Some(1), 320.0, bottom, 360.0, top);
2747 table_rows.push(TableBorderRow {
2748 bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
2749 index: None,
2750 level: None,
2751 row_number: ri,
2752 cells: vec![
2753 TableBorderCell {
2754 bbox: left_bbox.clone(),
2755 index: None,
2756 level: None,
2757 row_number: ri,
2758 col_number: 0,
2759 row_span: 1,
2760 col_span: 1,
2761 content: vec![TableToken {
2762 base: TextChunk {
2763 value: (*title).to_string(),
2764 bbox: left_bbox,
2765 font_name: "Lato-Regular".to_string(),
2766 font_size: 10.0,
2767 font_weight: 400.0,
2768 italic_angle: 0.0,
2769 font_color: "#000000".to_string(),
2770 contrast_ratio: 21.0,
2771 symbol_ends: vec![],
2772 text_format: TextFormat::Normal,
2773 text_type: TextType::Regular,
2774 pdf_layer: PdfLayer::Main,
2775 ocg_visible: true,
2776 index: None,
2777 page_number: Some(1),
2778 level: None,
2779 mcid: None,
2780 },
2781 token_type: TableTokenType::Text,
2782 }],
2783 contents: vec![],
2784 semantic_type: None,
2785 },
2786 TableBorderCell {
2787 bbox: right_bbox.clone(),
2788 index: None,
2789 level: None,
2790 row_number: ri,
2791 col_number: 1,
2792 row_span: 1,
2793 col_span: 1,
2794 content: vec![TableToken {
2795 base: TextChunk {
2796 value: (*page).to_string(),
2797 bbox: right_bbox,
2798 font_name: "Lato-Regular".to_string(),
2799 font_size: 10.0,
2800 font_weight: 400.0,
2801 italic_angle: 0.0,
2802 font_color: "#000000".to_string(),
2803 contrast_ratio: 21.0,
2804 symbol_ends: vec![],
2805 text_format: TextFormat::Normal,
2806 text_type: TextType::Regular,
2807 pdf_layer: PdfLayer::Main,
2808 ocg_visible: true,
2809 index: None,
2810 page_number: Some(1),
2811 level: None,
2812 mcid: None,
2813 },
2814 token_type: TableTokenType::Text,
2815 }],
2816 contents: vec![],
2817 semantic_type: None,
2818 },
2819 ],
2820 semantic_type: None,
2821 });
2822 }
2823
2824 ContentElement::TableBorder(TableBorder {
2825 bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
2826 index: None,
2827 level: Some("1".to_string()),
2828 x_coordinates: vec![72.0, 320.0, 360.0],
2829 x_widths: vec![0.0, 0.0, 0.0],
2830 y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
2831 y_widths: vec![0.0, 0.0, 0.0, 0.0],
2832 rows: table_rows,
2833 num_rows: rows.len(),
2834 num_columns: 2,
2835 is_bad_table: false,
2836 is_table_transformer: false,
2837 previous_table: None,
2838 next_table: None,
2839 })
2840 }
2841
2842 #[test]
2843 fn test_contents_document_renders_toc_table_rows() {
2844 let mut doc = PdfDocument::new("contents.pdf".to_string());
2845 doc.kids.push(make_heading("CONTENTS"));
2846 doc.kids.push(make_toc_table(&[
2847 ("Experiment #1: Hydrostatic Pressure", "3"),
2848 ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
2849 ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
2850 ("Experiment #4: Energy Loss in Pipes", "33"),
2851 ("Experiment #5: Impact of a Jet", "43"),
2852 ("Experiment #6: Orifice and Free Jet Flow", "50"),
2853 ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
2854 ("References", "101"),
2855 ]));
2856
2857 let md = to_markdown(&doc).unwrap();
2858 assert!(md.contains("Experiment #1: Hydrostatic Pressure 3"));
2859 assert!(md.contains("Experiment #2: Bernoulli's Theorem Demonstration 13"));
2860 assert!(md.contains("Experiment #7: Osborne Reynolds' Demonstration 59"));
2861 assert!(md.contains("References 101"));
2862 }
2863
2864 #[test]
2865 fn test_toc_semantic_paragraphs_render_without_blank_lines() {
2866 let mut doc = PdfDocument::new("toc-semantic.pdf".to_string());
2867 let mut first = make_paragraph(
2868 "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
2869 700.0,
2870 712.0,
2871 );
2872 let mut second = make_paragraph("Section 5.1: The Linear Model 35", 684.0, 696.0);
2873 if let ContentElement::Paragraph(p) = &mut first {
2874 p.base.semantic_type = SemanticType::TableOfContent;
2875 }
2876 if let ContentElement::Paragraph(p) = &mut second {
2877 p.base.semantic_type = SemanticType::TableOfContent;
2878 }
2879 doc.kids.push(first);
2880 doc.kids.push(second);
2881
2882 let md = to_markdown(&doc).unwrap();
2883 assert!(md.contains(
2884 "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n"
2885 ));
2886 }
2887
2888 #[test]
2889 fn test_compact_toc_document_renders_without_blank_lines() {
2890 let mut doc = PdfDocument::new("compact-toc.pdf".to_string());
2891 doc.kids.push(make_paragraph(
2892 "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
2893 700.0,
2894 712.0,
2895 ));
2896 doc.kids.push(make_paragraph(
2897 "Section 5.1: The Linear Model 35",
2898 684.0,
2899 696.0,
2900 ));
2901 doc.kids.push(make_paragraph(
2902 "Part VI. Chapter Six - Comparing Three or More Group Means",
2903 668.0,
2904 680.0,
2905 ));
2906 doc.kids.push(make_paragraph(
2907 "Section 6.1: Between Versus Within Group Analyses 49",
2908 652.0,
2909 664.0,
2910 ));
2911 doc.kids.push(make_paragraph(
2912 "Part VII. Chapter Seven - Moderation and Mediation Analyses",
2913 636.0,
2914 648.0,
2915 ));
2916 doc.kids.push(make_paragraph(
2917 "Section 7.1: Mediation and Moderation Models 64",
2918 620.0,
2919 632.0,
2920 ));
2921 doc.kids
2922 .push(make_paragraph("References 101", 604.0, 616.0));
2923 doc.kids.push(make_paragraph(
2924 "Section 8.1: Factor Analysis Definitions 75",
2925 588.0,
2926 600.0,
2927 ));
2928
2929 let md = to_markdown(&doc).unwrap();
2930 assert!(!md.contains("\n\nSection 5.1: The Linear Model 35"));
2931 assert!(md.contains("Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35"));
2932 }
2933
2934 #[test]
2935 fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
2936 let mut doc = PdfDocument::new("caption-body.pdf".to_string());
2937 doc.kids.push(make_paragraph(
2938 "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.",
2939 500.0,
2940 540.0,
2941 ));
2942
2943 let md = to_markdown(&doc).unwrap();
2944 assert!(md.contains("USGS)\n\nOur nearest astronomical neighbor"));
2945 }
2946
2947 #[test]
2948 fn test_short_caption_label_merges_with_following_tail_and_body() {
2949 let mut doc = PdfDocument::new("diagram-caption.pdf".to_string());
2950 doc.kids.push(make_paragraph("Diagram 5", 540.0, 552.0));
2951 doc.kids.push(make_paragraph(
2952 "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.",
2953 520.0,
2954 532.0,
2955 ));
2956
2957 let md = to_markdown(&doc).unwrap();
2958 assert!(md.contains(
2959 "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers"
2960 ));
2961 }
2962
2963 #[test]
2964 fn test_short_caption_label_merges_with_tail_and_year() {
2965 let mut doc = PdfDocument::new("figure-caption.pdf".to_string());
2966 doc.kids.push(make_paragraph("Figure 4", 540.0, 552.0));
2967 doc.kids.push(make_paragraph(
2968 "Komnas HAM's YouTube channel as of 1 December",
2969 520.0,
2970 532.0,
2971 ));
2972 doc.kids.push(make_paragraph("2021", 500.0, 512.0));
2973
2974 let md = to_markdown(&doc).unwrap();
2975 assert!(md.contains("Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021"));
2976 assert!(!md.contains("\n\n2021"));
2977 }
2978
2979 #[test]
2980 fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
2981 let mut doc = PdfDocument::new("chart.pdf".to_string());
2982 doc.kids.push(make_paragraph("Figure 1", 760.0, 772.0));
2983 doc.kids.push(make_paragraph("100", 520.0, 528.0));
2984 doc.kids
2985 .push(make_paragraph("Body text continues here.", 400.0, 412.0));
2986 doc.kids.push(make_paragraph("36", 20.0, 28.0));
2987
2988 let md = to_markdown(&doc).unwrap();
2989 assert!(md.contains("100"));
2990 assert!(!md.lines().any(|line| line.trim() == "36"));
2991 }
2992
2993 #[test]
2994 fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
2995 let mut doc = PdfDocument::new("paragraphs.pdf".to_string());
2996 doc.kids.push(make_paragraph(
2997 "First semantic paragraph ends here.",
2998 520.0,
2999 532.0,
3000 ));
3001 doc.kids.push(make_paragraph(
3002 "Second semantic paragraph starts here.",
3003 500.0,
3004 512.0,
3005 ));
3006
3007 let md = to_markdown(&doc).unwrap();
3008 assert!(md.contains(
3009 "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here."
3010 ));
3011 }
3012
3013 #[test]
3014 fn test_lowercase_semantic_paragraph_continuation_is_merged() {
3015 let mut doc = PdfDocument::new("continuation.pdf".to_string());
3016 doc.kids.push(make_paragraph(
3017 "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference",
3018 520.0,
3019 532.0,
3020 ));
3021 doc.kids.push(make_paragraph("of interest.", 500.0, 512.0));
3022
3023 let md = to_markdown(&doc).unwrap();
3024 assert!(md.contains(
3025 "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest."
3026 ));
3027 }
3028
3029 fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
3030 let mut table_rows = Vec::new();
3031 for (row_number, (left, right)) in rows.iter().enumerate() {
3032 let top = 656.0 - row_number as f64 * 18.0;
3033 let bottom = top - 16.0;
3034 let mut cells = Vec::new();
3035 for (col_number, (text, left_x, right_x)) in
3036 [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
3037 .into_iter()
3038 .enumerate()
3039 {
3040 let content = if text.is_empty() {
3041 Vec::new()
3042 } else {
3043 vec![TableToken {
3044 base: TextChunk {
3045 value: text.to_string(),
3046 bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
3047 font_name: "Test".to_string(),
3048 font_size: 11.0,
3049 font_weight: 400.0,
3050 italic_angle: 0.0,
3051 font_color: "[0.0]".to_string(),
3052 contrast_ratio: 21.0,
3053 symbol_ends: Vec::new(),
3054 text_format: TextFormat::Normal,
3055 text_type: TextType::Regular,
3056 pdf_layer: PdfLayer::Main,
3057 ocg_visible: true,
3058 index: None,
3059 page_number: Some(1),
3060 level: None,
3061 mcid: None,
3062 },
3063 token_type: TableTokenType::Text,
3064 }]
3065 };
3066 cells.push(TableBorderCell {
3067 bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
3068 index: None,
3069 level: None,
3070 row_number,
3071 col_number,
3072 row_span: 1,
3073 col_span: 1,
3074 content,
3075 contents: vec![],
3076 semantic_type: None,
3077 });
3078 }
3079
3080 table_rows.push(TableBorderRow {
3081 bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
3082 index: None,
3083 level: None,
3084 row_number,
3085 cells,
3086 semantic_type: None,
3087 });
3088 }
3089
3090 ContentElement::TableBorder(TableBorder {
3091 bbox: BoundingBox::new(
3092 Some(1),
3093 72.0,
3094 656.0 - rows.len() as f64 * 18.0 - 16.0,
3095 420.0,
3096 656.0,
3097 ),
3098 index: None,
3099 level: Some("1".to_string()),
3100 x_coordinates: vec![72.0, 220.0, 420.0],
3101 x_widths: vec![0.0; 3],
3102 y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
3103 y_widths: vec![0.0; rows.len() + 1],
3104 rows: table_rows,
3105 num_rows: rows.len(),
3106 num_columns: 2,
3107 is_bad_table: false,
3108 is_table_transformer: false,
3109 previous_table: None,
3110 next_table: None,
3111 })
3112 }
3113
3114 #[test]
3115 fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
3116 let mut doc = PdfDocument::new("cec-table.pdf".to_string());
3117 doc.number_of_pages = 1;
3118 doc.kids.push(make_two_column_table(&[
3119 ("Mineral or colloid type", "CEC of pure colloid"),
3120 ("", "cmolc/kg"),
3121 ("kaolinite", "10"),
3122 ("illite", "30"),
3123 ]));
3124
3125 let md = to_markdown(&doc).unwrap();
3126 assert!(md.contains("| --- | --- |"));
3127 assert!(md.contains("| kaolinite | 10 |"));
3128 }
3129
3130 #[test]
3131 fn test_blank_right_column_table_is_not_misrendered_as_toc() {
3132 let mut doc = PdfDocument::new("flocculation-table.pdf".to_string());
3133 doc.number_of_pages = 1;
3134 doc.kids.push(make_two_column_table(&[
3135 (
3136 "Added cation",
3137 "Relative Size & Settling Rates of Floccules",
3138 ),
3139 ("K+", ""),
3140 ("Na+", ""),
3141 ("Ca2+", ""),
3142 ]));
3143
3144 let md = to_markdown(&doc).unwrap();
3145 assert!(md.contains("| Added cation | Relative Size & Settling Rates of Floccules |"));
3146 assert!(md.contains("| K+ | |"));
3147 }
3148
3149 #[test]
3150 fn test_merge_tables_across_heading() {
3151 let input = "some text\n\n\
3152 | Area | Competence |\n\
3153 | --- | --- |\n\
3154 | Row1 | Val1 |\n\
3155 | Row2 | Val2 |\n\
3156 \n\
3157 # Heading Between\n\
3158 \n\
3159 | Row3 | Val3 |\n\
3160 | --- | --- |\n\
3161 \n\
3162 more text\n";
3163 let result = merge_adjacent_pipe_tables(input);
3164 assert!(
3166 result.contains("| Heading Between |"),
3167 "Heading should be in pipe row: {}",
3168 result
3169 );
3170 assert!(
3172 !result.contains("# Heading Between"),
3173 "Heading marker should be removed: {}",
3174 result
3175 );
3176 assert!(
3178 result.contains("| Row3 |") || result.contains("Row3"),
3179 "Row3 should exist: {}",
3180 result
3181 );
3182 }
3183}