use crate::pdf::markdown::paragraphs::ends_with_sentence_terminator;
use crate::pdf::markdown::types::{LayoutHintClass, PdfParagraph};
/// Folds a body-text paragraph into its predecessor when the predecessor
/// looks like an unfinished sentence.
///
/// Two neighbouring paragraphs are merged when all of the following hold:
///
/// * neither is a heading, list item, code block or formula,
/// * both carry the same `layout_class`,
/// * their `dominant_font_size` values differ by less than 2.0, and
/// * [`ends_with_sentence_terminator`] reports `false` for the first one.
///
/// The surviving paragraph keeps its own metadata and receives the lines of
/// the absorbed paragraph. Each candidate is compared against the already
/// merged predecessor, so a run of continuations collapses into one paragraph.
pub(in crate::pdf::markdown) fn merge_continuation_paragraphs_region_aware(paragraphs: &mut Vec<PdfParagraph>) {
    /// True for ordinary running text (no heading, list, code or formula flag).
    fn is_plain_text(para: &PdfParagraph) -> bool {
        para.heading_level.is_none() && !para.is_list_item && !para.is_code_block && !para.is_formula
    }

    // `dedup_by` passes the candidate first and the last retained element
    // second; returning `true` drops the candidate. This performs the whole
    // merge in one linear, in-place pass instead of shifting the tail of the
    // vector on every `remove`.
    paragraphs.dedup_by(|next, current| {
        let should_merge = is_plain_text(current)
            && is_plain_text(next)
            && current.layout_class == next.layout_class
            && (current.dominant_font_size - next.dominant_font_size).abs() < 2.0
            && !ends_with_sentence_terminator(current);
        if should_merge {
            // Move the lines over before the candidate is dropped.
            current.lines.append(&mut next.lines);
        }
        should_merge
    });
}
/// Collapses every run of adjacent code-block paragraphs into a single
/// paragraph.
///
/// The first paragraph of a run survives (keeping its metadata) and receives
/// the lines of every following code block in order. Paragraphs that are not
/// code blocks are left untouched and break a run.
pub(in crate::pdf::markdown) fn merge_consecutive_code_blocks(paragraphs: &mut Vec<PdfParagraph>) {
    // `dedup_by` passes the candidate first and the last retained element
    // second; returning `true` drops the candidate. One linear pass replaces
    // the repeated `Vec::remove`, which shifted the tail on every merge.
    paragraphs.dedup_by(|next, current| {
        let both_code = current.is_code_block && next.is_code_block;
        if both_code {
            // Move the lines over before the candidate is dropped.
            current.lines.append(&mut next.lines);
        }
        both_code
    });
}
/// Re-attaches wrapped list-item text to the list item it belongs to.
///
/// When two neighbouring paragraphs are both flagged as list items, the second
/// is folded into the first if
///
/// * [`ends_with_sentence_terminator`] reports `false` for the first, and
/// * the first whitespace-delimited word of the second paragraph's first
///   segment is not recognised by `is_list_prefix` (a paragraph without any
///   line or segment counts as having no prefix).
///
/// The surviving item keeps its metadata and gains the absorbed lines; each
/// candidate is compared against the already merged predecessor.
pub(in crate::pdf::markdown) fn merge_list_continuations(paragraphs: &mut Vec<PdfParagraph>) {
    /// True when the paragraph's very first word is a list marker.
    fn starts_with_list_prefix(para: &PdfParagraph) -> bool {
        para.lines
            .first()
            .and_then(|line| line.segments.first())
            .map(|segment| {
                let first_word = segment.text.split_whitespace().next().unwrap_or("");
                crate::pdf::markdown::paragraphs::is_list_prefix(first_word)
            })
            .unwrap_or(false)
    }

    // `dedup_by` passes the candidate first and the last retained element
    // second; returning `true` drops the candidate. One linear pass replaces
    // the repeated `Vec::remove`, which shifted the tail on every merge.
    paragraphs.dedup_by(|next, current| {
        let is_continuation = current.is_list_item
            && next.is_list_item
            && !ends_with_sentence_terminator(current)
            && !starts_with_list_prefix(next);
        if is_continuation {
            // Move the lines over before the candidate is dropped.
            current.lines.append(&mut next.lines);
        }
        is_continuation
    });
}
/// Clears the code-block flag on paragraphs whose text does not look like
/// code.
///
/// For every paragraph flagged `is_code_block`, the text of all segments of
/// all lines is joined with single spaces and handed to
/// `looks_like_non_code`. When that returns `true` the paragraph is turned
/// back into regular text: `is_code_block` is cleared and `layout_class` is
/// set to `LayoutHintClass::Text`. Other paragraphs are not touched.
pub(in crate::pdf::markdown) fn demote_non_code_blocks(paragraphs: &mut [PdfParagraph]) {
    for block in paragraphs.iter_mut().filter(|p| p.is_code_block) {
        let segment_texts: Vec<&str> = block
            .lines
            .iter()
            .flat_map(|line| line.segments.iter().map(|segment| segment.text.as_str()))
            .collect();
        let joined = segment_texts.join(" ");

        if !looks_like_non_code(&joined) {
            continue;
        }

        block.is_code_block = false;
        block.layout_class = Some(LayoutHintClass::Text);
    }
}
/// Heuristically decides whether text that was classified as a code block is
/// actually something else.
///
/// Returns `true` when either
///
/// * more than half of the whitespace-separated words are one- or two-digit
///   hexadecimal tokens, or
/// * the text is at least 10 characters long and fewer than 3% of its
///   characters are code punctuation (`(){}[]=<>;`).
///
/// Returns `false` for empty or whitespace-only text and for text shorter
/// than 10 characters that is not dominated by hex tokens.
///
/// Lengths are measured in Unicode scalar values, not UTF-8 bytes, so the
/// 10-character floor and the 3% density threshold treat multi-byte scripts
/// (for example CJK) the same way as ASCII.
fn looks_like_non_code(text: &str) -> bool {
    // `split_whitespace` never yields an empty word, so no emptiness check is
    // needed on individual tokens.
    let mut word_count = 0usize;
    let mut hex_count = 0usize;
    for word in text.split_whitespace() {
        word_count += 1;
        if word.len() <= 2 && word.chars().all(|c| c.is_ascii_hexdigit()) {
            hex_count += 1;
        }
    }
    if word_count == 0 {
        return false;
    }
    if hex_count * 2 > word_count {
        return true;
    }

    // Count both quantities in the same unit (chars) in a single pass;
    // whitespace is part of the total, as it always was.
    let (total_chars, code_chars) = text.chars().fold((0usize, 0usize), |(total, code), c| {
        let is_code_char = matches!(c, '(' | ')' | '{' | '}' | '[' | ']' | '=' | '<' | '>' | ';');
        (total + 1, code + usize::from(is_code_char))
    });
    if total_chars < 10 {
        // Too short to judge by punctuation density.
        return false;
    }
    code_chars * 100 < total_chars * 3
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pdf::hierarchy::SegmentData;
    use crate::pdf::markdown::types::{LayoutHintClass, PdfLine, PdfParagraph};

    /// Builds a regular 12pt, non-bold, non-monospace segment holding `text`.
    fn make_segment(text: &str) -> SegmentData {
        SegmentData {
            text: text.to_string(),
            x: 0.0,
            y: 700.0,
            width: 50.0,
            height: 12.0,
            font_size: 12.0,
            is_bold: false,
            is_italic: false,
            is_monospace: false,
            baseline_y: 700.0,
        }
    }

    /// Builds a line consisting of a single segment holding `text`.
    fn make_line_with_text(text: &str) -> PdfLine {
        PdfLine {
            segments: vec![make_segment(text)],
            baseline_y: 700.0,
            dominant_font_size: 12.0,
            is_bold: false,
            is_monospace: false,
        }
    }

    /// Builds a one-line body paragraph (no heading/list/code/formula flags)
    /// with the given layout class.
    fn body_para_with_text(text: &str, class: Option<LayoutHintClass>) -> PdfParagraph {
        PdfParagraph {
            lines: vec![make_line_with_text(text)],
            dominant_font_size: 12.0,
            heading_level: None,
            is_bold: false,
            is_list_item: false,
            is_code_block: false,
            is_formula: false,
            is_page_furniture: false,
            layout_class: class,
            caption_for: None,
            block_bbox: None,
        }
    }

    /// Runs the region-aware merge over two `Text` paragraphs and returns how
    /// many paragraphs remain.
    fn paragraph_count_after_merge(first_text: &str, second_text: &str) -> usize {
        let mut paragraphs = vec![
            body_para_with_text(first_text, Some(LayoutHintClass::Text)),
            body_para_with_text(second_text, Some(LayoutHintClass::Text)),
        ];
        merge_continuation_paragraphs_region_aware(&mut paragraphs);
        paragraphs.len()
    }

    #[test]
    fn test_cjk_sentence_terminator() {
        let first_text = "这是第一句。";
        let second_text = "这是第二句";
        assert_eq!(
            paragraph_count_after_merge(first_text, second_text),
            2,
            "paragraphs should NOT be merged when first ends with 。"
        );
    }

    #[test]
    fn test_fullwidth_question_mark() {
        // U+FF1F FULLWIDTH QUESTION MARK, written as an escape so that
        // compatibility normalization of the source file cannot silently turn
        // it into an ASCII '?' and hollow out this test.
        let first_text = "Is this correct\u{FF1F}";
        let second_text = "yes it is";
        assert_eq!(
            paragraph_count_after_merge(first_text, second_text),
            2,
            "paragraphs should NOT be merged when first ends with \u{FF1F}"
        );
    }

    #[test]
    fn test_paragraphs_without_terminator_are_merged() {
        assert_eq!(
            paragraph_count_after_merge("continuation without terminator", "second paragraph"),
            1,
            "paragraphs SHOULD be merged when first has no sentence terminator"
        );
    }
}