use super::types::PdfParagraph;
/// Merges consecutive body-text paragraphs that look like one paragraph split in two.
///
/// A paragraph is folded into the one before it when all of the following hold:
/// - neither paragraph is a heading, list item, code block, or formula;
/// - their dominant font sizes differ by less than 2.0;
/// - the earlier paragraph does not end with a sentence terminator, **or** the later
///   paragraph starts with a lowercase letter.
///
/// Merging appends the later paragraph's lines to the earlier one; no other field of the
/// earlier paragraph is modified. Because the accumulated paragraph is compared against each
/// following paragraph in turn, a run of several fragments can collapse into one.
pub(super) fn merge_continuation_paragraphs(paragraphs: &mut Vec<PdfParagraph>) {
    // With fewer than two paragraphs there is nothing to merge.
    if paragraphs.len() < 2 {
        return;
    }

    // Take ownership of the old list so `paragraphs` can be rebuilt in place.
    let mut iter = std::mem::take(paragraphs).into_iter();
    let Some(mut current) = iter.next() else {
        // Not reachable given the length check above; handled without panicking anyway.
        return;
    };

    for next in iter {
        let both_body = current.heading_level.is_none()
            && next.heading_level.is_none()
            && !current.is_list_item
            && !next.is_list_item
            && !current.is_code_block
            && !next.is_code_block
            && !current.is_formula
            && !next.is_formula;
        let fonts_compatible = (current.dominant_font_size - next.dominant_font_size).abs() < 2.0;
        let continuation_signal =
            !ends_with_sentence_terminator(&current) || starts_with_lowercase_continuation(&next);

        if both_body && fonts_compatible && continuation_signal {
            current.lines.extend(next.lines);
        } else {
            paragraphs.push(current);
            current = next;
        }
    }

    // The accumulator always holds one pending paragraph after the loop.
    paragraphs.push(current);
}
/// Reports whether the paragraph's text begins with a lowercase letter.
///
/// Only the first segment of the first line is inspected, after trimming leading whitespace.
/// A paragraph with no lines, a first line with no segments, or a blank first segment
/// yields `false`.
fn starts_with_lowercase_continuation(para: &PdfParagraph) -> bool {
    let Some(segment) = para.lines.first().and_then(|line| line.segments.first()) else {
        return false;
    };

    segment
        .text
        .trim_start()
        .chars()
        .next()
        .is_some_and(char::is_lowercase)
}
/// Reports whether the paragraph's text ends with sentence-final punctuation.
///
/// Only the last segment of the last line is inspected, after trimming trailing whitespace.
/// The accepted characters are `.`, `?`, `!`, `:`, `;` and the code points U+3002, U+FF1F
/// and U+FF01. A paragraph with no lines, a last line with no segments, or a blank last
/// segment yields `false`.
fn ends_with_sentence_terminator(para: &PdfParagraph) -> bool {
    const TERMINATORS: [char; 8] = ['.', '?', '!', ':', ';', '\u{3002}', '\u{FF1F}', '\u{FF01}'];

    para.lines
        .last()
        .and_then(|line| line.segments.last())
        .and_then(|segment| segment.text.trim_end().chars().next_back())
        .is_some_and(|c| TERMINATORS.contains(&c))
}
/// Splits body paragraphs that contain two or more inline `•` (U+2022) bullets into
/// separate paragraphs.
///
/// Headings, list items, code blocks and formulas are passed through untouched, as are
/// body paragraphs containing fewer than two bullets. For a paragraph that is split, the
/// text of all its segments is joined with single spaces and cut at every bullet:
/// non-empty text before the first bullet becomes a regular paragraph and each non-empty
/// piece after a bullet becomes a list item. The replacements are built by
/// `text_to_paragraph` from the piece text plus the original paragraph's dominant font
/// size and bold flag only.
pub(super) fn split_embedded_list_items(paragraphs: &mut Vec<PdfParagraph>) {
    const BULLET: char = '\u{2022}';

    for para in std::mem::take(paragraphs) {
        let is_plain_body = para.heading_level.is_none()
            && !para.is_list_item
            && !para.is_code_block
            && !para.is_formula;
        if !is_plain_body {
            paragraphs.push(para);
            continue;
        }

        // Flatten every segment of every line into one space-separated string.
        let mut full_text = String::new();
        let all_segments = para.lines.iter().flat_map(|line| line.segments.iter());
        for (position, segment) in all_segments.enumerate() {
            if position > 0 {
                full_text.push(' ');
            }
            full_text.push_str(segment.text.as_str());
        }

        if full_text.matches(BULLET).count() < 2 {
            paragraphs.push(para);
            continue;
        }

        // Piece 0 is whatever precedes the first bullet (plain text); every later piece
        // followed a bullet and is therefore emitted as a list item.
        for (index, piece) in full_text.split(BULLET).map(str::trim).enumerate() {
            if piece.is_empty() {
                continue;
            }
            paragraphs.push(text_to_paragraph(
                piece,
                para.dominant_font_size,
                para.is_bold,
                index > 0,
            ));
        }
    }
}
/// Builds a synthetic single-line paragraph from plain text.
///
/// The text is split on whitespace and each word becomes its own segment. All positional
/// fields (`x`, `y`, `width`, `height`, `baseline_y`) are set to zero, and the italic and
/// monospace flags are `false`. The paragraph's `text` field is left empty; the heading,
/// code-block, formula and page-furniture markers are cleared, and the optional layout
/// fields are `None`.
fn text_to_paragraph(text: &str, font_size: f32, is_bold: bool, is_list_item: bool) -> PdfParagraph {
    use crate::pdf::hierarchy::SegmentData;

    let word_segment = |word: &str| SegmentData {
        text: word.to_string(),
        x: 0.0,
        y: 0.0,
        width: 0.0,
        height: 0.0,
        font_size,
        is_bold,
        is_italic: false,
        is_monospace: false,
        baseline_y: 0.0,
    };
    let segments: Vec<SegmentData> = text.split_whitespace().map(word_segment).collect();

    PdfParagraph {
        text: String::new(),
        lines: vec![super::types::PdfLine {
            segments,
            baseline_y: 0.0,
            dominant_font_size: font_size,
            is_bold,
            is_monospace: false,
        }],
        dominant_font_size: font_size,
        heading_level: None,
        is_bold,
        is_list_item,
        is_code_block: false,
        is_formula: false,
        is_page_furniture: false,
        layout_class: None,
        caption_for: None,
        block_bbox: None,
    }
}
/// Reports whether the start of `text` looks like a list marker, tolerating markers that
/// were broken across whitespace.
///
/// Only the first three whitespace-separated tokens are considered. The result is `true`
/// when any one of those tokens is a list prefix on its own, or when the first two or the
/// first three tokens concatenated without spaces form one (so `"(1 )"` is treated like
/// `"(1)"`). Empty or all-whitespace input yields `false`.
pub(super) fn is_list_prefix_multi_token(text: &str) -> bool {
    // `split_whitespace` skips leading and trailing whitespace, so blank input simply
    // produces no tokens and falls through to `false`.
    let tokens: Vec<&str> = text.split_whitespace().take(3).collect();

    if tokens.iter().any(|token| is_single_token_list_prefix(token)) {
        return true;
    }

    // Re-glue the first two, then the first three, tokens to recover split markers.
    (2..=tokens.len()).any(|count| is_single_token_list_prefix(&tokens[..count].concat()))
}
fn is_single_token_list_prefix(text: &str) -> bool {
let trimmed = text.trim();
if matches!(
trimmed,
"-" | "*"
| "\u{2022}"
| "\u{2013}"
| "\u{2014}"
| "\u{2023}"
| "\u{25E6}"
| "\u{25AA}"
| "\u{25CF}"
| "\u{2043}"
| "\u{27A2}"
) {
return true;
}
let bytes = trimmed.as_bytes();
if bytes.is_empty() {
return false;
}
let digit_end = bytes.iter().position(|&b| !b.is_ascii_digit()).unwrap_or(bytes.len());
if digit_end > 0 && digit_end < bytes.len() {
let suffix = bytes[digit_end];
if suffix == b'.' || suffix == b')' || suffix == b':' {
return true;
}
}
if bytes.len() >= 3 && bytes[0] == b'(' && bytes[bytes.len() - 1] == b')' {
let char_count = trimmed.chars().count();
if char_count >= 3 {
let inner: String = trimmed.chars().skip(1).take(char_count - 2).collect();
if inner.chars().all(|c| c.is_ascii_digit())
|| (inner.len() == 1 && inner.chars().next().is_some_and(|c| c.is_ascii_alphabetic()))
|| is_roman_numeral(&inner)
{
return true;
}
}
}
if bytes.len() >= 3 && bytes[0] == b'[' && bytes[bytes.len() - 1] == b']' {
let char_count = trimmed.chars().count();
if char_count >= 3 {
let inner: String = trimmed.chars().skip(1).take(char_count - 2).collect();
if inner.chars().all(|c| c.is_ascii_digit())
|| (inner.len() == 1 && inner.chars().next().is_some_and(|c| c.is_ascii_alphabetic()))
|| is_roman_numeral(&inner)
{
return true;
}
}
}
if bytes.len() == 2 && bytes[0].is_ascii_alphabetic() && (bytes[1] == b'.' || bytes[1] == b')') {
return true;
}
if trimmed.ends_with('.') || trimmed.ends_with(')') {
let prefix = &trimmed[..trimmed.len() - 1];
if is_roman_numeral(prefix) {
return true;
}
}
false
}
/// Reports whether `s` is a roman numeral from one through twelve (`i` … `xii`), compared
/// without regard to ASCII case.
///
/// Any other input — including the empty string, larger numerals, or text with surrounding
/// whitespace — yields `false`.
fn is_roman_numeral(s: &str) -> bool {
    const NUMERALS: [&str; 12] = [
        "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii",
    ];

    NUMERALS
        .iter()
        .any(|numeral| numeral.eq_ignore_ascii_case(s))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lone bullet glyphs are recognised as list prefixes.
    #[test]
    fn test_single_token_list_prefix_bullet_chars() {
        assert!(is_single_token_list_prefix("-"));
        assert!(is_single_token_list_prefix("*"));
        assert!(is_single_token_list_prefix("\u{2022}"));
        assert!(is_single_token_list_prefix("\u{2013}"));
    }

    /// Digits followed by `.`, `)` or `:` are recognised.
    #[test]
    fn test_single_token_list_prefix_numbered() {
        assert!(is_single_token_list_prefix("1."));
        assert!(is_single_token_list_prefix("2)"));
        assert!(is_single_token_list_prefix("10."));
        assert!(is_single_token_list_prefix("3:"));
    }

    /// Parenthesised digits, single letters and roman numerals are recognised.
    #[test]
    fn test_single_token_list_prefix_parenthesized() {
        assert!(is_single_token_list_prefix("(1)"));
        assert!(is_single_token_list_prefix("(a)"));
        assert!(is_single_token_list_prefix("(A)"));
        assert!(is_single_token_list_prefix("(iv)"));
        assert!(is_single_token_list_prefix("(12)"));
    }

    /// Bracketed digits, single letters and roman numerals are recognised.
    #[test]
    fn test_single_token_list_prefix_bracketed() {
        assert!(is_single_token_list_prefix("[1]"));
        assert!(is_single_token_list_prefix("[a]"));
        assert!(is_single_token_list_prefix("[12]"));
        assert!(is_single_token_list_prefix("[iv]"));
    }

    /// A single letter followed by `.` or `)` is recognised.
    #[test]
    fn test_single_token_list_prefix_alphabetic() {
        assert!(is_single_token_list_prefix("a."));
        assert!(is_single_token_list_prefix("b)"));
        assert!(is_single_token_list_prefix("A."));
    }

    /// A roman numeral followed by `.` or `)` is recognised, in either case.
    #[test]
    fn test_single_token_list_prefix_roman() {
        assert!(is_single_token_list_prefix("i."));
        assert!(is_single_token_list_prefix("ii."));
        assert!(is_single_token_list_prefix("IV."));
        assert!(is_single_token_list_prefix("iii)"));
    }

    /// Ordinary words and the empty string are not list prefixes.
    #[test]
    fn test_single_token_list_prefix_not_regular_text() {
        assert!(!is_single_token_list_prefix("Hello"));
        assert!(!is_single_token_list_prefix("The"));
        assert!(!is_single_token_list_prefix(""));
    }

    /// A marker in the first token is detected when followed by item text.
    #[test]
    fn test_multi_token_first_token_bullet() {
        assert!(is_list_prefix_multi_token("- item text"));
        assert!(is_list_prefix_multi_token("1. first item"));
        assert!(is_list_prefix_multi_token("(a) first item"));
    }

    /// A parenthesised marker split across two tokens is re-joined and detected.
    #[test]
    fn test_multi_token_parenthesized_split() {
        assert!(is_list_prefix_multi_token("(1 ) rest of text"));
    }

    /// A bracketed marker preceded by whitespace is detected.
    #[test]
    fn test_multi_token_bracketed_split() {
        assert!(is_list_prefix_multi_token(" [iv] text here"));
    }

    /// Plain sentences and the empty string are not treated as list items.
    #[test]
    fn test_multi_token_not_list() {
        assert!(!is_list_prefix_multi_token("This is regular text"));
        assert!(!is_list_prefix_multi_token("The quick brown fox"));
        assert!(!is_list_prefix_multi_token(""));
    }

    /// Leading spaces or tabs do not prevent detection.
    #[test]
    fn test_multi_token_leading_whitespace() {
        assert!(is_list_prefix_multi_token(" 1. indented item"));
        assert!(is_list_prefix_multi_token("\t(a) tabbed item"));
    }

    /// Builds a plain body paragraph holding one line with one segment containing `text`.
    fn make_body_paragraph(text: &str, font_size: f32) -> PdfParagraph {
        use crate::pdf::hierarchy::SegmentData;

        let segments = vec![SegmentData {
            text: text.to_string(),
            x: 0.0,
            y: 700.0,
            width: 200.0,
            height: font_size,
            font_size,
            is_bold: false,
            is_italic: false,
            is_monospace: false,
            baseline_y: 700.0,
        }];

        PdfParagraph {
            text: String::new(),
            lines: vec![super::super::types::PdfLine {
                segments,
                baseline_y: 700.0,
                dominant_font_size: font_size,
                is_bold: false,
                is_monospace: false,
            }],
            dominant_font_size: font_size,
            heading_level: None,
            is_bold: false,
            is_list_item: false,
            is_code_block: false,
            is_formula: false,
            is_page_furniture: false,
            layout_class: None,
            caption_for: None,
            block_bbox: None,
        }
    }

    /// A lowercase start merges even when the previous paragraph ends with a period.
    #[test]
    fn test_merge_lowercase_continuation() {
        let mut paragraphs = vec![
            make_body_paragraph("The regulation requires.", 12.0),
            make_body_paragraph("and all operators must comply", 12.0),
        ];
        merge_continuation_paragraphs(&mut paragraphs);
        assert_eq!(paragraphs.len(), 1, "lowercase continuation should be merged");
    }

    /// A font-size difference of 2.0 or more blocks merging.
    #[test]
    fn test_no_merge_different_font_sizes() {
        let mut paragraphs = vec![
            make_body_paragraph("First paragraph", 12.0),
            make_body_paragraph("second paragraph", 16.0),
        ];
        merge_continuation_paragraphs(&mut paragraphs);
        assert_eq!(paragraphs.len(), 2, "different font sizes should prevent merge");
    }

    /// A paragraph without a sentence terminator merges with the next one.
    #[test]
    fn test_merge_no_terminator() {
        let mut paragraphs = vec![
            make_body_paragraph("The regulation requires", 12.0),
            make_body_paragraph("All operators must comply", 12.0),
        ];
        merge_continuation_paragraphs(&mut paragraphs);
        assert_eq!(paragraphs.len(), 1, "unterminated paragraph should merge with next");
    }

    /// A terminated paragraph followed by an uppercase start stays separate.
    #[test]
    fn test_no_merge_terminated_uppercase() {
        let mut paragraphs = vec![
            make_body_paragraph("The regulation requires compliance.", 12.0),
            make_body_paragraph("All operators must comply", 12.0),
        ];
        merge_continuation_paragraphs(&mut paragraphs);
        assert_eq!(
            paragraphs.len(),
            2,
            "terminated paragraph + uppercase start should not merge"
        );
    }

    /// The lowercase-start helper distinguishes lowercase from uppercase openings.
    #[test]
    fn test_starts_with_lowercase_continuation_fn() {
        let para_lower = make_body_paragraph("and furthermore", 12.0);
        assert!(starts_with_lowercase_continuation(&para_lower));
        let para_upper = make_body_paragraph("Furthermore", 12.0);
        assert!(!starts_with_lowercase_continuation(&para_upper));
    }
}