use crate::pipeline::reading_order::{ReadingOrder, SimpleReadingOrder, XYCutReadingOrder};
use crate::pipeline::{
Element, ElementBBox, ElementData, ElementMetadata, KeyValueElementData, TableElementData,
};
use crate::text::extraction::TextFragment;
/// Strategy used to order a page's text fragments before they are classified.
#[derive(Debug, Clone, Default)]
pub enum ReadingOrderStrategy {
/// Order fragments with `SimpleReadingOrder` (this is the default variant).
#[default]
Simple,
/// Order fragments with `XYCutReadingOrder`, constructed from `min_gap`.
XYCut { min_gap: f64 },
/// Keep fragments in the order supplied by the caller.
None,
}
/// Tunable knobs for `Partitioner`.
#[derive(Debug, Clone)]
pub struct PartitionConfig {
/// When `true`, the table-detection pass runs on fragments not claimed earlier.
pub detect_tables: bool,
/// When `true` (and the page height is positive), fragments in the header/footer
/// zones are classified as headers or footers.
pub detect_headers_footers: bool,
/// Minimum ratio of a fragment's font size to the page's body font size for the
/// fragment to be treated as a title by the font-size heuristic.
pub title_min_font_ratio: f64,
/// Fraction of the page height that forms the header zone; a fragment is in the
/// zone when its `y` is at least `page_height * (1 - header_zone)`.
pub header_zone: f64,
/// Fraction of the page height that forms the footer zone; a fragment is in the
/// zone when `y + height` is at most `page_height * footer_zone`.
pub footer_zone: f64,
/// How fragments are ordered before classification.
pub reading_order: ReadingOrderStrategy,
/// Detected tables whose confidence is below this threshold are discarded.
pub min_table_confidence: f64,
}
impl Default for PartitionConfig {
fn default() -> Self {
Self {
detect_tables: true,
detect_headers_footers: true,
title_min_font_ratio: 1.3,
header_zone: 0.05,
footer_zone: 0.05,
reading_order: ReadingOrderStrategy::Simple,
min_table_confidence: 0.5,
}
}
}
impl PartitionConfig {
    /// Returns the default configuration; equivalent to `PartitionConfig::default()`.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns the configuration with `title_min_font_ratio` replaced by `ratio`.
    pub fn with_title_min_font_ratio(self, ratio: f64) -> Self {
        Self {
            title_min_font_ratio: ratio,
            ..self
        }
    }

    /// Returns the configuration with table detection switched off.
    pub fn without_tables(self) -> Self {
        Self {
            detect_tables: false,
            ..self
        }
    }

    /// Returns the configuration with header/footer detection switched off.
    pub fn without_headers_footers(self) -> Self {
        Self {
            detect_headers_footers: false,
            ..self
        }
    }

    /// Returns the configuration with the reading-order strategy replaced by `strategy`.
    pub fn with_reading_order(self, strategy: ReadingOrderStrategy) -> Self {
        Self {
            reading_order: strategy,
            ..self
        }
    }

    /// Returns the configuration with `min_table_confidence` replaced by `threshold`.
    pub fn with_min_table_confidence(self, threshold: f64) -> Self {
        Self {
            min_table_confidence: threshold,
            ..self
        }
    }
}
/// Classifies the text fragments of a page into typed elements according to a
/// `PartitionConfig`.
pub struct Partitioner {
// Settings consulted by every partitioning pass.
config: PartitionConfig,
}
impl Partitioner {
pub fn new(config: PartitionConfig) -> Self {
Self { config }
}
pub fn partition_fragments(
&self,
fragments: &[TextFragment],
page: u32,
page_height: f64,
) -> Vec<Element> {
if fragments.is_empty() {
return Vec::new();
}
let fragments: std::borrow::Cow<[TextFragment]> = match &self.config.reading_order {
ReadingOrderStrategy::Simple => {
let mut ordered = fragments.to_vec();
SimpleReadingOrder::default().order(&mut ordered);
std::borrow::Cow::Owned(ordered)
}
ReadingOrderStrategy::XYCut { min_gap } => {
let mut ordered = fragments.to_vec();
XYCutReadingOrder::new(*min_gap).order(&mut ordered);
std::borrow::Cow::Owned(ordered)
}
ReadingOrderStrategy::None => std::borrow::Cow::Borrowed(fragments),
};
let fragments = fragments.as_ref();
let mut claimed = vec![false; fragments.len()];
let mut elements = Vec::new();
for (i, f) in fragments.iter().enumerate() {
if claimed[i] {
continue;
}
let Some(tag) = f.struct_tag.as_deref() else {
continue;
};
match classify_by_struct_tag(tag) {
Some(StructTagClass::Heading) => {
let trimmed = f.text.trim();
if trimmed.is_empty() {
continue;
}
let mut meta = meta_from_fragment(f, page);
meta.confidence = 1.0;
elements.push(Element::Title(ElementData {
text: trimmed.to_string(),
metadata: meta,
}));
claimed[i] = true;
}
Some(StructTagClass::ListItem) => {
let trimmed = f.text.trim();
if trimmed.is_empty() {
continue;
}
let mut meta = meta_from_fragment(f, page);
meta.confidence = 1.0;
elements.push(Element::ListItem(ElementData {
text: trimmed.to_string(),
metadata: meta,
}));
claimed[i] = true;
}
Some(StructTagClass::List) | Some(StructTagClass::Artifact) | None => {
}
}
}
if self.config.detect_headers_footers && page_height > 0.0 {
let header_threshold = page_height * (1.0 - self.config.header_zone);
let footer_threshold = page_height * self.config.footer_zone;
for (i, f) in fragments.iter().enumerate() {
if claimed[i] {
continue;
}
let text_too_long = f.text.chars().count() > MAX_HEADER_TEXT_LEN;
let is_body_tagged = struct_tag_is_body(&f.struct_tag);
if f.y >= header_threshold && !text_too_long && !is_body_tagged {
let zone_size = page_height * self.config.header_zone;
let distance = f.y - header_threshold;
let header_confidence = compute_zone_confidence(distance, zone_size);
let mut meta = meta_from_fragment(f, page);
meta.confidence = header_confidence;
elements.push(Element::Header(ElementData {
text: f.text.clone(),
metadata: meta,
}));
claimed[i] = true;
} else if f.y + f.height <= footer_threshold && !text_too_long && !is_body_tagged {
let zone_size = page_height * self.config.footer_zone;
let distance = footer_threshold - (f.y + f.height);
let footer_confidence = compute_zone_confidence(distance, zone_size);
let mut meta = meta_from_fragment(f, page);
meta.confidence = footer_confidence;
elements.push(Element::Footer(ElementData {
text: f.text.clone(),
metadata: meta,
}));
claimed[i] = true;
}
}
}
if self.config.detect_tables {
let unclaimed_frags: Vec<&TextFragment> = fragments
.iter()
.enumerate()
.filter(|(i, _)| !claimed[*i])
.map(|(_, f)| f)
.collect();
let detector = crate::text::structured::StructuredDataDetector::new(Default::default());
let regions = segment_into_table_regions(&unclaimed_frags, 2.0);
for region in ®ions {
if region_looks_like_list(region) {
continue;
}
let region_owned: Vec<TextFragment> = region.iter().map(|f| (*f).clone()).collect();
if let Ok(result) = detector.detect(®ion_owned) {
for table in &result.tables {
if table.confidence < self.config.min_table_confidence {
continue;
}
let rows: Vec<Vec<String>> = table
.rows
.iter()
.map(|row| row.cells.iter().map(|c| c.text.clone()).collect())
.collect();
let bbox = ElementBBox::new(
table.bounding_box.x,
table.bounding_box.y,
table.bounding_box.width,
table.bounding_box.height,
);
elements.push(Element::Table(TableElementData {
rows,
metadata: ElementMetadata {
page,
bbox,
confidence: table.confidence,
..Default::default()
},
}));
for (i, f) in fragments.iter().enumerate() {
if !claimed[i]
&& f.x >= table.bounding_box.x - 1.0
&& f.x <= table.bounding_box.right() + 1.0
&& f.y >= table.bounding_box.y - 1.0
&& f.y <= table.bounding_box.top() + 1.0
{
claimed[i] = true;
}
}
}
}
}
}
let body_font_size = {
let sizes: Vec<f64> = fragments
.iter()
.enumerate()
.filter(|(i, _)| !claimed[*i])
.map(|(_, f)| f.font_size)
.filter(|s| *s > 0.0)
.collect();
if sizes.is_empty() {
12.0
} else {
let mut freq = std::collections::HashMap::new();
for s in &sizes {
let key = (*s * 2.0).round() as i64;
*freq.entry(key).or_insert(0usize) += 1;
}
let mode_key = freq
.into_iter()
.max_by(|(key_a, count_a), (key_b, count_b)| {
count_a.cmp(count_b).then(key_b.cmp(key_a))
})
.map(|(key, _)| key)
.unwrap_or(24);
mode_key as f64 / 2.0
}
};
let title_threshold = body_font_size * self.config.title_min_font_ratio;
for (i, f) in fragments.iter().enumerate() {
if claimed[i] {
continue;
}
let meta = meta_from_fragment(f, page);
let text = f.text.trim();
if text.is_empty() {
continue;
}
if let Some(colon_pos) = text.find(':') {
let key = text[..colon_pos].trim();
let value = text[colon_pos + 1..].trim();
let key_word_count = key.split_whitespace().count();
if !key.is_empty()
&& !value.is_empty()
&& key.len() < 40
&& key_word_count <= 4
&& !key.contains('.')
&& !is_prose_prefix(key)
{
let kv_confidence = compute_kv_confidence(key);
let mut meta = meta;
meta.confidence = kv_confidence;
elements.push(Element::KeyValue(KeyValueElementData {
key: key.to_string(),
value: value.to_string(),
metadata: meta,
}));
continue;
}
}
let p_or_span = matches!(f.struct_tag.as_deref(), Some("P") | Some("Span"));
let mut is_title = false;
let mut title_confidence = 0.0_f64;
if f.font_size >= title_threshold && f.font_size > body_font_size {
let ratio = f.font_size / body_font_size;
is_title = true;
title_confidence = title_confidence.max(compute_title_confidence(
ratio,
self.config.title_min_font_ratio,
));
}
if !p_or_span && bold_short_title(f) {
is_title = true;
title_confidence = title_confidence.max(0.7);
}
if numeric_prefix_title(f) {
is_title = true;
title_confidence = title_confidence.max(0.8);
}
if is_title {
let mut meta = meta;
meta.confidence = title_confidence.clamp(0.5, 1.0);
elements.push(Element::Title(ElementData {
text: text.to_string(),
metadata: meta,
}));
continue;
}
if is_list_item(text) {
elements.push(Element::ListItem(ElementData {
text: text.to_string(),
metadata: meta,
}));
continue;
}
elements.push(Element::Paragraph(ElementData {
text: text.to_string(),
metadata: meta,
}));
}
match &self.config.reading_order {
ReadingOrderStrategy::None => {}
_ => {
elements.sort_by_key(|e| e.page());
}
}
let mut current_heading: Option<String> = None;
for element in &mut elements {
if matches!(element, Element::Title(_)) {
current_heading = Some(element.text().to_string());
}
element.set_parent_heading(current_heading.clone());
}
elements
}
}
/// Reports whether `key` reads like the start of a prose sentence rather than a
/// field label.
///
/// The key is lower-cased and split on whitespace. It is prose when its first word
/// is a prose indicator, or when it has more than two words and any word after the
/// first is one. An empty key is never prose.
fn is_prose_prefix(key: &str) -> bool {
    const PROSE_INDICATORS: &[&str] = &[
        "as",
        "the",
        "this",
        "that",
        "these",
        "those",
        "it",
        "is",
        "was",
        "were",
        "has",
        "have",
        "had",
        "will",
        "would",
        "should",
        "could",
        "may",
        "might",
        "shall",
        "can",
        "do",
        "does",
        "did",
        "being",
        "been",
        "are",
        "for",
        "with",
        "from",
        "into",
        "about",
        "after",
        "before",
        "during",
        "between",
        "through",
        "however",
        "therefore",
        "furthermore",
        "moreover",
        "although",
        "because",
        "since",
        "while",
        "when",
        "where",
        "which",
        "who",
        "whom",
        "whose",
        "according",
    ];

    let lowered = key.to_lowercase();
    let tokens: Vec<&str> = lowered.split_whitespace().collect();

    let Some((head, tail)) = tokens.split_first() else {
        return false;
    };
    if PROSE_INDICATORS.contains(head) {
        return true;
    }
    // Two-word keys such as "Invoice for" are only judged by their first word.
    tokens.len() > 2 && tail.iter().any(|token| PROSE_INDICATORS.contains(token))
}
/// Reports whether `text` (after leading whitespace) starts with a list marker.
///
/// Recognised markers, each followed by a space:
/// * a bullet: `-`, `•`, `*`, `–` or `—`;
/// * one ASCII digit or lowercase ASCII letter followed by `.` or `)`;
/// * two ASCII digits followed by `.` or `)`.
fn is_list_item(text: &str) -> bool {
    const BULLET_MARKERS: [&str; 5] = ["- ", "• ", "* ", "– ", "— "];

    let candidate = text.trim_start();
    if BULLET_MARKERS
        .iter()
        .any(|marker| candidate.starts_with(*marker))
    {
        return true;
    }

    match candidate.as_bytes() {
        // "1. ", "a) ", ...
        [lead, b'.' | b')', b' ', ..] if lead.is_ascii_digit() || lead.is_ascii_lowercase() => {
            true
        }
        // "10. ", "42) ", ...
        [tens, ones, b'.' | b')', b' ', ..] if tens.is_ascii_digit() && ones.is_ascii_digit() => {
            true
        }
        _ => false,
    }
}
/// Splits `fragments` into vertically contiguous regions that are candidates for
/// table detection.
///
/// Fragments are visited in descending `y` order. A new region starts whenever the
/// gap between the previous fragment's `y` and the current fragment's `y + height`
/// exceeds `gap_multiplier` times the median positive fragment height (12.0 when no
/// fragment has a positive height). Regions with fewer than four fragments are
/// discarded.
fn segment_into_table_regions<'a>(
    fragments: &[&'a TextFragment],
    gap_multiplier: f64,
) -> Vec<Vec<&'a TextFragment>> {
    const MIN_REGION_FRAGMENTS: usize = 4;
    const FALLBACK_HEIGHT: f64 = 12.0;

    if fragments.is_empty() {
        return Vec::new();
    }

    let mut top_down: Vec<&'a TextFragment> = fragments.to_vec();
    top_down.sort_by(|upper, lower| lower.y.total_cmp(&upper.y));

    // Median of the positive heights sets the scale for what counts as a large gap.
    let mut heights: Vec<f64> = top_down
        .iter()
        .map(|frag| frag.height)
        .filter(|&height| height > 0.0)
        .collect();
    heights.sort_by(f64::total_cmp);
    let median_height = match heights.len() {
        0 => FALLBACK_HEIGHT,
        n if n % 2 == 0 => (heights[n / 2 - 1] + heights[n / 2]) / 2.0,
        n => heights[n / 2],
    };
    let gap_threshold = median_height * gap_multiplier;

    let mut regions: Vec<Vec<&'a TextFragment>> = Vec::new();
    let mut open_region: Vec<&'a TextFragment> = Vec::new();
    for frag in top_down {
        let breaks_region = open_region
            .last()
            .is_some_and(|prev| prev.y - (frag.y + frag.height) > gap_threshold);
        if breaks_region {
            // Close the open region; keep it only if it is large enough.
            let finished = std::mem::take(&mut open_region);
            if finished.len() >= MIN_REGION_FRAGMENTS {
                regions.push(finished);
            }
        }
        open_region.push(frag);
    }
    if open_region.len() >= MIN_REGION_FRAGMENTS {
        regions.push(open_region);
    }
    regions
}
fn region_looks_like_list(fragments: &[&TextFragment]) -> bool {
if fragments.is_empty() {
return false;
}
let tolerance = 15.0;
let mut x_clusters: Vec<f64> = Vec::new();
for frag in fragments {
let x = frag.x;
let found = x_clusters.iter().any(|&cx| (cx - x).abs() <= tolerance);
if !found {
x_clusters.push(x);
}
}
if x_clusters.len() != 2 {
return false;
}
x_clusters.sort_by(f64::total_cmp);
let left_x = x_clusters[0];
let left_frags: Vec<&TextFragment> = fragments
.iter()
.filter(|f| (f.x - left_x).abs() <= tolerance)
.copied()
.collect();
if left_frags.is_empty() {
return false;
}
let avg_left_len = left_frags
.iter()
.map(|f| f.text.trim().chars().count())
.sum::<usize>() as f64
/ left_frags.len() as f64;
avg_left_len <= 3.0
}
/// Builds element metadata for `page` from a fragment's geometry and font
/// attributes. Confidence starts at `1.0` and no parent heading is set; callers
/// adjust both afterwards where needed.
fn meta_from_fragment(f: &TextFragment, page: u32) -> ElementMetadata {
    let bbox = ElementBBox::new(f.x, f.y, f.width, f.height);
    let font_name = f.font_name.clone();
    let font_size = Some(f.font_size);

    ElementMetadata {
        page,
        bbox,
        confidence: 1.0,
        font_name,
        font_size,
        is_bold: f.is_bold,
        is_italic: f.is_italic,
        parent_heading: None,
    }
}
/// Maps a font-size ratio to a title confidence in `[0.5, 1.0]`.
///
/// The confidence is `0.5` when `actual_ratio` equals `min_ratio` and rises
/// linearly to `1.0` at twice `min_ratio`. A non-positive `min_ratio` yields `1.0`.
fn compute_title_confidence(actual_ratio: f64, min_ratio: f64) -> f64 {
    let threshold_is_degenerate = min_ratio <= 0.0;
    if threshold_is_degenerate {
        1.0
    } else {
        let bonus = 0.5 * (actual_ratio - min_ratio) / min_ratio;
        (0.5 + bonus).clamp(0.5, 1.0)
    }
}
/// Maps how far a fragment sits inside a header/footer zone to a confidence in
/// `[0.5, 1.0]`: the ratio `distance / zone_size`, clamped. A non-positive
/// `zone_size` yields `0.5`.
fn compute_zone_confidence(distance: f64, zone_size: f64) -> f64 {
    const FLOOR: f64 = 0.5;
    const CEILING: f64 = 1.0;

    if zone_size <= 0.0 {
        FLOOR
    } else {
        let depth_fraction = distance / zone_size;
        depth_fraction.clamp(FLOOR, CEILING)
    }
}
/// Scores how label-like a key/value `key` is, in `[0.5, 1.0]`.
///
/// Starts at `1.0` and subtracts:
/// * a length penalty of `chars / 40`, and
/// * `0.1` for every word beyond the second.
///
/// The length is measured in characters rather than bytes, so keys in
/// non-ASCII scripts are not penalised for their UTF-8 encoding width.
fn compute_kv_confidence(key: &str) -> f64 {
    const MAX_KEY_CHARS: f64 = 40.0;
    const EXTRA_WORD_PENALTY: f64 = 0.1;
    const FREE_WORDS: usize = 2;

    let len_penalty = key.chars().count() as f64 / MAX_KEY_CHARS;
    let extra_words = key.split_whitespace().count().saturating_sub(FREE_WORDS);
    let word_penalty = EXTRA_WORD_PENALTY * extra_words as f64;

    (1.0 - len_penalty - word_penalty).clamp(0.5, 1.0)
}
/// Maximum character count for a fragment to be classified as a header or footer.
const MAX_HEADER_TEXT_LEN: usize = 100;
/// Maximum trimmed character count for the bold-short title heuristic.
const MAX_BOLD_TITLE_LEN: usize = 120;
/// Maximum trimmed character count for the numeric-prefix title heuristic.
const MAX_NUMERIC_TITLE_LEN: usize = 120;
/// Maximum whitespace-separated word count for the numeric-prefix title heuristic.
const MAX_NUMERIC_TITLE_WORDS: usize = 14;
/// Reports whether the last character of `s` is `.`, `!` or `?`.
/// An empty string has no terminator.
fn ends_with_sentence_terminator(s: &str) -> bool {
    s.ends_with(|last: char| matches!(last, '.' | '!' | '?'))
}
/// Bold-short title heuristic: the fragment is bold, its trimmed text has between
/// 1 and `MAX_BOLD_TITLE_LEN` characters, and it does not end like a sentence.
fn bold_short_title(f: &TextFragment) -> bool {
    let candidate = f.text.trim();
    let length = candidate.chars().count();
    let length_ok = (1..=MAX_BOLD_TITLE_LEN).contains(&length);

    f.is_bold && length_ok && !ends_with_sentence_terminator(candidate)
}
/// Returns the lazily compiled regex that matches a section-number prefix at the
/// start of a string, including its trailing whitespace (e.g. `A2.a `, `1.1 `,
/// `Section 4: `, `Chapter 7 `, `IV. `).
///
/// # Panics
/// Panics on first use if the hard-coded pattern fails to compile.
fn section_prefix_regex() -> &'static regex::Regex {
    const SECTION_PREFIX_PATTERN: &str =
        r"^([A-Z]\d+(\.\d+)*(\.[a-z]\.?)?|\d+(\.\d+)*\.?|Section\s+\d+:?|Chapter\s+\d+:?|[IVX]+\.)\s+";
    // Compiled once per process and shared by all callers.
    static COMPILED: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();

    COMPILED.get_or_init(|| {
        regex::Regex::new(SECTION_PREFIX_PATTERN).expect("section_prefix_regex must compile")
    })
}
/// Reports whether `s` begins with a section-number prefix.
fn matches_section_prefix(s: &str) -> bool {
    let prefix_pattern = section_prefix_regex();
    prefix_pattern.is_match(s)
}
/// Returns `s` without its leading section-number prefix (and the whitespace that
/// follows it), or `s` unchanged when there is no such prefix.
fn strip_section_prefix(s: &str) -> &str {
    section_prefix_regex()
        .find(s)
        .map_or(s, |prefix| &s[prefix.end()..])
}
/// Numeric-prefix title heuristic.
///
/// The trimmed text qualifies when all of the following hold:
/// * it has between 1 and `MAX_NUMERIC_TITLE_LEN` characters;
/// * it starts with a section-number prefix;
/// * it is not a flat list marker such as `1. ` or `10) `;
/// * the text after the prefix starts with an uppercase character;
/// * it contains no comma and has at most `MAX_NUMERIC_TITLE_WORDS` words.
fn numeric_prefix_title(f: &TextFragment) -> bool {
    let candidate = f.text.trim();
    let length = candidate.chars().count();

    let length_ok = (1..=MAX_NUMERIC_TITLE_LEN).contains(&length);
    if !length_ok || !matches_section_prefix(candidate) || is_list_item(candidate) {
        return false;
    }

    let starts_uppercase = strip_section_prefix(candidate)
        .trim_start()
        .chars()
        .next()
        .is_some_and(char::is_uppercase);
    if !starts_uppercase {
        return false;
    }

    // Commas and long word counts indicate a sentence rather than a heading.
    let word_count = candidate.split_whitespace().count();
    !candidate.contains(',') && word_count <= MAX_NUMERIC_TITLE_WORDS
}
/// Coarse classification of a fragment's structure tag, as produced by
/// `classify_by_struct_tag`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StructTagClass {
/// Heading tags: `H`, `H1`–`H6`, `Title`.
Heading,
/// The list container tag `L`.
List,
/// List item tags: `LI`, `Lbl`, `LBody`.
ListItem,
/// The `Artifact` tag.
Artifact,
}
/// Maps a structure tag name to its `StructTagClass`.
///
/// Returns `None` for every tag that carries no classification of its own
/// (for example `P`, `Span`, `Figure`, `Table`).
fn classify_by_struct_tag(tag: &str) -> Option<StructTagClass> {
    let class = match tag {
        "Artifact" => StructTagClass::Artifact,
        "L" => StructTagClass::List,
        "LI" | "Lbl" | "LBody" => StructTagClass::ListItem,
        "H" | "H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "Title" => StructTagClass::Heading,
        _ => return None,
    };
    Some(class)
}
/// Reports whether a fragment's structure tag marks it as body content
/// (paragraph, span, heading or list tags). Untagged fragments and any other
/// tag, including `Artifact`, are not body content.
fn struct_tag_is_body(tag: &Option<String>) -> bool {
    const BODY_TAGS: [&str; 14] = [
        "P", "Span", "H", "H1", "H2", "H3", "H4", "H5", "H6", "Title", "L", "LI", "Lbl", "LBody",
    ];

    match tag {
        Some(name) => BODY_TAGS.contains(&name.as_str()),
        None => false,
    }
}
// Unit tests for the struct-tag helpers, the title heuristics, and the
// end-to-end behaviour of `Partitioner::partition_fragments`.
#[cfg(test)]
mod tests {
use super::*;
// --- classify_by_struct_tag ---
#[test]
fn classify_by_struct_tag_recognizes_heading_tags() {
for tag in &["H", "H1", "H2", "H3", "H4", "H5", "H6", "Title"] {
assert_eq!(
classify_by_struct_tag(tag),
Some(StructTagClass::Heading),
"tag {tag} should classify as Heading"
);
}
}
#[test]
fn classify_by_struct_tag_recognizes_list_tags() {
assert_eq!(classify_by_struct_tag("L"), Some(StructTagClass::List));
assert_eq!(classify_by_struct_tag("LI"), Some(StructTagClass::ListItem));
assert_eq!(
classify_by_struct_tag("Lbl"),
Some(StructTagClass::ListItem)
);
assert_eq!(
classify_by_struct_tag("LBody"),
Some(StructTagClass::ListItem)
);
}
#[test]
fn classify_by_struct_tag_recognizes_artifact() {
assert_eq!(
classify_by_struct_tag("Artifact"),
Some(StructTagClass::Artifact)
);
}
#[test]
fn classify_by_struct_tag_returns_none_for_passthrough_tags() {
for tag in &[
"P", "Span", "Figure", "Table", "Caption", "Form", "Note", "Random",
] {
assert_eq!(
classify_by_struct_tag(tag),
None,
"tag {tag} should be None (fall through)"
);
}
}
// --- struct_tag_is_body ---
#[test]
fn struct_tag_is_body_recognizes_body_tags() {
for tag in &[
"P", "Span", "L", "LI", "H1", "H2", "H6", "Title", "Lbl", "LBody",
] {
assert!(
struct_tag_is_body(&Some(tag.to_string())),
"tag {tag} should be body"
);
}
}
#[test]
fn struct_tag_is_body_returns_false_for_artifact() {
assert!(!struct_tag_is_body(&Some("Artifact".to_string())));
}
#[test]
fn struct_tag_is_body_returns_false_for_none() {
assert!(!struct_tag_is_body(&None));
}
// Fixture: an untagged fragment at the origin (100 wide, 12 high) with the given
// text, boldness and font size.
fn frag(text: &str, bold: bool, font_size: f64) -> TextFragment {
TextFragment {
text: text.to_string(),
x: 0.0,
y: 0.0,
width: 100.0,
height: 12.0,
font_size,
font_name: None,
is_bold: bold,
is_italic: false,
color: None,
space_decisions: Vec::new(),
mcid: None,
struct_tag: None,
}
}
// --- ends_with_sentence_terminator ---
#[test]
fn ends_with_sentence_terminator_table() {
assert!(ends_with_sentence_terminator("This is a paragraph."));
assert!(ends_with_sentence_terminator("Really?"));
assert!(ends_with_sentence_terminator("Stop!"));
assert!(!ends_with_sentence_terminator("Section heading"));
assert!(!ends_with_sentence_terminator("A2.a Risk Management"));
assert!(!ends_with_sentence_terminator(""));
}
// --- bold_short_title ---
#[test]
fn bold_short_title_accepts_bold_short_no_terminator() {
assert!(bold_short_title(&frag("Section Heading", true, 12.0)));
assert!(bold_short_title(&frag("Principle A2", true, 11.0)));
}
#[test]
fn bold_short_title_rejects_non_bold() {
assert!(!bold_short_title(&frag("Section Heading", false, 12.0)));
}
#[test]
fn bold_short_title_rejects_long_text() {
let long = "x".repeat(150);
assert!(!bold_short_title(&frag(&long, true, 12.0)));
}
#[test]
fn bold_short_title_rejects_sentence_with_period() {
assert!(!bold_short_title(&frag(
"This is a complete sentence.",
true,
12.0
)));
}
#[test]
fn bold_short_title_rejects_empty() {
assert!(!bold_short_title(&frag(" ", true, 12.0)));
}
// --- numeric_prefix_title ---
#[test]
fn numeric_prefix_title_accepts_known_patterns() {
let cases = &[
"A2.a Risk Management Process",
"A1.b Roles and Responsibilities",
"1.1 Overview",
"3.2.1 Detailed Requirements",
"Section 4: Implementation",
"Chapter 7 Conclusion",
"IV. Findings",
];
for c in cases {
assert!(
numeric_prefix_title(&frag(c, false, 12.0)),
"should match: {c}"
);
}
}
// A numeric prefix followed by lowercase prose must not be treated as a title.
#[test]
fn numeric_prefix_title_rejects_money_amount() {
assert!(!numeric_prefix_title(&frag(
"1.2 million users were affected",
false,
12.0
)));
}
// The number is not at the start of the text, so no section prefix matches.
#[test]
fn numeric_prefix_title_rejects_version_string() {
assert!(!numeric_prefix_title(&frag(
"version 3.0.1 release notes",
false,
12.0
)));
}
#[test]
fn numeric_prefix_title_rejects_lowercase_continuation() {
assert!(!numeric_prefix_title(&frag(
"1. take action now",
false,
12.0
)));
}
// Flat list markers are rejected while multi-level and lettered prefixes still pass.
#[test]
fn numeric_prefix_title_rejects_flat_numbered_list_marker() {
for c in &["1. First item", "2. Second item", "10) Tenth item"] {
assert!(
!numeric_prefix_title(&frag(c, false, 12.0)),
"flat numbered list marker must not be a Title: {c}"
);
}
for c in &["1.1 Overview", "A2.a Risk Management Process"] {
assert!(
numeric_prefix_title(&frag(c, false, 12.0)),
"multi-level/lettered prefix must remain a Title: {c}"
);
}
}
#[test]
fn numeric_prefix_title_rejects_text_without_prefix() {
assert!(!numeric_prefix_title(&frag(
"Overview of the system",
false,
12.0
)));
}
#[test]
fn numeric_prefix_title_rejects_too_long() {
let mut s = String::from("A2.a ");
s.push_str(&"X".repeat(220));
assert!(!numeric_prefix_title(&frag(&s, false, 12.0)));
}
#[test]
fn numeric_prefix_title_rejects_text_with_comma() {
assert!(!numeric_prefix_title(&frag(
"1. La vigilancia continua permite detectar amenazas, vulnerabilidades y errores.",
false,
12.0,
)));
}
#[test]
fn numeric_prefix_title_rejects_long_sentence_without_comma() {
assert!(!numeric_prefix_title(&frag(
"1. La vigilancia continua permitira la deteccion de actividades o comportamientos anomalos y su oportuna respuesta.",
false,
12.0,
)));
}
#[test]
fn numeric_prefix_title_rejects_just_over_max_len() {
let mut s = String::from("A2.a Risk Management ");
s.push_str(&"X".repeat(120));
assert!(!numeric_prefix_title(&frag(&s, false, 12.0)));
}
// Fixture: an untagged, non-bold fragment at (`x`, `y`), 100 wide, whose height
// equals its font size.
fn frag_at(text: &str, x: f64, y: f64, font_size: f64) -> TextFragment {
TextFragment {
text: text.to_string(),
x,
y,
width: 100.0,
height: font_size,
font_size,
font_name: None,
is_bold: false,
is_italic: false,
color: None,
space_decisions: Vec::new(),
mcid: None,
struct_tag: None,
}
}
// --- Partitioner::partition_fragments: struct tags and title heuristics ---
#[test]
fn struct_tag_h1_yields_title_no_font_ratio_needed() {
let mut f = frag_at("Section One", 50.0, 400.0, 12.0);
f.struct_tag = Some("H1".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
1
);
}
#[test]
fn struct_tag_p_does_not_block_numeric_prefix_title() {
let mut f = frag_at("A2.a Risk Management Process", 50.0, 400.0, 12.0);
f.struct_tag = Some("P".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
1,
"numeric-prefix heuristic must fire even when struct_tag=P (NCSC sub-sections)"
);
}
#[test]
fn struct_tag_p_overrides_bold_short_heuristic() {
let mut f = frag_at("Bold Short Text", 50.0, 400.0, 12.0);
f.is_bold = true;
f.struct_tag = Some("P".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert!(elements.iter().any(|e| matches!(e, Element::Paragraph(_))));
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
0
);
}
#[test]
fn bold_short_title_fires_without_struct_tag() {
let mut f = frag_at("Risk Management", 50.0, 400.0, 12.0);
f.is_bold = true;
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
1,
"bold-short heuristic must fire when no other signals present"
);
}
#[test]
fn numeric_prefix_title_fires_without_bold() {
let f = frag_at("A2.a Risk Management Process", 50.0, 400.0, 12.0);
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
1,
"numeric-prefix heuristic must fire on NCSC-style sections"
);
}
#[test]
fn struct_tag_li_yields_list_item() {
let mut f = frag_at("Bullet content", 50.0, 400.0, 12.0);
f.struct_tag = Some("LI".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::ListItem(_)))
.count(),
1
);
}
// Five 12pt body lines establish the body font size; the 20pt line exceeds the
// default 1.3 ratio.
#[test]
fn font_ratio_title_still_works() {
let mut frags = vec![];
for i in 0..5 {
frags.push(frag_at(
&format!("body line {i}"),
50.0,
400.0 - (i as f64) * 15.0,
12.0,
));
}
frags.push(frag_at("Big Heading", 50.0, 500.0, 20.0));
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count()
>= 1,
"font-ratio Title path must still fire"
);
}
// --- Partitioner::partition_fragments: header and footer zones ---
// With an 800-high page and the default 5% zones, the header zone starts at
// y = 760 and the footer zone ends at y = 40.
#[test]
fn header_zone_rejects_long_text() {
let long = "X".repeat(200);
let frags = vec![frag_at(&long, 50.0, 780.0, 12.0)];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
let header_count = elements
.iter()
.filter(|e| matches!(e, Element::Header(_)))
.count();
assert_eq!(
header_count, 0,
"long text in header zone must not classify as Header"
);
}
#[test]
fn header_zone_accepts_short_text() {
let frags = vec![frag_at("My Report 2026", 50.0, 780.0, 12.0)];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert!(
elements.iter().any(|e| matches!(e, Element::Header(_))),
"short text in header zone must classify as Header"
);
}
#[test]
fn header_zone_rejects_p_struct_tag() {
let mut f = frag_at("Short body text", 50.0, 780.0, 12.0);
f.struct_tag = Some("P".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
let header_count = elements
.iter()
.filter(|e| matches!(e, Element::Header(_)))
.count();
assert_eq!(header_count, 0);
}
#[test]
fn footer_zone_rejects_long_text() {
let long = "X".repeat(200);
let frags = vec![frag_at(&long, 50.0, 10.0, 12.0)];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
let footer_count = elements
.iter()
.filter(|e| matches!(e, Element::Footer(_)))
.count();
assert_eq!(footer_count, 0);
}
}