use crate::graphics::extraction::ExtractedGraphics;
use crate::pipeline::reading_order::{ReadingOrder, SimpleReadingOrder, XYCutReadingOrder};
use crate::pipeline::{
Element, ElementBBox, ElementData, ElementMetadata, KeyValueElementData, TableElementData,
};
use crate::text::extraction::TextFragment;
/// Strategy used to order a page's text fragments before they are classified.
#[derive(Debug, Clone, Default)]
pub enum ReadingOrderStrategy {
/// Order fragments with `SimpleReadingOrder::default()`; this is the `Default` variant.
#[default]
Simple,
/// Order fragments with `XYCutReadingOrder::new(min_gap)`.
XYCut { min_gap: f64 },
/// Keep fragments in the order supplied by the caller; the partitioner also
/// skips its final sort of the produced elements by page.
None,
}
/// Tunable settings read by `Partitioner` while classifying fragments.
#[derive(Debug, Clone)]
pub struct PartitionConfig {
/// When `false`, the whole table-detection stage is skipped.
pub detect_tables: bool,
/// When `false`, no fragment is classified as a header or footer.
pub detect_headers_footers: bool,
/// A fragment whose font size is at least `body font size * ratio` (and larger
/// than the body font size) is treated as a title.
pub title_min_font_ratio: f64,
/// Fraction of the page height forming the header zone (highest `y` values).
pub header_zone: f64,
/// Fraction of the page height forming the footer zone (lowest `y` values).
pub footer_zone: f64,
/// How fragments are ordered before classification.
pub reading_order: ReadingOrderStrategy,
/// Detected tables with a confidence below this value are discarded.
pub min_table_confidence: f64,
/// When `true` and graphics are supplied, ruling-line table detection runs
/// before the text-only table heuristic.
pub prefer_ruling_tables: bool,
}
impl Default for PartitionConfig {
    /// Baseline configuration: every detector enabled, 5% header and footer
    /// zones, simple reading order, titles at 1.3x the body font size and a
    /// 0.5 minimum table confidence.
    fn default() -> Self {
        Self {
            // Ordering of fragments.
            reading_order: ReadingOrderStrategy::Simple,
            // Header / footer detection.
            detect_headers_footers: true,
            header_zone: 0.05,
            footer_zone: 0.05,
            // Table detection.
            detect_tables: true,
            prefer_ruling_tables: true,
            min_table_confidence: 0.5,
            // Title detection.
            title_min_font_ratio: 1.3,
        }
    }
}
impl PartitionConfig {
    /// Creates a configuration holding the default values.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the configuration with `title_min_font_ratio` replaced by `ratio`.
    pub fn with_title_min_font_ratio(self, ratio: f64) -> Self {
        Self {
            title_min_font_ratio: ratio,
            ..self
        }
    }

    /// Returns the configuration with table detection switched off.
    pub fn without_tables(self) -> Self {
        Self {
            detect_tables: false,
            ..self
        }
    }

    /// Returns the configuration with header/footer detection switched off.
    pub fn without_headers_footers(self) -> Self {
        Self {
            detect_headers_footers: false,
            ..self
        }
    }

    /// Returns the configuration with `reading_order` replaced by `strategy`.
    pub fn with_reading_order(self, strategy: ReadingOrderStrategy) -> Self {
        Self {
            reading_order: strategy,
            ..self
        }
    }

    /// Returns the configuration with `min_table_confidence` replaced by `threshold`.
    pub fn with_min_table_confidence(self, threshold: f64) -> Self {
        Self {
            min_table_confidence: threshold,
            ..self
        }
    }
}
/// Classifies the text fragments of a page into `Element`s according to a
/// `PartitionConfig`.
pub struct Partitioner {
/// Settings consulted by every partitioning call.
config: PartitionConfig,
}
impl Partitioner {
/// Builds a partitioner that uses `config` for all subsequent calls.
pub fn new(config: PartitionConfig) -> Self {
    let partitioner = Self { config };
    partitioner
}
pub fn partition_fragments(
&self,
fragments: &[TextFragment],
page: u32,
page_height: f64,
) -> Vec<Element> {
self.partition_fragments_with_graphics(fragments, None, page, page_height)
}
/// Partitions `fragments` of page `page` into elements, optionally using the
/// page's extracted vector `graphics` for ruling-based table detection.
///
/// No separate raw fragments are supplied, so the same `fragments` feed the
/// ruling-table detector.
pub fn partition_fragments_with_graphics(
    &self,
    fragments: &[TextFragment],
    graphics: Option<&ExtractedGraphics>,
    page: u32,
    page_height: f64,
) -> Vec<Element> {
    let no_raw_fragments: Option<&[TextFragment]> = None;
    self.partition_fragments_with_graphics_raw(
        fragments,
        no_raw_fragments,
        graphics,
        page,
        page_height,
    )
}
pub(crate) fn partition_fragments_with_graphics_raw(
&self,
fragments: &[TextFragment],
raw_fragments: Option<&[TextFragment]>,
graphics: Option<&ExtractedGraphics>,
page: u32,
page_height: f64,
) -> Vec<Element> {
if fragments.is_empty() {
return Vec::new();
}
let ruling_fragments = raw_fragments.unwrap_or(fragments);
let fragments: std::borrow::Cow<[TextFragment]> = match &self.config.reading_order {
ReadingOrderStrategy::Simple => {
let mut ordered = fragments.to_vec();
SimpleReadingOrder::default().order(&mut ordered);
std::borrow::Cow::Owned(ordered)
}
ReadingOrderStrategy::XYCut { min_gap } => {
let mut ordered = fragments.to_vec();
XYCutReadingOrder::new(*min_gap).order(&mut ordered);
std::borrow::Cow::Owned(ordered)
}
ReadingOrderStrategy::None => std::borrow::Cow::Borrowed(fragments),
};
let fragments = fragments.as_ref();
let mut claimed = vec![false; fragments.len()];
let mut elements = Vec::new();
for (i, f) in fragments.iter().enumerate() {
if claimed[i] {
continue;
}
let Some(tag) = f.struct_tag.as_deref() else {
continue;
};
match classify_by_struct_tag(tag) {
Some(StructTagClass::Heading) => {
let trimmed = f.text.trim();
if trimmed.is_empty() {
continue;
}
let mut meta = meta_from_fragment(f, page);
meta.confidence = 1.0;
elements.push(Element::Title(ElementData {
text: trimmed.to_string(),
metadata: meta,
}));
claimed[i] = true;
}
Some(StructTagClass::ListItem) => {
let trimmed = f.text.trim();
if trimmed.is_empty() {
continue;
}
let mut meta = meta_from_fragment(f, page);
meta.confidence = 1.0;
elements.push(Element::ListItem(ElementData {
text: trimmed.to_string(),
metadata: meta,
}));
claimed[i] = true;
}
Some(StructTagClass::List) | Some(StructTagClass::Artifact) | None => {
}
}
}
if self.config.detect_headers_footers && page_height > 0.0 {
let header_threshold = page_height * (1.0 - self.config.header_zone);
let footer_threshold = page_height * self.config.footer_zone;
for (i, f) in fragments.iter().enumerate() {
if claimed[i] {
continue;
}
let text_too_long = f.text.chars().count() > MAX_HEADER_TEXT_LEN;
let is_body_tagged = struct_tag_is_body(&f.struct_tag);
if f.y >= header_threshold && !text_too_long && !is_body_tagged {
let zone_size = page_height * self.config.header_zone;
let distance = f.y - header_threshold;
let header_confidence = compute_zone_confidence(distance, zone_size);
let mut meta = meta_from_fragment(f, page);
meta.confidence = header_confidence;
elements.push(Element::Header(ElementData {
text: f.text.clone(),
metadata: meta,
}));
claimed[i] = true;
} else if f.y + f.height <= footer_threshold && !text_too_long && !is_body_tagged {
let zone_size = page_height * self.config.footer_zone;
let distance = footer_threshold - (f.y + f.height);
let footer_confidence = compute_zone_confidence(distance, zone_size);
let mut meta = meta_from_fragment(f, page);
meta.confidence = footer_confidence;
elements.push(Element::Footer(ElementData {
text: f.text.clone(),
metadata: meta,
}));
claimed[i] = true;
}
}
}
if self.config.detect_tables {
if self.config.prefer_ruling_tables {
if let Some(graphics) = graphics {
if graphics.has_table_structure() {
let detector = crate::text::table_detection::TableDetector::default();
if let Ok(tables) = detector.detect(graphics, ruling_fragments) {
for table in &tables {
if table.confidence < self.config.min_table_confidence {
continue;
}
let rows = ruling_table_to_rows(table);
let bbox = ElementBBox::new(
table.bbox.x,
table.bbox.y,
table.bbox.width,
table.bbox.height,
);
elements.push(Element::Table(TableElementData {
rows,
metadata: ElementMetadata {
page,
bbox,
confidence: table.confidence,
..Default::default()
},
}));
let (rx, ry) = (table.bbox.x, table.bbox.y);
let (rr, rt) = (
table.bbox.x + table.bbox.width,
table.bbox.y + table.bbox.height,
);
for (i, f) in fragments.iter().enumerate() {
if !claimed[i]
&& f.x >= rx - 1.0
&& f.x <= rr + 1.0
&& f.y >= ry - 1.0
&& f.y <= rt + 1.0
{
claimed[i] = true;
}
}
}
}
}
}
}
let unclaimed_frags: Vec<&TextFragment> = fragments
.iter()
.enumerate()
.filter(|(i, _)| !claimed[*i])
.map(|(_, f)| f)
.collect();
let detector = crate::text::structured::StructuredDataDetector::new(Default::default());
let regions = segment_into_table_regions(&unclaimed_frags, 2.0);
for region in ®ions {
if region_looks_like_list(region) {
continue;
}
let region_owned: Vec<TextFragment> = region.iter().map(|f| (*f).clone()).collect();
if let Ok(result) = detector.detect(®ion_owned) {
for table in &result.tables {
if table.confidence < self.config.min_table_confidence {
continue;
}
let rows: Vec<Vec<String>> = table
.rows
.iter()
.map(|row| row.cells.iter().map(|c| c.text.clone()).collect())
.collect();
let bbox = ElementBBox::new(
table.bounding_box.x,
table.bounding_box.y,
table.bounding_box.width,
table.bounding_box.height,
);
elements.push(Element::Table(TableElementData {
rows,
metadata: ElementMetadata {
page,
bbox,
confidence: table.confidence,
..Default::default()
},
}));
for (i, f) in fragments.iter().enumerate() {
if !claimed[i]
&& f.x >= table.bounding_box.x - 1.0
&& f.x <= table.bounding_box.right() + 1.0
&& f.y >= table.bounding_box.y - 1.0
&& f.y <= table.bounding_box.top() + 1.0
{
claimed[i] = true;
}
}
}
}
}
}
let body_font_size = {
let sizes: Vec<f64> = fragments
.iter()
.enumerate()
.filter(|(i, _)| !claimed[*i])
.map(|(_, f)| f.font_size)
.filter(|s| *s > 0.0)
.collect();
if sizes.is_empty() {
12.0
} else {
let mut freq = std::collections::HashMap::new();
for s in &sizes {
let key = (*s * 2.0).round() as i64;
*freq.entry(key).or_insert(0usize) += 1;
}
let mode_key = freq
.into_iter()
.max_by(|(key_a, count_a), (key_b, count_b)| {
count_a.cmp(count_b).then(key_b.cmp(key_a))
})
.map(|(key, _)| key)
.unwrap_or(24);
mode_key as f64 / 2.0
}
};
let title_threshold = body_font_size * self.config.title_min_font_ratio;
for (i, f) in fragments.iter().enumerate() {
if claimed[i] {
continue;
}
let meta = meta_from_fragment(f, page);
let text = f.text.trim();
if text.is_empty() {
continue;
}
if let Some(colon_pos) = text.find(':') {
let key = text[..colon_pos].trim();
let value = text[colon_pos + 1..].trim();
let key_word_count = key.split_whitespace().count();
if !key.is_empty()
&& !value.is_empty()
&& key.len() < 40
&& key_word_count <= 4
&& !key.contains('.')
&& !is_prose_prefix(key)
{
let kv_confidence = compute_kv_confidence(key);
let mut meta = meta;
meta.confidence = kv_confidence;
elements.push(Element::KeyValue(KeyValueElementData {
key: key.to_string(),
value: value.to_string(),
metadata: meta,
}));
continue;
}
}
let p_or_span = matches!(f.struct_tag.as_deref(), Some("P") | Some("Span"));
let mut is_title = false;
let mut title_confidence = 0.0_f64;
if f.font_size >= title_threshold && f.font_size > body_font_size {
let ratio = f.font_size / body_font_size;
is_title = true;
title_confidence = title_confidence.max(compute_title_confidence(
ratio,
self.config.title_min_font_ratio,
));
}
if !p_or_span && bold_short_title(f) {
is_title = true;
title_confidence = title_confidence.max(0.7);
}
if numeric_prefix_title(f) {
is_title = true;
title_confidence = title_confidence.max(0.8);
}
if is_title {
let mut meta = meta;
meta.confidence = title_confidence.clamp(0.5, 1.0);
elements.push(Element::Title(ElementData {
text: text.to_string(),
metadata: meta,
}));
continue;
}
if is_list_item(text) {
elements.push(Element::ListItem(ElementData {
text: text.to_string(),
metadata: meta,
}));
continue;
}
elements.push(Element::Paragraph(ElementData {
text: text.to_string(),
metadata: meta,
}));
}
match &self.config.reading_order {
ReadingOrderStrategy::None => {}
_ => {
elements.sort_by_key(|e| e.page());
}
}
elements = Self::assign_heading_paths(elements);
elements
}
/// Annotates every element with the chain of titles it sits under.
///
/// Title font sizes are grouped into levels, largest first; sizes within 5% of
/// a level's representative size share that level. Walking the elements in
/// order, each title closes every open heading of the same or a deeper level
/// and then becomes the innermost open heading. Each element (titles
/// included) receives the open headings as its heading path and the innermost
/// one as its parent heading.
///
/// A title without a usable font size is placed one level below the deepest
/// known level.
pub(crate) fn assign_heading_paths(mut elements: Vec<Element>) -> Vec<Element> {
    const SAME_LEVEL_TOLERANCE: f64 = 0.05;

    // Usable title sizes, largest first.
    let mut title_sizes: Vec<f64> = Vec::new();
    for candidate in elements.iter() {
        if !matches!(candidate, Element::Title(_)) {
            continue;
        }
        if let Some(size) = candidate.metadata().font_size {
            if size.is_finite() && size > 0.0 {
                title_sizes.push(size);
            }
        }
    }
    title_sizes.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));

    // One representative size per level; index 0 is the top level.
    let mut level_sizes: Vec<f64> = Vec::new();
    for size in title_sizes {
        let already_covered = level_sizes
            .iter()
            .any(|rep| (rep - size).abs() <= rep * SAME_LEVEL_TOLERANCE);
        if !already_covered {
            level_sizes.push(size);
        }
    }

    // Levels are stored as `u8`; anything larger saturates.
    let saturate = |n: usize| -> u8 { u8::try_from(n).unwrap_or(u8::MAX) };
    let level_for = |size: Option<f64>| -> u8 {
        let Some(s) = size.filter(|s| s.is_finite() && *s > 0.0) else {
            return saturate(level_sizes.len() + 1);
        };
        let matching_level = level_sizes
            .iter()
            .position(|rep| (s - rep).abs() <= rep * SAME_LEVEL_TOLERANCE);
        match matching_level {
            Some(index) => saturate(index + 1),
            None => saturate(level_sizes.len().max(1)),
        }
    };

    // Open headings, outermost first. Levels are strictly increasing along the
    // stack, so popping from the top removes exactly the entries whose level
    // is not smaller than the new title's level.
    let mut open_headings: Vec<(u8, String)> = Vec::new();
    for element in elements.iter_mut() {
        if matches!(element, Element::Title(_)) {
            let level = level_for(element.metadata().font_size);
            while matches!(open_headings.last(), Some((top, _)) if *top >= level) {
                open_headings.pop();
            }
            open_headings.push((level, element.text().to_string()));
        }
        let trail: Vec<String> = open_headings
            .iter()
            .map(|(_, title)| title.clone())
            .collect();
        element.set_parent_heading(trail.last().cloned());
        element.set_heading_path(trail);
    }
    elements
}
}
/// Reports whether `key` (the text before a colon) reads like the start of a
/// sentence rather than a field label.
///
/// The comparison is case-insensitive. The key is prose when its first word is
/// a prose indicator, or when it has more than two words and any word after
/// the first is one.
fn is_prose_prefix(key: &str) -> bool {
    const PROSE_INDICATORS: &[&str] = &[
        "as",
        "the",
        "this",
        "that",
        "these",
        "those",
        "it",
        "is",
        "was",
        "were",
        "has",
        "have",
        "had",
        "will",
        "would",
        "should",
        "could",
        "may",
        "might",
        "shall",
        "can",
        "do",
        "does",
        "did",
        "being",
        "been",
        "are",
        "for",
        "with",
        "from",
        "into",
        "about",
        "after",
        "before",
        "during",
        "between",
        "through",
        "however",
        "therefore",
        "furthermore",
        "moreover",
        "although",
        "because",
        "since",
        "while",
        "when",
        "where",
        "which",
        "who",
        "whom",
        "whose",
        "according",
    ];

    let lowered = key.to_lowercase();
    let tokens: Vec<&str> = lowered.split_whitespace().collect();
    match tokens.split_first() {
        None => false,
        Some((head, tail)) => {
            let is_indicator = |word: &&str| PROSE_INDICATORS.contains(word);
            // Later words are only inspected for keys of three or more words.
            is_indicator(head) || (tokens.len() > 2 && tail.iter().any(is_indicator))
        }
    }
}
/// Reports whether `text` (ignoring leading whitespace) starts with a list marker.
///
/// Recognised markers, each followed by a single space:
/// - bullets `-`, `•`, `*`, `–`, `—`;
/// - one ASCII digit or lowercase ASCII letter followed by `.` or `)`;
/// - two ASCII digits followed by `.` or `)`.
fn is_list_item(text: &str) -> bool {
    const BULLET_MARKERS: [&str; 5] = ["- ", "• ", "* ", "– ", "— "];

    let candidate = text.trim_start();
    if BULLET_MARKERS
        .iter()
        .any(|marker| candidate.starts_with(*marker))
    {
        return true;
    }

    let closes_marker = |b: u8| b == b'.' || b == b')';
    match candidate.as_bytes() {
        // "1. ", "a) ", ...
        [lead, close, b' ', ..]
            if (lead.is_ascii_digit() || lead.is_ascii_lowercase()) && closes_marker(*close) =>
        {
            true
        }
        // "10. ", "42) ", ...
        [tens, ones, close, b' ', ..]
            if tens.is_ascii_digit() && ones.is_ascii_digit() && closes_marker(*close) =>
        {
            true
        }
        _ => false,
    }
}
/// Converts a ruling-detected table into a dense `rows x columns` grid of cell text.
///
/// Positions without a cell stay as empty strings; cells whose row or column
/// index lies outside the table's declared dimensions are ignored.
fn ruling_table_to_rows(table: &crate::text::table_detection::DetectedTable) -> Vec<Vec<String>> {
    let empty_row = vec![String::new(); table.columns];
    let mut grid = vec![empty_row; table.rows];
    for cell in table.cells.iter() {
        let slot = grid
            .get_mut(cell.row)
            .and_then(|row| row.get_mut(cell.column));
        if let Some(slot) = slot {
            *slot = cell.text.clone();
        }
    }
    grid
}
/// Splits `fragments` into vertically contiguous regions that are candidates
/// for text-only table detection.
///
/// Fragments are visited from the highest `y` to the lowest. A new region
/// starts whenever the vertical gap to the previous fragment exceeds
/// `gap_multiplier` times the median positive fragment height (12.0 when no
/// fragment has a positive height). Regions with fewer than four fragments
/// are dropped.
fn segment_into_table_regions<'a>(
    fragments: &[&'a TextFragment],
    gap_multiplier: f64,
) -> Vec<Vec<&'a TextFragment>> {
    const MIN_REGION_FRAGMENTS: usize = 4;

    if fragments.is_empty() {
        return Vec::new();
    }

    let mut top_down: Vec<&'a TextFragment> = fragments.to_vec();
    top_down.sort_by(|a, b| b.y.total_cmp(&a.y));

    let mut positive_heights: Vec<f64> = top_down
        .iter()
        .map(|f| f.height)
        .filter(|h| *h > 0.0)
        .collect();
    positive_heights.sort_by(f64::total_cmp);
    let median_height = match positive_heights.len() {
        0 => 12.0,
        n if n % 2 == 0 => (positive_heights[n / 2 - 1] + positive_heights[n / 2]) / 2.0,
        n => positive_heights[n / 2],
    };
    let gap_threshold = median_height * gap_multiplier;

    let mut regions: Vec<Vec<&'a TextFragment>> = Vec::new();
    let mut open_region: Vec<&'a TextFragment> = Vec::new();
    for frag in top_down {
        // Gap between the previous fragment's `y` and this fragment's `y + height`.
        let breaks_region = match open_region.last() {
            Some(prev) => prev.y - (frag.y + frag.height) > gap_threshold,
            None => false,
        };
        if breaks_region {
            let finished = std::mem::take(&mut open_region);
            if finished.len() >= MIN_REGION_FRAGMENTS {
                regions.push(finished);
            }
        }
        open_region.push(frag);
    }
    if open_region.len() >= MIN_REGION_FRAGMENTS {
        regions.push(open_region);
    }
    regions
}
fn region_looks_like_list(fragments: &[&TextFragment]) -> bool {
if fragments.is_empty() {
return false;
}
let tolerance = 15.0;
let mut x_clusters: Vec<f64> = Vec::new();
for frag in fragments {
let x = frag.x;
let found = x_clusters.iter().any(|&cx| (cx - x).abs() <= tolerance);
if !found {
x_clusters.push(x);
}
}
if x_clusters.len() != 2 {
return false;
}
x_clusters.sort_by(f64::total_cmp);
let left_x = x_clusters[0];
let left_frags: Vec<&TextFragment> = fragments
.iter()
.filter(|f| (f.x - left_x).abs() <= tolerance)
.copied()
.collect();
if left_frags.is_empty() {
return false;
}
let avg_left_len = left_frags
.iter()
.map(|f| f.text.trim().chars().count())
.sum::<usize>() as f64
/ left_frags.len() as f64;
avg_left_len <= 3.0
}
/// Builds element metadata for page `page` from a fragment's geometry and font
/// attributes.
///
/// Confidence starts at 1.0 and the heading fields start empty; callers
/// overwrite the confidence where a heuristic supplies its own value.
fn meta_from_fragment(f: &TextFragment, page: u32) -> ElementMetadata {
    let bbox = ElementBBox::new(f.x, f.y, f.width, f.height);
    let font_name = f.font_name.clone();
    let font_size = Some(f.font_size);
    ElementMetadata {
        page,
        bbox,
        confidence: 1.0,
        font_name,
        font_size,
        is_bold: f.is_bold,
        is_italic: f.is_italic,
        parent_heading: None,
        heading_path: Vec::new(),
        #[cfg(feature = "unstable-spi")]
        class_label: None,
    }
}
/// Confidence that a fragment is a title, from how far its font-size ratio
/// exceeds the configured minimum.
///
/// Yields 0.5 when `actual_ratio` equals `min_ratio`, rising linearly to 1.0
/// at twice `min_ratio`; the result is clamped to `[0.5, 1.0]`. A
/// non-positive `min_ratio` yields 1.0.
fn compute_title_confidence(actual_ratio: f64, min_ratio: f64) -> f64 {
    if min_ratio <= 0.0 {
        1.0
    } else {
        let bonus = 0.5 * (actual_ratio - min_ratio) / min_ratio;
        (0.5 + bonus).clamp(0.5, 1.0)
    }
}
/// Confidence for a header/footer classification: how deep into the zone the
/// fragment sits, as `distance / zone_size` clamped to `[0.5, 1.0]`.
///
/// A non-positive `zone_size` yields 0.5.
fn compute_zone_confidence(distance: f64, zone_size: f64) -> f64 {
    if zone_size <= 0.0 {
        0.5
    } else {
        let depth = distance / zone_size;
        depth.clamp(0.5, 1.0)
    }
}
/// Confidence that `key` is the label of a key-value pair.
///
/// Starts at 1.0 and subtracts a length penalty (`characters / 40`) plus 0.1
/// for every word beyond the second; the result is clamped to `[0.5, 1.0]`.
///
/// The length is measured in characters, not bytes, so that keys written in
/// non-ASCII scripts are not penalised for their UTF-8 encoding size. This
/// matches the character-based length limits used elsewhere in this file.
fn compute_kv_confidence(key: &str) -> f64 {
    let len_penalty = key.chars().count() as f64 / 40.0;
    let extra_words = key.split_whitespace().count().saturating_sub(2);
    let word_penalty = 0.1 * extra_words as f64;
    (1.0 - len_penalty - word_penalty).clamp(0.5, 1.0)
}
// Limits used by the header/footer and title heuristics.
/// Fragments with more characters than this are never classified as a header or footer.
const MAX_HEADER_TEXT_LEN: usize = 100;
/// Bold fragments with more characters (after trimming) than this are not bold-short titles.
const MAX_BOLD_TITLE_LEN: usize = 120;
/// Section-prefixed fragments with more characters (after trimming) than this are not titles.
const MAX_NUMERIC_TITLE_LEN: usize = 120;
/// Section-prefixed fragments with more whitespace-separated words than this are not titles.
const MAX_NUMERIC_TITLE_WORDS: usize = 14;
/// Reports whether the last character of `s` is `.`, `!` or `?`.
///
/// No trimming is performed; an empty string yields `false`.
fn ends_with_sentence_terminator(s: &str) -> bool {
    s.ends_with(|c: char| c == '.' || c == '!' || c == '?')
}
/// Title heuristic: the fragment is bold, its trimmed text has between 1 and
/// `MAX_BOLD_TITLE_LEN` characters, and it does not end like a sentence.
fn bold_short_title(f: &TextFragment) -> bool {
    let candidate = f.text.trim();
    let length = candidate.chars().count();
    let short_enough = (1..=MAX_BOLD_TITLE_LEN).contains(&length);
    f.is_bold && short_enough && !ends_with_sentence_terminator(candidate)
}
/// Returns the lazily compiled, process-wide regex that matches a section
/// prefix at the start of a string, including the whitespace that follows it.
///
/// Accepted prefixes: a capital letter plus a dotted number with an optional
/// lowercase sub-letter (`A2.a`), a dotted number (`3.2.1`, `1.`),
/// `Section N` / `Chapter N` with an optional colon, and a run of `I`/`V`/`X`
/// followed by a dot (`IV.`).
fn section_prefix_regex() -> &'static regex::Regex {
    const SECTION_PREFIX_PATTERN: &str =
        r"^([A-Z]\d+(\.\d+)*(\.[a-z]\.?)?|\d+(\.\d+)*\.?|Section\s+\d+:?|Chapter\s+\d+:?|[IVX]+\.)\s+";
    static COMPILED: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();

    // The pattern is a literal, so failing to compile is a programming error.
    let compile = || {
        regex::Regex::new(SECTION_PREFIX_PATTERN).expect("section_prefix_regex must compile")
    };
    COMPILED.get_or_init(compile)
}
/// Reports whether `s` begins with a section prefix followed by whitespace.
fn matches_section_prefix(s: &str) -> bool {
    let prefix = section_prefix_regex();
    prefix.is_match(s)
}
/// Returns `s` without its leading section prefix (and the whitespace matched
/// with it), or `s` unchanged when it has no such prefix.
fn strip_section_prefix(s: &str) -> &str {
    section_prefix_regex()
        .find(s)
        .map_or(s, |prefix| &s[prefix.end()..])
}
/// Title heuristic for numbered or lettered section headings such as
/// `1.1 Overview` or `A2.a Risk Management Process`.
///
/// The trimmed text must: have between 1 and `MAX_NUMERIC_TITLE_LEN`
/// characters, start with a section prefix, not be a plain list item, continue
/// with an uppercase character after the prefix, contain no comma, and have at
/// most `MAX_NUMERIC_TITLE_WORDS` words.
fn numeric_prefix_title(f: &TextFragment) -> bool {
    let candidate = f.text.trim();
    let length = candidate.chars().count();
    let plausible_length = length != 0 && length <= MAX_NUMERIC_TITLE_LEN;
    // Flat markers like "1. First item" match the prefix but are list items.
    if !plausible_length || !matches_section_prefix(candidate) || is_list_item(candidate) {
        return false;
    }

    let heading_text = strip_section_prefix(candidate).trim_start();
    let starts_uppercase = heading_text
        .chars()
        .next()
        .map_or(false, |c| c.is_uppercase());

    // Commas and long word counts indicate a sentence, not a heading.
    starts_uppercase
        && !candidate.contains(',')
        && candidate.split_whitespace().count() <= MAX_NUMERIC_TITLE_WORDS
}
/// Coarse classification of a fragment's structure tag, as produced by
/// `classify_by_struct_tag`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StructTagClass {
/// `H`, `H1`..`H6` or `Title`.
Heading,
/// The list container tag `L`.
List,
/// `LI`, `Lbl` or `LBody`.
ListItem,
/// The `Artifact` tag.
Artifact,
}
/// Maps a structure tag name to its `StructTagClass`.
///
/// Matching is exact and case-sensitive. Any tag not listed here (for example
/// `P` or `Span`) yields `None`.
fn classify_by_struct_tag(tag: &str) -> Option<StructTagClass> {
    let class = match tag {
        "Artifact" => StructTagClass::Artifact,
        "L" => StructTagClass::List,
        "LI" | "Lbl" | "LBody" => StructTagClass::ListItem,
        "Title" | "H" | "H1" | "H2" | "H3" | "H4" | "H5" | "H6" => StructTagClass::Heading,
        _ => return None,
    };
    Some(class)
}
/// Reports whether a fragment's structure tag marks it as body content
/// (paragraph, span, heading or list tags).
///
/// Returns `false` when there is no tag or the tag is anything else, such as
/// `Artifact`.
fn struct_tag_is_body(tag: &Option<String>) -> bool {
    const BODY_TAGS: [&str; 14] = [
        "P", "Span", "H", "H1", "H2", "H3", "H4", "H5", "H6", "Title", "L", "LI", "Lbl", "LBody",
    ];
    match tag.as_deref() {
        Some(name) => BODY_TAGS.contains(&name),
        None => false,
    }
}
// Unit tests for the struct-tag helpers, the title heuristics, header/footer
// zone detection, ruling-table handling and heading-path assignment.
#[cfg(test)]
mod tests {
use super::*;
// --- classify_by_struct_tag ---
#[test]
fn classify_by_struct_tag_recognizes_heading_tags() {
for tag in &["H", "H1", "H2", "H3", "H4", "H5", "H6", "Title"] {
assert_eq!(
classify_by_struct_tag(tag),
Some(StructTagClass::Heading),
"tag {tag} should classify as Heading"
);
}
}
#[test]
fn classify_by_struct_tag_recognizes_list_tags() {
assert_eq!(classify_by_struct_tag("L"), Some(StructTagClass::List));
assert_eq!(classify_by_struct_tag("LI"), Some(StructTagClass::ListItem));
assert_eq!(
classify_by_struct_tag("Lbl"),
Some(StructTagClass::ListItem)
);
assert_eq!(
classify_by_struct_tag("LBody"),
Some(StructTagClass::ListItem)
);
}
#[test]
fn classify_by_struct_tag_recognizes_artifact() {
assert_eq!(
classify_by_struct_tag("Artifact"),
Some(StructTagClass::Artifact)
);
}
// Tags without a dedicated class must fall through to the heuristics.
#[test]
fn classify_by_struct_tag_returns_none_for_passthrough_tags() {
for tag in &[
"P", "Span", "Figure", "Table", "Caption", "Form", "Note", "Random",
] {
assert_eq!(
classify_by_struct_tag(tag),
None,
"tag {tag} should be None (fall through)"
);
}
}
// --- struct_tag_is_body ---
#[test]
fn struct_tag_is_body_recognizes_body_tags() {
for tag in &[
"P", "Span", "L", "LI", "H1", "H2", "H6", "Title", "Lbl", "LBody",
] {
assert!(
struct_tag_is_body(&Some(tag.to_string())),
"tag {tag} should be body"
);
}
}
#[test]
fn struct_tag_is_body_returns_false_for_artifact() {
assert!(!struct_tag_is_body(&Some("Artifact".to_string())));
}
#[test]
fn struct_tag_is_body_returns_false_for_none() {
assert!(!struct_tag_is_body(&None));
}
// Fixture: a fragment at the origin (100 x 12) with the given text, boldness
// and font size; no struct tag.
fn frag(text: &str, bold: bool, font_size: f64) -> TextFragment {
TextFragment {
text: text.to_string(),
x: 0.0,
y: 0.0,
width: 100.0,
height: 12.0,
font_size,
font_name: None,
is_bold: bold,
is_italic: false,
color: None,
space_decisions: Vec::new(),
mcid: None,
struct_tag: None,
}
}
// --- ends_with_sentence_terminator ---
#[test]
fn ends_with_sentence_terminator_table() {
assert!(ends_with_sentence_terminator("This is a paragraph."));
assert!(ends_with_sentence_terminator("Really?"));
assert!(ends_with_sentence_terminator("Stop!"));
assert!(!ends_with_sentence_terminator("Section heading"));
assert!(!ends_with_sentence_terminator("A2.a Risk Management"));
assert!(!ends_with_sentence_terminator(""));
}
// --- bold_short_title ---
#[test]
fn bold_short_title_accepts_bold_short_no_terminator() {
assert!(bold_short_title(&frag("Section Heading", true, 12.0)));
assert!(bold_short_title(&frag("Principle A2", true, 11.0)));
}
#[test]
fn bold_short_title_rejects_non_bold() {
assert!(!bold_short_title(&frag("Section Heading", false, 12.0)));
}
#[test]
fn bold_short_title_rejects_long_text() {
let long = "x".repeat(150);
assert!(!bold_short_title(&frag(&long, true, 12.0)));
}
#[test]
fn bold_short_title_rejects_sentence_with_period() {
assert!(!bold_short_title(&frag(
"This is a complete sentence.",
true,
12.0
)));
}
// Whitespace-only text trims to empty and must be rejected.
#[test]
fn bold_short_title_rejects_empty() {
assert!(!bold_short_title(&frag(" ", true, 12.0)));
}
// --- numeric_prefix_title ---
#[test]
fn numeric_prefix_title_accepts_known_patterns() {
let cases = &[
"A2.a Risk Management Process",
"A1.b Roles and Responsibilities",
"1.1 Overview",
"3.2.1 Detailed Requirements",
"Section 4: Implementation",
"Chapter 7 Conclusion",
"IV. Findings",
];
for c in cases {
assert!(
numeric_prefix_title(&frag(c, false, 12.0)),
"should match: {c}"
);
}
}
// A number followed by lowercase prose is not a heading.
#[test]
fn numeric_prefix_title_rejects_money_amount() {
assert!(!numeric_prefix_title(&frag(
"1.2 million users were affected",
false,
12.0
)));
}
#[test]
fn numeric_prefix_title_rejects_version_string() {
assert!(!numeric_prefix_title(&frag(
"version 3.0.1 release notes",
false,
12.0
)));
}
#[test]
fn numeric_prefix_title_rejects_lowercase_continuation() {
assert!(!numeric_prefix_title(&frag(
"1. take action now",
false,
12.0
)));
}
// Flat "N. text" / "N) text" markers are list items, while multi-level and
// lettered prefixes stay titles.
#[test]
fn numeric_prefix_title_rejects_flat_numbered_list_marker() {
for c in &["1. First item", "2. Second item", "10) Tenth item"] {
assert!(
!numeric_prefix_title(&frag(c, false, 12.0)),
"flat numbered list marker must not be a Title: {c}"
);
}
for c in &["1.1 Overview", "A2.a Risk Management Process"] {
assert!(
numeric_prefix_title(&frag(c, false, 12.0)),
"multi-level/lettered prefix must remain a Title: {c}"
);
}
}
#[test]
fn numeric_prefix_title_rejects_text_without_prefix() {
assert!(!numeric_prefix_title(&frag(
"Overview of the system",
false,
12.0
)));
}
#[test]
fn numeric_prefix_title_rejects_too_long() {
let mut s = String::from("A2.a ");
s.push_str(&"X".repeat(220));
assert!(!numeric_prefix_title(&frag(&s, false, 12.0)));
}
#[test]
fn numeric_prefix_title_rejects_text_with_comma() {
assert!(!numeric_prefix_title(&frag(
"1. La vigilancia continua permite detectar amenazas, vulnerabilidades y errores.",
false,
12.0,
)));
}
#[test]
fn numeric_prefix_title_rejects_long_sentence_without_comma() {
assert!(!numeric_prefix_title(&frag(
"1. La vigilancia continua permitira la deteccion de actividades o comportamientos anomalos y su oportuna respuesta.",
false,
12.0,
)));
}
#[test]
fn numeric_prefix_title_rejects_just_over_max_len() {
let mut s = String::from("A2.a Risk Management ");
s.push_str(&"X".repeat(120));
assert!(!numeric_prefix_title(&frag(&s, false, 12.0)));
}
// Fixture: a non-bold fragment at (x, y), 100 wide, whose height equals its
// font size; no struct tag.
fn frag_at(text: &str, x: f64, y: f64, font_size: f64) -> TextFragment {
TextFragment {
text: text.to_string(),
x,
y,
width: 100.0,
height: font_size,
font_size,
font_name: None,
is_bold: false,
is_italic: false,
color: None,
space_decisions: Vec::new(),
mcid: None,
struct_tag: None,
}
}
// Fixture: a small (10 wide, size 8) fragment used as table cell content.
fn cell_frag(text: &str, x: f64, y: f64) -> TextFragment {
let mut f = frag_at(text, x, y, 8.0);
f.width = 10.0;
f
}
// --- ruling tables: raw vs. reconstructed fragments ---
// A 2x2 ruled grid: cell text must come from the raw fragments, while the
// single merged (reconstructed) fragment must be claimed by the table so that
// no other element is emitted for it.
#[test]
fn raw_fragments_drive_cell_text_while_reconstructed_drive_claiming() {
use crate::graphics::extraction::{ExtractedGraphics, VectorLine};
let mut graphics = ExtractedGraphics::new();
for y in [100.0, 150.0, 200.0] {
graphics.add_line(VectorLine::new(100.0, y, 300.0, y, 1.0, true, None));
}
for x in [100.0, 200.0, 300.0] {
graphics.add_line(VectorLine::new(x, 100.0, x, 200.0, 1.0, true, None));
}
assert!(graphics.has_table_structure());
let raw = vec![
cell_frag("TL", 120.0, 175.0),
cell_frag("TR", 220.0, 175.0),
cell_frag("BL", 120.0, 125.0),
cell_frag("BR", 220.0, 125.0),
];
let reconstructed = vec![cell_frag("TL TR BL BR", 120.0, 175.0)];
let p = Partitioner::new(PartitionConfig::default());
let elements = p.partition_fragments_with_graphics_raw(
&reconstructed,
Some(&raw),
Some(&graphics),
0,
842.0,
);
let rows = elements
.iter()
.find_map(|e| match e {
Element::Table(t) => Some(t.rows.clone()),
_ => None,
})
.expect("a Table element");
assert_eq!(rows.len(), 2, "two grid rows, got {rows:?}");
assert_eq!(rows[0], vec!["TL".to_string(), "TR".to_string()]);
assert_eq!(rows[1], vec!["BL".to_string(), "BR".to_string()]);
assert_eq!(
elements.len(),
1,
"merged fragment must be claimed, got {elements:?}"
);
}
// --- struct tags vs. heuristics in the full partitioner ---
// An H1 tag alone makes a Title, even at body font size.
#[test]
fn struct_tag_h1_yields_title_no_font_ratio_needed() {
let mut f = frag_at("Section One", 50.0, 400.0, 12.0);
f.struct_tag = Some("H1".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
1
);
}
#[test]
fn struct_tag_p_does_not_block_numeric_prefix_title() {
let mut f = frag_at("A2.a Risk Management Process", 50.0, 400.0, 12.0);
f.struct_tag = Some("P".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
1,
"numeric-prefix heuristic must fire even when struct_tag=P (NCSC sub-sections)"
);
}
// A P tag suppresses only the bold-short heuristic.
#[test]
fn struct_tag_p_overrides_bold_short_heuristic() {
let mut f = frag_at("Bold Short Text", 50.0, 400.0, 12.0);
f.is_bold = true;
f.struct_tag = Some("P".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert!(elements.iter().any(|e| matches!(e, Element::Paragraph(_))));
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
0
);
}
#[test]
fn bold_short_title_fires_without_struct_tag() {
let mut f = frag_at("Risk Management", 50.0, 400.0, 12.0);
f.is_bold = true;
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
1,
"bold-short heuristic must fire when no other signals present"
);
}
#[test]
fn numeric_prefix_title_fires_without_bold() {
let f = frag_at("A2.a Risk Management Process", 50.0, 400.0, 12.0);
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count(),
1,
"numeric-prefix heuristic must fire on NCSC-style sections"
);
}
#[test]
fn struct_tag_li_yields_list_item() {
let mut f = frag_at("Bullet content", 50.0, 400.0, 12.0);
f.struct_tag = Some("LI".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert_eq!(
elements
.iter()
.filter(|e| matches!(e, Element::ListItem(_)))
.count(),
1
);
}
// Five 12pt body lines set the body font size; the 20pt line must be a Title.
#[test]
fn font_ratio_title_still_works() {
let mut frags = vec![];
for i in 0..5 {
frags.push(frag_at(
&format!("body line {i}"),
50.0,
400.0 - (i as f64) * 15.0,
12.0,
));
}
frags.push(frag_at("Big Heading", 50.0, 500.0, 20.0));
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert!(
elements
.iter()
.filter(|e| matches!(e, Element::Title(_)))
.count()
>= 1,
"font-ratio Title path must still fire"
);
}
// --- header / footer zones (page height 800, default 5% zones) ---
#[test]
fn header_zone_rejects_long_text() {
let long = "X".repeat(200);
let frags = vec![frag_at(&long, 50.0, 780.0, 12.0)];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
let header_count = elements
.iter()
.filter(|e| matches!(e, Element::Header(_)))
.count();
assert_eq!(
header_count, 0,
"long text in header zone must not classify as Header"
);
}
#[test]
fn header_zone_accepts_short_text() {
let frags = vec![frag_at("My Report 2026", 50.0, 780.0, 12.0)];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
assert!(
elements.iter().any(|e| matches!(e, Element::Header(_))),
"short text in header zone must classify as Header"
);
}
#[test]
fn header_zone_rejects_p_struct_tag() {
let mut f = frag_at("Short body text", 50.0, 780.0, 12.0);
f.struct_tag = Some("P".to_string());
let frags = vec![f];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
let header_count = elements
.iter()
.filter(|e| matches!(e, Element::Header(_)))
.count();
assert_eq!(header_count, 0);
}
#[test]
fn footer_zone_rejects_long_text() {
let long = "X".repeat(200);
let frags = vec![frag_at(&long, 50.0, 10.0, 12.0)];
let partitioner = Partitioner::new(PartitionConfig::default());
let elements = partitioner.partition_fragments(&frags, 0, 800.0);
let footer_count = elements
.iter()
.filter(|e| matches!(e, Element::Footer(_)))
.count();
assert_eq!(footer_count, 0);
}
// --- assign_heading_paths ---
// A larger title closes the smaller one: the second 20pt title resets the path.
#[test]
fn heading_path_builds_breadcrumb_by_font_size() {
use crate::pipeline::element::{Element, ElementData, ElementMetadata};
let title = |t: &str, size: f64| {
let metadata = ElementMetadata {
font_size: Some(size),
..ElementMetadata::default()
};
Element::Title(ElementData {
text: t.to_string(),
metadata,
})
};
let para = |t: &str| {
Element::Paragraph(ElementData {
text: t.to_string(),
metadata: ElementMetadata::default(),
})
};
let elements = vec![
title("1 Introduction", 20.0),
title("1.2 Scope", 14.0),
para("Body text under scope."),
title("2 Methods", 20.0),
para("Body under methods."),
];
let linked = Partitioner::assign_heading_paths(elements);
assert_eq!(
linked[2].metadata().heading_path,
vec!["1 Introduction", "1.2 Scope"]
);
assert_eq!(
linked[2].metadata().parent_heading.as_deref(),
Some("1.2 Scope")
);
assert_eq!(linked[4].metadata().heading_path, vec!["2 Methods"]);
}
// A title without a font size nests below the deepest known level instead of
// clearing the existing path.
#[test]
fn heading_path_unknown_size_title_appends_not_resets() {
use crate::pipeline::element::{Element, ElementData, ElementMetadata};
let title = |t: &str, size: Option<f64>| {
let metadata = ElementMetadata {
font_size: size,
..ElementMetadata::default()
};
Element::Title(ElementData {
text: t.to_string(),
metadata,
})
};
let para = |t: &str| {
Element::Paragraph(ElementData {
text: t.to_string(),
metadata: ElementMetadata::default(),
})
};
let elements = vec![
title("H1", Some(20.0)),
title("H1.1", Some(10.0)),
title("NoSize", None),
para("body"),
];
let linked = Partitioner::assign_heading_paths(elements);
assert_eq!(
linked[3].metadata().heading_path,
vec!["H1", "H1.1", "NoSize"]
);
assert_eq!(
linked[3].metadata().parent_heading.as_deref(),
Some("NoSize")
);
}
}