use crate::pipeline::reading_order::{ReadingOrder, SimpleReadingOrder, XYCutReadingOrder};
use crate::pipeline::{
Element, ElementBBox, ElementData, ElementMetadata, KeyValueElementData, TableElementData,
};
use crate::text::extraction::TextFragment;
/// Selects how fragments are reordered before `Partitioner::partition_fragments`
/// classifies them.
#[derive(Debug, Clone, Default)]
pub enum ReadingOrderStrategy {
/// Reorder a copy of the fragments with `SimpleReadingOrder::default()`.
/// This is the default strategy.
#[default]
Simple,
/// Reorder a copy of the fragments with `XYCutReadingOrder::new(min_gap)`.
/// `min_gap` is forwarded unchanged to that constructor.
XYCut { min_gap: f64 },
/// Keep the fragments in the order the caller supplied them; the partitioner
/// also skips its final element sort for this strategy.
None,
}
/// Tunable settings read by `Partitioner::partition_fragments`.
#[derive(Debug, Clone)]
pub struct PartitionConfig {
/// When `true`, unclaimed fragments are segmented into regions and run
/// through the structured-data detector to find tables.
pub detect_tables: bool,
/// When `true` (and the page height is positive), fragments inside the
/// header/footer zones are emitted as `Header` / `Footer` elements.
pub detect_headers_footers: bool,
/// Multiplier applied to the page's most common font size; a fragment whose
/// font size reaches that product (and exceeds the body size) becomes a title.
pub title_min_font_ratio: f64,
/// Header zone size as a fraction of the page height
/// (a fragment is a header when `y >= page_height * (1 - header_zone)`).
pub header_zone: f64,
/// Footer zone size as a fraction of the page height
/// (a fragment is a footer when `y + height <= page_height * footer_zone`).
pub footer_zone: f64,
/// Strategy used to order fragments before classification.
pub reading_order: ReadingOrderStrategy,
/// Detected tables with a confidence below this value are discarded.
pub min_table_confidence: f64,
}
impl Default for PartitionConfig {
fn default() -> Self {
Self {
detect_tables: true,
detect_headers_footers: true,
title_min_font_ratio: 1.3,
header_zone: 0.05,
footer_zone: 0.05,
reading_order: ReadingOrderStrategy::Simple,
min_table_confidence: 0.5,
}
}
}
impl PartitionConfig {
    /// Returns the default configuration; equivalent to `PartitionConfig::default()`.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns the configuration with `title_min_font_ratio` replaced by `ratio`.
    pub fn with_title_min_font_ratio(self, ratio: f64) -> Self {
        Self {
            title_min_font_ratio: ratio,
            ..self
        }
    }

    /// Returns the configuration with table detection switched off.
    pub fn without_tables(self) -> Self {
        Self {
            detect_tables: false,
            ..self
        }
    }

    /// Returns the configuration with header/footer detection switched off.
    pub fn without_headers_footers(self) -> Self {
        Self {
            detect_headers_footers: false,
            ..self
        }
    }

    /// Returns the configuration with its reading-order strategy replaced by `strategy`.
    pub fn with_reading_order(self, strategy: ReadingOrderStrategy) -> Self {
        Self {
            reading_order: strategy,
            ..self
        }
    }

    /// Returns the configuration with `min_table_confidence` replaced by `threshold`.
    pub fn with_min_table_confidence(self, threshold: f64) -> Self {
        Self {
            min_table_confidence: threshold,
            ..self
        }
    }
}
/// Classifies the text fragments of a page into typed `Element`s
/// (headers, footers, tables, key/value pairs, titles, list items, paragraphs).
pub struct Partitioner {
// Settings consulted on every `partition_fragments` call.
config: PartitionConfig,
}
impl Partitioner {
    /// Creates a partitioner that classifies fragments according to `config`.
    pub fn new(config: PartitionConfig) -> Self {
        Self { config }
    }

    /// Classifies the text fragments of a single page into typed [`Element`]s.
    ///
    /// Processing happens in passes; a fragment consumed ("claimed") by one
    /// pass is not considered by later ones:
    ///
    /// 1. The fragments are reordered according to the configured
    ///    [`ReadingOrderStrategy`] (on a private copy; the input is not mutated).
    /// 2. If header/footer detection is enabled and `page_height > 0`, fragments
    ///    inside the header or footer zone become `Header` / `Footer` elements.
    /// 3. If table detection is enabled, the remaining fragments are segmented
    ///    into vertical regions and passed to the structured-data detector;
    ///    tables at or above `min_table_confidence` become `Table` elements and
    ///    claim every fragment whose origin lies within 1.0 unit of their
    ///    bounding box.
    /// 4. Every remaining non-blank fragment becomes, in priority order, a
    ///    `KeyValue`, `Title`, `ListItem` or `Paragraph` element.
    /// 5. Each element is tagged with the text of the most recent `Title`
    ///    preceding it (or itself, for titles) in the output vector.
    ///
    /// The output therefore lists headers/footers first, then tables, then the
    /// remaining elements in reading order.
    ///
    /// # Parameters
    /// * `fragments` - text fragments of one page.
    /// * `page` - page number recorded in every element's metadata.
    /// * `page_height` - page height used to size the header/footer zones;
    ///   a non-positive value disables header/footer detection.
    ///
    /// # Returns
    /// The classified elements; empty when `fragments` is empty.
    pub fn partition_fragments(
        &self,
        fragments: &[TextFragment],
        page: u32,
        page_height: f64,
    ) -> Vec<Element> {
        if fragments.is_empty() {
            return Vec::new();
        }

        // Reorder a copy of the fragments; only `None` avoids the copy.
        let fragments: std::borrow::Cow<[TextFragment]> = match &self.config.reading_order {
            ReadingOrderStrategy::Simple => {
                let mut ordered = fragments.to_vec();
                SimpleReadingOrder::default().order(&mut ordered);
                std::borrow::Cow::Owned(ordered)
            }
            ReadingOrderStrategy::XYCut { min_gap } => {
                let mut ordered = fragments.to_vec();
                XYCutReadingOrder::new(*min_gap).order(&mut ordered);
                std::borrow::Cow::Owned(ordered)
            }
            ReadingOrderStrategy::None => std::borrow::Cow::Borrowed(fragments),
        };
        let fragments = fragments.as_ref();

        // `claimed[i]` is set once fragment `i` has been consumed by a pass.
        let mut claimed = vec![false; fragments.len()];
        let mut elements = Vec::new();

        // Pass 1: headers and footers.
        // NOTE(review): headers are matched on large `y`, footers on small `y`,
        // which suggests a bottom-left page origin - confirm against the extractor.
        if self.config.detect_headers_footers && page_height > 0.0 {
            let header_threshold = page_height * (1.0 - self.config.header_zone);
            let footer_threshold = page_height * self.config.footer_zone;
            for (i, f) in fragments.iter().enumerate() {
                if claimed[i] {
                    continue;
                }
                if f.y >= header_threshold {
                    // Confidence grows with the distance past the zone boundary.
                    let zone_size = page_height * self.config.header_zone;
                    let distance = f.y - header_threshold;
                    let header_confidence = compute_zone_confidence(distance, zone_size);
                    let mut meta = meta_from_fragment(f, page);
                    meta.confidence = header_confidence;
                    elements.push(Element::Header(ElementData {
                        text: f.text.clone(),
                        metadata: meta,
                    }));
                    claimed[i] = true;
                } else if f.y + f.height <= footer_threshold {
                    let zone_size = page_height * self.config.footer_zone;
                    let distance = footer_threshold - (f.y + f.height);
                    let footer_confidence = compute_zone_confidence(distance, zone_size);
                    let mut meta = meta_from_fragment(f, page);
                    meta.confidence = footer_confidence;
                    elements.push(Element::Footer(ElementData {
                        text: f.text.clone(),
                        metadata: meta,
                    }));
                    claimed[i] = true;
                }
            }
        }

        // Pass 2: tables, detected per vertical region of unclaimed fragments.
        if self.config.detect_tables {
            let unclaimed_frags: Vec<&TextFragment> = fragments
                .iter()
                .enumerate()
                .filter(|(i, _)| !claimed[*i])
                .map(|(_, f)| f)
                .collect();
            let detector = crate::text::structured::StructuredDataDetector::new(Default::default());
            let regions = segment_into_table_regions(&unclaimed_frags, 2.0);
            for region in &regions {
                // Marker + text layouts are lists, not two-column tables.
                if region_looks_like_list(region) {
                    continue;
                }
                let region_owned: Vec<TextFragment> = region.iter().map(|f| (*f).clone()).collect();
                // Detection is best-effort: a detector error just means this
                // region yields no table.
                if let Ok(result) = detector.detect(&region_owned) {
                    for table in &result.tables {
                        if table.confidence < self.config.min_table_confidence {
                            continue;
                        }
                        let rows: Vec<Vec<String>> = table
                            .rows
                            .iter()
                            .map(|row| row.cells.iter().map(|c| c.text.clone()).collect())
                            .collect();
                        let bbox = ElementBBox::new(
                            table.bounding_box.x,
                            table.bounding_box.y,
                            table.bounding_box.width,
                            table.bounding_box.height,
                        );
                        elements.push(Element::Table(TableElementData {
                            rows,
                            metadata: ElementMetadata {
                                page,
                                bbox,
                                confidence: table.confidence,
                                ..Default::default()
                            },
                        }));
                        // Claim every unclaimed fragment whose origin falls
                        // inside the table box (with 1.0 unit of slack) so it
                        // is not emitted again as body text.
                        for (i, f) in fragments.iter().enumerate() {
                            if !claimed[i]
                                && f.x >= table.bounding_box.x - 1.0
                                && f.x <= table.bounding_box.right() + 1.0
                                && f.y >= table.bounding_box.y - 1.0
                                && f.y <= table.bounding_box.top() + 1.0
                            {
                                claimed[i] = true;
                            }
                        }
                    }
                }
            }
        }

        // Body font size = most frequent positive font size among unclaimed
        // fragments, bucketed to the nearest 0.5; falls back to 12.0.
        let body_font_size = {
            let sizes: Vec<f64> = fragments
                .iter()
                .enumerate()
                .filter(|(i, _)| !claimed[*i])
                .map(|(_, f)| f.font_size)
                .filter(|s| *s > 0.0)
                .collect();
            if sizes.is_empty() {
                12.0
            } else {
                let mut freq = std::collections::HashMap::new();
                for s in &sizes {
                    let key = (*s * 2.0).round() as i64;
                    *freq.entry(key).or_insert(0usize) += 1;
                }
                let mode_key = freq
                    .into_iter()
                    // Highest count wins; ties go to the smaller size so the
                    // result does not depend on hash-map iteration order.
                    .max_by(|(key_a, count_a), (key_b, count_b)| {
                        count_a.cmp(count_b).then(key_b.cmp(key_a))
                    })
                    .map(|(key, _)| key)
                    .unwrap_or(24);
                mode_key as f64 / 2.0
            }
        };
        let title_threshold = body_font_size * self.config.title_min_font_ratio;

        // Pass 3: classify whatever is left.
        for (i, f) in fragments.iter().enumerate() {
            if claimed[i] {
                continue;
            }
            let text = f.text.trim();
            if text.is_empty() {
                continue;
            }
            // Built only after the blank check so skipped fragments do not
            // pay for the metadata clone.
            let meta = meta_from_fragment(f, page);

            // "Key: value" lines with a short, non-prose key.
            if let Some(colon_pos) = text.find(':') {
                let key = text[..colon_pos].trim();
                let value = text[colon_pos + 1..].trim();
                let key_word_count = key.split_whitespace().count();
                if !key.is_empty()
                    && !value.is_empty()
                    && key.len() < 40
                    && key_word_count <= 4
                    && !key.contains('.')
                    && !is_prose_prefix(key)
                {
                    let kv_confidence = compute_kv_confidence(key);
                    let mut meta = meta;
                    meta.confidence = kv_confidence;
                    elements.push(Element::KeyValue(KeyValueElementData {
                        key: key.to_string(),
                        value: value.to_string(),
                        metadata: meta,
                    }));
                    continue;
                }
            }

            // Titles must reach the configured ratio AND be strictly larger
            // than body text (guards against ratios <= 1.0).
            if f.font_size >= title_threshold && f.font_size > body_font_size {
                let ratio = f.font_size / body_font_size;
                let title_confidence =
                    compute_title_confidence(ratio, self.config.title_min_font_ratio);
                let mut meta = meta;
                meta.confidence = title_confidence;
                elements.push(Element::Title(ElementData {
                    text: text.to_string(),
                    metadata: meta,
                }));
                continue;
            }

            if is_list_item(text) {
                elements.push(Element::ListItem(ElementData {
                    text: text.to_string(),
                    metadata: meta,
                }));
                continue;
            }

            elements.push(Element::Paragraph(ElementData {
                text: text.to_string(),
                metadata: meta,
            }));
        }

        // Stable sort by page keeps the per-pass emission order within a page.
        match &self.config.reading_order {
            ReadingOrderStrategy::None => {}
            _ => {
                elements.sort_by_key(|e| e.page());
            }
        }

        // Tag each element with the most recent title seen so far in the output.
        let mut current_heading: Option<String> = None;
        for element in &mut elements {
            if matches!(element, Element::Title(_)) {
                current_heading = Some(element.text().to_string());
            }
            element.set_parent_heading(current_heading.clone());
        }
        elements
    }
}
/// Reports whether `key` reads like the start of a sentence rather than a
/// field label.
///
/// The key is lower-cased and split on whitespace. It counts as prose when its
/// first word is a prose indicator, or when it has more than two words and any
/// word after the first is a prose indicator.
fn is_prose_prefix(key: &str) -> bool {
    const PROSE_INDICATORS: &[&str] = &[
        "as",
        "the",
        "this",
        "that",
        "these",
        "those",
        "it",
        "is",
        "was",
        "were",
        "has",
        "have",
        "had",
        "will",
        "would",
        "should",
        "could",
        "may",
        "might",
        "shall",
        "can",
        "do",
        "does",
        "did",
        "being",
        "been",
        "are",
        "for",
        "with",
        "from",
        "into",
        "about",
        "after",
        "before",
        "during",
        "between",
        "through",
        "however",
        "therefore",
        "furthermore",
        "moreover",
        "although",
        "because",
        "since",
        "while",
        "when",
        "where",
        "which",
        "who",
        "whom",
        "whose",
        "according",
    ];

    let lowered = key.to_lowercase();
    let tokens: Vec<&str> = lowered.split_whitespace().collect();

    match tokens.split_first() {
        // A leading indicator word is decisive on its own.
        Some((head, _)) if PROSE_INDICATORS.contains(head) => true,
        // Short (one- or two-word) keys are only judged by their first word.
        Some((_, rest)) if tokens.len() > 2 => {
            rest.iter().any(|token| PROSE_INDICATORS.contains(token))
        }
        _ => false,
    }
}
/// Reports whether `text` (ignoring leading whitespace) starts with a list marker.
///
/// Recognized markers, each of which must be followed by a space:
/// * a bullet: `-`, `•`, `*`, `–` or `—`;
/// * one ASCII digit or lowercase ASCII letter followed by `.` or `)`;
/// * two ASCII digits followed by `.` or `)`.
fn is_list_item(text: &str) -> bool {
    const BULLET_PREFIXES: [&str; 5] = ["- ", "• ", "* ", "– ", "— "];

    let body = text.trim_start();
    if BULLET_PREFIXES.iter().any(|prefix| body.starts_with(*prefix)) {
        return true;
    }

    // Enumerated markers are pure ASCII, so matching on raw bytes is safe.
    match body.as_bytes() {
        // "1. ", "a) ", ...
        [marker, b'.' | b')', b' ', ..]
            if marker.is_ascii_digit() || marker.is_ascii_lowercase() =>
        {
            true
        }
        // "10. ", "42) ", ...
        [tens, ones, b'.' | b')', b' ', ..]
            if tens.is_ascii_digit() && ones.is_ascii_digit() =>
        {
            true
        }
        _ => false,
    }
}
/// Groups fragments into vertically contiguous regions that are candidates
/// for table detection.
///
/// Fragments are visited from the largest `y` to the smallest. A new region
/// starts whenever the gap between the previous fragment's `y` and the current
/// fragment's `y + height` exceeds `gap_multiplier` times the median positive
/// fragment height (12.0 is used when no fragment has a positive height).
/// Regions with fewer than four fragments are discarded.
fn segment_into_table_regions<'a>(
    fragments: &[&'a TextFragment],
    gap_multiplier: f64,
) -> Vec<Vec<&'a TextFragment>> {
    // Fewer fragments than this cannot form a table worth detecting.
    const MIN_REGION_FRAGMENTS: usize = 4;

    if fragments.is_empty() {
        return Vec::new();
    }

    let mut by_descending_y = fragments.to_vec();
    by_descending_y.sort_by(|upper, lower| lower.y.total_cmp(&upper.y));

    let mut positive_heights: Vec<f64> = by_descending_y
        .iter()
        .map(|frag| frag.height)
        .filter(|&height| height > 0.0)
        .collect();
    positive_heights.sort_by(f64::total_cmp);

    let median_height = match positive_heights.len() {
        0 => 12.0,
        count if count % 2 == 0 => {
            (positive_heights[count / 2 - 1] + positive_heights[count / 2]) / 2.0
        }
        count => positive_heights[count / 2],
    };
    let max_gap = median_height * gap_multiplier;

    let mut regions: Vec<Vec<&'a TextFragment>> = Vec::new();
    let mut pending: Vec<&'a TextFragment> = Vec::new();
    for frag in by_descending_y {
        let breaks_region = pending
            .last()
            .map_or(false, |above| above.y - (frag.y + frag.height) > max_gap);
        if breaks_region {
            // Close the current region, keeping it only if it is large enough.
            let finished = std::mem::take(&mut pending);
            if finished.len() >= MIN_REGION_FRAGMENTS {
                regions.push(finished);
            }
        }
        pending.push(frag);
    }
    if pending.len() >= MIN_REGION_FRAGMENTS {
        regions.push(pending);
    }
    regions
}
fn region_looks_like_list(fragments: &[&TextFragment]) -> bool {
if fragments.is_empty() {
return false;
}
let tolerance = 15.0;
let mut x_clusters: Vec<f64> = Vec::new();
for frag in fragments {
let x = frag.x;
let found = x_clusters.iter().any(|&cx| (cx - x).abs() <= tolerance);
if !found {
x_clusters.push(x);
}
}
if x_clusters.len() != 2 {
return false;
}
x_clusters.sort_by(f64::total_cmp);
let left_x = x_clusters[0];
let left_frags: Vec<&TextFragment> = fragments
.iter()
.filter(|f| (f.x - left_x).abs() <= tolerance)
.copied()
.collect();
if left_frags.is_empty() {
return false;
}
let avg_left_len = left_frags
.iter()
.map(|f| f.text.trim().chars().count())
.sum::<usize>() as f64
/ left_frags.len() as f64;
avg_left_len <= 3.0
}
/// Builds element metadata for `page` from a single fragment.
///
/// The bounding box and font attributes are copied from the fragment, the
/// confidence starts at 1.0 and no parent heading is assigned.
fn meta_from_fragment(f: &TextFragment, page: u32) -> ElementMetadata {
    let bbox = ElementBBox::new(f.x, f.y, f.width, f.height);
    let font_name = f.font_name.clone();
    let font_size = Some(f.font_size);

    ElementMetadata {
        page,
        bbox,
        confidence: 1.0,
        font_name,
        font_size,
        is_bold: f.is_bold,
        is_italic: f.is_italic,
        parent_heading: None,
    }
}
/// Scores a title by how far its font ratio exceeds the minimum ratio.
///
/// Returns 1.0 when `min_ratio` is zero or negative. Otherwise the score is
/// 0.5 at `actual_ratio == min_ratio`, rises linearly, and is clamped to
/// `[0.5, 1.0]` (reaching 1.0 at twice the minimum ratio).
fn compute_title_confidence(actual_ratio: f64, min_ratio: f64) -> f64 {
    let ratio_unusable = min_ratio <= 0.0;
    if ratio_unusable {
        1.0
    } else {
        let excess = actual_ratio - min_ratio;
        let raw_score = 0.5 + 0.5 * excess / min_ratio;
        raw_score.clamp(0.5, 1.0)
    }
}
/// Scores a header/footer hit by how deep the fragment sits inside its zone.
///
/// Returns 0.5 when `zone_size` is zero or negative; otherwise returns
/// `distance / zone_size` clamped to `[0.5, 1.0]`.
fn compute_zone_confidence(distance: f64, zone_size: f64) -> f64 {
    let zone_unusable = zone_size <= 0.0;
    if zone_unusable {
        0.5
    } else {
        let depth_fraction = distance / zone_size;
        depth_fraction.clamp(0.5, 1.0)
    }
}
/// Scores a key/value match from the shape of its key.
///
/// Starts at 1.0, subtracts the key's byte length divided by 40 and a further
/// 0.1 for every word beyond the second, then clamps to `[0.5, 1.0]`.
fn compute_kv_confidence(key: &str) -> f64 {
    let extra_words = key.split_whitespace().count().saturating_sub(2);
    let word_penalty = 0.1 * extra_words as f64;
    let len_penalty = key.len() as f64 / 40.0;

    let score = 1.0 - len_penalty - word_penalty;
    score.clamp(0.5, 1.0)
}