use super::hierarchy::SegmentData;
pub use html_to_markdown_rs::hocr::{HocrWord, reconstruct_table, table_to_markdown};
/// Converts one text segment into a single [`HocrWord`] spanning the whole segment.
///
/// `left`, `width` and `height` are the segment's `x`, `width` and `height`
/// rounded to the nearest integer and clamped at zero. `top` is
/// `page_height - (seg.y + seg.height)`, rounded and clamped the same way —
/// presumably this flips a bottom-origin y axis into top-origin image
/// coordinates (TODO confirm against the producer of `SegmentData`).
///
/// The segment text is copied verbatim (no trimming) and every word receives
/// a fixed confidence of `95.0`.
pub fn segment_to_hocr_word(seg: &SegmentData, page_height: f32) -> HocrWord {
    // Round to the nearest unit and clamp negatives to zero before the cast.
    let to_units = |value: f32| value.round().max(0.0) as u32;
    let upper_edge = seg.y + seg.height;

    HocrWord {
        text: seg.text.clone(),
        left: to_units(seg.x),
        top: to_units(page_height - upper_edge),
        width: to_units(seg.width),
        height: to_units(seg.height),
        confidence: 95.0,
    }
}
/// Splits a segment into one [`HocrWord`] per whitespace-separated token.
///
/// * A blank segment yields no words.
/// * A segment whose trimmed text contains no whitespace is returned as a
///   single word via [`segment_to_hocr_word`] (text and box unchanged).
/// * Otherwise the segment's horizontal extent is divided proportionally to
///   byte offsets: a token starting at byte `o` with byte length `n` in a
///   text of `t` bytes gets `left = x + o / t * width` and
///   `width = n / t * width` (at least 1). Offsets are measured in the
///   untrimmed text, so leading whitespace shifts the first word right.
///
/// All words share the segment's `top` and `height`, computed exactly as in
/// [`segment_to_hocr_word`], and a fixed confidence of `95.0`.
pub fn split_segment_to_words(seg: &SegmentData, page_height: f32) -> Vec<HocrWord> {
    let content = seg.text.trim();
    if content.is_empty() {
        return Vec::new();
    }
    if !content.chars().any(char::is_whitespace) {
        return vec![segment_to_hocr_word(seg, page_height)];
    }

    let full_text = seg.text.as_str();
    // Non-zero here: the trimmed text is non-empty, so the full text is too.
    let byte_len = full_text.len() as f32;
    let top = (page_height - (seg.y + seg.height)).round().max(0.0) as u32;
    let height = seg.height.round().max(0.0) as u32;

    // Byte index just past the previously emitted token; each search resumes
    // from here so repeated tokens map to their own occurrence.
    let mut cursor = 0usize;
    full_text
        .split_whitespace()
        .filter_map(|token| {
            let start = cursor + full_text[cursor..].find(token)?;
            cursor = start + token.len();

            let start_share = start as f32 / byte_len;
            let width_share = token.len() as f32 / byte_len;
            Some(HocrWord {
                text: token.to_string(),
                left: (seg.x + start_share * seg.width).round().max(0.0) as u32,
                top,
                width: (width_share * seg.width).round().max(1.0) as u32,
                height,
                confidence: 95.0,
            })
        })
        .collect()
}
/// Splits every segment into words with [`split_segment_to_words`] and
/// returns them as one flat list, preserving segment order.
pub fn segments_to_words(segments: &[SegmentData], page_height: f32) -> Vec<HocrWord> {
    let mut words = Vec::new();
    for segment in segments {
        words.extend(split_segment_to_words(segment, page_height));
    }
    words
}
/// Validates and cleans a reconstructed table grid.
///
/// Picks the minimum number of columns the first row must have and delegates
/// to `post_process_table_inner`:
///
/// * `allow_single_column` → 1 (takes precedence over `layout_guided`)
/// * `layout_guided` → 2
/// * otherwise → 3
///
/// Returns `None` when the grid is rejected, otherwise the cleaned table with
/// a single header row followed by the data rows.
pub fn post_process_table(
    table: Vec<Vec<String>>,
    layout_guided: bool,
    allow_single_column: bool,
) -> Option<Vec<Vec<String>>> {
    let min_columns = match (allow_single_column, layout_guided) {
        (true, _) => 1,
        (false, true) => 2,
        (false, false) => 3,
    };
    post_process_table_inner(table, min_columns, layout_guided)
}
/// Validates and cleans a reconstructed table, returning `None` when the grid
/// does not look like a real table.
///
/// Steps, in order:
///
/// 1. Drop rows whose cells are all blank; reject if nothing is left.
/// 2. Reject grids that read like prose: more than half of the non-blank
///    cells are longer than 60 characters, or the (integer) average length
///    of non-blank cells exceeds 50 characters.
/// 3. Reject grids whose first row has fewer than `min_columns` cells.
/// 4. Locate the first row with at least three cells containing an ASCII
///    digit and treat it as the start of the data. Up to two rows directly
///    above it are merged cell-wise (space separated) into one header row.
///    When no rows precede the data — or no such row exists — the first row
///    becomes the header, which requires at least two rows.
/// 5. Fold every column whose data cells are all blank into a neighbouring
///    column via `merge_header_only_column`.
/// 6. Reject results with fewer than two columns, with any column that is
///    blank in more than 90% (`layout_guided`) or 75% of the data rows, or
///    with fewer than 25% (`layout_guided`) or 40% of data cells filled.
/// 7. Without layout guidance, additionally reject when some column holds
///    less than 15% of the data text (by UTF-8 byte length) while being
///    blank in more than half of the data rows.
/// 8. Normalise whitespace in header cells and run `normalize_data_cell` on
///    every data cell.
fn post_process_table_inner(
    mut table: Vec<Vec<String>>,
    min_columns: usize,
    layout_guided: bool,
) -> Option<Vec<Vec<String>>> {
    table.retain(|row| row.iter().any(|cell| !cell.trim().is_empty()));
    if table.is_empty() {
        return None;
    }

    // Prose detection over all non-blank cells.
    let mut non_empty = 0usize;
    let mut long_cells = 0usize;
    let mut total_chars = 0usize;
    for cell in table.iter().flatten() {
        let trimmed = cell.trim();
        if trimmed.is_empty() {
            continue;
        }
        let char_count = trimmed.chars().count();
        non_empty += 1;
        total_chars += char_count;
        if char_count > 60 {
            long_cells += 1;
        }
    }
    if non_empty > 0 && (long_cells * 2 > non_empty || total_chars / non_empty > 50) {
        return None;
    }

    if table.first().map_or(0, Vec::len) < min_columns {
        return None;
    }

    // First row with >= 3 digit-bearing cells marks the start of the data.
    let data_start = table
        .iter()
        .position(|row| {
            row.iter()
                .filter(|cell| cell.chars().any(|c| c.is_ascii_digit()))
                .count()
                >= 3
        })
        .unwrap_or(0);

    // Move rows instead of cloning them: `table` keeps the rows above the
    // data, of which only the last two are used as header rows.
    let mut data_rows = table.split_off(data_start);
    let surplus = table.len().saturating_sub(2);
    let mut header_rows = table.split_off(surplus);

    if header_rows.is_empty() {
        if data_rows.len() < 2 {
            return None;
        }
        header_rows.push(data_rows.remove(0));
    }

    let column_count = header_rows
        .first()
        .or_else(|| data_rows.first())
        .map_or(0, Vec::len);
    if column_count == 0 {
        return None;
    }

    // Merge the header rows cell-wise into a single row.
    let mut header = vec![String::new(); column_count];
    for row in &header_rows {
        for (idx, cell) in row.iter().enumerate() {
            let trimmed = cell.trim();
            if trimmed.is_empty() {
                continue;
            }
            // A later header row may be wider than the first one; grow the
            // merged header instead of indexing out of bounds.
            if idx >= header.len() {
                header.resize(idx + 1, String::new());
            }
            let slot = &mut header[idx];
            if !slot.is_empty() {
                slot.push(' ');
            }
            slot.push_str(trimmed);
        }
    }

    let mut processed = Vec::with_capacity(data_rows.len() + 1);
    processed.push(header);
    processed.extend(data_rows);
    if processed.len() <= 1 {
        return None;
    }

    // Fold columns that only carry header text. `col` is not advanced after a
    // merge because the next column has shifted into the same index.
    let mut col = 0;
    while col < processed[0].len() {
        let header_text = processed[0][col].trim().to_string();
        let data_empty = processed[1..]
            .iter()
            .all(|row| row.get(col).is_none_or(|cell| cell.trim().is_empty()));
        if data_empty {
            merge_header_only_column(&mut processed, col, header_text);
        } else {
            col += 1;
        }
        if processed.is_empty() || processed[0].is_empty() {
            return None;
        }
    }

    if processed[0].len() < 2 || processed.len() <= 1 {
        return None;
    }

    // At least one data row exists from here on.
    let data_row_count = processed.len() - 1;
    let num_cols = processed[0].len();
    let blank_rows_in = |table: &[Vec<String>], c: usize| {
        table[1..]
            .iter()
            .filter(|row| row.get(c).is_none_or(|cell| cell.trim().is_empty()))
            .count()
    };

    // Per-column sparsity: > 90% blank (layout guided) or > 75% blank.
    for c in 0..num_cols {
        let empty_count = blank_rows_in(&processed, c);
        let too_sparse = if layout_guided {
            empty_count * 10 > data_row_count * 9
        } else {
            empty_count * 4 > data_row_count * 3
        };
        if too_sparse {
            return None;
        }
    }

    // Overall density: < 25% filled (layout guided) or < 40% filled.
    let total_data_cells = data_row_count * num_cols;
    if total_data_cells > 0 {
        let filled = processed[1..]
            .iter()
            .flatten()
            .filter(|cell| !cell.trim().is_empty())
            .count();
        let too_sparse = if layout_guided {
            filled * 4 < total_data_cells
        } else {
            filled * 5 < total_data_cells * 2
        };
        if too_sparse {
            return None;
        }
    }

    // Without layout guidance, a column that is both nearly text-free and
    // mostly blank disqualifies the table. Text volume is measured in UTF-8
    // bytes of the trimmed cells.
    if !layout_guided {
        let col_char_counts: Vec<usize> = (0..num_cols)
            .map(|c| {
                processed[1..]
                    .iter()
                    .map(|row| row.get(c).map_or(0, |cell| cell.trim().len()))
                    .sum()
            })
            .collect();
        let total_chars: usize = col_char_counts.iter().sum();
        if total_chars > 0 {
            for (c, &col_chars) in col_char_counts.iter().enumerate() {
                let char_share = col_chars as f64 / total_chars as f64;
                let empty_ratio = blank_rows_in(&processed, c) as f64 / data_row_count as f64;
                if char_share < 0.15 && empty_ratio > 0.5 {
                    return None;
                }
            }
        }
    }

    // Header cells: trim and collapse every whitespace run to a single space.
    for cell in &mut processed[0] {
        *cell = cell.split_whitespace().collect::<Vec<_>>().join(" ");
    }
    for row in processed.iter_mut().skip(1) {
        for cell in row.iter_mut() {
            normalize_data_cell(cell);
        }
    }
    Some(processed)
}
/// Removes column `col` from every row of `table`, first folding its header
/// text into a neighbouring header cell so the label is not lost.
///
/// `table[0]` is the header row. When `header_text` (trimmed) is non-empty:
///
/// * for `col > 0` it is appended, space separated, to the nearest non-blank
///   header cell to the left, falling back to column 0 when all are blank;
/// * for `col == 0` it is prepended, space separated, to the header cell on
///   the right, if there is one.
///
/// Rows shorter than `col + 1` are left untouched rather than panicking, and
/// the call is a no-op when the table is empty or `col` lies outside the
/// header row.
fn merge_header_only_column(table: &mut [Vec<String>], col: usize, header_text: String) {
    let Some(header) = table.first_mut() else {
        return;
    };
    if col >= header.len() {
        return;
    }

    let label = header_text.trim();
    if !label.is_empty() {
        if col > 0 {
            // Closest labelled header cell on the left, or column 0.
            let target = header[..col]
                .iter()
                .rposition(|cell| !cell.trim().is_empty())
                .unwrap_or(0);
            let slot = &mut header[target];
            if !slot.is_empty() {
                slot.push(' ');
            }
            slot.push_str(label);
        } else if let Some(next) = header.get_mut(col + 1) {
            let merged = match next.trim() {
                "" => label.to_string(),
                existing => format!("{label} {existing}"),
            };
            *next = merged;
        }
    }

    for row in table.iter_mut() {
        if col < row.len() {
            row.remove(col);
        }
    }
}
/// Normalises a data cell in place.
///
/// * Surrounding whitespace is trimmed; a blank cell becomes empty.
/// * Em dash (U+2014), en dash (U+2013) and minus sign (U+2212) become `-`.
/// * A leading `"- "` is tightened to `"-"` with all following whitespace
///   removed, then every remaining `"- "` and `" -"` loses its space.
/// * `E-` / `E+` are lower-cased to `e-` / `e+`.
/// * A cell that ends up as a lone `-` is cleared.
fn normalize_data_cell(cell: &mut String) {
    let trimmed = cell.trim();
    if trimmed.is_empty() {
        cell.clear();
        return;
    }

    let mut text: String = trimmed
        .chars()
        .map(|c| match c {
            '\u{2014}' | '\u{2013}' | '\u{2212}' => '-',
            other => other,
        })
        .collect();

    if let Some(rest) = text.strip_prefix("- ") {
        text = format!("-{}", rest.trim_start());
    }

    let normalized = text
        .replace("- ", "-")
        .replace(" -", "-")
        .replace("E-", "e-")
        .replace("E+", "e+");

    *cell = if normalized == "-" {
        String::new()
    } else {
        normalized
    };
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Page height used by every test case.
    const PAGE_HEIGHT: f32 = 800.0;

    /// Builds a regular-weight, upright, proportional segment whose font size
    /// equals its height and whose baseline sits at `y`.
    fn make_seg(text: &str, x: f32, y: f32, width: f32, height: f32) -> SegmentData {
        SegmentData {
            text: text.to_string(),
            x,
            y,
            width,
            height,
            font_size: height,
            is_bold: false,
            is_italic: false,
            is_monospace: false,
            baseline_y: y,
        }
    }

    /// Collects the text of each word for compact comparisons.
    fn texts(words: &[HocrWord]) -> Vec<&str> {
        words.iter().map(|word| word.text.as_str()).collect()
    }

    /// A segment without inner whitespace stays a single word at the segment's x.
    #[test]
    fn test_split_single_word() {
        let segment = make_seg("Hello", 100.0, 500.0, 50.0, 12.0);
        let result = split_segment_to_words(&segment, PAGE_HEIGHT);
        assert_eq!(result.len(), 1);
        assert_eq!(texts(&result), ["Hello"]);
        assert_eq!(result[0].left, 100);
    }

    /// "A" starts at byte 4 of 5, i.e. 80% into the 100-wide segment.
    #[test]
    fn test_split_two_words() {
        let segment = make_seg("Col A", 100.0, 500.0, 100.0, 12.0);
        let result = split_segment_to_words(&segment, PAGE_HEIGHT);
        assert_eq!(result.len(), 2);
        assert_eq!(texts(&result), ["Col", "A"]);
        assert_eq!(result[1].left, 180);
    }

    /// Whitespace-only text produces no words.
    #[test]
    fn test_split_empty_segment() {
        let segment = make_seg(" ", 100.0, 500.0, 50.0, 12.0);
        let result = split_segment_to_words(&segment, PAGE_HEIGHT);
        assert!(result.is_empty());
    }

    /// Words keep their order and their left edges strictly increase.
    #[test]
    fn test_split_many_words() {
        let segment = make_seg("a b c d", 0.0, 0.0, 700.0, 12.0);
        let result = split_segment_to_words(&segment, PAGE_HEIGHT);
        assert_eq!(result.len(), 4);
        assert_eq!(texts(&result), ["a", "b", "c", "d"]);
        assert!(result.windows(2).all(|pair| pair[1].left > pair[0].left));
    }

    /// top = page_height - (y + height) = 800 - 512.
    #[test]
    fn test_split_y_coordinate_conversion() {
        let segment = make_seg("word", 100.0, 500.0, 50.0, 12.0);
        let result = split_segment_to_words(&segment, PAGE_HEIGHT);
        assert_eq!(result[0].top, 288);
        assert_eq!(result[0].height, 12);
    }

    /// Words from several segments are concatenated in segment order.
    #[test]
    fn test_segments_to_words_multiple() {
        let segments = vec![
            make_seg("Hello", 10.0, 700.0, 40.0, 12.0),
            make_seg("World", 55.0, 700.0, 40.0, 12.0),
        ];
        let result = segments_to_words(&segments, PAGE_HEIGHT);
        assert_eq!(result.len(), 2);
        assert_eq!(texts(&result), ["Hello", "World"]);
    }
}