use crate::layout::text_block::{TextBlock, TextChar};
/// Groups characters into words, returning each word as a list of indices into `chars`.
///
/// Compiled only when the `ml` feature is enabled.
///
/// The work happens in two passes:
/// 1. Characters are ordered by descending bbox-center `y` (ties broken by ascending
///    center `x`) and split into lines. A character joins the current line while its
///    center `y` is within `epsilon` of the center `y` of the character that opened
///    that line.
/// 2. Each line is ordered by `bbox.x` and split into words wherever the gap between one
///    character's `bbox.right()` and the next character's `bbox.left()` exceeds
///    `epsilon`. Overlapping characters count as a gap of zero.
///
/// # Arguments
/// * `chars` - characters to cluster; the returned indices refer to this slice.
/// * `epsilon` - tolerance used both for line membership (vertical) and for the maximum
///   horizontal gap inside a word.
///
/// # Returns
/// One `Vec<usize>` per word, emitted line by line, with the indices of each word in
/// ascending `bbox.x` order. An empty input yields an empty vector.
#[cfg(feature = "ml")]
pub fn cluster_chars_into_words(chars: &[TextChar], epsilon: f32) -> Vec<Vec<usize>> {
    if chars.is_empty() {
        return vec![];
    }
    if chars.len() == 1 {
        return vec![vec![0]];
    }

    let mut indices: Vec<usize> = (0..chars.len()).collect();
    indices.sort_by(|&a, &b| {
        crate::utils::safe_float_cmp(chars[b].bbox.center().y, chars[a].bbox.center().y)
            .then_with(|| {
                crate::utils::safe_float_cmp(chars[a].bbox.center().x, chars[b].bbox.center().x)
            })
    });

    // Pass 1: split the y-ordered indices into lines anchored on their first character.
    let mut lines: Vec<Vec<usize>> = vec![];
    let mut current_line: Vec<usize> = vec![indices[0]];
    let mut line_y = chars[indices[0]].bbox.center().y;
    for &idx in &indices[1..] {
        let y = chars[idx].bbox.center().y;
        if (y - line_y).abs() <= epsilon {
            current_line.push(idx);
        } else {
            lines.push(std::mem::replace(&mut current_line, vec![idx]));
            line_y = y;
        }
    }
    // `current_line` always holds at least the index that opened it.
    lines.push(current_line);

    // Pass 2: split every line into words on horizontal gaps.
    let mut clusters: Vec<Vec<usize>> = vec![];
    for line in &mut lines {
        // The global ordering is keyed on y first, so characters of one line whose
        // centers differ slightly in y arrive out of x order. Without this sort the
        // clamped gap below would be 0 for any character lying to the left of its
        // predecessor, gluing arbitrarily distant characters into one word.
        line.sort_by(|&a, &b| crate::utils::safe_float_cmp(chars[a].bbox.x, chars[b].bbox.x));

        let mut cluster = vec![line[0]];
        let mut prev_right = chars[line[0]].bbox.right();
        for &idx in &line[1..] {
            let x_gap = (chars[idx].bbox.left() - prev_right).max(0.0);
            if x_gap <= epsilon {
                cluster.push(idx);
            } else {
                clusters.push(std::mem::replace(&mut cluster, vec![idx]));
            }
            prev_right = chars[idx].bbox.right();
        }
        // Each cluster is a contiguous run of the x-sorted line, so it is already in
        // ascending `bbox.x` order and needs no further sorting.
        clusters.push(cluster);
    }
    clusters
}
/// Groups words into lines, returning each line as a list of indices into `words`.
///
/// Compiled only when the `ml` feature is enabled.
///
/// Words are visited in descending `bbox.y` order (ties broken by ascending `bbox.x`).
/// A word joins the current line while its `bbox.y` is within `epsilon_y` of the
/// `bbox.y` of the word that opened that line; otherwise it opens a new line. The
/// indices of every line are finally ordered by ascending `bbox.x`.
///
/// # Arguments
/// * `words` - word blocks to cluster; the returned indices refer to this slice.
/// * `epsilon_y` - maximum `bbox.y` distance from a line's first word for membership.
///
/// # Returns
/// One `Vec<usize>` per line, in the order the lines were opened. An empty input yields
/// an empty vector.
#[cfg(feature = "ml")]
pub fn cluster_words_into_lines(words: &[TextBlock], epsilon_y: f32) -> Vec<Vec<usize>> {
    match words.len() {
        0 => return Vec::new(),
        1 => return vec![vec![0]],
        _ => {}
    }

    let mut order: Vec<usize> = (0..words.len()).collect();
    order.sort_by(|&lhs, &rhs| {
        crate::utils::safe_float_cmp(words[rhs].bbox.y, words[lhs].bbox.y)
            .then_with(|| crate::utils::safe_float_cmp(words[lhs].bbox.x, words[rhs].bbox.x))
    });

    // Split the y-ordered indices into groups anchored on the y of their first word.
    let mut groups: Vec<Vec<usize>> = Vec::new();
    let mut open_group: Vec<usize> = vec![order[0]];
    let mut anchor_y = words[order[0]].bbox.y;
    for &candidate in order.iter().skip(1) {
        let candidate_y = words[candidate].bbox.y;
        if (candidate_y - anchor_y).abs() <= epsilon_y {
            open_group.push(candidate);
        } else {
            groups.push(std::mem::replace(&mut open_group, vec![candidate]));
            anchor_y = candidate_y;
        }
    }
    groups.push(open_group);

    // Order the members of every line by x once all lines are known.
    for group in &mut groups {
        group.sort_by(|&lhs, &rhs| {
            crate::utils::safe_float_cmp(words[lhs].bbox.x, words[rhs].bbox.x)
        });
    }
    groups
}
/// Groups characters into words, returning each word as a list of indices into `chars`.
///
/// Compiled only when the `ml` feature is disabled.
///
/// The work happens in two passes:
/// 1. Characters are ordered by descending bbox-center `y` (ties broken by ascending
///    center `x`) and split into lines. A character joins the current line while the
///    distance between its center `y` and the center `y` of the line's first character
///    is strictly less than half of the larger of the two characters' `font_size`.
/// 2. Each line is ordered by `bbox.x` and split into words wherever the gap between one
///    character's `bbox.right()` and the next character's `bbox.left()` exceeds
///    `epsilon`. Overlapping characters count as a gap of zero.
///
/// # Arguments
/// * `chars` - characters to cluster; the returned indices refer to this slice.
/// * `epsilon` - maximum horizontal gap inside a word. It is not used for line
///   membership, which depends on font size only.
///
/// # Returns
/// One `Vec<usize>` per word, emitted line by line, with the indices of each word in
/// ascending `bbox.x` order. An empty input yields an empty vector.
#[cfg(not(feature = "ml"))]
pub fn cluster_chars_into_words(chars: &[TextChar], epsilon: f32) -> Vec<Vec<usize>> {
    if chars.is_empty() {
        return vec![];
    }
    if chars.len() == 1 {
        return vec![vec![0]];
    }

    let mut indices: Vec<usize> = (0..chars.len()).collect();
    indices.sort_by(|&a, &b| {
        crate::utils::safe_float_cmp(chars[b].bbox.center().y, chars[a].bbox.center().y)
            .then_with(|| {
                crate::utils::safe_float_cmp(chars[a].bbox.center().x, chars[b].bbox.center().x)
            })
    });

    // Pass 1: split the y-ordered indices into lines anchored on their first character.
    let mut lines: Vec<Vec<usize>> = vec![];
    let mut current_line: Vec<usize> = vec![indices[0]];
    let mut line_y = chars[indices[0]].bbox.center().y;
    for &idx in &indices[1..] {
        let y = chars[idx].bbox.center().y;
        // Tolerance is half the font size of whichever is larger: the candidate
        // character or the character that opened the current line.
        let font_half = chars[idx].font_size * 0.5;
        let tolerance = font_half.max(chars[current_line[0]].font_size * 0.5);
        if (y - line_y).abs() < tolerance {
            current_line.push(idx);
        } else {
            lines.push(std::mem::replace(&mut current_line, vec![idx]));
            line_y = y;
        }
    }
    // `current_line` always holds at least the index that opened it.
    lines.push(current_line);

    // Pass 2: split every line into words on horizontal gaps.
    let mut clusters: Vec<Vec<usize>> = vec![];
    for line in &mut lines {
        // The global ordering is keyed on y first, so characters of one line whose
        // centers differ slightly in y arrive out of x order. Without this sort the
        // clamped gap below would be 0 for any character lying to the left of its
        // predecessor, gluing arbitrarily distant characters into one word.
        line.sort_by(|&a, &b| crate::utils::safe_float_cmp(chars[a].bbox.x, chars[b].bbox.x));

        let mut cluster = vec![line[0]];
        let mut prev_right = chars[line[0]].bbox.right();
        for &idx in &line[1..] {
            let x_gap = (chars[idx].bbox.left() - prev_right).max(0.0);
            if x_gap <= epsilon {
                cluster.push(idx);
            } else {
                clusters.push(std::mem::replace(&mut cluster, vec![idx]));
            }
            prev_right = chars[idx].bbox.right();
        }
        // Each cluster is a contiguous run of the x-sorted line, so it is already in
        // ascending `bbox.x` order and needs no further sorting.
        clusters.push(cluster);
    }
    clusters
}
/// Groups words into lines, returning each line as a list of indices into `words`.
///
/// Compiled only when the `ml` feature is disabled.
///
/// The work happens in two passes:
/// 1. Words are ordered by descending `bbox.y` (ties broken by ascending `bbox.x`) and
///    split into y bands. A word joins the current band while its `bbox.y` is within
///    `epsilon_y` of the `bbox.y` of the word that opened that band.
/// 2. Each band is ordered by `bbox.x` and split wherever the horizontal gap between
///    consecutive words reaches the column-gap threshold, so that words in the same
///    band but far apart horizontally end up in separate clusters.
///
/// # Arguments
/// * `words` - word blocks to cluster; the returned indices refer to this slice.
/// * `epsilon_y` - maximum `bbox.y` distance from a band's first word for membership.
///
/// # Returns
/// One `Vec<usize>` per cluster, emitted band by band, with the indices of each cluster
/// in ascending `bbox.x` order. An empty input yields an empty vector.
#[cfg(not(feature = "ml"))]
pub fn cluster_words_into_lines(words: &[TextBlock], epsilon_y: f32) -> Vec<Vec<usize>> {
    // Horizontal gap at or beyond which two neighbouring words of the same y band are
    // put into separate clusters.
    const COLUMN_GAP_THRESHOLD: f32 = 50.0;

    if words.is_empty() {
        return vec![];
    }

    let mut indices: Vec<usize> = (0..words.len()).collect();
    indices.sort_by(|&a, &b| {
        crate::utils::safe_float_cmp(words[b].bbox.y, words[a].bbox.y)
            .then_with(|| crate::utils::safe_float_cmp(words[a].bbox.x, words[b].bbox.x))
    });

    // Pass 1: split the y-ordered indices into bands anchored on their first word.
    let mut y_bands: Vec<Vec<usize>> = vec![];
    let mut current_band: Vec<usize> = vec![indices[0]];
    let mut band_y = words[indices[0]].bbox.y;
    for &idx in &indices[1..] {
        let y = words[idx].bbox.y;
        if (y - band_y).abs() <= epsilon_y {
            current_band.push(idx);
        } else {
            y_bands.push(std::mem::replace(&mut current_band, vec![idx]));
            band_y = y;
        }
    }
    // `current_band` always holds at least the index that opened it.
    y_bands.push(current_band);

    // Pass 2: split every band on large horizontal gaps.
    let mut clusters: Vec<Vec<usize>> = vec![];
    for band in &mut y_bands {
        band.sort_by(|&a, &b| crate::utils::safe_float_cmp(words[a].bbox.x, words[b].bbox.x));

        let mut cluster = vec![band[0]];
        let mut prev_right = words[band[0]].bbox.right();
        for &idx in &band[1..] {
            // Clamp at zero so overlapping boxes count as touching. Measuring the
            // absolute edge distance instead would report a large overlap as a large
            // "gap" and wrongly split overlapping words into different clusters.
            let x_gap = (words[idx].bbox.left() - prev_right).max(0.0);
            if x_gap < COLUMN_GAP_THRESHOLD {
                cluster.push(idx);
            } else {
                clusters.push(std::mem::replace(&mut cluster, vec![idx]));
            }
            prev_right = words[idx].bbox.right();
        }
        clusters.push(cluster);
    }
    clusters
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::Rect;
    use crate::layout::{Color, FontWeight};

    /// Builds a character with a 10x12 box at `(x, y)` and fixed 12pt "Times" styling.
    fn mock_char(c: char, x: f32, y: f32) -> TextChar {
        let bbox = Rect::new(x, y, 10.0, 12.0);
        TextChar {
            char: c,
            bbox,
            font_name: "Times".to_string(),
            font_size: 12.0,
            font_weight: FontWeight::Normal,
            color: Color::black(),
            mcid: None,
            is_italic: false,
            is_monospace: false,
            origin_x: bbox.x,
            origin_y: bbox.y,
            rotation_degrees: 0.0,
            advance_width: bbox.width,
            matrix: None,
        }
    }

    /// Builds a word block made of a single mock character at `(x, y)`.
    fn single_char_word(c: char, x: f32, y: f32) -> TextBlock {
        TextBlock::from_chars(vec![mock_char(c, x, y)])
    }

    /// Lays out the characters of `text` on the row `y = 0`, 11 units apart, from `start_x`.
    fn mock_word_chars(text: &str, start_x: f32) -> Vec<TextChar> {
        text.chars()
            .enumerate()
            .map(|(offset, c)| mock_char(c, start_x + offset as f32 * 11.0, 0.0))
            .collect()
    }

    #[test]
    fn test_cluster_chars_empty() {
        let chars = vec![];
        let clusters = cluster_chars_into_words(&chars, 8.0);
        assert!(clusters.is_empty());
    }

    #[test]
    fn test_cluster_chars_single() {
        let chars = vec![mock_char('A', 0.0, 0.0)];
        let clusters = cluster_chars_into_words(&chars, 8.0);
        assert_eq!(clusters.len(), 1);
        assert_eq!(clusters[0], vec![0]);
    }

    #[test]
    fn test_cluster_chars_into_words() {
        // "Hello" occupies x = 0..54 and "World" starts at x = 100, so the 46-unit gap
        // between them is wider than the 20-unit epsilon.
        let mut chars = mock_word_chars("Hello", 0.0);
        chars.extend(mock_word_chars("World", 100.0));

        let clusters = cluster_chars_into_words(&chars, 20.0);
        assert_eq!(clusters.len(), 2);
        for hello_idx in 0..5 {
            assert!(clusters[0].contains(&hello_idx));
        }
        for world_idx in 5..10 {
            assert!(clusters[1].contains(&world_idx));
        }
    }

    #[test]
    fn test_cluster_words_empty() {
        let words: Vec<TextBlock> = vec![];
        let clusters = cluster_words_into_lines(&words, 5.0);
        assert!(clusters.is_empty());
    }

    #[test]
    fn test_cluster_words_single() {
        let words = vec![single_char_word('A', 0.0, 0.0)];
        let clusters = cluster_words_into_lines(&words, 5.0);
        assert_eq!(clusters.len(), 1);
        assert_eq!(clusters[0], vec![0]);
    }

    #[test]
    fn test_cluster_words_into_lines() {
        // Two rows of two words each; rows differ by ~30 in y, words in a row by 1.
        let words = vec![
            single_char_word('H', 0.0, 0.0),
            single_char_word('W', 50.0, 1.0),
            single_char_word('F', 0.0, 30.0),
            single_char_word('B', 50.0, 31.0),
        ];

        let lines = cluster_words_into_lines(&words, 5.0);
        assert_eq!(lines.len(), 2);
        // The row with the larger y values is emitted first.
        for upper_idx in [2, 3] {
            assert!(lines[0].contains(&upper_idx));
        }
        for lower_idx in [0, 1] {
            assert!(lines[1].contains(&lower_idx));
        }
    }

    #[test]
    fn test_words_sorted_by_x_in_line() {
        // The word with the smaller x is given second but must come first in the line.
        let words = vec![
            single_char_word('W', 40.0, 0.0),
            single_char_word('H', 0.0, 1.0),
        ];

        let lines = cluster_words_into_lines(&words, 5.0);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], vec![1, 0]);
    }
}