use super::extraction::TextBlock;
use crate::pdf::error::{PdfError, Result};
const KMEANS_MAX_ITERATIONS: usize = 100;
const KMEANS_CONVERGENCE_THRESHOLD: f32 = 0.01;
#[derive(Debug, Clone)]
pub struct FontSizeCluster {
pub centroid: f32,
pub members: Vec<TextBlock>,
}
pub fn cluster_font_sizes(blocks: &[TextBlock], k: usize) -> Result<Vec<FontSizeCluster>> {
if blocks.is_empty() {
return Ok(Vec::new());
}
if k == 0 {
return Err(PdfError::TextExtractionFailed("K must be greater than 0".to_string()));
}
let actual_k = k.min(blocks.len());
let mut font_sizes: Vec<f32> = blocks.iter().map(|b| b.font_size).collect();
font_sizes.sort_by(|a, b| b.partial_cmp(a).expect("Failed to compare font sizes during sorting")); font_sizes.dedup();
let mut centroids: Vec<f32> = Vec::new();
if font_sizes.len() >= actual_k {
let step = font_sizes.len() / actual_k;
for i in 0..actual_k {
let idx = i * step;
centroids.push(font_sizes[idx.min(font_sizes.len() - 1)]);
}
} else {
centroids = font_sizes.clone();
let min_font = font_sizes[font_sizes.len() - 1];
let max_font = font_sizes[0];
let range = max_font - min_font;
while centroids.len() < actual_k {
let t = centroids.len() as f32 / (actual_k - 1) as f32;
let interpolated = max_font - t * range;
centroids.push(interpolated);
}
centroids.sort_by(|a, b| b.partial_cmp(a).expect("Failed to compare centroids during sorting"));
}
for _ in 0..KMEANS_MAX_ITERATIONS {
let clusters = assign_blocks_to_centroids(blocks, ¢roids);
let mut new_centroids = Vec::with_capacity(actual_k);
for (i, cluster) in clusters.iter().enumerate() {
if !cluster.is_empty() {
new_centroids.push(cluster.iter().map(|b| b.font_size).sum::<f32>() / cluster.len() as f32);
} else {
new_centroids.push(centroids[i]);
}
}
let converged = centroids
.iter()
.zip(new_centroids.iter())
.all(|(old, new)| (old - new).abs() < KMEANS_CONVERGENCE_THRESHOLD);
std::mem::swap(&mut centroids, &mut new_centroids);
if converged {
break;
}
}
let clusters = assign_blocks_to_centroids(blocks, ¢roids);
let mut result: Vec<FontSizeCluster> = Vec::new();
for i in 0..actual_k {
if !clusters[i].is_empty() {
let centroid_value = centroids[i];
result.push(FontSizeCluster {
centroid: centroid_value,
members: clusters[i].clone(),
});
}
}
result.sort_by(|a, b| {
b.centroid
.partial_cmp(&a.centroid)
.expect("Failed to compare centroids during final sort")
});
Ok(result)
}
fn assign_blocks_to_centroids(blocks: &[TextBlock], centroids: &[f32]) -> Vec<Vec<TextBlock>> {
let mut clusters: Vec<Vec<TextBlock>> = vec![Vec::new(); centroids.len()];
for block in blocks {
let mut min_distance = f32::INFINITY;
let mut best_cluster = 0;
for (i, ¢roid) in centroids.iter().enumerate() {
let distance = (block.font_size - centroid).abs();
if distance < min_distance {
min_distance = distance;
best_cluster = i;
}
}
clusters[best_cluster].push(block.clone());
}
clusters
}