/// Outcome of checking whether a detected layout region actually contains content.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// NOTE(review): the only users visible in this file are compiled solely with the
// `layout-detection` feature, which is presumably why dead-code warnings are
// silenced here; confirm before removing the attribute.
#[allow(dead_code)] pub(in crate::pdf::structure) enum RegionValidation {
/// The number of connected components found in the region reached the minimum
/// threshold, so the region is treated as containing real content.
HasContent,
/// The requested region is narrower or shorter than 5 pixels, or too few
/// connected components were found inside it.
Empty,
/// No verdict was produced: the hint was not a table or picture, the region
/// covered more than half of the page, the crop could not be assembled, or an
/// image-processing step returned an error.
Skipped,
}
/// Minimum number of connected components a binarized region must contain for
/// `validate_region_has_text` to report `RegionValidation::HasContent`; regions
/// with fewer components are reported as `RegionValidation::Empty`.
#[cfg(feature = "layout-detection")]
const MIN_TEXT_CC_COUNT: i32 = 3;
/// Copies a `w`×`h` pixel window whose top-left corner is at (`x`, `y`) out of a
/// tightly packed 3-bytes-per-pixel buffer that is `page_width` pixels wide.
///
/// Returns `None` when any requested row is not fully contained in `page_rgb`
/// or when a byte offset does not fit in `usize`.
#[cfg(feature = "layout-detection")]
fn copy_rgb_window(
    page_rgb: &[u8],
    page_width: u32,
    x: u32,
    y: u32,
    w: u32,
    h: u32,
) -> Option<Vec<u8>> {
    // All offsets are computed in checked `usize` arithmetic so that large
    // pages cannot overflow the way `u32` products could.
    let stride = (page_width as usize).checked_mul(3)?;
    let x_offset = (x as usize).checked_mul(3)?;
    let row_bytes = (w as usize).checked_mul(3)?;
    let first_row = y as usize;
    let rows = h as usize;

    // Never reserve more than the source buffer could possibly provide.
    let capacity = row_bytes.checked_mul(rows)?.min(page_rgb.len());
    let mut window = Vec::with_capacity(capacity);
    for row in first_row..first_row.checked_add(rows)? {
        let start = row.checked_mul(stride)?.checked_add(x_offset)?;
        let end = start.checked_add(row_bytes)?;
        // `get` rejects out-of-range rows instead of panicking.
        window.extend_from_slice(page_rgb.get(start..end)?);
    }
    Some(window)
}

/// Checks whether a rectangular region of a rendered page contains enough
/// connected components to be considered real content.
///
/// # Parameters
/// * `page_rgb` - page pixels, read as 3 bytes per pixel in row-major order.
/// * `page_width`, `page_height` - page dimensions in pixels.
/// * `region_x`, `region_y` - top-left corner of the region in pixels.
/// * `region_w`, `region_h` - requested region size in pixels; the region is
///   clamped to the page bounds before it is analysed.
///
/// # Returns
/// * [`RegionValidation::Empty`] if the requested region is smaller than 5
///   pixels in either dimension, or fewer than `MIN_TEXT_CC_COUNT` connected
///   components are found.
/// * [`RegionValidation::Skipped`] if the clamped region is smaller than 5
///   pixels in either dimension (including a region that starts outside the
///   page), the pixel buffer is too short for the region, or any
///   image-processing step fails.
/// * [`RegionValidation::HasContent`] otherwise.
///
/// This function does not panic on out-of-page coordinates or short buffers.
#[cfg(feature = "layout-detection")]
pub(in crate::pdf::structure) fn validate_region_has_text(
    page_rgb: &[u8],
    page_width: u32,
    page_height: u32,
    region_x: u32,
    region_y: u32,
    region_w: u32,
    region_h: u32,
) -> RegionValidation {
    if region_w < 5 || region_h < 5 {
        return RegionValidation::Empty;
    }

    // Clamp to the page *before* touching the buffer: a region whose origin
    // lies beyond the page edge collapses to zero size here instead of
    // producing an inverted slice range further down.
    let actual_w = region_w.min(page_width.saturating_sub(region_x));
    let actual_h = region_h.min(page_height.saturating_sub(region_y));
    if actual_w < 5 || actual_h < 5 {
        return RegionValidation::Skipped;
    }

    let crop_data =
        match copy_rgb_window(page_rgb, page_width, region_x, region_y, actual_w, actual_h) {
            Some(data) => data,
            // The buffer does not cover the whole region; no verdict possible.
            None => return RegionValidation::Skipped,
        };

    let pix = match kreuzberg_tesseract::Pix::from_raw_rgb(&crop_data, actual_w, actual_h) {
        Ok(p) => p,
        Err(_) => return RegionValidation::Skipped,
    };
    let gray = match pix.to_grayscale() {
        Ok(g) => g,
        Err(_) => return RegionValidation::Skipped,
    };
    // NOTE(review): the two `16` arguments are presumably the threshold tile
    // dimensions; confirm against the `kreuzberg_tesseract` API.
    let binary = match gray.adaptive_threshold(16, 16) {
        Ok(b) => b,
        Err(_) => return RegionValidation::Skipped,
    };
    // NOTE(review): `4` is presumably the pixel connectivity; confirm.
    let cc_count = match binary.count_connected_components(4) {
        Ok(c) => c,
        Err(_) => return RegionValidation::Skipped,
    };

    if cc_count >= MIN_TEXT_CC_COUNT {
        RegionValidation::HasContent
    } else {
        tracing::trace!(
            cc_count,
            region_w,
            region_h,
            "layout validation: region flagged as empty (few CCs)"
        );
        RegionValidation::Empty
    }
}
/// Validates every layout hint of a page against the rendered page image.
///
/// The result has one entry per element of `hints`, in the same order.
/// Only `Table` and `Picture` hints are inspected; every other class yields
/// [`RegionValidation::Skipped`]. Hint coordinates are scaled from the page
/// size in `page_result` to image pixels, with the vertical axis flipped
/// (`page_height_pts - y`). Regions covering more than half of the image area
/// are also reported as [`RegionValidation::Skipped`]; all remaining regions
/// are handed to `validate_region_has_text`.
#[cfg(feature = "layout-detection")]
pub(in crate::pdf::structure) fn validate_page_regions(
    page_image: &image::DynamicImage,
    hints: &[super::super::types::LayoutHint],
    page_result: &crate::pdf::layout_runner::PageLayoutResult,
) -> Vec<RegionValidation> {
    use super::super::types::LayoutHintClass;

    let raster = page_image.to_rgb8();
    let (width_px, height_px) = (raster.width(), raster.height());
    let pixels = raster.as_raw();

    // Loop-invariant conversion factors from page units to image pixels.
    let page_h_pts = page_result.page_height_pts;
    let scale_x = width_px as f32 / page_result.page_width_pts;
    let scale_y = height_px as f32 / page_h_pts;
    let half_page_area = width_px as f32 * height_px as f32 * 0.5;

    let mut verdicts = Vec::with_capacity(hints.len());
    for hint in hints {
        let is_checked_class =
            matches!(hint.class, LayoutHintClass::Table | LayoutHintClass::Picture);
        if !is_checked_class {
            verdicts.push(RegionValidation::Skipped);
            continue;
        }

        // Left/top edges are floored at 0, right/bottom edges are capped at
        // the image size; the float-to-int casts saturate.
        let left_px = (hint.left * scale_x).round().max(0.0) as u32;
        let top_px = ((page_h_pts - hint.top) * scale_y).round().max(0.0) as u32;
        let right_px = (hint.right * scale_x).round().min(width_px as f32) as u32;
        let bottom_px = ((page_h_pts - hint.bottom) * scale_y)
            .round()
            .min(height_px as f32) as u32;

        let region_w = right_px.saturating_sub(left_px);
        let region_h = bottom_px.saturating_sub(top_px);

        let region_area = region_w as f32 * region_h as f32;
        let verdict = if region_area > half_page_area {
            // Regions spanning most of the page are not validated.
            RegionValidation::Skipped
        } else {
            validate_region_has_text(
                pixels, width_px, height_px, left_px, top_px, region_w, region_h,
            )
        };
        verdicts.push(verdict);
    }
    verdicts
}