1use super::types::{ContentRect, MarginError, Result};
7use rayon::prelude::*;
8use std::path::{Path, PathBuf};
9
/// Multiplier applied to the interquartile range when building the outlier
/// fences: a value is treated as an outlier when it lies below
/// `Q1 - TUKEY_K * IQR` or above `Q3 + TUKEY_K * IQR`.
const TUKEY_K: f64 = 1.5;

/// Minimum share of the valid page boxes that must survive outlier filtering
/// for the filtered set to be used; below this, all valid boxes are used.
const MIN_INLIER_RATIO: f64 = 0.5;

/// Minimum number of page boxes that must survive outlier filtering for the
/// filtered set to be used; below this, all valid boxes are used.
const MIN_INLIER_COUNT: usize = 3;
22
23#[derive(Debug, Clone)]
29pub struct PageBoundingBox {
30 pub page_number: usize,
32 pub bounding_box: ContentRect,
34 pub is_odd: bool,
36}
37
38impl PageBoundingBox {
39 pub fn new(page_number: usize, bounding_box: ContentRect) -> Self {
41 Self {
42 page_number,
43 bounding_box,
44 is_odd: page_number % 2 == 1,
45 }
46 }
47
48 pub fn is_valid(&self) -> bool {
50 self.bounding_box.width > 0 && self.bounding_box.height > 0
51 }
52
53 pub fn right(&self) -> u32 {
55 self.bounding_box.x + self.bounding_box.width
56 }
57
58 pub fn bottom(&self) -> u32 {
60 self.bounding_box.y + self.bounding_box.height
61 }
62}
63
64#[derive(Debug, Clone, Default)]
66pub struct GroupCropRegion {
67 pub left: u32,
69 pub top: u32,
71 pub width: u32,
73 pub height: u32,
75 pub inlier_count: usize,
77 pub total_count: usize,
79}
80
81impl GroupCropRegion {
82 pub fn is_valid(&self) -> bool {
84 self.width > 0 && self.height > 0
85 }
86
87 pub fn right(&self) -> u32 {
89 self.left + self.width
90 }
91
92 pub fn bottom(&self) -> u32 {
94 self.top + self.height
95 }
96
97 pub fn to_content_rect(&self) -> ContentRect {
99 ContentRect {
100 x: self.left,
101 y: self.top,
102 width: self.width,
103 height: self.height,
104 }
105 }
106}
107
/// Pair of crop regions produced by the odd/even unification step: one for
/// odd-numbered pages and one for even-numbered pages.
#[derive(Debug, Clone)]
pub struct UnifiedCropRegions {
    /// Crop region computed from the odd-numbered pages.
    pub odd_region: GroupCropRegion,
    /// Crop region computed from the even-numbered pages.
    pub even_region: GroupCropRegion,
}
116
117pub struct GroupCropAnalyzer;
123
124impl GroupCropAnalyzer {
125 pub fn decide_group_crop_region(bounding_boxes: &[PageBoundingBox]) -> GroupCropRegion {
134 if bounding_boxes.is_empty() {
136 return GroupCropRegion::default();
137 }
138
139 let valid: Vec<&PageBoundingBox> = bounding_boxes.iter().filter(|b| b.is_valid()).collect();
141
142 if valid.is_empty() {
143 return GroupCropRegion::default();
144 }
145
146 let mut lefts: Vec<u32> = valid.iter().map(|b| b.bounding_box.x).collect();
148 let mut tops: Vec<u32> = valid.iter().map(|b| b.bounding_box.y).collect();
149 let mut rights: Vec<u32> = valid.iter().map(|b| b.right()).collect();
150 let mut bottoms: Vec<u32> = valid.iter().map(|b| b.bottom()).collect();
151
152 lefts.sort_unstable();
153 tops.sort_unstable();
154 rights.sort_unstable();
155 bottoms.sort_unstable();
156
157 let (q1_l, q3_l, iqr_l) = Self::calculate_iqr(&lefts);
159 let (q1_t, q3_t, iqr_t) = Self::calculate_iqr(&tops);
160 let (q1_r, q3_r, iqr_r) = Self::calculate_iqr(&rights);
161 let (q1_b, q3_b, iqr_b) = Self::calculate_iqr(&bottoms);
162
163 let inliers: Vec<&PageBoundingBox> = valid
165 .iter()
166 .filter(|b| {
167 !Self::is_outlier(b.bounding_box.x, q1_l, q3_l, iqr_l)
168 && !Self::is_outlier(b.bounding_box.y, q1_t, q3_t, iqr_t)
169 && !Self::is_outlier(b.right(), q1_r, q3_r, iqr_r)
170 && !Self::is_outlier(b.bottom(), q1_b, q3_b, iqr_b)
171 })
172 .copied()
173 .collect();
174
175 let use_inliers = if inliers.len() >= MIN_INLIER_COUNT
177 && inliers.len() as f64 >= valid.len() as f64 * MIN_INLIER_RATIO
178 {
179 inliers
180 } else {
181 valid
182 };
183
184 let lefts: Vec<u32> = use_inliers.iter().map(|b| b.bounding_box.x).collect();
186 let tops: Vec<u32> = use_inliers.iter().map(|b| b.bounding_box.y).collect();
187 let rights: Vec<u32> = use_inliers.iter().map(|b| b.right()).collect();
188 let bottoms: Vec<u32> = use_inliers.iter().map(|b| b.bottom()).collect();
189
190 let left = Self::median_u32(&lefts);
191 let top = Self::median_u32(&tops);
192 let right = Self::median_u32(&rights);
193 let bottom = Self::median_u32(&bottoms);
194
195 let width = right.saturating_sub(left);
197 let height = bottom.saturating_sub(top);
198
199 GroupCropRegion {
200 left,
201 top,
202 width,
203 height,
204 inlier_count: use_inliers.len(),
205 total_count: bounding_boxes.len(),
206 }
207 }
208
209 pub fn unify_odd_even_regions(bounding_boxes: &[PageBoundingBox]) -> UnifiedCropRegions {
211 Self::unify_and_expand_regions(bounding_boxes, 0, 0, 0)
212 }
213
214 pub fn unify_and_expand_regions(
223 bounding_boxes: &[PageBoundingBox],
224 margin_percent: u32,
225 max_width: u32,
226 max_height: u32,
227 ) -> UnifiedCropRegions {
228 let odd_boxes: Vec<PageBoundingBox> = bounding_boxes
230 .iter()
231 .filter(|b| b.is_odd)
232 .cloned()
233 .collect();
234 let even_boxes: Vec<PageBoundingBox> = bounding_boxes
235 .iter()
236 .filter(|b| !b.is_odd)
237 .cloned()
238 .collect();
239
240 let mut odd_region = Self::decide_group_crop_region(&odd_boxes);
242 let mut even_region = Self::decide_group_crop_region(&even_boxes);
243
244 if odd_region.is_valid() && even_region.is_valid() {
246 let unified_top = odd_region.top.min(even_region.top);
247 let unified_bottom = odd_region.bottom().max(even_region.bottom());
248
249 odd_region.top = unified_top;
250 odd_region.height = unified_bottom.saturating_sub(unified_top);
251
252 even_region.top = unified_top;
253 even_region.height = unified_bottom.saturating_sub(unified_top);
254 }
255
256 if odd_region.is_valid() && even_region.is_valid() {
258 let target_width = odd_region.width.max(even_region.width);
259 let target_height = odd_region.height.max(even_region.height);
260
261 let expanded_width = target_width + target_width * margin_percent / 100;
263 let expanded_height = target_height + target_height * margin_percent / 100;
264
265 let final_width = if max_width > 0 {
267 expanded_width.min(max_width)
268 } else {
269 expanded_width
270 };
271 let final_height = if max_height > 0 {
272 expanded_height.min(max_height)
273 } else {
274 expanded_height
275 };
276
277 Self::expand_region_centered(&mut odd_region, final_width, final_height, max_width, max_height);
279
280 Self::expand_region_centered(&mut even_region, final_width, final_height, max_width, max_height);
282 }
283
284 UnifiedCropRegions {
285 odd_region,
286 even_region,
287 }
288 }
289
290 fn expand_region_centered(
292 region: &mut GroupCropRegion,
293 target_width: u32,
294 target_height: u32,
295 max_width: u32,
296 max_height: u32,
297 ) {
298 if region.width < target_width {
299 let dw = target_width - region.width;
300 let new_left = region.left.saturating_sub(dw / 2);
301
302 let clamped_left = if max_width > 0 {
304 new_left.min(max_width.saturating_sub(target_width))
305 } else {
306 new_left
307 };
308
309 region.left = clamped_left;
310 region.width = target_width;
311 }
312
313 if region.height < target_height {
314 let dh = target_height - region.height;
315 let new_top = region.top.saturating_sub(dh / 2);
316
317 let clamped_top = if max_height > 0 {
319 new_top.min(max_height.saturating_sub(target_height))
320 } else {
321 new_top
322 };
323
324 region.top = clamped_top;
325 region.height = target_height;
326 }
327 }
328
329 fn calculate_iqr(sorted_values: &[u32]) -> (f64, f64, f64) {
332 if sorted_values.is_empty() {
333 return (0.0, 0.0, 1.0);
334 }
335
336 let q1 = Self::percentile(sorted_values, 0.25);
337 let q3 = Self::percentile(sorted_values, 0.75);
338 let iqr = (q3 - q1).max(1.0); (q1, q3, iqr)
341 }
342
343 fn is_outlier(value: u32, q1: f64, q3: f64, iqr: f64) -> bool {
345 let v = value as f64;
346 v < q1 - TUKEY_K * iqr || v > q3 + TUKEY_K * iqr
347 }
348
349 fn percentile(sorted_values: &[u32], p: f64) -> f64 {
352 if sorted_values.is_empty() {
353 return 0.0;
354 }
355 if sorted_values.len() == 1 {
356 return sorted_values[0] as f64;
357 }
358
359 let idx = p * (sorted_values.len() - 1) as f64;
360 let lo = idx.floor() as usize;
361 let hi = idx.ceil() as usize;
362
363 if lo == hi {
364 sorted_values[lo] as f64
365 } else {
366 let frac = idx - lo as f64;
367 sorted_values[lo] as f64 + (sorted_values[hi] as f64 - sorted_values[lo] as f64) * frac
368 }
369 }
370
371 fn median_u32(values: &[u32]) -> u32 {
373 if values.is_empty() {
374 return 0;
375 }
376
377 let mut sorted = values.to_vec();
378 sorted.sort_unstable();
379
380 let n = sorted.len();
381 if n % 2 == 1 {
382 sorted[n / 2]
383 } else {
384 (sorted[n / 2 - 1] + sorted[n / 2]) / 2
385 }
386 }
387
388 pub fn detect_text_bounding_box(
393 image_path: &Path,
394 background_threshold: u8,
395 ) -> Result<ContentRect> {
396 if !image_path.exists() {
397 return Err(MarginError::ImageNotFound(image_path.to_path_buf()));
398 }
399
400 let img = image::open(image_path).map_err(|e| MarginError::InvalidImage(e.to_string()))?;
401 let gray = img.to_luma8();
402 let (width, height) = gray.dimensions();
403
404 let mut min_x = width;
405 let mut max_x = 0u32;
406 let mut min_y = height;
407 let mut max_y = 0u32;
408
409 for y in 0..height {
411 for x in 0..width {
412 let pixel = gray.get_pixel(x, y);
413 if pixel.0[0] < background_threshold {
414 min_x = min_x.min(x);
415 max_x = max_x.max(x);
416 min_y = min_y.min(y);
417 max_y = max_y.max(y);
418 }
419 }
420 }
421
422 if min_x > max_x || min_y > max_y {
424 return Err(MarginError::NoContentDetected);
425 }
426
427 Ok(ContentRect {
428 x: min_x,
429 y: min_y,
430 width: max_x - min_x + 1,
431 height: max_y - min_y + 1,
432 })
433 }
434
435 pub fn detect_all_bounding_boxes(
437 image_paths: &[PathBuf],
438 background_threshold: u8,
439 ) -> Vec<PageBoundingBox> {
440 image_paths
441 .par_iter()
442 .enumerate()
443 .filter_map(|(idx, path)| {
444 match Self::detect_text_bounding_box(path, background_threshold) {
445 Ok(bbox) => Some(PageBoundingBox::new(idx + 1, bbox)),
446 Err(_) => None,
447 }
448 })
449 .collect()
450 }
451}
452
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a `ContentRect` literal.
    fn rect(x: u32, y: u32, width: u32, height: u32) -> ContentRect {
        ContentRect {
            x,
            y,
            width,
            height,
        }
    }

    /// Shorthand for a page box with the given page number and geometry.
    fn page(number: usize, x: u32, y: u32, width: u32, height: u32) -> PageBoundingBox {
        PageBoundingBox::new(number, rect(x, y, width, height))
    }

    /// Shorthand for a fully specified crop region.
    fn region(
        left: u32,
        top: u32,
        width: u32,
        height: u32,
        inlier_count: usize,
        total_count: usize,
    ) -> GroupCropRegion {
        GroupCropRegion {
            left,
            top,
            width,
            height,
            inlier_count,
            total_count,
        }
    }

    #[test]
    fn test_page_bounding_box_creation() {
        let bbox = PageBoundingBox::new(1, rect(100, 50, 800, 1200));

        assert_eq!(bbox.page_number, 1);
        assert!(bbox.is_odd);
        assert!(bbox.is_valid());
        assert_eq!(bbox.right(), 900);
        assert_eq!(bbox.bottom(), 1250);
    }

    #[test]
    fn test_page_bounding_box_even_page() {
        let bbox = PageBoundingBox::new(2, rect(100, 50, 800, 1200));

        assert_eq!(bbox.page_number, 2);
        assert!(!bbox.is_odd);
    }

    #[test]
    fn test_group_crop_region_valid() {
        let crop = region(100, 50, 800, 1200, 10, 12);

        assert!(crop.is_valid());
        assert_eq!(crop.right(), 900);
        assert_eq!(crop.bottom(), 1250);
    }

    #[test]
    fn test_group_crop_region_invalid() {
        // Zero width makes the region invalid regardless of its height.
        let crop = region(100, 50, 0, 1200, 0, 0);

        assert!(!crop.is_valid());
    }

    #[test]
    fn test_decide_group_crop_empty() {
        let outcome = GroupCropAnalyzer::decide_group_crop_region(&[]);

        assert!(!outcome.is_valid());
        assert_eq!(outcome.inlier_count, 0);
    }

    #[test]
    fn test_decide_group_crop_single_page() {
        let pages = vec![page(1, 100, 50, 800, 1200)];

        let outcome = GroupCropAnalyzer::decide_group_crop_region(&pages);

        assert!(outcome.is_valid());
        assert_eq!(outcome.left, 100);
        assert_eq!(outcome.top, 50);
        assert_eq!(outcome.width, 800);
        assert_eq!(outcome.height, 1200);
    }

    #[test]
    fn test_decide_group_crop_multiple_pages() {
        let pages = vec![
            page(1, 100, 50, 800, 1200),
            page(2, 105, 55, 790, 1190),
            page(3, 95, 45, 810, 1210),
        ];

        let outcome = GroupCropAnalyzer::decide_group_crop_region(&pages);

        assert!(outcome.is_valid());
        assert_eq!(outcome.inlier_count, 3);
        assert!((outcome.left as i32 - 100).abs() <= 5);
        assert!((outcome.top as i32 - 50).abs() <= 5);
    }

    #[test]
    fn test_decide_group_crop_with_outlier() {
        // Page 5 sits far away from the other four pages.
        let pages = vec![
            page(1, 100, 50, 800, 1200),
            page(2, 105, 55, 790, 1190),
            page(3, 95, 45, 810, 1210),
            page(4, 100, 50, 800, 1200),
            page(5, 500, 500, 200, 200),
        ];

        let outcome = GroupCropAnalyzer::decide_group_crop_region(&pages);

        assert!(outcome.is_valid());
        assert!(outcome.inlier_count <= pages.len());
    }

    #[test]
    fn test_unify_odd_even_regions() {
        let pages = vec![
            page(1, 100, 50, 800, 1200),
            page(2, 150, 60, 750, 1180),
            page(3, 105, 55, 795, 1195),
            page(4, 155, 65, 745, 1175),
        ];

        let unified = GroupCropAnalyzer::unify_odd_even_regions(&pages);

        assert!(unified.odd_region.is_valid());
        assert_eq!(unified.odd_region.total_count, 2);

        assert!(unified.even_region.is_valid());
        assert_eq!(unified.even_region.total_count, 2);
    }

    #[test]
    fn test_group_crop_region_to_content_rect() {
        let converted = region(100, 50, 800, 1200, 5, 5).to_content_rect();

        assert_eq!(converted.x, 100);
        assert_eq!(converted.y, 50);
        assert_eq!(converted.width, 800);
        assert_eq!(converted.height, 1200);
    }

    #[test]
    fn test_tc_margin_001_uniform_margins_detected() {
        // Four pages with exactly the same content box.
        let pages: Vec<PageBoundingBox> = (1..=4).map(|n| page(n, 100, 100, 800, 1000)).collect();

        let outcome = GroupCropAnalyzer::decide_group_crop_region(&pages);

        assert!(outcome.is_valid());
        assert_eq!(outcome.left, 100);
        assert_eq!(outcome.top, 100);
        assert_eq!(outcome.width, 800);
        assert_eq!(outcome.height, 1000);
        assert_eq!(outcome.inlier_count, 4);
    }

    #[test]
    fn test_tc_margin_002_nonuniform_margins_unified() {
        let pages = vec![
            page(1, 100, 90, 800, 1000),
            page(2, 110, 100, 790, 990),
            page(3, 95, 95, 805, 1005),
            page(4, 105, 105, 795, 995),
        ];

        let outcome = GroupCropAnalyzer::decide_group_crop_region(&pages);

        assert!(outcome.is_valid());
        // The result must stay within the range spanned by the inputs.
        assert!((95..=110).contains(&outcome.left));
        assert!((90..=105).contains(&outcome.top));
    }

    #[test]
    fn test_tc_margin_003_no_margins() {
        // Content starts at the page origin on both pages.
        let pages = vec![page(1, 0, 0, 1000, 1200), page(2, 0, 0, 1000, 1200)];

        let outcome = GroupCropAnalyzer::decide_group_crop_region(&pages);

        assert!(outcome.is_valid());
        assert_eq!(outcome.left, 0);
        assert_eq!(outcome.top, 0);
    }

    #[test]
    fn test_tc_margin_004_outlier_exclusion_tukey() {
        // Page 5 differs strongly from the four near-identical pages.
        let pages = vec![
            page(1, 100, 100, 800, 1000),
            page(2, 102, 98, 798, 1002),
            page(3, 101, 101, 799, 999),
            page(4, 99, 99, 801, 1001),
            page(5, 300, 300, 400, 600),
        ];

        let outcome = GroupCropAnalyzer::decide_group_crop_region(&pages);

        assert!(outcome.is_valid());
        assert!(outcome.inlier_count <= outcome.total_count);
        assert!(outcome.left < 200);
    }

    #[test]
    fn test_tc_margin_005_odd_even_separate_regions() {
        // Odd pages start further right than even pages.
        let pages = vec![
            page(1, 120, 100, 780, 1000),
            page(2, 100, 100, 780, 1000),
            page(3, 122, 102, 778, 998),
            page(4, 98, 98, 782, 1002),
        ];

        let unified = GroupCropAnalyzer::unify_odd_even_regions(&pages);

        assert!(unified.odd_region.is_valid());
        assert!(unified.even_region.is_valid());

        assert!(unified.odd_region.left >= unified.even_region.left);

        assert_eq!(unified.odd_region.total_count, 2);
        assert_eq!(unified.even_region.total_count, 2);
    }
}