1mod empty_rects;
19
20use empty_rects::{FilterOverlapping, max_empty_rects};
21use rten_imageproc::bounding_rect;
22use rten_imageproc::{BoundingRect, Line, LineF, Point, Rect, RotatedRect};
23
24use crate::geom::{leftmost_edge, rightmost_edge};
25
26fn rects_separated_by_line(a: &RotatedRect, b: &RotatedRect, l: LineF) -> bool {
27 let a_to_b = LineF::from_endpoints(a.center(), b.center());
28 a_to_b.intersects(l)
29}
30
31pub fn group_into_lines(rects: &[RotatedRect], separators: &[LineF]) -> Vec<Vec<RotatedRect>> {
33 let mut sorted_rects: Vec<_> = rects.to_vec();
34 sorted_rects.sort_by_key(|r| r.bounding_rect().left() as i32);
35
36 let mut lines: Vec<Vec<RotatedRect>> = Vec::new();
37 let overlap_threshold = 5.;
38 let max_h_overlap = 5.;
39
40 while !sorted_rects.is_empty() {
41 let mut line = Vec::new();
42 line.push(sorted_rects.remove(0));
43
44 loop {
45 let last = line.last().unwrap();
46 let last_edge = rightmost_edge(last);
47
48 if let Some((i, next_item)) = sorted_rects
49 .iter()
50 .enumerate()
51 .filter(|(_, r)| {
52 let edge = leftmost_edge(r);
53 r.center().x > last.center().x
54 && edge.center().x - last_edge.center().x >= -max_h_overlap
55 && last_edge.vertical_overlap(edge) >= overlap_threshold
56 && !separators
57 .iter()
58 .any(|&s| rects_separated_by_line(last, r, s))
59 })
60 .min_by_key(|(_, r)| r.center().x as i32)
61 {
62 line.push(*next_item);
63 sorted_rects.remove(i);
64 } else {
65 break;
66 }
67 }
68 lines.push(line);
69 }
70 lines
71}
72
73fn find_block_separators(words: &[RotatedRect]) -> Vec<Rect> {
74 let Some(page_rect) = bounding_rect(words.iter()).map(|br| br.integral_bounding_rect()) else {
75 return Vec::new();
76 };
77
78 let mut lines = group_into_lines(words, &[]);
79 lines.sort_by_key(|l| l.first().unwrap().bounding_rect().top().round() as i32);
80
81 let mut all_word_spacings = Vec::new();
82 for line in lines {
83 if line.len() > 1 {
84 let mut spacings: Vec<_> = line
85 .iter()
86 .zip(line.iter().skip(1))
87 .map(|(cur, next)| {
88 (next.bounding_rect().left() - cur.bounding_rect().right())
89 .max(0.)
90 .round() as i32
91 })
92 .collect();
93 spacings.sort_unstable();
94 all_word_spacings.extend_from_slice(&spacings);
95 }
96 }
97 all_word_spacings.sort_unstable();
98
99 let median_word_spacing = all_word_spacings
100 .get(all_word_spacings.len() / 2)
101 .copied()
102 .unwrap_or(10);
103 let median_height = words
104 .get(words.len() / 2)
105 .map_or(10.0, |r| r.height())
106 .round() as i32;
107
108 let score = |r: Rect| {
109 let aspect_ratio = (r.height() as f32) / (r.width() as f32);
110 let aspect_ratio_weight = match aspect_ratio.log2().abs() {
111 r if r < 3. => 0.5,
112 r if r < 5. => 1.5,
113 r => r,
114 };
115 ((r.area() as f32) * aspect_ratio_weight).sqrt()
116 };
117
118 let object_bboxes: Vec<_> = words
119 .iter()
120 .map(|r| r.bounding_rect().integral_bounding_rect())
121 .collect();
122 let min_width = median_word_spacing * 3;
123 let min_height = (3 * median_height.max(0)) as u32;
124
125 max_empty_rects(
126 &object_bboxes,
127 page_rect,
128 score,
129 min_width.try_into().unwrap_or(1),
130 min_height,
131 )
132 .filter_overlapping(0.5)
133 .take(80)
134 .collect()
135}
136
137pub fn find_text_lines(words: &[RotatedRect]) -> Vec<Vec<RotatedRect>> {
139 let separators = find_block_separators(words);
140 let vertical_separators: Vec<_> = separators
141 .iter()
142 .map(|r| {
143 let center = r.center();
144 Line::from_endpoints(
145 Point::from_yx(r.top(), center.x).to_f32(),
146 Point::from_yx(r.bottom(), center.x).to_f32(),
147 )
148 })
149 .collect();
150
151 let horizontal_separators: Vec<_> = separators
152 .iter()
153 .map(|r| {
154 let center = r.center();
155 Line::from_endpoints(
156 Point::from_yx(center.y, r.left()).to_f32(),
157 Point::from_yx(center.y, r.right()).to_f32(),
158 )
159 })
160 .collect();
161
162 let mut lines = group_into_lines(words, &vertical_separators);
163
164 let midpoint_line = |words: &[RotatedRect]| -> LineF {
165 assert!(!words.is_empty());
166 Line::from_endpoints(
167 words.first().unwrap().bounding_rect().left_edge().center(),
168 words.last().unwrap().bounding_rect().right_edge().center(),
169 )
170 };
171
172 lines.sort_by_key(|words| midpoint_line(words).center().y as i32);
173
174 let is_separated_by = |line_a: LineF, line_b: LineF, separators: &[LineF]| -> bool {
175 let a_to_b = Line::from_endpoints(line_a.center(), line_b.center());
176 separators.iter().any(|sep| sep.intersects(a_to_b))
177 };
178
179 let mut paragraphs: Vec<Vec<Vec<RotatedRect>>> = Vec::new();
180 while !lines.is_empty() {
181 let seed = lines.remove(0);
182 let mut para = vec![seed.clone()];
183 let mut prev_line = midpoint_line(&seed);
184 let mut index = 0;
185 while index < lines.len() {
186 let candidate_line = midpoint_line(&lines[index]);
187 if prev_line.horizontal_overlap(candidate_line) > 0.
188 && !is_separated_by(prev_line, candidate_line, &horizontal_separators)
189 {
190 para.push(lines.remove(index));
191 prev_line = candidate_line;
192 } else {
193 index += 1;
194 }
195 }
196 paragraphs.push(para);
197 }
198
199 paragraphs
200 .into_iter()
201 .flat_map(|para| para.into_iter())
202 .collect()
203}