1use std::collections::HashMap;
11
12use crate::model::{ColumnMode, DocBlock, Fragment, Line, Page, Span};
13
14#[derive(Debug, Clone, serde::Serialize)]
16pub struct PageStats {
17 pub page: u32,
18 pub lines: usize,
19 pub chars: usize,
20 pub two_column: bool,
21}
22
23pub struct Reconstruction {
24 pub blocks: Vec<DocBlock>,
25 pub pages: Vec<PageStats>,
26}
27
28pub fn reconstruct(pages: &[Page], columns: ColumnMode) -> Reconstruction {
29 let body_size = body_font_size(pages);
30 let heading_levels = heading_levels(pages, body_size);
31
32 let mut blocks: Vec<DocBlock> = Vec::new();
33 let mut stats = Vec::new();
34
35 for page in pages {
36 let lines = merge_fragments_into_lines(page);
37 let two_column = match columns {
38 ColumnMode::Single => false,
39 ColumnMode::Two => true,
40 ColumnMode::Auto => detect_two_columns(page, &lines),
41 };
42 let ordered = if two_column {
43 order_two_column(page, &lines)
44 } else {
45 let mut ordered = lines.clone();
46 ordered.sort_by_key(|line| (line.top, line.left));
47 ordered
48 };
49
50 stats.push(PageStats {
51 page: page.number,
52 lines: ordered.len(),
53 chars: ordered.iter().map(Line::char_count).sum(),
54 two_column,
55 });
56
57 let page_blocks = cluster_paragraphs(&ordered, body_size, &heading_levels);
58 append_with_continuation(&mut blocks, page_blocks);
59 }
60
61 Reconstruction {
62 blocks,
63 pages: stats,
64 }
65}
66
67fn merge_fragments_into_lines(page: &Page) -> Vec<Line> {
70 let mut fragments: Vec<&Fragment> = page
74 .fragments
75 .iter()
76 .filter(|fragment| fragment.width > 0)
77 .collect();
78 fragments.sort_by_key(|fragment| (fragment.top, fragment.left));
79
80 let mut lines: Vec<Line> = Vec::new();
81 for fragment in fragments {
82 let size = page
83 .font_sizes
84 .get(&fragment.font)
85 .copied()
86 .unwrap_or(fragment.height.unsigned_abs());
87 let same_line = lines.last().is_some_and(|line| {
88 let tolerance = (line.height.max(fragment.height) as f32 * 0.5) as i32;
89 let max_join_gap = line.height.max(fragment.height) * 5 / 4;
95 (fragment.top - line.top).abs() <= tolerance
96 && fragment.left >= line.right - 2
97 && fragment.left - line.right <= max_join_gap
98 });
99
100 if same_line {
101 let line = lines.last_mut().expect("checked above");
102 let gap = fragment.left - line.right;
103 if gap > 1 && !line.spans.last().is_some_and(|s| s.text.ends_with(' ')) {
104 push_joined(&mut line.spans, " ", false, false);
105 }
106 for span in &fragment.spans {
107 push_joined(&mut line.spans, &span.text, span.bold, span.italic);
108 }
109 line.right = line.right.max(fragment.right());
110 line.height = line.height.max(fragment.height);
111 line.font_size = line.font_size.max(size);
112 } else {
113 lines.push(Line {
114 top: fragment.top,
115 left: fragment.left,
116 right: fragment.right(),
117 height: fragment.height,
118 font_size: size,
119 spans: fragment.spans.clone(),
120 });
121 }
122 }
123
124 lines.retain(|line| !line.text().trim().is_empty());
125 lines
126}
127
128fn push_joined(spans: &mut Vec<Span>, text: &str, bold: bool, italic: bool) {
129 if text.is_empty() {
130 return;
131 }
132 if let Some(last) = spans.last_mut()
133 && last.bold == bold
134 && last.italic == italic
135 {
136 last.text.push_str(text);
137 return;
138 }
139 spans.push(Span {
140 text: text.to_string(),
141 bold,
142 italic,
143 });
144}
145
146fn detect_two_columns(page: &Page, lines: &[Line]) -> bool {
149 if lines.len() < 8 {
150 return false;
151 }
152 let content_left = lines.iter().map(|line| line.left).min().unwrap_or(0);
153 let content_right = lines
154 .iter()
155 .map(|line| line.right)
156 .max()
157 .unwrap_or(page.width);
158 let content_width = (content_right - content_left).max(1);
159 let mid = content_left + content_width / 2;
160 let slack = content_width / 20;
161
162 let column_lines: Vec<&Line> = lines
163 .iter()
164 .filter(|line| line.width() < (content_width as f32 * 0.62) as i32)
165 .collect();
166 if column_lines.len() < 6 {
167 return false;
168 }
169
170 let left = column_lines
171 .iter()
172 .filter(|line| line.right <= mid + slack)
173 .count();
174 let right = column_lines
175 .iter()
176 .filter(|line| line.left >= mid - slack)
177 .count();
178 let crossing = column_lines.len().saturating_sub(left + right);
179
180 left >= 3 && right >= 3 && crossing * 10 <= column_lines.len()
181}
182
183fn order_two_column(page: &Page, lines: &[Line]) -> Vec<Line> {
187 let content_left = lines.iter().map(|line| line.left).min().unwrap_or(0);
188 let content_right = lines
189 .iter()
190 .map(|line| line.right)
191 .max()
192 .unwrap_or(page.width);
193 let content_width = (content_right - content_left).max(1);
194 let mid = content_left + content_width / 2;
195
196 let mut sorted: Vec<&Line> = lines.iter().collect();
197 sorted.sort_by_key(|line| (line.top, line.left));
198
199 let mut ordered = Vec::with_capacity(lines.len());
200 let mut band_left: Vec<&Line> = Vec::new();
201 let mut band_right: Vec<&Line> = Vec::new();
202
203 let flush =
204 |ordered: &mut Vec<Line>, band_left: &mut Vec<&Line>, band_right: &mut Vec<&Line>| {
205 for line in band_left.drain(..) {
206 ordered.push(line.clone());
207 }
208 for line in band_right.drain(..) {
209 ordered.push(line.clone());
210 }
211 };
212
213 for line in sorted {
214 if line.width() >= (content_width as f32 * 0.62) as i32 {
215 flush(&mut ordered, &mut band_left, &mut band_right);
216 ordered.push(line.clone());
217 } else if line.left + line.width() / 2 <= mid {
218 band_left.push(line);
219 } else {
220 band_right.push(line);
221 }
222 }
223 flush(&mut ordered, &mut band_left, &mut band_right);
224
225 ordered
226}
227
228fn body_font_size(pages: &[Page]) -> u32 {
230 let mut weights: HashMap<u32, usize> = HashMap::new();
231 for page in pages {
232 for fragment in &page.fragments {
233 let size = page
234 .font_sizes
235 .get(&fragment.font)
236 .copied()
237 .unwrap_or(fragment.height.unsigned_abs());
238 *weights.entry(size).or_default() += fragment.char_count();
239 }
240 }
241 weights
242 .into_iter()
243 .max_by_key(|(_, chars)| *chars)
244 .map(|(size, _)| size)
245 .unwrap_or(12)
246}
247
248fn heading_levels(pages: &[Page], body_size: u32) -> HashMap<u32, u8> {
251 let mut sizes: Vec<u32> = pages
252 .iter()
253 .flat_map(|page| page.font_sizes.values().copied())
254 .filter(|size| *size as f32 >= body_size as f32 * 1.15 && *size >= body_size + 2)
255 .collect();
256 sizes.sort_unstable_by(|a, b| b.cmp(a));
257 sizes.dedup();
258 sizes
259 .into_iter()
260 .take(3)
261 .enumerate()
262 .map(|(rank, size)| (size, rank as u8 + 1))
263 .collect()
264}
265
266fn cluster_paragraphs(
268 lines: &[Line],
269 body_size: u32,
270 heading_levels: &HashMap<u32, u8>,
271) -> Vec<DocBlock> {
272 let median_gap = median_line_gap(lines);
273 let mut blocks = Vec::new();
274 let mut current: Vec<Span> = Vec::new();
275 let mut current_heading: Option<u8> = None;
276 let mut previous: Option<&Line> = None;
277
278 let flush = |spans: &mut Vec<Span>, heading: &mut Option<u8>, blocks: &mut Vec<DocBlock>| {
279 if spans.is_empty() {
280 return;
281 }
282 let spans = std::mem::take(spans);
283 match heading.take() {
284 Some(level) => blocks.push(DocBlock::Heading { level, spans }),
285 None => blocks.push(DocBlock::Paragraph { spans }),
286 }
287 };
288
289 for line in lines {
290 let heading = heading_levels.get(&line.font_size).copied();
291 let new_block = match previous {
292 None => true,
293 Some(prev) => {
294 let gap = line.top - prev.top;
295 let size_changed = line.font_size != prev.font_size
296 && (heading.is_some()
297 || heading_levels.contains_key(&prev.font_size)
298 || line.font_size.abs_diff(prev.font_size) >= 2);
299 let large_gap = gap > (median_gap as f32 * 1.8) as i32 || gap < 0;
300 let indented =
301 line.left > prev.left + (body_size as i32) / 2 && prev.left <= line.left; size_changed || large_gap || indented
303 }
304 };
305
306 if new_block {
307 flush(&mut current, &mut current_heading, &mut blocks);
308 current_heading = heading;
309 }
310 join_line_into(&mut current, line);
311 previous = Some(line);
312 }
313 flush(&mut current, &mut current_heading, &mut blocks);
314
315 blocks
316}
317
318fn join_line_into(buffer: &mut Vec<Span>, line: &Line) {
321 if buffer.is_empty() {
322 buffer.extend(line.spans.iter().cloned());
323 return;
324 }
325
326 let next_starts_lower = line
327 .text()
328 .chars()
329 .next()
330 .is_some_and(|ch| ch.is_lowercase());
331 let hyphenated = buffer
332 .last()
333 .is_some_and(|span| span.text.trim_end().ends_with('-'));
334
335 if hyphenated && next_starts_lower {
336 if let Some(last) = buffer.last_mut() {
337 let trimmed = last.text.trim_end();
338 let without = trimmed[..trimmed.len() - 1].to_string();
339 last.text = without;
340 }
341 } else if !buffer.last().is_some_and(|span| span.text.ends_with(' ')) {
342 push_joined(buffer, " ", false, false);
343 }
344 for span in &line.spans {
345 push_joined(buffer, &span.text, span.bold, span.italic);
346 }
347}
348
349fn median_line_gap(lines: &[Line]) -> i32 {
350 let mut gaps: Vec<i32> = lines
351 .windows(2)
352 .filter_map(|pair| {
353 let gap = pair[1].top - pair[0].top;
354 let height = pair[0].height.max(1);
355 (gap >= height / 2 && gap <= height * 4).then_some(gap)
360 })
361 .collect();
362 if gaps.is_empty() {
363 return 16;
364 }
365 gaps.sort_unstable();
366 gaps[(gaps.len() - 1) / 2]
369}
370
371fn append_with_continuation(blocks: &mut Vec<DocBlock>, mut incoming: Vec<DocBlock>) {
375 if let (Some(DocBlock::Paragraph { spans: tail }), Some(DocBlock::Paragraph { spans: head })) =
376 (blocks.last_mut(), incoming.first_mut())
377 {
378 let tail_text: String = tail.iter().map(|span| span.text.as_str()).collect();
379 let head_text: String = head.iter().map(|span| span.text.as_str()).collect();
380 let continues = !tail_text
381 .trim_end()
382 .ends_with(['.', '!', '?', ':', ';', '"', '\u{201d}', '\u{2019}'])
383 && head_text.chars().next().is_some_and(|ch| ch.is_lowercase());
384 if continues {
385 let hyphenated = tail_text.trim_end().ends_with('-');
386 if hyphenated {
387 if let Some(last) = tail.last_mut() {
388 let trimmed = last.text.trim_end();
389 last.text = trimmed[..trimmed.len() - 1].to_string();
390 }
391 } else if !tail.last().is_some_and(|span| span.text.ends_with(' ')) {
392 push_joined(tail, " ", false, false);
393 }
394 for span in head.drain(..) {
395 push_joined(tail, &span.text, span.bold, span.italic);
396 }
397 incoming.remove(0);
398 }
399 }
400 blocks.append(&mut incoming);
401}
402
403#[cfg(test)]
404mod tests {
405 use super::*;
406 use crate::parse::parse_pdf2xml;
407
408 fn fixture_two_column() -> String {
409 let mut texts = String::new();
413 texts.push_str(
414 r#"<text top="80" left="150" width="620" height="26" font="0">A Study of <i>Synthetic</i> Layouts</text>"#,
415 );
416 texts.push_str(
417 r#"<text top="140" left="120" width="680" height="13" font="1">Abstract spanning the full width of the page for testing purposes.</text>"#,
418 );
419 for (i, words) in [
420 "Left column opening line about num-",
421 "bers and continuation of thought.",
422 "Another left line here.",
423 ]
424 .iter()
425 .enumerate()
426 {
427 texts.push_str(&format!(
428 r#"<text top="{}" left="100" width="330" height="12" font="1">{}</text>"#,
429 220 + i * 16,
430 words
431 ));
432 }
433 for (i, words) in [
434 "Right column first line.",
435 "Right column second line.",
436 "Right column third line.",
437 ]
438 .iter()
439 .enumerate()
440 {
441 texts.push_str(&format!(
442 r#"<text top="{}" left="480" width="330" height="12" font="1">{}</text>"#,
443 220 + i * 16,
444 words
445 ));
446 }
447 format!(
448 r#"<?xml version="1.0"?>
449<pdf2xml producer="poppler">
450<page number="1" width="918" height="1188">
451<fontspec id="0" size="22" family="T"/>
452<fontspec id="1" size="11" family="T"/>
453{texts}
454</page>
455</pdf2xml>"#
456 )
457 }
458
459 #[test]
460 fn two_column_page_reads_title_left_then_right() {
461 let pages = parse_pdf2xml(&fixture_two_column()).expect("fixture parses");
462 let result = reconstruct(&pages, ColumnMode::Auto);
463
464 assert!(result.pages[0].two_column, "page must detect two columns");
465 let texts: Vec<String> = result.blocks.iter().map(DocBlock::text).collect();
466
467 assert!(
468 matches!(&result.blocks[0], DocBlock::Heading { level: 1, .. }),
469 "title should be an h1, got {:?}",
470 result.blocks[0]
471 );
472 assert_eq!(texts[0], "A Study of Synthetic Layouts");
473 assert!(texts[1].starts_with("Abstract spanning"));
474 assert!(
475 texts[2].contains("numbers and continuation"),
476 "hyphenated line join must repair the soft hyphen, got: {}",
477 texts[2]
478 );
479 assert!(
480 texts[2].contains("Another left line"),
481 "left column must precede right, got: {texts:?}"
482 );
483 assert!(texts[3].starts_with("Right column first"));
484 }
485
486 #[test]
487 fn single_column_page_clusters_paragraphs_by_gap() {
488 let xml = r#"<?xml version="1.0"?>
489<pdf2xml><page number="1" width="600" height="800">
490<fontspec id="0" size="11" family="T"/>
491<text top="100" left="80" width="440" height="12" font="0">First paragraph line one.</text>
492<text top="116" left="80" width="440" height="12" font="0">First paragraph line two.</text>
493<text top="170" left="80" width="440" height="12" font="0">Second paragraph after a large gap.</text>
494</page></pdf2xml>"#;
495 let pages = parse_pdf2xml(xml).expect("fixture parses");
496 let result = reconstruct(&pages, ColumnMode::Auto);
497
498 let texts: Vec<String> = result.blocks.iter().map(DocBlock::text).collect();
499 assert_eq!(
500 texts,
501 vec![
502 "First paragraph line one. First paragraph line two.".to_string(),
503 "Second paragraph after a large gap.".to_string(),
504 ]
505 );
506 }
507
508 #[test]
509 fn narrow_gutter_does_not_merge_columns_on_aligned_baselines() {
510 let xml = r#"<?xml version="1.0"?>
515<pdf2xml><page number="1" width="892" height="1262">
516<fontspec id="0" size="15" family="T"/>
517<text top="100" left="108" width="327" height="15" font="0">left one alpha beta gamma delta.</text>
518<text top="120" left="108" width="327" height="15" font="0">left two epsilon zeta eta theta.</text>
519<text top="100" left="461" width="327" height="15" font="0">right one iota kappa lambda mu.</text>
520<text top="120" left="461" width="327" height="15" font="0">right two nu xi omicron pi rho.</text>
521<text top="140" left="108" width="327" height="15" font="0">left three sigma tau upsilon phi.</text>
522<text top="140" left="461" width="327" height="15" font="0">right three chi psi omega end.</text>
523<text top="160" left="108" width="327" height="15" font="0">left four extra line for detection.</text>
524<text top="160" left="461" width="327" height="15" font="0">right four extra line as well.</text>
525</page></pdf2xml>"#;
526 let pages = parse_pdf2xml(xml).expect("fixture parses");
527 let result = reconstruct(&pages, ColumnMode::Two);
528
529 let all_text: String = result
530 .blocks
531 .iter()
532 .map(|block| block.text())
533 .collect::<Vec<_>>()
534 .join("\n");
535 let left_pos = all_text.find("left four").expect("left text present");
536 let right_pos = all_text.find("right one").expect("right text present");
537 assert!(
538 left_pos < right_pos,
539 "every left-column line must precede the right column, got:\n{all_text}"
540 );
541 assert!(
542 !all_text.contains("alpha beta gamma delta. right one"),
543 "aligned baselines must not merge across the gutter:\n{all_text}"
544 );
545 }
546
547 #[test]
548 fn paragraph_continues_across_pages() {
549 let xml = r#"<?xml version="1.0"?>
550<pdf2xml>
551<page number="1" width="600" height="800">
552<fontspec id="0" size="11" family="T"/>
553<text top="700" left="80" width="440" height="12" font="0">This sentence does not end</text>
554</page>
555<page number="2" width="600" height="800">
556<fontspec id="0" size="11" family="T"/>
557<text top="80" left="80" width="440" height="12" font="0">until the following page.</text>
558</page>
559</pdf2xml>"#;
560 let pages = parse_pdf2xml(xml).expect("fixture parses");
561 let result = reconstruct(&pages, ColumnMode::Auto);
562
563 let texts: Vec<String> = result.blocks.iter().map(DocBlock::text).collect();
564 assert_eq!(
565 texts,
566 vec!["This sentence does not end until the following page.".to_string()]
567 );
568 }
569}