// papyrus_core/detector/mod.rs

use std::collections::HashMap;

use crate::ast::{Document, DocumentMetadata, Node, Span, Warning};
use crate::parser::{strip_subset_prefix, FontInfo, RawTextSegment};

/// Configuration for the structure-detection pass.
///
/// All thresholds are relative to the computed body font size.
#[derive(Debug, Clone, PartialEq)]
pub struct DetectorConfig {
    /// Minimum `font_size / body_size` ratio at which a segment is classified
    /// as a heading.
    ///
    /// Defaults to `1.2`. Must be less than the fixed level-3 boundary (`1.4`).
    pub heading_size_ratio: f32,
    /// Whether to detect bold formatting from font name and descriptor metrics.
    ///
    /// When `false`, all spans have `bold = false` regardless of font data.
    pub detect_bold: bool,
    /// Whether to detect italic formatting from font name and descriptor metrics.
    ///
    /// When `false`, all spans have `italic = false` regardless of font data.
    pub detect_italic: bool,
}

22impl Default for DetectorConfig {
23    fn default() -> Self {
24        Self {
25            heading_size_ratio: 1.2,
26            detect_bold: true,
27            detect_italic: true,
28        }
29    }
30}
31
32/// A `RawTextSegment` paired with its structural classification.
33#[derive(Debug, Clone, PartialEq)]
34pub struct ClassifiedSegment {
35    pub segment: RawTextSegment,
36    pub classification: SegmentClass,
37}
38
/// The structural role of a text segment within the document.
///
/// The type is a small value (`u8` payload at most), so it is `Copy` and can
/// be compared and hashed freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentClass {
    /// A heading at the given level (1 = largest, 4 = smallest recognised heading).
    Heading(u8),
    /// Regular body text.
    Body,
}

48/// Compute the dominant ("body") font size across all segments using the mode.
49///
50/// Font sizes are bucketed at 0.01-point precision. On a tie, the smallest size
51/// wins — body text is typically the smallest repeated size in a document.
52/// Returns `12.0` when `segments` is empty.
53pub fn compute_body_size(segments: &[RawTextSegment]) -> f32 {
54    if segments.is_empty() {
55        return 12.0;
56    }
57
58    let mut counts: HashMap<i32, usize> = HashMap::new();
59    for segment in segments {
60        let key = (segment.font_size * 100.0).round() as i32;
61        *counts.entry(key).or_insert(0) += 1;
62    }
63
64    // best_key starts at 1200 (12pt) so the very first entry always wins the
65    // count comparison (best_count == 0 < any real count), making the seed
66    // value irrelevant in practice.
67    let mut best_key = 1200;
68    let mut best_count = 0usize;
69
70    for (key, count) in counts {
71        if count > best_count || (count == best_count && key < best_key) {
72            best_key = key;
73            best_count = count;
74        }
75    }
76
77    best_key as f32 / 100.0
78}
79
80/// Classify each segment as `Body` or a heading level based on its font-size
81/// ratio relative to `body_size`.
82///
83/// Fixed level boundaries (ratios are relative to `body_size`):
84/// - ≥ 2.0 → `Heading(1)`
85/// - ≥ 1.7 → `Heading(2)`
86/// - ≥ 1.4 → `Heading(3)`
87/// - ≥ `heading_size_ratio` → `Heading(4)`
88/// - otherwise → `Body`
89///
90/// If `body_size` is zero or negative, falls back to 12.0 pt.
91pub fn detect_headings(
92    segments: Vec<RawTextSegment>,
93    body_size: f32,
94    heading_size_ratio: f32,
95) -> Vec<ClassifiedSegment> {
96    let safe_body = if body_size > 0.0 { body_size } else { 12.0 };
97
98    segments
99        .into_iter()
100        .map(|segment| {
101            let ratio = segment.font_size / safe_body;
102            let classification = if ratio >= 2.0 {
103                // Level 1: at least double the body size
104                SegmentClass::Heading(1)
105            } else if ratio >= 1.7 {
106                // Level 2: 70%+ larger than body
107                SegmentClass::Heading(2)
108            } else if ratio >= 1.4 {
109                // Level 3: 40%+ larger than body
110                SegmentClass::Heading(3)
111            } else if ratio >= heading_size_ratio {
112                // Level 4: exceeds the configurable minimum heading ratio
113                SegmentClass::Heading(4)
114            } else {
115                SegmentClass::Body
116            };
117
118            ClassifiedSegment {
119                segment,
120                classification,
121            }
122        })
123        .collect()
124}
125
126/// Determine bold and italic flags from the font resource name and descriptor metrics.
127///
128/// Detection order:
129/// 1. Lowercase font name (subset prefix stripped) is scanned for `"bold"`,
130///    `"italic"`, `"oblique"`, and combined forms like `"bolditalic"`.
131/// 2. If bold is not found via name, `FontInfo::font_weight > 600` is used as
132///    a fallback.
133/// 3. If italic is not found via name, a non-zero `FontInfo::italic_angle` is
134///    used as a fallback.
135///
136/// Returns `(bold, italic)`.
137pub fn detect_formatting(font_name: &str, font_info: &FontInfo) -> (bool, bool) {
138    // Normalise: strip PDF subset prefix then lowercase for case-insensitive matching.
139    let stripped = strip_subset_prefix(font_name);
140    let normalized = stripped.to_lowercase();
141
142    // Combined forms must be checked first to avoid double-counting
143    // (e.g., "BoldOblique" contains both "bold" and "oblique").
144    let has_bold_combo = normalized.contains("bolditalic") || normalized.contains("boldoblique");
145    let mut bold = has_bold_combo || normalized.contains("bold");
146    let mut italic =
147        has_bold_combo || normalized.contains("italic") || normalized.contains("oblique");
148
149    if !bold {
150        bold = font_info.font_weight.map(|w| w > 600.0).unwrap_or(false);
151    }
152
153    if !italic {
154        italic = font_info
155            .italic_angle
156            .map(|angle| angle.abs() > f32::EPSILON)
157            .unwrap_or(false);
158    }
159
160    (bold, italic)
161}
162
163/// Flush the accumulated spans into an AST node and clear the accumulators.
164///
165/// No-op when `spans` is empty.
166fn flush_group(kind: &Option<SegmentClass>, spans: Vec<Span>, nodes: &mut Vec<Node>) {
167    if spans.is_empty() {
168        return;
169    }
170    match kind {
171        Some(SegmentClass::Heading(level)) => {
172            nodes.push(Node::Heading {
173                level: *level,
174                spans,
175            });
176        }
177        _ => {
178            nodes.push(Node::Paragraph { spans });
179        }
180    }
181}
182
183/// Build an AST `Document` from raw segments, font metadata, and configuration.
184///
185/// Algorithm:
186/// 1. Compute the body font size (mode of all segment sizes).
187/// 2. Classify every segment as a heading level or body text.
188/// 3. Group consecutive segments with the same classification into a single
189///    `Node::Heading` or `Node::Paragraph`.
190/// 4. Segments whose font resource name is absent from `fonts` are emitted as
191///    `Node::RawText` and contribute a `Warning::MissingFontMetrics`.
192///
193/// The returned `Vec<Warning>` is empty when all fonts are resolved.
194pub fn build_document(
195    segments: Vec<RawTextSegment>,
196    fonts: &HashMap<Vec<u8>, FontInfo>,
197    config: &DetectorConfig,
198    metadata: DocumentMetadata,
199) -> (Document, Vec<Warning>) {
200    let mut warnings = Vec::new();
201    let body_size = compute_body_size(&segments);
202    let classified = detect_headings(segments, body_size, config.heading_size_ratio);
203
204    let mut nodes = Vec::new();
205    let mut current_kind: Option<SegmentClass> = None;
206    let mut current_spans: Vec<Span> = Vec::new();
207
208    for item in classified {
209        let font = match fonts.get(&item.segment.font_resource_name) {
210            Some(font) => font,
211            None => {
212                // Flush any pending group before emitting RawText so it lands
213                // as its own node rather than merging into a preceding group.
214                flush_group(
215                    &current_kind,
216                    std::mem::take(&mut current_spans),
217                    &mut nodes,
218                );
219                current_kind = None;
220
221                warnings.push(Warning::MissingFontMetrics {
222                    font_name: String::from_utf8_lossy(&item.segment.font_resource_name)
223                        .to_string(),
224                    page: item.segment.page_number,
225                });
226                nodes.push(Node::RawText(item.segment.text));
227                continue;
228            }
229        };
230
231        let (mut bold, mut italic) = detect_formatting(&font.name, font);
232        if !config.detect_bold {
233            bold = false;
234        }
235        if !config.detect_italic {
236            italic = false;
237        }
238
239        let span = Span {
240            text: item.segment.text,
241            bold,
242            italic,
243            font_size: item.segment.font_size,
244            font_name: Some(font.name.clone()),
245        };
246
247        // Flush the current group when the classification changes.
248        let same_class = match (&current_kind, &item.classification) {
249            (Some(SegmentClass::Heading(a)), SegmentClass::Heading(b)) => a == b,
250            (Some(SegmentClass::Body), SegmentClass::Body) => true,
251            (None, _) => true, // first item — nothing to flush
252            _ => false,
253        };
254
255        if !same_class {
256            flush_group(
257                &current_kind,
258                std::mem::take(&mut current_spans),
259                &mut nodes,
260            );
261        }
262
263        current_kind = Some(item.classification);
264        current_spans.push(span);
265    }
266
267    // Final flush for any trailing group.
268    flush_group(
269        &current_kind,
270        std::mem::take(&mut current_spans),
271        &mut nodes,
272    );
273
274    (Document { metadata, nodes }, warnings)
275}
276
#[cfg(test)]
mod tests {
    use super::*;

    /// Segment on page 1 using font resource `F1`.
    fn seg(text: &str, font_size: f32) -> RawTextSegment {
        RawTextSegment {
            text: text.to_string(),
            font_resource_name: b"F1".to_vec(),
            font_size,
            page_number: 1,
        }
    }

    /// Segment with an explicit font resource name and page number.
    fn seg_with_font(
        text: &str,
        font_resource: &[u8],
        font_size: f32,
        page: usize,
    ) -> RawTextSegment {
        RawTextSegment {
            text: text.to_string(),
            font_resource_name: font_resource.to_vec(),
            font_size,
            page_number: page,
        }
    }

    /// `FontInfo` with the given name and descriptor metrics and no size.
    fn font_info(name: &str, font_weight: Option<f32>, italic_angle: Option<f32>) -> FontInfo {
        FontInfo {
            name: name.to_string(),
            size: None,
            font_weight,
            italic_angle,
        }
    }

    /// Collect `(resource name, FontInfo)` pairs into the map `build_document` expects.
    fn map_fonts<const N: usize>(entries: [(Vec<u8>, FontInfo); N]) -> HashMap<Vec<u8>, FontInfo> {
        entries.into_iter().collect()
    }

    // ── compute_body_size ──────────────────────────────────────────────────────

    #[test]
    fn compute_body_size_uses_mode_with_smaller_tie_breaker() {
        // Three sizes each appear twice: smallest (10.0) should win the tie.
        let segments = vec![
            seg("a", 12.0),
            seg("b", 12.0),
            seg("c", 14.0),
            seg("d", 14.0),
            seg("e", 10.0),
            seg("f", 10.0),
        ];

        assert_eq!(compute_body_size(&segments), 10.0);
    }

    #[test]
    fn compute_body_size_returns_default_on_empty_segments() {
        assert_eq!(compute_body_size(&[]), 12.0);
    }

    // ── detect_headings ────────────────────────────────────────────────────────

    #[test]
    fn detect_headings_maps_ratios_to_levels_and_boundaries() {
        let body = 10.0;
        // Exact boundary values per spec: 2.0, 1.7, 1.4, and heading_size_ratio (1.2).
        let segments = vec![
            seg("h1", 20.0),    // ratio 2.0 → Heading(1)
            seg("h2", 17.0),    // ratio 1.7 → Heading(2)
            seg("h3", 14.0),    // ratio 1.4 → Heading(3)
            seg("h4", 12.0),    // ratio 1.2 → Heading(4)
            seg("body", 11.99), // ratio < 1.2 → Body
        ];

        let classes = detect_headings(segments, body, 1.2)
            .into_iter()
            .map(|c| c.classification)
            .collect::<Vec<_>>();

        assert_eq!(classes[0], SegmentClass::Heading(1));
        assert_eq!(classes[1], SegmentClass::Heading(2));
        assert_eq!(classes[2], SegmentClass::Heading(3));
        assert_eq!(classes[3], SegmentClass::Heading(4));
        assert_eq!(classes[4], SegmentClass::Body);
    }

    // ── detect_formatting ──────────────────────────────────────────────────────

    #[test]
    fn detect_formatting_reads_font_name_patterns_and_subset_prefix() {
        // No descriptor metrics, so only the `font_name` argument can set a flag.
        let info = font_info("ignored", None, None);

        assert_eq!(detect_formatting("Arial-Bold", &info), (true, false));
        assert_eq!(
            detect_formatting("TimesNewRoman-Italic", &info),
            (false, true)
        );
        assert_eq!(
            detect_formatting("ABCDEF+Helvetica-BoldOblique", &info),
            (true, true)
        );
    }

    #[test]
    fn detect_formatting_falls_back_to_descriptor_metrics() {
        // The name carries no style hint; weight 700 and angle -10 decide.
        let info = font_info("mystery-font", Some(700.0), Some(-10.0));
        assert_eq!(detect_formatting("CustomFont-Regular", &info), (true, true));
    }

    // ── build_document ─────────────────────────────────────────────────────────

    #[test]
    fn build_document_groups_consecutive_classification_and_preserves_spans() {
        let segments = vec![
            seg_with_font("Chapter 1", b"F1", 24.0, 1),
            seg_with_font("Intro", b"F1", 24.0, 1),
            seg_with_font("Body A", b"F2", 12.0, 1),
            seg_with_font("Body B", b"F2", 12.0, 1),
        ];

        let fonts = map_fonts([
            (
                b"F1".to_vec(),
                font_info("Helvetica-Bold", Some(700.0), None),
            ),
            (b"F2".to_vec(), font_info("Helvetica", None, None)),
        ]);

        let cfg = DetectorConfig::default();
        let metadata = DocumentMetadata {
            title: Some("Demo".to_string()),
            author: None,
            page_count: 1,
        };

        let (doc, warnings) = build_document(segments, &fonts, &cfg, metadata.clone());

        assert!(warnings.is_empty());
        assert_eq!(doc.metadata, metadata);
        assert_eq!(doc.nodes.len(), 2);

        // Verify heading node: level and span texts
        match &doc.nodes[0] {
            Node::Heading { level, spans } => {
                assert_eq!(*level, 1);
                assert_eq!(spans.len(), 2);
                assert_eq!(spans[0].text, "Chapter 1");
                assert_eq!(spans[1].text, "Intro");
                assert!(spans[0].bold);
            }
            other => panic!("expected Heading, got {:?}", other),
        }

        // Verify paragraph node: span texts
        match &doc.nodes[1] {
            Node::Paragraph { spans } => {
                assert_eq!(spans.len(), 2);
                assert_eq!(spans[0].text, "Body A");
                assert_eq!(spans[1].text, "Body B");
                assert!(!spans[0].bold);
            }
            other => panic!("expected Paragraph, got {:?}", other),
        }
    }

    #[test]
    fn build_document_uses_raw_text_when_font_is_missing() {
        let segments = vec![seg_with_font("Unknown font", b"FX", 12.0, 1)];
        let cfg = DetectorConfig::default();

        let (doc, warnings) = build_document(
            segments,
            &HashMap::new(),
            &cfg,
            DocumentMetadata {
                title: None,
                author: None,
                page_count: 1,
            },
        );

        assert_eq!(doc.nodes, vec![Node::RawText("Unknown font".to_string())]);
        assert_eq!(warnings.len(), 1);
        assert!(matches!(warnings[0], Warning::MissingFontMetrics { .. }));
    }
}