Skip to main content

papyrus_core/
lib.rs

1pub mod ast;
2pub mod detector;
3pub mod parser;
4pub mod renderer;
5
6use std::collections::HashMap;
7
8use ast::ConversionResult;
9use detector::{build_document, DetectorConfig};
10
/// A configured extraction engine.
///
/// Holds the [`DetectorConfig`] that [`Papyrus::extract`] applies to every
/// input it is given.
///
/// Construct via [`Papyrus::builder`] to customise detection thresholds,
/// or call the top-level [`convert`] function for zero-configuration extraction.
#[derive(Debug, Clone)]
pub struct Papyrus {
    /// Detection settings used by every call to [`Papyrus::extract`].
    config: DetectorConfig,
}
19
/// Builder for [`Papyrus`].
///
/// Obtained from [`Papyrus::builder`], which starts from
/// [`DetectorConfig::default`]. All settings have sensible defaults; only
/// set the values you want to override, then call [`PapyrusBuilder::build`].
#[derive(Debug, Clone)]
pub struct PapyrusBuilder {
    /// Settings accumulated so far; moved into the engine by `build`.
    config: DetectorConfig,
}
28
29impl PapyrusBuilder {
30    /// Minimum font-size ratio over the computed body size to treat a segment
31    /// as a heading. Must be less than `1.4` (the fixed level-3 boundary).
32    /// Default: `1.2`.
33    pub fn heading_size_ratio(mut self, ratio: f32) -> Self {
34        self.config.heading_size_ratio = ratio;
35        self
36    }
37
38    /// Enable or disable bold detection from font name / descriptor metrics.
39    /// When `false`, all spans have `bold = false`. Default: `true`.
40    pub fn detect_bold(mut self, enabled: bool) -> Self {
41        self.config.detect_bold = enabled;
42        self
43    }
44
45    /// Enable or disable italic detection from font name / descriptor metrics.
46    /// When `false`, all spans have `italic = false`. Default: `true`.
47    pub fn detect_italic(mut self, enabled: bool) -> Self {
48        self.config.detect_italic = enabled;
49        self
50    }
51
52    /// Consume the builder and return a configured [`Papyrus`] engine.
53    pub fn build(self) -> Papyrus {
54        Papyrus {
55            config: self.config,
56        }
57    }
58}
59
60impl Papyrus {
61    /// Return a [`PapyrusBuilder`] pre-loaded with default settings.
62    pub fn builder() -> PapyrusBuilder {
63        PapyrusBuilder {
64            config: DetectorConfig::default(),
65        }
66    }
67
68    /// Extract structured content from `pdf_bytes`.
69    ///
70    /// Parsing and detection are best-effort: any problems are captured as
71    /// [`ast::Warning`] values in the returned [`ConversionResult`] rather
72    /// than surfaced as errors.
73    pub fn extract(&self, pdf_bytes: &[u8]) -> ConversionResult {
74        extract_with_config(pdf_bytes, &self.config)
75    }
76}
77
78/// Extract structured content from `pdf_bytes` using default settings.
79///
80/// Equivalent to `Papyrus::builder().build().extract(pdf_bytes)`.
81pub fn convert(pdf_bytes: &[u8]) -> ConversionResult {
82    extract_with_config(pdf_bytes, &DetectorConfig::default())
83}
84
85/// Core single-pass extraction: load PDF once, resolve fonts and text per page
86/// in one pass, then run the detector.
87///
88/// This is the shared implementation for both [`Papyrus::extract`] and
89/// [`convert`]. Keeping it here avoids a redundant `Papyrus::builder().build()`
90/// allocation in the hot path.
91fn extract_with_config(pdf_bytes: &[u8], config: &DetectorConfig) -> ConversionResult {
92    use ast::{DocumentMetadata, Warning};
93
94    let mut all_warnings: Vec<Warning> = Vec::new();
95
96    // Step 1: Load PDF — one load for the entire extraction.
97    let (doc_opt, load_warnings) = parser::load_pdf(pdf_bytes);
98    all_warnings.extend(load_warnings);
99
100    let doc = match doc_opt {
101        Some(d) => d,
102        None => {
103            let (document, _) = build_document(
104                Vec::new(),
105                &HashMap::new(),
106                config,
107                DocumentMetadata {
108                    title: None,
109                    author: None,
110                    page_count: 0,
111                },
112            );
113            return ConversionResult {
114                document,
115                warnings: all_warnings,
116            };
117        }
118    };
119
120    // Step 2: Metadata.
121    let pages = doc.get_pages();
122    let page_count = pages.len();
123    let (title, author) = parser::extract_doc_info_pub(&doc);
124    let metadata = DocumentMetadata {
125        title,
126        author,
127        page_count,
128    };
129
130    // Step 3: Per-page font resolution + text extraction in a single pass.
131    // Fonts are keyed by (page_number, resource_name) to avoid cross-page
132    // collisions when two pages share the same resource name (e.g., both use
133    // "F1" for different physical fonts).
134    let mut page_fonts_map: HashMap<(usize, Vec<u8>), parser::FontInfo> = HashMap::new();
135    let mut all_segments: Vec<parser::RawTextSegment> = Vec::new();
136
137    let mut page_numbers: Vec<u32> = pages.keys().copied().collect();
138    page_numbers.sort();
139
140    for &page_num in &page_numbers {
141        let page_number = page_num as usize;
142
143        let (fonts, font_warnings) = parser::resolve_fonts_for_page(&doc, page_number);
144        all_warnings.extend(font_warnings);
145
146        // Store fonts under (page, resource_name) key.
147        for (resource_name, font_info) in fonts {
148            page_fonts_map.insert((page_number, resource_name), font_info);
149        }
150
151        let (segments, extract_warnings) =
152            parser::extract_text_segments_for_page(&doc, page_number, &HashMap::new());
153        all_warnings.extend(extract_warnings);
154        all_segments.extend(segments);
155    }
156
157    // Build a flat resource-name → FontInfo map for build_document.
158    // Since segments carry their page number, we look up the correct font
159    // per (page, resource_name) and flatten into a per-segment map.
160    let segment_fonts = build_segment_font_map(&all_segments, &page_fonts_map, &mut all_warnings);
161
162    // Step 4: Detect structure and build AST.
163    let (document, detector_warnings) =
164        build_document(all_segments, &segment_fonts, config, metadata);
165    all_warnings.extend(detector_warnings);
166
167    ConversionResult {
168        document,
169        warnings: all_warnings,
170    }
171}
172
173/// Build a `font_resource_name → FontInfo` map for use in `build_document`.
174///
175/// Iterates over all segments and looks up each `(page_number, resource_name)`
176/// pair from the pre-resolved `page_fonts_map`. The result is a flat map keyed
177/// only by `resource_name` (matching `build_document`'s lookup key).
178///
179/// **Known limitation:** `build_document` keys fonts by resource name alone, so
180/// if two pages use the same resource name (e.g., `F1`) for different physical
181/// fonts, the last writer wins. This matches the behaviour of `parser::parse_pdf`
182/// and is acceptable for the current single-pass architecture. A future
183/// improvement would thread the page number through to `build_document`.
184///
185/// Missing entries emit `Warning::MissingFontMetrics`, deduplicated per
186/// resource name to avoid warning spam on multi-segment pages.
187fn build_segment_font_map(
188    segments: &[parser::RawTextSegment],
189    page_fonts_map: &HashMap<(usize, Vec<u8>), parser::FontInfo>,
190    warnings: &mut Vec<ast::Warning>,
191) -> HashMap<Vec<u8>, parser::FontInfo> {
192    let mut result: HashMap<Vec<u8>, parser::FontInfo> = HashMap::new();
193    let mut warned: std::collections::HashSet<Vec<u8>> = std::collections::HashSet::new();
194
195    for segment in segments {
196        let key = (segment.page_number, segment.font_resource_name.clone());
197        match page_fonts_map.get(&key) {
198            Some(font_info) => {
199                // Last-page-wins on collision, consistent with parse_pdf behaviour.
200                result.insert(segment.font_resource_name.clone(), font_info.clone());
201            }
202            None => {
203                if warned.insert(segment.font_resource_name.clone()) {
204                    warnings.push(ast::Warning::MissingFontMetrics {
205                        font_name: String::from_utf8_lossy(&segment.font_resource_name).to_string(),
206                        page: segment.page_number,
207                    });
208                }
209            }
210        }
211    }
212
213    result
214}