papyrus_core/lib.rs
1pub mod ast;
2pub mod detector;
3pub mod parser;
4pub mod renderer;
5
6use std::collections::HashMap;
7
8use ast::ConversionResult;
9use detector::{build_document, DetectorConfig};
10
11/// A configured extraction engine.
12///
13/// Construct via [`Papyrus::builder`] to customise detection thresholds,
14/// or call the top-level [`convert`] function for zero-configuration extraction.
15#[derive(Debug, Clone)]
16pub struct Papyrus {
17 config: DetectorConfig,
18}
19
20/// Builder for [`Papyrus`].
21///
22/// All settings have sensible defaults via [`DetectorConfig::default`]; only
23/// set the values you want to override.
24#[derive(Debug, Clone)]
25pub struct PapyrusBuilder {
26 config: DetectorConfig,
27}
28
29impl PapyrusBuilder {
30 /// Minimum font-size ratio over the computed body size to treat a segment
31 /// as a heading. Must be less than `1.4` (the fixed level-3 boundary).
32 /// Default: `1.2`.
33 pub fn heading_size_ratio(mut self, ratio: f32) -> Self {
34 self.config.heading_size_ratio = ratio;
35 self
36 }
37
38 /// Enable or disable bold detection from font name / descriptor metrics.
39 /// When `false`, all spans have `bold = false`. Default: `true`.
40 pub fn detect_bold(mut self, enabled: bool) -> Self {
41 self.config.detect_bold = enabled;
42 self
43 }
44
45 /// Enable or disable italic detection from font name / descriptor metrics.
46 /// When `false`, all spans have `italic = false`. Default: `true`.
47 pub fn detect_italic(mut self, enabled: bool) -> Self {
48 self.config.detect_italic = enabled;
49 self
50 }
51
52 /// Consume the builder and return a configured [`Papyrus`] engine.
53 pub fn build(self) -> Papyrus {
54 Papyrus {
55 config: self.config,
56 }
57 }
58}
59
60impl Papyrus {
61 /// Return a [`PapyrusBuilder`] pre-loaded with default settings.
62 pub fn builder() -> PapyrusBuilder {
63 PapyrusBuilder {
64 config: DetectorConfig::default(),
65 }
66 }
67
68 /// Extract structured content from `pdf_bytes`.
69 ///
70 /// Parsing and detection are best-effort: any problems are captured as
71 /// [`ast::Warning`] values in the returned [`ConversionResult`] rather
72 /// than surfaced as errors.
73 pub fn extract(&self, pdf_bytes: &[u8]) -> ConversionResult {
74 extract_with_config(pdf_bytes, &self.config)
75 }
76}
77
78/// Extract structured content from `pdf_bytes` using default settings.
79///
80/// Equivalent to `Papyrus::builder().build().extract(pdf_bytes)`.
81pub fn convert(pdf_bytes: &[u8]) -> ConversionResult {
82 extract_with_config(pdf_bytes, &DetectorConfig::default())
83}
84
85/// Core single-pass extraction: load PDF once, resolve fonts and text per page
86/// in one pass, then run the detector.
87///
88/// This is the shared implementation for both [`Papyrus::extract`] and
89/// [`convert`]. Keeping it here avoids a redundant `Papyrus::builder().build()`
90/// allocation in the hot path.
91fn extract_with_config(pdf_bytes: &[u8], config: &DetectorConfig) -> ConversionResult {
92 use ast::{DocumentMetadata, Warning};
93
94 let mut all_warnings: Vec<Warning> = Vec::new();
95
96 // Step 1: Load PDF — one load for the entire extraction.
97 let (doc_opt, load_warnings) = parser::load_pdf(pdf_bytes);
98 all_warnings.extend(load_warnings);
99
100 let doc = match doc_opt {
101 Some(d) => d,
102 None => {
103 let (document, _) = build_document(
104 Vec::new(),
105 &HashMap::new(),
106 config,
107 DocumentMetadata {
108 title: None,
109 author: None,
110 page_count: 0,
111 },
112 );
113 return ConversionResult {
114 document,
115 warnings: all_warnings,
116 };
117 }
118 };
119
120 // Step 2: Metadata.
121 let pages = doc.get_pages();
122 let page_count = pages.len();
123 let (title, author) = parser::extract_doc_info_pub(&doc);
124 let metadata = DocumentMetadata {
125 title,
126 author,
127 page_count,
128 };
129
130 // Step 3: Per-page font resolution + text extraction in a single pass.
131 // Fonts are keyed by (page_number, resource_name) to avoid cross-page
132 // collisions when two pages share the same resource name (e.g., both use
133 // "F1" for different physical fonts).
134 let mut page_fonts_map: HashMap<(usize, Vec<u8>), parser::FontInfo> = HashMap::new();
135 let mut all_segments: Vec<parser::RawTextSegment> = Vec::new();
136
137 let mut page_numbers: Vec<u32> = pages.keys().copied().collect();
138 page_numbers.sort();
139
140 for &page_num in &page_numbers {
141 let page_number = page_num as usize;
142
143 let (fonts, font_warnings) = parser::resolve_fonts_for_page(&doc, page_number);
144 all_warnings.extend(font_warnings);
145
146 // Store fonts under (page, resource_name) key.
147 for (resource_name, font_info) in fonts {
148 page_fonts_map.insert((page_number, resource_name), font_info);
149 }
150
151 let (segments, extract_warnings) =
152 parser::extract_text_segments_for_page(&doc, page_number, &HashMap::new());
153 all_warnings.extend(extract_warnings);
154 all_segments.extend(segments);
155 }
156
157 // Build a flat resource-name → FontInfo map for build_document.
158 // Since segments carry their page number, we look up the correct font
159 // per (page, resource_name) and flatten into a per-segment map.
160 let segment_fonts = build_segment_font_map(&all_segments, &page_fonts_map, &mut all_warnings);
161
162 // Step 4: Detect structure and build AST.
163 let (document, detector_warnings) =
164 build_document(all_segments, &segment_fonts, config, metadata);
165 all_warnings.extend(detector_warnings);
166
167 ConversionResult {
168 document,
169 warnings: all_warnings,
170 }
171}
172
173/// Build a `font_resource_name → FontInfo` map for use in `build_document`.
174///
175/// Iterates over all segments and looks up each `(page_number, resource_name)`
176/// pair from the pre-resolved `page_fonts_map`. The result is a flat map keyed
177/// only by `resource_name` (matching `build_document`'s lookup key).
178///
179/// **Known limitation:** `build_document` keys fonts by resource name alone, so
180/// if two pages use the same resource name (e.g., `F1`) for different physical
181/// fonts, the last writer wins. This matches the behaviour of `parser::parse_pdf`
182/// and is acceptable for the current single-pass architecture. A future
183/// improvement would thread the page number through to `build_document`.
184///
185/// Missing entries emit `Warning::MissingFontMetrics`, deduplicated per
186/// resource name to avoid warning spam on multi-segment pages.
187fn build_segment_font_map(
188 segments: &[parser::RawTextSegment],
189 page_fonts_map: &HashMap<(usize, Vec<u8>), parser::FontInfo>,
190 warnings: &mut Vec<ast::Warning>,
191) -> HashMap<Vec<u8>, parser::FontInfo> {
192 let mut result: HashMap<Vec<u8>, parser::FontInfo> = HashMap::new();
193 let mut warned: std::collections::HashSet<Vec<u8>> = std::collections::HashSet::new();
194
195 for segment in segments {
196 let key = (segment.page_number, segment.font_resource_name.clone());
197 match page_fonts_map.get(&key) {
198 Some(font_info) => {
199 // Last-page-wins on collision, consistent with parse_pdf behaviour.
200 result.insert(segment.font_resource_name.clone(), font_info.clone());
201 }
202 None => {
203 if warned.insert(segment.font_resource_name.clone()) {
204 warnings.push(ast::Warning::MissingFontMetrics {
205 font_name: String::from_utf8_lossy(&segment.font_resource_name).to_string(),
206 page: segment.page_number,
207 });
208 }
209 }
210 }
211 }
212
213 result
214}