pdfmuse_core/ir.rs
1//! Unified intermediate representation (IR).
2//!
3//! Every source format (PDF, DOCX) is parsed down to this one representation, and
4//! every binding (Python/Node/WASM) serializes it through the same serde path —
5//! that shared path is the technical guarantee behind byte-identical output.
6//!
7//! **Coordinate convention:** all coordinates are normalized to a **top-left
8//! origin, Y growing downward, unit = pt**. PDF's native bottom-left origin is
9//! converted at the parsing edge and never leaks into the IR, so every downstream
10//! consumer sees one consistent coordinate space.
11//!
//! `Char` is kept at the finest granularity on purpose: it lets downstream users
//! bypass our clustering and regroup by coordinate themselves — we never discard
//! information on the caller's behalf. Fields here mirror the technical plan (§4).
15
16use serde::Serialize;
17
18/// A fully parsed document — the root of the IR.
19#[derive(Serialize, Clone, Debug, Default)]
20pub struct Document {
21 /// Which source format produced this document.
22 pub source: SourceKind,
23 pub metadata: Metadata,
24 pub pages: Vec<Page>,
25 /// Bookmarks / table of contents.
26 pub outline: Vec<OutlineItem>,
27 /// Non-fatal degradations recorded during parsing; parsing is never aborted
28 /// for these (see the "graceful degradation" principle).
29 pub warnings: Vec<Warning>,
30}
31
32/// The format a [`Document`] was parsed from.
33#[derive(Serialize, Clone, Debug, Default, PartialEq, Eq)]
34pub enum SourceKind {
35 #[default]
36 Pdf,
37 Docx,
38}
39
40/// Document-level metadata. All fields optional — absent in the source ⇒ `None`.
41#[derive(Serialize, Clone, Debug, Default)]
42pub struct Metadata {
43 pub title: Option<String>,
44 pub author: Option<String>,
45 pub subject: Option<String>,
46 pub keywords: Option<String>,
47 pub creator: Option<String>,
48 pub producer: Option<String>,
49 pub page_count: u32,
50}
51
52/// A single page and everything found on it, from finest (chars) to coarsest
53/// (blocks). Layers coexist so callers can pick the granularity they need.
54#[derive(Serialize, Clone, Debug, Default)]
55pub struct Page {
56 pub index: u32,
57 pub width: f32,
58 pub height: f32,
59 /// Page rotation in degrees (0/90/180/270).
60 pub rotation: i32,
61 /// Finest granularity: every glyph with its coordinates.
62 pub chars: Vec<Char>,
63 /// Characters clustered into lines (geometric, deterministic).
64 pub lines: Vec<TextLine>,
65 /// Paragraphs / tables / images in reading order.
66 pub blocks: Vec<Block>,
67 /// Vector rectangles — a source of table borders.
68 pub rects: Vec<Rect>,
69 /// Vector line segments — a source of table rules.
70 pub rules: Vec<Rule>,
71 pub images: Vec<ImageRef>,
72 pub links: Vec<Link>,
73}
74
75/// A single character with precise placement. `text` is always Unicode
76/// (post-CMap) — never a raw CID.
77#[derive(Serialize, Clone, Debug)]
78pub struct Char {
79 pub text: String,
80 pub bbox: BBox,
81 pub font: FontRef,
82 pub size: f32,
83 /// RGB in 0.0..=1.0, if known.
84 pub color: Option<[f32; 3]>,
85}
86
87/// An axis-aligned bounding box in normalized (top-left origin, Y-down, pt) space.
88#[derive(Serialize, Clone, Copy, Debug, Default, PartialEq)]
89pub struct BBox {
90 pub x0: f32,
91 pub y0: f32,
92 pub x1: f32,
93 pub y1: f32,
94}
95
96/// Reference to a font by resource name (details filled in by `fonts.rs`, PER-37).
97#[derive(Serialize, Clone, Debug, Default)]
98pub struct FontRef {
99 pub name: String,
100}
101
102/// A line of text produced by clustering [`Char`]s.
103#[derive(Serialize, Clone, Debug)]
104pub struct TextLine {
105 pub bbox: BBox,
106 pub text: String,
107 /// Indices into the owning [`Page::chars`] that make up this line.
108 pub chars: Vec<u32>,
109}
110
111/// A coarse-grained page element in reading order.
112#[derive(Serialize, Clone, Debug)]
113pub enum Block {
114 Paragraph(Paragraph),
115 Table(Table),
116 Image(ImageRef),
117}
118
119/// A paragraph of text. `heading_level` is `Some(n)` when this paragraph is a
120/// heading (drives Markdown `#` levels and RAG `heading_path`).
121#[derive(Serialize, Clone, Debug)]
122pub struct Paragraph {
123 pub bbox: BBox,
124 pub text: String,
125 pub heading_level: Option<u8>,
126 /// A non-body role, when detected. Skipped in JSON when `None`, so ordinary
127 /// paragraphs serialize exactly as before.
128 #[serde(skip_serializing_if = "Option::is_none")]
129 pub role: Option<BlockRole>,
130}
131
132/// A non-body role a paragraph can play. Used by boilerplate removal (PER-168).
133#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
134pub enum BlockRole {
135 /// A running header/footer (page number, document title, "CONFIDENTIAL")
136 /// repeated across pages. Marked, never dropped by default — see
137 /// [`remove_boilerplate`](crate::remove_boilerplate).
138 HeaderFooter,
139}
140
141/// A reconstructed table. `source` records which deterministic path built it.
142#[derive(Serialize, Clone, Debug)]
143pub struct Table {
144 pub bbox: BBox,
145 /// Row-major grid of cells.
146 pub rows: Vec<Vec<Cell>>,
147 pub source: TableSource,
148}
149
150/// Which deterministic reconstruction path produced a [`Table`].
151#[derive(Serialize, Clone, Debug, PartialEq, Eq)]
152pub enum TableSource {
153 /// Built from vector rules/rects (highest precision).
154 Ruled,
155 /// Built from whitespace-aligned text columns (only above a confidence threshold).
156 Whitespace,
157 /// Explicit table structure from a DOCX `w:tbl`.
158 Docx,
159}
160
161/// A single table cell, possibly spanning rows/columns.
162#[derive(Serialize, Clone, Debug)]
163pub struct Cell {
164 pub text: String,
165 pub bbox: BBox,
166 pub row_span: u16,
167 pub col_span: u16,
168}
169
170/// A vector rectangle (e.g. a table border box).
171#[derive(Serialize, Clone, Debug)]
172pub struct Rect {
173 pub bbox: BBox,
174}
175
176/// A vector line segment (e.g. a table rule).
177#[derive(Serialize, Clone, Debug)]
178pub struct Rule {
179 pub x0: f32,
180 pub y0: f32,
181 pub x1: f32,
182 pub y1: f32,
183 /// Stroke width in pt.
184 pub width: f32,
185}
186
187/// A reference to an embedded image and where it sits on the page.
188#[derive(Serialize, Clone, Debug)]
189pub struct ImageRef {
190 pub id: String,
191 pub bbox: BBox,
192 pub width: u32,
193 pub height: u32,
194}
195
196/// A hyperlink region. Exactly one of `uri` / `page` is typically set
197/// (external link vs. intra-document jump).
198#[derive(Serialize, Clone, Debug)]
199pub struct Link {
200 pub bbox: BBox,
201 pub uri: Option<String>,
202 pub page: Option<u32>,
203}
204
205/// An entry in the document outline (bookmarks / TOC). Recursive.
206#[derive(Serialize, Clone, Debug)]
207pub struct OutlineItem {
208 pub title: String,
209 pub page: Option<u32>,
210 pub level: u8,
211 pub children: Vec<OutlineItem>,
212}
213
214/// A non-fatal degradation recorded during parsing.
215#[derive(Serialize, Clone, Debug)]
216pub struct Warning {
217 /// The page it occurred on, if page-scoped.
218 pub page: Option<u32>,
219 pub kind: WarningKind,
220 pub detail: String,
221}
222
223/// The category of a [`Warning`].
224#[derive(Serialize, Clone, Debug, PartialEq, Eq)]
225pub enum WarningKind {
226 /// An object could not be parsed and was skipped (see PER-38).
227 MalformedObject,
228 /// A CID font lacked a usable CMap/ToUnicode mapping.
229 MissingCMap,
230 /// The document was decrypted via a fallback path.
231 EncryptedFallback,
232 /// A scanned page has no text layer and needs OCR (pluggable backend).
233 NeedsOcr,
234 /// A source feature is not (yet) supported by the core.
235 Unsupported,
236}
237
#[cfg(test)]
mod tests {
    use super::*;

    /// Cross-binding parity rests on the IR serializing to stable JSON.
    #[test]
    fn document_serializes_to_stable_json() {
        // Build the fixture bottom-up: one glyph, the page holding it, one warning.
        let glyph = Char {
            text: String::from("A"),
            bbox: BBox { x0: 0.0, y0: 0.0, x1: 10.0, y1: 12.0 },
            font: FontRef { name: String::from("Helvetica") },
            size: 12.0,
            color: None,
        };
        let page = Page {
            index: 0,
            width: 612.0,
            height: 792.0,
            chars: vec![glyph],
            ..Default::default()
        };
        let warning = Warning {
            page: Some(0),
            kind: WarningKind::MissingCMap,
            detail: String::from("font F1 has no ToUnicode"),
        };
        let doc = Document {
            source: SourceKind::Pdf,
            metadata: Metadata { page_count: 1, ..Default::default() },
            pages: vec![page],
            warnings: vec![warning],
            ..Default::default()
        };

        // Serialization must succeed, and the same input must yield the same string.
        let json = serde_json::to_string(&doc).expect("IR serializes");
        let repeated = serde_json::to_string(&doc.clone()).unwrap();
        assert_eq!(json, repeated);

        // Spot-check that the enum variants and the glyph text appear as expected.
        assert!(json.contains("\"source\":\"Pdf\""));
        assert!(json.contains("\"kind\":\"MissingCMap\""));
        assert!(json.contains("\"text\":\"A\""));
    }
}