pdfmuse_core/ir.rs
1//! Unified intermediate representation (IR).
2//!
3//! Every source format (PDF, DOCX) is parsed down to this one representation, and
4//! every binding (Python/Node/WASM) serializes it through the same serde path —
5//! that shared path is the technical guarantee behind byte-identical output.
6//!
7//! **Coordinate convention:** all coordinates are normalized to a **top-left
8//! origin, Y growing downward, unit = pt**. PDF's native bottom-left origin is
9//! converted at the parsing edge and never leaks into the IR, so every downstream
10//! consumer sees one consistent coordinate space.
11//!
//! `Char` is kept at the finest granularity on purpose: it lets downstream users
//! bypass our clustering and regroup by coordinate themselves — we never discard
//! information on the caller's behalf. Fields here mirror the technical plan (§4).
15
16use serde::Serialize;
17
18/// A fully parsed document — the root of the IR.
19#[derive(Serialize, Clone, Debug, Default)]
20pub struct Document {
21 /// Which source format produced this document.
22 pub source: SourceKind,
23 pub metadata: Metadata,
24 pub pages: Vec<Page>,
25 /// Bookmarks / table of contents.
26 pub outline: Vec<OutlineItem>,
27 /// Non-fatal degradations recorded during parsing; parsing is never aborted
28 /// for these (see the "graceful degradation" principle).
29 pub warnings: Vec<Warning>,
30}
31
32/// The format a [`Document`] was parsed from.
33#[derive(Serialize, Clone, Debug, Default, PartialEq, Eq)]
34pub enum SourceKind {
35 #[default]
36 Pdf,
37 Docx,
38}
39
40/// Document-level metadata. All fields optional — absent in the source ⇒ `None`.
41#[derive(Serialize, Clone, Debug, Default)]
42pub struct Metadata {
43 pub title: Option<String>,
44 pub author: Option<String>,
45 pub subject: Option<String>,
46 pub keywords: Option<String>,
47 pub creator: Option<String>,
48 pub producer: Option<String>,
49 pub page_count: u32,
50}
51
52/// A single page and everything found on it, from finest (chars) to coarsest
53/// (blocks). Layers coexist so callers can pick the granularity they need.
54#[derive(Serialize, Clone, Debug, Default)]
55pub struct Page {
56 pub index: u32,
57 pub width: f32,
58 pub height: f32,
59 /// Page rotation in degrees (0/90/180/270).
60 pub rotation: i32,
61 /// Finest granularity: every glyph with its coordinates.
62 pub chars: Vec<Char>,
63 /// Characters clustered into lines (geometric, deterministic).
64 pub lines: Vec<TextLine>,
65 /// Paragraphs / tables / images in reading order.
66 pub blocks: Vec<Block>,
67 /// Vector rectangles — a source of table borders.
68 pub rects: Vec<Rect>,
69 /// Vector line segments — a source of table rules.
70 pub rules: Vec<Rule>,
71 pub images: Vec<ImageRef>,
72 pub links: Vec<Link>,
73}
74
75/// A single character with precise placement. `text` is always Unicode
76/// (post-CMap) — never a raw CID.
77#[derive(Serialize, Clone, Debug)]
78pub struct Char {
79 pub text: String,
80 pub bbox: BBox,
81 pub font: FontRef,
82 pub size: f32,
83 /// RGB in 0.0..=1.0, if known.
84 pub color: Option<[f32; 3]>,
85}
86
87/// An axis-aligned bounding box in normalized (top-left origin, Y-down, pt) space.
88#[derive(Serialize, Clone, Copy, Debug, Default, PartialEq)]
89pub struct BBox {
90 pub x0: f32,
91 pub y0: f32,
92 pub x1: f32,
93 pub y1: f32,
94}
95
96/// Reference to a font by resource name (details filled in by `fonts.rs`, PER-37).
97#[derive(Serialize, Clone, Debug, Default)]
98pub struct FontRef {
99 pub name: String,
100}
101
102/// A line of text produced by clustering [`Char`]s.
103#[derive(Serialize, Clone, Debug)]
104pub struct TextLine {
105 pub bbox: BBox,
106 pub text: String,
107 /// Indices into the owning [`Page::chars`] that make up this line.
108 pub chars: Vec<u32>,
109}
110
111/// A coarse-grained page element in reading order.
112#[derive(Serialize, Clone, Debug)]
113pub enum Block {
114 Paragraph(Paragraph),
115 Table(Table),
116 Image(ImageRef),
117}
118
119/// A paragraph of text. `heading_level` is `Some(n)` when this paragraph is a
120/// heading (drives Markdown `#` levels and RAG `heading_path`).
121#[derive(Serialize, Clone, Debug)]
122pub struct Paragraph {
123 pub bbox: BBox,
124 pub text: String,
125 pub heading_level: Option<u8>,
126}
127
128/// A reconstructed table. `source` records which deterministic path built it.
129#[derive(Serialize, Clone, Debug)]
130pub struct Table {
131 pub bbox: BBox,
132 /// Row-major grid of cells.
133 pub rows: Vec<Vec<Cell>>,
134 pub source: TableSource,
135}
136
137/// Which deterministic reconstruction path produced a [`Table`].
138#[derive(Serialize, Clone, Debug, PartialEq, Eq)]
139pub enum TableSource {
140 /// Built from vector rules/rects (highest precision).
141 Ruled,
142 /// Built from whitespace-aligned text columns (only above a confidence threshold).
143 Whitespace,
144 /// Explicit table structure from a DOCX `w:tbl`.
145 Docx,
146}
147
148/// A single table cell, possibly spanning rows/columns.
149#[derive(Serialize, Clone, Debug)]
150pub struct Cell {
151 pub text: String,
152 pub bbox: BBox,
153 pub row_span: u16,
154 pub col_span: u16,
155}
156
157/// A vector rectangle (e.g. a table border box).
158#[derive(Serialize, Clone, Debug)]
159pub struct Rect {
160 pub bbox: BBox,
161}
162
163/// A vector line segment (e.g. a table rule).
164#[derive(Serialize, Clone, Debug)]
165pub struct Rule {
166 pub x0: f32,
167 pub y0: f32,
168 pub x1: f32,
169 pub y1: f32,
170 /// Stroke width in pt.
171 pub width: f32,
172}
173
174/// A reference to an embedded image and where it sits on the page.
175#[derive(Serialize, Clone, Debug)]
176pub struct ImageRef {
177 pub id: String,
178 pub bbox: BBox,
179 pub width: u32,
180 pub height: u32,
181}
182
183/// A hyperlink region. Exactly one of `uri` / `page` is typically set
184/// (external link vs. intra-document jump).
185#[derive(Serialize, Clone, Debug)]
186pub struct Link {
187 pub bbox: BBox,
188 pub uri: Option<String>,
189 pub page: Option<u32>,
190}
191
192/// An entry in the document outline (bookmarks / TOC). Recursive.
193#[derive(Serialize, Clone, Debug)]
194pub struct OutlineItem {
195 pub title: String,
196 pub page: Option<u32>,
197 pub level: u8,
198 pub children: Vec<OutlineItem>,
199}
200
201/// A non-fatal degradation recorded during parsing.
202#[derive(Serialize, Clone, Debug)]
203pub struct Warning {
204 /// The page it occurred on, if page-scoped.
205 pub page: Option<u32>,
206 pub kind: WarningKind,
207 pub detail: String,
208}
209
210/// The category of a [`Warning`].
211#[derive(Serialize, Clone, Debug, PartialEq, Eq)]
212pub enum WarningKind {
213 /// An object could not be parsed and was skipped (see PER-38).
214 MalformedObject,
215 /// A CID font lacked a usable CMap/ToUnicode mapping.
216 MissingCMap,
217 /// The document was decrypted via a fallback path.
218 EncryptedFallback,
219 /// A scanned page has no text layer and needs OCR (pluggable backend).
220 NeedsOcr,
221 /// A source feature is not (yet) supported by the core.
222 Unsupported,
223}
224
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializing the IR must give stable JSON; cross-binding parity rests on it.
    #[test]
    fn document_serializes_to_stable_json() {
        // Build the fixture bottom-up: one glyph on one page, plus one warning.
        let glyph = Char {
            text: "A".to_string(),
            bbox: BBox { x0: 0.0, y0: 0.0, x1: 10.0, y1: 12.0 },
            font: FontRef { name: "Helvetica".to_string() },
            size: 12.0,
            color: None,
        };
        let page = Page {
            index: 0,
            width: 612.0,
            height: 792.0,
            chars: vec![glyph],
            ..Default::default()
        };
        let warning = Warning {
            page: Some(0),
            kind: WarningKind::MissingCMap,
            detail: "font F1 has no ToUnicode".to_string(),
        };
        let doc = Document {
            source: SourceKind::Pdf,
            metadata: Metadata { page_count: 1, ..Default::default() },
            pages: vec![page],
            warnings: vec![warning],
            ..Default::default()
        };

        // Must serialize without error.
        let json = serde_json::to_string(&doc).expect("IR serializes");

        // Determinism: an independent clone serializes to the very same string.
        let copy = doc.clone();
        let json_again = serde_json::to_string(&copy).unwrap();
        assert_eq!(json, json_again);

        // Spot-check a few fragments of the output.
        assert!(json.contains("\"source\":\"Pdf\""));
        assert!(json.contains("\"kind\":\"MissingCMap\""));
        assert!(json.contains("\"text\":\"A\""));
    }
}
263}