Skip to main content

pdfmuse_core/
ir.rs

//! Unified intermediate representation (IR).
//!
//! Every source format (PDF, DOCX) is parsed down to this one representation, and
//! every binding (Python/Node/WASM) serializes it through the same serde path —
//! that shared path is the technical guarantee behind byte-identical output.
//!
//! **Coordinate convention:** all coordinates are normalized to a **top-left
//! origin, Y growing downward, unit = pt**. PDF's native bottom-left origin is
//! converted at the parsing edge and never leaks into the IR, so every downstream
//! consumer sees one consistent coordinate space.
//!
//! `Char` is kept at the finest granularity on purpose: it lets downstream users
//! bypass our clustering and regroup by coordinate themselves — we never discard
//! detail on the caller's behalf. Fields here mirror the technical plan (§4).
15
16use serde::Serialize;
17
18/// A fully parsed document — the root of the IR.
19#[derive(Serialize, Clone, Debug, Default)]
20pub struct Document {
21    /// Which source format produced this document.
22    pub source: SourceKind,
23    pub metadata: Metadata,
24    pub pages: Vec<Page>,
25    /// Bookmarks / table of contents.
26    pub outline: Vec<OutlineItem>,
27    /// Non-fatal degradations recorded during parsing; parsing is never aborted
28    /// for these (see the "graceful degradation" principle).
29    pub warnings: Vec<Warning>,
30}
31
32/// The format a [`Document`] was parsed from.
33#[derive(Serialize, Clone, Debug, Default, PartialEq, Eq)]
34pub enum SourceKind {
35    #[default]
36    Pdf,
37    Docx,
38}
39
40/// Document-level metadata. All fields optional — absent in the source ⇒ `None`.
41#[derive(Serialize, Clone, Debug, Default)]
42pub struct Metadata {
43    pub title: Option<String>,
44    pub author: Option<String>,
45    pub subject: Option<String>,
46    pub keywords: Option<String>,
47    pub creator: Option<String>,
48    pub producer: Option<String>,
49    pub page_count: u32,
50}
51
52/// A single page and everything found on it, from finest (chars) to coarsest
53/// (blocks). Layers coexist so callers can pick the granularity they need.
54#[derive(Serialize, Clone, Debug, Default)]
55pub struct Page {
56    pub index: u32,
57    pub width: f32,
58    pub height: f32,
59    /// Page rotation in degrees (0/90/180/270).
60    pub rotation: i32,
61    /// Finest granularity: every glyph with its coordinates.
62    pub chars: Vec<Char>,
63    /// Characters clustered into lines (geometric, deterministic).
64    pub lines: Vec<TextLine>,
65    /// Paragraphs / tables / images in reading order.
66    pub blocks: Vec<Block>,
67    /// Vector rectangles — a source of table borders.
68    pub rects: Vec<Rect>,
69    /// Vector line segments — a source of table rules.
70    pub rules: Vec<Rule>,
71    pub images: Vec<ImageRef>,
72    pub links: Vec<Link>,
73}
74
75/// A single character with precise placement. `text` is always Unicode
76/// (post-CMap) — never a raw CID.
77#[derive(Serialize, Clone, Debug)]
78pub struct Char {
79    pub text: String,
80    pub bbox: BBox,
81    pub font: FontRef,
82    pub size: f32,
83    /// RGB in 0.0..=1.0, if known.
84    pub color: Option<[f32; 3]>,
85}
86
87/// An axis-aligned bounding box in normalized (top-left origin, Y-down, pt) space.
88#[derive(Serialize, Clone, Copy, Debug, Default, PartialEq)]
89pub struct BBox {
90    pub x0: f32,
91    pub y0: f32,
92    pub x1: f32,
93    pub y1: f32,
94}
95
96/// Reference to a font by resource name (details filled in by `fonts.rs`, PER-37).
97#[derive(Serialize, Clone, Debug, Default)]
98pub struct FontRef {
99    pub name: String,
100}
101
102/// A line of text produced by clustering [`Char`]s.
103#[derive(Serialize, Clone, Debug)]
104pub struct TextLine {
105    pub bbox: BBox,
106    pub text: String,
107    /// Indices into the owning [`Page::chars`] that make up this line.
108    pub chars: Vec<u32>,
109}
110
111/// A coarse-grained page element in reading order.
112#[derive(Serialize, Clone, Debug)]
113pub enum Block {
114    Paragraph(Paragraph),
115    Table(Table),
116    Image(ImageRef),
117}
118
119/// A paragraph of text. `heading_level` is `Some(n)` when this paragraph is a
120/// heading (drives Markdown `#` levels and RAG `heading_path`).
121#[derive(Serialize, Clone, Debug)]
122pub struct Paragraph {
123    pub bbox: BBox,
124    pub text: String,
125    pub heading_level: Option<u8>,
126    /// A non-body role, when detected. Skipped in JSON when `None`, so ordinary
127    /// paragraphs serialize exactly as before.
128    #[serde(skip_serializing_if = "Option::is_none")]
129    pub role: Option<BlockRole>,
130}
131
132/// A non-body role a paragraph can play. Used by boilerplate removal (PER-168).
133#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
134pub enum BlockRole {
135    /// A running header/footer (page number, document title, "CONFIDENTIAL")
136    /// repeated across pages. Marked, never dropped by default — see
137    /// [`remove_boilerplate`](crate::remove_boilerplate).
138    HeaderFooter,
139}
140
141/// A reconstructed table. `source` records which deterministic path built it.
142#[derive(Serialize, Clone, Debug)]
143pub struct Table {
144    pub bbox: BBox,
145    /// Row-major grid of cells.
146    pub rows: Vec<Vec<Cell>>,
147    pub source: TableSource,
148}
149
150/// Which deterministic reconstruction path produced a [`Table`].
151#[derive(Serialize, Clone, Debug, PartialEq, Eq)]
152pub enum TableSource {
153    /// Built from vector rules/rects (highest precision).
154    Ruled,
155    /// Built from whitespace-aligned text columns (only above a confidence threshold).
156    Whitespace,
157    /// Explicit table structure from a DOCX `w:tbl`.
158    Docx,
159}
160
161/// A single table cell, possibly spanning rows/columns.
162#[derive(Serialize, Clone, Debug)]
163pub struct Cell {
164    pub text: String,
165    pub bbox: BBox,
166    pub row_span: u16,
167    pub col_span: u16,
168}
169
170/// A vector rectangle (e.g. a table border box).
171#[derive(Serialize, Clone, Debug)]
172pub struct Rect {
173    pub bbox: BBox,
174}
175
176/// A vector line segment (e.g. a table rule).
177#[derive(Serialize, Clone, Debug)]
178pub struct Rule {
179    pub x0: f32,
180    pub y0: f32,
181    pub x1: f32,
182    pub y1: f32,
183    /// Stroke width in pt.
184    pub width: f32,
185}
186
187/// A reference to an embedded image and where it sits on the page.
188#[derive(Serialize, Clone, Debug)]
189pub struct ImageRef {
190    pub id: String,
191    pub bbox: BBox,
192    pub width: u32,
193    pub height: u32,
194}
195
196/// A hyperlink region. Exactly one of `uri` / `page` is typically set
197/// (external link vs. intra-document jump).
198#[derive(Serialize, Clone, Debug)]
199pub struct Link {
200    pub bbox: BBox,
201    pub uri: Option<String>,
202    pub page: Option<u32>,
203}
204
205/// An entry in the document outline (bookmarks / TOC). Recursive.
206#[derive(Serialize, Clone, Debug)]
207pub struct OutlineItem {
208    pub title: String,
209    pub page: Option<u32>,
210    pub level: u8,
211    pub children: Vec<OutlineItem>,
212}
213
214/// A non-fatal degradation recorded during parsing.
215#[derive(Serialize, Clone, Debug)]
216pub struct Warning {
217    /// The page it occurred on, if page-scoped.
218    pub page: Option<u32>,
219    pub kind: WarningKind,
220    pub detail: String,
221}
222
223/// The category of a [`Warning`].
224#[derive(Serialize, Clone, Debug, PartialEq, Eq)]
225pub enum WarningKind {
226    /// An object could not be parsed and was skipped (see PER-38).
227    MalformedObject,
228    /// A CID font lacked a usable CMap/ToUnicode mapping.
229    MissingCMap,
230    /// The document was decrypted via a fallback path.
231    EncryptedFallback,
232    /// A scanned page has no text layer and needs OCR (pluggable backend).
233    NeedsOcr,
234    /// A source feature is not (yet) supported by the core.
235    Unsupported,
236}
237
#[cfg(test)]
mod tests {
    use super::*;

    /// Cross-binding parity rests on the IR serializing to stable JSON.
    #[test]
    fn document_serializes_to_stable_json() {
        // Build the fixture bottom-up: one glyph, on one page, plus one warning.
        let glyph = Char {
            text: "A".to_string(),
            bbox: BBox { x0: 0.0, y0: 0.0, x1: 10.0, y1: 12.0 },
            font: FontRef { name: "Helvetica".to_string() },
            size: 12.0,
            color: None,
        };
        let page = Page {
            index: 0,
            width: 612.0,
            height: 792.0,
            chars: vec![glyph],
            ..Default::default()
        };
        let warning = Warning {
            page: Some(0),
            kind: WarningKind::MissingCMap,
            detail: "font F1 has no ToUnicode".to_string(),
        };
        let doc = Document {
            source: SourceKind::Pdf,
            metadata: Metadata { page_count: 1, ..Default::default() },
            pages: vec![page],
            warnings: vec![warning],
            ..Default::default()
        };

        // Serialization must succeed, and serializing an equal value again must
        // give the very same string.
        let first = serde_json::to_string(&doc).expect("IR serializes");
        let second = serde_json::to_string(&doc.clone()).unwrap();
        assert_eq!(first, second);

        // Spot-check a few fragments of the output.
        assert!(first.contains("\"source\":\"Pdf\""));
        assert!(first.contains("\"kind\":\"MissingCMap\""));
        assert!(first.contains("\"text\":\"A\""));
    }
}