Skip to main content

pdfmuse_core/
ir.rs

//! Unified intermediate representation (IR).
//!
//! Every source format (PDF, DOCX) is parsed down to this one representation, and
//! every binding (Python/Node/WASM) serializes it through the same serde path —
//! that shared path is the technical guarantee behind byte-identical output.
//!
//! **Coordinate convention:** all coordinates are normalized to a **top-left
//! origin, Y growing downward, unit = pt**. PDF's native bottom-left origin is
//! converted at the parsing edge and never leaks into the IR, so every downstream
//! consumer sees one consistent coordinate space.
//!
//! `Char` is kept at the finest granularity on purpose: it lets downstream users
//! bypass our clustering and regroup by coordinate themselves — we never discard
//! information on the caller's behalf. Fields here mirror the technical plan (§4).
15
16use serde::Serialize;
17
18/// A fully parsed document — the root of the IR.
19#[derive(Serialize, Clone, Debug, Default)]
20pub struct Document {
21    /// Which source format produced this document.
22    pub source: SourceKind,
23    pub metadata: Metadata,
24    pub pages: Vec<Page>,
25    /// Bookmarks / table of contents.
26    pub outline: Vec<OutlineItem>,
27    /// Non-fatal degradations recorded during parsing; parsing is never aborted
28    /// for these (see the "graceful degradation" principle).
29    pub warnings: Vec<Warning>,
30}
31
32/// The format a [`Document`] was parsed from.
33#[derive(Serialize, Clone, Debug, Default, PartialEq, Eq)]
34pub enum SourceKind {
35    #[default]
36    Pdf,
37    Docx,
38}
39
40/// Document-level metadata. All fields optional — absent in the source ⇒ `None`.
41#[derive(Serialize, Clone, Debug, Default)]
42pub struct Metadata {
43    pub title: Option<String>,
44    pub author: Option<String>,
45    pub subject: Option<String>,
46    pub keywords: Option<String>,
47    pub creator: Option<String>,
48    pub producer: Option<String>,
49    pub page_count: u32,
50}
51
52/// A single page and everything found on it, from finest (chars) to coarsest
53/// (blocks). Layers coexist so callers can pick the granularity they need.
54#[derive(Serialize, Clone, Debug, Default)]
55pub struct Page {
56    pub index: u32,
57    pub width: f32,
58    pub height: f32,
59    /// Page rotation in degrees (0/90/180/270).
60    pub rotation: i32,
61    /// Finest granularity: every glyph with its coordinates.
62    pub chars: Vec<Char>,
63    /// Characters clustered into lines (geometric, deterministic).
64    pub lines: Vec<TextLine>,
65    /// Paragraphs / tables / images in reading order.
66    pub blocks: Vec<Block>,
67    /// Vector rectangles — a source of table borders.
68    pub rects: Vec<Rect>,
69    /// Vector line segments — a source of table rules.
70    pub rules: Vec<Rule>,
71    pub images: Vec<ImageRef>,
72    pub links: Vec<Link>,
73}
74
75/// A single character with precise placement. `text` is always Unicode
76/// (post-CMap) — never a raw CID.
77#[derive(Serialize, Clone, Debug)]
78pub struct Char {
79    pub text: String,
80    pub bbox: BBox,
81    pub font: FontRef,
82    pub size: f32,
83    /// RGB in 0.0..=1.0, if known.
84    pub color: Option<[f32; 3]>,
85}
86
87/// An axis-aligned bounding box in normalized (top-left origin, Y-down, pt) space.
88#[derive(Serialize, Clone, Copy, Debug, Default, PartialEq)]
89pub struct BBox {
90    pub x0: f32,
91    pub y0: f32,
92    pub x1: f32,
93    pub y1: f32,
94}
95
96/// Reference to a font by resource name (details filled in by `fonts.rs`, PER-37).
97#[derive(Serialize, Clone, Debug, Default)]
98pub struct FontRef {
99    pub name: String,
100}
101
102/// A line of text produced by clustering [`Char`]s.
103#[derive(Serialize, Clone, Debug)]
104pub struct TextLine {
105    pub bbox: BBox,
106    pub text: String,
107    /// Indices into the owning [`Page::chars`] that make up this line.
108    pub chars: Vec<u32>,
109}
110
111/// A coarse-grained page element in reading order.
112#[derive(Serialize, Clone, Debug)]
113pub enum Block {
114    Paragraph(Paragraph),
115    Table(Table),
116    Image(ImageRef),
117}
118
119/// A paragraph of text. `heading_level` is `Some(n)` when this paragraph is a
120/// heading (drives Markdown `#` levels and RAG `heading_path`).
121#[derive(Serialize, Clone, Debug)]
122pub struct Paragraph {
123    pub bbox: BBox,
124    pub text: String,
125    pub heading_level: Option<u8>,
126}
127
128/// A reconstructed table. `source` records which deterministic path built it.
129#[derive(Serialize, Clone, Debug)]
130pub struct Table {
131    pub bbox: BBox,
132    /// Row-major grid of cells.
133    pub rows: Vec<Vec<Cell>>,
134    pub source: TableSource,
135}
136
137/// Which deterministic reconstruction path produced a [`Table`].
138#[derive(Serialize, Clone, Debug, PartialEq, Eq)]
139pub enum TableSource {
140    /// Built from vector rules/rects (highest precision).
141    Ruled,
142    /// Built from whitespace-aligned text columns (only above a confidence threshold).
143    Whitespace,
144    /// Explicit table structure from a DOCX `w:tbl`.
145    Docx,
146}
147
148/// A single table cell, possibly spanning rows/columns.
149#[derive(Serialize, Clone, Debug)]
150pub struct Cell {
151    pub text: String,
152    pub bbox: BBox,
153    pub row_span: u16,
154    pub col_span: u16,
155}
156
157/// A vector rectangle (e.g. a table border box).
158#[derive(Serialize, Clone, Debug)]
159pub struct Rect {
160    pub bbox: BBox,
161}
162
163/// A vector line segment (e.g. a table rule).
164#[derive(Serialize, Clone, Debug)]
165pub struct Rule {
166    pub x0: f32,
167    pub y0: f32,
168    pub x1: f32,
169    pub y1: f32,
170    /// Stroke width in pt.
171    pub width: f32,
172}
173
174/// A reference to an embedded image and where it sits on the page.
175#[derive(Serialize, Clone, Debug)]
176pub struct ImageRef {
177    pub id: String,
178    pub bbox: BBox,
179    pub width: u32,
180    pub height: u32,
181}
182
183/// A hyperlink region. Exactly one of `uri` / `page` is typically set
184/// (external link vs. intra-document jump).
185#[derive(Serialize, Clone, Debug)]
186pub struct Link {
187    pub bbox: BBox,
188    pub uri: Option<String>,
189    pub page: Option<u32>,
190}
191
192/// An entry in the document outline (bookmarks / TOC). Recursive.
193#[derive(Serialize, Clone, Debug)]
194pub struct OutlineItem {
195    pub title: String,
196    pub page: Option<u32>,
197    pub level: u8,
198    pub children: Vec<OutlineItem>,
199}
200
201/// A non-fatal degradation recorded during parsing.
202#[derive(Serialize, Clone, Debug)]
203pub struct Warning {
204    /// The page it occurred on, if page-scoped.
205    pub page: Option<u32>,
206    pub kind: WarningKind,
207    pub detail: String,
208}
209
210/// The category of a [`Warning`].
211#[derive(Serialize, Clone, Debug, PartialEq, Eq)]
212pub enum WarningKind {
213    /// An object could not be parsed and was skipped (see PER-38).
214    MalformedObject,
215    /// A CID font lacked a usable CMap/ToUnicode mapping.
216    MissingCMap,
217    /// The document was decrypted via a fallback path.
218    EncryptedFallback,
219    /// A scanned page has no text layer and needs OCR (pluggable backend).
220    NeedsOcr,
221    /// A source feature is not (yet) supported by the core.
222    Unsupported,
223}
224
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializing the same IR value twice must yield the same JSON string —
    /// the basis of cross-binding parity — and that string must carry the
    /// expected field values.
    #[test]
    fn document_serializes_to_stable_json() {
        // Build the fixture bottom-up: one glyph, on one page, with one warning.
        let glyph = Char {
            text: String::from("A"),
            bbox: BBox { x0: 0.0, y0: 0.0, x1: 10.0, y1: 12.0 },
            font: FontRef { name: String::from("Helvetica") },
            size: 12.0,
            color: None,
        };
        let page = Page {
            index: 0,
            width: 612.0,
            height: 792.0,
            chars: vec![glyph],
            ..Default::default()
        };
        let warning = Warning {
            page: Some(0),
            kind: WarningKind::MissingCMap,
            detail: String::from("font F1 has no ToUnicode"),
        };
        let doc = Document {
            source: SourceKind::Pdf,
            metadata: Metadata { page_count: 1, ..Default::default() },
            pages: vec![page],
            warnings: vec![warning],
            ..Default::default()
        };

        // Same input, same output: serialize the value and a clone of it.
        let first = serde_json::to_string(&doc).expect("IR serializes");
        let second = serde_json::to_string(&doc.clone()).unwrap();
        assert_eq!(first, second);

        // Spot-check that enum tags and text appear as expected.
        assert!(first.contains(r#""source":"Pdf""#));
        assert!(first.contains(r#""kind":"MissingCMap""#));
        assert!(first.contains(r#""text":"A""#));
    }
}