Skip to main content

spdf_types/
text.rs

//! Document / text data model.
//!
//! Serde attributes keep the JSON wire format byte-compatible with
//! `liteparse`'s `ParseResultJson`.
5
6use serde::{Deserialize, Serialize};
7
8/// Markup annotation data associated with a text item.
9#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
10#[serde(rename_all = "camelCase")]
11pub struct MarkupData {
12    #[serde(skip_serializing_if = "Option::is_none")]
13    pub highlight: Option<String>,
14    #[serde(skip_serializing_if = "Option::is_none")]
15    pub underline: Option<bool>,
16    #[serde(skip_serializing_if = "Option::is_none")]
17    pub squiggly: Option<bool>,
18    #[serde(skip_serializing_if = "Option::is_none")]
19    pub strikeout: Option<bool>,
20}
21
22/// An individual text element extracted from a page.
23///
24/// Matches `TextItem` in [`liteparse/src/core/types.ts`]. Coordinates use PDF
25/// points, top-left origin, y increasing downward.
26#[derive(Debug, Clone, Serialize, Deserialize)]
27#[serde(rename_all = "camelCase")]
28pub struct TextItem {
29    pub str: String,
30    pub x: f64,
31    pub y: f64,
32    pub width: f64,
33    pub height: f64,
34    pub w: f64,
35    pub h: f64,
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub font_name: Option<String>,
38    #[serde(skip_serializing_if = "Option::is_none")]
39    pub font_size: Option<f64>,
40    #[serde(skip_serializing_if = "Option::is_none")]
41    pub r: Option<i32>,
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub rx: Option<f64>,
44    #[serde(skip_serializing_if = "Option::is_none")]
45    pub ry: Option<f64>,
46    #[serde(skip_serializing_if = "Option::is_none")]
47    pub markup: Option<MarkupData>,
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub vgap: Option<bool>,
50    #[serde(skip_serializing_if = "Option::is_none")]
51    pub is_placeholder: Option<bool>,
52    #[serde(skip_serializing_if = "Option::is_none")]
53    pub confidence: Option<f64>,
54}
55
56impl TextItem {
57    /// Create a `TextItem` with sensible defaults. `w`/`h` are kept in lockstep
58    /// with `width`/`height`, matching liteparse's aliasing convention.
59    pub fn new(str: impl Into<String>, x: f64, y: f64, width: f64, height: f64) -> Self {
60        let s = str.into();
61        Self {
62            str: s,
63            x,
64            y,
65            width,
66            height,
67            w: width,
68            h: height,
69            font_name: None,
70            font_size: None,
71            r: None,
72            rx: None,
73            ry: None,
74            markup: None,
75            vgap: None,
76            is_placeholder: None,
77            confidence: None,
78        }
79    }
80}
81
82/// Snap alignment for a projection box.
83#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
84#[serde(rename_all = "lowercase")]
85pub enum Snap {
86    Left,
87    Right,
88    Center,
89}
90
91/// A working copy of a text item used during grid projection. Separate from
92/// `TextItem` because it carries algorithm-internal metadata.
93#[derive(Debug, Clone)]
94pub struct ProjectionTextBox {
95    pub str: String,
96    pub x: f64,
97    pub y: f64,
98    pub w: f64,
99    pub h: f64,
100    pub rx: Option<f64>,
101    pub ry: Option<f64>,
102    pub r: Option<i32>,
103    pub str_length: usize,
104    pub markup: Option<MarkupData>,
105    pub page_bbox: Option<Coordinates>,
106    pub vgap: bool,
107    pub is_placeholder: bool,
108    pub from_ocr: bool,
109
110    pub snap: Option<Snap>,
111    pub left_anchor: Option<String>,
112    pub right_anchor: Option<String>,
113    pub center_anchor: Option<String>,
114    pub is_dup: bool,
115    pub rendered: bool,
116    pub is_margin_line_number: bool,
117    pub should_space: Option<f64>,
118    pub force_unsnapped: bool,
119    pub rotated: bool,
120    pub d: Option<f64>,
121    pub font_name: Option<String>,
122    pub font_size: Option<f64>,
123    pub confidence: Option<f64>,
124}
125
126/// A rectangle defined by position and dimensions.
127#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
128#[serde(rename_all = "camelCase")]
129pub struct Coordinates {
130    pub x: f64,
131    pub y: f64,
132    pub w: f64,
133    pub h: f64,
134}
135
136/// Axis-aligned bounding box defined by top-left and bottom-right corners.
137///
138/// Deprecated in the TypeScript source; kept for byte-level JSON parity.
139#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
140#[serde(rename_all = "camelCase")]
141pub struct BoundingBox {
142    pub x1: f64,
143    pub y1: f64,
144    pub x2: f64,
145    pub y2: f64,
146}
147
148/// Raw OCR detection result before conversion to [`TextItem`].
149#[derive(Debug, Clone, Serialize, Deserialize)]
150#[serde(rename_all = "camelCase")]
151pub struct OcrData {
152    pub x: f64,
153    pub y: f64,
154    pub w: f64,
155    pub h: f64,
156    pub confidence: f64,
157    pub text: String,
158}
159
160/// Parsed data for a single page.
161#[derive(Debug, Clone, Serialize, Deserialize)]
162#[serde(rename_all = "camelCase")]
163pub struct ParsedPage {
164    pub page_num: u32,
165    pub width: f64,
166    pub height: f64,
167    pub text: String,
168    pub text_items: Vec<TextItem>,
169    #[serde(skip_serializing_if = "Option::is_none")]
170    pub bounding_boxes: Option<Vec<BoundingBox>>,
171}
172
173/// A text element from JSON output.
174#[derive(Debug, Clone, Serialize, Deserialize)]
175#[serde(rename_all = "camelCase")]
176pub struct JsonTextItem {
177    pub text: String,
178    pub x: f64,
179    pub y: f64,
180    pub width: f64,
181    pub height: f64,
182    #[serde(skip_serializing_if = "Option::is_none")]
183    pub font_name: Option<String>,
184    #[serde(skip_serializing_if = "Option::is_none")]
185    pub font_size: Option<f64>,
186    #[serde(skip_serializing_if = "Option::is_none")]
187    pub confidence: Option<f64>,
188}
189
190/// One page of the JSON output.
191#[derive(Debug, Clone, Serialize, Deserialize)]
192#[serde(rename_all = "camelCase")]
193pub struct JsonPage {
194    pub page: u32,
195    pub width: f64,
196    pub height: f64,
197    pub text: String,
198    pub text_items: Vec<JsonTextItem>,
199    pub bounding_boxes: Vec<BoundingBox>,
200}
201
202/// Structured JSON output. Returned when `output_format == Json`.
203#[derive(Debug, Clone, Serialize, Deserialize)]
204pub struct ParseResultJson {
205    pub pages: Vec<JsonPage>,
206}
207
208/// The result of parsing a document.
209#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct ParseResult {
211    pub pages: Vec<ParsedPage>,
212    pub text: String,
213    #[serde(skip_serializing_if = "Option::is_none")]
214    pub json: Option<ParseResultJson>,
215}
216
/// Embedded image bounds, used for selective OCR.
#[derive(Debug, Clone, PartialEq)]
pub struct Image {
    /// Horizontal position of the image.
    pub x: f64,
    /// Vertical position of the image.
    pub y: f64,
    /// Width of the image.
    pub width: f64,
    /// Height of the image.
    pub height: f64,
    /// Decoded image bytes in memory. When present, OCR runs directly on these
    /// bytes; otherwise the engine re-renders the region from the PDF.
    pub data: Option<Vec<u8>>,
    /// Optional scale factor.
    /// NOTE(review): what it is relative to is not visible here — confirm.
    pub scale_factor: Option<f64>,
    /// Optional original orientation angle.
    /// NOTE(review): presumably in degrees — confirm.
    pub original_orientation_angle: Option<i32>,
    /// Optional image type label.
    /// NOTE(review): the set of accepted values is not visible here.
    pub image_type: Option<String>,
}
231
/// Result of rendering a page to an image.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScreenshotResult {
    /// Number of the rendered page.
    pub page_num: u32,
    /// Width of the rendered image.
    pub width: u32,
    /// Height of the rendered image.
    pub height: u32,
    /// Bytes of the rendered image.
    /// NOTE(review): encoded vs. raw pixel format is not visible here.
    pub image_buffer: Vec<u8>,
    /// Filesystem path associated with the image, if any.
    /// NOTE(review): presumably where the image was written — confirm.
    pub image_path: Option<std::path::PathBuf>,
}