// pdfsink_rs/types.rs

use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::BTreeMap;
use std::path::PathBuf;

6pub type JsonMap = BTreeMap<String, Value>;
7
8#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
9pub struct BBox {
10    pub x0: f64,
11    pub top: f64,
12    pub x1: f64,
13    pub bottom: f64,
14}
15
16impl BBox {
17    pub const fn new(x0: f64, top: f64, x1: f64, bottom: f64) -> Self {
18        Self { x0, top, x1, bottom }
19    }
20
21    pub fn width(self) -> f64 {
22        self.x1 - self.x0
23    }
24
25    pub fn height(self) -> f64 {
26        self.bottom - self.top
27    }
28
29    pub fn area(self) -> f64 {
30        self.width() * self.height()
31    }
32
33    pub fn is_valid(self) -> bool {
34        self.x0 <= self.x1 && self.top <= self.bottom
35    }
36
37    pub fn overlaps(self, other: Self) -> bool {
38        !(self.x1 < other.x0 || self.x0 > other.x1 || self.bottom < other.top || self.top > other.bottom)
39    }
40
41    pub fn contains_bbox(self, other: Self) -> bool {
42        self.x0 <= other.x0
43            && self.top <= other.top
44            && self.x1 >= other.x1
45            && self.bottom >= other.bottom
46    }
47
48    pub fn overlap(self, other: Self) -> Option<Self> {
49        let x0 = self.x0.max(other.x0);
50        let top = self.top.max(other.top);
51        let x1 = self.x1.min(other.x1);
52        let bottom = self.bottom.min(other.bottom);
53        if x1 >= x0 && bottom >= top && ((x1 - x0) + (bottom - top) > 0.0) {
54            Some(Self::new(x0, top, x1, bottom))
55        } else {
56            None
57        }
58    }
59
60    pub fn translate(self, dx: f64, dy: f64) -> Self {
61        Self::new(self.x0 + dx, self.top + dy, self.x1 + dx, self.bottom + dy)
62    }
63
64    pub fn as_tuple(self) -> (f64, f64, f64, f64) {
65        (self.x0, self.top, self.x1, self.bottom)
66    }
67
68    pub fn center(self) -> Point {
69        Point::new((self.x0 + self.x1) / 2.0, (self.top + self.bottom) / 2.0)
70    }
71}
72
73impl Default for BBox {
74    fn default() -> Self {
75        Self::new(0.0, 0.0, 0.0, 0.0)
76    }
77}
78
79#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
80pub struct Point {
81    pub x: f64,
82    pub y: f64,
83}
84
85impl Point {
86    pub const fn new(x: f64, y: f64) -> Self {
87        Self { x, y }
88    }
89}
90
91#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
92#[serde(rename_all = "snake_case")]
93pub enum Direction {
94    Ttb,
95    Btt,
96    Ltr,
97    Rtl,
98}
99
100impl Direction {
101    pub fn as_str(self) -> &'static str {
102        match self {
103            Self::Ttb => "ttb",
104            Self::Btt => "btt",
105            Self::Ltr => "ltr",
106            Self::Rtl => "rtl",
107        }
108    }
109
110    pub fn is_horizontal(self) -> bool {
111        matches!(self, Self::Ltr | Self::Rtl)
112    }
113
114    pub fn is_vertical(self) -> bool {
115        matches!(self, Self::Ttb | Self::Btt)
116    }
117}
118
119impl std::str::FromStr for Direction {
120    type Err = crate::Error;
121
122    fn from_str(s: &str) -> crate::Result<Self> {
123        match s {
124            "ttb" => Ok(Self::Ttb),
125            "btt" => Ok(Self::Btt),
126            "ltr" => Ok(Self::Ltr),
127            "rtl" => Ok(Self::Rtl),
128            other => Err(crate::Error::Message(format!("unknown direction: {other}"))),
129        }
130    }
131}
132
133#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
134#[serde(rename_all = "snake_case")]
135pub enum Orientation {
136    Horizontal,
137    Vertical,
138}
139
140impl Orientation {
141    pub fn as_char(self) -> &'static str {
142        match self {
143            Self::Horizontal => "h",
144            Self::Vertical => "v",
145        }
146    }
147}
148
149#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
150pub struct Char {
151    pub object_type: String,
152    pub page_number: usize,
153    pub text: String,
154    pub x0: f64,
155    pub top: f64,
156    pub x1: f64,
157    pub bottom: f64,
158    pub y0: f64,
159    pub y1: f64,
160    pub doctop: f64,
161    pub width: f64,
162    pub height: f64,
163    pub size: f64,
164    pub adv: f64,
165    pub upright: bool,
166    pub fontname: String,
167    pub matrix: [f64; 6],
168    #[serde(skip_serializing_if = "Option::is_none")]
169    pub mcid: Option<i64>,
170    #[serde(skip_serializing_if = "Option::is_none")]
171    pub tag: Option<String>,
172    #[serde(skip_serializing_if = "Option::is_none")]
173    pub ncs: Option<String>,
174    #[serde(skip_serializing_if = "Option::is_none")]
175    pub stroking_color: Option<String>,
176    #[serde(skip_serializing_if = "Option::is_none")]
177    pub non_stroking_color: Option<String>,
178}
179
180#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
181pub struct Word {
182    pub text: String,
183    pub x0: f64,
184    pub top: f64,
185    pub x1: f64,
186    pub bottom: f64,
187    pub doctop: f64,
188    pub width: f64,
189    pub height: f64,
190    pub upright: bool,
191    pub direction: Direction,
192    #[serde(skip_serializing_if = "Option::is_none")]
193    pub chars: Option<Vec<Char>>,
194}
195
196#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
197pub struct Line {
198    pub object_type: String,
199    pub page_number: usize,
200    pub x0: f64,
201    pub top: f64,
202    pub x1: f64,
203    pub bottom: f64,
204    pub y0: f64,
205    pub y1: f64,
206    pub doctop: f64,
207    pub width: f64,
208    pub height: f64,
209    pub pts: Vec<Point>,
210    pub stroke: bool,
211    pub fill: bool,
212    pub linewidth: f64,
213}
214
215#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
216pub enum PathCommand {
217    MoveTo(Point),
218    LineTo(Point),
219    CurveTo { c1: Point, c2: Point, p: Point },
220    Rect { x: f64, y: f64, width: f64, height: f64 },
221    Close,
222}
223
224#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
225pub struct RectObject {
226    pub object_type: String,
227    pub page_number: usize,
228    pub x0: f64,
229    pub top: f64,
230    pub x1: f64,
231    pub bottom: f64,
232    pub y0: f64,
233    pub y1: f64,
234    pub doctop: f64,
235    pub width: f64,
236    pub height: f64,
237    pub pts: Vec<Point>,
238    pub path: Vec<PathCommand>,
239    pub stroke: bool,
240    pub fill: bool,
241    pub linewidth: f64,
242}
243
244#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
245pub struct Curve {
246    pub object_type: String,
247    pub page_number: usize,
248    pub x0: f64,
249    pub top: f64,
250    pub x1: f64,
251    pub bottom: f64,
252    pub y0: f64,
253    pub y1: f64,
254    pub doctop: f64,
255    pub width: f64,
256    pub height: f64,
257    pub pts: Vec<Point>,
258    pub path: Vec<PathCommand>,
259    pub stroke: bool,
260    pub fill: bool,
261    pub linewidth: f64,
262}
263
264#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
265pub struct ImageObject {
266    pub object_type: String,
267    pub page_number: usize,
268    pub x0: f64,
269    pub top: f64,
270    pub x1: f64,
271    pub bottom: f64,
272    pub y0: f64,
273    pub y1: f64,
274    pub doctop: f64,
275    pub width: f64,
276    pub height: f64,
277    pub name: String,
278    pub srcsize: (u32, u32),
279    #[serde(skip_serializing_if = "Option::is_none")]
280    pub bits: Option<i64>,
281    #[serde(skip_serializing_if = "Option::is_none")]
282    pub colorspace: Option<String>,
283    #[serde(skip_serializing_if = "Option::is_none")]
284    pub imagemask: Option<bool>,
285    #[serde(skip_serializing_if = "Option::is_none")]
286    pub mcid: Option<i64>,
287    #[serde(skip_serializing_if = "Option::is_none")]
288    pub tag: Option<String>,
289}
290
291#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
292pub struct Annotation {
293    pub object_type: String,
294    pub page_number: usize,
295    pub x0: f64,
296    pub top: f64,
297    pub x1: f64,
298    pub bottom: f64,
299    pub y0: f64,
300    pub y1: f64,
301    pub doctop: f64,
302    pub width: f64,
303    pub height: f64,
304    pub subtype: String,
305    #[serde(skip_serializing_if = "Option::is_none")]
306    pub uri: Option<String>,
307    #[serde(skip_serializing_if = "Option::is_none")]
308    pub title: Option<String>,
309    #[serde(skip_serializing_if = "Option::is_none")]
310    pub contents: Option<String>,
311}
312
313#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
314pub struct Hyperlink {
315    pub object_type: String,
316    pub page_number: usize,
317    pub x0: f64,
318    pub top: f64,
319    pub x1: f64,
320    pub bottom: f64,
321    pub y0: f64,
322    pub y1: f64,
323    pub doctop: f64,
324    pub width: f64,
325    pub height: f64,
326    pub uri: String,
327}
328
329#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
330pub struct Edge {
331    pub x0: f64,
332    pub top: f64,
333    pub x1: f64,
334    pub bottom: f64,
335    pub width: f64,
336    pub height: f64,
337    pub orientation: Orientation,
338    pub object_type: String,
339}
340
341#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
342pub struct ObjectCounts {
343    pub chars: usize,
344    pub lines: usize,
345    pub rects: usize,
346    pub curves: usize,
347    pub images: usize,
348    pub annots: usize,
349    pub hyperlinks: usize,
350}
351
352#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
353pub struct SearchMatch {
354    pub text: String,
355    pub x0: f64,
356    pub top: f64,
357    pub x1: f64,
358    pub bottom: f64,
359    #[serde(skip_serializing_if = "Option::is_none")]
360    pub groups: Option<Vec<Option<String>>>,
361    #[serde(skip_serializing_if = "Option::is_none")]
362    pub chars: Option<Vec<Char>>,
363}
364
365#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
366pub struct TextLine {
367    pub text: String,
368    pub x0: f64,
369    pub top: f64,
370    pub x1: f64,
371    pub bottom: f64,
372    #[serde(skip_serializing_if = "Option::is_none")]
373    pub chars: Option<Vec<Char>>,
374}
375
376#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
377pub struct LayoutObject {
378    pub object_type: String,
379    pub page_number: usize,
380    pub x0: f64,
381    pub top: f64,
382    pub x1: f64,
383    pub bottom: f64,
384    pub width: f64,
385    pub height: f64,
386    #[serde(skip_serializing_if = "Option::is_none")]
387    pub text: Option<String>,
388    #[serde(skip_serializing_if = "Option::is_none")]
389    pub direction: Option<Direction>,
390    #[serde(skip_serializing_if = "Option::is_none")]
391    pub upright: Option<bool>,
392    #[serde(default, skip_serializing_if = "Vec::is_empty")]
393    pub children: Vec<LayoutObject>,
394}
395
396impl LayoutObject {
397    pub fn bbox(&self) -> BBox {
398        BBox::new(self.x0, self.top, self.x1, self.bottom)
399    }
400}
401
402#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
403pub struct PageLayout {
404    pub page_number: usize,
405    pub bbox: BBox,
406    pub objects: Vec<LayoutObject>,
407}
408
409#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
410pub struct StructureElement {
411    pub kind: String,
412    #[serde(skip_serializing_if = "Option::is_none")]
413    pub title: Option<String>,
414    #[serde(skip_serializing_if = "Option::is_none")]
415    pub alt: Option<String>,
416    #[serde(skip_serializing_if = "Option::is_none")]
417    pub page_number: Option<usize>,
418    #[serde(skip_serializing_if = "Option::is_none")]
419    pub mcid: Option<i64>,
420    #[serde(default, skip_serializing_if = "Vec::is_empty")]
421    pub children: Vec<StructureElement>,
422}
423
424#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
425pub struct Page {
426    pub page_number: usize,
427    pub rotation: i32,
428    pub width: f64,
429    pub height: f64,
430    pub bbox: BBox,
431    pub mediabox: BBox,
432    pub cropbox: BBox,
433    #[serde(skip_serializing_if = "Option::is_none")]
434    pub trimbox: Option<BBox>,
435    #[serde(skip_serializing_if = "Option::is_none")]
436    pub bleedbox: Option<BBox>,
437    #[serde(skip_serializing_if = "Option::is_none")]
438    pub artbox: Option<BBox>,
439    pub doctop_offset: f64,
440    pub is_original: bool,
441    pub chars: Vec<Char>,
442    pub lines: Vec<Line>,
443    pub rects: Vec<RectObject>,
444    pub curves: Vec<Curve>,
445    pub images: Vec<ImageObject>,
446    pub annots: Vec<Annotation>,
447    pub hyperlinks: Vec<Hyperlink>,
448    #[serde(skip_serializing_if = "Option::is_none")]
449    pub structure_tree: Option<StructureElement>,
450}
451
452#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
453pub struct PdfDocument {
454    pub path: PathBuf,
455    pub pages: Vec<Page>,
456    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
457    pub metadata: JsonMap,
458    #[serde(skip_serializing_if = "Option::is_none")]
459    pub structure_tree: Option<StructureElement>,
460}
461
462pub enum PageObjectRef<'a> {
463    Char(&'a Char),
464    Line(&'a Line),
465    Rect(&'a RectObject),
466    Curve(&'a Curve),
467    Image(&'a ImageObject),
468    Annot(&'a Annotation),
469    Hyperlink(&'a Hyperlink),
470}
471
472pub trait Bounded: Clone {
473    fn bbox(&self) -> BBox;
474    fn with_bbox(&self, bbox: BBox, page_height: f64) -> Self;
475}
476
// Implements `Bounded` for a page-object struct that has the fields
// `x0`, `top`, `x1`, `bottom`, `width`, `height`, `y0`, `y1` and `doctop`.
macro_rules! impl_bounded {
    ($ty:ty) => {
        impl Bounded for $ty {
            fn bbox(&self) -> BBox {
                BBox::new(self.x0, self.top, self.x1, self.bottom)
            }

            fn with_bbox(&self, bbox: BBox, page_height: f64) -> Self {
                Self {
                    x0: bbox.x0,
                    top: bbox.top,
                    x1: bbox.x1,
                    bottom: bbox.bottom,
                    width: bbox.width(),
                    height: bbox.height(),
                    // `y0`/`y1` are measured from the page bottom, so they flip.
                    y0: page_height - bbox.bottom,
                    y1: page_height - bbox.top,
                    // Keep the existing `doctop - top` offset.
                    doctop: (self.doctop - self.top) + bbox.top,
                    // Every other field is copied unchanged.
                    ..self.clone()
                }
            }
        }
    };
}

501impl_bounded!(Char);
502impl_bounded!(Line);
503impl_bounded!(RectObject);
504impl_bounded!(Curve);
505impl_bounded!(ImageObject);
506impl_bounded!(Annotation);
507impl_bounded!(Hyperlink);
508
509impl Bounded for Word {
510    fn bbox(&self) -> BBox {
511        BBox::new(self.x0, self.top, self.x1, self.bottom)
512    }
513
514    fn with_bbox(&self, bbox: BBox, _page_height: f64) -> Self {
515        let mut copy = self.clone();
516        copy.x0 = bbox.x0;
517        copy.top = bbox.top;
518        copy.x1 = bbox.x1;
519        copy.bottom = bbox.bottom;
520        copy.width = bbox.width();
521        copy.height = bbox.height();
522        copy.doctop = (self.doctop - self.top) + bbox.top;
523        copy
524    }
525}
526
527impl Bounded for Edge {
528    fn bbox(&self) -> BBox {
529        BBox::new(self.x0, self.top, self.x1, self.bottom)
530    }
531
532    fn with_bbox(&self, bbox: BBox, _page_height: f64) -> Self {
533        let mut copy = self.clone();
534        copy.x0 = bbox.x0;
535        copy.top = bbox.top;
536        copy.x1 = bbox.x1;
537        copy.bottom = bbox.bottom;
538        copy.width = bbox.width();
539        copy.height = bbox.height();
540        copy
541    }
542}
543
544impl Bounded for LayoutObject {
545    fn bbox(&self) -> BBox {
546        BBox::new(self.x0, self.top, self.x1, self.bottom)
547    }
548
549    fn with_bbox(&self, bbox: BBox, _page_height: f64) -> Self {
550        let mut copy = self.clone();
551        copy.x0 = bbox.x0;
552        copy.top = bbox.top;
553        copy.x1 = bbox.x1;
554        copy.bottom = bbox.bottom;
555        copy.width = bbox.width();
556        copy.height = bbox.height();
557        copy
558    }
559}