use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::BTreeMap;
use std::path::PathBuf;
5
6pub type JsonMap = BTreeMap<String, Value>;
7
8#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
9pub struct BBox {
10 pub x0: f64,
11 pub top: f64,
12 pub x1: f64,
13 pub bottom: f64,
14}
15
16impl BBox {
17 pub const fn new(x0: f64, top: f64, x1: f64, bottom: f64) -> Self {
18 Self { x0, top, x1, bottom }
19 }
20
21 pub fn width(self) -> f64 {
22 self.x1 - self.x0
23 }
24
25 pub fn height(self) -> f64 {
26 self.bottom - self.top
27 }
28
29 pub fn area(self) -> f64 {
30 self.width() * self.height()
31 }
32
33 pub fn is_valid(self) -> bool {
34 self.x0 <= self.x1 && self.top <= self.bottom
35 }
36
37 pub fn overlaps(self, other: Self) -> bool {
38 !(self.x1 < other.x0 || self.x0 > other.x1 || self.bottom < other.top || self.top > other.bottom)
39 }
40
41 pub fn contains_bbox(self, other: Self) -> bool {
42 self.x0 <= other.x0
43 && self.top <= other.top
44 && self.x1 >= other.x1
45 && self.bottom >= other.bottom
46 }
47
48 pub fn overlap(self, other: Self) -> Option<Self> {
49 let x0 = self.x0.max(other.x0);
50 let top = self.top.max(other.top);
51 let x1 = self.x1.min(other.x1);
52 let bottom = self.bottom.min(other.bottom);
53 if x1 >= x0 && bottom >= top && ((x1 - x0) + (bottom - top) > 0.0) {
54 Some(Self::new(x0, top, x1, bottom))
55 } else {
56 None
57 }
58 }
59
60 pub fn translate(self, dx: f64, dy: f64) -> Self {
61 Self::new(self.x0 + dx, self.top + dy, self.x1 + dx, self.bottom + dy)
62 }
63
64 pub fn as_tuple(self) -> (f64, f64, f64, f64) {
65 (self.x0, self.top, self.x1, self.bottom)
66 }
67
68 pub fn center(self) -> Point {
69 Point::new((self.x0 + self.x1) / 2.0, (self.top + self.bottom) / 2.0)
70 }
71}
72
73impl Default for BBox {
74 fn default() -> Self {
75 Self::new(0.0, 0.0, 0.0, 0.0)
76 }
77}
78
79#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
80pub struct Point {
81 pub x: f64,
82 pub y: f64,
83}
84
85impl Point {
86 pub const fn new(x: f64, y: f64) -> Self {
87 Self { x, y }
88 }
89}
90
91#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
92#[serde(rename_all = "snake_case")]
93pub enum Direction {
94 Ttb,
95 Btt,
96 Ltr,
97 Rtl,
98}
99
100impl Direction {
101 pub fn as_str(self) -> &'static str {
102 match self {
103 Self::Ttb => "ttb",
104 Self::Btt => "btt",
105 Self::Ltr => "ltr",
106 Self::Rtl => "rtl",
107 }
108 }
109
110 pub fn is_horizontal(self) -> bool {
111 matches!(self, Self::Ltr | Self::Rtl)
112 }
113
114 pub fn is_vertical(self) -> bool {
115 matches!(self, Self::Ttb | Self::Btt)
116 }
117}
118
119impl std::str::FromStr for Direction {
120 type Err = crate::Error;
121
122 fn from_str(s: &str) -> crate::Result<Self> {
123 match s {
124 "ttb" => Ok(Self::Ttb),
125 "btt" => Ok(Self::Btt),
126 "ltr" => Ok(Self::Ltr),
127 "rtl" => Ok(Self::Rtl),
128 other => Err(crate::Error::Message(format!("unknown direction: {other}"))),
129 }
130 }
131}
132
133#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
134#[serde(rename_all = "snake_case")]
135pub enum Orientation {
136 Horizontal,
137 Vertical,
138}
139
140impl Orientation {
141 pub fn as_char(self) -> &'static str {
142 match self {
143 Self::Horizontal => "h",
144 Self::Vertical => "v",
145 }
146 }
147}
148
149#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
150pub struct Char {
151 pub object_type: String,
152 pub page_number: usize,
153 pub text: String,
154 pub x0: f64,
155 pub top: f64,
156 pub x1: f64,
157 pub bottom: f64,
158 pub y0: f64,
159 pub y1: f64,
160 pub doctop: f64,
161 pub width: f64,
162 pub height: f64,
163 pub size: f64,
164 pub adv: f64,
165 pub upright: bool,
166 pub fontname: String,
167 pub matrix: [f64; 6],
168 #[serde(skip_serializing_if = "Option::is_none")]
169 pub mcid: Option<i64>,
170 #[serde(skip_serializing_if = "Option::is_none")]
171 pub tag: Option<String>,
172 #[serde(skip_serializing_if = "Option::is_none")]
173 pub ncs: Option<String>,
174 #[serde(skip_serializing_if = "Option::is_none")]
175 pub stroking_color: Option<String>,
176 #[serde(skip_serializing_if = "Option::is_none")]
177 pub non_stroking_color: Option<String>,
178}
179
180#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
181pub struct Word {
182 pub text: String,
183 pub x0: f64,
184 pub top: f64,
185 pub x1: f64,
186 pub bottom: f64,
187 pub doctop: f64,
188 pub width: f64,
189 pub height: f64,
190 pub upright: bool,
191 pub direction: Direction,
192 #[serde(skip_serializing_if = "Option::is_none")]
193 pub chars: Option<Vec<Char>>,
194}
195
196#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
197pub struct Line {
198 pub object_type: String,
199 pub page_number: usize,
200 pub x0: f64,
201 pub top: f64,
202 pub x1: f64,
203 pub bottom: f64,
204 pub y0: f64,
205 pub y1: f64,
206 pub doctop: f64,
207 pub width: f64,
208 pub height: f64,
209 pub pts: Vec<Point>,
210 pub stroke: bool,
211 pub fill: bool,
212 pub linewidth: f64,
213}
214
215#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
216pub enum PathCommand {
217 MoveTo(Point),
218 LineTo(Point),
219 CurveTo { c1: Point, c2: Point, p: Point },
220 Rect { x: f64, y: f64, width: f64, height: f64 },
221 Close,
222}
223
224#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
225pub struct RectObject {
226 pub object_type: String,
227 pub page_number: usize,
228 pub x0: f64,
229 pub top: f64,
230 pub x1: f64,
231 pub bottom: f64,
232 pub y0: f64,
233 pub y1: f64,
234 pub doctop: f64,
235 pub width: f64,
236 pub height: f64,
237 pub pts: Vec<Point>,
238 pub path: Vec<PathCommand>,
239 pub stroke: bool,
240 pub fill: bool,
241 pub linewidth: f64,
242}
243
244#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
245pub struct Curve {
246 pub object_type: String,
247 pub page_number: usize,
248 pub x0: f64,
249 pub top: f64,
250 pub x1: f64,
251 pub bottom: f64,
252 pub y0: f64,
253 pub y1: f64,
254 pub doctop: f64,
255 pub width: f64,
256 pub height: f64,
257 pub pts: Vec<Point>,
258 pub path: Vec<PathCommand>,
259 pub stroke: bool,
260 pub fill: bool,
261 pub linewidth: f64,
262}
263
264#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
265pub struct ImageObject {
266 pub object_type: String,
267 pub page_number: usize,
268 pub x0: f64,
269 pub top: f64,
270 pub x1: f64,
271 pub bottom: f64,
272 pub y0: f64,
273 pub y1: f64,
274 pub doctop: f64,
275 pub width: f64,
276 pub height: f64,
277 pub name: String,
278 pub srcsize: (u32, u32),
279 #[serde(skip_serializing_if = "Option::is_none")]
280 pub bits: Option<i64>,
281 #[serde(skip_serializing_if = "Option::is_none")]
282 pub colorspace: Option<String>,
283 #[serde(skip_serializing_if = "Option::is_none")]
284 pub imagemask: Option<bool>,
285 #[serde(skip_serializing_if = "Option::is_none")]
286 pub mcid: Option<i64>,
287 #[serde(skip_serializing_if = "Option::is_none")]
288 pub tag: Option<String>,
289}
290
291#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
292pub struct Annotation {
293 pub object_type: String,
294 pub page_number: usize,
295 pub x0: f64,
296 pub top: f64,
297 pub x1: f64,
298 pub bottom: f64,
299 pub y0: f64,
300 pub y1: f64,
301 pub doctop: f64,
302 pub width: f64,
303 pub height: f64,
304 pub subtype: String,
305 #[serde(skip_serializing_if = "Option::is_none")]
306 pub uri: Option<String>,
307 #[serde(skip_serializing_if = "Option::is_none")]
308 pub title: Option<String>,
309 #[serde(skip_serializing_if = "Option::is_none")]
310 pub contents: Option<String>,
311}
312
313#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
314pub struct Hyperlink {
315 pub object_type: String,
316 pub page_number: usize,
317 pub x0: f64,
318 pub top: f64,
319 pub x1: f64,
320 pub bottom: f64,
321 pub y0: f64,
322 pub y1: f64,
323 pub doctop: f64,
324 pub width: f64,
325 pub height: f64,
326 pub uri: String,
327}
328
329#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
330pub struct Edge {
331 pub x0: f64,
332 pub top: f64,
333 pub x1: f64,
334 pub bottom: f64,
335 pub width: f64,
336 pub height: f64,
337 pub orientation: Orientation,
338 pub object_type: String,
339}
340
341#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
342pub struct ObjectCounts {
343 pub chars: usize,
344 pub lines: usize,
345 pub rects: usize,
346 pub curves: usize,
347 pub images: usize,
348 pub annots: usize,
349 pub hyperlinks: usize,
350}
351
352#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
353pub struct SearchMatch {
354 pub text: String,
355 pub x0: f64,
356 pub top: f64,
357 pub x1: f64,
358 pub bottom: f64,
359 #[serde(skip_serializing_if = "Option::is_none")]
360 pub groups: Option<Vec<Option<String>>>,
361 #[serde(skip_serializing_if = "Option::is_none")]
362 pub chars: Option<Vec<Char>>,
363}
364
365#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
366pub struct TextLine {
367 pub text: String,
368 pub x0: f64,
369 pub top: f64,
370 pub x1: f64,
371 pub bottom: f64,
372 #[serde(skip_serializing_if = "Option::is_none")]
373 pub chars: Option<Vec<Char>>,
374}
375
376#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
377pub struct LayoutObject {
378 pub object_type: String,
379 pub page_number: usize,
380 pub x0: f64,
381 pub top: f64,
382 pub x1: f64,
383 pub bottom: f64,
384 pub width: f64,
385 pub height: f64,
386 #[serde(skip_serializing_if = "Option::is_none")]
387 pub text: Option<String>,
388 #[serde(skip_serializing_if = "Option::is_none")]
389 pub direction: Option<Direction>,
390 #[serde(skip_serializing_if = "Option::is_none")]
391 pub upright: Option<bool>,
392 #[serde(default, skip_serializing_if = "Vec::is_empty")]
393 pub children: Vec<LayoutObject>,
394}
395
396impl LayoutObject {
397 pub fn bbox(&self) -> BBox {
398 BBox::new(self.x0, self.top, self.x1, self.bottom)
399 }
400}
401
402#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
403pub struct PageLayout {
404 pub page_number: usize,
405 pub bbox: BBox,
406 pub objects: Vec<LayoutObject>,
407}
408
409#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
410pub struct StructureElement {
411 pub kind: String,
412 #[serde(skip_serializing_if = "Option::is_none")]
413 pub title: Option<String>,
414 #[serde(skip_serializing_if = "Option::is_none")]
415 pub alt: Option<String>,
416 #[serde(skip_serializing_if = "Option::is_none")]
417 pub page_number: Option<usize>,
418 #[serde(skip_serializing_if = "Option::is_none")]
419 pub mcid: Option<i64>,
420 #[serde(default, skip_serializing_if = "Vec::is_empty")]
421 pub children: Vec<StructureElement>,
422}
423
424#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
425pub struct Page {
426 pub page_number: usize,
427 pub rotation: i32,
428 pub width: f64,
429 pub height: f64,
430 pub bbox: BBox,
431 pub mediabox: BBox,
432 pub cropbox: BBox,
433 #[serde(skip_serializing_if = "Option::is_none")]
434 pub trimbox: Option<BBox>,
435 #[serde(skip_serializing_if = "Option::is_none")]
436 pub bleedbox: Option<BBox>,
437 #[serde(skip_serializing_if = "Option::is_none")]
438 pub artbox: Option<BBox>,
439 pub doctop_offset: f64,
440 pub is_original: bool,
441 pub chars: Vec<Char>,
442 pub lines: Vec<Line>,
443 pub rects: Vec<RectObject>,
444 pub curves: Vec<Curve>,
445 pub images: Vec<ImageObject>,
446 pub annots: Vec<Annotation>,
447 pub hyperlinks: Vec<Hyperlink>,
448 #[serde(skip_serializing_if = "Option::is_none")]
449 pub structure_tree: Option<StructureElement>,
450}
451
452#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
453pub struct PdfDocument {
454 pub path: PathBuf,
455 pub pages: Vec<Page>,
456 #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
457 pub metadata: JsonMap,
458 #[serde(skip_serializing_if = "Option::is_none")]
459 pub structure_tree: Option<StructureElement>,
460}
461
462pub enum PageObjectRef<'a> {
463 Char(&'a Char),
464 Line(&'a Line),
465 Rect(&'a RectObject),
466 Curve(&'a Curve),
467 Image(&'a ImageObject),
468 Annot(&'a Annotation),
469 Hyperlink(&'a Hyperlink),
470}
471
472pub trait Bounded: Clone {
473 fn bbox(&self) -> BBox;
474 fn with_bbox(&self, bbox: BBox, page_height: f64) -> Self;
475}
476
// Implements `Bounded` for a struct that has the fields `x0`, `top`, `x1`, `bottom`,
// `width`, `height`, `y0`, `y1` and `doctop` (all `f64`) and is `Clone`.
macro_rules! impl_bounded {
    ($ty:ty) => {
        impl Bounded for $ty {
            fn bbox(&self) -> BBox {
                BBox::new(self.x0, self.top, self.x1, self.bottom)
            }

            fn with_bbox(&self, bbox: BBox, page_height: f64) -> Self {
                let mut copy = self.clone();
                copy.x0 = bbox.x0;
                copy.top = bbox.top;
                copy.x1 = bbox.x1;
                copy.bottom = bbox.bottom;
                copy.width = bbox.width();
                copy.height = bbox.height();
                // `y0`/`y1` are the same edges measured from the opposite side of the page.
                copy.y0 = page_height - bbox.bottom;
                copy.y1 = page_height - bbox.top;
                // Keep the existing `doctop - top` offset while moving `top`.
                copy.doctop = (self.doctop - self.top) + bbox.top;
                copy
            }
        }
    };
}
500
501impl_bounded!(Char);
502impl_bounded!(Line);
503impl_bounded!(RectObject);
504impl_bounded!(Curve);
505impl_bounded!(ImageObject);
506impl_bounded!(Annotation);
507impl_bounded!(Hyperlink);
508
509impl Bounded for Word {
510 fn bbox(&self) -> BBox {
511 BBox::new(self.x0, self.top, self.x1, self.bottom)
512 }
513
514 fn with_bbox(&self, bbox: BBox, _page_height: f64) -> Self {
515 let mut copy = self.clone();
516 copy.x0 = bbox.x0;
517 copy.top = bbox.top;
518 copy.x1 = bbox.x1;
519 copy.bottom = bbox.bottom;
520 copy.width = bbox.width();
521 copy.height = bbox.height();
522 copy.doctop = (self.doctop - self.top) + bbox.top;
523 copy
524 }
525}
526
527impl Bounded for Edge {
528 fn bbox(&self) -> BBox {
529 BBox::new(self.x0, self.top, self.x1, self.bottom)
530 }
531
532 fn with_bbox(&self, bbox: BBox, _page_height: f64) -> Self {
533 let mut copy = self.clone();
534 copy.x0 = bbox.x0;
535 copy.top = bbox.top;
536 copy.x1 = bbox.x1;
537 copy.bottom = bbox.bottom;
538 copy.width = bbox.width();
539 copy.height = bbox.height();
540 copy
541 }
542}
543
544impl Bounded for LayoutObject {
545 fn bbox(&self) -> BBox {
546 BBox::new(self.x0, self.top, self.x1, self.bottom)
547 }
548
549 fn with_bbox(&self, bbox: BBox, _page_height: f64) -> Self {
550 let mut copy = self.clone();
551 copy.x0 = bbox.x0;
552 copy.top = bbox.top;
553 copy.x1 = bbox.x1;
554 copy.bottom = bbox.bottom;
555 copy.width = bbox.width();
556 copy.height = bbox.height();
557 copy
558 }
559}