Skip to main content

pdfsink_rs/
lib.rs

1mod clustering;
2mod container_api;
3mod display;
4mod error;
5mod geometry;
6mod layout;
7mod parse;
8mod table;
9mod text;
10mod types;
11
12pub use display::{HasBBox, HasCenter, HasLineSegments, PageImage, RenderOptions, RgbaColor};
13pub use error::{Error, Result};
14pub use parse::open_pdf;
15pub use table::{ExplicitLine, Table, TableFinder, TableSettings, TableStrategy};
16pub use text::{
17    chars_to_textmap, dedupe_chars, extract_text, extract_text_lines, extract_text_simple,
18    extract_words, DedupeOptions, SearchOptions, TextMap, TextOptions, WordExtractor, WordMap,
19};
20pub use types::{
21    Annotation, BBox, Char, Curve, Direction, Edge, Hyperlink, ImageObject, JsonMap, LayoutObject,
22    Line, ObjectCounts, Orientation, Page, PageLayout, PageObjectRef, PathCommand, PdfDocument,
23    Point, RectObject, SearchMatch, StructureElement, TextLine, Word,
24};
25
26pub type PDF = PdfDocument;
27
28use geometry::{crop_objects, outside_objects, test_proposed_bbox, within_objects};
29
30impl PdfDocument {
31    pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
32        open_pdf(path)
33    }
34
35    pub fn len(&self) -> usize {
36        self.pages.len()
37    }
38
39    pub fn is_empty(&self) -> bool {
40        self.pages.is_empty()
41    }
42
43    pub fn page(&self, page_number: usize) -> Result<&Page> {
44        if page_number == 0 || page_number > self.pages.len() {
45            return Err(Error::InvalidPage { page_number });
46        }
47        Ok(&self.pages[page_number - 1])
48    }
49
50    pub fn pages(&self) -> &[Page] {
51        &self.pages
52    }
53}
54
55impl Page {
56    pub fn object_counts(&self) -> ObjectCounts {
57        ObjectCounts {
58            chars: self.chars.len(),
59            lines: self.lines.len(),
60            rects: self.rects.len(),
61            curves: self.curves.len(),
62            images: self.images.len(),
63            annots: self.annots.len(),
64            hyperlinks: self.hyperlinks.len(),
65        }
66    }
67
68    pub fn extract_text(&self) -> String {
69        extract_text(&self.chars, &self.default_text_options())
70    }
71
72    pub fn extract_text_with_options(&self, options: &TextOptions) -> String {
73        extract_text(&self.chars, options)
74    }
75
76    pub fn extract_text_simple(&self) -> String {
77        extract_text_simple(&self.chars, 3.0, 3.0)
78    }
79
80    pub fn extract_text_simple_with_tolerance(&self, x_tolerance: f64, y_tolerance: f64) -> String {
81        extract_text_simple(&self.chars, x_tolerance, y_tolerance)
82    }
83
84    pub fn extract_words(&self) -> Vec<Word> {
85        extract_words(&self.chars, &self.default_text_options(), false)
86    }
87
88    pub fn extract_words_with_options(&self, options: &TextOptions, return_chars: bool) -> Vec<Word> {
89        extract_words(&self.chars, options, return_chars)
90    }
91
92    pub fn extract_text_lines(&self, strip: bool, return_chars: bool) -> Vec<TextLine> {
93        extract_text_lines(&self.chars, &self.default_text_options(), strip, return_chars)
94    }
95
96    pub fn search(&self, pattern: &str) -> Result<Vec<SearchMatch>> {
97        let textmap = chars_to_textmap(&self.chars, &self.default_text_options());
98        textmap.search(pattern, &SearchOptions::default())
99    }
100
101    pub fn search_with_options(&self, pattern: &str, options: &SearchOptions, text_options: &TextOptions) -> Result<Vec<SearchMatch>> {
102        let textmap = chars_to_textmap(&self.chars, text_options);
103        textmap.search(pattern, options)
104    }
105
106    pub fn crop(&self, bbox: BBox, relative: bool, strict: bool) -> Result<Self> {
107        self.crop_inner(bbox, relative, strict, CropMode::Crop)
108    }
109
110    pub fn within_bbox(&self, bbox: BBox, relative: bool, strict: bool) -> Result<Self> {
111        self.crop_inner(bbox, relative, strict, CropMode::Within)
112    }
113
114    pub fn outside_bbox(&self, bbox: BBox, relative: bool, strict: bool) -> Result<Self> {
115        self.crop_inner(bbox, relative, strict, CropMode::Outside)
116    }
117
118    pub fn filter<F>(&self, mut predicate: F) -> Self
119    where
120        F: FnMut(PageObjectRef<'_>) -> bool,
121    {
122        let mut page = self.clone();
123        page.chars = self
124            .chars
125            .iter()
126            .filter(|item| predicate(PageObjectRef::Char(item)))
127            .cloned()
128            .collect();
129        page.lines = self
130            .lines
131            .iter()
132            .filter(|item| predicate(PageObjectRef::Line(item)))
133            .cloned()
134            .collect();
135        page.rects = self
136            .rects
137            .iter()
138            .filter(|item| predicate(PageObjectRef::Rect(item)))
139            .cloned()
140            .collect();
141        page.curves = self
142            .curves
143            .iter()
144            .filter(|item| predicate(PageObjectRef::Curve(item)))
145            .cloned()
146            .collect();
147        page.images = self
148            .images
149            .iter()
150            .filter(|item| predicate(PageObjectRef::Image(item)))
151            .cloned()
152            .collect();
153        page.annots = self
154            .annots
155            .iter()
156            .filter(|item| predicate(PageObjectRef::Annot(item)))
157            .cloned()
158            .collect();
159        page.hyperlinks = self
160            .hyperlinks
161            .iter()
162            .filter(|item| predicate(PageObjectRef::Hyperlink(item)))
163            .cloned()
164            .collect();
165        page.is_original = false;
166        page
167    }
168
169    pub fn dedupe_chars(&self, options: &DedupeOptions) -> Self {
170        let mut page = self.clone();
171        page.chars = dedupe_chars(&self.chars, options);
172        page.is_original = false;
173        page
174    }
175
176    pub fn debug_tablefinder(&self, settings: TableSettings) -> Result<TableFinder> {
177        TableFinder::new(self, settings)
178    }
179
180    pub fn find_tables(&self, settings: TableSettings) -> Result<Vec<Table>> {
181        Ok(TableFinder::new(self, settings)?.tables)
182    }
183
184    pub fn find_table(&self, settings: TableSettings) -> Result<Option<Table>> {
185        let mut tables = self.find_tables(settings)?;
186        if tables.is_empty() {
187            return Ok(None);
188        }
189        tables.sort_by(|a, b| {
190            b.cells
191                .len()
192                .cmp(&a.cells.len())
193                .then_with(|| a.bbox.top.total_cmp(&b.bbox.top))
194                .then_with(|| a.bbox.x0.total_cmp(&b.bbox.x0))
195        });
196        Ok(tables.into_iter().next())
197    }
198
199    pub fn extract_tables(&self, settings: TableSettings) -> Result<Vec<Vec<Vec<Option<String>>>>> {
200        let tables = self.find_tables(settings.clone())?;
201        Ok(tables
202            .iter()
203            .map(|table| table.extract(self, &settings.text_options))
204            .collect())
205    }
206
207    pub fn extract_table(&self, settings: TableSettings) -> Result<Option<Vec<Vec<Option<String>>>>> {
208        let Some(table) = self.find_table(settings.clone())? else {
209            return Ok(None);
210        };
211        Ok(Some(table.extract(self, &settings.text_options)))
212    }
213
214    pub fn to_debug_svg(&self) -> String {
215        let mut out = String::new();
216        out.push_str(&format!(
217            r#"<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {:.3} {:.3}" width="{:.3}" height="{:.3}">"#,
218            self.width, self.height, self.width, self.height
219        ));
220        out.push_str(r#"<rect x="0" y="0" width="100%" height="100%" fill="white" stroke="black"/>"#);
221
222        for line in &self.lines {
223            if line.pts.len() >= 2 {
224                out.push_str(&format!(
225                    r#"<line x1="{:.3}" y1="{:.3}" x2="{:.3}" y2="{:.3}" stroke="black" stroke-width="1"/>"#,
226                    line.pts[0].x, line.pts[0].y, line.pts[1].x, line.pts[1].y
227                ));
228            }
229        }
230
231        for rect in &self.rects {
232            out.push_str(&format!(
233                r#"<rect x="{:.3}" y="{:.3}" width="{:.3}" height="{:.3}" fill="none" stroke="black" stroke-width="1"/>"#,
234                rect.x0, rect.top, rect.width, rect.height
235            ));
236        }
237
238        for curve in &self.curves {
239            if let Some(first) = curve.pts.first() {
240                let mut d = format!("M {:.3} {:.3}", first.x, first.y);
241                for point in curve.pts.iter().skip(1) {
242                    d.push_str(&format!(" L {:.3} {:.3}", point.x, point.y));
243                }
244                out.push_str(&format!(
245                    r#"<path d="{}" fill="none" stroke="black" stroke-width="1"/>"#,
246                    d
247                ));
248            }
249        }
250
251        for image in &self.images {
252            out.push_str(&format!(
253                r#"<rect x="{:.3}" y="{:.3}" width="{:.3}" height="{:.3}" fill="none" stroke="black" stroke-dasharray="4 2"/>"#,
254                image.x0, image.top, image.width, image.height
255            ));
256        }
257
258        for link in &self.hyperlinks {
259            out.push_str(&format!(
260                r#"<rect x="{:.3}" y="{:.3}" width="{:.3}" height="{:.3}" fill="none" stroke="black" stroke-dasharray="2 2"/>"#,
261                link.x0, link.top, link.width, link.height
262            ));
263        }
264
265        for ch in &self.chars {
266            let x = ch.x0;
267            let y = ch.bottom;
268            let escaped = html_escape(&ch.text);
269            out.push_str(&format!(
270                r#"<text x="{:.3}" y="{:.3}" font-size="10">{}</text>"#,
271                x, y, escaped
272            ));
273        }
274
275        out.push_str("</svg>");
276        out
277    }
278
279    fn crop_inner(&self, bbox: BBox, relative: bool, strict: bool, mode: CropMode) -> Result<Self> {
280        let proposed = if relative {
281            bbox.translate(self.bbox.x0, self.bbox.top)
282        } else {
283            bbox
284        };
285
286        if strict {
287            test_proposed_bbox(proposed, self.bbox)?;
288        }
289
290        let mut page = self.clone();
291        page.chars = match mode {
292            CropMode::Crop => crop_objects(&self.chars, proposed, self.height),
293            CropMode::Within => within_objects(&self.chars, proposed),
294            CropMode::Outside => outside_objects(&self.chars, proposed),
295        };
296        page.lines = match mode {
297            CropMode::Crop => crop_objects(&self.lines, proposed, self.height),
298            CropMode::Within => within_objects(&self.lines, proposed),
299            CropMode::Outside => outside_objects(&self.lines, proposed),
300        };
301        page.rects = match mode {
302            CropMode::Crop => crop_objects(&self.rects, proposed, self.height),
303            CropMode::Within => within_objects(&self.rects, proposed),
304            CropMode::Outside => outside_objects(&self.rects, proposed),
305        };
306        page.curves = match mode {
307            CropMode::Crop => crop_objects(&self.curves, proposed, self.height),
308            CropMode::Within => within_objects(&self.curves, proposed),
309            CropMode::Outside => outside_objects(&self.curves, proposed),
310        };
311        page.images = match mode {
312            CropMode::Crop => crop_objects(&self.images, proposed, self.height),
313            CropMode::Within => within_objects(&self.images, proposed),
314            CropMode::Outside => outside_objects(&self.images, proposed),
315        };
316        page.annots = match mode {
317            CropMode::Crop => crop_objects(&self.annots, proposed, self.height),
318            CropMode::Within => within_objects(&self.annots, proposed),
319            CropMode::Outside => outside_objects(&self.annots, proposed),
320        };
321        page.hyperlinks = match mode {
322            CropMode::Crop => crop_objects(&self.hyperlinks, proposed, self.height),
323            CropMode::Within => within_objects(&self.hyperlinks, proposed),
324            CropMode::Outside => outside_objects(&self.hyperlinks, proposed),
325        };
326
327        page.bbox = match mode {
328            CropMode::Outside => self.bbox,
329            CropMode::Crop | CropMode::Within => proposed,
330        };
331        page.is_original = false;
332
333        Ok(page)
334    }
335
336    pub(crate) fn default_text_options(&self) -> TextOptions {
337        let mut options = TextOptions::default();
338        options.layout_bbox = Some(self.bbox);
339        options.layout_width = Some(self.width);
340        options.layout_height = Some(self.height);
341        options
342    }
343}
344
/// Selects which geometry helper `Page::crop_inner` applies to a page's
/// objects, and whether the resulting page adopts the requested bbox.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum CropMode {
    /// Objects go through `crop_objects`; the result takes the requested bbox.
    Crop,
    /// Objects go through `within_objects`; the result takes the requested bbox.
    Within,
    /// Objects go through `outside_objects`; the result keeps the page's bbox.
    Outside,
}
351
/// Escapes `&`, `<` and `>` in `input` so it can be embedded as XML/HTML
/// text content. All other characters (including quotes) are copied as-is.
fn html_escape(input: &str) -> String {
    // Single pass: each source character is escaped at most once, so an `&`
    // introduced by an escape sequence is never escaped again.
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}