1mod clustering;
2mod container_api;
3mod display;
4mod error;
5mod geometry;
6mod layout;
7mod parse;
8mod table;
9mod text;
10mod types;
11
12pub use display::{HasBBox, HasCenter, HasLineSegments, PageImage, RenderOptions, RgbaColor};
13pub use error::{Error, Result};
14pub use parse::open_pdf;
15pub use table::{ExplicitLine, Table, TableFinder, TableSettings, TableStrategy};
16pub use text::{
17 chars_to_textmap, dedupe_chars, extract_text, extract_text_lines, extract_text_simple,
18 extract_words, DedupeOptions, SearchOptions, TextMap, TextOptions, WordExtractor, WordMap,
19};
20pub use types::{
21 Annotation, BBox, Char, Curve, Direction, Edge, Hyperlink, ImageObject, JsonMap, LayoutObject,
22 Line, ObjectCounts, Orientation, Page, PageLayout, PageObjectRef, PathCommand, PdfDocument,
23 Point, RectObject, SearchMatch, StructureElement, TextLine, Word,
24};
25
26pub type PDF = PdfDocument;
27
28use geometry::{crop_objects, outside_objects, test_proposed_bbox, within_objects};
29
30impl PdfDocument {
31 pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
32 open_pdf(path)
33 }
34
35 pub fn len(&self) -> usize {
36 self.pages.len()
37 }
38
39 pub fn is_empty(&self) -> bool {
40 self.pages.is_empty()
41 }
42
43 pub fn page(&self, page_number: usize) -> Result<&Page> {
44 if page_number == 0 || page_number > self.pages.len() {
45 return Err(Error::InvalidPage { page_number });
46 }
47 Ok(&self.pages[page_number - 1])
48 }
49
50 pub fn pages(&self) -> &[Page] {
51 &self.pages
52 }
53}
54
55impl Page {
56 pub fn object_counts(&self) -> ObjectCounts {
57 ObjectCounts {
58 chars: self.chars.len(),
59 lines: self.lines.len(),
60 rects: self.rects.len(),
61 curves: self.curves.len(),
62 images: self.images.len(),
63 annots: self.annots.len(),
64 hyperlinks: self.hyperlinks.len(),
65 }
66 }
67
68 pub fn extract_text(&self) -> String {
69 extract_text(&self.chars, &self.default_text_options())
70 }
71
72 pub fn extract_text_with_options(&self, options: &TextOptions) -> String {
73 extract_text(&self.chars, options)
74 }
75
76 pub fn extract_text_simple(&self) -> String {
77 extract_text_simple(&self.chars, 3.0, 3.0)
78 }
79
80 pub fn extract_text_simple_with_tolerance(&self, x_tolerance: f64, y_tolerance: f64) -> String {
81 extract_text_simple(&self.chars, x_tolerance, y_tolerance)
82 }
83
84 pub fn extract_words(&self) -> Vec<Word> {
85 extract_words(&self.chars, &self.default_text_options(), false)
86 }
87
88 pub fn extract_words_with_options(&self, options: &TextOptions, return_chars: bool) -> Vec<Word> {
89 extract_words(&self.chars, options, return_chars)
90 }
91
92 pub fn extract_text_lines(&self, strip: bool, return_chars: bool) -> Vec<TextLine> {
93 extract_text_lines(&self.chars, &self.default_text_options(), strip, return_chars)
94 }
95
96 pub fn search(&self, pattern: &str) -> Result<Vec<SearchMatch>> {
97 let textmap = chars_to_textmap(&self.chars, &self.default_text_options());
98 textmap.search(pattern, &SearchOptions::default())
99 }
100
101 pub fn search_with_options(&self, pattern: &str, options: &SearchOptions, text_options: &TextOptions) -> Result<Vec<SearchMatch>> {
102 let textmap = chars_to_textmap(&self.chars, text_options);
103 textmap.search(pattern, options)
104 }
105
106 pub fn crop(&self, bbox: BBox, relative: bool, strict: bool) -> Result<Self> {
107 self.crop_inner(bbox, relative, strict, CropMode::Crop)
108 }
109
110 pub fn within_bbox(&self, bbox: BBox, relative: bool, strict: bool) -> Result<Self> {
111 self.crop_inner(bbox, relative, strict, CropMode::Within)
112 }
113
114 pub fn outside_bbox(&self, bbox: BBox, relative: bool, strict: bool) -> Result<Self> {
115 self.crop_inner(bbox, relative, strict, CropMode::Outside)
116 }
117
118 pub fn filter<F>(&self, mut predicate: F) -> Self
119 where
120 F: FnMut(PageObjectRef<'_>) -> bool,
121 {
122 let mut page = self.clone();
123 page.chars = self
124 .chars
125 .iter()
126 .filter(|item| predicate(PageObjectRef::Char(item)))
127 .cloned()
128 .collect();
129 page.lines = self
130 .lines
131 .iter()
132 .filter(|item| predicate(PageObjectRef::Line(item)))
133 .cloned()
134 .collect();
135 page.rects = self
136 .rects
137 .iter()
138 .filter(|item| predicate(PageObjectRef::Rect(item)))
139 .cloned()
140 .collect();
141 page.curves = self
142 .curves
143 .iter()
144 .filter(|item| predicate(PageObjectRef::Curve(item)))
145 .cloned()
146 .collect();
147 page.images = self
148 .images
149 .iter()
150 .filter(|item| predicate(PageObjectRef::Image(item)))
151 .cloned()
152 .collect();
153 page.annots = self
154 .annots
155 .iter()
156 .filter(|item| predicate(PageObjectRef::Annot(item)))
157 .cloned()
158 .collect();
159 page.hyperlinks = self
160 .hyperlinks
161 .iter()
162 .filter(|item| predicate(PageObjectRef::Hyperlink(item)))
163 .cloned()
164 .collect();
165 page.is_original = false;
166 page
167 }
168
169 pub fn dedupe_chars(&self, options: &DedupeOptions) -> Self {
170 let mut page = self.clone();
171 page.chars = dedupe_chars(&self.chars, options);
172 page.is_original = false;
173 page
174 }
175
176 pub fn debug_tablefinder(&self, settings: TableSettings) -> Result<TableFinder> {
177 TableFinder::new(self, settings)
178 }
179
180 pub fn find_tables(&self, settings: TableSettings) -> Result<Vec<Table>> {
181 Ok(TableFinder::new(self, settings)?.tables)
182 }
183
184 pub fn find_table(&self, settings: TableSettings) -> Result<Option<Table>> {
185 let mut tables = self.find_tables(settings)?;
186 if tables.is_empty() {
187 return Ok(None);
188 }
189 tables.sort_by(|a, b| {
190 b.cells
191 .len()
192 .cmp(&a.cells.len())
193 .then_with(|| a.bbox.top.total_cmp(&b.bbox.top))
194 .then_with(|| a.bbox.x0.total_cmp(&b.bbox.x0))
195 });
196 Ok(tables.into_iter().next())
197 }
198
199 pub fn extract_tables(&self, settings: TableSettings) -> Result<Vec<Vec<Vec<Option<String>>>>> {
200 let tables = self.find_tables(settings.clone())?;
201 Ok(tables
202 .iter()
203 .map(|table| table.extract(self, &settings.text_options))
204 .collect())
205 }
206
207 pub fn extract_table(&self, settings: TableSettings) -> Result<Option<Vec<Vec<Option<String>>>>> {
208 let Some(table) = self.find_table(settings.clone())? else {
209 return Ok(None);
210 };
211 Ok(Some(table.extract(self, &settings.text_options)))
212 }
213
214 pub fn to_debug_svg(&self) -> String {
215 let mut out = String::new();
216 out.push_str(&format!(
217 r#"<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {:.3} {:.3}" width="{:.3}" height="{:.3}">"#,
218 self.width, self.height, self.width, self.height
219 ));
220 out.push_str(r#"<rect x="0" y="0" width="100%" height="100%" fill="white" stroke="black"/>"#);
221
222 for line in &self.lines {
223 if line.pts.len() >= 2 {
224 out.push_str(&format!(
225 r#"<line x1="{:.3}" y1="{:.3}" x2="{:.3}" y2="{:.3}" stroke="black" stroke-width="1"/>"#,
226 line.pts[0].x, line.pts[0].y, line.pts[1].x, line.pts[1].y
227 ));
228 }
229 }
230
231 for rect in &self.rects {
232 out.push_str(&format!(
233 r#"<rect x="{:.3}" y="{:.3}" width="{:.3}" height="{:.3}" fill="none" stroke="black" stroke-width="1"/>"#,
234 rect.x0, rect.top, rect.width, rect.height
235 ));
236 }
237
238 for curve in &self.curves {
239 if let Some(first) = curve.pts.first() {
240 let mut d = format!("M {:.3} {:.3}", first.x, first.y);
241 for point in curve.pts.iter().skip(1) {
242 d.push_str(&format!(" L {:.3} {:.3}", point.x, point.y));
243 }
244 out.push_str(&format!(
245 r#"<path d="{}" fill="none" stroke="black" stroke-width="1"/>"#,
246 d
247 ));
248 }
249 }
250
251 for image in &self.images {
252 out.push_str(&format!(
253 r#"<rect x="{:.3}" y="{:.3}" width="{:.3}" height="{:.3}" fill="none" stroke="black" stroke-dasharray="4 2"/>"#,
254 image.x0, image.top, image.width, image.height
255 ));
256 }
257
258 for link in &self.hyperlinks {
259 out.push_str(&format!(
260 r#"<rect x="{:.3}" y="{:.3}" width="{:.3}" height="{:.3}" fill="none" stroke="black" stroke-dasharray="2 2"/>"#,
261 link.x0, link.top, link.width, link.height
262 ));
263 }
264
265 for ch in &self.chars {
266 let x = ch.x0;
267 let y = ch.bottom;
268 let escaped = html_escape(&ch.text);
269 out.push_str(&format!(
270 r#"<text x="{:.3}" y="{:.3}" font-size="10">{}</text>"#,
271 x, y, escaped
272 ));
273 }
274
275 out.push_str("</svg>");
276 out
277 }
278
279 fn crop_inner(&self, bbox: BBox, relative: bool, strict: bool, mode: CropMode) -> Result<Self> {
280 let proposed = if relative {
281 bbox.translate(self.bbox.x0, self.bbox.top)
282 } else {
283 bbox
284 };
285
286 if strict {
287 test_proposed_bbox(proposed, self.bbox)?;
288 }
289
290 let mut page = self.clone();
291 page.chars = match mode {
292 CropMode::Crop => crop_objects(&self.chars, proposed, self.height),
293 CropMode::Within => within_objects(&self.chars, proposed),
294 CropMode::Outside => outside_objects(&self.chars, proposed),
295 };
296 page.lines = match mode {
297 CropMode::Crop => crop_objects(&self.lines, proposed, self.height),
298 CropMode::Within => within_objects(&self.lines, proposed),
299 CropMode::Outside => outside_objects(&self.lines, proposed),
300 };
301 page.rects = match mode {
302 CropMode::Crop => crop_objects(&self.rects, proposed, self.height),
303 CropMode::Within => within_objects(&self.rects, proposed),
304 CropMode::Outside => outside_objects(&self.rects, proposed),
305 };
306 page.curves = match mode {
307 CropMode::Crop => crop_objects(&self.curves, proposed, self.height),
308 CropMode::Within => within_objects(&self.curves, proposed),
309 CropMode::Outside => outside_objects(&self.curves, proposed),
310 };
311 page.images = match mode {
312 CropMode::Crop => crop_objects(&self.images, proposed, self.height),
313 CropMode::Within => within_objects(&self.images, proposed),
314 CropMode::Outside => outside_objects(&self.images, proposed),
315 };
316 page.annots = match mode {
317 CropMode::Crop => crop_objects(&self.annots, proposed, self.height),
318 CropMode::Within => within_objects(&self.annots, proposed),
319 CropMode::Outside => outside_objects(&self.annots, proposed),
320 };
321 page.hyperlinks = match mode {
322 CropMode::Crop => crop_objects(&self.hyperlinks, proposed, self.height),
323 CropMode::Within => within_objects(&self.hyperlinks, proposed),
324 CropMode::Outside => outside_objects(&self.hyperlinks, proposed),
325 };
326
327 page.bbox = match mode {
328 CropMode::Outside => self.bbox,
329 CropMode::Crop | CropMode::Within => proposed,
330 };
331 page.is_original = false;
332
333 Ok(page)
334 }
335
336 pub(crate) fn default_text_options(&self) -> TextOptions {
337 let mut options = TextOptions::default();
338 options.layout_bbox = Some(self.bbox);
339 options.layout_width = Some(self.width);
340 options.layout_height = Some(self.height);
341 options
342 }
343}
344
/// Selects which geometry helper `Page::crop_inner` applies to each object collection.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum CropMode {
    /// Objects are processed with `crop_objects`; the derived page takes the target bbox.
    Crop,
    /// Objects are processed with `within_objects`; the derived page takes the target bbox.
    Within,
    /// Objects are processed with `outside_objects`; the derived page keeps its own bbox.
    Outside,
}
351
/// Escapes `&`, `<` and `>` so `input` can be embedded as XML/HTML character data.
///
/// Quotes are left untouched, so the result is meant for element content rather than
/// attribute values. The input is processed in one pass, so an `&` that already starts an
/// entity is escaped again instead of being treated specially.
fn html_escape(input: &str) -> String {
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}