Skip to main content

pdfluent_extract/
lib.rs

1#![warn(missing_docs)]
2//! PDF content extraction: text with positions, images, and full-text search.
3//!
4//! Works directly on [`lopdf::Document`] objects, which can be loaded from a
5//! file with `Document::load`. Three extraction targets, plus full-text search, are available:
6//!
7//! - **Text** — plain strings or [`TextBlock`] records with page, font, bbox
8//! - **Positioned characters** — per-character [`PositionedChar`] with bounding boxes
9//! - **Images** — [`ExtractedImage`] with raw pixel data and format metadata
10//! - **Search** — substring search with [`SearchResult`] entries (page + bboxes)
11//!
12//! # Quick Start
13//!
14//! ```no_run
15//! use lopdf::Document;
16//! use pdfluent_extract::{extract_text, extract_page_text, search_text, SearchOptions};
17//!
18//! let doc = Document::load("document.pdf").unwrap();
19//!
20//! // All text blocks from every page.
21//! for block in extract_text(&doc) {
22//!     println!("[page {}] {} (font: {}, size: {:.1}pt)",
23//!         block.page, block.text, block.font_name, block.font_size);
24//! }
25//!
26//! // Plain text from one page (1-based page number).
27//! let text = extract_page_text(&doc, 1).unwrap();
28//!
29//! // Full-text search with bounding boxes.
30//! let opts = SearchOptions { case_insensitive: true, ..Default::default() };
31//! for result in search_text(&doc, "invoice", &opts) {
32//!     println!("Page {}: {:?} ({} bboxes)",
33//!         result.page, result.text, result.bounding_boxes.len());
34//! }
35//! ```
36//!
37//! # Key Types
38//!
39//! | Type | Description |
40//! |---|---|
41//! | [`TextBlock`] | Text run with page number, bounding box, font name/size |
42//! | [`PositionedChar`] | Single character with per-character bounding box |
43//! | [`ExtractedImage`] | Raw image data extracted from a page |
44//! | [`SearchResult`] | Match with page, text, character bounding boxes, offset |
45//! | [`SearchOptions`] | Case sensitivity, page filter, max results, bbox toggle |
46
47pub mod error;
48pub mod images;
49pub mod search;
50pub mod text;
51
52pub use error::{ExtractError, Result};
53pub use images::{
54    extract_all_images, extract_images_from_page_id, extract_page_images, ExtractedImage,
55    ImageFilter,
56};
57pub use search::{
58    count_occurrences, count_text_only, pages_containing, search_text, SearchOptions, SearchResult,
59};
60pub use text::{
61    extract_blocks_from_page_id, extract_page_blocks, extract_page_text, extract_positioned_chars,
62    extract_text, PositionedChar, TextBlock, WidthSource,
63};