pdfluent_extract/
lib.rs

1//! PDF content extraction: text with positions, images, and full-text search.
2//!
3//! Works directly on [`lopdf::Document`] objects, which can be loaded from a
4//! file with `Document::load`. Four capabilities are available:
5//!
6//! - **Text** — plain strings or [`TextBlock`] records with page, font, bbox
7//! - **Positioned characters** — per-character [`PositionedChar`] with bounding boxes
8//! - **Images** — [`ExtractedImage`] with raw pixel data and format metadata
9//! - **Search** — substring search with [`SearchResult`] entries (page + bboxes)
10//!
11//! # Quick Start
12//!
13//! ```no_run
14//! use lopdf::Document;
15//! use pdf_extract::{extract_text, extract_page_text, search_text, SearchOptions};
16//!
17//! let doc = Document::load("document.pdf").unwrap();
18//!
19//! // All text blocks from every page.
20//! for block in extract_text(&doc) {
21//!     println!("[page {}] {} (font: {}, size: {:.1}pt)",
22//!         block.page, block.text, block.font_name, block.font_size);
23//! }
24//!
25//! // Plain text from one page (1-based page number).
26//! let text = extract_page_text(&doc, 1).unwrap();
27//!
28//! // Full-text search with bounding boxes.
29//! let opts = SearchOptions { case_insensitive: true, ..Default::default() };
30//! for result in search_text(&doc, "invoice", &opts) {
31//!     println!("Page {}: {:?} ({} bboxes)",
32//!         result.page, result.text, result.bounding_boxes.len());
33//! }
34//! ```
35//!
36//! # Key Types
37//!
38//! | Type | Description |
39//! |---|---|
40//! | [`TextBlock`] | Text run with page number, bounding box, font name/size |
41//! | [`PositionedChar`] | Single character with per-character bounding box |
42//! | [`ExtractedImage`] | Raw image data extracted from a page |
43//! | [`SearchResult`] | Match with page, text, character bounding boxes, offset |
44//! | [`SearchOptions`] | Case sensitivity, page filter, max results, bbox toggle |
45
46pub mod error;
47pub mod images;
48pub mod search;
49pub mod text;
50
51pub use error::{ExtractError, Result};
52pub use images::{
53    extract_all_images, extract_images_from_page_id, extract_page_images, ExtractedImage,
54    ImageFilter,
55};
56pub use search::{
57    count_occurrences, count_text_only, pages_containing, search_text, SearchOptions, SearchResult,
58};
59pub use text::{
60    extract_blocks_from_page_id, extract_page_blocks, extract_page_text, extract_positioned_chars,
61    extract_text, PositionedChar, TextBlock,
62};
pdfluent_extract/lib.rs

pdfluent_extract/
lib.rs