Expand description
PDF content extraction: text with positions, images, and full-text search.
Works directly on lopdf::Document objects, which can be loaded from a
file with Document::load. Three extraction targets, plus full-text search, are available:
- Text — plain strings or `TextBlock` records with page, font, bbox
- Positioned characters — per-character `PositionedChar` with bounding boxes
- Images — `ExtractedImage` with raw pixel data and format metadata
- Search — substring search with `SearchResult` entries (page + bboxes)
§Quick Start
use lopdf::Document;
use pdfluent_extract::{extract_text, extract_page_text, search_text, SearchOptions};

let doc = Document::load("document.pdf").unwrap();

// All text blocks from every page.
for block in extract_text(&doc) {
    println!("[page {}] {} (font: {}, size: {:.1}pt)",
        block.page, block.text, block.font_name, block.font_size);
}

// Plain text from one page (1-based page number).
let text = extract_page_text(&doc, 1).unwrap();

// Full-text search with bounding boxes.
let opts = SearchOptions { case_insensitive: true, ..Default::default() };
for result in search_text(&doc, "invoice", &opts) {
    println!("Page {}: {:?} ({} bboxes)",
        result.page, result.text, result.bounding_boxes.len());
}
§Key Types
| Type | Description |
|---|---|
| `TextBlock` | Text run with page number, bounding box, font name/size |
| `PositionedChar` | Single character with per-character bounding box |
| `ExtractedImage` | Raw image data extracted from a page |
| `SearchResult` | Match with page, text, character bounding boxes, offset |
| `SearchOptions` | Case sensitivity, page filter, max results, bbox toggle |
Re-exports§
pub use error::ExtractError;
pub use error::Result;
pub use images::extract_all_images;
pub use images::extract_images_from_page_id;
pub use images::extract_page_images;
pub use images::ExtractedImage;
pub use images::ImageFilter;
pub use search::count_occurrences;
pub use search::count_text_only;
pub use search::pages_containing;
pub use search::search_text;
pub use search::SearchOptions;
pub use search::SearchResult;
pub use text::extract_blocks_from_page_id;
pub use text::extract_page_blocks;
pub use text::extract_page_text;
pub use text::extract_positioned_chars;
pub use text::extract_text;
pub use text::PositionedChar;
pub use text::TextBlock;
pub use text::WidthSource;