Skip to main content

pdfplumber_core/
lib.rs

1//! Backend-independent data types and algorithms for pdfplumber-rs.
2//!
3//! This crate provides the foundational types ([`BBox`], [`Char`], [`Word`],
4//! [`Line`], [`Rect`], [`Table`], etc.) and algorithms (text grouping, table
5//! detection) used by pdfplumber-rs. It has no required external dependencies —
6//! all functionality is pure Rust.
7//!
8//! # Key modules
9//!
10//! - [`geometry`] — Geometric primitives: [`Point`], [`BBox`], [`Ctm`], [`Orientation`]
11//! - [`text`] — Character data: [`Char`], [`TextDirection`], CJK detection
12//! - [`words`] — Word extraction: [`Word`], [`WordExtractor`], [`WordOptions`]
13//! - [`layout`] — Text layout: [`TextLine`], [`TextBlock`], [`TextOptions`]
14//! - [`shapes`] — Shapes from painted paths: [`Line`], [`Rect`], [`Curve`]
15//! - [`edges`] — Edge derivation for table detection: [`Edge`], [`EdgeSource`]
16//! - [`table`] — Table detection: [`Table`], [`TableFinder`], [`TableSettings`]
17//! - [`images`] — Image extraction: [`Image`], [`ImageMetadata`]
18//! - [`painting`] — Graphics state: [`Color`], [`GraphicsState`], [`PaintedPath`]
19//! - [`path`] — Path construction: [`Path`], [`PathBuilder`], [`PathSegment`]
20//! - [`encoding`] — Font encoding: [`FontEncoding`], [`EncodingResolver`]
21//! - [`error`] — Errors, warnings, and extraction options: [`PdfError`], [`ExtractWarning`], [`ExtractOptions`]
22//! - [`search`] — Text search: [`SearchMatch`], [`SearchOptions`], [`search_chars`]
23//! - [`unicode_norm`] — Unicode normalization: [`UnicodeNorm`], [`normalize_chars`]
24
25#![deny(missing_docs)]
26
27/// PDF annotation types.
28pub mod annotation;
29/// PDF bookmark / outline / table of contents types.
30pub mod bookmark;
31/// Removal of duplicate characters.
32pub mod dedupe;
33/// Edge derivation from geometric primitives for table detection.
34pub mod edges;
35/// Font encoding mapping (Standard, Windows, Mac, Custom).
36pub mod encoding;
37/// Error and warning types for PDF processing.
38pub mod error;
39/// PDF form field types for AcroForm extraction.
40pub mod form_field;
41/// Geometric primitives: Point, BBox, Ctm (current transformation matrix), Orientation.
42pub mod geometry;
43/// HTML rendering for PDF page content.
44pub mod html;
45/// PDF hyperlink types.
46pub mod hyperlink;
47/// Image extraction and metadata.
48pub mod images;
49/// Text layout: words → lines → blocks, reading order, text output.
50pub mod layout;
51/// Markdown rendering for PDF page content.
52pub mod markdown;
53/// Document-level metadata types.
54pub mod metadata;
55/// PageObject enum for custom object filtering.
56pub mod page_object;
57/// Graphics state, colors, dash patterns, and painted paths.
58pub mod painting;
59/// PDF path construction (MoveTo, LineTo, CurveTo, ClosePath).
60pub mod path;
61/// PDF repair types for best-effort fixing of common PDF issues.
62pub mod repair;
63/// Text search with position — find text patterns and return matches with bounding boxes.
64pub mod search;
65/// Shape extraction: Lines, Rects, Curves from painted paths.
66pub mod shapes;
67/// PDF digital signature information types.
68pub mod signature;
69/// PDF structure tree types for tagged PDF access.
70pub mod struct_tree;
71/// SVG rendering for visual debugging of PDF pages.
72pub mod svg;
73/// Table detection: lattice, stream, and explicit strategies.
74pub mod table;
75/// Character data types and CJK detection.
76pub mod text;
77/// Unicode normalization for extracted text.
78pub mod unicode_norm;
79/// PDF validation types for detecting specification violations.
80pub mod validation;
81/// Word extraction from characters based on spatial proximity.
82pub mod words;
83
84pub use annotation::{Annotation, AnnotationType};
85pub use bookmark::Bookmark;
86pub use dedupe::{DedupeOptions, dedupe_chars};
87pub use edges::{Edge, EdgeSource, derive_edges, edge_from_curve, edge_from_line, edges_from_rect};
88pub use encoding::{EncodingResolver, FontEncoding, StandardEncoding};
89pub use error::{ExtractOptions, ExtractResult, ExtractWarning, PdfError};
90pub use form_field::{FieldType, FormField};
91pub use geometry::{BBox, Ctm, Orientation, Point};
92pub use html::{HtmlOptions, HtmlRenderer};
93pub use hyperlink::Hyperlink;
94pub use images::{Image, ImageContent, ImageFormat, ImageMetadata, image_from_ctm};
95pub use layout::{
96    TextBlock, TextLine, TextOptions, blocks_to_text, cluster_lines_into_blocks,
97    cluster_words_into_lines, sort_blocks_reading_order, split_lines_at_columns, words_to_text,
98};
99pub use markdown::{MarkdownOptions, MarkdownRenderer};
100pub use metadata::DocumentMetadata;
101pub use page_object::PageObject;
102pub use painting::{Color, DashPattern, ExtGState, FillRule, GraphicsState, PaintedPath};
103pub use path::{Path, PathBuilder, PathSegment};
104pub use repair::{RepairOptions, RepairResult};
105pub use search::{SearchMatch, SearchOptions, search_chars};
106pub use shapes::{Curve, Line, LineOrientation, Rect, extract_shapes};
107pub use signature::SignatureInfo;
108pub use struct_tree::StructElement;
109pub use svg::{DrawStyle, SvgDebugOptions, SvgOptions, SvgRenderer};
110pub use table::{
111    Cell, ExplicitLines, Intersection, Strategy, Table, TableFinder, TableFinderDebug,
112    TableQuality, TableSettings, cells_to_tables, edges_to_intersections, explicit_lines_to_edges,
113    extract_text_for_cells, intersections_to_cells, join_edge_group, snap_edges,
114    words_to_edges_stream,
115};
116pub use text::{Char, TextDirection, is_cjk, is_cjk_text};
117pub use unicode_norm::{UnicodeNorm, normalize_chars};
118pub use validation::{Severity, ValidationIssue};
119pub use words::{Word, WordExtractor, WordOptions};