pdf_oxide/lib.rs
1// Allow some clippy lints that are too pedantic for this project
2#![allow(clippy::type_complexity)]
3#![allow(clippy::too_many_arguments)]
4#![allow(clippy::needless_range_loop)]
5#![allow(clippy::enum_variant_names)]
6#![allow(clippy::wrong_self_convention)]
7#![allow(clippy::explicit_counter_loop)]
8#![allow(clippy::doc_overindented_list_items)]
9#![allow(clippy::should_implement_trait)]
10#![allow(clippy::redundant_guards)]
11#![allow(clippy::regex_creation_in_loops)]
12#![allow(clippy::manual_find)]
13#![allow(clippy::match_like_matches_macro)]
// Relax the dead-code and unused-variable lints when compiling tests
15#![cfg_attr(test, allow(dead_code))]
16#![cfg_attr(test, allow(unused_variables))]
17
18//! # PDF Oxide
19//!
20//! The fastest PDF library for Python and Rust. 0.8ms mean text extraction — 5× faster than
21//! PyMuPDF, 15× faster than pypdf, 29× faster than pdfplumber. 100% pass rate on 3,830
22//! real-world PDFs. MIT licensed. A drop-in PyMuPDF alternative with no AGPL restrictions.
23//!
24//! ## Performance (v0.3.9)
25//!
26//! Benchmarked against 18 libraries on 3,830 PDFs from 3 public test suites
27//! (veraPDF, Mozilla pdf.js, DARPA SafeDocs). Single-thread, 60s timeout, no warm-up.
28//!
29//! ### Python PDF Libraries
30//!
31//! | Library | Mean | Pass Rate | License |
32//! |---------|------|-----------|---------|
33//! | **pdf_oxide** | **0.8ms** | **100%** | **MIT** |
34//! | unstructured | 478.4ms | 99.6% | Apache-2.0 |
35//! | PyMuPDF | 4.6ms | 99.3% | AGPL-3.0 |
36//! | pypdfium2 | 4.1ms | 99.2% | Apache-2.0 |
37//! | kreuzberg | 7.2ms | 99.1% | MIT |
38//! | pymupdf4llm | 55.5ms | 99.1% | AGPL-3.0 |
39//! | pdftext | 7.3ms | 99.0% | GPL-3.0 |
40//! | extractous | 112.0ms | 98.9% | Apache-2.0 |
41//! | pdfminer | 16.8ms | 98.8% | MIT |
42//! | pdfplumber | 23.2ms | 98.8% | MIT |
43//! | markitdown | 108.8ms | 98.6% | MIT |
44//! | pypdf | 12.1ms | 98.4% | BSD-3 |
45//!
46//! ### Rust PDF Libraries
47//!
48//! | Library | Mean | Pass Rate | Text Extraction |
49//! |---------|------|-----------|-----------------|
50//! | **pdf_oxide** | **0.8ms** | **100%** | **Built-in** |
51//! | oxidize_pdf | 13.5ms | 99.1% | Basic |
52//! | unpdf | 2.8ms | 95.1% | Basic |
53//! | pdf_extract | 4.08ms | 91.5% | Basic |
54//! | lopdf | 0.3ms | 80.2% | No built-in extraction |
55//!
56//! 99.5% text quality parity vs PyMuPDF, pypdfium2, and kreuzberg across the full corpus.
57//! Full benchmark details: <https://pdf.oxide.fyi/docs/performance>
58//!
59//! ## Core Features
60//!
61//! ### Reading & Extraction
62//! - **Text Extraction**: Character, span, and page-level with font metadata and bounding boxes
63//! - **Reading Order**: 4 pluggable strategies (XY-Cut, Structure Tree, Geometric, Simple)
64//! - **Complex Scripts**: RTL (Arabic/Hebrew), CJK (Japanese/Korean/Chinese), Devanagari, Thai
65//! - **Format Conversion**: PDF → Markdown, HTML, PlainText
66//! - **Image Extraction**: Content streams, Form XObjects, inline images
67//! - **Forms & Annotations**: Read/write form fields, all annotation types, bookmarks
68//! - **Text Search**: Regex and case-insensitive search with page-level results
69//!
70//! ### Writing & Creation
71//! - **PDF Generation**: Fluent DocumentBuilder API for programmatic PDF creation
72//! - **Format Conversion**: Markdown → PDF, HTML → PDF, Plain Text → PDF, Image → PDF
73//! - **Advanced Graphics**: Path operations, image embedding, table generation
74//! - **Font Embedding**: Automatic font subsetting for compact output
75//! - **Interactive Forms**: Fillable forms with text fields, checkboxes, radio buttons, dropdowns
76//! - **QR Codes & Barcodes**: Code128, EAN-13, UPC-A (feature flag: `barcodes`)
77//!
78//! ### Editing
79//! - **DOM-like API**: Query and modify PDF content with strongly-typed wrappers
80//! - **Element Modification**: Find and replace text, modify images, paths, tables
81//! - **Page Operations**: Add, remove, reorder, merge, rotate, crop pages
82//! - **Encryption**: AES-256, password protection
83//! - **Incremental Saves**: Efficient appending without full rewrite
84//!
85//! ### Compliance
86//! - **PDF/A**: Validation and conversion
87//! - **PDF/UA**: Accessibility checks
88//! - **PDF/X**: Print production validation
89//!
90//! ## Quick Start - Rust
91//!
92//! ```ignore
93//! use pdf_oxide::PdfDocument;
94//! use pdf_oxide::pipeline::{TextPipeline, TextPipelineConfig};
95//! use pdf_oxide::pipeline::converters::MarkdownOutputConverter;
96//!
97//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
98//! // Open a PDF
99//! let mut doc = PdfDocument::open("paper.pdf")?;
100//!
101//! // Extract text with reading order (multi-column support)
102//! let spans = doc.extract_spans(0)?;
103//! let config = TextPipelineConfig::default();
104//! let pipeline = TextPipeline::with_config(config.clone());
105//! let ordered_spans = pipeline.process(spans, Default::default())?;
106//!
107//! // Convert to Markdown
108//! let converter = MarkdownOutputConverter::new();
109//! let markdown = converter.convert(&ordered_spans, &config)?;
110//! println!("{}", markdown);
111//! # Ok(())
112//! # }
113//! ```
114//!
115//! ## Quick Start - Python
116//!
117//! ```text
118//! from pdf_oxide import PdfDocument
119//!
120//! # Open and extract with automatic reading order
121//! doc = PdfDocument("paper.pdf")
122//! markdown = doc.to_markdown(0)
123//! print(markdown)
124//! ```
125//!
126//! ## License
127//!
128//! Licensed under either of:
129//!
130//! * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>)
131//! * MIT license ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)
132//!
133//! at your option.
134
135#![warn(missing_docs)]
136#![cfg_attr(docsrs, feature(doc_cfg))]
137
138// Error handling
139pub mod error;
140
141// Core PDF parsing
142pub mod document;
143pub mod lexer;
144pub mod object;
145pub mod objstm;
146pub mod parser;
147/// Parser configuration options
148pub mod parser_config;
149pub mod xref;
150pub mod xref_reconstruction;
151
152// Stream decoders
153pub mod decoders;
154
155// Encryption support
156pub mod encryption;
157
158// Layout analysis
159pub mod geometry;
160pub mod layout;
161
162// Text extraction
163pub mod content;
164pub mod extractors;
165pub mod fonts;
166pub mod text;
167
168// Document structure
169/// Core annotation types and enums per PDF spec
170pub mod annotation_types;
171pub mod annotations;
172/// Content elements for PDF generation
173pub mod elements;
174pub mod outline;
175/// PDF logical structure (Tagged PDFs)
176pub mod structure;
177
178// Format converters
179pub mod converters;
180
181// Pipeline architecture for text extraction
182pub mod pipeline;
183
184// PDF writing/creation (v0.3.0)
185pub mod writer;
186
187// FDF/XFDF form data export (v0.3.3)
188pub mod fdf;
189
190// XFA forms support (v0.3.2)
191pub mod xfa;
192
193// PDF editing (v0.3.0)
194pub mod editor;
195
196// Text search (v0.3.0)
197pub mod search;
198
199// Page rendering to images (optional, v0.3.0)
200#[cfg(feature = "rendering")]
201#[cfg_attr(docsrs, doc(cfg(feature = "rendering")))]
202pub mod rendering;
203
204// Debug visualization for PDF analysis (optional, v0.3.0)
205#[cfg(feature = "rendering")]
206#[cfg_attr(docsrs, doc(cfg(feature = "rendering")))]
207pub mod debug;
208
209// Digital signatures (optional, v0.3.0)
210#[cfg(feature = "signatures")]
211#[cfg_attr(docsrs, doc(cfg(feature = "signatures")))]
212pub mod signatures;
213
214// PDF/A compliance validation (v0.3.0)
215pub mod compliance;
216
217// High-level API (v0.3.0)
218pub mod api;
219
220// Re-export specific types from pipeline for use by converters
221pub use pipeline::XYCutStrategy;
222
223// Configuration
224pub mod config;
225
226// Hybrid classical + ML orchestration
227pub mod hybrid;
228
229// OCR - PaddleOCR via ONNX Runtime (optional)
230#[cfg(feature = "ocr")]
231#[cfg_attr(docsrs, doc(cfg(feature = "ocr")))]
232pub mod ocr;
233
234// Python bindings (optional)
235#[cfg(feature = "python")]
236mod python;
237
238// WASM bindings (optional)
239#[cfg(target_arch = "wasm32")]
240#[cfg(feature = "wasm")]
241pub mod wasm;
242
243// Re-exports
244pub use annotation_types::{
245 AnnotationBorderStyle, AnnotationColor, AnnotationFlags, AnnotationSubtype, BorderEffectStyle,
246 BorderStyleType, CaretSymbol, FileAttachmentIcon, FreeTextIntent, HighlightMode,
247 LineEndingStyle, QuadPoint, ReplyType, StampType, TextAlignment, TextAnnotationIcon,
248 TextMarkupType, WidgetFieldType,
249};
250pub use annotations::{Annotation, LinkAction, LinkDestination};
251pub use config::{DocumentType, ExtractionProfile};
252pub use document::{ExtractedImageRef, ImageFormat, PdfDocument};
253pub use error::{Error, Result};
254pub use outline::{Destination, OutlineItem};
255
256// Internal utilities
pub(crate) mod utils {
    //! Internal utility functions for the library.

    use std::cmp::Ordering;

    /// Total-order comparison for `f32` values that never panics on NaN.
    ///
    /// Non-NaN operands are ordered by their usual numeric comparison
    /// (so `0.0` and `-0.0` compare equal). NaN is treated as equal to any
    /// other NaN and greater than every non-NaN value, which makes this
    /// function usable as a comparator in sorting routines.
    ///
    /// # Arguments
    ///
    /// * `a` - left-hand operand
    /// * `b` - right-hand operand
    ///
    /// # Returns
    ///
    /// The [`Ordering`] of `a` relative to `b` under the policy above.
    ///
    /// # Examples
    ///
    /// The example is marked `ignore` because this module is `pub(crate)`
    /// and therefore not reachable from a doctest.
    ///
    /// ```ignore
    /// # use std::cmp::Ordering;
    /// # use pdf_oxide::utils::safe_float_cmp;
    /// assert_eq!(safe_float_cmp(1.0, 2.0), Ordering::Less);
    /// assert_eq!(safe_float_cmp(2.0, 1.0), Ordering::Greater);
    /// assert_eq!(safe_float_cmp(1.0, 1.0), Ordering::Equal);
    ///
    /// // NaN handling
    /// assert_eq!(safe_float_cmp(f32::NAN, f32::NAN), Ordering::Equal);
    /// assert_eq!(safe_float_cmp(f32::NAN, 1.0), Ordering::Greater);
    /// assert_eq!(safe_float_cmp(1.0, f32::NAN), Ordering::Less);
    /// ```
    #[inline]
    pub fn safe_float_cmp(a: f32, b: f32) -> Ordering {
        // `partial_cmp` yields `None` only when at least one operand is NaN.
        // In that case, comparing the `is_nan` flags (`false < true`) gives
        // exactly the NaN policy: NaN == NaN, NaN > number, number < NaN.
        a.partial_cmp(&b)
            .unwrap_or_else(|| a.is_nan().cmp(&b.is_nan()))
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn test_safe_float_cmp_normal() {
            assert_eq!(safe_float_cmp(1.0, 2.0), Ordering::Less);
            assert_eq!(safe_float_cmp(2.0, 1.0), Ordering::Greater);
            assert_eq!(safe_float_cmp(1.5, 1.5), Ordering::Equal);
        }

        #[test]
        fn test_safe_float_cmp_nan() {
            assert_eq!(safe_float_cmp(f32::NAN, f32::NAN), Ordering::Equal);
            assert_eq!(safe_float_cmp(f32::NAN, 0.0), Ordering::Greater);
            assert_eq!(safe_float_cmp(0.0, f32::NAN), Ordering::Less);
        }

        #[test]
        fn test_safe_float_cmp_infinity() {
            assert_eq!(safe_float_cmp(f32::INFINITY, f32::INFINITY), Ordering::Equal);
            assert_eq!(safe_float_cmp(f32::INFINITY, 1.0), Ordering::Greater);
            assert_eq!(safe_float_cmp(f32::NEG_INFINITY, f32::INFINITY), Ordering::Less);
        }

        #[test]
        fn test_safe_float_cmp_signed_zero_and_nan_vs_infinity() {
            // Signed zeros are numerically equal.
            assert_eq!(safe_float_cmp(0.0, -0.0), Ordering::Equal);
            // NaN sorts above even positive infinity.
            assert_eq!(safe_float_cmp(f32::NAN, f32::INFINITY), Ordering::Greater);
            assert_eq!(safe_float_cmp(f32::INFINITY, f32::NAN), Ordering::Less);
        }
    }
}
320
321// Version info
322/// Library version
323pub const VERSION: &str = env!("CARGO_PKG_VERSION");
324
325/// Library name
326pub const NAME: &str = env!("CARGO_PKG_NAME");
327
#[cfg(test)]
mod tests {
    use super::*;

    /// `VERSION` must look like a `MAJOR.MINOR.PATCH` version string.
    ///
    /// The check validates the shape instead of pinning the major version,
    /// so it keeps passing after a 1.0 (or later) release.
    #[test]
    fn test_version() {
        // Drop any pre-release (`-alpha.1`) or build-metadata (`+abc`) suffix.
        let core = VERSION
            .split(|c: char| c == '-' || c == '+')
            .next()
            .unwrap_or("");
        let parts: Vec<&str> = core.split('.').collect();

        assert_eq!(parts.len(), 3, "unexpected version format: {}", VERSION);
        assert!(
            parts
                .iter()
                .all(|p| !p.is_empty() && p.bytes().all(|b| b.is_ascii_digit())),
            "non-numeric version component in: {}",
            VERSION
        );
    }

    /// `NAME` must match the crate's package name.
    #[test]
    fn test_name() {
        assert_eq!(NAME, "pdf_oxide");
    }
}
342}