// pdf_oxide/lib.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2// Allow some clippy lints that are too pedantic for this project
3#![allow(clippy::type_complexity)]
4#![allow(clippy::too_many_arguments)]
5#![allow(clippy::needless_range_loop)]
6#![allow(clippy::enum_variant_names)]
7#![allow(clippy::wrong_self_convention)]
8#![allow(clippy::explicit_counter_loop)]
9#![allow(clippy::doc_overindented_list_items)]
10#![allow(clippy::should_implement_trait)]
11#![allow(clippy::redundant_guards)]
12#![allow(clippy::regex_creation_in_loops)]
13#![allow(clippy::manual_find)]
14#![allow(clippy::match_like_matches_macro)]
15#![allow(clippy::collapsible_match)]
16// Allow unused for tests
17#![cfg_attr(test, allow(dead_code))]
18#![cfg_attr(test, allow(unused_variables))]
19
20//! # PDF Oxide
21//!
22//! The fastest PDF library for Python and Rust. 0.8ms mean text extraction — 5× faster than
23//! PyMuPDF, 15× faster than pypdf, 29× faster than pdfplumber. 100% pass rate on 3,830
24//! real-world PDFs. MIT licensed. A drop-in PyMuPDF alternative with no AGPL restrictions.
25//!
26//! ## Performance (v0.3.10)
27//!
28//! Benchmarked against 14 text extraction libraries on 3,830 PDFs from 3 public test suites
29//! (veraPDF, Mozilla pdf.js, DARPA SafeDocs). Single-thread, 60s timeout, no warm-up.
30//!
31//! ### Python PDF Libraries
32//!
33//! | Library | Mean | Pass Rate | License |
34//! |---------|------|-----------|---------|
35//! | **pdf_oxide** | **0.8ms** | **100%** | **MIT** |
36//! | PyMuPDF | 4.6ms | 99.3% | AGPL-3.0 |
37//! | pypdfium2 | 4.1ms | 99.2% | Apache-2.0 |
38//! | pymupdf4llm | 55.5ms | 99.1% | AGPL-3.0 |
39//! | pdftext | 7.3ms | 99.0% | GPL-3.0 |
40//! | pdfminer | 16.8ms | 98.8% | MIT |
41//! | pdfplumber | 23.2ms | 98.8% | MIT |
42//! | markitdown | 108.8ms | 98.6% | MIT |
43//! | pypdf | 12.1ms | 98.4% | BSD-3 |
44//!
45//! ### Rust PDF Libraries
46//!
47//! | Library | Mean | Pass Rate | Text Extraction |
48//! |---------|------|-----------|-----------------|
49//! | **pdf_oxide** | **0.8ms** | **100%** | **Built-in** |
50//! | oxidize_pdf | 13.5ms | 99.1% | Basic |
51//! | unpdf | 2.8ms | 95.1% | Basic |
52//! | pdf_extract | 4.08ms | 91.5% | Basic |
53//! | lopdf | 0.3ms | 80.2% | No built-in extraction |
54//!
55//! 99.5% text quality parity vs PyMuPDF and pypdfium2 across the full corpus.
56//! Full benchmark details: <https://pdf.oxide.fyi/docs/performance>
57//!
58//! ## Core Features
59//!
60//! ### Reading & Extraction
61//! - **Text Extraction**: Character, span, and page-level with font metadata and bounding boxes
62//! - **Reading Order**: 4 pluggable strategies (XY-Cut, Structure Tree, Geometric, Simple)
63//! - **Complex Scripts**: RTL (Arabic/Hebrew), CJK (Japanese/Korean/Chinese), Devanagari, Thai
64//! - **Format Conversion**: PDF → Markdown, HTML, PlainText
65//! - **Image Extraction**: Content streams, Form XObjects, inline images
66//! - **Forms & Annotations**: Read/write form fields, all annotation types, bookmarks
67//! - **Text Search**: Regex and case-insensitive search with page-level results
68//!
69//! ### Writing & Creation
70//! - **PDF Generation**: Fluent DocumentBuilder API for programmatic PDF creation
71//! - **Format Conversion**: Markdown → PDF, HTML → PDF, Plain Text → PDF, Image → PDF
72//! - **Advanced Graphics**: Path operations, image embedding, table generation
73//! - **Font Embedding**: Automatic font subsetting for compact output
74//! - **Interactive Forms**: Fillable forms with text fields, checkboxes, radio buttons, dropdowns
75//! - **QR Codes & Barcodes**: Code128, EAN-13, UPC-A (feature flag: `barcodes`)
76//!
77//! ### Editing
78//! - **DOM-like API**: Query and modify PDF content with strongly-typed wrappers
79//! - **Element Modification**: Find and replace text, modify images, paths, tables
80//! - **Page Operations**: Add, remove, reorder, merge, rotate, crop pages
81//! - **Encryption**: AES-256, password protection
82//! - **Incremental Saves**: Efficient appending without full rewrite
83//!
84//! ### Compliance
85//! - **PDF/A**: Validation and conversion
86//! - **PDF/UA**: Accessibility checks
87//! - **PDF/X**: Print production validation
88//!
89//! ## Quick Start - Rust
90//!
91//! ```ignore
92//! use pdf_oxide::PdfDocument;
93//! use pdf_oxide::pipeline::{TextPipeline, TextPipelineConfig};
94//! use pdf_oxide::pipeline::converters::OutputConverter;
95//! use pdf_oxide::pipeline::converters::MarkdownOutputConverter;
96//!
97//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
98//! // Open a PDF
99//! let mut doc = PdfDocument::open("paper.pdf")?;
100//!
101//! // Extract text with reading order (multi-column support)
102//! let spans = doc.extract_spans(0)?;
103//! let config = TextPipelineConfig::default();
104//! let pipeline = TextPipeline::with_config(config.clone());
105//! let ordered_spans = pipeline.process(spans, Default::default())?;
106//!
107//! // Convert to Markdown
108//! let converter = MarkdownOutputConverter::new();
109//! let markdown = converter.convert(&ordered_spans, &config)?;
110//! println!("{}", markdown);
111//! # Ok(())
112//! # }
113//! ```
114//!
115//! ## Quick Start - Python
116//!
117//! ```text
118//! from pdf_oxide import PdfDocument
119//!
120//! # Open and extract with automatic reading order
121//! doc = PdfDocument("paper.pdf")
122//! markdown = doc.to_markdown(0)
123//! print(markdown)
124//! ```
125//!
126//! ## License
127//!
128//! Licensed under either of:
129//!
130//! * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>)
131//! * MIT license ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)
132//!
133//! at your option.
134
135#![warn(missing_docs)]
136#![cfg_attr(docsrs, feature(doc_cfg))]
137
// Glibc 2.34 compatibility (#416): LLVM may emit calls to __memcmpeq@GLIBC_2.35,
// which does not exist in glibc 2.34 (Amazon Linux 2023, some Ubuntu 22.04 builds).
// The weak `__memcmpeq` stub that works around this is defined further down,
// after the feature-exclusivity check.
//
// `fips` and `legacy-crypto` are mutually exclusive: FIPS 140-3 forbids MD5
// and RC4, which `legacy-crypto` pulls in. Build FIPS without legacy crypto:
//   cargo build --no-default-features --features fips,icc
143#[cfg(all(feature = "fips", feature = "legacy-crypto"))]
144compile_error!(
145    "Features `fips` and `legacy-crypto` are mutually exclusive. \
146     FIPS 140-3 forbids MD5 (pulled in by `legacy-crypto`). \
147     Build with: --no-default-features --features fips,icc"
148);
149
150// A weak stub redirecting to plain memcmp satisfies the reference on older glibc;
151// glibc 2.35's own definition wins when available. global_asm! works with both
152// GNU ld and lld, unlike --defsym which lld rejects for PLT-resolved symbols.
153#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
154core::arch::global_asm!(
155    ".weak __memcmpeq",
156    ".type __memcmpeq, @function",
157    "__memcmpeq:",
158    "jmp memcmp@PLT",
159);
160
161// Error handling
162pub mod error;
163
164// General-purpose caching utilities
165pub(crate) mod cache;
166
167// Core PDF parsing
168pub mod document;
169pub mod lexer;
170pub mod object;
171pub mod objstm;
172pub mod parser;
173/// Parser configuration options
174pub mod parser_config;
175pub mod xref;
176pub mod xref_reconstruction;
177
178// Stream decoders
179pub mod decoders;
180
181// PDF function evaluators (Type 4 PostScript calculator)
182pub mod functions;
183
184// Colour management (ICC profile handling)
185pub mod color;
186
187// Pluggable cryptographic backend (FIPS / sovereign-jurisdiction
188// providers). Issue #236.
189pub mod crypto;
190
191// Encryption support
192pub mod encryption;
193
194// Layout analysis
195pub mod geometry;
196pub mod layout;
197
198// Text extraction
199pub mod content;
200pub mod extractors;
201pub mod fonts;
202pub mod optional_content;
203pub mod text;
204
205// Document structure
206/// Core annotation types and enums per PDF spec
207pub mod annotation_types;
208pub mod annotations;
209/// Content elements for PDF generation
210pub mod elements;
211/// Cross-platform-safe filename slug helpers (shared, pure).
212pub mod filename;
213pub mod outline;
214/// True/destructive redaction + document sanitization (#231).
215pub mod redaction;
216/// Split a PDF into multiple PDFs at outline (bookmark) boundaries (#482).
217pub mod split_bookmarks;
218/// PDF logical structure (Tagged PDFs)
219pub mod structure;
220
221/// Structured per-page extraction (`extract_structured`, #536)
222pub mod structured;
223
224// Format converters
225pub mod converters;
226
227// Pipeline architecture for text extraction
228pub mod pipeline;
229
230// PDF writing/creation (v0.3.0)
231pub mod writer;
232
233// HTML + CSS → PDF pipeline (v0.3.35, issue #248). Hand-rolled tokenizer,
234// parser, selector matcher, cascade, layout glue, paginator, and paint
235// emitter. MIT/Apache-only deps (no MPL); see deny.toml + the v0.3.35
236// pre-flight audit doc for the rationale.
237pub mod html_css;
238
239// FDF/XFDF form data export (v0.3.3)
240pub mod fdf;
241
242// XFA forms support (v0.3.2)
243pub mod xfa;
244
245// PDF editing (v0.3.0)
246pub mod editor;
247
248// Text search (v0.3.0)
249pub mod search;
250
251// Page rendering to images (optional, v0.3.0)
252#[cfg(feature = "rendering")]
253#[cfg_attr(docsrs, doc(cfg(feature = "rendering")))]
254pub mod rendering;
255
256// Debug visualization for PDF analysis (optional, v0.3.0)
257#[cfg(feature = "rendering")]
258#[cfg_attr(docsrs, doc(cfg(feature = "rendering")))]
259pub mod debug;
260
261// Digital signatures (optional, v0.3.0)
262#[cfg(feature = "signatures")]
263#[cfg_attr(docsrs, doc(cfg(feature = "signatures")))]
264pub mod signatures;
265
266// Parallel page extraction (optional, v0.3.10)
267#[cfg(feature = "parallel")]
268#[cfg_attr(docsrs, doc(cfg(feature = "parallel")))]
269pub mod parallel;
270
271// Batch processing API (v0.3.10)
272#[cfg(not(target_arch = "wasm32"))]
273pub mod batch;
274
275// PDF/A compliance validation (v0.3.0)
276pub mod compliance;
277
278// High-level API (v0.3.0)
279pub mod api;
280
281// Re-export specific types from pipeline for use by converters
282pub use pipeline::XYCutStrategy;
283
284// Configuration
285pub mod config;
286
287// Hybrid classical + ML orchestration
288pub mod hybrid;
289
290// OCR - PaddleOCR via a pluggable inference backend (optional).
291// Native ONNX Runtime when `ocr` is on; otherwise the pure-Rust
292// `tract` backend (`ocr-tract`, which `ml` implies and the
293// browser/Deno/edge `wasm-ocr` build uses — issue #524). Exposing OCR
294// wherever the tract backend is available costs only the small OCR
295// module itself and keeps it host-testable without a native dylib.
296#[cfg(any(feature = "ocr", feature = "ocr-tract"))]
297#[cfg_attr(docsrs, doc(cfg(any(feature = "ocr", feature = "ocr-tract"))))]
298pub mod ocr;
299
300// C FFI for Go, Node.js, C# bindings (not available on wasm32)
301#[cfg(not(target_arch = "wasm32"))]
302pub mod ffi;
303
304// Python bindings (optional)
305#[cfg(feature = "python")]
306mod python;
307
308// WASM bindings (optional)
309#[cfg(any(target_arch = "wasm32", test))]
310#[cfg(feature = "wasm")]
311pub mod wasm;
312
313// Re-exports
314pub use annotation_types::{
315    AnnotationBorderStyle, AnnotationColor, AnnotationFlags, AnnotationSubtype, BorderEffectStyle,
316    BorderStyleType, CaretSymbol, FileAttachmentIcon, FreeTextIntent, HighlightMode,
317    LineEndingStyle, QuadPoint, ReplyType, StampType, TextAlignment, TextAnnotationIcon,
318    TextMarkupType, WidgetFieldType,
319};
320pub use annotations::{Annotation, LinkAction, LinkDestination};
321pub use config::{DocumentType, ExtractionProfile};
322pub use document::{ExtractedImageRef, ImageFormat, PdfDocument, ReadingOrder};
323pub use error::{Error, Result};
324pub use extractors::images::{PdfFilter, PdfImageHandle};
325pub use layout::PageText;
326pub use outline::{Destination, OutlineItem};
327pub use redaction::{
328    redact_content_stream, Classification, FontInfoMetrics, OcgPolicy, RedactionOptions,
329    RedactionRegion, RedactionReport, RegionSet,
330};
331pub use structured::{ColumnMode, RegionRole, StructuredPage, StructuredRegion};
332
333// Global font cache for batch processing
334pub use fonts::global_cache::{
335    clear_global_font_cache, global_font_cache_stats, set_global_font_cache_capacity,
336};
337
338// Global CMap cache management
339pub use fonts::cmap::{clear_cmap_cache, cmap_cache_size};
340
341#[cfg(feature = "parallel")]
342pub use parallel::{extract_all_markdown_parallel, extract_all_text_parallel, ParallelExtractor};
343
344// Internal utilities
345pub(crate) mod utils {
346    //! Internal utility functions for the library.
347
348    use std::cmp::Ordering;
349
350    /// Safely truncate a string to at most `max_bytes` from the start
351    /// without splitting a multi-byte UTF-8 character.
352    ///
353    /// Returns the full string if it is shorter than `max_bytes`.
354    /// When truncation lands inside a multi-byte character, the boundary
355    /// is rounded **down** to the nearest char boundary (floor).
356    #[inline]
357    pub fn safe_prefix(s: &str, max_bytes: usize) -> &str {
358        if s.len() <= max_bytes {
359            return s;
360        }
361        let mut end = max_bytes;
362        while end > 0 && !s.is_char_boundary(end) {
363            end -= 1;
364        }
365        &s[..end]
366    }
367
368    /// Safely take the last `max_bytes` of a string without splitting
369    /// a multi-byte UTF-8 character.
370    ///
371    /// Returns the full string if it is shorter than `max_bytes`.
372    /// When the computed start offset lands inside a multi-byte character,
373    /// the boundary is rounded **up** to the nearest char boundary (ceil).
374    #[inline]
375    pub fn safe_suffix(s: &str, max_bytes: usize) -> &str {
376        if s.len() <= max_bytes {
377            return s;
378        }
379        let start = s.len() - max_bytes;
380        let mut safe_start = start;
381        while safe_start < s.len() && !s.is_char_boundary(safe_start) {
382            safe_start += 1;
383        }
384        &s[safe_start..]
385    }
386
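    /// Y-band width used by `row_aware_span_cmp`.
    ///
    /// Each span's top-Y is divided by this value and rounded to the nearest
    /// integer to obtain its row band; spans that land in the same band are
    /// treated as lying on the same row. Because the bands are fixed buckets,
    /// two spans closer together than this value can still fall into adjacent
    /// bands when they straddle a bucket edge. The value was chosen to absorb
    /// typographic baseline jitter for 10-12pt body text and glyph-cluster
    /// offsets in CJK fonts without merging adjacent 14pt-leading lines.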
393    pub const ROW_BAND_TOLERANCE_PT: f32 = 3.0;
394
395    /// Row-aware reading-order comparator for spans.
396    ///
397    /// Sorts primarily by "row band" (top-Y quantized to
398    /// `ROW_BAND_TOLERANCE_PT`, larger Y first per PDF Spec ISO 32000-1:2008
399    /// §8.3.2.3) and secondarily by X (left-to-right within a row). This
400    /// keeps tabular layouts where cells in the same logical row have
401    /// slightly different Y values (font-metric jitter, superscripts, CJK
402    /// glyph centering) from being interleaved by a strict Y sort.
403    ///
404    /// Uses `i32` band keys so the ordering is a valid total order —
405    /// comparing raw Y values with tolerance is non-transitive and would
406    /// break `sort_by`.
407    #[inline]
408    pub fn row_aware_span_cmp(a_y: f32, a_x: f32, b_y: f32, b_x: f32) -> Ordering {
409        // Non-finite Y (NaN/±Inf) cannot be quantized into an i32 band —
410        // `as i32` saturates, collapsing distinct non-finite values into
411        // the same band and reordering them unpredictably against finite
412        // spans. Fall back to `safe_float_cmp` so non-finite values follow
413        // the same NaN-last / total-order policy used everywhere else.
414        if !a_y.is_finite() || !b_y.is_finite() {
415            return safe_float_cmp(b_y, a_y).then_with(|| safe_float_cmp(a_x, b_x));
416        }
417        let band_a = (a_y / ROW_BAND_TOLERANCE_PT).round() as i32;
418        let band_b = (b_y / ROW_BAND_TOLERANCE_PT).round() as i32;
419        // Larger Y = higher on page → descending band order.
420        match band_b.cmp(&band_a) {
421            Ordering::Equal => safe_float_cmp(a_x, b_x),
422            other => other,
423        }
424    }
425
426    /// Right-to-left variant of [`row_aware_span_cmp`] (issues #656/#657).
427    ///
428    /// Identical row banding (lines top-to-bottom), but orders spans
429    /// **right-to-left within a row** (X descending). A pure-RTL line's
430    /// logical reading order *is* its rightmost-first geometric order, so
431    /// sorting word-spans by descending X reconstructs logical order
432    /// directly from page geometry — independent of whether the producer
433    /// stored the run in visual or logical order. Used by the tagged
434    /// struct-tree assemblers, which otherwise have no span-order pass for
435    /// RTL (the untagged `reverse_rtl_visual_order_runs` is never reached
436    /// on tagged pages).
437    ///
438    /// Retained as a tested geometric utility: the tagged RTL assembler now
439    /// orders pure-RTL spans via `document::PdfDocument::order_pure_rtl_spans`
440    /// (font-relative line grouping), which subsumes the fixed-band comparator,
441    /// so this has no production caller at present.
442    #[inline]
443    #[allow(dead_code)]
444    pub fn row_aware_span_cmp_rtl(a_y: f32, a_x: f32, b_y: f32, b_x: f32) -> Ordering {
445        if !a_y.is_finite() || !b_y.is_finite() {
446            return safe_float_cmp(b_y, a_y).then_with(|| safe_float_cmp(b_x, a_x));
447        }
448        let band_a = (a_y / ROW_BAND_TOLERANCE_PT).round() as i32;
449        let band_b = (b_y / ROW_BAND_TOLERANCE_PT).round() as i32;
450        match band_b.cmp(&band_a) {
451            Ordering::Equal => safe_float_cmp(b_x, a_x), // X descending = RTL
452            other => other,
453        }
454    }
455
456    /// Safely compare two floating point numbers, handling NaN cases.
457    ///
458    /// NaN values are treated as equal to each other and greater than all other values.
459    /// This ensures that sorting operations never panic due to NaN comparisons.
460    ///
461    /// # Examples
462    ///
463    /// ```ignore
464    /// # use std::cmp::Ordering;
465    /// # use pdf_oxide::utils::safe_float_cmp;
466    /// assert_eq!(safe_float_cmp(1.0, 2.0), Ordering::Less);
467    /// assert_eq!(safe_float_cmp(2.0, 1.0), Ordering::Greater);
468    /// assert_eq!(safe_float_cmp(1.0, 1.0), Ordering::Equal);
469    ///
470    /// // NaN handling
471    /// assert_eq!(safe_float_cmp(f32::NAN, f32::NAN), Ordering::Equal);
472    /// assert_eq!(safe_float_cmp(f32::NAN, 1.0), Ordering::Greater);
473    /// assert_eq!(safe_float_cmp(1.0, f32::NAN), Ordering::Less);
474    /// ```
475    #[inline]
476    pub fn safe_float_cmp(a: f32, b: f32) -> Ordering {
477        match (a.is_nan(), b.is_nan()) {
478            (true, true) => Ordering::Equal,
479            (true, false) => Ordering::Greater, // NaN > all numbers
480            (false, true) => Ordering::Less,    // all numbers < NaN
481            (false, false) => {
482                // Both are normal numbers, safe to unwrap
483                a.partial_cmp(&b).unwrap()
484            },
485        }
486    }
487
488    /// Sort `items` into row-band reading order, computing each element's band
489    /// key once instead of re-quantizing on every `row_aware_span_cmp`
490    /// comparison.
491    ///
492    /// When all `y`/`x` are finite this is a cached-key stable sort with the
493    /// same order as `sort_by(row_aware_span_cmp)` (band descending, then `x`
494    /// ascending — `f32::total_cmp` equals `safe_float_cmp` for finite values,
495    /// and both are stable on ties). Otherwise it falls back to the comparator
496    /// so the NaN/±∞ policy is unchanged.
497    pub fn sort_by_row_band<T>(
498        items: &mut [T],
499        get_y: impl Fn(&T) -> f32,
500        get_x: impl Fn(&T) -> f32,
501    ) {
502        let all_finite = items
503            .iter()
504            .all(|it| get_y(it).is_finite() && get_x(it).is_finite());
505        if !all_finite {
506            items.sort_by(|a, b| row_aware_span_cmp(get_y(a), get_x(a), get_y(b), get_x(b)));
507            return;
508        }
509        // Cached-key stable sort. `total_cmp` matches `safe_float_cmp` for the
510        // finite values we gated on above.
511        items.sort_by_cached_key(|it| {
512            let band = (get_y(it) / ROW_BAND_TOLERANCE_PT).round() as i32;
513            // Reverse band → larger Y (higher on page) first, matching the
514            // comparator's `band_b.cmp(&band_a)`.
515            (std::cmp::Reverse(band), F32Ord(get_x(it)))
516        });
517    }
518
519    /// Total-order wrapper over `f32` for use as a sort key. For finite values
520    /// `total_cmp` is identical to `safe_float_cmp` / `partial_cmp`.
521    #[derive(Clone, Copy, PartialEq)]
522    struct F32Ord(f32);
523    impl Eq for F32Ord {}
524    impl PartialOrd for F32Ord {
525        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
526            Some(self.cmp(other))
527        }
528    }
529    impl Ord for F32Ord {
530        fn cmp(&self, other: &Self) -> Ordering {
531            self.0.total_cmp(&other.0)
532        }
533    }
534
535    #[cfg(test)]
536    mod tests {
537        use super::*;
538
539        /// The cached-key sort must produce the identical permutation to
540        /// `sort_by(row_aware_span_cmp)` on finite inputs.
541        #[test]
542        fn test_sort_by_row_band_matches_comparator() {
543            // Deterministic pseudo-random spans (no rng in tests).
544            let raw: Vec<(f32, f32)> = (0..500)
545                .map(|i| {
546                    let y = ((i * 37 % 113) as f32) * 1.3;
547                    let x = ((i * 71 % 97) as f32) * 2.1;
548                    (y, x)
549                })
550                .collect();
551            let mut a = raw.clone();
552            let mut b = raw.clone();
553            sort_by_row_band(&mut a, |t| t.0, |t| t.1);
554            b.sort_by(|p, q| row_aware_span_cmp(p.0, p.1, q.0, q.1));
555            assert_eq!(a, b, "cached-key sort must match the comparator permutation");
556        }
557
558        #[test]
559        fn test_safe_float_cmp_normal() {
560            assert_eq!(safe_float_cmp(1.0, 2.0), Ordering::Less);
561            assert_eq!(safe_float_cmp(2.0, 1.0), Ordering::Greater);
562            assert_eq!(safe_float_cmp(1.5, 1.5), Ordering::Equal);
563        }
564
565        #[test]
566        fn test_safe_float_cmp_nan() {
567            assert_eq!(safe_float_cmp(f32::NAN, f32::NAN), Ordering::Equal);
568            assert_eq!(safe_float_cmp(f32::NAN, 0.0), Ordering::Greater);
569            assert_eq!(safe_float_cmp(0.0, f32::NAN), Ordering::Less);
570        }
571
572        #[test]
573        fn test_safe_float_cmp_infinity() {
574            assert_eq!(safe_float_cmp(f32::INFINITY, f32::INFINITY), Ordering::Equal);
575            assert_eq!(safe_float_cmp(f32::INFINITY, 1.0), Ordering::Greater);
576            assert_eq!(safe_float_cmp(f32::NEG_INFINITY, f32::INFINITY), Ordering::Less);
577        }
578
579        /// Verify that sort_by using safe_float_cmp never panics with NaN values.
580        /// This is a regression test for the "total order" panic that affected 42
581        /// PDFs across 5 test datasets (issue found in v0.3.11-pre).
582        #[test]
583        fn test_sort_with_nan_does_not_panic() {
584            let mut values = [3.0_f32, f32::NAN, 1.0, f32::NAN, 2.0, f32::NAN, 0.5];
585            values.sort_by(|a, b| safe_float_cmp(*a, *b));
586            // NaN values should sort to the end (NaN > all numbers)
587            assert!(values[0..4].iter().all(|v| !v.is_nan()));
588            assert!(values[4..].iter().all(|v| v.is_nan()));
589        }
590
591        /// Verify transitivity: if a < b and b < c then a < c.
592        /// The previous `partial_cmp().unwrap_or(Equal)` pattern violated this
593        /// when NaN was involved, causing Rust's sort to panic.
594        #[test]
595        fn test_safe_float_cmp_transitivity() {
596            let a = 1.0_f32;
597            let b = 2.0_f32;
598            let nan = f32::NAN;
599
600            // a < b
601            assert_eq!(safe_float_cmp(a, b), Ordering::Less);
602            // b < NaN
603            assert_eq!(safe_float_cmp(b, nan), Ordering::Less);
604            // Therefore a < NaN (transitivity)
605            assert_eq!(safe_float_cmp(a, nan), Ordering::Less);
606        }
607
608        /// Cells in the same tabular row with slightly-different Y values
609        /// must stay together and be ordered by X, not interleaved with
610        /// cells from other rows.
611        #[test]
612        fn test_row_aware_span_cmp_tolerates_y_jitter() {
613            // Row 1 at y ≈ 100 with small per-cell jitter.
614            // Row 2 at y ≈ 86 (14pt leading below).
615            // A strict Y sort would interleave them because some row-1
616            // cells have lower Y than some row-2 cells.
617            #[derive(Debug, Clone, Copy)]
618            struct Cell {
619                y: f32,
620                x: f32,
621                id: &'static str,
622            }
623            let mut cells = [
624                Cell {
625                    y: 100.5,
626                    x: 50.0,
627                    id: "r1-c1",
628                },
629                Cell {
630                    y: 99.7,
631                    x: 150.0,
632                    id: "r1-c2",
633                },
634                Cell {
635                    y: 100.2,
636                    x: 250.0,
637                    id: "r1-c3",
638                },
639                Cell {
640                    y: 86.4,
641                    x: 50.0,
642                    id: "r2-c1",
643                },
644                Cell {
645                    y: 85.8,
646                    x: 150.0,
647                    id: "r2-c2",
648                },
649                Cell {
650                    y: 86.1,
651                    x: 250.0,
652                    id: "r2-c3",
653                },
654            ];
655            cells.sort_by(|a, b| row_aware_span_cmp(a.y, a.x, b.y, b.x));
656            let order: Vec<&str> = cells.iter().map(|c| c.id).collect();
657            assert_eq!(
658                order,
659                vec!["r1-c1", "r1-c2", "r1-c3", "r2-c1", "r2-c2", "r2-c3"],
660                "cells from the same row must stay contiguous and X-sorted"
661            );
662        }
663
664        /// Row-aware comparator must still put distinct-leading rows in
665        /// top-to-bottom reading order.
666        #[test]
667        fn test_row_aware_span_cmp_distinct_rows_descending() {
668            let mut rows = [
669                (100.0f32, 0.0f32, "top"),
670                (50.0, 0.0, "middle"),
671                (10.0, 0.0, "bottom"),
672            ];
673            rows.sort_by(|a, b| row_aware_span_cmp(a.0, a.1, b.0, b.1));
674            assert_eq!(rows[0].2, "top");
675            assert_eq!(rows[1].2, "middle");
676            assert_eq!(rows[2].2, "bottom");
677        }
678
679        /// The comparator is used by sort_by, which requires a valid total
680        /// order. Run a randomized stress test to confirm no transitivity
681        /// panics.
682        #[test]
683        fn test_row_aware_span_cmp_is_total_order() {
684            let mut v: Vec<(f32, f32)> = (0..200)
685                .map(|i| ((i as f32) * 0.73, ((i * 17) % 500) as f32))
686                .collect();
687            v.sort_by(|a, b| row_aware_span_cmp(a.0, a.1, b.0, b.1));
688        }
689
690        /// #656/#657: the RTL variant keeps rows top-to-bottom but orders
691        /// X *descending* (right-to-left) within a row — a pure-RTL line's
692        /// logical reading order.
693        #[test]
694        fn test_row_aware_span_cmp_rtl_within_row_is_descending() {
695            // Same row (Y within band), laid out left-to-right by X.
696            let mut row = [
697                (100.0f32, 10.0f32, "leftmost"),
698                (100.0, 50.0, "mid"),
699                (100.0, 90.0, "rightmost"),
700            ];
701            row.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1));
702            // Rightmost (highest X) reads first in RTL.
703            assert_eq!(["rightmost", "mid", "leftmost"], [row[0].2, row[1].2, row[2].2]);
704        }
705
706        /// Rows still order top-to-bottom regardless of the within-row flip.
707        #[test]
708        fn test_row_aware_span_cmp_rtl_rows_top_to_bottom() {
709            let mut rows = [
710                (10.0f32, 0.0f32, "bottom"),
711                (100.0, 0.0, "top"),
712                (50.0, 0.0, "middle"),
713            ];
714            rows.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1));
715            assert_eq!(["top", "middle", "bottom"], [rows[0].2, rows[1].2, rows[2].2]);
716        }
717
718        /// Must be a valid total order for `sort_by` (no transitivity panic).
719        #[test]
720        fn test_row_aware_span_cmp_rtl_is_total_order() {
721            let mut v: Vec<(f32, f32)> = (0..200)
722                .map(|i| ((i as f32) * 0.73, ((i * 17) % 500) as f32))
723                .collect();
724            v.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1));
725        }
726
727        /// Sort a large array with mixed NaN/normal values to stress-test.
728        #[test]
729        fn test_sort_stress_with_nan() {
730            let mut values: Vec<f32> = (0..100).map(|i| i as f32).collect();
731            // Insert NaN at various positions
732            for i in (0..100).step_by(7) {
733                values[i] = f32::NAN;
734            }
735            // Must not panic
736            values.sort_by(|a, b| safe_float_cmp(*a, *b));
737        }
738
739        #[test]
740        fn test_safe_prefix_ascii() {
741            assert_eq!(safe_prefix("hello", 3), "hel");
742            assert_eq!(safe_prefix("hello", 10), "hello");
743            assert_eq!(safe_prefix("", 5), "");
744            assert_eq!(safe_prefix("hi", 0), "");
745        }
746
747        #[test]
748        fn test_safe_prefix_multibyte() {
749            let text = "✚✳★✵"; // 4 × 3-byte chars = 12 bytes
750            assert_eq!(safe_prefix(text, 10), "✚✳★"); // rounds down from 10 to 9
751            assert_eq!(safe_prefix(text, 9), "✚✳★"); // exact boundary
752            assert_eq!(safe_prefix(text, 12), "✚✳★✵"); // full string
753        }
754
755        #[test]
756        fn test_safe_suffix_ascii() {
757            assert_eq!(safe_suffix("hello", 3), "llo");
758            assert_eq!(safe_suffix("hello", 10), "hello");
759            assert_eq!(safe_suffix("", 5), "");
760            assert_eq!(safe_suffix("hi", 0), "");
761        }
762
763        #[test]
764        fn test_safe_suffix_multibyte() {
765            let text = "AB✚✳★✵"; // 14 bytes: A(0) B(1) ✚(2..5) ✳(5..8) ★(8..11) ✵(11..14)
766                                 // 14 - 10 = 4, byte 4 is inside ✚ → rounds up to 5
767            assert_eq!(safe_suffix(text, 10), "✳★✵");
768        }
769    }
770}
771
772// Version info
773/// Library version
774pub const VERSION: &str = env!("CARGO_PKG_VERSION");
775
776/// Library name
777pub const NAME: &str = env!("CARGO_PKG_NAME");
778
#[cfg(test)]
mod tests {
    use super::{NAME, VERSION};

    /// `VERSION` is filled in from `CARGO_PKG_VERSION` at compile time; the
    /// test only checks that it starts with the `0.` major version.
    #[test]
    fn test_version() {
        let is_zero_major = VERSION.starts_with("0.");
        assert!(is_zero_major);
    }

    /// `NAME` is filled in from `CARGO_PKG_NAME` and must be the crate name.
    #[test]
    fn test_name() {
        let expected = "pdf_oxide";
        assert_eq!(NAME, expected);
    }
}
793}