pdf_oxide/lib.rs
1// SPDX-License-Identifier: MIT OR Apache-2.0
2// Allow some clippy lints that are too pedantic for this project
3#![allow(clippy::type_complexity)]
4#![allow(clippy::too_many_arguments)]
5#![allow(clippy::needless_range_loop)]
6#![allow(clippy::enum_variant_names)]
7#![allow(clippy::wrong_self_convention)]
8#![allow(clippy::explicit_counter_loop)]
9#![allow(clippy::doc_overindented_list_items)]
10#![allow(clippy::should_implement_trait)]
11#![allow(clippy::redundant_guards)]
12#![allow(clippy::regex_creation_in_loops)]
13#![allow(clippy::manual_find)]
14#![allow(clippy::match_like_matches_macro)]
15#![allow(clippy::collapsible_match)]
16// Allow unused for tests
17#![cfg_attr(test, allow(dead_code))]
18#![cfg_attr(test, allow(unused_variables))]
19
20//! # PDF Oxide
21//!
22//! The fastest PDF library for Python and Rust. 0.8ms mean text extraction — 5× faster than
23//! PyMuPDF, 15× faster than pypdf, 29× faster than pdfplumber. 100% pass rate on 3,830
24//! real-world PDFs. MIT licensed. A drop-in PyMuPDF alternative with no AGPL restrictions.
25//!
26//! ## Performance (v0.3.10)
27//!
28//! Benchmarked against 14 text extraction libraries on 3,830 PDFs from 3 public test suites
29//! (veraPDF, Mozilla pdf.js, DARPA SafeDocs). Single-thread, 60s timeout, no warm-up.
30//!
31//! ### Python PDF Libraries
32//!
33//! | Library | Mean | Pass Rate | License |
34//! |---------|------|-----------|---------|
35//! | **pdf_oxide** | **0.8ms** | **100%** | **MIT** |
36//! | PyMuPDF | 4.6ms | 99.3% | AGPL-3.0 |
37//! | pypdfium2 | 4.1ms | 99.2% | Apache-2.0 |
38//! | pymupdf4llm | 55.5ms | 99.1% | AGPL-3.0 |
39//! | pdftext | 7.3ms | 99.0% | GPL-3.0 |
40//! | pdfminer | 16.8ms | 98.8% | MIT |
41//! | pdfplumber | 23.2ms | 98.8% | MIT |
42//! | markitdown | 108.8ms | 98.6% | MIT |
43//! | pypdf | 12.1ms | 98.4% | BSD-3 |
44//!
45//! ### Rust PDF Libraries
46//!
47//! | Library | Mean | Pass Rate | Text Extraction |
48//! |---------|------|-----------|-----------------|
49//! | **pdf_oxide** | **0.8ms** | **100%** | **Built-in** |
50//! | oxidize_pdf | 13.5ms | 99.1% | Basic |
51//! | unpdf | 2.8ms | 95.1% | Basic |
52//! | pdf_extract | 4.08ms | 91.5% | Basic |
53//! | lopdf | 0.3ms | 80.2% | No built-in extraction |
54//!
55//! 99.5% text quality parity vs PyMuPDF and pypdfium2 across the full corpus.
56//! Full benchmark details: <https://pdf.oxide.fyi/docs/performance>
57//!
58//! ## Core Features
59//!
60//! ### Reading & Extraction
61//! - **Text Extraction**: Character, span, and page-level with font metadata and bounding boxes
62//! - **Reading Order**: 4 pluggable strategies (XY-Cut, Structure Tree, Geometric, Simple)
63//! - **Complex Scripts**: RTL (Arabic/Hebrew), CJK (Japanese/Korean/Chinese), Devanagari, Thai
64//! - **Format Conversion**: PDF → Markdown, HTML, PlainText
65//! - **Image Extraction**: Content streams, Form XObjects, inline images
66//! - **Forms & Annotations**: Read/write form fields, all annotation types, bookmarks
67//! - **Text Search**: Regex and case-insensitive search with page-level results
68//!
69//! ### Writing & Creation
70//! - **PDF Generation**: Fluent DocumentBuilder API for programmatic PDF creation
71//! - **Format Conversion**: Markdown → PDF, HTML → PDF, Plain Text → PDF, Image → PDF
72//! - **Advanced Graphics**: Path operations, image embedding, table generation
73//! - **Font Embedding**: Automatic font subsetting for compact output
74//! - **Interactive Forms**: Fillable forms with text fields, checkboxes, radio buttons, dropdowns
75//! - **QR Codes & Barcodes**: Code128, EAN-13, UPC-A (feature flag: `barcodes`)
76//!
77//! ### Editing
78//! - **DOM-like API**: Query and modify PDF content with strongly-typed wrappers
79//! - **Element Modification**: Find and replace text, modify images, paths, tables
80//! - **Page Operations**: Add, remove, reorder, merge, rotate, crop pages
81//! - **Encryption**: AES-256, password protection
82//! - **Incremental Saves**: Efficient appending without full rewrite
83//!
84//! ### Compliance
85//! - **PDF/A**: Validation and conversion
86//! - **PDF/UA**: Accessibility checks
87//! - **PDF/X**: Print production validation
88//!
89//! ## Quick Start - Rust
90//!
91//! ```ignore
92//! use pdf_oxide::PdfDocument;
93//! use pdf_oxide::pipeline::{TextPipeline, TextPipelineConfig};
94//! use pdf_oxide::pipeline::converters::OutputConverter;
95//! use pdf_oxide::pipeline::converters::MarkdownOutputConverter;
96//!
97//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
98//! // Open a PDF
99//! let mut doc = PdfDocument::open("paper.pdf")?;
100//!
101//! // Extract text with reading order (multi-column support)
102//! let spans = doc.extract_spans(0)?;
103//! let config = TextPipelineConfig::default();
104//! let pipeline = TextPipeline::with_config(config.clone());
105//! let ordered_spans = pipeline.process(spans, Default::default())?;
106//!
107//! // Convert to Markdown
108//! let converter = MarkdownOutputConverter::new();
109//! let markdown = converter.convert(&ordered_spans, &config)?;
110//! println!("{}", markdown);
111//! # Ok(())
112//! # }
113//! ```
114//!
115//! ## Quick Start - Python
116//!
117//! ```text
118//! from pdf_oxide import PdfDocument
119//!
120//! # Open and extract with automatic reading order
121//! doc = PdfDocument("paper.pdf")
122//! markdown = doc.to_markdown(0)
123//! print(markdown)
124//! ```
125//!
126//! ## License
127//!
128//! Licensed under either of:
129//!
130//! * Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>)
131//! * MIT license ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)
132//!
133//! at your option.
134
135#![warn(missing_docs)]
136#![cfg_attr(docsrs, feature(doc_cfg))]
137
// `fips` and `legacy-crypto` are mutually exclusive: FIPS 140-3 forbids MD5
// and RC4, which `legacy-crypto` pulls in. Build FIPS without legacy crypto:
//   cargo build --no-default-features --features fips,icc
// Enabling both features is rejected at compile time with the message below.
#[cfg(all(feature = "fips", feature = "legacy-crypto"))]
compile_error!(
    "Features `fips` and `legacy-crypto` are mutually exclusive. \
     FIPS 140-3 forbids MD5 (pulled in by `legacy-crypto`). \
     Build with: --no-default-features --features fips,icc"
);

// Glibc 2.34 compatibility (#416): LLVM may emit calls to __memcmpeq@GLIBC_2.35,
// which does not exist in glibc 2.34 (Amazon Linux 2023, some Ubuntu 22.04 builds).
// A weak stub redirecting to plain memcmp satisfies the reference on older glibc;
// glibc 2.35's own definition wins when available. global_asm! works with both
// GNU ld and lld, unlike --defsym which lld rejects for PLT-resolved symbols.
// Only emitted for x86_64 Linux targets (see the `cfg` below).
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
core::arch::global_asm!(
    ".weak __memcmpeq",
    ".type __memcmpeq, @function",
    "__memcmpeq:",
    "jmp memcmp@PLT",
);
160
161// Error handling
162pub mod error;
163
164// General-purpose caching utilities
165pub(crate) mod cache;
166
167// Core PDF parsing
168pub mod document;
169pub mod lexer;
170pub mod object;
171pub mod objstm;
172pub mod parser;
173/// Parser configuration options
174pub mod parser_config;
175pub mod xref;
176pub mod xref_reconstruction;
177
178// Stream decoders
179pub mod decoders;
180
181// PDF function evaluators (Type 4 PostScript calculator)
182pub mod functions;
183
184// Colour management (ICC profile handling)
185pub mod color;
186
187// Pluggable cryptographic backend (FIPS / sovereign-jurisdiction
188// providers). Issue #236.
189pub mod crypto;
190
191// Encryption support
192pub mod encryption;
193
194// Layout analysis
195pub mod geometry;
196pub mod layout;
197
198// Text extraction
199pub mod content;
200pub mod extractors;
201pub mod fonts;
202pub mod optional_content;
203pub mod text;
204
205// Document structure
206/// Core annotation types and enums per PDF spec
207pub mod annotation_types;
208pub mod annotations;
209/// Content elements for PDF generation
210pub mod elements;
211/// Cross-platform-safe filename slug helpers (shared, pure).
212pub mod filename;
213pub mod outline;
214/// True/destructive redaction + document sanitization (#231).
215pub mod redaction;
216/// Split a PDF into multiple PDFs at outline (bookmark) boundaries (#482).
217pub mod split_bookmarks;
218/// PDF logical structure (Tagged PDFs)
219pub mod structure;
220
221/// Structured per-page extraction (`extract_structured`, #536)
222pub mod structured;
223
224// Format converters
225pub mod converters;
226
227// Pipeline architecture for text extraction
228pub mod pipeline;
229
230// PDF writing/creation (v0.3.0)
231pub mod writer;
232
233// HTML + CSS → PDF pipeline (v0.3.35, issue #248). Hand-rolled tokenizer,
234// parser, selector matcher, cascade, layout glue, paginator, and paint
235// emitter. MIT/Apache-only deps (no MPL); see deny.toml + the v0.3.35
236// pre-flight audit doc for the rationale.
237pub mod html_css;
238
239// FDF/XFDF form data export (v0.3.3)
240pub mod fdf;
241
242// XFA forms support (v0.3.2)
243pub mod xfa;
244
245// PDF editing (v0.3.0)
246pub mod editor;
247
248// Text search (v0.3.0)
249pub mod search;
250
251// Page rendering to images (optional, v0.3.0)
252#[cfg(feature = "rendering")]
253#[cfg_attr(docsrs, doc(cfg(feature = "rendering")))]
254pub mod rendering;
255
256// Debug visualization for PDF analysis (optional, v0.3.0)
257#[cfg(feature = "rendering")]
258#[cfg_attr(docsrs, doc(cfg(feature = "rendering")))]
259pub mod debug;
260
261// Digital signatures (optional, v0.3.0)
262#[cfg(feature = "signatures")]
263#[cfg_attr(docsrs, doc(cfg(feature = "signatures")))]
264pub mod signatures;
265
266// Parallel page extraction (optional, v0.3.10)
267#[cfg(feature = "parallel")]
268#[cfg_attr(docsrs, doc(cfg(feature = "parallel")))]
269pub mod parallel;
270
// Batch processing API (v0.3.10; not available on wasm32)
272#[cfg(not(target_arch = "wasm32"))]
273pub mod batch;
274
275// PDF/A compliance validation (v0.3.0)
276pub mod compliance;
277
278// High-level API (v0.3.0)
279pub mod api;
280
281// Re-export specific types from pipeline for use by converters
282pub use pipeline::XYCutStrategy;
283
284// Configuration
285pub mod config;
286
287// Hybrid classical + ML orchestration
288pub mod hybrid;
289
290// OCR - PaddleOCR via a pluggable inference backend (optional).
291// Native ONNX Runtime when `ocr` is on; otherwise the pure-Rust
292// `tract` backend (`ocr-tract`, which `ml` implies and the
293// browser/Deno/edge `wasm-ocr` build uses — issue #524). Exposing OCR
294// wherever the tract backend is available costs only the small OCR
295// module itself and keeps it host-testable without a native dylib.
296#[cfg(any(feature = "ocr", feature = "ocr-tract"))]
297#[cfg_attr(docsrs, doc(cfg(any(feature = "ocr", feature = "ocr-tract"))))]
298pub mod ocr;
299
300// C FFI for Go, Node.js, C# bindings (not available on wasm32)
301#[cfg(not(target_arch = "wasm32"))]
302pub mod ffi;
303
304// Python bindings (optional)
305#[cfg(feature = "python")]
306mod python;
307
308// WASM bindings (optional)
309#[cfg(any(target_arch = "wasm32", test))]
310#[cfg(feature = "wasm")]
311pub mod wasm;
312
313// Re-exports
314pub use annotation_types::{
315 AnnotationBorderStyle, AnnotationColor, AnnotationFlags, AnnotationSubtype, BorderEffectStyle,
316 BorderStyleType, CaretSymbol, FileAttachmentIcon, FreeTextIntent, HighlightMode,
317 LineEndingStyle, QuadPoint, ReplyType, StampType, TextAlignment, TextAnnotationIcon,
318 TextMarkupType, WidgetFieldType,
319};
320pub use annotations::{Annotation, LinkAction, LinkDestination};
321pub use config::{DocumentType, ExtractionProfile};
322pub use document::{ExtractedImageRef, ImageFormat, PdfDocument, ReadingOrder};
323pub use error::{Error, Result};
324pub use extractors::images::{PdfFilter, PdfImageHandle};
325pub use layout::PageText;
326pub use outline::{Destination, OutlineItem};
327pub use redaction::{
328 redact_content_stream, Classification, FontInfoMetrics, OcgPolicy, RedactionOptions,
329 RedactionRegion, RedactionReport, RegionSet,
330};
331pub use structured::{ColumnMode, RegionRole, StructuredPage, StructuredRegion};
332
333// Global font cache for batch processing
334pub use fonts::global_cache::{
335 clear_global_font_cache, global_font_cache_stats, set_global_font_cache_capacity,
336};
337
338// Global CMap cache management
339pub use fonts::cmap::{clear_cmap_cache, cmap_cache_size};
340
341#[cfg(feature = "parallel")]
342pub use parallel::{extract_all_markdown_parallel, extract_all_text_parallel, ParallelExtractor};
343
344// Internal utilities
345pub(crate) mod utils {
346 //! Internal utility functions for the library.
347
348 use std::cmp::Ordering;
349
350 /// Safely truncate a string to at most `max_bytes` from the start
351 /// without splitting a multi-byte UTF-8 character.
352 ///
353 /// Returns the full string if it is shorter than `max_bytes`.
354 /// When truncation lands inside a multi-byte character, the boundary
355 /// is rounded **down** to the nearest char boundary (floor).
356 #[inline]
357 pub fn safe_prefix(s: &str, max_bytes: usize) -> &str {
358 if s.len() <= max_bytes {
359 return s;
360 }
361 let mut end = max_bytes;
362 while end > 0 && !s.is_char_boundary(end) {
363 end -= 1;
364 }
365 &s[..end]
366 }
367
368 /// Safely take the last `max_bytes` of a string without splitting
369 /// a multi-byte UTF-8 character.
370 ///
371 /// Returns the full string if it is shorter than `max_bytes`.
372 /// When the computed start offset lands inside a multi-byte character,
373 /// the boundary is rounded **up** to the nearest char boundary (ceil).
374 #[inline]
375 pub fn safe_suffix(s: &str, max_bytes: usize) -> &str {
376 if s.len() <= max_bytes {
377 return s;
378 }
379 let start = s.len() - max_bytes;
380 let mut safe_start = start;
381 while safe_start < s.len() && !s.is_char_boundary(safe_start) {
382 safe_start += 1;
383 }
384 &s[safe_start..]
385 }
386
387 /// Y-band tolerance used by `row_aware_span_cmp`.
388 ///
389 /// Two spans whose top-Y differs by less than this amount are treated
390 /// as lying on the same row. Chosen to absorb typographic baseline
391 /// jitter for 10-12pt body text and glyph-cluster offsets in CJK
392 /// fonts without merging adjacent 14pt-leading lines.
393 pub const ROW_BAND_TOLERANCE_PT: f32 = 3.0;
394
395 /// Row-aware reading-order comparator for spans.
396 ///
397 /// Sorts primarily by "row band" (top-Y quantized to
398 /// `ROW_BAND_TOLERANCE_PT`, larger Y first per PDF Spec ISO 32000-1:2008
399 /// §8.3.2.3) and secondarily by X (left-to-right within a row). This
400 /// keeps tabular layouts where cells in the same logical row have
401 /// slightly different Y values (font-metric jitter, superscripts, CJK
402 /// glyph centering) from being interleaved by a strict Y sort.
403 ///
404 /// Uses `i32` band keys so the ordering is a valid total order —
405 /// comparing raw Y values with tolerance is non-transitive and would
406 /// break `sort_by`.
407 #[inline]
408 pub fn row_aware_span_cmp(a_y: f32, a_x: f32, b_y: f32, b_x: f32) -> Ordering {
409 // Non-finite Y (NaN/±Inf) cannot be quantized into an i32 band —
410 // `as i32` saturates, collapsing distinct non-finite values into
411 // the same band and reordering them unpredictably against finite
412 // spans. Fall back to `safe_float_cmp` so non-finite values follow
413 // the same NaN-last / total-order policy used everywhere else.
414 if !a_y.is_finite() || !b_y.is_finite() {
415 return safe_float_cmp(b_y, a_y).then_with(|| safe_float_cmp(a_x, b_x));
416 }
417 let band_a = (a_y / ROW_BAND_TOLERANCE_PT).round() as i32;
418 let band_b = (b_y / ROW_BAND_TOLERANCE_PT).round() as i32;
419 // Larger Y = higher on page → descending band order.
420 match band_b.cmp(&band_a) {
421 Ordering::Equal => safe_float_cmp(a_x, b_x),
422 other => other,
423 }
424 }
425
426 /// Right-to-left variant of [`row_aware_span_cmp`] (issues #656/#657).
427 ///
428 /// Identical row banding (lines top-to-bottom), but orders spans
429 /// **right-to-left within a row** (X descending). A pure-RTL line's
430 /// logical reading order *is* its rightmost-first geometric order, so
431 /// sorting word-spans by descending X reconstructs logical order
432 /// directly from page geometry — independent of whether the producer
433 /// stored the run in visual or logical order. Used by the tagged
434 /// struct-tree assemblers, which otherwise have no span-order pass for
435 /// RTL (the untagged `reverse_rtl_visual_order_runs` is never reached
436 /// on tagged pages).
437 ///
438 /// Retained as a tested geometric utility: the tagged RTL assembler now
439 /// orders pure-RTL spans via `document::PdfDocument::order_pure_rtl_spans`
440 /// (font-relative line grouping), which subsumes the fixed-band comparator,
441 /// so this has no production caller at present.
442 #[inline]
443 #[allow(dead_code)]
444 pub fn row_aware_span_cmp_rtl(a_y: f32, a_x: f32, b_y: f32, b_x: f32) -> Ordering {
445 if !a_y.is_finite() || !b_y.is_finite() {
446 return safe_float_cmp(b_y, a_y).then_with(|| safe_float_cmp(b_x, a_x));
447 }
448 let band_a = (a_y / ROW_BAND_TOLERANCE_PT).round() as i32;
449 let band_b = (b_y / ROW_BAND_TOLERANCE_PT).round() as i32;
450 match band_b.cmp(&band_a) {
451 Ordering::Equal => safe_float_cmp(b_x, a_x), // X descending = RTL
452 other => other,
453 }
454 }
455
456 /// Safely compare two floating point numbers, handling NaN cases.
457 ///
458 /// NaN values are treated as equal to each other and greater than all other values.
459 /// This ensures that sorting operations never panic due to NaN comparisons.
460 ///
461 /// # Examples
462 ///
463 /// ```ignore
464 /// # use std::cmp::Ordering;
465 /// # use pdf_oxide::utils::safe_float_cmp;
466 /// assert_eq!(safe_float_cmp(1.0, 2.0), Ordering::Less);
467 /// assert_eq!(safe_float_cmp(2.0, 1.0), Ordering::Greater);
468 /// assert_eq!(safe_float_cmp(1.0, 1.0), Ordering::Equal);
469 ///
470 /// // NaN handling
471 /// assert_eq!(safe_float_cmp(f32::NAN, f32::NAN), Ordering::Equal);
472 /// assert_eq!(safe_float_cmp(f32::NAN, 1.0), Ordering::Greater);
473 /// assert_eq!(safe_float_cmp(1.0, f32::NAN), Ordering::Less);
474 /// ```
475 #[inline]
476 pub fn safe_float_cmp(a: f32, b: f32) -> Ordering {
477 match (a.is_nan(), b.is_nan()) {
478 (true, true) => Ordering::Equal,
479 (true, false) => Ordering::Greater, // NaN > all numbers
480 (false, true) => Ordering::Less, // all numbers < NaN
481 (false, false) => {
482 // Both are normal numbers, safe to unwrap
483 a.partial_cmp(&b).unwrap()
484 },
485 }
486 }
487
488 /// Sort `items` into row-band reading order, computing each element's band
489 /// key once instead of re-quantizing on every `row_aware_span_cmp`
490 /// comparison.
491 ///
492 /// When all `y`/`x` are finite this is a cached-key stable sort with the
493 /// same order as `sort_by(row_aware_span_cmp)` (band descending, then `x`
494 /// ascending — `f32::total_cmp` equals `safe_float_cmp` for finite values,
495 /// and both are stable on ties). Otherwise it falls back to the comparator
496 /// so the NaN/±∞ policy is unchanged.
497 pub fn sort_by_row_band<T>(
498 items: &mut [T],
499 get_y: impl Fn(&T) -> f32,
500 get_x: impl Fn(&T) -> f32,
501 ) {
502 let all_finite = items
503 .iter()
504 .all(|it| get_y(it).is_finite() && get_x(it).is_finite());
505 if !all_finite {
506 items.sort_by(|a, b| row_aware_span_cmp(get_y(a), get_x(a), get_y(b), get_x(b)));
507 return;
508 }
509 // Cached-key stable sort. `total_cmp` matches `safe_float_cmp` for the
510 // finite values we gated on above.
511 items.sort_by_cached_key(|it| {
512 let band = (get_y(it) / ROW_BAND_TOLERANCE_PT).round() as i32;
513 // Reverse band → larger Y (higher on page) first, matching the
514 // comparator's `band_b.cmp(&band_a)`.
515 (std::cmp::Reverse(band), F32Ord(get_x(it)))
516 });
517 }
518
519 /// Total-order wrapper over `f32` for use as a sort key. For finite values
520 /// `total_cmp` is identical to `safe_float_cmp` / `partial_cmp`.
521 #[derive(Clone, Copy, PartialEq)]
522 struct F32Ord(f32);
523 impl Eq for F32Ord {}
524 impl PartialOrd for F32Ord {
525 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
526 Some(self.cmp(other))
527 }
528 }
529 impl Ord for F32Ord {
530 fn cmp(&self, other: &Self) -> Ordering {
531 self.0.total_cmp(&other.0)
532 }
533 }
534
535 #[cfg(test)]
536 mod tests {
537 use super::*;
538
539 /// The cached-key sort must produce the identical permutation to
540 /// `sort_by(row_aware_span_cmp)` on finite inputs.
541 #[test]
542 fn test_sort_by_row_band_matches_comparator() {
543 // Deterministic pseudo-random spans (no rng in tests).
544 let raw: Vec<(f32, f32)> = (0..500)
545 .map(|i| {
546 let y = ((i * 37 % 113) as f32) * 1.3;
547 let x = ((i * 71 % 97) as f32) * 2.1;
548 (y, x)
549 })
550 .collect();
551 let mut a = raw.clone();
552 let mut b = raw.clone();
553 sort_by_row_band(&mut a, |t| t.0, |t| t.1);
554 b.sort_by(|p, q| row_aware_span_cmp(p.0, p.1, q.0, q.1));
555 assert_eq!(a, b, "cached-key sort must match the comparator permutation");
556 }
557
558 #[test]
559 fn test_safe_float_cmp_normal() {
560 assert_eq!(safe_float_cmp(1.0, 2.0), Ordering::Less);
561 assert_eq!(safe_float_cmp(2.0, 1.0), Ordering::Greater);
562 assert_eq!(safe_float_cmp(1.5, 1.5), Ordering::Equal);
563 }
564
565 #[test]
566 fn test_safe_float_cmp_nan() {
567 assert_eq!(safe_float_cmp(f32::NAN, f32::NAN), Ordering::Equal);
568 assert_eq!(safe_float_cmp(f32::NAN, 0.0), Ordering::Greater);
569 assert_eq!(safe_float_cmp(0.0, f32::NAN), Ordering::Less);
570 }
571
572 #[test]
573 fn test_safe_float_cmp_infinity() {
574 assert_eq!(safe_float_cmp(f32::INFINITY, f32::INFINITY), Ordering::Equal);
575 assert_eq!(safe_float_cmp(f32::INFINITY, 1.0), Ordering::Greater);
576 assert_eq!(safe_float_cmp(f32::NEG_INFINITY, f32::INFINITY), Ordering::Less);
577 }
578
579 /// Verify that sort_by using safe_float_cmp never panics with NaN values.
580 /// This is a regression test for the "total order" panic that affected 42
581 /// PDFs across 5 test datasets (issue found in v0.3.11-pre).
582 #[test]
583 fn test_sort_with_nan_does_not_panic() {
584 let mut values = [3.0_f32, f32::NAN, 1.0, f32::NAN, 2.0, f32::NAN, 0.5];
585 values.sort_by(|a, b| safe_float_cmp(*a, *b));
586 // NaN values should sort to the end (NaN > all numbers)
587 assert!(values[0..4].iter().all(|v| !v.is_nan()));
588 assert!(values[4..].iter().all(|v| v.is_nan()));
589 }
590
591 /// Verify transitivity: if a < b and b < c then a < c.
592 /// The previous `partial_cmp().unwrap_or(Equal)` pattern violated this
593 /// when NaN was involved, causing Rust's sort to panic.
594 #[test]
595 fn test_safe_float_cmp_transitivity() {
596 let a = 1.0_f32;
597 let b = 2.0_f32;
598 let nan = f32::NAN;
599
600 // a < b
601 assert_eq!(safe_float_cmp(a, b), Ordering::Less);
602 // b < NaN
603 assert_eq!(safe_float_cmp(b, nan), Ordering::Less);
604 // Therefore a < NaN (transitivity)
605 assert_eq!(safe_float_cmp(a, nan), Ordering::Less);
606 }
607
608 /// Cells in the same tabular row with slightly-different Y values
609 /// must stay together and be ordered by X, not interleaved with
610 /// cells from other rows.
611 #[test]
612 fn test_row_aware_span_cmp_tolerates_y_jitter() {
613 // Row 1 at y ≈ 100 with small per-cell jitter.
614 // Row 2 at y ≈ 86 (14pt leading below).
615 // A strict Y sort would interleave them because some row-1
616 // cells have lower Y than some row-2 cells.
617 #[derive(Debug, Clone, Copy)]
618 struct Cell {
619 y: f32,
620 x: f32,
621 id: &'static str,
622 }
623 let mut cells = [
624 Cell {
625 y: 100.5,
626 x: 50.0,
627 id: "r1-c1",
628 },
629 Cell {
630 y: 99.7,
631 x: 150.0,
632 id: "r1-c2",
633 },
634 Cell {
635 y: 100.2,
636 x: 250.0,
637 id: "r1-c3",
638 },
639 Cell {
640 y: 86.4,
641 x: 50.0,
642 id: "r2-c1",
643 },
644 Cell {
645 y: 85.8,
646 x: 150.0,
647 id: "r2-c2",
648 },
649 Cell {
650 y: 86.1,
651 x: 250.0,
652 id: "r2-c3",
653 },
654 ];
655 cells.sort_by(|a, b| row_aware_span_cmp(a.y, a.x, b.y, b.x));
656 let order: Vec<&str> = cells.iter().map(|c| c.id).collect();
657 assert_eq!(
658 order,
659 vec!["r1-c1", "r1-c2", "r1-c3", "r2-c1", "r2-c2", "r2-c3"],
660 "cells from the same row must stay contiguous and X-sorted"
661 );
662 }
663
664 /// Row-aware comparator must still put distinct-leading rows in
665 /// top-to-bottom reading order.
666 #[test]
667 fn test_row_aware_span_cmp_distinct_rows_descending() {
668 let mut rows = [
669 (100.0f32, 0.0f32, "top"),
670 (50.0, 0.0, "middle"),
671 (10.0, 0.0, "bottom"),
672 ];
673 rows.sort_by(|a, b| row_aware_span_cmp(a.0, a.1, b.0, b.1));
674 assert_eq!(rows[0].2, "top");
675 assert_eq!(rows[1].2, "middle");
676 assert_eq!(rows[2].2, "bottom");
677 }
678
679 /// The comparator is used by sort_by, which requires a valid total
680 /// order. Run a randomized stress test to confirm no transitivity
681 /// panics.
682 #[test]
683 fn test_row_aware_span_cmp_is_total_order() {
684 let mut v: Vec<(f32, f32)> = (0..200)
685 .map(|i| ((i as f32) * 0.73, ((i * 17) % 500) as f32))
686 .collect();
687 v.sort_by(|a, b| row_aware_span_cmp(a.0, a.1, b.0, b.1));
688 }
689
690 /// #656/#657: the RTL variant keeps rows top-to-bottom but orders
691 /// X *descending* (right-to-left) within a row — a pure-RTL line's
692 /// logical reading order.
693 #[test]
694 fn test_row_aware_span_cmp_rtl_within_row_is_descending() {
695 // Same row (Y within band), laid out left-to-right by X.
696 let mut row = [
697 (100.0f32, 10.0f32, "leftmost"),
698 (100.0, 50.0, "mid"),
699 (100.0, 90.0, "rightmost"),
700 ];
701 row.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1));
702 // Rightmost (highest X) reads first in RTL.
703 assert_eq!(["rightmost", "mid", "leftmost"], [row[0].2, row[1].2, row[2].2]);
704 }
705
706 /// Rows still order top-to-bottom regardless of the within-row flip.
707 #[test]
708 fn test_row_aware_span_cmp_rtl_rows_top_to_bottom() {
709 let mut rows = [
710 (10.0f32, 0.0f32, "bottom"),
711 (100.0, 0.0, "top"),
712 (50.0, 0.0, "middle"),
713 ];
714 rows.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1));
715 assert_eq!(["top", "middle", "bottom"], [rows[0].2, rows[1].2, rows[2].2]);
716 }
717
718 /// Must be a valid total order for `sort_by` (no transitivity panic).
719 #[test]
720 fn test_row_aware_span_cmp_rtl_is_total_order() {
721 let mut v: Vec<(f32, f32)> = (0..200)
722 .map(|i| ((i as f32) * 0.73, ((i * 17) % 500) as f32))
723 .collect();
724 v.sort_by(|a, b| row_aware_span_cmp_rtl(a.0, a.1, b.0, b.1));
725 }
726
727 /// Sort a large array with mixed NaN/normal values to stress-test.
728 #[test]
729 fn test_sort_stress_with_nan() {
730 let mut values: Vec<f32> = (0..100).map(|i| i as f32).collect();
731 // Insert NaN at various positions
732 for i in (0..100).step_by(7) {
733 values[i] = f32::NAN;
734 }
735 // Must not panic
736 values.sort_by(|a, b| safe_float_cmp(*a, *b));
737 }
738
739 #[test]
740 fn test_safe_prefix_ascii() {
741 assert_eq!(safe_prefix("hello", 3), "hel");
742 assert_eq!(safe_prefix("hello", 10), "hello");
743 assert_eq!(safe_prefix("", 5), "");
744 assert_eq!(safe_prefix("hi", 0), "");
745 }
746
747 #[test]
748 fn test_safe_prefix_multibyte() {
749 let text = "✚✳★✵"; // 4 × 3-byte chars = 12 bytes
750 assert_eq!(safe_prefix(text, 10), "✚✳★"); // rounds down from 10 to 9
751 assert_eq!(safe_prefix(text, 9), "✚✳★"); // exact boundary
752 assert_eq!(safe_prefix(text, 12), "✚✳★✵"); // full string
753 }
754
755 #[test]
756 fn test_safe_suffix_ascii() {
757 assert_eq!(safe_suffix("hello", 3), "llo");
758 assert_eq!(safe_suffix("hello", 10), "hello");
759 assert_eq!(safe_suffix("", 5), "");
760 assert_eq!(safe_suffix("hi", 0), "");
761 }
762
763 #[test]
764 fn test_safe_suffix_multibyte() {
765 let text = "AB✚✳★✵"; // 14 bytes: A(0) B(1) ✚(2..5) ✳(5..8) ★(8..11) ✵(11..14)
766 // 14 - 10 = 4, byte 4 is inside ✚ → rounds up to 5
767 assert_eq!(safe_suffix(text, 10), "✳★✵");
768 }
769 }
770}
771
// Version info
/// Library version, taken at compile time from the `CARGO_PKG_VERSION`
/// environment variable.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// Library name, taken at compile time from the `CARGO_PKG_NAME`
/// environment variable.
pub const NAME: &str = env!("CARGO_PKG_NAME");
778
#[cfg(test)]
mod tests {
    use super::*;

    /// `VERSION` comes from `CARGO_PKG_VERSION` at compile time, so it must
    /// have the `MAJOR.MINOR.PATCH` shape, optionally followed by a
    /// `-prerelease` and/or `+build` suffix. The major number is deliberately
    /// not pinned, so the test keeps passing after a 1.0 release.
    #[test]
    fn test_version() {
        // Drop any pre-release / build-metadata suffix before checking the core.
        let core = VERSION
            .split(|c: char| c == '-' || c == '+')
            .next()
            .unwrap_or("");
        let parts: Vec<&str> = core.split('.').collect();
        assert_eq!(parts.len(), 3, "unexpected version format: {}", VERSION);
        assert!(
            parts
                .iter()
                .all(|p| !p.is_empty() && p.bytes().all(|b| b.is_ascii_digit())),
            "unexpected version format: {}",
            VERSION
        );
    }

    /// `NAME` comes from `CARGO_PKG_NAME` and must match the published crate name.
    #[test]
    fn test_name() {
        assert_eq!(NAME, "pdf_oxide");
    }
}