// Crate-wide lint configuration.
//
// The Clippy lints listed below are silenced for the entire crate.
#![allow(clippy::type_complexity)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::enum_variant_names)]
#![allow(clippy::wrong_self_convention)]
#![allow(clippy::explicit_counter_loop)]
#![allow(clippy::doc_overindented_list_items)]
#![allow(clippy::should_implement_trait)]
#![allow(clippy::redundant_guards)]
#![allow(clippy::regex_creation_in_loops)]
#![allow(clippy::manual_find)]
#![allow(clippy::match_like_matches_macro)]
#![allow(clippy::collapsible_match)]
// In test builds only, unused code and unused variables are tolerated.
#![cfg_attr(test, allow(dead_code))]
#![cfg_attr(test, allow(unused_variables))]
// Warn about public items that have no documentation.
#![warn(missing_docs)]
// When compiled with `--cfg docsrs`, turn on the `doc_cfg` feature that the
// `doc(cfg(...))` attributes on the feature-gated modules below depend on.
#![cfg_attr(docsrs, feature(doc_cfg))]
pub mod error;
pub(crate) mod cache;
pub mod document;
pub mod lexer;
pub mod object;
pub mod objstm;
pub mod parser;
pub mod parser_config;
pub mod xref;
pub mod xref_reconstruction;
pub mod decoders;
pub mod color;
pub mod encryption;
pub mod geometry;
pub mod layout;
pub mod content;
pub mod extractors;
pub mod fonts;
pub mod text;
pub mod annotation_types;
pub mod annotations;
pub mod elements;
pub mod outline;
pub mod structure;
pub mod converters;
pub mod pipeline;
pub mod writer;
pub mod html_css;
pub mod fdf;
pub mod xfa;
pub mod editor;
pub mod search;
// Optional module: compiled only when the `rendering` Cargo feature is enabled.
#[cfg(feature = "rendering")]
#[cfg_attr(docsrs, doc(cfg(feature = "rendering")))]
pub mod rendering;
// `debug` is gated on the same `rendering` feature as the module above.
#[cfg(feature = "rendering")]
#[cfg_attr(docsrs, doc(cfg(feature = "rendering")))]
pub mod debug;
// Optional module: compiled only when the `signatures` Cargo feature is enabled.
#[cfg(feature = "signatures")]
#[cfg_attr(docsrs, doc(cfg(feature = "signatures")))]
pub mod signatures;
// Optional module: compiled only when the `parallel` Cargo feature is enabled.
#[cfg(feature = "parallel")]
#[cfg_attr(docsrs, doc(cfg(feature = "parallel")))]
pub mod parallel;
// Excluded when the compilation target architecture is `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
pub mod batch;
pub mod compliance;
pub mod api;
pub use pipeline::XYCutStrategy;
pub mod config;
pub mod hybrid;
// Optional module: compiled only when the `ocr` Cargo feature is enabled.
#[cfg(feature = "ocr")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr")))]
pub mod ocr;
// Excluded when the compilation target architecture is `wasm32`.
#[cfg(not(target_arch = "wasm32"))]
pub mod ffi;
// Private module: compiled only when the `python` Cargo feature is enabled.
#[cfg(feature = "python")]
mod python;
// Both conditions must hold: the `wasm` feature is enabled, AND the build is
// either for a `wasm32` target or a test build.
#[cfg(any(target_arch = "wasm32", test))]
#[cfg(feature = "wasm")]
pub mod wasm;
pub use annotation_types::{
AnnotationBorderStyle, AnnotationColor, AnnotationFlags, AnnotationSubtype, BorderEffectStyle,
BorderStyleType, CaretSymbol, FileAttachmentIcon, FreeTextIntent, HighlightMode,
LineEndingStyle, QuadPoint, ReplyType, StampType, TextAlignment, TextAnnotationIcon,
TextMarkupType, WidgetFieldType,
};
pub use annotations::{Annotation, LinkAction, LinkDestination};
pub use config::{DocumentType, ExtractionProfile};
pub use document::{ExtractedImageRef, ImageFormat, PdfDocument, ReadingOrder};
pub use error::{Error, Result};
pub use layout::PageText;
pub use outline::{Destination, OutlineItem};
pub use fonts::global_cache::{
clear_global_font_cache, global_font_cache_stats, set_global_font_cache_capacity,
};
pub use fonts::cmap::{clear_cmap_cache, cmap_cache_size};
#[cfg(feature = "parallel")]
pub use parallel::{extract_all_markdown_parallel, extract_all_text_parallel, ParallelExtractor};
pub(crate) mod utils {
use std::cmp::Ordering;
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// When `s` already fits within `max_bytes` it is returned unchanged.
/// Otherwise the cut point is moved backwards from `max_bytes` until it lands
/// on a character boundary, so the result may be shorter than `max_bytes`
/// (and is empty if the first character alone does not fit).
#[inline]
pub fn safe_prefix(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Offset 0 is always a character boundary, so the search cannot fail;
    // the fallback only exists to avoid an `unwrap`.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
/// Returns the longest suffix of `s` that is at most `max_bytes` bytes long
/// and starts on a UTF-8 character boundary.
///
/// When `s` already fits within `max_bytes` it is returned unchanged.
/// Otherwise the start point is moved forwards from `s.len() - max_bytes`
/// until it lands on a character boundary, so the result may be shorter than
/// `max_bytes` (and is empty if the last character alone does not fit).
#[inline]
pub fn safe_suffix(s: &str, max_bytes: usize) -> &str {
    let len = s.len();
    if max_bytes >= len {
        return s;
    }
    // `len` itself is always a character boundary, so the search cannot fail;
    // the fallback only exists to avoid an `unwrap`.
    let begin = ((len - max_bytes)..=len)
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(len);
    &s[begin..]
}
/// Height of one row band used by `row_aware_span_cmp`.
///
/// Y coordinates are divided by this value and rounded to the nearest integer
/// to decide which band (row) a position belongs to.
// NOTE(review): the `_PT` suffix suggests the unit is points — confirm against callers.
pub const ROW_BAND_TOLERANCE_PT: f32 = 3.0;
/// Compares two `(y, x)` positions so that sorting yields rows from larger
/// `y` to smaller `y`, and ascending `x` within a row.
///
/// Each finite `y` is assigned to a band by dividing by
/// [`ROW_BAND_TOLERANCE_PT`] and rounding to the nearest integer. Positions in
/// the same band are treated as the same row and ordered by `x` alone.
/// Because banding is a fixed quantisation, two `y` values closer than the
/// tolerance can still fall into adjacent bands when they straddle a band
/// edge.
///
/// If either `y` is not finite (NaN or infinite), banding is skipped and the
/// raw `y` values are compared in descending order via [`safe_float_cmp`],
/// with `x` as the tie-breaker.
#[inline]
pub fn row_aware_span_cmp(a_y: f32, a_x: f32, b_y: f32, b_x: f32) -> Ordering {
    // Tie-breaker shared by both paths: ascending x.
    let by_x = || safe_float_cmp(a_x, b_x);

    if a_y.is_finite() && b_y.is_finite() {
        let band_of = |y: f32| (y / ROW_BAND_TOLERANCE_PT).round() as i32;
        // Operands are swapped (b before a) so that the higher band sorts first.
        band_of(b_y).cmp(&band_of(a_y)).then_with(by_x)
    } else {
        // Same swapped-operand convention for the raw, non-finite comparison.
        safe_float_cmp(b_y, a_y).then_with(by_x)
    }
}
/// Total ordering for `f32` values that never panics on NaN.
///
/// Non-NaN values are ordered by their usual numeric comparison. NaN compares
/// equal to NaN and greater than every non-NaN value, so NaNs end up last in
/// an ascending sort.
#[inline]
pub fn safe_float_cmp(a: f32, b: f32) -> Ordering {
    // `partial_cmp` yields `None` exactly when at least one operand is NaN.
    // In that case, ordering the "is NaN" flags (false < true) gives:
    // both NaN -> Equal, only `a` NaN -> Greater, only `b` NaN -> Less.
    a.partial_cmp(&b)
        .unwrap_or_else(|| a.is_nan().cmp(&b.is_nan()))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Ordinary finite values compare numerically.
    #[test]
    fn test_safe_float_cmp_normal() {
        assert_eq!(safe_float_cmp(1.0, 2.0), Ordering::Less);
        assert_eq!(safe_float_cmp(2.0, 1.0), Ordering::Greater);
        assert_eq!(safe_float_cmp(1.5, 1.5), Ordering::Equal);
    }

    /// NaN equals NaN and is greater than any non-NaN value.
    #[test]
    fn test_safe_float_cmp_nan() {
        assert_eq!(safe_float_cmp(f32::NAN, f32::NAN), Ordering::Equal);
        assert_eq!(safe_float_cmp(f32::NAN, 0.0), Ordering::Greater);
        assert_eq!(safe_float_cmp(0.0, f32::NAN), Ordering::Less);
    }

    /// Infinities follow the usual numeric ordering.
    #[test]
    fn test_safe_float_cmp_infinity() {
        assert_eq!(safe_float_cmp(f32::INFINITY, f32::INFINITY), Ordering::Equal);
        assert_eq!(safe_float_cmp(f32::INFINITY, 1.0), Ordering::Greater);
        assert_eq!(safe_float_cmp(f32::NEG_INFINITY, f32::INFINITY), Ordering::Less);
    }

    /// Sorting a slice that contains NaNs succeeds and pushes the NaNs to the end.
    #[test]
    fn test_sort_with_nan_does_not_panic() {
        let mut values = [3.0_f32, f32::NAN, 1.0, f32::NAN, 2.0, f32::NAN, 0.5];
        values.sort_by(|a, b| safe_float_cmp(*a, *b));
        assert!(values[0..4].iter().all(|v| !v.is_nan()));
        assert!(values[4..].iter().all(|v| v.is_nan()));
    }

    /// a < b and b < NaN must imply a < NaN.
    #[test]
    fn test_safe_float_cmp_transitivity() {
        let a = 1.0_f32;
        let b = 2.0_f32;
        let nan = f32::NAN;
        assert_eq!(safe_float_cmp(a, b), Ordering::Less);
        assert_eq!(safe_float_cmp(b, nan), Ordering::Less);
        assert_eq!(safe_float_cmp(a, nan), Ordering::Less);
    }

    /// Two rows of three cells each, with y values that vary by less than one
    /// unit inside a row, must come out grouped by row and sorted by x.
    #[test]
    fn test_row_aware_span_cmp_tolerates_y_jitter() {
        #[derive(Debug, Clone, Copy)]
        struct Cell {
            y: f32,
            x: f32,
            id: &'static str,
        }
        let mut cells = [
            Cell {
                y: 100.5,
                x: 50.0,
                id: "r1-c1",
            },
            Cell {
                y: 99.7,
                x: 150.0,
                id: "r1-c2",
            },
            Cell {
                y: 100.2,
                x: 250.0,
                id: "r1-c3",
            },
            Cell {
                y: 86.4,
                x: 50.0,
                id: "r2-c1",
            },
            Cell {
                y: 85.8,
                x: 150.0,
                id: "r2-c2",
            },
            Cell {
                y: 86.1,
                x: 250.0,
                id: "r2-c3",
            },
        ];
        cells.sort_by(|a, b| row_aware_span_cmp(a.y, a.x, b.y, b.x));
        let order: Vec<&str> = cells.iter().map(|c| c.id).collect();
        assert_eq!(
            order,
            vec!["r1-c1", "r1-c2", "r1-c3", "r2-c1", "r2-c2", "r2-c3"],
            "cells from the same row must stay contiguous and X-sorted"
        );
    }

    /// Clearly separated rows are ordered from the largest y to the smallest.
    #[test]
    fn test_row_aware_span_cmp_distinct_rows_descending() {
        let mut rows = [
            (100.0f32, 0.0f32, "top"),
            (50.0, 0.0, "middle"),
            (10.0, 0.0, "bottom"),
        ];
        rows.sort_by(|a, b| row_aware_span_cmp(a.0, a.1, b.0, b.1));
        assert_eq!(rows[0].2, "top");
        assert_eq!(rows[1].2, "middle");
        assert_eq!(rows[2].2, "bottom");
    }

    /// Sorts 200 generated positions and then verifies the result, instead of
    /// only relying on the sort not panicking.
    #[test]
    fn test_row_aware_span_cmp_is_total_order() {
        let mut v: Vec<(f32, f32)> = (0..200)
            .map(|i| ((i as f32) * 0.73, ((i * 17) % 500) as f32))
            .collect();
        v.sort_by(|a, b| row_aware_span_cmp(a.0, a.1, b.0, b.1));
        assert_eq!(v.len(), 200);

        // Adjacent elements of the sorted output must never be out of order.
        assert!(
            v.windows(2).all(|w| {
                row_aware_span_cmp(w[0].0, w[0].1, w[1].0, w[1].1) != Ordering::Greater
            }),
            "sorted output must be non-decreasing under the comparator"
        );

        // Antisymmetry: swapping the operands must flip the result.
        for a in &v {
            for b in &v {
                assert_eq!(
                    row_aware_span_cmp(a.0, a.1, b.0, b.1),
                    row_aware_span_cmp(b.0, b.1, a.0, a.1).reverse(),
                    "comparator must be antisymmetric for {a:?} vs {b:?}"
                );
            }
        }
    }

    /// Sorts 100 values where every 7th one is NaN and checks that the
    /// non-NaN values end up ascending with all NaNs gathered at the end.
    #[test]
    fn test_sort_stress_with_nan() {
        let mut values: Vec<f32> = (0..100).map(|i| i as f32).collect();
        for i in (0..100).step_by(7) {
            values[i] = f32::NAN;
        }
        let nan_count = values.iter().filter(|v| v.is_nan()).count();
        values.sort_by(|a, b| safe_float_cmp(*a, *b));

        assert_eq!(values.len(), 100);
        let (numbers, nans) = values.split_at(values.len() - nan_count);
        assert!(numbers.iter().all(|v| !v.is_nan()));
        assert!(numbers.windows(2).all(|w| w[0] <= w[1]));
        assert!(nans.iter().all(|v| v.is_nan()));
    }

    /// ASCII input: truncation is a plain byte cut.
    #[test]
    fn test_safe_prefix_ascii() {
        assert_eq!(safe_prefix("hello", 3), "hel");
        assert_eq!(safe_prefix("hello", 10), "hello");
        assert_eq!(safe_prefix("", 5), "");
        assert_eq!(safe_prefix("hi", 0), "");
    }

    /// Each symbol below is 3 bytes, so a limit inside a symbol backs up to
    /// the previous character boundary.
    #[test]
    fn test_safe_prefix_multibyte() {
        let text = "✚✳★✵";
        assert_eq!(safe_prefix(text, 10), "✚✳★");
        assert_eq!(safe_prefix(text, 9), "✚✳★");
        assert_eq!(safe_prefix(text, 12), "✚✳★✵");
    }

    /// ASCII input: truncation is a plain byte cut.
    #[test]
    fn test_safe_suffix_ascii() {
        assert_eq!(safe_suffix("hello", 3), "llo");
        assert_eq!(safe_suffix("hello", 10), "hello");
        assert_eq!(safe_suffix("", 5), "");
        assert_eq!(safe_suffix("hi", 0), "");
    }

    /// A start offset inside a 3-byte symbol advances to the next character
    /// boundary.
    #[test]
    fn test_safe_suffix_multibyte() {
        let text = "AB✚✳★✵";
        assert_eq!(safe_suffix(text, 10), "✳★✵");
    }
}
}
/// Version string of this crate, read from the `CARGO_PKG_VERSION`
/// environment variable at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Name of this crate, read from the `CARGO_PKG_NAME` environment variable at
/// compile time.
pub const NAME: &str = env!("CARGO_PKG_NAME");
// Sanity checks for the crate metadata constants.
#[cfg(test)]
mod tests {
use super::*;
// Checks that the version string begins with "0.".
// NOTE(review): this will fail once the major version leaves 0 — confirm
// that is intended as a release reminder rather than an oversight.
#[test]
fn test_version() {
assert!(VERSION.starts_with("0."));
}
// Checks that the package name is exactly "pdf_oxide".
#[test]
fn test_name() {
assert_eq!(NAME, "pdf_oxide");
}
}