1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
//! Parsing options and configuration.
use crate::render::PageSelection;
/// Options for parsing PDF documents.
///
/// Start from [`ParseOptions::new`] (or [`Default::default`]) and chain the
/// builder methods to adjust individual settings.
///
/// `Debug` is implemented by hand rather than derived so that the value of
/// `password` never ends up in logs or panic messages; only whether a
/// password is set is shown.
#[derive(Clone)]
pub struct ParseOptions {
    /// Error handling mode.
    pub error_mode: ErrorMode,
    /// What to extract from the document.
    pub extract_mode: ExtractMode,
    /// Whether to extract embedded resources (images, fonts).
    ///
    /// Default is `false` since 0.4.0 — large PDFs silently loading all
    /// images into memory was the largest peak-memory vector. Opt in via
    /// `.with_resources(true)` when images are needed.
    pub extract_resources: bool,
    /// Minimum pixel dimension for extracted images. Images whose width
    /// OR height falls below this threshold are dropped as decorative
    /// (logos, bullets, rule lines, tracking pixels). Set to 0 to keep
    /// every image. Default 64 — conservative cutoff for technical docs.
    pub min_image_dimension: u32,
    /// Whether to use parallel processing.
    pub parallel: bool,
    /// Page selection (which pages to parse).
    pub pages: PageSelection,
    /// Password for encrypted documents, if one was supplied.
    pub password: Option<String>,
}

impl std::fmt::Debug for ParseOptions {
    /// Formats every field like the derived implementation would, except
    /// that a present password is rendered as `Some("<redacted>")`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ParseOptions")
            .field("error_mode", &self.error_mode)
            .field("extract_mode", &self.extract_mode)
            .field("extract_resources", &self.extract_resources)
            .field("min_image_dimension", &self.min_image_dimension)
            .field("parallel", &self.parallel)
            .field("pages", &self.pages)
            // Keep the Some/None distinction for diagnostics, drop the secret.
            .field("password", &self.password.as_ref().map(|_| "<redacted>"))
            .finish()
    }
}
impl ParseOptions {
    /// Create new parse options with defaults.
    ///
    /// Equivalent to [`ParseOptions::default`].
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the error handling mode.
    ///
    /// Like every builder method here, this consumes `self` and returns the
    /// updated value; the result must be used or the change is lost.
    #[must_use]
    pub fn with_error_mode(mut self, mode: ErrorMode) -> Self {
        self.error_mode = mode;
        self
    }

    /// Enable lenient mode (skip invalid content).
    ///
    /// Shorthand for `with_error_mode(ErrorMode::Lenient)`.
    #[must_use]
    pub fn lenient(self) -> Self {
        self.with_error_mode(ErrorMode::Lenient)
    }

    /// Set the extract mode.
    #[must_use]
    pub fn with_extract_mode(mut self, mode: ExtractMode) -> Self {
        self.extract_mode = mode;
        self
    }

    /// Extract text only.
    ///
    /// Shorthand for `with_extract_mode(ExtractMode::TextOnly)`.
    #[must_use]
    pub fn text_only(self) -> Self {
        self.with_extract_mode(ExtractMode::TextOnly)
    }

    /// Enable or disable resource extraction.
    #[must_use]
    pub fn with_resources(mut self, extract: bool) -> Self {
        self.extract_resources = extract;
        self
    }

    /// Enable or disable parallel processing.
    #[must_use]
    pub fn with_parallel(mut self, parallel: bool) -> Self {
        self.parallel = parallel;
        self
    }

    /// Disable parallel processing.
    ///
    /// Shorthand for `with_parallel(false)`.
    #[must_use]
    pub fn sequential(self) -> Self {
        self.with_parallel(false)
    }

    /// Set page selection.
    #[must_use]
    pub fn with_pages(mut self, pages: PageSelection) -> Self {
        self.pages = pages;
        self
    }

    /// Set password for encrypted documents.
    ///
    /// Accepts anything convertible into a `String`; the value is stored as
    /// `Some(..)`, replacing any password set earlier.
    #[must_use]
    pub fn with_password(mut self, password: impl Into<String>) -> Self {
        self.password = Some(password.into());
        self
    }

    /// Set the minimum image dimension (pixels). Images with width OR
    /// height below this value are dropped as decorative. `0` keeps all.
    #[must_use]
    pub fn with_min_image_dimension(mut self, min_px: u32) -> Self {
        self.min_image_dimension = min_px;
        self
    }
}
impl Default for ParseOptions {
fn default() -> Self {
Self {
error_mode: ErrorMode::Lenient,
extract_mode: ExtractMode::Full,
extract_resources: false,
min_image_dimension: 64,
parallel: true,
pages: PageSelection::All,
password: None,
}
}
}
/// Error handling mode during parsing.
///
/// [`ErrorMode::Lenient`] is the [`Default`] variant.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ErrorMode {
    /// Fail on any error.
    Strict,

    /// Skip invalid content and continue (the default).
    #[default]
    Lenient,
}
/// What content to extract from the document.
///
/// [`ExtractMode::Full`] is the [`Default`] variant.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ExtractMode {
    /// Extract everything (text, structure, resources); the default.
    #[default]
    Full,

    /// Extract text content only.
    TextOnly,

    /// Extract structure only (no text content).
    StructureOnly,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The shortcut builders must overwrite whatever was set before them.
    #[test]
    fn test_parse_options_builder() {
        // Start from Strict: the default is already Lenient, so asserting
        // Lenient after `.lenient()` on a default value would pass even if
        // `lenient()` did nothing.
        let options = ParseOptions::new()
            .with_error_mode(ErrorMode::Strict)
            .lenient()
            .text_only()
            .sequential();
        assert_eq!(options.error_mode, ErrorMode::Lenient);
        assert_eq!(options.extract_mode, ExtractMode::TextOnly);
        assert!(!options.parallel);
    }

    /// The general setters store exactly the value they are given.
    #[test]
    fn test_explicit_setters() {
        let options = ParseOptions::new()
            .with_error_mode(ErrorMode::Strict)
            .with_extract_mode(ExtractMode::StructureOnly)
            .with_resources(true)
            .with_parallel(false)
            .with_password("secret");
        assert_eq!(options.error_mode, ErrorMode::Strict);
        assert_eq!(options.extract_mode, ExtractMode::StructureOnly);
        assert!(options.extract_resources);
        assert!(!options.parallel);
        assert_eq!(options.password.as_deref(), Some("secret"));

        // Setters can also switch a flag back.
        let options = options.with_resources(false).with_parallel(true);
        assert!(!options.extract_resources);
        assert!(options.parallel);
    }

    #[test]
    fn test_default_min_image_dimension() {
        let options = ParseOptions::default();
        assert_eq!(options.min_image_dimension, 64);
    }

    #[test]
    fn test_with_min_image_dimension_override() {
        let o = ParseOptions::new().with_min_image_dimension(0);
        assert_eq!(o.min_image_dimension, 0);
        let o = ParseOptions::new().with_min_image_dimension(200);
        assert_eq!(o.min_image_dimension, 200);
    }

    #[test]
    fn test_default_options() {
        let options = ParseOptions::default();
        assert_eq!(options.error_mode, ErrorMode::Lenient);
        assert_eq!(options.extract_mode, ExtractMode::Full);
        assert!(options.parallel);
        // 0.4.0 breaking: default is now false
        assert!(!options.extract_resources);
        assert!(options.password.is_none());
    }

    /// The enum-level defaults and the struct-level defaults must agree.
    #[test]
    fn test_enum_defaults_match_option_defaults() {
        let options = ParseOptions::default();
        assert_eq!(ErrorMode::default(), options.error_mode);
        assert_eq!(ExtractMode::default(), options.extract_mode);
    }
}