oxidize_pdf/text/plaintext/types.rs
1//! Data types for plain text extraction
2//!
3//! This module defines the configuration and result types used by the plain text
4//! extraction system.
5
6/// Configuration for plain text extraction
7///
8/// Controls how text is extracted and formatted when position information
9/// is not required. Thresholds are expressed in text space units and should
10/// be tuned based on your specific PDF characteristics.
11///
12/// # Default Configuration
13///
14/// ```
15/// use oxidize_pdf::text::plaintext::PlainTextConfig;
16///
17/// let config = PlainTextConfig::default();
18/// assert_eq!(config.space_threshold, 0.3);
19/// assert_eq!(config.newline_threshold, 10.0);
20/// assert!(!config.preserve_layout);
21/// ```
22#[derive(Debug, Clone, PartialEq)]
23pub struct PlainTextConfig {
24 /// Space detection threshold (multiple of average character width)
25 ///
26 /// When horizontal displacement between characters exceeds this threshold
27 /// (expressed as a multiple of the average character width), a space
28 /// character is inserted.
29 ///
30 /// - **Lower values** (0.1-0.2): More spaces inserted, good for tightly-spaced text
31 /// - **Default** (0.3): Balanced for most documents
32 /// - **Higher values** (0.4-0.5): Fewer spaces, good for wide-spaced text
33 ///
34 /// **Range**: 0.05 to 1.0 (typical)
35 pub space_threshold: f64,
36
37 /// Newline detection threshold (multiple of line height)
38 ///
39 /// When vertical displacement between text elements exceeds this threshold
40 /// (in text space units), a newline character is inserted.
41 ///
42 /// - **Lower values** (5.0-8.0): More line breaks, preserves paragraph structure
43 /// - **Default** (10.0): Balanced for most documents
44 /// - **Higher values** (15.0-20.0): Fewer line breaks, joins more text
45 ///
46 /// **Range**: 1.0 to 50.0 (typical)
47 pub newline_threshold: f64,
48
49 /// Preserve original layout whitespace
50 ///
51 /// When `true`, attempts to maintain the original document's whitespace
52 /// structure (indentation, spacing) by inserting appropriate spaces and
53 /// newlines based on position changes in the PDF.
54 ///
55 /// When `false`, uses minimal whitespace (single spaces between words,
56 /// single newlines between paragraphs).
57 ///
58 /// **Use `true` for**:
59 /// - Documents with tabular data
60 /// - Code listings or formatted text
61 /// - Documents where indentation matters
62 ///
63 /// **Use `false` for**:
64 /// - Plain text extraction for search indexing
65 /// - Content analysis where layout doesn't matter
66 /// - Maximum performance (less processing)
67 pub preserve_layout: bool,
68
69 /// Line break handling mode
70 ///
71 /// Controls how line breaks in the PDF are interpreted and processed.
72 /// Different modes are useful for different document types and use cases.
73 pub line_break_mode: LineBreakMode,
74}
75
76impl Default for PlainTextConfig {
77 fn default() -> Self {
78 Self {
79 space_threshold: 0.3,
80 newline_threshold: 10.0,
81 preserve_layout: false,
82 line_break_mode: LineBreakMode::Auto,
83 }
84 }
85}
86
87impl PlainTextConfig {
88 /// Create a new configuration with default values
89 ///
90 /// # Examples
91 ///
92 /// ```
93 /// use oxidize_pdf::text::plaintext::PlainTextConfig;
94 ///
95 /// let config = PlainTextConfig::new();
96 /// ```
97 pub fn new() -> Self {
98 Self::default()
99 }
100
101 /// Create a configuration optimized for dense text (tight spacing)
102 ///
103 /// Lower thresholds detect spaces more aggressively, useful for
104 /// PDFs with minimal character spacing.
105 ///
106 /// # Examples
107 ///
108 /// ```
109 /// use oxidize_pdf::text::plaintext::PlainTextConfig;
110 ///
111 /// let config = PlainTextConfig::dense();
112 /// assert_eq!(config.space_threshold, 0.1);
113 /// ```
114 pub fn dense() -> Self {
115 Self {
116 space_threshold: 0.1,
117 newline_threshold: 8.0,
118 preserve_layout: false,
119 line_break_mode: LineBreakMode::Auto,
120 }
121 }
122
123 /// Create a configuration optimized for loose text (wide spacing)
124 ///
125 /// Higher thresholds avoid false space detection in documents with
126 /// generous character spacing.
127 ///
128 /// # Examples
129 ///
130 /// ```
131 /// use oxidize_pdf::text::plaintext::PlainTextConfig;
132 ///
133 /// let config = PlainTextConfig::loose();
134 /// assert_eq!(config.space_threshold, 0.4);
135 /// ```
136 pub fn loose() -> Self {
137 Self {
138 space_threshold: 0.4,
139 newline_threshold: 15.0,
140 preserve_layout: false,
141 line_break_mode: LineBreakMode::Auto,
142 }
143 }
144
145 /// Create a configuration that preserves layout structure
146 ///
147 /// Useful for documents with tabular data, code, or formatted text
148 /// where whitespace is semantically important.
149 ///
150 /// # Examples
151 ///
152 /// ```
153 /// use oxidize_pdf::text::plaintext::PlainTextConfig;
154 ///
155 /// let config = PlainTextConfig::preserve_layout();
156 /// assert!(config.preserve_layout);
157 /// ```
158 pub fn preserve_layout() -> Self {
159 Self {
160 space_threshold: 0.3,
161 newline_threshold: 10.0,
162 preserve_layout: true,
163 line_break_mode: LineBreakMode::PreserveAll,
164 }
165 }
166}
167
/// Strategy for handling line breaks found in the PDF
///
/// PDFs often include line breaks that exist purely for layout and should be
/// removed when extracting continuous text (e.g., hyphenated words at line
/// ends). This enum selects how such breaks are interpreted.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::plaintext::LineBreakMode;
///
/// let mode = LineBreakMode::Auto; // Detect based on context
/// let mode = LineBreakMode::PreserveAll; // Keep all line breaks
/// let mode = LineBreakMode::Normalize; // Join hyphenated words
/// ```
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum LineBreakMode {
    /// Detect line breaks automatically
    ///
    /// Heuristics decide whether a line break is semantic (paragraph end)
    /// or only there for layout (line wrap); lines that appear to be wrapped
    /// are joined.
    ///
    /// **Best for**: General-purpose text extraction
    Auto,

    /// Keep every line break from the PDF
    ///
    /// Each line break in the PDF becomes a newline in the output, which is
    /// useful when the PDF's line breaks are semantically meaningful.
    ///
    /// **Best for**: Poetry, code listings, formatted text
    PreserveAll,

    /// Normalize line breaks (join hyphenated words)
    ///
    /// Hyphenated words at line ends (e.g., "docu-\nment") are detected and
    /// joined into single words ("document"). Other line breaks are preserved.
    ///
    /// **Best for**: Continuous text extraction from books, articles
    Normalize,
}
209
/// Result of plain text extraction
///
/// Contains the extracted text and metadata about the extraction.
/// Unlike `ExtractedText`, this does not include position information
/// for individual text fragments.
///
/// All fields are public. `line_count` and `char_count` are computed once by
/// [`PlainTextResult::new`]; they are not updated if `text` is modified
/// afterwards.
///
/// # Examples
///
/// ```ignore
/// use oxidize_pdf::Document;
/// use oxidize_pdf::text::plaintext::PlainTextExtractor;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let doc = Document::open("document.pdf")?;
/// let page = doc.get_page(1)?;
///
/// let extractor = PlainTextExtractor::new();
/// let result = extractor.extract(&doc, page)?;
///
/// println!("Extracted {} characters in {} lines",
///     result.char_count,
///     result.line_count
/// );
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlainTextResult {
    /// Extracted text content
    ///
    /// The complete text content from the page, with spaces and newlines
    /// inserted according to the configured thresholds and line break mode.
    pub text: String,

    /// Number of lines in the extracted text
    ///
    /// When the result is built with [`PlainTextResult::new`], lines are
    /// counted with [`str::lines`]: each line ends at `\n` (or `\r\n`), and a
    /// trailing newline does not start an additional line. Non-empty text
    /// without any newline has a `line_count` of 1; empty text has a
    /// `line_count` of 0.
    pub line_count: usize,

    /// Number of characters in the extracted text
    ///
    /// Total count of Unicode scalar values (`char`s), including spaces and
    /// newlines. This is not the byte length of `text`.
    pub char_count: usize,
}

impl PlainTextResult {
    /// Create a new result from text
    ///
    /// Automatically calculates `line_count` (via [`str::lines`]) and
    /// `char_count` (via [`str::chars`]) from the text.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxidize_pdf::text::plaintext::PlainTextResult;
    ///
    /// let result = PlainTextResult::new("Hello\nWorld".to_string());
    /// assert_eq!(result.line_count, 2);
    /// assert_eq!(result.char_count, 11);
    ///
    /// // A trailing newline is counted as a character but not as a new line.
    /// let result = PlainTextResult::new("Hello\n".to_string());
    /// assert_eq!(result.line_count, 1);
    /// assert_eq!(result.char_count, 6);
    /// ```
    pub fn new(text: String) -> Self {
        // `lines()` yields nothing for empty text, so `new(String::new())`
        // agrees with `empty()` on a line count of 0.
        let line_count = text.lines().count();
        // Count `char`s rather than bytes so multi-byte UTF-8 text is not
        // over-counted.
        let char_count = text.chars().count();
        Self {
            text,
            line_count,
            char_count,
        }
    }

    /// Create an empty result
    ///
    /// The text is empty and both counters are zero.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxidize_pdf::text::plaintext::PlainTextResult;
    ///
    /// let result = PlainTextResult::empty();
    /// assert_eq!(result.text, "");
    /// assert_eq!(result.line_count, 0);
    /// assert_eq!(result.char_count, 0);
    /// ```
    pub fn empty() -> Self {
        Self {
            text: String::new(),
            line_count: 0,
            char_count: 0,
        }
    }

    /// Check if the result is empty
    ///
    /// Returns `true` when `text` is the empty string. The counters are not
    /// consulted.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxidize_pdf::text::plaintext::PlainTextResult;
    ///
    /// let result = PlainTextResult::empty();
    /// assert!(result.is_empty());
    ///
    /// let result = PlainTextResult::new("text".to_string());
    /// assert!(!result.is_empty());
    /// ```
    pub fn is_empty(&self) -> bool {
        self.text.is_empty()
    }
}
317
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` yields the documented baseline for every field.
    #[test]
    fn test_config_default() {
        let expected = PlainTextConfig {
            space_threshold: 0.3,
            newline_threshold: 10.0,
            preserve_layout: false,
            line_break_mode: LineBreakMode::Auto,
        };
        assert_eq!(PlainTextConfig::default(), expected);
    }

    /// `new()` is just another spelling of `default()`.
    #[test]
    fn test_config_new() {
        assert_eq!(PlainTextConfig::default(), PlainTextConfig::new());
    }

    /// The dense preset lowers both thresholds and leaves layout off.
    #[test]
    fn test_config_dense() {
        let PlainTextConfig {
            space_threshold,
            newline_threshold,
            preserve_layout,
            ..
        } = PlainTextConfig::dense();
        assert_eq!(space_threshold, 0.1);
        assert_eq!(newline_threshold, 8.0);
        assert!(!preserve_layout);
    }

    /// The loose preset raises both thresholds and leaves layout off.
    #[test]
    fn test_config_loose() {
        let PlainTextConfig {
            space_threshold,
            newline_threshold,
            preserve_layout,
            ..
        } = PlainTextConfig::loose();
        assert_eq!(space_threshold, 0.4);
        assert_eq!(newline_threshold, 15.0);
        assert!(!preserve_layout);
    }

    /// The layout preset enables the flag and keeps every line break.
    #[test]
    fn test_config_preserve_layout() {
        let PlainTextConfig {
            preserve_layout,
            line_break_mode,
            ..
        } = PlainTextConfig::preserve_layout();
        assert!(preserve_layout);
        assert_eq!(line_break_mode, LineBreakMode::PreserveAll);
    }

    /// Variants compare equal to themselves and unequal to each other.
    #[test]
    fn test_line_break_mode_equality() {
        let auto = LineBreakMode::Auto;
        let same = LineBreakMode::Auto;
        let other = LineBreakMode::PreserveAll;
        assert_eq!(auto, same);
        assert_ne!(auto, other);
    }

    /// `new` keeps the text and derives both counters from it.
    #[test]
    fn test_plain_text_result_new() {
        let PlainTextResult {
            text,
            line_count,
            char_count,
        } = PlainTextResult::new("Hello\nWorld".to_string());
        assert_eq!(text, "Hello\nWorld");
        assert_eq!(line_count, 2);
        assert_eq!(char_count, 11);
    }

    /// `empty` has no text, zeroed counters and reports itself as empty.
    #[test]
    fn test_plain_text_result_empty() {
        let result = PlainTextResult::empty();
        assert!(result.is_empty());

        let PlainTextResult {
            text,
            line_count,
            char_count,
        } = result;
        assert_eq!(text, "");
        assert_eq!(line_count, 0);
        assert_eq!(char_count, 0);
    }

    /// `is_empty` distinguishes an empty result from one holding text.
    #[test]
    fn test_plain_text_result_is_empty() {
        assert!(PlainTextResult::empty().is_empty());
        assert!(!PlainTextResult::new("text".to_string()).is_empty());
    }

    /// Line counting for single-line and multi-line input.
    #[test]
    fn test_plain_text_result_line_count() {
        let cases = [("single line", 1), ("line1\nline2\nline3", 3)];
        for (input, expected_lines) in cases {
            let result = PlainTextResult::new(input.to_string());
            assert_eq!(result.line_count, expected_lines);
        }
    }
}