oxidize_pdf/text/plaintext/
types.rs

1//! Data types for plain text extraction
2//!
3//! This module defines the configuration and result types used by the plain text
4//! extraction system.
5
6/// Configuration for plain text extraction
7///
8/// Controls how text is extracted and formatted when position information
9/// is not required. Thresholds are expressed in text space units and should
10/// be tuned based on your specific PDF characteristics.
11///
12/// # Default Configuration
13///
14/// ```
15/// use oxidize_pdf::text::plaintext::PlainTextConfig;
16///
17/// let config = PlainTextConfig::default();
18/// assert_eq!(config.space_threshold, 0.3);
19/// assert_eq!(config.newline_threshold, 10.0);
20/// assert!(!config.preserve_layout);
21/// ```
22#[derive(Debug, Clone, PartialEq)]
23pub struct PlainTextConfig {
24    /// Space detection threshold (multiple of average character width)
25    ///
26    /// When horizontal displacement between characters exceeds this threshold
27    /// (expressed as a multiple of the average character width), a space
28    /// character is inserted.
29    ///
30    /// - **Lower values** (0.1-0.2): More spaces inserted, good for tightly-spaced text
31    /// - **Default** (0.3): Balanced for most documents
32    /// - **Higher values** (0.4-0.5): Fewer spaces, good for wide-spaced text
33    ///
34    /// **Range**: 0.05 to 1.0 (typical)
35    pub space_threshold: f64,
36
37    /// Newline detection threshold (multiple of line height)
38    ///
39    /// When vertical displacement between text elements exceeds this threshold
40    /// (in text space units), a newline character is inserted.
41    ///
42    /// - **Lower values** (5.0-8.0): More line breaks, preserves paragraph structure
43    /// - **Default** (10.0): Balanced for most documents
44    /// - **Higher values** (15.0-20.0): Fewer line breaks, joins more text
45    ///
46    /// **Range**: 1.0 to 50.0 (typical)
47    pub newline_threshold: f64,
48
49    /// Preserve original layout whitespace
50    ///
51    /// When `true`, attempts to maintain the original document's whitespace
52    /// structure (indentation, spacing) by inserting appropriate spaces and
53    /// newlines based on position changes in the PDF.
54    ///
55    /// When `false`, uses minimal whitespace (single spaces between words,
56    /// single newlines between paragraphs).
57    ///
58    /// **Use `true` for**:
59    /// - Documents with tabular data
60    /// - Code listings or formatted text
61    /// - Documents where indentation matters
62    ///
63    /// **Use `false` for**:
64    /// - Plain text extraction for search indexing
65    /// - Content analysis where layout doesn't matter
66    /// - Maximum performance (less processing)
67    pub preserve_layout: bool,
68
69    /// Line break handling mode
70    ///
71    /// Controls how line breaks in the PDF are interpreted and processed.
72    /// Different modes are useful for different document types and use cases.
73    pub line_break_mode: LineBreakMode,
74}
75
76impl Default for PlainTextConfig {
77    fn default() -> Self {
78        Self {
79            space_threshold: 0.3,
80            newline_threshold: 10.0,
81            preserve_layout: false,
82            line_break_mode: LineBreakMode::Auto,
83        }
84    }
85}
86
87impl PlainTextConfig {
88    /// Create a new configuration with default values
89    ///
90    /// # Examples
91    ///
92    /// ```
93    /// use oxidize_pdf::text::plaintext::PlainTextConfig;
94    ///
95    /// let config = PlainTextConfig::new();
96    /// ```
97    pub fn new() -> Self {
98        Self::default()
99    }
100
101    /// Create a configuration optimized for dense text (tight spacing)
102    ///
103    /// Lower thresholds detect spaces more aggressively, useful for
104    /// PDFs with minimal character spacing.
105    ///
106    /// # Examples
107    ///
108    /// ```
109    /// use oxidize_pdf::text::plaintext::PlainTextConfig;
110    ///
111    /// let config = PlainTextConfig::dense();
112    /// assert_eq!(config.space_threshold, 0.1);
113    /// ```
114    pub fn dense() -> Self {
115        Self {
116            space_threshold: 0.1,
117            newline_threshold: 8.0,
118            preserve_layout: false,
119            line_break_mode: LineBreakMode::Auto,
120        }
121    }
122
123    /// Create a configuration optimized for loose text (wide spacing)
124    ///
125    /// Higher thresholds avoid false space detection in documents with
126    /// generous character spacing.
127    ///
128    /// # Examples
129    ///
130    /// ```
131    /// use oxidize_pdf::text::plaintext::PlainTextConfig;
132    ///
133    /// let config = PlainTextConfig::loose();
134    /// assert_eq!(config.space_threshold, 0.4);
135    /// ```
136    pub fn loose() -> Self {
137        Self {
138            space_threshold: 0.4,
139            newline_threshold: 15.0,
140            preserve_layout: false,
141            line_break_mode: LineBreakMode::Auto,
142        }
143    }
144
145    /// Create a configuration that preserves layout structure
146    ///
147    /// Useful for documents with tabular data, code, or formatted text
148    /// where whitespace is semantically important.
149    ///
150    /// # Examples
151    ///
152    /// ```
153    /// use oxidize_pdf::text::plaintext::PlainTextConfig;
154    ///
155    /// let config = PlainTextConfig::preserve_layout();
156    /// assert!(config.preserve_layout);
157    /// ```
158    pub fn preserve_layout() -> Self {
159        Self {
160            space_threshold: 0.3,
161            newline_threshold: 10.0,
162            preserve_layout: true,
163            line_break_mode: LineBreakMode::PreserveAll,
164        }
165    }
166}
167
/// Strategy for interpreting line breaks found in a PDF
///
/// PDFs frequently break lines purely for layout reasons. When extracting
/// continuous text those breaks are often unwanted (for example, a word
/// hyphenated across the end of a line), so the extraction configuration
/// selects one of these modes to decide what happens to them.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::plaintext::LineBreakMode;
///
/// // Modes are plain `Copy` values that can be compared directly.
/// let mode = LineBreakMode::Auto;
/// assert_eq!(mode, LineBreakMode::Auto);
/// assert_ne!(mode, LineBreakMode::PreserveAll);
/// assert_ne!(mode, LineBreakMode::Normalize);
/// ```
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum LineBreakMode {
    /// Decide per line break using heuristics
    ///
    /// Distinguishes semantic breaks (end of a paragraph) from breaks that
    /// exist only for layout (a wrapped line), and joins lines that appear
    /// to be wrapped.
    ///
    /// **Best for**: General-purpose text extraction
    Auto,

    /// Keep every line break found in the PDF
    ///
    /// Each line break in the PDF is emitted as a newline in the output.
    /// Choose this when the PDF's own line breaks carry meaning.
    ///
    /// **Best for**: Poetry, code listings, formatted text
    PreserveAll,

    /// Re-join words hyphenated across line ends
    ///
    /// A word split at the end of a line (e.g., "docu-\nment") is merged
    /// back into a single word ("document"). All other line breaks are
    /// preserved.
    ///
    /// **Best for**: Continuous text extraction from books, articles
    Normalize,
}
209
/// Result of plain text extraction
///
/// Contains the extracted text and metadata about the extraction.
/// Unlike `ExtractedText`, this does not include position information
/// for individual text fragments.
///
/// # Examples
///
/// ```ignore
/// use oxidize_pdf::Document;
/// use oxidize_pdf::text::plaintext::PlainTextExtractor;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let doc = Document::open("document.pdf")?;
/// let page = doc.get_page(1)?;
///
/// let extractor = PlainTextExtractor::new();
/// let result = extractor.extract(&doc, page)?;
///
/// println!("Extracted {} characters in {} lines",
///     result.char_count,
///     result.line_count
/// );
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlainTextResult {
    /// Extracted text content
    ///
    /// The complete text content from the page, with spaces and newlines
    /// inserted according to the configured thresholds and line break mode.
    pub text: String,

    /// Number of lines in the extracted text
    ///
    /// When the value is built with [`PlainTextResult::new`], this is
    /// `text.lines().count()`:
    ///
    /// - empty text has 0 lines;
    /// - non-empty text without any line break has 1 line;
    /// - a trailing newline does not start an additional line;
    /// - both `\n` and `\r\n` count as a single line break.
    pub line_count: usize,

    /// Number of characters in the extracted text
    ///
    /// When the value is built with [`PlainTextResult::new`], this is
    /// `text.chars().count()`: the number of `char`s (including spaces and
    /// newlines), which can be smaller than the byte length of `text` for
    /// non-ASCII content.
    pub char_count: usize,
}

impl PlainTextResult {
    /// Create a new result from text
    ///
    /// Takes ownership of `text` and derives `line_count` and `char_count`
    /// from it (see the field documentation for the exact counting rules).
    ///
    /// # Examples
    ///
    /// ```
    /// use oxidize_pdf::text::plaintext::PlainTextResult;
    ///
    /// let result = PlainTextResult::new("Hello\nWorld".to_string());
    /// assert_eq!(result.line_count, 2);
    /// assert_eq!(result.char_count, 11);
    ///
    /// // A trailing newline does not add a line.
    /// let result = PlainTextResult::new("Hello\n".to_string());
    /// assert_eq!(result.line_count, 1);
    /// ```
    pub fn new(text: String) -> Self {
        // Both counts must be taken before `text` is moved into the struct.
        let line_count = text.lines().count();
        let char_count = text.chars().count();
        Self {
            text,
            line_count,
            char_count,
        }
    }

    /// Create an empty result
    ///
    /// The returned value has empty text and both counts set to 0; it is
    /// equal to `PlainTextResult::new(String::new())`.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxidize_pdf::text::plaintext::PlainTextResult;
    ///
    /// let result = PlainTextResult::empty();
    /// assert_eq!(result.text, "");
    /// assert_eq!(result.line_count, 0);
    /// assert_eq!(result.char_count, 0);
    /// ```
    pub fn empty() -> Self {
        // Delegating keeps the counting rules in one place; an empty `String`
        // does not allocate and yields 0 lines and 0 chars.
        Self::new(String::new())
    }

    /// Check if the result is empty
    ///
    /// Returns `true` when `text` is the empty string. Only the text is
    /// inspected; the count fields are not consulted.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxidize_pdf::text::plaintext::PlainTextResult;
    ///
    /// let result = PlainTextResult::empty();
    /// assert!(result.is_empty());
    ///
    /// let result = PlainTextResult::new("text".to_string());
    /// assert!(!result.is_empty());
    /// ```
    pub fn is_empty(&self) -> bool {
        self.text.is_empty()
    }
}
317
// Unit tests for the configuration presets, the line-break mode enum, and the
// counting rules of `PlainTextResult`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_config_default() {
        let config = PlainTextConfig::default();
        assert_eq!(config.space_threshold, 0.3);
        assert_eq!(config.newline_threshold, 10.0);
        assert!(!config.preserve_layout);
        assert_eq!(config.line_break_mode, LineBreakMode::Auto);
    }

    #[test]
    fn test_config_new() {
        let config = PlainTextConfig::new();
        assert_eq!(config, PlainTextConfig::default());
    }

    #[test]
    fn test_config_dense() {
        let config = PlainTextConfig::dense();
        assert_eq!(config.space_threshold, 0.1);
        assert_eq!(config.newline_threshold, 8.0);
        assert!(!config.preserve_layout);
        assert_eq!(config.line_break_mode, LineBreakMode::Auto);
    }

    #[test]
    fn test_config_loose() {
        let config = PlainTextConfig::loose();
        assert_eq!(config.space_threshold, 0.4);
        assert_eq!(config.newline_threshold, 15.0);
        assert!(!config.preserve_layout);
        assert_eq!(config.line_break_mode, LineBreakMode::Auto);
    }

    #[test]
    fn test_config_preserve_layout() {
        let config = PlainTextConfig::preserve_layout();
        // Thresholds are the same as the defaults for this preset.
        assert_eq!(config.space_threshold, 0.3);
        assert_eq!(config.newline_threshold, 10.0);
        assert!(config.preserve_layout);
        assert_eq!(config.line_break_mode, LineBreakMode::PreserveAll);
    }

    #[test]
    fn test_line_break_mode_equality() {
        assert_eq!(LineBreakMode::Auto, LineBreakMode::Auto);
        assert_ne!(LineBreakMode::Auto, LineBreakMode::PreserveAll);
        assert_ne!(LineBreakMode::Auto, LineBreakMode::Normalize);
        assert_ne!(LineBreakMode::PreserveAll, LineBreakMode::Normalize);
    }

    #[test]
    fn test_plain_text_result_new() {
        let result = PlainTextResult::new("Hello\nWorld".to_string());
        assert_eq!(result.text, "Hello\nWorld");
        assert_eq!(result.line_count, 2);
        assert_eq!(result.char_count, 11);
    }

    #[test]
    fn test_plain_text_result_empty() {
        let result = PlainTextResult::empty();
        assert_eq!(result.text, "");
        assert_eq!(result.line_count, 0);
        assert_eq!(result.char_count, 0);
        assert!(result.is_empty());
    }

    #[test]
    fn test_plain_text_result_new_from_empty_string() {
        // Building from an empty string must agree with `empty()`.
        let result = PlainTextResult::new(String::new());
        assert_eq!(result, PlainTextResult::empty());
        assert!(result.is_empty());
    }

    #[test]
    fn test_plain_text_result_is_empty() {
        let empty = PlainTextResult::empty();
        assert!(empty.is_empty());

        let not_empty = PlainTextResult::new("text".to_string());
        assert!(!not_empty.is_empty());
    }

    #[test]
    fn test_plain_text_result_line_count() {
        let single = PlainTextResult::new("single line".to_string());
        assert_eq!(single.line_count, 1);

        let multiple = PlainTextResult::new("line1\nline2\nline3".to_string());
        assert_eq!(multiple.line_count, 3);
    }

    #[test]
    fn test_plain_text_result_line_count_edge_cases() {
        // A trailing newline does not start an extra line.
        let trailing = PlainTextResult::new("line1\n".to_string());
        assert_eq!(trailing.line_count, 1);

        // CRLF is treated as a single line break.
        let crlf = PlainTextResult::new("line1\r\nline2".to_string());
        assert_eq!(crlf.line_count, 2);
    }

    #[test]
    fn test_plain_text_result_char_count_non_ascii() {
        // `char_count` counts chars, not bytes: U+00E9 is two bytes in UTF-8.
        let result = PlainTextResult::new("h\u{e9}llo".to_string());
        assert_eq!(result.char_count, 5);
        assert_eq!(result.text.len(), 6);
    }
}
oxidize_pdf/text/plaintext/types.rs

oxidize_pdf/text/plaintext/
types.rs