pdf_oxide 0.3.30

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
//! Format converters for PDF documents.
//!
//! This module provides functionality to convert PDF pages to different formats:
//! - **Markdown**: Semantic text with headings, paragraphs, and images
//! - **HTML**: Both semantic and layout-preserved modes
//! - **Plain text**: Simple text extraction
//!
//! # Examples
//!
//! ```no_run
//! use pdf_oxide::PdfDocument;
//! use pdf_oxide::converters::ConversionOptions;
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let mut doc = PdfDocument::open("paper.pdf")?;
//!
//! // Convert to Markdown with heading detection
//! let options = ConversionOptions {
//!     detect_headings: true,
//!     ..Default::default()
//! };
//! let markdown = doc.to_markdown(0, &options)?;
//!
//! // Convert to semantic HTML
//! let html = doc.to_html(0, &options)?;
//!
//! // Convert to layout-preserved HTML
//! let layout_options = ConversionOptions {
//!     preserve_layout: true,
//!     ..Default::default()
//! };
//! let layout_html = doc.to_html(0, &layout_options)?;
//! # Ok(())
//! # }
//! ```

pub mod formula_renderer;
pub mod html;
pub mod markdown;
pub mod office;
pub mod table_formatter;
pub mod text_post_processor;
pub mod whitespace;

// Re-export main types
pub use formula_renderer::{FormulaRenderer, RenderedFormula};
#[allow(deprecated)]
pub use html::HtmlConverter;
#[allow(deprecated)]
pub use markdown::MarkdownConverter;
pub use table_formatter::MarkdownTableFormatter;
pub use text_post_processor::TextPostProcessor;
pub use whitespace::{cleanup_markdown, normalize_whitespace, remove_page_artifacts};

// Re-export Office conversion types (always available, but stubs without feature)
#[cfg(feature = "office")]
pub use office::{DocxConverter, PptxConverter, XlsxConverter};
pub use office::{Margins, OfficeConfig, OfficeConverter};

// Re-export BoldMarkerBehavior from pipeline config (single source of truth)
pub use crate::pipeline::config::BoldMarkerBehavior;

/// Configuration for table formatting in markdown.
///
/// All formatting parameters are configurable with no magic numbers.
///
/// Three presets are provided — [`TableFormatConfig::default`],
/// [`TableFormatConfig::compact`] and [`TableFormatConfig::detailed`] — and any
/// of them can be adjusted further with the `with_*` builder methods.
///
/// The "default" values quoted on each field are the ones produced by
/// `TableFormatConfig::default()`; the other presets override some of them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TableFormatConfig {
    /// Include markdown table header separator (default: true)
    pub include_header_separator: bool,
    /// Spaces around cell content (default: 1)
    pub cell_padding: usize,
    /// Minimum column width in characters (default: 3)
    pub min_column_width: usize,
    /// Merge adjacent empty cells (default: true)
    pub merge_adjacent_empty_cells: bool,
    /// Preserve bold/italic formatting in cells (default: true)
    pub preserve_cell_formatting: bool,
    /// Text to use for empty cells (default: "-")
    pub empty_cell_text: String,
}

impl TableFormatConfig {
    /// Create a standard markdown table configuration.
    ///
    /// Returns exactly the same value as the [`Default`] trait implementation.
    /// The inherent method is kept so that existing callers (and the rendered
    /// API docs) are unchanged; the trait implementation is the single source
    /// of truth for the values.
    // `Default` IS implemented for this type; this wrapper only preserves the
    // pre-existing inherent constructor, so the lint does not apply.
    #[allow(clippy::should_implement_trait)]
    pub fn default() -> Self {
        // Fully qualified so this can never resolve back to the inherent
        // method and recurse.
        <Self as Default>::default()
    }

    /// Create a compact markdown table configuration.
    ///
    /// No padding, one-character minimum column width, cell formatting
    /// dropped, and empty cells rendered as an empty string.
    pub fn compact() -> Self {
        Self {
            include_header_separator: true,
            cell_padding: 0,
            min_column_width: 1,
            merge_adjacent_empty_cells: true,
            preserve_cell_formatting: false,
            empty_cell_text: String::new(),
        }
    }

    /// Create a detailed markdown table configuration.
    ///
    /// Two spaces of padding, five-character minimum column width, adjacent
    /// empty cells kept separate, and empty cells rendered as an empty string.
    pub fn detailed() -> Self {
        Self {
            include_header_separator: true,
            cell_padding: 2,
            min_column_width: 5,
            merge_adjacent_empty_cells: false,
            preserve_cell_formatting: true,
            empty_cell_text: String::new(),
        }
    }

    /// Create a custom markdown table configuration.
    ///
    /// Starts from the standard configuration; intended to be followed by
    /// `with_*` builder calls.
    pub fn custom() -> Self {
        Self::default()
    }

    /// Set cell padding (builder pattern).
    #[must_use]
    pub fn with_cell_padding(mut self, padding: usize) -> Self {
        self.cell_padding = padding;
        self
    }

    /// Set minimum column width (builder pattern).
    #[must_use]
    pub fn with_min_column_width(mut self, width: usize) -> Self {
        self.min_column_width = width;
        self
    }

    /// Set empty cell text (builder pattern).
    #[must_use]
    pub fn with_empty_cell_text(mut self, text: &str) -> Self {
        self.empty_cell_text = text.to_string();
        self
    }
}

impl Default for TableFormatConfig {
    /// Standard markdown table configuration: header separator on, one space
    /// of padding, three-character minimum column width, adjacent empty cells
    /// merged, cell formatting preserved, and `"-"` for empty cells.
    fn default() -> Self {
        Self {
            include_header_separator: true,
            cell_padding: 1,
            min_column_width: 3,
            merge_adjacent_empty_cells: true,
            preserve_cell_formatting: true,
            empty_cell_text: "-".to_string(),
        }
    }
}

/// Options for converting PDF pages to different formats.
///
/// These options control how the conversion is performed, including
/// layout preservation, heading detection, image handling, etc.
///
/// # Examples
///
/// ```
/// use pdf_oxide::converters::{BoldMarkerBehavior, ConversionOptions, ReadingOrderMode};
///
/// // Default options
/// let opts = ConversionOptions::default();
///
/// // Custom options
/// let opts = ConversionOptions {
///     preserve_layout: true,
///     detect_headings: false,
///     extract_tables: false,
///     include_images: false,
///     image_output_dir: Some("images/".to_string()),
///     reading_order_mode: ReadingOrderMode::ColumnAware,
///     bold_marker_behavior: BoldMarkerBehavior::Conservative,
///     table_detection_config: None,
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct ConversionOptions {
    /// Preserve exact layout with CSS positioning (HTML only).
    ///
    /// When true, generates HTML with absolute positioning to match the PDF layout.
    /// When false, generates semantic HTML with natural flow.
    pub preserve_layout: bool,

    /// Automatically detect headings based on font size and weight.
    ///
    /// When true, uses font clustering to identify heading levels (H1, H2, H3).
    /// When false, treats all text as paragraphs.
    pub detect_headings: bool,

    /// Extract tables from the document.
    ///
    /// Note: Table extraction is currently not fully implemented.
    pub extract_tables: bool,

    /// Include images in the output.
    ///
    /// When true, images are included as Markdown image syntax or HTML img tags.
    /// When false (default), images are omitted from the output.
    ///
    /// Defaults to `false` to keep output compact. Base64-embedded images can
    /// add hundreds of KB per page (e.g., 360 KB for a single-page invoice).
    /// Set to `true` when image content is needed.
    pub include_images: bool,

    /// Directory path for saving extracted images.
    ///
    /// If `None`, images are referenced but not saved.
    /// If `Some(path)`, images are saved to the specified directory.
    pub image_output_dir: Option<String>,

    /// Embed images as base64 data URIs in output.
    ///
    /// When true (default), images are embedded directly as base64 data URIs.
    /// This creates self-contained files that don't require external image files.
    /// Works in HTML and Markdown (Obsidian, Typora, VS Code, Jupyter support base64).
    ///
    /// When false, images are saved to `image_output_dir` and referenced by path.
    /// Note: GitHub/GitLab Markdown renderers block base64 images for security.
    pub embed_images: bool,

    /// Reading order determination mode.
    ///
    /// Controls how text blocks are ordered in the output.
    pub reading_order_mode: ReadingOrderMode,

    /// Control how bold markers are applied in markdown conversion.
    ///
    /// Determines whether bold formatting markers are applied to whitespace-only
    /// content (Aggressive) or only to content-bearing text (Conservative).
    /// See [`BoldMarkerBehavior`] for details.
    pub bold_marker_behavior: BoldMarkerBehavior,

    /// Configuration for spatial table detection.
    ///
    /// If `None`, uses default configuration.
    /// Only applies when `extract_tables` is true.
    pub table_detection_config: Option<crate::structure::TableDetectionConfig>,

    /// Render formulas as embedded base64 images.
    ///
    /// When true and `page_images` are provided, formulas from the structure tree
    /// are cropped from rendered page images and embedded as base64 data URIs.
    /// Requires a Tagged PDF with Formula structure elements.
    pub render_formulas: bool,

    /// Paths to pre-rendered page images for formula extraction.
    ///
    /// Each path should point to a PNG image of the corresponding page.
    /// Index 0 = page 0, index 1 = page 1, etc.
    /// Required when `render_formulas` is true.
    pub page_images: Option<Vec<std::path::PathBuf>>,

    /// Page dimensions in PDF points (width, height).
    ///
    /// Required for coordinate conversion when `render_formulas` is true.
    /// Defaults to A4 (595.276 x 841.89) if not specified.
    pub page_dimensions: Option<(f32, f32)>,

    /// Include form field values inline in output.
    ///
    /// When true (default), form field values (text fields, checkboxes, choice fields)
    /// are converted to TextSpans at their spatial positions and merged with page content.
    /// This makes field values appear where they visually belong on the page.
    ///
    /// When false, form field values are omitted from output.
    pub include_form_fields: bool,

    /// Maximum image size (in pixels) to embed in output.
    ///
    /// Images exceeding this pixel count (width × height) are skipped when embedding
    /// as base64 data URIs or saving to files. This prevents oversized full-page scans
    /// from bloating output with hundreds of KB of encoded data per page.
    ///
    /// Common reference sizes:
    /// - A4 @ 150 DPI: ~1240 × 1754 = 2.2 MP
    /// - A4 @ 300 DPI: ~2480 × 3508 = 8.7 MP
    /// - A4 @ 600 DPI: ~4960 × 7016 = 34.8 MP
    ///
    /// The field is an `Option<u64>`:
    /// - `None` (the default): use the built-in limit of 16,000,000 pixels (16 MP),
    ///   which covers A4 at 300 DPI with margin.
    /// - `Some(u64::MAX)`: disable the limit entirely.
    /// - `Some(0)`: skip all images.
    pub max_image_pixels: Option<u64>,
}

impl Default for ConversionOptions {
    /// Build the conversion options used when the caller customizes nothing.
    ///
    /// | Field                    | Default value                                   |
    /// |--------------------------|-------------------------------------------------|
    /// | `preserve_layout`        | `false` (semantic mode)                         |
    /// | `detect_headings`        | `true` (needed for proper markdown output)      |
    /// | `extract_tables`         | `true`                                          |
    /// | `include_images`         | `false` (opt-in; avoids base64 output bloat)    |
    /// | `image_output_dir`       | `None`                                          |
    /// | `embed_images`           | `true` (base64, once images are enabled)        |
    /// | `reading_order_mode`     | `StructureTreeFirst` with an empty MCID list    |
    /// | `bold_marker_behavior`   | `Conservative`                                  |
    /// | `table_detection_config` | `None` (detector defaults)                      |
    /// | `render_formulas`        | `false`                                         |
    /// | `page_images`            | `None`                                          |
    /// | `page_dimensions`        | `None` (A4 is assumed when needed)              |
    /// | `include_form_fields`    | `true`                                          |
    /// | `max_image_pixels`       | `None` (built-in 16 MP limit)                   |
    fn default() -> Self {
        // Structure-tree ordering with no MCIDs supplied yet; per the variant's
        // documentation an empty list falls back to column-aware ordering.
        let reading_order_mode = ReadingOrderMode::StructureTreeFirst {
            mcid_order: Vec::new(),
        };

        Self {
            // Layout and structure detection.
            preserve_layout: false,
            detect_headings: true,
            extract_tables: true,

            // Image handling: off by default, embedded as base64 once enabled.
            include_images: false,
            image_output_dir: None,
            embed_images: true,

            // Ordering and markdown emphasis.
            reading_order_mode,
            bold_marker_behavior: BoldMarkerBehavior::Conservative,

            // Table detection tuning is left to the detector's own defaults.
            table_detection_config: None,

            // Formula rendering is opt-in and needs caller-supplied page images.
            render_formulas: false,
            page_images: None,
            page_dimensions: None,

            // Form fields are merged into the output; image size cap uses the
            // built-in limit.
            include_form_fields: true,
            max_image_pixels: None,
        }
    }
}

impl ConversionOptions {
    /// Turn on table extraction and use a caller-supplied detection configuration.
    ///
    /// Consumes `self` and returns it with `extract_tables` set to `true` and
    /// `table_detection_config` set to `Some(config)`; every other option is
    /// carried over unchanged.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdf_oxide::converters::ConversionOptions;
    /// use pdf_oxide::structure::TableDetectionConfig;
    ///
    /// let config = TableDetectionConfig::strict();
    /// let opts = ConversionOptions::default().with_table_detection(config);
    ///
    /// assert!(opts.extract_tables);
    /// assert!(opts.table_detection_config.is_some());
    /// ```
    pub fn with_table_detection(self, config: crate::structure::TableDetectionConfig) -> Self {
        Self {
            extract_tables: true,
            table_detection_config: Some(config),
            ..self
        }
    }

    /// Turn on table extraction without supplying a detection configuration.
    ///
    /// Consumes `self` and returns it with `extract_tables` set to `true` and
    /// `table_detection_config` reset to `None`, so the default
    /// `TableDetectionConfig` is used when detection runs. Any configuration
    /// set earlier is discarded; every other option is carried over unchanged.
    ///
    /// # Examples
    ///
    /// ```
    /// use pdf_oxide::converters::ConversionOptions;
    ///
    /// let opts = ConversionOptions::default().with_default_table_detection();
    ///
    /// assert!(opts.extract_tables);
    /// assert!(opts.table_detection_config.is_none());
    /// ```
    pub fn with_default_table_detection(self) -> Self {
        Self {
            extract_tables: true,
            table_detection_config: None,
            ..self
        }
    }
}

/// Reading order determination mode for text blocks.
///
/// Determines how text blocks are ordered when converting to output formats.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReadingOrderMode {
    /// Simple top-to-bottom, left-to-right ordering.
    ///
    /// Sorts all blocks by Y coordinate (top to bottom), then by X coordinate (left to right).
    /// This works well for single-column documents.
    TopToBottomLeftToRight,

    /// Column-aware reading order.
    ///
    /// Uses the XY-Cut algorithm to detect columns and determines proper reading order
    /// across multiple columns. This works better for multi-column documents.
    ColumnAware,

    /// Structure tree first, with fallback to column-aware.
    ///
    /// For Tagged PDFs: Uses the PDF logical structure tree (ISO 32000-1:2008 Section 14.7)
    /// to determine reading order via Marked Content IDs (MCIDs). This is the PDF-spec-compliant
    /// approach and provides perfect reading order for Tagged PDFs.
    ///
    /// For Untagged PDFs: Falls back to ColumnAware (XY-Cut algorithm).
    ///
    /// The MCID reading order is carried by this variant's own `mcid_order` field
    /// (there is no separate `mcid_order` field on `ConversionOptions`).
    StructureTreeFirst {
        /// Reading order as a sequence of MCIDs from structure tree traversal.
        /// If empty, falls back to ColumnAware mode.
        mcid_order: Vec<u32>,
    },
}

#[cfg(test)]
mod tests {
    //! Unit tests for the option types defined in this module: default values,
    //! struct-update customization, derived trait behavior, and the table
    //! detection builder methods.

    use super::*;

    /// Every field of `ConversionOptions::default()` matches its documented default.
    #[test]
    fn test_conversion_options_default() {
        let opts = ConversionOptions::default();
        assert!(!opts.preserve_layout);
        assert!(opts.detect_headings);
        assert!(opts.extract_tables);
        assert!(!opts.include_images);
        assert_eq!(opts.image_output_dir, None);
        assert!(opts.embed_images);
        assert_eq!(
            opts.reading_order_mode,
            ReadingOrderMode::StructureTreeFirst { mcid_order: vec![] }
        );
        assert_eq!(opts.bold_marker_behavior, BoldMarkerBehavior::Conservative);
        assert!(opts.table_detection_config.is_none());
        assert!(!opts.render_formulas);
        assert!(opts.page_images.is_none());
        assert!(opts.page_dimensions.is_none());
        assert!(opts.include_form_fields);
        assert!(opts.max_image_pixels.is_none());
    }

    #[test]
    fn test_conversion_options_embed_images() {
        // Default: embed_images = true
        let opts = ConversionOptions::default();
        assert!(opts.embed_images);

        // Custom: embed_images = false
        let opts = ConversionOptions {
            embed_images: false,
            image_output_dir: Some("images/".to_string()),
            ..Default::default()
        };
        assert!(!opts.embed_images);
        assert_eq!(opts.image_output_dir, Some("images/".to_string()));
    }

    #[test]
    fn test_conversion_options_custom() {
        let opts = ConversionOptions {
            preserve_layout: true,
            detect_headings: false,
            extract_tables: false,
            include_images: false,
            image_output_dir: Some("output/".to_string()),
            reading_order_mode: ReadingOrderMode::ColumnAware,
            bold_marker_behavior: BoldMarkerBehavior::Aggressive,
            table_detection_config: None,
            ..Default::default()
        };

        assert!(opts.preserve_layout);
        assert!(!opts.detect_headings);
        assert!(!opts.extract_tables);
        assert!(!opts.include_images);
        assert_eq!(opts.image_output_dir, Some("output/".to_string()));
        assert_eq!(opts.reading_order_mode, ReadingOrderMode::ColumnAware);
        assert_eq!(opts.bold_marker_behavior, BoldMarkerBehavior::Aggressive);
        assert!(opts.table_detection_config.is_none());
    }

    #[test]
    fn test_reading_order_mode_equality() {
        assert_eq!(
            ReadingOrderMode::TopToBottomLeftToRight,
            ReadingOrderMode::TopToBottomLeftToRight
        );
        assert_ne!(ReadingOrderMode::TopToBottomLeftToRight, ReadingOrderMode::ColumnAware);

        // The MCID payload participates in equality.
        assert_ne!(
            ReadingOrderMode::StructureTreeFirst { mcid_order: vec![1, 2] },
            ReadingOrderMode::StructureTreeFirst { mcid_order: vec![] }
        );
    }

    #[test]
    fn test_conversion_options_clone() {
        let opts1 = ConversionOptions::default();
        let opts2 = opts1.clone();
        assert_eq!(opts1, opts2);
    }

    #[test]
    fn test_conversion_options_debug() {
        let opts = ConversionOptions::default();
        let debug_str = format!("{:?}", opts);
        assert!(debug_str.contains("ConversionOptions"));
    }

    #[test]
    fn test_bold_marker_behavior_default() {
        assert_eq!(BoldMarkerBehavior::default(), BoldMarkerBehavior::Conservative);
    }

    #[test]
    fn test_bold_marker_behavior_equality() {
        assert_eq!(BoldMarkerBehavior::Conservative, BoldMarkerBehavior::Conservative);
        assert_eq!(BoldMarkerBehavior::Aggressive, BoldMarkerBehavior::Aggressive);
        assert_ne!(BoldMarkerBehavior::Conservative, BoldMarkerBehavior::Aggressive);
    }

    #[test]
    fn test_bold_marker_behavior_copy_clone() {
        // Using `behavior` after the assignment only compiles because the type is `Copy`.
        let behavior = BoldMarkerBehavior::Aggressive;
        let copied = behavior;
        assert_eq!(behavior, copied);
    }

    #[test]
    fn test_with_default_table_detection() {
        let opts = ConversionOptions::default().with_default_table_detection();
        assert!(opts.extract_tables);
        assert!(opts.table_detection_config.is_none());
    }

    #[test]
    fn test_with_table_detection() {
        let config = crate::structure::TableDetectionConfig::strict();
        let opts = ConversionOptions::default().with_table_detection(config);
        assert!(opts.extract_tables);
        assert!(opts.table_detection_config.is_some());
        let cfg = opts.table_detection_config.unwrap();
        assert_eq!(cfg.min_table_columns, 3);
        assert_eq!(cfg.column_tolerance, 2.0);
    }

    #[test]
    fn test_with_default_table_detection_clears_custom_config() {
        // A custom configuration set earlier in the chain must be discarded,
        // and extraction must be re-enabled even if it was turned off.
        let config = crate::structure::TableDetectionConfig::strict();
        let opts = ConversionOptions {
            extract_tables: false,
            ..Default::default()
        }
        .with_table_detection(config)
        .with_default_table_detection();
        assert!(opts.extract_tables);
        assert!(opts.table_detection_config.is_none());
    }

    #[test]
    fn test_conversion_options_default_table_config() {
        let opts = ConversionOptions::default();
        // In v0.3.16, table extraction is enabled by default
        assert!(opts.extract_tables);
        // But detection config remains None (uses internal defaults) unless customized
        assert!(opts.table_detection_config.is_none());
    }

    #[test]
    fn test_include_images_default_false() {
        // Default should NOT include images to avoid output bloat from base64 embedding.
        // Users must explicitly opt in with include_images: true.
        let opts = ConversionOptions::default();
        assert!(
            !opts.include_images,
            "include_images should default to false to prevent bloated output"
        );
    }

    #[test]
    fn test_include_images_opt_in() {
        // When explicitly enabled, include_images should be true.
        let opts = ConversionOptions {
            include_images: true,
            ..Default::default()
        };
        assert!(opts.include_images);
        // embed_images should still default to true (base64 when images are included)
        assert!(opts.embed_images);
    }
}