pdf_oxide 0.3.35

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
//! Structured text extraction with document semantics.
//!
//! This module provides structured extraction that preserves document structure
//! including paragraphs and formatting information (bold, italic, font size).
//!
//! # Note on PDF Spec Compliance
//!
//! This module has been refactored to focus on PDF-specification-compliant extraction.
//! The following non-spec-compliant heuristics have been removed:
//! - Header/heading detection (based on font size heuristics)
//! - List detection (based on marker patterns)
//! - Text alignment detection (based on position heuristics)
//!
//! These detections are not based on PDF structure but on assumptions about document
//! layout, which may not hold for all PDFs. For semantic extraction, applications
//! should use PDF structure tags (if available) or their own domain-specific logic.
//!
//! # Usage
//!
//! ```ignore
//! use pdf_oxide::PdfDocument;
//! use pdf_oxide::extractors::StructuredExtractor;
//!
//! let mut doc = PdfDocument::open("document.pdf")?;
//! let mut extractor = StructuredExtractor::new();
//! let structured = extractor.extract_page(&mut doc, 0)?;
//!
//! for element in structured.elements {
//!     match element {
//!         DocumentElement::Paragraph { text, .. } => {
//!             println!("P: {}", text);
//!         }
//!         _ => {} // Other element types for API compatibility only
//!     }
//! }
//! # Ok::<(), pdf_oxide::error::Error>(())
//! ```

use crate::document::PdfDocument;
use crate::error::{Error, Result};
use crate::geometry::Rect;
use crate::layout::TextBlock;
use serde::{Deserialize, Serialize};

/// A structured document with semantic elements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StructuredDocument {
    /// Document elements in reading order
    pub elements: Vec<DocumentElement>,

    /// Page dimensions
    pub page_size: (f32, f32), // (width, height)

    /// Metadata
    pub metadata: DocumentMetadata,
}

/// Document element types with semantic meaning.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum DocumentElement {
    /// Header/title element
    #[serde(rename = "header")]
    Header {
        /// Header level (1-6, where 1 is largest)
        level: u8,
        /// Text content
        text: String,
        /// Text styling
        style: TextStyle,
        /// Bounding box
        bbox: BoundingBox,
    },

    /// Paragraph element
    #[serde(rename = "paragraph")]
    Paragraph {
        /// Text content
        text: String,
        /// Text styling
        style: TextStyle,
        /// Bounding box
        bbox: BoundingBox,
        /// Text alignment (left, center, right, justified)
        alignment: TextAlignment,
    },

    /// List element (ordered or unordered)
    #[serde(rename = "list")]
    List {
        /// List items
        items: Vec<ListItem>,
        /// Whether list is ordered (numbered) or unordered (bullets)
        ordered: bool,
        /// Bounding box
        bbox: BoundingBox,
    },

    /// Table element (future enhancement)
    #[serde(rename = "table")]
    Table {
        /// Number of rows
        rows: usize,
        /// Number of columns
        cols: usize,
        /// Cell data (row-major order)
        cells: Vec<Vec<String>>,
        /// Bounding box
        bbox: BoundingBox,
    },
}

/// List item with optional nesting.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ListItem {
    /// Item text
    pub text: String,
    /// Item styling
    pub style: TextStyle,
    /// Nested list (if any)
    pub nested: Option<Box<DocumentElement>>,
    /// Bounding box
    pub bbox: BoundingBox,
}

/// Text styling information.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextStyle {
    /// Font family name
    pub font_family: String,
    /// Font size in points
    pub font_size: f32,
    /// Bold text
    pub bold: bool,
    /// Italic text
    pub italic: bool,
    /// Text color (RGB 0.0-1.0)
    pub color: (f32, f32, f32),
}

impl Default for TextStyle {
    fn default() -> Self {
        Self {
            font_family: "Unknown".to_string(),
            font_size: 12.0,
            bold: false,
            italic: false,
            color: (0.0, 0.0, 0.0), // black
        }
    }
}

/// Bounding box (x, y, width, height).
pub type BoundingBox = (f32, f32, f32, f32);

/// Text alignment.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum TextAlignment {
    /// Left-aligned
    Left,
    /// Center-aligned
    Center,
    /// Right-aligned
    Right,
    /// Justified
    Justified,
}

/// Document metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// Total number of elements extracted
    pub element_count: usize,
    /// Number of headers
    pub header_count: usize,
    /// Number of paragraphs
    pub paragraph_count: usize,
    /// Number of lists
    pub list_count: usize,
    /// Number of tables
    pub table_count: usize,
}

/// Structured text extractor.
///
/// Converts positioned characters into semantic document elements.
#[allow(dead_code)]
pub struct StructuredExtractor {
    /// Configuration options
    config: ExtractorConfig,
}

/// Extraction configuration.
///
/// # Note
///
/// Header detection, list detection, and alignment detection have been removed
/// for PDF-specification compliance. These options are retained for API compatibility
/// but no longer have any effect.
#[derive(Debug, Clone)]
pub struct ExtractorConfig {
    /// (Deprecated) Minimum font size to consider as header
    /// This option no longer has any effect.
    #[allow(dead_code)]
    pub min_header_size: f32,
    /// (Deprecated) Maximum header levels to detect
    /// This option no longer has any effect.
    #[allow(dead_code)]
    pub max_header_levels: u8,
    /// (Deprecated) Vertical gap threshold for paragraph breaks
    /// This option no longer has any effect.
    #[allow(dead_code)]
    pub paragraph_gap_threshold: f32,
    /// (Deprecated) Enable list detection
    /// This option no longer has any effect.
    #[allow(dead_code)]
    pub detect_lists: bool,
    /// (Deprecated) Enable table detection
    /// This option no longer has any effect.
    #[allow(dead_code)]
    pub detect_tables: bool,
}

impl Default for ExtractorConfig {
    fn default() -> Self {
        Self {
            min_header_size: 14.0,
            max_header_levels: 6,
            paragraph_gap_threshold: 1.5,
            detect_lists: false,
            detect_tables: false,
        }
    }
}

impl StructuredExtractor {
    /// Create a new structured extractor with default configuration.
    pub fn new() -> Self {
        Self {
            config: ExtractorConfig::default(),
        }
    }

    /// Create a new structured extractor with custom configuration.
    pub fn with_config(config: ExtractorConfig) -> Self {
        Self { config }
    }

    /// Extract structured content from a page.
    ///
    /// # Arguments
    ///
    /// * `document` - The PDF document
    /// * `page_num` - Zero-based page number
    ///
    /// # Returns
    ///
    /// A structured document with paragraphs and formatting information.
    ///
    /// # Errors
    ///
    /// Returns an error if the page cannot be processed.
    ///
    /// # Note
    ///
    /// This method extracts text blocks from the PDF and converts them to paragraphs
    /// with formatting information (bold, italic, font size). Header detection, list
    /// detection, and alignment detection have been removed for PDF-spec compliance.
    pub fn extract_page(
        &mut self,
        document: &mut PdfDocument,
        page_num: u32,
    ) -> Result<StructuredDocument> {
        // Step 1: Extract text spans (already grouped by PDF text operators)
        let spans = document.extract_spans(page_num as usize)?;

        if spans.is_empty() {
            return Ok(StructuredDocument {
                elements: Vec::new(),
                page_size: (0.0, 0.0),
                metadata: DocumentMetadata {
                    element_count: 0,
                    header_count: 0,
                    paragraph_count: 0,
                    list_count: 0,
                    table_count: 0,
                },
            });
        }

        // Step 2: Convert spans to text blocks (spans are already properly grouped)
        let blocks = self.spans_to_blocks(&spans);

        // Step 3: Convert all blocks to paragraphs
        // (Header detection, list detection, and alignment detection removed for spec compliance)
        let elements = self.blocks_to_simple_paragraphs(&blocks);

        // Step 4: Calculate page size and metadata
        let page_size = self.calculate_page_size_from_spans(&spans);
        let metadata = self.calculate_metadata(&elements);

        Ok(StructuredDocument {
            elements,
            page_size,
            metadata,
        })
    }

    /// Convert text spans to text blocks.
    ///
    /// Spans are already properly grouped by the PDF content stream operators,
    /// so we just convert the data structure.
    fn spans_to_blocks(&self, spans: &[crate::layout::TextSpan]) -> Vec<TextBlock> {
        spans
            .iter()
            .map(|span| TextBlock {
                chars: Vec::new(), // Not needed for structure detection
                bbox: span.bbox,
                text: span.text.clone(),
                avg_font_size: span.font_size,
                dominant_font: span.font_name.clone(),
                is_bold: span.font_weight == crate::layout::FontWeight::Bold,
                is_italic: span.is_italic,
                mcid: span.mcid,
            })
            .collect()
    }

    /// Calculate page size from text spans.
    fn calculate_page_size_from_spans(&self, spans: &[crate::layout::TextSpan]) -> (f32, f32) {
        if spans.is_empty() {
            return (0.0, 0.0);
        }

        let mut max_x = 0.0f32;
        let mut max_y = 0.0f32;

        for span in spans {
            max_x = max_x.max(span.bbox.x + span.bbox.width);
            max_y = max_y.max(span.bbox.y + span.bbox.height);
        }

        (max_x, max_y)
    }

    /// Convert all blocks to simple paragraphs without semantic classification.
    ///
    /// This method converts text blocks to paragraphs with formatting information only.
    /// Header detection, list detection, and alignment detection have been removed
    /// for PDF-specification compliance.
    fn blocks_to_simple_paragraphs(&self, blocks: &[TextBlock]) -> Vec<DocumentElement> {
        blocks
            .iter()
            .map(|block| DocumentElement::Paragraph {
                text: block.text.clone(),
                style: Self::block_to_text_style(block),
                bbox: Self::bbox_from_rect(block.bbox),
                alignment: TextAlignment::Left, // No alignment detection
            })
            .collect()
    }

    /// Convert TextBlock to TextStyle with bold/italic detection.
    fn block_to_text_style(block: &TextBlock) -> TextStyle {
        // Detect bold from font name
        let bold = block.is_bold || block.dominant_font.contains("Bold");

        // Detect italic from font name
        let italic =
            block.dominant_font.contains("Italic") || block.dominant_font.contains("Oblique");

        // Use first character's color if available, otherwise black
        let color = block
            .chars
            .first()
            .map(|c| (c.color.r, c.color.g, c.color.b))
            .unwrap_or((0.0, 0.0, 0.0));

        TextStyle {
            font_family: block.dominant_font.clone(),
            font_size: block.avg_font_size,
            bold,
            italic,
            color,
        }
    }

    /// Convert Rect to BoundingBox tuple.
    fn bbox_from_rect(rect: Rect) -> BoundingBox {
        (rect.x, rect.y, rect.width, rect.height)
    }

    /// Calculate page dimensions from characters.
    /// Calculate document metadata.
    fn calculate_metadata(&self, elements: &[DocumentElement]) -> DocumentMetadata {
        let mut header_count = 0;
        let mut paragraph_count = 0;
        let mut list_count = 0;
        let mut table_count = 0;

        for elem in elements {
            match elem {
                DocumentElement::Header { .. } => header_count += 1,
                DocumentElement::Paragraph { .. } => paragraph_count += 1,
                DocumentElement::List { .. } => list_count += 1,
                DocumentElement::Table { .. } => table_count += 1,
            }
        }

        DocumentMetadata {
            element_count: elements.len(),
            header_count,
            paragraph_count,
            list_count,
            table_count,
        }
    }
}

impl Default for StructuredExtractor {
    fn default() -> Self {
        Self::new()
    }
}

impl StructuredDocument {
    /// Convert to plain text (for backward compatibility).
    pub fn to_plain_text(&self) -> String {
        let mut text = String::new();

        for element in &self.elements {
            match element {
                DocumentElement::Header { text: t, .. } => {
                    if !text.is_empty() {
                        text.push('\n');
                    }
                    text.push_str(t);
                    text.push('\n');
                },
                DocumentElement::Paragraph { text: t, .. } => {
                    if !text.is_empty() {
                        text.push('\n');
                    }
                    text.push_str(t);
                },
                DocumentElement::List { items, .. } => {
                    for item in items {
                        text.push('\n');
                        text.push_str(&item.text);
                    }
                },
                DocumentElement::Table { cells, .. } => {
                    for row in cells {
                        text.push('\n');
                        text.push_str(&row.join("\t"));
                    }
                },
            }
        }

        text
    }

    /// Export to JSON.
    pub fn to_json(&self) -> Result<String> {
        serde_json::to_string_pretty(self).map_err(|e| Error::ParseError {
            offset: 0,
            reason: format!("Failed to serialize to JSON: {}", e),
        })
    }
}