kreuzberg 4.8.4

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and other files across 91+ formats, plus 248 programming languages via tree-sitter code intelligence, with async/sync APIs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
//! Format-specific extraction results and OCR configuration types.

use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

use super::document_structure::DocumentStructure;
use super::extraction::ExtractedImage;
use super::metadata::PptxMetadata;
use super::page::{PageContent, PageStructure};

/// Excel workbook representation.
///
/// Aggregates every worksheet extracted from a spreadsheet file
/// (.xlsx, .xls, etc.) together with workbook-level metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExcelWorkbook {
    /// All sheets in the workbook, one [`ExcelSheet`] per worksheet.
    pub sheets: Vec<ExcelSheet>,
    /// Workbook-level metadata (author, creation date, etc.) stored as
    /// string key/value pairs.
    pub metadata: HashMap<String, String>,
}

/// Single Excel worksheet.
///
/// Represents one sheet from an Excel workbook: its name, its content
/// rendered as Markdown, and dimensional statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExcelSheet {
    /// Sheet name as it appears in Excel
    pub name: String,
    /// Sheet content converted to Markdown tables
    pub markdown: String,
    /// Number of rows
    pub row_count: usize,
    /// Number of columns
    pub col_count: usize,
    /// Total number of non-empty cells
    pub cell_count: usize,
    /// Pre-extracted table cells (2D vector of cell values).
    ///
    /// Populated during markdown generation to avoid re-parsing markdown.
    /// `None` for empty sheets.
    ///
    /// Marked `#[serde(skip)]`: the field is never written when serializing,
    /// and a deserialized `ExcelSheet` always has `None` here.
    #[serde(skip)]
    pub table_cells: Option<Vec<Vec<String>>>,
}

/// XML extraction result.
///
/// Holds the text content pulled out of an XML file together with
/// structural statistics about the XML document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct XmlExtractionResult {
    /// Extracted text content (XML structure filtered out)
    pub content: String,
    /// Total number of XML elements processed
    pub element_count: usize,
    /// List of unique element names found (sorted)
    pub unique_elements: Vec<String>,
}

/// Plain text and Markdown extraction result.
///
/// Contains the extracted text along with basic statistics and,
/// for Markdown files, structural elements such as headers, links and
/// code blocks.
///
/// The three `Option` fields are left out of the serialized output
/// entirely when they are `None` (`skip_serializing_if`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextExtractionResult {
    /// Extracted text content
    pub content: String,
    /// Number of lines
    pub line_count: usize,
    /// Number of words
    pub word_count: usize,
    /// Number of characters
    pub character_count: usize,
    /// Markdown headers (text only, Markdown files only).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub headers: Option<Vec<String>>,
    /// Markdown links as (text, URL) tuples (Markdown files only).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub links: Option<Vec<(String, String)>>,
    /// Code blocks as (language, code) tuples (Markdown files only).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub code_blocks: Option<Vec<(String, String)>>,
}

/// PowerPoint (PPTX) extraction result.
///
/// Contains extracted slide content, presentation metadata, summary counts,
/// and embedded images.
///
/// Optional and collection fields carrying `skip_serializing_if` are dropped
/// from the serialized output when they are `None` / empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PptxExtractionResult {
    /// Extracted text content from all slides
    pub content: String,
    /// Presentation metadata
    pub metadata: PptxMetadata,
    /// Total number of slides
    pub slide_count: usize,
    /// Total number of embedded images
    pub image_count: usize,
    /// Total number of tables
    pub table_count: usize,
    /// Extracted images from the presentation
    pub images: Vec<ExtractedImage>,
    /// Slide structure with boundaries (when page tracking is enabled).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub page_structure: Option<PageStructure>,
    /// Per-slide content (when page tracking is enabled).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub page_contents: Option<Vec<PageContent>>,
    /// Structured document representation.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub document: Option<DocumentStructure>,
    /// Hyperlinks discovered in slides as (url, optional_label) pairs.
    ///
    /// Omitted from serialized output when empty; deserializes to an empty
    /// vector when the key is absent.
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub hyperlinks: Vec<(String, Option<String>)>,
    /// Office metadata extracted from docProps/core.xml and docProps/app.xml.
    ///
    /// Contains keys like "title", "author", "created_by", "subject", "keywords",
    /// "modified_by", "created_at", "modified_at", etc.
    ///
    /// Omitted from serialized output when empty; deserializes to an empty
    /// map when the key is absent.
    #[serde(skip_serializing_if = "HashMap::is_empty", default)]
    pub office_metadata: HashMap<String, String>,
}

/// Email extraction result.
///
/// Complete representation of an extracted email message (.eml or .msg):
/// addressing headers, body content in its available forms, and attachments.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmailExtractionResult {
    /// Email subject line, if present
    pub subject: Option<String>,
    /// Sender email address, if present
    pub from_email: Option<String>,
    /// Primary recipient email addresses
    pub to_emails: Vec<String>,
    /// CC recipient email addresses
    pub cc_emails: Vec<String>,
    /// BCC recipient email addresses
    pub bcc_emails: Vec<String>,
    /// Email date/timestamp, kept as a string
    pub date: Option<String>,
    /// Message-ID header value
    pub message_id: Option<String>,
    /// Plain text version of the email body
    pub plain_text: Option<String>,
    /// HTML version of the email body
    pub html_content: Option<String>,
    /// Cleaned/processed text content (always present, unlike the optional
    /// plain-text and HTML bodies)
    pub cleaned_text: String,
    /// List of email attachments
    pub attachments: Vec<EmailAttachment>,
    /// Additional email headers and metadata as string key/value pairs
    pub metadata: HashMap<String, String>,
}

/// Email attachment representation.
///
/// Contains metadata about an email attachment and, optionally, its raw
/// content. Every descriptive field is optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmailAttachment {
    /// Attachment name (from Content-Disposition header)
    pub name: Option<String>,
    /// Filename of the attachment
    pub filename: Option<String>,
    /// MIME type of the attachment
    pub mime_type: Option<String>,
    /// Size in bytes
    pub size: Option<usize>,
    /// Whether this attachment is an image
    pub is_image: bool,
    /// Attachment data (if extracted); `None` when only metadata is available.
    /// Uses `bytes::Bytes` for cheap cloning of large buffers.
    pub data: Option<Bytes>,
}

/// OCR extraction result.
///
/// Result of performing OCR on an image or scanned document,
/// including recognized text, detected tables, and optional structured
/// element data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrExtractionResult {
    /// Recognized text content
    pub content: String,
    /// Original MIME type of the processed image
    pub mime_type: String,
    /// OCR processing metadata (confidence scores, language, etc.) as
    /// arbitrary JSON values keyed by name
    pub metadata: HashMap<String, serde_json::Value>,
    /// Tables detected and extracted via OCR
    pub tables: Vec<OcrTable>,
    /// Structured OCR elements with bounding boxes and confidence scores.
    /// Available when TSV output is requested or table detection is enabled.
    ///
    /// Omitted from serialized output when `None`; deserializes to `None`
    /// when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ocr_elements: Option<Vec<super::OcrElement>>,
    /// Structured document produced from hOCR parsing.
    /// Carries paragraph structure, bounding boxes, and confidence scores
    /// that the flattened `content` string discards.
    ///
    /// Marked `#[serde(skip)]`: never serialized, and always `None` on a
    /// deserialized value.
    #[serde(skip)]
    pub internal_document: Option<super::internal::InternalDocument>,
}

/// A table recognized during OCR processing.
///
/// Carries the table both as raw cell text and as a ready-to-render
/// Markdown string, plus where it was found.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrTable {
    /// Cell text laid out as a 2D vector (outer = rows, inner = columns)
    pub cells: Vec<Vec<String>>,
    /// The same table rendered as Markdown
    pub markdown: String,
    /// 1-indexed number of the page the table was found on
    pub page_number: usize,
    /// Pixel-coordinate bounding box of the table, derived from OCR word
    /// positions.
    ///
    /// Left out of serialized output when `None`; falls back to `None` when
    /// the key is missing on deserialization.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub bounding_box: Option<OcrTableBoundingBox>,
}

/// Bounding box for an OCR-detected table in pixel coordinates.
///
/// Stored as four unsigned edge positions; the type is `Copy`, so it can be
/// passed around by value.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct OcrTableBoundingBox {
    /// Left x-coordinate (pixels)
    pub left: u32,
    /// Top y-coordinate (pixels)
    pub top: u32,
    /// Right x-coordinate (pixels)
    pub right: u32,
    /// Bottom y-coordinate (pixels)
    pub bottom: u32,
}

/// Image preprocessing configuration for OCR.
///
/// These settings control how images are preprocessed before OCR to improve
/// text recognition quality. Different preprocessing strategies work better
/// for different document types.
///
/// The container-level `#[serde(default)]` means any field missing from the
/// deserialized input takes its value from this type's `Default`
/// implementation.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct ImagePreprocessingConfig {
    /// Target DPI for the image (300 is standard, 600 for small text).
    /// Defaults to 300.
    pub target_dpi: i32,

    /// Auto-detect and correct image rotation. Defaults to `true`.
    pub auto_rotate: bool,

    /// Correct skew (tilted images). Defaults to `true`.
    pub deskew: bool,

    /// Remove noise from the image. Defaults to `false`.
    pub denoise: bool,

    /// Enhance contrast for better text visibility. Defaults to `false`.
    pub contrast_enhance: bool,

    /// Binarization method: "otsu", "sauvola", "adaptive".
    /// Defaults to "otsu".
    // NOTE(review): the accepted values are only listed in this comment; the
    // type itself does not restrict the string — confirm where it is validated.
    pub binarization_method: String,

    /// Invert colors (white text on black → black on white).
    /// Defaults to `false`.
    pub invert_colors: bool,
}

impl Default for ImagePreprocessingConfig {
    fn default() -> Self {
        Self {
            target_dpi: 300,
            auto_rotate: true,
            deskew: true,
            denoise: false,
            contrast_enhance: false,
            binarization_method: "otsu".to_string(),
            invert_colors: false,
        }
    }
}

/// Tesseract OCR configuration.
///
/// Provides fine-grained control over Tesseract OCR engine parameters.
/// Most users can use the defaults, but these settings allow optimization
/// for specific document types (invoices, handwriting, etc.).
///
/// The container-level `#[serde(default)]` means any field missing from the
/// deserialized input takes its value from this type's `Default`
/// implementation.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct TesseractConfig {
    /// Language code (e.g., "eng", "deu", "fra"). Defaults to "eng".
    pub language: String,

    /// Page Segmentation Mode (0-13).
    ///
    /// Common values:
    /// - 3: Fully automatic page segmentation (default)
    /// - 6: Assume a single uniform block of text
    /// - 11: Sparse text with no particular order
    pub psm: i32,

    /// Output format ("text" or "markdown"). Defaults to "markdown".
    pub output_format: String,

    /// OCR Engine Mode (0-3). Defaults to 3.
    ///
    /// - 0: Legacy engine only
    /// - 1: Neural nets (LSTM) only (usually best)
    /// - 2: Legacy + LSTM
    /// - 3: Default (based on what's available)
    pub oem: i32,

    /// Minimum confidence threshold (0.0-100.0). Defaults to 0.0.
    ///
    /// Words with confidence below this threshold may be rejected or flagged.
    pub min_confidence: f64,

    /// Image preprocessing configuration. Defaults to `None`.
    ///
    /// Controls how images are preprocessed before OCR. Can significantly
    /// improve quality for scanned documents or low-quality images.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub preprocessing: Option<ImagePreprocessingConfig>,

    /// Enable automatic table detection and reconstruction.
    /// Defaults to `true`.
    pub enable_table_detection: bool,

    /// Minimum confidence threshold for table detection (0.0-1.0).
    /// Defaults to 0.0. Note the scale differs from `min_confidence`
    /// (0.0-100.0).
    pub table_min_confidence: f64,

    /// Column threshold for table detection (pixels). Defaults to 50.
    pub table_column_threshold: i32,

    /// Row threshold ratio for table detection (0.0-1.0). Defaults to 0.5.
    pub table_row_threshold_ratio: f64,

    /// Enable OCR result caching. Defaults to `true`.
    pub use_cache: bool,

    /// Use pre-adapted templates for character classification.
    /// Defaults to `true`.
    pub classify_use_pre_adapted_templates: bool,

    /// Enable N-gram language model. Defaults to `false`.
    pub language_model_ngram_on: bool,

    /// Don't reject good words during block-level processing.
    /// Defaults to `true`.
    pub tessedit_dont_blkrej_good_wds: bool,

    /// Don't reject good words during row-level processing.
    /// Defaults to `true`.
    pub tessedit_dont_rowrej_good_wds: bool,

    /// Enable dictionary correction. Defaults to `true`.
    pub tessedit_enable_dict_correction: bool,

    /// Whitelist of allowed characters (empty = all allowed).
    /// Defaults to empty.
    pub tessedit_char_whitelist: String,

    /// Blacklist of forbidden characters (empty = none forbidden).
    /// Defaults to empty.
    pub tessedit_char_blacklist: String,

    /// Use primary language params model. Defaults to `true`.
    pub tessedit_use_primary_params_model: bool,

    /// Variable-width space detection. Defaults to `true`.
    pub textord_space_size_is_variable: bool,

    /// Use adaptive thresholding method. Defaults to `false`.
    // NOTE(review): a boolean named `..._method`; presumably a plain on/off
    // switch for adaptive thresholding — confirm against the engine variable
    // this is forwarded to.
    pub thresholding_method: bool,
}

impl Default for TesseractConfig {
    fn default() -> Self {
        Self {
            language: "eng".to_string(),
            psm: 3,
            output_format: "markdown".to_string(),
            oem: 3,
            min_confidence: 0.0,
            preprocessing: None,
            enable_table_detection: true,
            table_min_confidence: 0.0,
            table_column_threshold: 50,
            table_row_threshold_ratio: 0.5,
            use_cache: true,
            classify_use_pre_adapted_templates: true,
            language_model_ngram_on: false,
            tessedit_dont_blkrej_good_wds: true,
            tessedit_dont_rowrej_good_wds: true,
            tessedit_enable_dict_correction: true,
            tessedit_char_whitelist: String::new(),
            tessedit_char_blacklist: String::new(),
            tessedit_use_primary_params_model: true,
            textord_space_size_is_variable: true,
            thresholding_method: false,
        }
    }
}

/// Image preprocessing metadata.
///
/// Tracks the transformations applied to an image during OCR preprocessing,
/// including DPI normalization, resizing, and resampling.
///
/// When the `api` feature is enabled the type additionally derives
/// `utoipa::ToSchema`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
pub struct ImagePreprocessingMetadata {
    /// Original image dimensions (width, height) in pixels
    pub original_dimensions: (usize, usize),
    /// Original image DPI (horizontal, vertical)
    pub original_dpi: (f64, f64),
    /// Target DPI from configuration
    pub target_dpi: i32,
    /// Scaling factor applied to the image
    pub scale_factor: f64,
    /// Whether DPI was auto-adjusted based on content
    pub auto_adjusted: bool,
    /// Final DPI after processing
    pub final_dpi: i32,
    /// New dimensions after resizing (if resized); `None` otherwise
    pub new_dimensions: Option<(usize, usize)>,
    /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
    pub resample_method: String,
    /// Whether dimensions were clamped to max_image_dimension
    pub dimension_clamped: bool,
    /// Calculated optimal DPI (if auto_adjust_dpi enabled)
    pub calculated_dpi: Option<i32>,
    /// Whether resize was skipped (dimensions already optimal)
    pub skipped_resize: bool,
    /// Error message if resize failed; `None` when no error was recorded
    pub resize_error: Option<String>,
}

/// Image extraction configuration (internal use).
///
/// **Note:** This is an internal type used for image preprocessing.
/// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
///
/// Unlike the other configuration structs in this file, this one carries no
/// container-level `#[serde(default)]`, so deserialization requires every
/// field to be present.
// NOTE(review): confirm whether omitting `#[serde(default)]` here is intentional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionConfig {
    /// Target DPI for image normalization (`Default` uses 300)
    pub target_dpi: i32,
    /// Maximum image dimension (width or height) (`Default` uses 4096)
    pub max_image_dimension: i32,
    /// Whether to auto-adjust DPI based on content (`Default` uses `true`)
    pub auto_adjust_dpi: bool,
    /// Minimum DPI threshold (`Default` uses 72)
    pub min_dpi: i32,
    /// Maximum DPI threshold (`Default` uses 600)
    pub max_dpi: i32,
}

impl Default for ExtractionConfig {
    fn default() -> Self {
        Self {
            target_dpi: 300,
            max_image_dimension: 4096,
            auto_adjust_dpi: true,
            min_dpi: 72,
            max_dpi: 600,
        }
    }
}

/// Cache statistics.
///
/// Provides information about the extraction result cache,
/// including size, file count, and age distribution.
/// Sizes are expressed in megabytes and ages in days, both as floating-point
/// values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheStats {
    /// Total number of cached files
    pub total_files: usize,
    /// Total cache size in megabytes
    pub total_size_mb: f64,
    /// Available disk space in megabytes
    pub available_space_mb: f64,
    /// Age of the oldest cached file in days
    pub oldest_file_age_days: f64,
    /// Age of the newest cached file in days
    pub newest_file_age_days: f64,
}