kreuzberg 4.3.0

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 75+ formats with async/sync APIs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
//! PDF text hierarchy extraction using pdfium character positions.
//!
//! This module provides functions for extracting character information from PDFs,
//! preserving font size and position data for text hierarchy analysis.
//!
//! Note: this module requires the `pdf` feature to be enabled.

// Implementation is split across three private submodules. They are declared
// without `pub`, so code outside this module cannot name them directly and
// must go through the re-exports below.
mod bounding_box;
mod clustering;
mod extraction;

// Re-export all public types and functions for backward compatibility
// (each item stays importable from this module's own path, regardless of
// which submodule defines it).

// From `bounding_box`: the `BoundingBox` type.
pub use bounding_box::BoundingBox;
// From `clustering`: `FontSizeCluster` and the `cluster_font_sizes` function.
pub use clustering::{FontSizeCluster, cluster_font_sizes};
// From `extraction`: the character/block data types together with the
// extraction, block-merging, hierarchy-assignment, and OCR-trigger functions.
pub use extraction::{
    CharData, HierarchyBlock, HierarchyLevel, KMeansResult, TextBlock, assign_hierarchy_levels,
    assign_hierarchy_levels_from_clusters, extract_chars_with_fonts, merge_chars_into_blocks, should_trigger_ocr,
};