kreuzberg 4.8.2

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 91+ formats and 248 programming languages via tree-sitter code intelligence with async/sync APIs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
//! Font size clustering for PDF hierarchy extraction.
//!
//! This module implements k-means clustering on font sizes to identify
//! document hierarchy levels (headings vs body text).

use super::extraction::TextBlock;
use crate::pdf::error::{PdfError, Result};

// K-means algorithm constants
const KMEANS_MAX_ITERATIONS: usize = 100;
const KMEANS_CONVERGENCE_THRESHOLD: f32 = 0.01;

/// A cluster of text blocks with the same font size characteristics.
#[derive(Debug, Clone)]
pub struct FontSizeCluster {
    /// The centroid (mean) font size of this cluster
    pub centroid: f32,
    /// The text blocks that belong to this cluster
    pub members: Vec<TextBlock>,
}

/// Cluster text blocks by font size using k-means algorithm.
///
/// Uses k-means clustering to group text blocks by their font size, which helps
/// identify document hierarchy levels (H1, H2, Body, etc.). The algorithm:
/// 1. Extracts font sizes from text blocks
/// 2. Applies k-means clustering to group similar font sizes
/// 3. Sorts clusters by centroid size in descending order (largest = H1)
/// 4. Returns clusters with their member blocks
///
/// # Arguments
///
/// * `blocks` - Slice of TextBlock objects to cluster
/// * `k` - Number of clusters to create
///
/// # Returns
///
/// Result with vector of FontSizeCluster ordered by size (descending),
/// or an error if clustering fails
///
/// # Example
///
/// ```rust,no_run
/// # #[cfg(feature = "pdf")]
/// # {
/// use kreuzberg::pdf::hierarchy::{TextBlock, BoundingBox, cluster_font_sizes};
///
/// let blocks = vec![
///     TextBlock {
///         text: "Title".to_string(),
///         bbox: BoundingBox { left: 0.0, top: 0.0, right: 100.0, bottom: 24.0 },
///         font_size: 24.0,
///     },
///     TextBlock {
///         text: "Body".to_string(),
///         bbox: BoundingBox { left: 0.0, top: 30.0, right: 100.0, bottom: 42.0 },
///         font_size: 12.0,
///     },
/// ];
///
/// let clusters = cluster_font_sizes(&blocks, 2).unwrap();
/// assert_eq!(clusters.len(), 2);
/// assert_eq!(clusters[0].centroid, 24.0); // Largest is first
/// # }
/// ```
pub fn cluster_font_sizes(blocks: &[TextBlock], k: usize) -> Result<Vec<FontSizeCluster>> {
    if blocks.is_empty() {
        return Ok(Vec::new());
    }

    if k == 0 {
        return Err(PdfError::TextExtractionFailed("K must be greater than 0".to_string()));
    }

    let actual_k = k.min(blocks.len());

    // Extract unique font sizes for initialization
    let mut font_sizes: Vec<f32> = blocks
        .iter()
        .map(|b| b.font_size)
        .filter(|fs| fs.is_finite()) // Filter out NaN/Inf from corrupt PDFs
        .collect();
    font_sizes.sort_by(|a, b| b.total_cmp(a)); // Sort descending
    // Tolerance-based dedup: merge font sizes within 0.05pt (float imprecision)
    font_sizes.dedup_by(|a, b| (*a - *b).abs() < 0.05);

    // Initialize centroids using actual font sizes from the data
    // This is more robust than dividing the range uniformly
    let mut centroids: Vec<f32> = Vec::new();

    if font_sizes.len() >= actual_k {
        // If we have at least k unique font sizes, pick them evenly spaced
        let step = font_sizes.len() / actual_k;
        for i in 0..actual_k {
            let idx = i * step;
            centroids.push(font_sizes[idx.min(font_sizes.len() - 1)]);
        }
    } else {
        // If we have fewer unique sizes than k, use all of them and fill with interpolated values
        centroids = font_sizes.clone();

        // Add interpolated centroids between existing ones to reach desired k
        let min_font = font_sizes[font_sizes.len() - 1];
        let max_font = font_sizes[0];
        let range = max_font - min_font;

        while centroids.len() < actual_k {
            let t = centroids.len() as f32 / (actual_k - 1) as f32;
            let interpolated = max_font - t * range;
            centroids.push(interpolated);
        }

        centroids.sort_by(|a, b| b.total_cmp(a));
        // Keep sorted descending
    }

    // Extract font sizes once for iteration loop
    let font_sizes: Vec<f32> = blocks.iter().map(|b| b.font_size).collect();

    // Pre-compute initial assignments to detect changes across iterations
    let mut prev_assignments: Vec<usize> = vec![0; font_sizes.len()];
    // Initialise so the first iteration always runs (sentinel: usize::MAX is never a valid cluster)
    let mut first_iter = true;

    // Run k-means clustering for a fixed number of iterations
    for _ in 0..KMEANS_MAX_ITERATIONS {
        // Assign font sizes to nearest centroid, tracking per-element assignments
        let (size_clusters, assignments) = assign_sizes_to_centroids_tracked(&font_sizes, &centroids);

        // Early exit: if no assignment changed since the last iteration, the solution is stable
        let assignments_changed = if first_iter {
            first_iter = false;
            1 // Force at least one centroid update
        } else {
            assignments
                .iter()
                .zip(prev_assignments.iter())
                .filter(|(a, b)| a != b)
                .count()
        };
        prev_assignments = assignments;

        if assignments_changed == 0 {
            break;
        }

        // Update centroids
        let mut new_centroids = Vec::with_capacity(actual_k);
        for (i, cluster) in size_clusters.iter().enumerate() {
            if !cluster.is_empty() {
                new_centroids.push(cluster.iter().sum::<f32>() / cluster.len() as f32);
            } else {
                new_centroids.push(centroids[i]);
            }
        }

        // Check for convergence based on centroid movement
        let converged = centroids
            .iter()
            .zip(new_centroids.iter())
            .all(|(old, new)| (old - new).abs() < KMEANS_CONVERGENCE_THRESHOLD);

        std::mem::swap(&mut centroids, &mut new_centroids);

        if converged {
            break;
        }
    }

    // Final assignment to create result
    let clusters = assign_blocks_to_centroids(blocks, &centroids);

    // Create FontSizeCluster objects with centroids
    let mut result: Vec<FontSizeCluster> = Vec::new();

    for i in 0..actual_k {
        if !clusters[i].is_empty() {
            let centroid_value = centroids[i];
            result.push(FontSizeCluster {
                centroid: centroid_value,
                members: clusters[i].clone(),
            });
        }
    }

    // Sort by centroid size in descending order (largest font = H1)
    result.sort_by(|a, b| b.centroid.total_cmp(&a.centroid));

    Ok(result)
}

/// Assign heading levels using the "most frequent cluster = Body" rule.
///
/// Instead of naively mapping the largest font size to H1, this function
/// identifies the cluster with the most members as body text. Only clusters
/// with fewer members AND sufficiently larger font size than body become headings.
///
/// # Arguments
///
/// * `clusters` - Slice of FontSizeCluster objects (sorted by centroid descending)
/// * `min_heading_ratio` - Minimum ratio of heading centroid to body centroid (e.g. 1.15)
/// * `min_heading_gap` - Minimum absolute font-size difference in points (e.g. 1.5)
///
/// # Returns
///
/// Vector of tuples `(centroid, heading_level)` where `None` means body text
/// and `Some(1..=6)` means H1-H6. Sorted by centroid descending.
pub fn assign_heading_levels_smart(
    clusters: &[FontSizeCluster],
    min_heading_ratio: f32,
    min_heading_gap: f32,
) -> Vec<(f32, Option<u8>)> {
    if clusters.is_empty() {
        return Vec::new();
    }

    // Single cluster means everything is body
    if clusters.len() == 1 {
        return vec![(clusters[0].centroid, None)];
    }

    // Find the cluster with the most total text content (this is body text).
    // Using total character count rather than member count avoids misidentifying
    // a small repeated header/footer font as the body cluster in documents
    // where headers appear on every page (many members, but little text each).
    let body_idx = clusters
        .iter()
        .enumerate()
        .max_by_key(|(_, c)| c.members.iter().map(|block| block.text.len()).sum::<usize>())
        .map(|(i, _)| i)
        .unwrap_or(0);

    let body_centroid = clusters[body_idx].centroid;

    // Collect heading candidates: clusters with sufficiently larger font size than body.
    // Must pass EITHER the ratio gate OR the absolute gap gate (whichever is less
    // restrictive). This captures LaTeX subsection headings (12pt vs 10pt body)
    // that pass the ratio gate (1.2 > 1.15) but not the gap gate (2.0 < 1.5+10=11.5).
    let min_heading_size = body_centroid * min_heading_ratio;
    let min_heading_abs = body_centroid + min_heading_gap;
    let heading_threshold = min_heading_size.min(min_heading_abs);

    let mut heading_candidates: Vec<(usize, f32)> = clusters
        .iter()
        .enumerate()
        .filter(|(i, c)| *i != body_idx && c.centroid >= heading_threshold)
        .map(|(i, c)| (i, c.centroid))
        .collect();

    // Sort heading candidates by centroid descending (largest = H1)
    heading_candidates.sort_by(|a, b| b.1.total_cmp(&a.1));

    // Assign heading levels H1-H6 (max 6 heading levels)
    let max_headings = 6usize;
    let mut result: Vec<(f32, Option<u8>)> = Vec::with_capacity(clusters.len());

    for (i, cluster) in clusters.iter().enumerate() {
        if i == body_idx {
            result.push((cluster.centroid, None));
        } else if let Some(pos) = heading_candidates.iter().position(|(idx, _)| *idx == i) {
            if pos < max_headings {
                result.push((cluster.centroid, Some((pos + 1) as u8)));
            } else {
                // More than 6 heading levels, treat as body
                result.push((cluster.centroid, None));
            }
        } else {
            // Smaller font size than body, treat as body
            result.push((cluster.centroid, None));
        }
    }

    result
}

/// Helper function to assign font sizes to their nearest centroid (for iteration loop).
///
/// Assigns font sizes to clusters without cloning full TextBlock objects, and also
/// returns per-element cluster assignments so the caller can detect convergence via
/// unchanged assignments (in addition to the centroid-movement threshold).
///
/// # Arguments
///
/// * `font_sizes` - Slice of font size values to assign
/// * `centroids` - Slice of centroid values (one per cluster)
///
/// # Returns
///
/// A tuple of:
/// - A vector of clusters, where each cluster contains the font sizes assigned to that centroid
/// - A vector of per-element cluster indices (same length as `font_sizes`)
fn assign_sizes_to_centroids_tracked(font_sizes: &[f32], centroids: &[f32]) -> (Vec<Vec<f32>>, Vec<usize>) {
    let mut clusters: Vec<Vec<f32>> = vec![Vec::new(); centroids.len()];
    let mut assignments: Vec<usize> = Vec::with_capacity(font_sizes.len());

    for &size in font_sizes {
        let mut min_distance = f32::INFINITY;
        let mut best_cluster = 0;

        for (i, &centroid) in centroids.iter().enumerate() {
            let distance = (size - centroid).abs();
            if distance < min_distance {
                min_distance = distance;
                best_cluster = i;
            }
        }

        clusters[best_cluster].push(size);
        assignments.push(best_cluster);
    }

    (clusters, assignments)
}

/// Helper function to assign blocks to their nearest centroid.
///
/// Iterates through blocks and finds the closest centroid for each block,
/// grouping them into clusters. Used in the final assignment step after convergence.
///
/// # Arguments
///
/// * `blocks` - Slice of TextBlock objects to assign
/// * `centroids` - Slice of centroid values (one per cluster)
///
/// # Returns
///
/// A vector of clusters, where each cluster contains the TextBlock objects
/// assigned to that centroid
fn assign_blocks_to_centroids(blocks: &[TextBlock], centroids: &[f32]) -> Vec<Vec<TextBlock>> {
    let mut clusters: Vec<Vec<TextBlock>> = vec![Vec::new(); centroids.len()];

    for block in blocks {
        let mut min_distance = f32::INFINITY;
        let mut best_cluster = 0;

        for (i, &centroid) in centroids.iter().enumerate() {
            let distance = (block.font_size - centroid).abs();
            if distance < min_distance {
                min_distance = distance;
                best_cluster = i;
            }
        }

        clusters[best_cluster].push(block.clone());
    }

    clusters
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::pdf::hierarchy::bounding_box::BoundingBox;

    fn make_block(text: &str, font_size: f32) -> TextBlock {
        TextBlock {
            text: text.to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 0.0,
                right: 100.0,
                bottom: font_size,
            },
            font_size,
        }
    }

    #[test]
    fn test_body_cluster_by_text_content_not_member_count() {
        // Scenario: 10 short header blocks (8pt) vs 3 long body blocks (12pt).
        // By member count, the 8pt cluster would win. By text content, the 12pt
        // cluster should win because body text contains more characters.
        let mut blocks = Vec::new();
        for i in 0..10 {
            blocks.push(make_block(&format!("Hdr{i}"), 8.0)); // 4 chars each = 40 total
        }
        for _ in 0..3 {
            // Each body block has ~50 chars = 150 total
            blocks.push(make_block("This is a longer body text paragraph with content.", 12.0));
        }

        let clusters = cluster_font_sizes(&blocks, 2).unwrap();
        let levels = assign_heading_levels_smart(&clusters, 1.15, 1.5);

        // The 12pt cluster (body text) should be identified as body (None),
        // and the 8pt cluster should NOT be body.
        let body_centroid = levels.iter().find(|(_, l)| l.is_none()).map(|(c, _)| *c);
        assert!(body_centroid.is_some(), "should have a body cluster");
        // Body centroid should be near 12pt, not 8pt
        let bc = body_centroid.unwrap();
        assert!((bc - 12.0).abs() < 1.0, "body centroid should be near 12pt, got {bc}");
    }

    #[test]
    fn test_body_cluster_equal_members_picks_more_content() {
        // Same number of members but different text lengths
        let blocks = vec![
            make_block("AB", 18.0),
            make_block("CD", 18.0),
            make_block("This is much longer body text content here.", 12.0),
            make_block("Another long paragraph of body text for the doc.", 12.0),
        ];

        let clusters = cluster_font_sizes(&blocks, 2).unwrap();
        let levels = assign_heading_levels_smart(&clusters, 1.15, 1.5);

        let body_centroid = levels.iter().find(|(_, l)| l.is_none()).map(|(c, _)| *c);
        assert!(body_centroid.is_some());
        let bc = body_centroid.unwrap();
        assert!(
            (bc - 12.0).abs() < 1.0,
            "body should be 12pt cluster (more text), got {bc}"
        );
    }
}