fleischwolf-pdf 0.9.1

PDF/image backend for Fleischwolf: pdfium text extraction + ONNX layout/table/OCR pipeline.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
//! Port of docling-parse's line-cell sanitizer
//! (`src/parse/page_item_sanitators/cells.h` → `create_line_cells` /
//! `contract_cells_into_lines_v1`). It merges per-glyph char cells into line
//! cells via a 3-pass contraction — left-to-right, right-to-left, then
//! left-to-right with reverse — using corner-distance adjacency and inserting at
//! most one space per merge. This reproduces docling-parse's inter-word spacing
//! (justified double spaces, the space before a `:`, and RTL ordering) that the
//! ad-hoc `lines_from_glyphs` reconstruction can't.
//!
//! Geometry uses native PDF coordinates (y increases upward); each cell carries
//! its four transformed corners r0=bottom-left, r1=bottom-right, r2=top-right,
//! r3=top-left, exactly like `page_cell.h`.

use crate::pdfium_backend::{Glyph, TextCell};

// config.h: the factors that actually bind for line cells.
const MERGE: f64 = 1.0; // line_space_width_factor_for_merge (adjacency gate)
const MERGE_WITH_SPACE: f64 = 0.33; // line_space_width_factor_for_merge_with_space
const H_TOL: f64 = 1.0; // horizontal_cell_tolerance (ligature eps_d1 relaxation)

#[derive(Clone)]
struct Cell {
    text: String,
    rx0: f64,
    ry0: f64, // bottom-left
    rx1: f64,
    ry1: f64, // bottom-right
    rx2: f64,
    ry2: f64, // top-right
    rx3: f64,
    ry3: f64, // top-left
    ltr: bool,
    active: bool,
    lig_carry: bool, // last_merged_cell_was_ligature
    font: u64,       // hash of the PDF font name+flags (for enforce_same_font)
    /// Sub-word segments accumulated during contraction, in final logical order.
    /// A word boundary is recorded wherever `merge_with` inserts a separator space
    /// (`delta < gap`); within a boundary the glyphs share one segment. Flattening
    /// these across all cells yields docling-parse's `word_cells` (item 6). The
    /// line path ignores this; only [`word_cells`] reads it.
    words: Vec<WordSeg>,
}

/// One word's accumulated text and native-coordinate bounding box (y up).
#[derive(Clone)]
struct WordSeg {
    text: String,
    l: f64,
    b: f64,
    r: f64,
    t: f64,
}

impl WordSeg {
    fn from_glyph(text: String, l: f64, b: f64, r: f64, t: f64) -> Self {
        WordSeg { text, l, b, r, t }
    }
    /// Absorb `o` into this segment (same word): union the box, append text.
    fn absorb(&mut self, o: &WordSeg) {
        self.text.push_str(&o.text);
        self.l = self.l.min(o.l);
        self.b = self.b.min(o.b);
        self.r = self.r.max(o.r);
        self.t = self.t.max(o.t);
    }
    /// Extend the box to cover a single glyph (ligature recompose into one word).
    fn extend(&mut self, l: f64, b: f64, r: f64, t: f64) {
        self.l = self.l.min(l);
        self.b = self.b.min(b);
        self.r = self.r.max(r);
        self.t = self.t.max(t);
    }
}

/// Concatenate two word runs (in final logical order). With a separator space
/// the runs stay distinct (a word boundary); without one, `left`'s last word and
/// `right`'s first word are the same word and merge. Mirrors `merge_with`'s
/// space decision so word grouping tracks the line contraction exactly.
fn merge_word_runs(mut left: Vec<WordSeg>, mut right: Vec<WordSeg>, space: bool) -> Vec<WordSeg> {
    if left.is_empty() {
        return right;
    }
    if right.is_empty() {
        return left;
    }
    if !space {
        let first = right.remove(0);
        left.last_mut().unwrap().absorb(&first);
    }
    left.extend(right);
    left
}

impl Cell {
    /// Length of the bottom edge (baseline advance) — `page_cell.h::length`.
    fn length(&self) -> f64 {
        ((self.rx1 - self.rx0).powi(2) + (self.ry1 - self.ry0).powi(2)).sqrt()
    }

    /// Running mean glyph advance over the whole accumulated cell.
    fn avg_char_width(&self) -> f64 {
        let n = self.text.chars().count();
        if n > 0 {
            self.length() / n as f64
        } else {
            0.0
        }
    }

    /// Distance from this cell's bottom-right corner to `other`'s bottom-left.
    fn gap(&self, other: &Cell) -> f64 {
        ((self.rx1 - other.rx0).powi(2) + (self.ry1 - other.ry0).powi(2)).sqrt()
    }

    /// `is_adjacent_to`: both the bottom-corner gap (`< eps0`) and the top-corner
    /// gap (`< eps1`) must be small. The vertical component keeps different
    /// baselines/lines from merging.
    fn adjacent(&self, other: &Cell, eps0: f64, eps1: f64) -> bool {
        let d0 = self.gap(other);
        let d1 = ((self.rx2 - other.rx3).powi(2) + (self.ry2 - other.ry3).powi(2)).sqrt();
        d0 < eps0 && d1 < eps1
    }

    /// Punctuation/space cells are bidi-neutral bridges.
    fn same_orientation(&self, other: &Cell) -> bool {
        self.ltr == other.ltr || is_punct_or_space(&self.text) || is_punct_or_space(&other.text)
    }

    /// `merge_with`: absorb `other` (which lies to this cell's right). Insert at
    /// most one separator space when the gap exceeds `delta`. RTL prepends.
    ///
    /// `euclidean` picks the gap measure: docling-parse uses the **Euclidean
    /// corner distance** `d0` (the same one `is_adjacent_to` uses). The pure-Rust
    /// parser produces clean advance boxes, so it uses `d0` to match docling
    /// byte-for-byte. pdfium's loose boxes overhang (an `f` extends left and
    /// overlaps its neighbour), which a Euclidean distance reads as a false
    /// positive gap and over-inserts spaces (`Self` → `Sel f`); that path keeps
    /// the **signed horizontal gap** instead.
    fn merge_with(&mut self, other: &Cell, delta: f64, euclidean: bool) {
        let gap = if euclidean {
            self.gap(other)
        } else {
            other.rx0 - self.rx1
        };
        let space = delta < gap;
        if !self.ltr || !other.ltr {
            if space {
                self.text.insert(0, ' ');
            }
            self.text = format!("{}{}", other.text, self.text);
            self.ltr = false;
            // RTL: `other` is logically first, so its words precede self's. The
            // junction is between other's last word and self's first.
            self.words =
                merge_word_runs(other.words.clone(), std::mem::take(&mut self.words), space);
        } else {
            if space {
                self.text.push(' ');
            }
            self.text.push_str(&other.text);
            self.ltr = true;
            self.words =
                merge_word_runs(std::mem::take(&mut self.words), other.words.clone(), space);
        }
        // Extend the right edge to `other`.
        self.rx1 = other.rx1;
        self.ry1 = other.ry1;
        self.rx2 = other.rx2;
        self.ry2 = other.ry2;
    }
}

/// `applicable_for_merge`: both active and same reading orientation. A different
/// font normally blocks the merge (keeps a bold label and its value as separate
/// line cells). On the clean-box parser path, **punctuation/space cells bridge
/// fonts** so a sentence period set in a separate punctuation font joins its word
/// instead of fragmenting (`العمل .` → `العمل.`); letters still enforce the font.
fn applicable(a: &Cell, b: &Cell, parser: bool) -> bool {
    if !a.active || !b.active {
        return false;
    }
    // A lone punctuation glyph (not a space) set in a separate punctuation font
    // bridges fonts so it joins its word — but only next to RTL text. In LTR a
    // different-font punctuation (e.g. a bold `:`) is a real run boundary docling
    // keeps spaced (`Laboratories :`); in Arabic the sentence period sits in a
    // Latin punctuation font yet attaches (`العمل.`). Parser path only.
    let lone_punct = |s: &str| {
        let mut ch = s.chars();
        matches!(ch.next(), Some(c) if c != ' ' && is_punct_or_space(&c.to_string()))
            && ch.next().is_none()
    };
    let punct_bridge =
        parser && ((lone_punct(&a.text) && !b.ltr) || (lone_punct(&b.text) && !a.ltr));
    let font_neutral = is_ligature(&a.text) || is_ligature(&b.text) || punct_bridge;
    if a.font != 0 && b.font != 0 && a.font != b.font && !font_neutral {
        return false;
    }
    a.same_orientation(b)
}

/// Left-to-right pass: `i` ascending accumulates cells to its right.
fn pass_ltr(cells: &mut [Cell], allow_reverse: bool, euclidean: bool) {
    for i in 0..cells.len() {
        if !cells[i].active {
            continue;
        }
        let mut j = i + 1;
        while j < cells.len() {
            if !applicable(&cells[i], &cells[j], euclidean) {
                break;
            }
            let i_lig = is_ligature(&cells[i].text) || cells[i].lig_carry;
            let j_lig = is_ligature(&cells[j].text) || cells[j].lig_carry;
            let d0 = cells[i].avg_char_width() * MERGE;
            let d1 = cells[i].avg_char_width() * MERGE_WITH_SPACE;
            let adj_d1 = d0 + if i_lig || j_lig { H_TOL } else { 0.0 };
            if cells[i].adjacent(&cells[j], d0, adj_d1) {
                let other = cells[j].clone();
                cells[i].merge_with(&other, d1, euclidean);
                cells[i].lig_carry = is_ligature(&other.text);
                cells[j].active = false;
                j += 1; // i keeps absorbing the next cell to its right
            } else if allow_reverse && cells[j].adjacent(&cells[i], d0, adj_d1) {
                let other = cells[i].clone();
                cells[j].merge_with(&other, d1, euclidean);
                cells[j].lig_carry = is_ligature(&other.text);
                cells[i].active = false;
                break; // i is consumed
            } else {
                break;
            }
        }
    }
}

/// Right-to-left pass: `i` descending; its immediate left neighbour `i-1`
/// absorbs it (then the outer loop continues leftward through the absorber).
fn pass_rtl(cells: &mut [Cell], euclidean: bool) {
    let n = cells.len();
    for k in 0..n {
        let i = n - 1 - k;
        if !cells[i].active || i == 0 {
            continue;
        }
        let j = i - 1;
        if !applicable(&cells[i], &cells[j], euclidean) {
            continue;
        }
        let i_lig = is_ligature(&cells[i].text) || cells[i].lig_carry;
        let j_lig = is_ligature(&cells[j].text) || cells[j].lig_carry;
        let d0 = cells[i].avg_char_width() * MERGE;
        let d1 = cells[i].avg_char_width() * MERGE_WITH_SPACE;
        let adj_d1 = d0 + if i_lig || j_lig { H_TOL } else { 0.0 };
        if cells[j].adjacent(&cells[i], d0, adj_d1) {
            let other = cells[i].clone();
            cells[j].merge_with(&other, d1, euclidean);
            cells[j].lig_carry = is_ligature(&other.text);
            cells[i].active = false;
        }
    }
}

fn contract(cells: &mut Vec<Cell>, euclidean: bool) {
    pass_ltr(cells, false, euclidean);
    cells.retain(|c| c.active);
    pass_rtl(cells, euclidean);
    cells.retain(|c| c.active);
    pass_ltr(cells, true, euclidean);
    cells.retain(|c| c.active);
}

/// Build per-glyph char cells from a page's glyph stream (shared by the line and
/// word paths): drop degenerate spaces, recompose ligatures, init word segments.
fn build_cells(glyphs: &[Glyph], euclidean: bool) -> Vec<Cell> {
    let mut cells: Vec<Cell> = Vec::new();
    for g in glyphs {
        // Use the loose box (uniform font ascent/descent + advance) so adjacent
        // glyphs share a top edge, matching docling-parse's `compute_rect`.
        if !g.ll.is_finite() {
            continue;
        }
        // Drop *degenerate* space glyphs (zero-width loose box): pdfium's generated
        // spaces get a zero-width box at the wrong baseline that breaks the
        // corner-distance adjacency. Without them the inter-word gap drives
        // `merge_with`'s space insertion. Spaces with a real width are kept (they
        // carry justified double-space information).
        if g.ch == ' ' && (g.lr - g.ll).abs() < 0.5 {
            continue;
        }
        // Recompose a ligature: pdfium decomposes one font glyph (Latin fi/ffi,
        // Arabic lam-alef) into several chars at the *same* loose box. Append them
        // into one cell so the contraction never inserts a space inside it.
        if let Some(last) = cells.last_mut() {
            if (last.rx0 - g.ll as f64).abs() < 0.5 && (last.rx1 - g.lr as f64).abs() < 0.5 {
                // Overprint duplicate: the *same* character re-stamped, offset by a
                // fraction of its width (a kashida/elongation segment re-drawn for
                // weight). docling-parse drops it; appending over-counts
                // (right_to_left_02's `قويووووة` vs `قويوووة`). Require a real offset
                // (> 0.1) so a ligature expansion — which decomposes one glyph into
                // several chars at the *identical* box (`ff`→`ff`, diff ≈ 0) — is still
                // recomposed; real doubled letters sit a full advance apart (> 0.5).
                let offset = (g.ll as f64 - last.rx0).abs();
                if euclidean && offset > 0.1 && last.text.ends_with(g.ch) {
                    continue;
                }
                last.text.push(g.ch);
                last.ltr = !is_right_to_left(&last.text);
                if let Some(w) = last.words.last_mut() {
                    w.text.push(g.ch);
                    w.extend(g.ll as f64, g.lb as f64, g.lr as f64, g.lt as f64);
                }
                continue;
            }
        }
        let text = g.ch.to_string();
        let ltr = !is_right_to_left(&text);
        cells.push(Cell {
            words: vec![WordSeg::from_glyph(
                text.clone(),
                g.ll as f64,
                g.lb as f64,
                g.lr as f64,
                g.lt as f64,
            )],
            text,
            rx0: g.ll as f64,
            ry0: g.lb as f64,
            rx1: g.lr as f64,
            ry1: g.lb as f64,
            rx2: g.lr as f64,
            ry2: g.lt as f64,
            rx3: g.ll as f64,
            ry3: g.lt as f64,
            ltr,
            active: true,
            lig_carry: false,
            font: g.font,
        });
    }
    cells
}

/// Build line cells from a page's glyph stream via the docling-parse contraction.
pub(crate) fn line_cells(glyphs: &[Glyph], page_h: f32, euclidean: bool) -> Vec<TextCell> {
    let mut cells = build_cells(glyphs, euclidean);
    contract(&mut cells, euclidean);
    cells
        .into_iter()
        .map(|c| {
            let l = c.rx0.min(c.rx1).min(c.rx2).min(c.rx3) as f32;
            let r = c.rx0.max(c.rx1).max(c.rx2).max(c.rx3) as f32;
            let top = c.ry0.max(c.ry1).max(c.ry2).max(c.ry3) as f32;
            let bot = c.ry0.min(c.ry1).min(c.ry2).min(c.ry3) as f32;
            TextCell {
                text: c.text,
                l,
                t: page_h - top,
                r,
                b: page_h - bot,
            }
        })
        .collect()
}

/// Build **word** cells from a page's glyph stream via the same contraction as
/// [`line_cells`], then split each line into its constituent words at exactly the
/// points where the contraction inserted a separator space. This reproduces
/// docling-parse's `word_cells` (the per-word tokens TableFormer matches against
/// table-grid cells), letting the pipeline drop pdfium's text path entirely
/// (roadmap item 6). Empty words (overprint-cleared) are skipped.
pub(crate) fn word_cells(glyphs: &[Glyph], page_h: f32, euclidean: bool) -> Vec<TextCell> {
    let mut cells = build_cells(glyphs, euclidean);
    contract(&mut cells, euclidean);
    let mut out = Vec::new();
    for c in cells {
        for w in c.words {
            if w.text.trim().is_empty() {
                continue;
            }
            out.push(TextCell {
                text: w.text,
                l: w.l as f32,
                t: page_h - w.t as f32,
                r: w.r as f32,
                b: page_h - w.b as f32,
            });
        }
    }
    out
}

fn is_rtl_char(c: char) -> bool {
    let ch = c as u32;
    (0x0600..=0x06FF).contains(&ch)
        || (0x0750..=0x077F).contains(&ch)
        || (0x08A0..=0x08FF).contains(&ch)
        || (0xFB50..=0xFDFF).contains(&ch)
        || (0xFE70..=0xFEFF).contains(&ch)
        || (0x0590..=0x05FF).contains(&ch)
        || (0xFB1D..=0xFB4F).contains(&ch)
        || (0x0700..=0x074F).contains(&ch)
        || (0x0780..=0x07BF).contains(&ch)
        || (0x07C0..=0x07FF).contains(&ch)
}

/// All codepoints are RTL-script (matches `string.h::is_right_to_left`).
fn is_right_to_left(s: &str) -> bool {
    !s.is_empty() && s.chars().all(is_rtl_char)
}

/// A single-codepoint punctuation/space cell (matches `string.h`).
fn is_punct_or_space(s: &str) -> bool {
    let mut chars = s.chars();
    let (Some(c), None) = (chars.next(), chars.next()) else {
        return false;
    };
    if matches!(
        c,
        ' ' | '\t'
            | '\n'
            | '\r'
            | '\u{0c}'
            | '\u{0b}'
            | '.'
            | ','
            | ';'
            | ':'
            | '!'
            | '?'
            | '('
            | ')'
            | '['
            | ']'
            | '{'
            | '}'
            | '\''
            | '"'
            | '`'
            | '\u{2018}'
            | '\u{2019}'
            | '\u{201c}'
            | '\u{201d}'
            | '-'
            | '\u{2013}'
            | '\u{2014}'
            | '_'
            | '/'
            | '\\'
            | '|'
            | '@'
            | '#'
            | '%'
            | '&'
            | '*'
            | '+'
            | '='
            | '<'
            | '>'
    ) {
        return true;
    }
    let ch = c as u32;
    (0x2000..=0x206F).contains(&ch)
        || (0x3000..=0x303F).contains(&ch)
        || (0xFE50..=0xFE6F).contains(&ch)
        || (0xFF00..=0xFF0F).contains(&ch)
        || (0xFF1A..=0xFF1F).contains(&ch)
        || (0xFF3B..=0xFF5E).contains(&ch)
}

/// Ligature glyph or its ASCII spelling (matches `string.h::is_ligature`).
fn is_ligature(s: &str) -> bool {
    matches!(s, "ff" | "fi" | "fl" | "ffi" | "ffl")
        || s.chars().any(|c| (0xFB00..=0xFB06).contains(&(c as u32)))
}