pdf2md 0.1.0

PDF → Markdown extractor with figure rasterization, table & banner detection. Built on pdfium-render.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
//! Header/footer (banner) detection and stripping.

use std::collections::{HashMap, HashSet};

use super::element::{PageElement, TextElement};
use super::zone::{BBox, Bounded};

/// Compute the smallest bounding box enclosing every element in `elements`.
///
/// The result takes the minimum `left`, maximum `right`, maximum `top` and
/// minimum `bottom` over all element bboxes (so `top` is the larger Y value).
/// An empty slice yields `BBox::empty()`.
pub(super) fn bbox_of_elements(elements: &[PageElement]) -> BBox {
    let Some((first, rest)) = elements.split_first() else {
        return BBox::empty();
    };
    // Seed the union with the first element, then widen it with each of the
    // remaining ones.
    rest.iter().fold(first.bbox(), |union, el| {
        let next = el.bbox();
        BBox {
            left: union.left.min(next.left),
            right: union.right.max(next.right),
            top: union.top.max(next.top),
            bottom: union.bottom.min(next.bottom),
        }
    })
}

// ---- header/footer detection ------------------------------------------------

/// Fraction of the page bbox height, measured inward from the top edge and
/// from the bottom edge, that is treated as the banner band when detecting
/// and stripping headers/footers.
pub(super) const BANNER_BAND_FRACTION: f32 = 0.10;
/// Minimum number of pages a document must have before banner detection runs
/// at all; it is also the floor for the "appears on this many pages"
/// threshold used by `detect_banners`.
const BANNER_MIN_PAGES: usize = 3;
/// Minimum token length (after digit-run normalization) considered for
/// per-token banner detection. 4 chars filters out filler words like "of",
/// "ze", "a" while keeping branding tokens such as "PŘEDPIS" or "HLAVA".
const BANNER_MIN_TOKEN_LEN: usize = 4;

/// Detect repeating page banners (headers and footers).
///
/// For every page, the text elements whose vertical centre falls in the top
/// or bottom `BANNER_BAND_FRACTION` of the page bbox are grouped into lines;
/// each line is normalized (whitespace collapsed, ASCII digit runs replaced
/// with `\d`). Three banner sets are returned, in this order:
///   * `lines` — normalized full-line strings seen on `>= 25 %` of pages
///   * `tokens` — normalized whitespace-separated tokens (≥ 4 chars after
///     digit normalization) seen on `>= 25 %` of pages
///   * `image_hashes` — `figure.hash` of images whose vertical centre lies in
///     the top or bottom band on `>= 25 %` of pages. Catches repeating brand
///     logos placed in page headers/footers.
///
/// "`>= 25 %` of pages" is `max(BANNER_MIN_PAGES, ceil(n_pages / 4))` pages.
/// Every tally counts an item at most once per page, however many times it
/// recurs inside that page's bands, so a single page cannot promote a line,
/// token or image to banner status on its own.
///
/// The token set catches mirrored running headers (e.g. L14's
/// "HLAVA n PŘEDPIS L14" / "PŘEDPIS L14 HLAVA n") that the line-level set
/// misses because the two layouts are not equal as whole lines.
///
/// Documents with fewer than `BANNER_MIN_PAGES` pages yield three empty sets.
///
/// # Panics
///
/// Panics if `per_page_bbox` has fewer entries than `per_page_elements`.
pub(super) fn detect_banners(
    per_page_elements: &[Vec<PageElement>],
    per_page_bbox: &[BBox],
) -> (HashSet<String>, HashSet<String>, HashSet<String>) {
    // Keep only the keys whose page count reaches `threshold`.
    fn frequent(counts: HashMap<String, usize>, threshold: usize) -> HashSet<String> {
        counts
            .into_iter()
            .filter(|(_, pages)| *pages >= threshold)
            .map(|(key, _)| key)
            .collect()
    }

    let n_pages = per_page_elements.len();
    if n_pages < BANNER_MIN_PAGES {
        return (HashSet::new(), HashSet::new(), HashSet::new());
    }
    // Threshold: line/token/image must appear on ≥ 25 % of pages (or
    // BANNER_MIN_PAGES, whichever is greater). The historical 50 % was too
    // strict for PDFs that alternate header layouts on even/odd pages.
    let threshold = std::cmp::max(BANNER_MIN_PAGES, n_pages.div_ceil(4));

    let mut line_counts: HashMap<String, usize> = HashMap::new();
    let mut token_counts: HashMap<String, usize> = HashMap::new();
    let mut image_counts: HashMap<String, usize> = HashMap::new();
    for (page_idx, elements) in per_page_elements.iter().enumerate() {
        let bbox = per_page_bbox[page_idx];

        // Per-page dedup for lines *and* tokens: the counts are "number of
        // pages", so several band lines on one page that normalize to the
        // same key (e.g. a column of bare numbers, all `\d`) must count once.
        let mut page_lines: HashSet<String> = HashSet::new();
        let mut page_tokens: HashSet<String> = HashSet::new();
        for line in band_lines(elements, &bbox) {
            let normalized = normalize_banner(&line);
            if normalized.is_empty() {
                continue;
            }
            page_tokens.extend(tokenize_banner(&normalized));
            page_lines.insert(normalized);
        }
        for line in page_lines {
            *line_counts.entry(line).or_insert(0) += 1;
        }
        for tok in page_tokens {
            *token_counts.entry(tok).or_insert(0) += 1;
        }

        // Same dedup per page for image figures: a logo appearing twice
        // on the same page (e.g. header + footer) still only counts as
        // one occurrence for the across-page tally.
        let mut page_image_hashes: HashSet<String> = HashSet::new();
        let height = bbox.height();
        if height > 0.0 {
            let top_cutoff = bbox.top - height * BANNER_BAND_FRACTION;
            let bottom_cutoff = bbox.bottom + height * BANNER_BAND_FRACTION;
            for el in elements {
                if let PageElement::Image(im) = el {
                    let cy = (im.top + im.bottom) / 2.0;
                    if cy >= top_cutoff || cy <= bottom_cutoff {
                        page_image_hashes.insert(im.figure.hash.clone());
                    }
                }
            }
        }
        for hash in page_image_hashes {
            *image_counts.entry(hash).or_insert(0) += 1;
        }
    }

    (
        frequent(line_counts, threshold),
        frequent(token_counts, threshold),
        frequent(image_counts, threshold),
    )
}

/// Collect the text found in a page's top and bottom banner bands as one
/// string per visual line.
///
/// A text element is in a band when its vertical centre is within
/// `BANNER_BAND_FRACTION` of the bbox height from the top or bottom edge.
/// Band texts are ordered top-first, then left-to-right, and consecutive
/// texts are merged into one line while their centre stays within half a
/// line height (at least 0.5 units) of the line's first element. Each line's
/// trimmed, non-empty fragments are joined with single spaces; lines that end
/// up empty are omitted. A bbox with non-positive height yields no lines.
fn band_lines(elements: &[PageElement], bbox: &BBox) -> Vec<String> {
    let page_height = bbox.height();
    if page_height <= 0.0 {
        return Vec::new();
    }
    let top_cutoff = bbox.top - page_height * BANNER_BAND_FRACTION;
    let bottom_cutoff = bbox.bottom + page_height * BANNER_BAND_FRACTION;
    let in_band = |cy: f32| cy >= top_cutoff || cy <= bottom_cutoff;

    let mut band_texts: Vec<&TextElement> = Vec::new();
    for el in elements {
        if let PageElement::Text(t) = el {
            if in_band(t.y_center()) {
                band_texts.push(t);
            }
        }
    }
    if band_texts.is_empty() {
        return Vec::new();
    }

    // Y descending (top-first); ties broken by X ascending.
    band_texts.sort_by(|a, b| {
        let by_y = b
            .y_center()
            .partial_cmp(&a.y_center())
            .unwrap_or(std::cmp::Ordering::Equal);
        let by_x = a
            .left
            .partial_cmp(&b.left)
            .unwrap_or(std::cmp::Ordering::Equal);
        by_y.then(by_x)
    });

    // `anchor_y` is the centre of the element that opened the last group; it
    // is `Some` exactly when `groups` is non-empty.
    let mut groups: Vec<Vec<&TextElement>> = Vec::new();
    let mut anchor_y: Option<f32> = None;
    for t in band_texts {
        let cy = t.y_center();
        let tolerance = (t.top - t.bottom).abs().max(1.0) * 0.5;
        let joins_open_line = matches!(anchor_y, Some(y) if (cy - y).abs() <= tolerance);
        if joins_open_line {
            if let Some(open) = groups.last_mut() {
                open.push(t);
            }
        } else {
            anchor_y = Some(cy);
            groups.push(vec![t]);
        }
    }

    let mut rendered: Vec<String> = Vec::with_capacity(groups.len());
    for mut group in groups {
        group.sort_by(|a, b| {
            a.left
                .partial_cmp(&b.left)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        let fragments: Vec<&str> = group
            .iter()
            .map(|t| t.text.trim())
            .filter(|s| !s.is_empty())
            .collect();
        // Every fragment is non-empty, so the joined line is empty exactly
        // when there are no fragments.
        if !fragments.is_empty() {
            rendered.push(fragments.join(" "));
        }
    }
    rendered
}

/// Produce the canonical banner key for a line of text.
///
/// Leading/trailing whitespace is removed, every internal whitespace run
/// becomes a single space, and each maximal run of ASCII digits is replaced
/// by the two-character literal `\d` — so "Page 1" and "Page 12" share the
/// key `Page \d`. Blank input yields an empty string.
pub(super) fn normalize_banner(s: &str) -> String {
    // Replace each run of ASCII digits inside one whitespace-free word.
    fn mask_digit_runs(word: &str) -> String {
        let mut masked = String::with_capacity(word.len());
        let mut in_digit_run = false;
        for ch in word.chars() {
            let is_digit = ch.is_ascii_digit();
            if !is_digit {
                masked.push(ch);
            } else if !in_digit_run {
                masked.push_str("\\d");
            }
            in_digit_run = is_digit;
        }
        masked
    }

    // Whitespace always terminates a digit run, so masking word by word gives
    // the same result as masking the whole string and splitting afterwards.
    s.split_whitespace()
        .map(mask_digit_runs)
        .collect::<Vec<_>>()
        .join(" ")
}

/// Extract the tokens of a normalized banner line that are tracked
/// individually.
///
/// The line is split on whitespace and only tokens with at least
/// `BANNER_MIN_TOKEN_LEN` characters (counted as `char`s, not bytes) are
/// returned, in their original order. The caller is expected to pass a string
/// that already went through `normalize_banner`, i.e. with digit runs
/// collapsed to `\d`.
fn tokenize_banner(normalized: &str) -> Vec<String> {
    let mut kept: Vec<String> = Vec::new();
    for word in normalized.split_whitespace() {
        if word.chars().count() >= BANNER_MIN_TOKEN_LEN {
            kept.push(word.to_owned());
        }
    }
    kept
}

/// Remove detected banner content from one page's elements.
///
/// Only elements whose vertical centre lies in the top or bottom
/// `BANNER_BAND_FRACTION` of `bbox` are candidates:
///   * an image is dropped when its `figure.hash` is in
///     `banner_image_hashes`;
///   * text is regrouped into lines (same grouping as `band_lines`) and every
///     text element of a line is dropped when the line's normalized form
///     either is in `banner_lines`, or yields at least one token of
///     `BANNER_MIN_TOKEN_LEN` or more characters and all such tokens are in
///     `banner_tokens`. Shorter tokens (e.g. a lone `\d`) are ignored for the
///     coverage check, but cannot make a line match on their own.
///
/// Elements are returned in their original order. The input is returned
/// unchanged when all three banner sets are empty or the bbox height is not
/// positive.
pub(super) fn strip_banners(
    elements: Vec<PageElement>,
    bbox: BBox,
    banner_lines: &HashSet<String>,
    banner_tokens: &HashSet<String>,
    banner_image_hashes: &HashSet<String>,
) -> Vec<PageElement> {
    let nothing_to_strip =
        banner_lines.is_empty() && banner_tokens.is_empty() && banner_image_hashes.is_empty();
    if nothing_to_strip {
        return elements;
    }
    let page_height = bbox.height();
    if page_height <= 0.0 {
        return elements;
    }
    let top_cutoff = bbox.top - page_height * BANNER_BAND_FRACTION;
    let bottom_cutoff = bbox.bottom + page_height * BANNER_BAND_FRACTION;
    let in_band = |cy: f32| cy >= top_cutoff || cy <= bottom_cutoff;

    // One walk over the page: mark in-band banner images for removal and
    // collect in-band texts together with their position in `elements`.
    let mut doomed: HashSet<usize> = HashSet::new();
    let mut band_texts: Vec<(usize, &TextElement)> = Vec::new();
    for (idx, el) in elements.iter().enumerate() {
        match el {
            PageElement::Image(im) => {
                let cy = (im.top + im.bottom) / 2.0;
                if in_band(cy) && banner_image_hashes.contains(&im.figure.hash) {
                    doomed.insert(idx);
                }
            }
            PageElement::Text(t) => {
                if in_band(t.y_center()) {
                    band_texts.push((idx, t));
                }
            }
            _ => {}
        }
    }
    if band_texts.is_empty() && doomed.is_empty() {
        return elements;
    }

    // Y descending (top-first); ties broken by X ascending.
    band_texts.sort_by(|a, b| {
        let by_y = b
            .1
            .y_center()
            .partial_cmp(&a.1.y_center())
            .unwrap_or(std::cmp::Ordering::Equal);
        let by_x = a
            .1
            .left
            .partial_cmp(&b.1.left)
            .unwrap_or(std::cmp::Ordering::Equal);
        by_y.then(by_x)
    });

    // `anchor_y` is the centre of the element that opened the last group; it
    // is `Some` exactly when `groups` is non-empty.
    let mut groups: Vec<Vec<(usize, &TextElement)>> = Vec::new();
    let mut anchor_y: Option<f32> = None;
    for entry in band_texts {
        let text = entry.1;
        let cy = text.y_center();
        let tolerance = (text.top - text.bottom).abs().max(1.0) * 0.5;
        let joins_open_line = matches!(anchor_y, Some(y) if (cy - y).abs() <= tolerance);
        if joins_open_line {
            if let Some(open) = groups.last_mut() {
                open.push(entry);
            }
        } else {
            anchor_y = Some(cy);
            groups.push(vec![entry]);
        }
    }

    // Token coverage: needs at least one substantive token so that a line of
    // only short tokens ("\d \d \d") does not vacuously match, and every
    // substantive token must be a known banner token.
    let covered_by_tokens = |key: &str| -> bool {
        if key.is_empty() || banner_tokens.is_empty() {
            return false;
        }
        let toks = tokenize_banner(key);
        !toks.is_empty() && toks.iter().all(|tok| banner_tokens.contains(tok))
    };

    for mut group in groups {
        group.sort_by(|a, b| {
            a.1.left
                .partial_cmp(&b.1.left)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        let fragments: Vec<&str> = group
            .iter()
            .map(|(_, t)| t.text.trim())
            .filter(|s| !s.is_empty())
            .collect();
        let key = normalize_banner(&fragments.join(" "));
        if banner_lines.contains(&key) || covered_by_tokens(&key) {
            doomed.extend(group.iter().map(|(idx, _)| *idx));
        }
    }

    elements
        .into_iter()
        .enumerate()
        .filter(|(idx, _)| !doomed.contains(idx))
        .map(|(_, el)| el)
        .collect()
}

/// Fraction of page height (from the bottom) inside which a bare numeric
/// text run is always treated as a page-number footer and dropped.
const FOOTER_DIGIT_BAND_FRACTION: f32 = 0.08;

/// Remove bare page-number text from the bottom of a page.
///
/// A text element is dropped when its vertical centre is at or below
/// `bbox.bottom + height * FOOTER_DIGIT_BAND_FRACTION` and its trimmed text
/// consists of one to three ASCII digits and nothing else. All other
/// elements, including non-text ones, are kept in their original order.
/// Unlike `strip_banners`, no cross-page evidence is required. A bbox with
/// non-positive height leaves the input untouched.
pub(super) fn strip_bottom_band_bare_digits(
    elements: Vec<PageElement>,
    bbox: BBox,
) -> Vec<PageElement> {
    let page_height = bbox.height();
    if page_height <= 0.0 {
        return elements;
    }
    let footer_ceiling = bbox.bottom + page_height * FOOTER_DIGIT_BAND_FRACTION;

    let mut kept = elements;
    kept.retain(|el| {
        let PageElement::Text(t) = el else {
            return true;
        };
        if t.y_center() > footer_ceiling {
            return true;
        }
        let content = t.text.trim();
        // Any non-ASCII character contributes only bytes >= 0x80, so checking
        // bytes is equivalent to checking chars for ASCII digits.
        let looks_like_page_number =
            (1..=3).contains(&content.len()) && content.bytes().all(|b| b.is_ascii_digit());
        !looks_like_page_number
    });
    kept
}

// Unit tests for this module are kept in the separate file named by the
// `#[path]` attribute below and are compiled only for test builds.
#[cfg(test)]
#[path = "banner_tests.rs"]
mod tests;