//! Threshold constants for PDF-to-Markdown spatial analysis.
/// Baseline Y tolerance as a fraction of the smaller font size for same-line grouping.
pub const BASELINE_Y_TOLERANCE_FRACTION: f32 = 0.5;
/// Multiplier for baseline line spacing (Q1) to detect paragraph breaks.
pub const PARAGRAPH_GAP_MULTIPLIER: f32 = 1.8;
/// Font size change threshold (in points) to trigger a paragraph break.
pub const FONT_SIZE_CHANGE_THRESHOLD: f32 = 1.5;
/// Left indent change threshold (in points) to trigger a paragraph break.
pub const LEFT_INDENT_CHANGE_THRESHOLD: f32 = 10.0;
/// Maximum word count for a paragraph to qualify as a heading.
pub const MAX_HEADING_WORD_COUNT: usize = 20;
/// Maximum number of lines for a paragraph to be classified as a list item.
pub const MAX_LIST_ITEM_LINES: usize = 8;
/// Maximum distance multiplier relative to average inter-cluster gap for heading assignment.
pub const MAX_HEADING_DISTANCE_MULTIPLIER: f32 = 2.0;
/// Minimum ratio of heading font size to body font size (heading must be this much larger).
/// 1.15 captures LaTeX \subsection (12pt vs 10pt body = 1.2 ratio).
pub const MIN_HEADING_FONT_RATIO: f32 = 1.15;
/// Minimum absolute font-size difference (in points) between heading and body.
/// 1.5pt captures academic sub-headings (11.5pt vs 10pt body).
pub const MIN_HEADING_FONT_GAP: f32 = 1.5;
/// Fraction of page height to exclude from top (page headers).
pub const PAGE_TOP_MARGIN_FRACTION: f32 = 0.06;
/// Fraction of page height to exclude from bottom (page footers/numbers).
pub const PAGE_BOTTOM_MARGIN_FRACTION: f32 = 0.05;
/// Minimum font size (in points) for a segment to be included in analysis.
/// Segments below this size are likely artifacts (embedded images, symbols, noise).
pub const MIN_FONT_SIZE: f32 = 4.0;
/// Maximum word count for a bold paragraph to be promoted to a section heading.
pub const MAX_BOLD_HEADING_WORD_COUNT: usize = 15;
/// Fraction of the maximum right edge that a line must reach to be considered "full"
/// (used for dehyphenation to avoid false joins on short/indented lines).
pub const FULL_LINE_FRACTION: f32 = 0.85;
/// Y-tolerance for grouping layout regions into the same row (fraction of page height).
/// Regions with vertical centers within this fraction are considered same-row and sorted left-to-right.
pub const REGION_SAME_ROW_FRACTION: f32 = 0.02;