crw_pdf/
types.rs

1//! Shared types used across the extraction and markdown pipelines.
2//!
3//! Centralises `TextItem`, `TextLine`, `PdfRect`, font-width / encoding
4//! type aliases, and the `ItemType` enum so that every module can import
5//! them from one place.
6
7use std::collections::HashMap;
8
9use crate::text_utils::should_join_items;
10
/// Result tuple returned by page-level text extraction, in order: text items, rectangles,
/// and line segments.
// NOTE(review): this comment previously also promised a flag for fonts with unresolvable
// gid-encoded glyphs, but the alias is a 3-tuple with no such element — confirm whether that
// flag was dropped on purpose or is reported through another channel.
pub(crate) type PageExtraction = (Vec<TextItem>, Vec<PdfRect>, Vec<PdfLine>);
14
15// ── Font types (crate-internal) ──────────────────────────────────────
16
/// Font encoding map: maps single-byte character codes to Unicode characters.
pub(crate) type FontEncodingMap = HashMap<u8, char>;

/// All font encodings for a page: one [`FontEncodingMap`] per `String` key.
// NOTE(review): the key is presumably the font resource name, as for `PageFontWidths` — confirm.
pub(crate) type PageFontEncodings = HashMap<String, FontEncodingMap>;
22
/// Font width information extracted from PDF font dictionaries.
///
/// Widths are stored in font units; multiply by [`FontWidthInfo::units_scale`] to obtain
/// text space units.
#[derive(Debug, Clone)]
// NOTE(review): blanket `dead_code` allow with no stated reason — presumably some fields are
// not read yet. Confirm, and narrow the allow to the affected fields if so.
#[allow(dead_code)]
pub(crate) struct FontWidthInfo {
    /// Glyph widths: maps character code to width in font units
    pub(crate) widths: HashMap<u16, u16>,
    /// Default width for glyphs not in the widths table
    pub(crate) default_width: u16,
    /// Width of the space character (code 32) if known.
    // NOTE(review): this is a plain `u16`, not an `Option`, so "unknown" must be represented
    // by whatever fallback the code that builds this struct stores — confirm which value.
    pub(crate) space_width: u16,
    /// Whether this is a CID font (2-byte character codes)
    pub(crate) is_cid: bool,
    /// Scale factor to convert font units to text space units.
    /// For Type1/TrueType: 0.001 (widths in 1000ths of em)
    /// For Type3: FontMatrix[0] (e.g., 0.00048828125 for 2048-unit grid)
    pub(crate) units_scale: f32,
    /// Writing mode: 0 = horizontal (default), 1 = vertical
    pub(crate) wmode: u8,
}
42
/// All font width info for a page: one [`FontWidthInfo`] per font, keyed by font resource name.
pub(crate) type PageFontWidths = HashMap<String, FontWidthInfo>;
45
46// ── Public types ─────────────────────────────────────────────────────
47
/// Type of extracted item.
///
/// The default variant is [`ItemType::Text`].
#[derive(Debug, Clone, Default)]
pub enum ItemType {
    /// Regular text content
    #[default]
    Text,
    /// Image placeholder
    Image,
    /// Hyperlink; the payload is the URL.
    Link(String),
    /// Form field. The variant itself carries no data.
    // NOTE(review): the previous doc described this as "(name: value)"; presumably that
    // string is stored in the owning item's text rather than in the variant — confirm.
    FormField,
}
61
/// Layout complexity analysis result.
///
/// Callers can use this to decide whether the extracted markdown is reliable
/// or whether the PDF should be routed to an OCR pipeline instead.
///
/// The derived `Default` is the "simple" state: `is_complex` is `false` and both page
/// lists are empty.
#[derive(Debug, Clone, Default)]
pub struct LayoutComplexity {
    /// True if any page has tables or multi-column text.
    pub is_complex: bool,
    /// 1-indexed pages where table borders were detected (rect count > 6).
    pub pages_with_tables: Vec<u32>,
    /// 1-indexed pages where 2+ text columns were detected.
    pub pages_with_columns: Vec<u32>,
}
75
/// A line segment from PDF path operators (`m`/`l`/`S`).
///
/// The segment runs from (`x1`, `y1`) to (`x2`, `y2`).
#[derive(Debug, Clone)]
pub struct PdfLine {
    /// X coordinate of the first endpoint.
    pub x1: f32,
    /// Y coordinate of the first endpoint.
    pub y1: f32,
    /// X coordinate of the second endpoint.
    pub x2: f32,
    /// Y coordinate of the second endpoint.
    pub y2: f32,
    /// Page the segment was found on.
    // NOTE(review): presumably 1-indexed like `TextItem::page` — confirm.
    pub page: u32,
}
85
/// A rectangle from a PDF `re` operator (cell boundary, border, etc.)
#[derive(Debug, Clone)]
pub struct PdfRect {
    /// X coordinate of the rectangle's origin corner.
    // NOTE(review): the `re` operator takes the lower-left corner when width/height are
    // positive; whether values are normalised before being stored here is not visible —
    // confirm.
    pub x: f32,
    /// Y coordinate of the rectangle's origin corner.
    pub y: f32,
    /// Horizontal extent of the rectangle.
    pub width: f32,
    /// Vertical extent of the rectangle.
    pub height: f32,
    /// Page the rectangle was found on.
    // NOTE(review): presumably 1-indexed like `TextItem::page` — confirm.
    pub page: u32,
}
95
/// A text item with position information
#[derive(Debug, Clone)]
pub struct TextItem {
    /// The text content
    pub text: String,
    /// X position on page
    pub x: f32,
    /// Y position on page (PDF coordinates, origin at bottom-left)
    pub y: f32,
    /// Width of text
    pub width: f32,
    /// Height (approximated from font size)
    pub height: f32,
    /// Font name
    pub font: String,
    /// Font size
    pub font_size: f32,
    /// Page number (1-indexed)
    pub page: u32,
    /// Whether the font is bold
    pub is_bold: bool,
    /// Whether the font is italic
    pub is_italic: bool,
    /// Type of item: text, image, link, or form field (see [`ItemType`]).
    pub item_type: ItemType,
    /// Marked Content ID from the content stream's BDC/BMC operator.
    /// Used to link this item to the PDF structure tree for tagged PDFs.
    /// `None` when the item has no associated MCID.
    pub mcid: Option<i64>,
}
125
/// A line of text (grouped text items)
#[derive(Debug, Clone)]
pub struct TextLine {
    /// The text items that make up this line, in the order they are rendered to text.
    pub items: Vec<TextItem>,
    /// Y position of the line.
    // NOTE(review): presumably PDF coordinates (origin at bottom-left) like `TextItem::y` —
    // confirm.
    pub y: f32,
    /// Page the line belongs to.
    // NOTE(review): presumably 1-indexed like `TextItem::page` — confirm.
    pub page: u32,
    /// Adaptive join threshold from page-level letter-spacing detection.
    /// Default 0.10 for normal PDFs; higher for Canva-style PDFs.
    /// Passed to `should_join_items` when deciding whether adjacent items need a space.
    #[doc(hidden)]
    pub adaptive_threshold: f32,
}
137
138impl TextLine {
139    pub fn text(&self) -> String {
140        self.text_with_formatting(false, false)
141    }
142
143    /// Get text with optional bold/italic markdown formatting
144    pub fn text_with_formatting(&self, format_bold: bool, format_italic: bool) -> String {
145        if !format_bold && !format_italic {
146            return self.text_plain();
147        }
148
149        let single_char_threshold = self.adaptive_threshold;
150
151        let mut result = String::new();
152        let mut current_bold = false;
153        let mut current_italic = false;
154
155        for (i, item) in self.items.iter().enumerate() {
156            let text = item.text.as_str();
157            let text_trimmed = text.trim();
158
159            // Skip empty items
160            if text_trimmed.is_empty() {
161                continue;
162            }
163
164            // Determine spacing
165            let needs_space = if i == 0 || result.is_empty() {
166                false
167            } else {
168                let prev_item = &self.items[i - 1];
169                self.needs_space_between(prev_item, item, &result, single_char_threshold)
170            };
171
172            // Preserve leading whitespace from the item text.
173            // Items like " means any person" have a leading space that indicates
174            // a word boundary. needs_space_between returns false for these (because
175            // space_already_exists), but we still need to emit the space since
176            // we push text_trimmed below (which strips it).
177            let has_leading_space = text.starts_with(' ');
178
179            // Check for style changes
180            let item_bold = format_bold && item.is_bold;
181            let item_italic = format_italic && item.is_italic;
182
183            // Close previous styles if they change
184            if current_italic && !item_italic {
185                result.push('*');
186                current_italic = false;
187            }
188            if current_bold && !item_bold {
189                result.push_str("**");
190                current_bold = false;
191            }
192
193            // Add space: either from spacing logic or preserved from item text
194            if needs_space || (has_leading_space && !result.is_empty() && !result.ends_with(' ')) {
195                result.push(' ');
196            }
197
198            // Open new styles
199            if item_bold && !current_bold {
200                result.push_str("**");
201                current_bold = true;
202            }
203            if item_italic && !current_italic {
204                result.push('*');
205                current_italic = true;
206            }
207
208            result.push_str(text_trimmed);
209        }
210
211        // Close any remaining open styles
212        if current_italic {
213            result.push('*');
214        }
215        if current_bold {
216            result.push_str("**");
217        }
218
219        result
220    }
221
222    /// Get plain text without formatting
223    fn text_plain(&self) -> String {
224        let single_char_threshold = self.adaptive_threshold;
225
226        let mut result = String::new();
227        for (i, item) in self.items.iter().enumerate() {
228            let text = item.text.as_str();
229            if i == 0 {
230                result.push_str(text);
231            } else {
232                let prev_item = &self.items[i - 1];
233                if self.needs_space_between(prev_item, item, &result, single_char_threshold) {
234                    result.push(' ');
235                }
236                result.push_str(text);
237            }
238        }
239        result
240    }
241
242    /// Determine if a space is needed between two items
243    fn needs_space_between(
244        &self,
245        prev_item: &TextItem,
246        item: &TextItem,
247        result: &str,
248        single_char_threshold: f32,
249    ) -> bool {
250        let text = item.text.as_str();
251
252        // Don't add space before/after hyphens for hyphenated words
253        let prev_ends_with_hyphen = result.ends_with('-');
254        let curr_is_hyphen = text.trim() == "-";
255        let curr_starts_with_hyphen = text.starts_with('-');
256
257        // Detect subscript/superscript: smaller font size and/or Y offset
258        let font_ratio = item.font_size / prev_item.font_size;
259        let reverse_font_ratio = prev_item.font_size / item.font_size;
260        let y_diff = (item.y - prev_item.y).abs();
261
262        let is_sub_super = font_ratio < 0.85 && y_diff > 1.0;
263        let was_sub_super = reverse_font_ratio < 0.85 && y_diff > 1.0;
264
265        // Use position-based spacing detection
266        let should_join = should_join_items(prev_item, item, single_char_threshold);
267
268        // Check if space already exists
269        let prev_ends_with_space = result.ends_with(' ');
270        let curr_starts_with_space = text.starts_with(' ');
271        let space_already_exists = prev_ends_with_space || curr_starts_with_space;
272
273        // Add space unless one of these conditions applies
274        !(prev_ends_with_hyphen
275            || curr_is_hyphen
276            || curr_starts_with_hyphen
277            || is_sub_super
278            || was_sub_super
279            || should_join
280            || space_already_exists)
281    }
282}
crw_pdf/types.rs

crw_pdf/
types.rs