crw_pdf/types.rs
1//! Shared types used across the extraction and markdown pipelines.
2//!
3//! Centralises `TextItem`, `TextLine`, `PdfRect`, font-width / encoding
4//! type aliases, and the `ItemType` enum so that every module can import
5//! them from one place.
6
7use std::collections::HashMap;
8
9use crate::text_utils::should_join_items;
10
11/// Result tuple returned by page-level text extraction: text items, rectangles, line segments,
12/// and whether fonts with unresolvable gid-encoded glyphs were encountered.
13pub(crate) type PageExtraction = (Vec<TextItem>, Vec<PdfRect>, Vec<PdfLine>);
14
15// ── Font types (crate-internal) ──────────────────────────────────────
16
/// Encoding table for a single font: single-byte character code → Unicode character.
pub(crate) type FontEncodingMap = HashMap<u8, char>;
19
20/// All font encodings for a page
21pub(crate) type PageFontEncodings = HashMap<String, FontEncodingMap>;
22
/// Glyph-width metrics read from a PDF font dictionary.
#[derive(Debug, Clone)]
// NOTE(review): presumably some fields are not read by current callers yet —
// confirm and narrow this allow once they are.
#[allow(dead_code)]
pub(crate) struct FontWidthInfo {
    /// Per-glyph widths in font units, keyed by character code.
    pub(crate) widths: HashMap<u16, u16>,
    /// Fallback width for any glyph missing from `widths`.
    pub(crate) default_width: u16,
    /// Width of the space character (code 32), if known.
    pub(crate) space_width: u16,
    /// `true` for CID fonts, which use 2-byte character codes.
    pub(crate) is_cid: bool,
    /// Multiplier that converts font units into text-space units.
    ///
    /// * Type1 / TrueType: `0.001` (widths are expressed in 1000ths of an em).
    /// * Type3: `FontMatrix[0]` (e.g. `0.00048828125` for a 2048-unit grid).
    pub(crate) units_scale: f32,
    /// Writing mode: `0` = horizontal (the default), `1` = vertical.
    pub(crate) wmode: u8,
}
42
43/// All font width info for a page, keyed by font resource name
44pub(crate) type PageFontWidths = HashMap<String, FontWidthInfo>;
45
46// ── Public types ─────────────────────────────────────────────────────
47
/// Classifies what an extracted item represents.
///
/// [`ItemType::Text`] is the default variant.
#[derive(Debug, Clone, Default)]
pub enum ItemType {
    /// Ordinary text content.
    #[default]
    Text,
    /// Placeholder standing in for an image.
    Image,
    /// Hyperlink; the payload is the target URL.
    Link(String),
    /// Form field (the accompanying text is presumably formatted as
    /// "name: value" — TODO confirm against the producer).
    FormField,
}
61
/// Outcome of the layout-complexity analysis.
///
/// Lets callers judge whether the extracted markdown can be trusted or
/// whether the PDF should instead be sent through an OCR pipeline.
#[derive(Debug, Clone, Default)]
pub struct LayoutComplexity {
    /// Set when any page contains tables or multi-column text.
    pub is_complex: bool,
    /// Pages (1-indexed) on which table borders were detected (rect count > 6).
    pub pages_with_tables: Vec<u32>,
    /// Pages (1-indexed) on which two or more text columns were detected.
    pub pages_with_columns: Vec<u32>,
}
75
/// One straight line segment produced by the PDF path operators `m`/`l`/`S`.
#[derive(Debug, Clone)]
pub struct PdfLine {
    /// X coordinate of the first endpoint.
    pub x1: f32,
    /// Y coordinate of the first endpoint.
    pub y1: f32,
    /// X coordinate of the second endpoint.
    pub x2: f32,
    /// Y coordinate of the second endpoint.
    pub y2: f32,
    /// Page the segment belongs to (presumably 1-indexed like `TextItem::page` — TODO confirm).
    pub page: u32,
}
85
/// Rectangle emitted by a PDF `re` operator (a cell boundary, a border, and so on).
#[derive(Debug, Clone)]
pub struct PdfRect {
    /// X coordinate of the rectangle's origin.
    pub x: f32,
    /// Y coordinate of the rectangle's origin.
    pub y: f32,
    /// Horizontal extent.
    pub width: f32,
    /// Vertical extent.
    pub height: f32,
    /// Page the rectangle belongs to (presumably 1-indexed like `TextItem::page` — TODO confirm).
    pub page: u32,
}
95
96/// A text item with position information
97#[derive(Debug, Clone)]
98pub struct TextItem {
99 /// The text content
100 pub text: String,
101 /// X position on page
102 pub x: f32,
103 /// Y position on page (PDF coordinates, origin at bottom-left)
104 pub y: f32,
105 /// Width of text
106 pub width: f32,
107 /// Height (approximated from font size)
108 pub height: f32,
109 /// Font name
110 pub font: String,
111 /// Font size
112 pub font_size: f32,
113 /// Page number (1-indexed)
114 pub page: u32,
115 /// Whether the font is bold
116 pub is_bold: bool,
117 /// Whether the font is italic
118 pub is_italic: bool,
119 /// Type of item (text, image, link)
120 pub item_type: ItemType,
121 /// Marked Content ID from the content stream's BDC/BMC operator.
122 /// Used to link this item to the PDF structure tree for tagged PDFs.
123 pub mcid: Option<i64>,
124}
125
126/// A line of text (grouped text items)
127#[derive(Debug, Clone)]
128pub struct TextLine {
129 pub items: Vec<TextItem>,
130 pub y: f32,
131 pub page: u32,
132 /// Adaptive join threshold from page-level letter-spacing detection.
133 /// Default 0.10 for normal PDFs; higher for Canva-style PDFs.
134 #[doc(hidden)]
135 pub adaptive_threshold: f32,
136}
137
138impl TextLine {
139 pub fn text(&self) -> String {
140 self.text_with_formatting(false, false)
141 }
142
143 /// Get text with optional bold/italic markdown formatting
144 pub fn text_with_formatting(&self, format_bold: bool, format_italic: bool) -> String {
145 if !format_bold && !format_italic {
146 return self.text_plain();
147 }
148
149 let single_char_threshold = self.adaptive_threshold;
150
151 let mut result = String::new();
152 let mut current_bold = false;
153 let mut current_italic = false;
154
155 for (i, item) in self.items.iter().enumerate() {
156 let text = item.text.as_str();
157 let text_trimmed = text.trim();
158
159 // Skip empty items
160 if text_trimmed.is_empty() {
161 continue;
162 }
163
164 // Determine spacing
165 let needs_space = if i == 0 || result.is_empty() {
166 false
167 } else {
168 let prev_item = &self.items[i - 1];
169 self.needs_space_between(prev_item, item, &result, single_char_threshold)
170 };
171
172 // Preserve leading whitespace from the item text.
173 // Items like " means any person" have a leading space that indicates
174 // a word boundary. needs_space_between returns false for these (because
175 // space_already_exists), but we still need to emit the space since
176 // we push text_trimmed below (which strips it).
177 let has_leading_space = text.starts_with(' ');
178
179 // Check for style changes
180 let item_bold = format_bold && item.is_bold;
181 let item_italic = format_italic && item.is_italic;
182
183 // Close previous styles if they change
184 if current_italic && !item_italic {
185 result.push('*');
186 current_italic = false;
187 }
188 if current_bold && !item_bold {
189 result.push_str("**");
190 current_bold = false;
191 }
192
193 // Add space: either from spacing logic or preserved from item text
194 if needs_space || (has_leading_space && !result.is_empty() && !result.ends_with(' ')) {
195 result.push(' ');
196 }
197
198 // Open new styles
199 if item_bold && !current_bold {
200 result.push_str("**");
201 current_bold = true;
202 }
203 if item_italic && !current_italic {
204 result.push('*');
205 current_italic = true;
206 }
207
208 result.push_str(text_trimmed);
209 }
210
211 // Close any remaining open styles
212 if current_italic {
213 result.push('*');
214 }
215 if current_bold {
216 result.push_str("**");
217 }
218
219 result
220 }
221
222 /// Get plain text without formatting
223 fn text_plain(&self) -> String {
224 let single_char_threshold = self.adaptive_threshold;
225
226 let mut result = String::new();
227 for (i, item) in self.items.iter().enumerate() {
228 let text = item.text.as_str();
229 if i == 0 {
230 result.push_str(text);
231 } else {
232 let prev_item = &self.items[i - 1];
233 if self.needs_space_between(prev_item, item, &result, single_char_threshold) {
234 result.push(' ');
235 }
236 result.push_str(text);
237 }
238 }
239 result
240 }
241
242 /// Determine if a space is needed between two items
243 fn needs_space_between(
244 &self,
245 prev_item: &TextItem,
246 item: &TextItem,
247 result: &str,
248 single_char_threshold: f32,
249 ) -> bool {
250 let text = item.text.as_str();
251
252 // Don't add space before/after hyphens for hyphenated words
253 let prev_ends_with_hyphen = result.ends_with('-');
254 let curr_is_hyphen = text.trim() == "-";
255 let curr_starts_with_hyphen = text.starts_with('-');
256
257 // Detect subscript/superscript: smaller font size and/or Y offset
258 let font_ratio = item.font_size / prev_item.font_size;
259 let reverse_font_ratio = prev_item.font_size / item.font_size;
260 let y_diff = (item.y - prev_item.y).abs();
261
262 let is_sub_super = font_ratio < 0.85 && y_diff > 1.0;
263 let was_sub_super = reverse_font_ratio < 0.85 && y_diff > 1.0;
264
265 // Use position-based spacing detection
266 let should_join = should_join_items(prev_item, item, single_char_threshold);
267
268 // Check if space already exists
269 let prev_ends_with_space = result.ends_with(' ');
270 let curr_starts_with_space = text.starts_with(' ');
271 let space_already_exists = prev_ends_with_space || curr_starts_with_space;
272
273 // Add space unless one of these conditions applies
274 !(prev_ends_with_hyphen
275 || curr_is_hyphen
276 || curr_starts_with_hyphen
277 || is_sub_super
278 || was_sub_super
279 || should_join
280 || space_already_exists)
281 }
282}