oxidize_pdf/text/extraction.rs
1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::{PdfDictionary, PdfObject};
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Knobs that control how text is pulled out of a page.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Keep the original layout (spacing and positioning).
    pub preserve_layout: bool,
    /// Smallest gap, in text space units, that is turned into a space
    /// character.
    pub space_threshold: f64,
    /// Fraction of the current font size above which a numeric `TJ` kerning
    /// offset is turned into an implicit `U+0020`.
    ///
    /// A `TJ` kern moves the text matrix by `-adjustment/1000 * font_size`
    /// and renders no glyph. Many PDFs (academic publishers, LaTeX, kerned
    /// typography) express inter-word gaps only as wide negative kerns
    /// instead of literal space bytes, so once the synthesised advance
    /// exceeds `tj_space_threshold * font_size` the extractor inserts a
    /// single `U+0020`.
    ///
    /// The default of `0.2` (200 milli-em) lies between typical intra-word
    /// kerning (10-50 milli-em) and the width of a `space` glyph in most
    /// fonts (250-300 milli-em). Lower it to catch tighter spaces; raise it
    /// to cut false positives in fonts with unusually wide kerning.
    ///
    /// Kept separate from `space_threshold` (which governs the post-glyph
    /// gap between separate text-show operators) because the `TJ` numeric
    /// kern is measured without any glyph advance baseline and needs a more
    /// sensitive threshold (issue #272).
    pub tj_space_threshold: f64,
    /// Smallest vertical distance, in text space units, that is turned into
    /// a newline.
    pub newline_threshold: f64,
    /// Order text fragments by position (useful for multi-column layouts).
    pub sort_by_position: bool,
    /// Detect and handle columns.
    pub detect_columns: bool,
    /// Column separation threshold, in page units.
    pub column_threshold: f64,
    /// Join words that were hyphenated across a line end.
    pub merge_hyphenated: bool,
    /// Record space insertion decisions in each `TextFragment`
    /// (default: `false`). When `false` there is zero overhead; when `true`
    /// `TextFragment::space_decisions` is populated.
    pub track_space_decisions: bool,
    /// Rebuild visual lines and paragraphs from the raw fragments produced
    /// by PDF text-show operators.
    ///
    /// When `true`, fragments are first grouped by baseline into single-line
    /// fragments, and consecutive lines with normal leading are then grouped
    /// into paragraph-level fragments. The partition pipeline needs this to
    /// produce Element values at paragraph granularity rather than at
    /// per-`Tj` granularity (see
    /// [issue #261](https://github.com/bzsanti/oxidizePdf/issues/261)).
    ///
    /// Defaults to `false` for backward compatibility with direct
    /// `extract_text` callers. The `PdfDocument::partition*` entry points
    /// force this to `true`.
    pub reconstruct_paragraphs: bool,
    /// Keep content that sits inside `/Artifact` marked-content scopes (page
    /// headers, footers, watermarks, decorative content).
    ///
    /// Defaults to `false`: Artifact content is filtered out, as the PDF/UA
    /// conformance level recommends for accessibility tooling and as RAG
    /// callers consistently want (issue #269 Phase 1). Set to `true` when
    /// extracting page furniture matters (e.g. forensic auditing, redaction
    /// tools).
    pub include_artifacts: bool,
}

impl Default for ExtractionOptions {
    /// Baseline configuration: position sorting and hyphen merging are on;
    /// layout preservation, column detection, space-decision tracking,
    /// paragraph reconstruction and artifact inclusion are off.
    fn default() -> Self {
        ExtractionOptions {
            // Spacing / line-break thresholds.
            space_threshold: 0.3,
            tj_space_threshold: 0.2,
            newline_threshold: 10.0,
            column_threshold: 50.0,
            // Behaviour switches.
            preserve_layout: false,
            sort_by_position: true,
            detect_columns: false,
            merge_hyphenated: true,
            track_space_decisions: false,
            reconstruct_paragraphs: false,
            include_artifacts: false,
        }
    }
}
90
91/// Extracted text with position information
92#[derive(Debug, Clone)]
93pub struct ExtractedText {
94 /// The extracted text content
95 pub text: String,
96 /// Text fragments with position information (if preserve_layout is true)
97 pub fragments: Vec<TextFragment>,
98}
99
/// Record of one "insert a space here or not" decision taken while
/// extracting text.
///
/// Only produced when [`ExtractionOptions::track_space_decisions`] is `true`.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
    /// Character offset of the decision within the extracted text.
    pub offset: usize,
    /// Measured horizontal gap (dx), in text space units.
    pub dx: f64,
    /// Threshold that was in effect at this point.
    pub threshold: f64,
    /// Confidence of the decision: `|dx - threshold| / threshold`, clamped
    /// to [0.0, 1.0].
    pub confidence: f64,
    /// `true` when a space was actually inserted.
    pub inserted: bool,
}
115
116/// A fragment of text with position information
117#[derive(Debug, Clone)]
118pub struct TextFragment {
119 /// Text content
120 pub text: String,
121 /// X position in page coordinates
122 pub x: f64,
123 /// Y position in page coordinates
124 pub y: f64,
125 /// Width of the text
126 pub width: f64,
127 /// Height of the text
128 pub height: f64,
129 /// Font size
130 pub font_size: f64,
131 /// Font name (if known) - used for kerning-aware text spacing
132 pub font_name: Option<String>,
133 /// Whether the font is bold (detected from font name)
134 pub is_bold: bool,
135 /// Whether the font is italic (detected from font name)
136 pub is_italic: bool,
137 /// Fill color of the text (from graphics state)
138 pub color: Option<Color>,
139 /// Space insertion decisions (empty unless `track_space_decisions` is true).
140 pub space_decisions: Vec<SpaceDecision>,
141 /// Marked-content identifier from the innermost ancestor BDC with `/MCID`
142 /// (issue #269 Phase 1). `None` for non-tagged PDFs, which preserves the
143 /// pre-Phase-1 grouping behavior (`None == None` collapses to legacy keys).
144 pub mcid: Option<u32>,
145 /// Structural tag of the owning BDC (e.g. `"P"`, `"H1"`, `"Figure"`,
146 /// `"Artifact"`). Set on the same ancestor that supplied `mcid`. Phase 3
147 /// will consume this for partitioner classification; Phase 1 only carries it.
148 pub struct_tag: Option<String>,
149}
150
/// A single frame of the marked-content stack kept by `TextState`.
///
/// The PDF marked-content operators (BDC/BMC/EMC) form a balanced LIFO
/// stack per content stream. Every frame stores the tag (`"P"`, `"H1"`,
/// `"Artifact"`, …), the optional `MCID` used for fragment grouping, the
/// optional `/ActualText` substitution string, and a computed `is_artifact`
/// flag that is inherited from any ancestor, so a `/P` nested inside an
/// `/Artifact` is still filtered out.
#[derive(Debug, Clone)]
struct MarkedContentEntry {
    /// Tag given to BDC/BMC (e.g. `"P"`, `"Figure"`, `"Artifact"`, `"Span"`).
    tag: String,
    /// Value of `/MCID <int>` when the BDC props contain one.
    mcid: Option<u32>,
    /// Decoded value of `/ActualText (...)` when present. Decoding happens
    /// once at BDC time (UTF-16BE BOM detection in `decode_pdf_string`), not
    /// once per fragment.
    #[allow(dead_code)] // Task 9 reads this via pending_actualtext flush path
    actual_text: Option<String>,
    /// Set when this frame's tag is `"Artifact"` or when any ancestor on the
    /// stack at push time already had `is_artifact == true`. Because the flag
    /// is inherited, the emitter only has to look at the innermost frame to
    /// decide whether to filter.
    is_artifact: bool,
}
175
176/// A pending ActualText run. Created when a BDC pushes an entry with
177/// `actual_text == Some(_)`; drained and emitted as a single synthetic
178/// `TextFragment` when the matching EMC pops the entry.
179///
180/// Spec §3a/§4 (collapse-on-EMC): per-`Tj` emission inside an ActualText
181/// scope is suppressed; on scope close we emit one fragment whose `text`
182/// is the substitution string, `x`/`y` is the first `Tj` origin, and
183/// `width` is the sum of suppressed text widths.
184#[derive(Debug, Clone)]
185struct PendingActualText {
186 /// Substitution text from the BDC's `/ActualText` (already decoded).
187 text: String,
188 /// Pen origin of the first suppressed `Tj` (page-space).
189 first_x: f64,
190 /// Same for Y.
191 first_y: f64,
192 /// Accumulated effective width of suppressed `Tj` runs.
193 width: f64,
194 /// Effective font size at the time the first `Tj` was suppressed.
195 font_size: f64,
196 /// Font name + style at first `Tj`. Set on first suppression.
197 font_name: Option<String>,
198 /// Bold/italic from the font name at first suppression.
199 is_bold: bool,
200 is_italic: bool,
201 /// Fill color at first suppression.
202 color: Option<Color>,
203 /// Depth in `mc_stack` at which this run was opened. When the entry at
204 /// this depth is popped, the pending run is flushed.
205 stack_depth: usize,
206 /// Whether a `Tj`/`TJ`/`'`/`"` has been observed yet inside the scope.
207 /// Until the first one fires, the run has no origin to record.
208 populated: bool,
209}
210
211/// Text extraction state
212struct TextState {
213 /// Current text matrix
214 text_matrix: [f64; 6],
215 /// Current text line matrix
216 text_line_matrix: [f64; 6],
217 /// Current transformation matrix (CTM)
218 ctm: [f64; 6],
219 /// Text leading (line spacing)
220 leading: f64,
221 /// Character spacing
222 char_space: f64,
223 /// Word spacing
224 word_space: f64,
225 /// Horizontal scaling
226 horizontal_scale: f64,
227 /// Text rise
228 text_rise: f64,
229 /// Current font size
230 font_size: f64,
231 /// Current font name
232 font_name: Option<String>,
233 /// Render mode (0 = fill, 1 = stroke, etc.)
234 render_mode: u8,
235 /// Fill color (for text rendering)
236 fill_color: Option<Color>,
237 /// Graphics state stack for `q`/`Q` operators. Each entry holds the CTM
238 /// and other graphics state items that the text extractor needs to restore.
239 /// Per PDF spec §8.4.4, `q` pushes the full graphics state and `Q` pops it;
240 /// here we save only the fields that influence text extraction.
241 saved_states: Vec<SavedGraphicsState>,
242 /// Marked-content stack (issue #269 Phase 1). Pushed on BMC/BDC,
243 /// popped on EMC. Empty on entry to each page.
244 mc_stack: Vec<MarkedContentEntry>,
245 /// Pending ActualText run if any BDC ancestor declared `/ActualText`.
246 /// At most one active run at a time — nested ActualText replaces the
247 /// outer (innermost wins, per spec §4).
248 pending_actualtext: Option<PendingActualText>,
249}
250
251/// Subset of graphics state saved by `q` and restored by `Q` (issue #262).
252#[derive(Clone)]
253struct SavedGraphicsState {
254 ctm: [f64; 6],
255 fill_color: Option<Color>,
256}
257
258/// Mutable accumulator threaded through `process_operations` so the op loop
259/// can be driven recursively (page content stream → Form XObjects) while
260/// carrying text state, position, and accumulated output. Bundled into one
261/// struct so the op match moves verbatim into the recursive method (#319).
262struct OpRunState {
263 state: TextState,
264 in_text_object: bool,
265 last_x: f64,
266 last_y: f64,
267 extracted_text: String,
268 fragments: Vec<TextFragment>,
269}
270
271impl Default for TextState {
272 fn default() -> Self {
273 Self {
274 text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
275 text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
276 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
277 leading: 0.0,
278 char_space: 0.0,
279 word_space: 0.0,
280 horizontal_scale: 100.0,
281 text_rise: 0.0,
282 font_size: 0.0,
283 font_name: None,
284 render_mode: 0,
285 fill_color: None,
286 saved_states: Vec::new(),
287 mc_stack: Vec::new(),
288 pending_actualtext: None,
289 }
290 }
291}
292
/// Guess bold/italic flags from a font name.
///
/// The name is lower-cased and then searched for common style markers, which
/// covers PostScript names (e.g., "Helvetica-Bold", "Times-BoldItalic") as
/// well as TrueType names (e.g., "Arial Bold", "Courier Oblique").
///
/// The checks are plain substring / suffix tests: a name is bold when it
/// contains `bold`, `-b` or ` b `, or ends with ` b`; it is italic when it
/// contains `italic`, `oblique`, `-i` or ` i `, or ends with ` i`.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    let lowered = font_name.to_lowercase();

    // A style is present when any marker occurs anywhere in the name, or
    // when the name ends with the given trailing token.
    let has_style = |markers: &[&str], trailing: &str| -> bool {
        markers.iter().any(|&marker| lowered.contains(marker)) || lowered.ends_with(trailing)
    };

    (
        has_style(&["bold", "-b", " b "], " b"),
        has_style(&["italic", "oblique", "-i", " i "], " i"),
    )
}
331
332/// Text extractor for PDF pages with CMap support
333pub struct TextExtractor {
334 options: ExtractionOptions,
335 /// Font cache for the current page (name-keyed, rebuilt per page since names are page-local)
336 font_cache: HashMap<String, FontInfo>,
337 /// Persistent font cache keyed by PDF object reference — avoids re-parsing the same font
338 /// object across pages. Most multi-page PDFs reuse the same font objects.
339 font_object_cache: HashMap<(u32, u16), FontInfo>,
340}
341
342impl TextExtractor {
343 /// Create a new text extractor with default options
344 pub fn new() -> Self {
345 Self {
346 options: ExtractionOptions::default(),
347 font_cache: HashMap::new(),
348 font_object_cache: HashMap::new(),
349 }
350 }
351
352 /// Create a text extractor with custom options
353 pub fn with_options(options: ExtractionOptions) -> Self {
354 Self {
355 options,
356 font_cache: HashMap::new(),
357 font_object_cache: HashMap::new(),
358 }
359 }
360
361 /// Run the full fragment-merge chain used by the partition pipeline:
362 /// kerning fix → line reconstruction → paragraph reconstruction.
363 ///
364 /// Honors `ExtractionOptions::reconstruct_paragraphs`: when `false`, only
365 /// `merge_close_fragments` (the kerning fix) runs and the input is
366 /// returned at fragment granularity.
367 ///
368 /// This method is `pub` so the integration test in
369 /// `tests/paragraph_reconstruction_test.rs` can exercise it without going
370 /// through a PDF file. Production callers should prefer
371 /// `PdfDocument::partition()` and friends, which use this internally.
372 pub fn merge_fragments_for_partition(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
373 let kerning_fixed = self.merge_close_fragments(fragments);
374 if !self.options.reconstruct_paragraphs {
375 return kerning_fixed;
376 }
377 let lines = self.merge_into_lines(&kerning_fixed);
378 self.merge_into_paragraphs(&lines)
379 }
380
381 /// Group fragments by baseline into single-line fragments.
382 ///
383 /// Two fragments are on the same line when their Y centers differ by less
384 /// than `0.2 * min(head.height, frag.height)`. The 0.2 ratio absorbs
385 /// sub-point baseline jitter from text-matrix arithmetic while keeping
386 /// tightly-spaced visual rows (e.g. table cells whose baselines are
387 /// separated by ~2-3pt at 9pt font) on distinct logical lines — see
388 /// issue #265.
389 ///
390 /// Fragments are grouped by `(row_id, Y_bucket, mcid)`, where `row_id`
391 /// comes from `assign_row_ids` (increments on Y-up-jumps in emission
392 /// order). Within a line the tie-break is emission index for tagged PDFs
393 /// (any fragment carries an mcid — ISO 32000 mandates logical order) and
394 /// X coordinate for non-tagged PDFs. A space is inserted between adjacent
395 /// fragments when the X gap exceeds `space_threshold * font_size`.
396 ///
397 /// The output bounding box for each line is the axis-aligned union of the
398 /// input fragments' bounding boxes; `font_size` and `font_name` are
399 /// inherited from the line's first fragment.
400 fn merge_into_lines(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
401 if fragments.is_empty() {
402 return Vec::new();
403 }
404
405 // Pre-pass: assign row_id from Y-up-jumps in emission order. This
406 // disambiguates columns in multi-column layouts where a single outer
407 // BDC makes mcid uniform across visually distinct columns. See
408 // `docs/superpowers/specs/2026-05-23-issue-265-line-interleaving-design.md`.
409 let row_ids = assign_row_ids(fragments);
410
411 // Whether this page has at least one tagged (mcid-carrying) fragment.
412 // `.any()` returns true if even one fragment has mcid=Some; the within-line
413 // tie-break then uses emission index for the whole page rather than X.
414 // See `docs/superpowers/specs/2026-05-23-issue-265-line-interleaving-design.md`.
415 //
416 // For tagged PDFs (PDF/UA, ISO 32000-2 tagged), the content stream delivers
417 // text in logical reading order, so within a visual line we preserve emission
418 // order rather than sorting by X. Out-of-left-to-right glyph placement
419 // (common in typeset tagged PDFs where the PDF author lays out glyphs via
420 // non-monotone Td/Tm operators) is correctly rendered by keeping emission order.
421 //
422 // For non-tagged PDFs (all mcid=None), we retain the X-sort fallback
423 // because many generators emit glyphs in arbitrary (often right-to-left
424 // or random) order and only the X coordinate gives reading order.
425 let is_tagged = fragments.iter().any(|f| f.mcid.is_some());
426
427 // Sort for line GROUPING only: row_id, then Y descending, then X.
428 // row_id keeps fragments from different visual rows in separate
429 // Y-bucket groups; Y descending puts higher-on-page lines first. The
430 // X tie-break only makes same-line fragments adjacent for grouping —
431 // the authoritative reading order WITHIN each line is decided per line
432 // below (#302 symptom 1), so this grouping order is not the final order.
433 let mut indexed: Vec<(u32, usize, &TextFragment)> = row_ids
434 .iter()
435 .copied()
436 .zip(fragments.iter().enumerate())
437 .map(|(rid, (idx, f))| (rid, idx, f))
438 .collect();
439 indexed.sort_by(|a, b| {
440 a.0.cmp(&b.0)
441 .then(b.2.y.total_cmp(&a.2.y))
442 .then(a.2.x.total_cmp(&b.2.x))
443 });
444
445 // Group into visual lines, carrying each fragment's emission index so
446 // the per-line ordering decision below can restore emission order.
447 let mut lines: Vec<Vec<(usize, &TextFragment)>> = Vec::new();
448 let mut last_seen_row_id: Option<u32> = None;
449 for (rid, idx, frag) in indexed {
450 let same_batch = last_seen_row_id == Some(rid);
451 let placed = same_batch
452 && lines.last_mut().is_some_and(|line| {
453 let head = line[0].1;
454 let tol = (head.height.min(frag.height)) * 0.2;
455 (head.y - frag.y).abs() < tol && head.mcid == frag.mcid
456 });
457 if placed {
458 lines.last_mut().unwrap().push((idx, frag));
459 } else {
460 lines.push(vec![(idx, frag)]);
461 last_seen_row_id = Some(rid);
462 }
463 }
464
465 // Decide reading order per visual line (#302 symptom 1).
466 //
467 // X-sort is wrong when one line mixes fonts whose glyph metrics differ
468 // (e.g. an italic particle symbol set in roman body text): the producer
469 // gives the font-switched run an x-origin that falls INSIDE the x-span
470 // of its neighbours, so sorting by x interleaves it
471 // ("to the Z boson" -> "tZboso theon"). The content stream still emits
472 // these runs in correct reading order, so when a line's emission order
473 // has no DISJOINT backward x-step (only span overlaps, or is already
474 // x-monotone) we keep emission order. A disjoint backward step signals
475 // a genuinely scrambled stream (right-to-left / random generators), for
476 // which x-order stays authoritative. Deciding per line — not per
477 // column — prevents one scrambled line from forcing x-sort on the rest.
478 lines
479 .into_iter()
480 .map(|mut line| {
481 if is_tagged || line_prefers_emission_order(&line) {
482 line.sort_by_key(|&(idx, _)| idx);
483 } else {
484 line.sort_by(|a, b| a.1.x.total_cmp(&b.1.x));
485 }
486 let frags: Vec<&TextFragment> = line.into_iter().map(|(_, f)| f).collect();
487 self.build_line_fragment(frags)
488 })
489 .collect()
490 }
491
492 /// Space-glyph advance for `font_name` in text space (point units at
493 /// `font_size`), or `None` when unknown. Prefers the font's embedded
494 /// `/Widths` entry for code 32; falls back to the Adobe Core-14 AFM space
495 /// width for the standard base fonts (Times/Helvetica/Courier/Symbol/
496 /// ZapfDingbats), which ship no `/Widths` array (#302 symptom 2).
497 fn font_space_advance(&self, font_name: Option<&str>, font_size: f64) -> Option<f64> {
498 let info = self.font_cache.get(font_name?)?;
499 if let Some(ref widths) = info.metrics.widths {
500 let first = info.metrics.first_char.unwrap_or(0);
501 if first <= 32 {
502 if let Some(&w) = widths.get((32 - first) as usize) {
503 if w > 0.0 {
504 return Some(w / 1000.0 * font_size);
505 }
506 }
507 }
508 }
509 standard_14_space_width(&info.name).map(|em| em / 1000.0 * font_size)
510 }
511
512 /// Minimum inter-fragment x-gap that counts as a word space for `frag`.
513 /// Anchored to the font's real space-glyph advance when known — word gaps
514 /// scale with the font's space metric, not with a fixed fraction of font
515 /// size — falling back to `space_threshold * font_size` otherwise. Tightly
516 /// set justified text (e.g. Standard-14 Times body) has word gaps near
517 /// 0.2em, far below the legacy 0.3*font_size, which dropped spaces
518 /// ("thequadrupletis"); a font with a 250-unit space then gets a 0.125em
519 /// threshold instead (#302 symptom 2).
520 fn space_gap_threshold(&self, frag: &TextFragment) -> f64 {
521 match self.font_space_advance(frag.font_name.as_deref(), frag.font_size) {
522 Some(adv) if adv > 0.0 => 0.5 * adv,
523 _ => self.options.space_threshold * frag.font_size,
524 }
525 }
526
527 /// Assemble one visual line's fragments into a single line `TextFragment`,
528 /// inserting a space between consecutive fragments whose x-gap exceeds the
529 /// font-anchored [`space_gap_threshold`](Self::space_gap_threshold).
530 fn build_line_fragment(&self, line: Vec<&TextFragment>) -> TextFragment {
531 let head = line[0];
532 let mut text = String::new();
533 let mut x_min = head.x;
534 let mut x_max = head.x + head.width;
535 let mut y_min = head.y;
536 let mut y_max = head.y + head.height;
537
538 for (i, frag) in line.iter().enumerate() {
539 if i > 0 {
540 let prev = line[i - 1];
541 let gap = frag.x - (prev.x + prev.width);
542 if gap > self.space_gap_threshold(frag) {
543 text.push(' ');
544 }
545 }
546 text.push_str(&frag.text);
547 x_min = x_min.min(frag.x);
548 x_max = x_max.max(frag.x + frag.width);
549 y_min = y_min.min(frag.y);
550 y_max = y_max.max(frag.y + frag.height);
551 }
552
553 TextFragment {
554 text,
555 x: x_min,
556 y: y_min,
557 width: x_max - x_min,
558 height: y_max - y_min,
559 font_size: head.font_size,
560 font_name: head.font_name.clone(),
561 is_bold: head.is_bold,
562 is_italic: head.is_italic,
563 color: head.color,
564 space_decisions: Vec::new(),
565 mcid: head.mcid,
566 struct_tag: head.struct_tag.clone(),
567 }
568 }
569
570 /// Group consecutive lines into paragraphs based on vertical gap.
571 ///
572 /// Two consecutive lines are part of the same paragraph when the vertical
573 /// gap between them is less than 1.5× the median line height in the
574 /// input. Hyphenated line breaks (previous line ends with `-` and
575 /// `merge_hyphenated` is set) join without a separator and drop the
576 /// hyphen; otherwise lines join with `'\n'`.
577 fn merge_into_paragraphs(&self, lines: &[TextFragment]) -> Vec<TextFragment> {
578 if lines.is_empty() {
579 return Vec::new();
580 }
581
582 // Median line height — robust to outliers
583 let mut heights: Vec<f64> = lines.iter().map(|l| l.height).collect();
584 heights.sort_by(f64::total_cmp);
585 let median_h = heights[heights.len() / 2];
586 let max_paragraph_gap = median_h * 1.5;
587
588 let mut paragraphs: Vec<TextFragment> = Vec::new();
589 let mut current = lines[0].clone();
590
591 for line in &lines[1..] {
592 let prev_bottom = current.y;
593 let line_top = line.y + line.height;
594 let gap = prev_bottom - line_top;
595
596 if gap < 0.0 || gap > max_paragraph_gap || current.mcid != line.mcid {
597 paragraphs.push(current);
598 current = line.clone();
599 continue;
600 }
601
602 // Same paragraph — join
603 let joined_text = if self.options.merge_hyphenated && current.text.ends_with('-') {
604 let mut s = current.text.clone();
605 s.pop(); // drop trailing hyphen
606 s.push_str(&line.text);
607 s
608 } else {
609 format!("{}\n{}", current.text, line.text)
610 };
611
612 let x_min = current.x.min(line.x);
613 let x_max = (current.x + current.width).max(line.x + line.width);
614 let y_min = current.y.min(line.y);
615 let y_max = (current.y + current.height).max(line.y + line.height);
616
617 current = TextFragment {
618 text: joined_text,
619 x: x_min,
620 y: y_min,
621 width: x_max - x_min,
622 height: y_max - y_min,
623 font_size: current.font_size,
624 font_name: current.font_name.clone(),
625 is_bold: current.is_bold,
626 is_italic: current.is_italic,
627 color: current.color,
628 space_decisions: Vec::new(),
629 mcid: current.mcid,
630 struct_tag: current.struct_tag.clone(),
631 };
632 }
633 paragraphs.push(current);
634
635 paragraphs
636 }
637
638 /// Extract text from a PDF document
639 pub fn extract_from_document<R: Read + Seek>(
640 &mut self,
641 document: &PdfDocument<R>,
642 ) -> ParseResult<Vec<ExtractedText>> {
643 let page_count = document.page_count()?;
644 let mut results = Vec::new();
645
646 for i in 0..page_count {
647 let text = self.extract_from_page(document, i)?;
648 results.push(text);
649 }
650
651 Ok(results)
652 }
653
654 /// Extract text from a specific page
655 pub fn extract_from_page<R: Read + Seek>(
656 &mut self,
657 document: &PdfDocument<R>,
658 page_index: u32,
659 ) -> ParseResult<ExtractedText> {
660 // Get the page
661 let page = document.get_page(page_index)?;
662
663 // Extract font resources first
664 {
665 let _span = tracing::info_span!("font_resources").entered();
666 self.extract_font_resources(&page, document)?;
667 }
668
669 // Get content streams
670 let streams = {
671 let _span = tracing::info_span!("stream_decompress").entered();
672 page.content_streams_with_document(document)?
673 };
674
675 let extracted_text = String::new();
676 let fragments = Vec::new();
677 let state = TextState::default();
678 let in_text_object = false;
679 let last_x = 0.0;
680 let last_y = 0.0;
681
682 // Page resources (owned) for XObject + /Properties lookup during
683 // recursive Form XObject extraction (issue #319).
684 let page_resources: Option<crate::parser::objects::PdfDictionary> =
685 if let Some(rr) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
686 document
687 .get_object(rr.0, rr.1)
688 .ok()
689 .and_then(|o| o.as_dict().cloned())
690 } else {
691 page.get_resources().cloned()
692 };
693
694 let mut run = OpRunState {
695 state,
696 in_text_object,
697 last_x,
698 last_y,
699 extracted_text,
700 fragments,
701 };
702
703 // Process each content stream
704 for (stream_idx, stream_data) in streams.iter().enumerate() {
705 let operations = match {
706 let _span = tracing::info_span!("content_parse").entered();
707 ContentParser::parse_content(stream_data)
708 } {
709 Ok(ops) => ops,
710 Err(e) => {
711 // Enhanced diagnostic logging for content stream parsing failures
712 tracing::debug!(
713 "Warning: Failed to parse content stream on page {}, stream {}/{}",
714 page_index + 1,
715 stream_idx + 1,
716 streams.len()
717 );
718 tracing::debug!(" Error: {}", e);
719 tracing::debug!(" Stream size: {} bytes", stream_data.len());
720
721 // Show first 100 bytes for diagnosis (or less if stream is smaller)
722 let preview_len = stream_data.len().min(100);
723 let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
724 tracing::debug!(
725 " Stream preview (first {} bytes): {:?}",
726 preview_len,
727 preview.chars().take(80).collect::<String>()
728 );
729
730 // Continue processing other streams
731 continue;
732 }
733 };
734
735 run = self.process_operations(
736 operations,
737 document,
738 page_resources.as_ref(),
739 run,
740 page_index,
741 0,
742 )?;
743 }
744
745 let OpRunState {
746 mut extracted_text,
747 mut fragments,
748 ..
749 } = run;
750 {
751 let _span = tracing::info_span!("layout_finalize").entered();
752
753 // Sort and process fragments if requested — but ONLY when we're not
754 // going to run merge_into_lines later. merge_into_lines does its
755 // own (row_id, y, x) sort that needs pre-sort emission order to
756 // detect Y-up-jumps for column splitting (issue #265). For the
757 // legacy path with reconstruct_paragraphs=false, the early sort is
758 // still required because nothing downstream reorders fragments.
759 if self.options.sort_by_position
760 && !self.options.reconstruct_paragraphs
761 && !fragments.is_empty()
762 {
763 self.sort_and_merge_fragments(&mut fragments);
764 }
765
766 // Merge close fragments to eliminate spacing artifacts (kerning fix)
767 if self.options.preserve_layout && !fragments.is_empty() {
768 fragments = self.merge_close_fragments(&fragments);
769 }
770
771 // Reconstruct visual lines and paragraphs from raw fragments.
772 // Required for the partition pipeline to produce Element values at
773 // paragraph granularity (issue #261).
774 if self.options.reconstruct_paragraphs && !fragments.is_empty() {
775 let lines = self.merge_into_lines(&fragments);
776 fragments = self.merge_into_paragraphs(&lines);
777 }
778
779 // Reconstruct text from sorted fragments if layout is preserved
780 if self.options.preserve_layout && !fragments.is_empty() {
781 extracted_text = self.reconstruct_text_from_fragments(&fragments);
782 }
783 }
784
785 Ok(ExtractedText {
786 text: extracted_text,
787 fragments,
788 })
789 }
790
791 /// Run a content-stream operation list, recursing into Form XObjects so
792 /// text drawn inside a `Do`-painted Form XObject is extracted (issue #319).
793 #[allow(clippy::too_many_arguments)]
794 fn process_operations<R: Read + Seek>(
795 &mut self,
796 operations: Vec<ContentOperation>,
797 document: &PdfDocument<R>,
798 resources: Option<&crate::parser::objects::PdfDictionary>,
799 run: OpRunState,
800 page_index: u32,
801 depth: u8,
802 ) -> ParseResult<OpRunState> {
803 let OpRunState {
804 mut state,
805 mut in_text_object,
806 mut last_x,
807 mut last_y,
808 mut extracted_text,
809 mut fragments,
810 } = run;
811
812 let page_properties: Option<&crate::parser::objects::PdfDictionary> =
813 resources.and_then(|res| match res.get("Properties") {
814 Some(crate::parser::objects::PdfObject::Dictionary(d)) => Some(d),
815 _ => None,
816 });
817
818 let _ops_span = tracing::info_span!("text_ops_loop").entered();
819 for op in operations {
820 match op {
821 ContentOperation::BeginText => {
822 in_text_object = true;
823 // Reset text matrix to identity
824 state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
825 state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
826 }
827
828 ContentOperation::EndText => {
829 in_text_object = false;
830 }
831
832 ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
833 state.text_matrix =
834 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
835 state.text_line_matrix =
836 [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
837 }
838
839 ContentOperation::MoveText(tx, ty) => {
840 // Update text matrix by translation
841 let new_matrix = multiply_matrix(
842 &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
843 &state.text_line_matrix,
844 );
845 state.text_matrix = new_matrix;
846 state.text_line_matrix = new_matrix;
847 }
848
849 ContentOperation::NextLine => {
850 // Move to next line using current leading
851 let new_matrix = multiply_matrix(
852 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
853 &state.text_line_matrix,
854 );
855 state.text_matrix = new_matrix;
856 state.text_line_matrix = new_matrix;
857 }
858
859 ContentOperation::ShowText(text) => {
860 if in_text_object {
861 let text_bytes = &text;
862 let decoded = self.decode_text(text_bytes, &state)?;
863
864 // Pen origin in user space = (CTM × text_matrix)(0, 0).
865 let (x, y) = text_origin(&state);
866
867 // Mirror the gate inside `emit_text_fragment` so that
868 // `.text` and `.fragments` stay consistent for pages
869 // wrapped in an `/Artifact` marked-content scope —
870 // issue #330.
871 let skip_text = skip_artifact_text(&state, self.options.include_artifacts);
872
873 // Add spacing based on position change
874 if !skip_text && !extracted_text.is_empty() {
875 let dx = x - last_x;
876 let dy = (y - last_y).abs();
877
878 if dy > self.options.newline_threshold {
879 extracted_text.push('\n');
880 } else if dx > self.options.space_threshold * state.font_size {
881 extracted_text.push(' ');
882 }
883 }
884
885 if !skip_text {
886 extracted_text.push_str(&decoded);
887 }
888
889 // Get font info for accurate width calculation.
890 // Width comes from the char codes (`text_bytes`), not
891 // the decoded Unicode: the Widths array is code-indexed
892 // (issue #302).
893 let text_width = {
894 let font_info = state
895 .font_name
896 .as_ref()
897 .and_then(|name| self.font_cache.get(name));
898 calculate_text_width_from_codes(
899 text_bytes,
900 &decoded,
901 state.font_size,
902 font_info,
903 )
904 };
905
906 if self.options.preserve_layout {
907 emit_text_fragment(
908 &mut fragments,
909 &decoded,
910 text_width,
911 x,
912 y,
913 &mut state,
914 self.options.include_artifacts,
915 );
916 }
917
918 // Update position for next text
919 last_x = x + text_width;
920 last_y = y;
921
922 // Update text matrix for next show operation
923 let tx = text_width * state.horizontal_scale / 100.0;
924 state.text_matrix =
925 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
926 }
927 }
928
929 ContentOperation::ShowTextArray(array) => {
930 if in_text_object {
931 for item in array {
932 match item {
933 TextElement::Text(text_bytes) => {
934 let decoded = self.decode_text(&text_bytes, &state)?;
935 // Mirror the gate inside `emit_text_fragment`
936 // so `.text` and `.fragments` stay consistent
937 // for Artifact scopes (issue #330).
938 let skip_text =
939 skip_artifact_text(&state, self.options.include_artifacts);
940 if !skip_text {
941 extracted_text.push_str(&decoded);
942 }
943
944 let text_width = {
945 let font_info = state
946 .font_name
947 .as_ref()
948 .and_then(|name| self.font_cache.get(name));
949 calculate_text_width_from_codes(
950 &text_bytes,
951 &decoded,
952 state.font_size,
953 font_info,
954 )
955 };
956
957 if self.options.preserve_layout {
958 let (x, y) = text_origin(&state);
959 emit_text_fragment(
960 &mut fragments,
961 &decoded,
962 text_width,
963 x,
964 y,
965 &mut state,
966 self.options.include_artifacts,
967 );
968 }
969
970 let tx = text_width * state.horizontal_scale / 100.0;
971 state.text_matrix = multiply_matrix(
972 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
973 &state.text_matrix,
974 );
975 }
976 TextElement::Spacing(adjustment) => {
977 // Text position adjustment (negative = move left,
978 // i.e. shifts the pen forward). When the synthesised
979 // forward advance exceeds `tj_space_threshold * font_size`
980 // we treat the kern as an implicit `U+0020` (issue #272):
981 // many PDFs encode word breaks purely as wide negative
982 // kerns and never emit a literal space byte.
983 let tx = -(adjustment as f64) / 1000.0 * state.font_size;
984
985 let skip_tj_space =
986 skip_artifact_text(&state, self.options.include_artifacts);
987 if !skip_tj_space
988 && tx > self.options.tj_space_threshold * state.font_size
989 && !extracted_text.is_empty()
990 && !extracted_text.ends_with(' ')
991 {
992 extracted_text.push(' ');
993
994 // Skip the fragment-level emission while an
995 // ActualText scope is pending: the synthesised
996 // space is a heuristic, not real content, and
997 // emitting it would call `emit_text_fragment`
998 // whose ActualText short-circuit would inflate
999 // `pending.width` and set `pending.populated`
1000 // even though no real `Tj` has fired yet. The
1001 // EMC flush will supply the canonical fragment
1002 // text from the override (Phase 1 #269 contract).
1003 if self.options.preserve_layout
1004 && state.pending_actualtext.is_none()
1005 {
1006 // Emit a synthetic single-space fragment at the
1007 // current pen origin so downstream layout merges
1008 // (e.g. `merge_close_fragments`) see the gap as
1009 // explicit content rather than as a sub-threshold
1010 // x-jump. Width = the kern advance so the next
1011 // text fragment begins flush against it.
1012 let (sx, sy) = text_origin(&state);
1013 emit_text_fragment(
1014 &mut fragments,
1015 " ",
1016 tx,
1017 sx,
1018 sy,
1019 &mut state,
1020 self.options.include_artifacts,
1021 );
1022 }
1023 }
1024
1025 state.text_matrix = multiply_matrix(
1026 &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
1027 &state.text_matrix,
1028 );
1029 }
1030 }
1031 }
1032 }
1033 }
1034
1035 ContentOperation::NextLineShowText(text) => {
1036 if in_text_object {
1037 // ' = T* then Tj string. Advance line matrix by -leading.
1038 let new_matrix = multiply_matrix(
1039 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
1040 &state.text_line_matrix,
1041 );
1042 state.text_matrix = new_matrix;
1043 state.text_line_matrix = new_matrix;
1044
1045 let decoded = self.decode_text(&text, &state)?;
1046 let (x, y) = text_origin(&state);
1047
1048 // Mirror the artifact gate (issue #330).
1049 let skip_text = skip_artifact_text(&state, self.options.include_artifacts);
1050 if !skip_text {
1051 if !extracted_text.is_empty() {
1052 extracted_text.push('\n');
1053 }
1054 extracted_text.push_str(&decoded);
1055 }
1056
1057 let text_width = {
1058 let font_info = state
1059 .font_name
1060 .as_ref()
1061 .and_then(|name| self.font_cache.get(name));
1062 calculate_text_width_from_codes(
1063 &text,
1064 &decoded,
1065 state.font_size,
1066 font_info,
1067 )
1068 };
1069
1070 if self.options.preserve_layout {
1071 emit_text_fragment(
1072 &mut fragments,
1073 &decoded,
1074 text_width,
1075 x,
1076 y,
1077 &mut state,
1078 self.options.include_artifacts,
1079 );
1080 }
1081
1082 last_x = x + text_width;
1083 last_y = y;
1084
1085 let tx = text_width * state.horizontal_scale / 100.0;
1086 state.text_matrix =
1087 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
1088 }
1089 }
1090
1091 ContentOperation::SetSpacingNextLineShowText(word_space, char_space, text) => {
1092 if in_text_object {
1093 // " = aw Tw, ac Tc, then ' string. ISO 32000-1 §9.4.3.
1094 // The variant fields mirror the spec field names:
1095 // (word_spacing, char_spacing, text).
1096 state.word_space = word_space as f64;
1097 state.char_space = char_space as f64;
1098
1099 let new_matrix = multiply_matrix(
1100 &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
1101 &state.text_line_matrix,
1102 );
1103 state.text_matrix = new_matrix;
1104 state.text_line_matrix = new_matrix;
1105
1106 let decoded = self.decode_text(&text, &state)?;
1107 let (x, y) = text_origin(&state);
1108
1109 // Mirror the artifact gate (issue #330).
1110 let skip_text = skip_artifact_text(&state, self.options.include_artifacts);
1111 if !skip_text {
1112 if !extracted_text.is_empty() {
1113 extracted_text.push('\n');
1114 }
1115 extracted_text.push_str(&decoded);
1116 }
1117
1118 let text_width = {
1119 let font_info = state
1120 .font_name
1121 .as_ref()
1122 .and_then(|name| self.font_cache.get(name));
1123 calculate_text_width_from_codes(
1124 &text,
1125 &decoded,
1126 state.font_size,
1127 font_info,
1128 )
1129 };
1130
1131 if self.options.preserve_layout {
1132 emit_text_fragment(
1133 &mut fragments,
1134 &decoded,
1135 text_width,
1136 x,
1137 y,
1138 &mut state,
1139 self.options.include_artifacts,
1140 );
1141 }
1142
1143 last_x = x + text_width;
1144 last_y = y;
1145
1146 let tx = text_width * state.horizontal_scale / 100.0;
1147 state.text_matrix =
1148 multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
1149 }
1150 }
1151
1152 ContentOperation::SetFont(name, size) => {
1153 state.font_name = Some(name);
1154 state.font_size = size as f64;
1155 }
1156
1157 ContentOperation::SetLeading(leading) => {
1158 state.leading = leading as f64;
1159 }
1160
1161 ContentOperation::SetCharSpacing(spacing) => {
1162 state.char_space = spacing as f64;
1163 }
1164
1165 ContentOperation::SetWordSpacing(spacing) => {
1166 state.word_space = spacing as f64;
1167 }
1168
1169 ContentOperation::SetHorizontalScaling(scale) => {
1170 state.horizontal_scale = scale as f64;
1171 }
1172
1173 ContentOperation::SetTextRise(rise) => {
1174 state.text_rise = rise as f64;
1175 }
1176
1177 ContentOperation::SetTextRenderMode(mode) => {
1178 state.render_mode = mode as u8;
1179 }
1180
1181 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
1182 // Update CTM: new_ctm = concat_matrix * current_ctm
1183 let [a0, b0, c0, d0, e0, f0] = state.ctm;
1184 let a = a as f64;
1185 let b = b as f64;
1186 let c = c as f64;
1187 let d = d as f64;
1188 let e = e as f64;
1189 let f = f as f64;
1190 state.ctm = [
1191 a * a0 + b * c0,
1192 a * b0 + b * d0,
1193 c * a0 + d * c0,
1194 c * b0 + d * d0,
1195 e * a0 + f * c0 + e0,
1196 e * b0 + f * d0 + f0,
1197 ];
1198 }
1199
1200 // Graphics state stack (issue #262). `q` snapshots the
1201 // current CTM and fill_color; `Q` restores the most recent
1202 // snapshot. Without these, every `cm` accumulates onto the
1203 // CTM forever, producing absurd page-space coordinates and
1204 // wrong font_size scaling on PDFs that nest graphics state.
1205 ContentOperation::SaveGraphicsState => {
1206 state.saved_states.push(SavedGraphicsState {
1207 ctm: state.ctm,
1208 fill_color: state.fill_color,
1209 });
1210 }
1211 ContentOperation::RestoreGraphicsState => {
1212 if let Some(saved) = state.saved_states.pop() {
1213 state.ctm = saved.ctm;
1214 state.fill_color = saved.fill_color;
1215 }
1216 // Unbalanced Q (pop on empty stack) is silently ignored
1217 // to keep extraction robust to malformed PDFs.
1218 }
1219
1220 // Color operations (Phase 4: Color extraction)
1221 ContentOperation::SetNonStrokingGray(gray) => {
1222 state.fill_color = Some(Color::gray(gray as f64));
1223 }
1224
1225 ContentOperation::SetNonStrokingRGB(r, g, b) => {
1226 state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
1227 }
1228
1229 ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
1230 state.fill_color = Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
1231 }
1232
1233 // Issue #269 Phase 1: marked-content operators
1234 ContentOperation::BeginMarkedContent(tag) => {
1235 let parent_artifact = state.mc_stack.last().is_some_and(|e| e.is_artifact);
1236 state.mc_stack.push(MarkedContentEntry {
1237 is_artifact: tag == "Artifact" || parent_artifact,
1238 tag,
1239 mcid: None,
1240 actual_text: None,
1241 });
1242 }
1243
1244 ContentOperation::BeginMarkedContentWithProps(tag, props) => {
1245 let parent_artifact = state.mc_stack.last().is_some_and(|e| e.is_artifact);
1246 let (mcid, actual_text) = resolve_props(&props, page_properties);
1247
1248 // If this scope declares ActualText, open a pending run that will be
1249 // flushed on the matching EMC. Suppresses per-Tj emission inside the
1250 // scope (innermost-ActualText-wins per spec §4).
1251 if let Some(ref text) = actual_text {
1252 state.pending_actualtext = Some(PendingActualText {
1253 text: text.clone(),
1254 first_x: 0.0,
1255 first_y: 0.0,
1256 width: 0.0,
1257 font_size: state.font_size,
1258 font_name: state.font_name.clone(),
1259 is_bold: false, // overwritten on first Tj
1260 is_italic: false,
1261 color: state.fill_color,
1262 stack_depth: state.mc_stack.len(), // BEFORE the push below
1263 populated: false,
1264 });
1265 }
1266
1267 state.mc_stack.push(MarkedContentEntry {
1268 is_artifact: tag == "Artifact" || parent_artifact,
1269 tag,
1270 mcid,
1271 actual_text,
1272 });
1273 }
1274
1275 ContentOperation::EndMarkedContent => {
1276 let popped_depth = state.mc_stack.len();
1277 if state.mc_stack.pop().is_none() {
1278 // Unbalanced EMC — log and ignore. Real PDFs occasionally emit
1279 // dangling EMC (e.g. from incremental updates). We must not panic.
1280 tracing::debug!(
1281 "extraction: EMC with empty marked-content stack on page {}",
1282 page_index + 1
1283 );
1284 } else if let Some(pending) = state.pending_actualtext.as_ref() {
1285 // If we just closed the scope that opened the pending run, flush it.
1286 if pending.stack_depth + 1 == popped_depth {
1287 let run = state.pending_actualtext.take().unwrap();
1288 if run.populated && self.options.preserve_layout {
1289 let (mcid, struct_tag) = innermost_mc_tag(&state.mc_stack);
1290 let in_artifact = state.mc_stack.iter().any(|e| e.is_artifact);
1291 if !in_artifact || self.options.include_artifacts {
1292 fragments.push(TextFragment {
1293 text: run.text,
1294 x: run.first_x,
1295 y: run.first_y,
1296 width: run.width,
1297 height: run.font_size,
1298 font_size: run.font_size,
1299 font_name: run.font_name,
1300 is_bold: run.is_bold,
1301 is_italic: run.is_italic,
1302 color: run.color,
1303 space_decisions: Vec::new(),
1304 mcid,
1305 struct_tag,
1306 });
1307 }
1308 }
1309 }
1310 }
1311 }
1312
1313 ContentOperation::PaintXObject(name) => {
1314 // Issue #319: recurse into Form XObjects. `Do` paints a
1315 // Form XObject in an implicit q/Q, with the XObject's
1316 // /Matrix composed onto the CTM and its own /Resources
1317 // fonts in scope. Without this, text drawn inside the
1318 // XObject (the page body, for RML2PDF "inclPDF" output)
1319 // is never extracted.
1320 const MAX_XOBJECT_DEPTH: u8 = 12;
1321 if depth < MAX_XOBJECT_DEPTH {
1322 if let Some((xobj_ops, xobj_res, matrix)) =
1323 self.load_form_xobject(resources, &name, document)
1324 {
1325 let saved_ctm = state.ctm;
1326 let saved_fill = state.fill_color;
1327 let saved_stack = state.saved_states.len();
1328 let saved_fonts = self.font_cache.clone();
1329
1330 if let Some(m) = matrix {
1331 let [a0, b0, c0, d0, e0, f0] = state.ctm;
1332 let [a, b, c, d, e, f] = m;
1333 state.ctm = [
1334 a * a0 + b * c0,
1335 a * b0 + b * d0,
1336 c * a0 + d * c0,
1337 c * b0 + d * d0,
1338 e * a0 + f * c0 + e0,
1339 e * b0 + f * d0 + f0,
1340 ];
1341 }
1342 if let Some(ref xr) = xobj_res {
1343 self.cache_fonts_from_resources::<R>(xr, document);
1344 }
1345
1346 let sub = OpRunState {
1347 state,
1348 in_text_object: false,
1349 last_x,
1350 last_y,
1351 extracted_text,
1352 fragments,
1353 };
1354 let mut out = self.process_operations(
1355 xobj_ops,
1356 document,
1357 xobj_res.as_ref(),
1358 sub,
1359 page_index,
1360 depth + 1,
1361 )?;
1362
1363 out.state.ctm = saved_ctm;
1364 out.state.fill_color = saved_fill;
1365 out.state.saved_states.truncate(saved_stack);
1366 self.font_cache = saved_fonts;
1367
1368 state = out.state;
1369 last_x = out.last_x;
1370 last_y = out.last_y;
1371 extracted_text = out.extracted_text;
1372 fragments = out.fragments;
1373 }
1374 }
1375 }
1376 _ => {
1377 // Other operations don't affect text extraction
1378 }
1379 }
1380 }
1381
1382 Ok(OpRunState {
1383 state,
1384 in_text_object,
1385 last_x,
1386 last_y,
1387 extracted_text,
1388 fragments,
1389 })
1390 }
1391
1392 /// Load a Form XObject by name: parsed operations, resolved /Resources,
1393 /// and optional /Matrix. None for image XObjects or anything unparseable.
1394 fn load_form_xobject<R: Read + Seek>(
1395 &self,
1396 resources: Option<&crate::parser::objects::PdfDictionary>,
1397 name: &str,
1398 document: &PdfDocument<R>,
1399 ) -> Option<(
1400 Vec<ContentOperation>,
1401 Option<crate::parser::objects::PdfDictionary>,
1402 Option<[f64; 6]>,
1403 )> {
1404 use crate::parser::objects::PdfObject;
1405 let res = resources?;
1406 let xobjects = match res.get("XObject")? {
1407 PdfObject::Dictionary(d) => d.clone(),
1408 PdfObject::Reference(n, g) => match document.get_object(*n, *g).ok()? {
1409 PdfObject::Dictionary(d) => d,
1410 _ => return None,
1411 },
1412 _ => return None,
1413 };
1414 let (n, g) = xobjects.get(name)?.as_reference()?;
1415 let obj = document.get_object(n, g).ok()?;
1416 let stream = obj.as_stream()?;
1417 if stream
1418 .dict
1419 .get("Subtype")
1420 .and_then(|o| o.as_name())
1421 .map(|nm| nm.0.as_str())
1422 != Some("Form")
1423 {
1424 return None;
1425 }
1426 let data = stream.decode(&Default::default()).ok()?;
1427 let ops = ContentParser::parse_content(&data).ok()?;
1428 let xobj_res = match stream.dict.get("Resources") {
1429 Some(PdfObject::Dictionary(d)) => Some(d.clone()),
1430 Some(PdfObject::Reference(rn, rg)) => document
1431 .get_object(*rn, *rg)
1432 .ok()
1433 .and_then(|o| o.as_dict().cloned()),
1434 _ => None,
1435 };
1436 let matrix = stream
1437 .dict
1438 .get("Matrix")
1439 .and_then(|o| o.as_array())
1440 .and_then(|a| {
1441 if a.0.len() == 6 {
1442 let mut m = [0.0f64; 6];
1443 for (i, slot) in m.iter_mut().enumerate() {
1444 *slot = a.0[i]
1445 .as_real()
1446 .or_else(|| a.0[i].as_integer().map(|x| x as f64))?;
1447 }
1448 Some(m)
1449 } else {
1450 None
1451 }
1452 });
1453 Some((ops, xobj_res, matrix))
1454 }
1455
1456 /// Sort text fragments by position and merge them appropriately
1457 fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
1458 // Sort fragments by Y position (top to bottom) then X position (left to right).
1459 //
1460 // We quantize Y into bands of `newline_threshold` width so that fragments
1461 // on the "same line" get identical Y keys. This ensures the comparator is
1462 // a strict total order (transitive), which Rust's sort algorithm requires.
1463 // Without quantization, threshold-based "same line" detection breaks
1464 // transitivity: A≈B and B≈C does NOT imply A≈C.
1465 let threshold = self.options.newline_threshold;
1466 fragments.sort_by(|a, b| {
1467 // Quantize Y to nearest band (PDF Y increases upward, so negate first)
1468 let band_a = if threshold > 0.0 {
1469 (-a.y / threshold).round()
1470 } else {
1471 -a.y
1472 };
1473 let band_b = if threshold > 0.0 {
1474 (-b.y / threshold).round()
1475 } else {
1476 -b.y
1477 };
1478
1479 // Compare by Y band (top to bottom), then by X within same band
1480 band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
1481 });
1482
1483 // Detect columns if requested
1484 if self.options.detect_columns {
1485 self.detect_and_sort_columns(fragments);
1486 }
1487 }
1488
1489 /// Detect columns and re-sort fragments accordingly
1490 fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
1491 // Group fragments by approximate Y position
1492 let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
1493 let mut current_line: Vec<&mut TextFragment> = Vec::new();
1494 let mut last_y = f64::INFINITY;
1495
1496 for fragment in fragments.iter_mut() {
1497 let fragment_y = fragment.y;
1498 if (last_y - fragment_y).abs() > self.options.newline_threshold
1499 && !current_line.is_empty()
1500 {
1501 lines.push(current_line);
1502 current_line = Vec::new();
1503 }
1504 current_line.push(fragment);
1505 last_y = fragment_y;
1506 }
1507 if !current_line.is_empty() {
1508 lines.push(current_line);
1509 }
1510
1511 // Detect column boundaries
1512 let mut column_boundaries = vec![0.0];
1513 for line in &lines {
1514 if line.len() > 1 {
1515 for i in 0..line.len() - 1 {
1516 let gap = line[i + 1].x - (line[i].x + line[i].width);
1517 if gap > self.options.column_threshold {
1518 let boundary = line[i].x + line[i].width + gap / 2.0;
1519 if !column_boundaries
1520 .iter()
1521 .any(|&b| (b - boundary).abs() < 10.0)
1522 {
1523 column_boundaries.push(boundary);
1524 }
1525 }
1526 }
1527 }
1528 }
1529 column_boundaries.sort_by(|a, b| a.total_cmp(b));
1530
1531 // Re-sort fragments by column then Y position
1532 if column_boundaries.len() > 1 {
1533 fragments.sort_by(|a, b| {
1534 // Determine column for each fragment
1535 let col_a = column_boundaries
1536 .iter()
1537 .position(|&boundary| a.x < boundary)
1538 .unwrap_or(column_boundaries.len())
1539 - 1;
1540 let col_b = column_boundaries
1541 .iter()
1542 .position(|&boundary| b.x < boundary)
1543 .unwrap_or(column_boundaries.len())
1544 - 1;
1545
1546 if col_a != col_b {
1547 col_a.cmp(&col_b)
1548 } else {
1549 // Same column, sort by Y position
1550 b.y.total_cmp(&a.y)
1551 }
1552 });
1553 }
1554 }
1555
1556 /// Reconstruct text from sorted fragments
1557 fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
1558 // First, merge consecutive fragments that are very close together
1559 let merged_fragments = self.merge_close_fragments(fragments);
1560
1561 let mut result = String::new();
1562 let mut last_y = f64::INFINITY;
1563 let mut last_x = 0.0;
1564 let mut last_line_ended_with_hyphen = false;
1565
1566 for fragment in &merged_fragments {
1567 // Check if we need a newline
1568 let y_diff = (last_y - fragment.y).abs();
1569 if !result.is_empty() && y_diff > self.options.newline_threshold {
1570 // Handle hyphenation
1571 if self.options.merge_hyphenated && last_line_ended_with_hyphen {
1572 // Remove the hyphen and don't add newline
1573 if result.ends_with('-') {
1574 result.pop();
1575 }
1576 } else {
1577 result.push('\n');
1578 }
1579 } else if !result.is_empty() {
1580 // Check if we need a space
1581 let x_gap = fragment.x - last_x;
1582 if x_gap > self.options.space_threshold * fragment.font_size {
1583 result.push(' ');
1584 }
1585 }
1586
1587 result.push_str(&fragment.text);
1588 last_line_ended_with_hyphen = fragment.text.ends_with('-');
1589 last_y = fragment.y;
1590 last_x = fragment.x + fragment.width;
1591 }
1592
1593 result
1594 }
1595
1596 /// Merge fragments that are very close together on the same line
1597 /// This fixes artifacts like "IN VO ICE" -> "INVOICE"
1598 fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
1599 if fragments.is_empty() {
1600 return Vec::new();
1601 }
1602
1603 let mut merged = Vec::new();
1604 let mut current = fragments[0].clone();
1605
1606 for fragment in &fragments[1..] {
1607 // Check if this fragment is on the same line and very close
1608 let y_diff = (current.y - fragment.y).abs();
1609 let x_gap = fragment.x - (current.x + current.width);
1610
1611 // Y-tolerance for same-line merging.
1612 //
1613 // Legacy path (`reconstruct_paragraphs=false`): fragments arrive
1614 // after `sort_and_merge_fragments` which quantizes Y into 10pt bands.
1615 // All same-band fragments share nearly identical Y, so 1.0pt is enough.
1616 //
1617 // Reconstruct-paragraphs path (`reconstruct_paragraphs=true`): fragments
1618 // arrive in emission order. Inline superscripts (e.g. citation numbers
1619 // raised via `Td` operators) have Y deltas of 3-4pt for 10pt body text.
1620 // Without a wider tolerance, each superscript becomes its own fragment
1621 // → line proliferation (issue #265 follow-up). Use 0.5 * font_size,
1622 // which captures typical superscript/subscript offsets (typically
1623 // 0.33-0.4 * font_size from baseline) and stays below the row_id
1624 // threshold (also 0.5 * font_size) so adjacent rows are not collapsed.
1625 let y_tol = if self.options.reconstruct_paragraphs {
1626 // Defend against malformed PDFs that emit text before any `Tf` font
1627 // operator (font_size=0 in TextState initial). 0.5 * 0 = 0 would
1628 // prevent any merge, even at identical Y. Fall back to the legacy
1629 // 1.0pt threshold in that case so the path is at least as forgiving
1630 // as the non-reconstruct path.
1631 let base = 0.5 * current.font_size.min(fragment.font_size);
1632 if base > 0.0 {
1633 base
1634 } else {
1635 1.0
1636 }
1637 } else {
1638 1.0
1639 };
1640
1641 let should_merge = y_diff < y_tol
1642 && x_gap >= 0.0 // Fragment is to the right
1643 && x_gap < fragment.font_size * 0.5 // Gap less than 50% of font size
1644 && current.mcid == fragment.mcid;
1645
1646 if should_merge {
1647 // Merge this fragment into current, preserving word boundaries
1648 // when the gap exceeds the font-anchored space threshold.
1649 if x_gap > self.space_gap_threshold(fragment) {
1650 current.text.push(' ');
1651 }
1652 current.text.push_str(&fragment.text);
1653 current.width = (fragment.x + fragment.width) - current.x;
1654 } else {
1655 // Start a new fragment
1656 merged.push(current);
1657 current = fragment.clone();
1658 }
1659 }
1660
1661 merged.push(current);
1662 merged
1663 }
1664
1665 /// Extract font resources from page
1666 ///
1667 /// Clears the per-page name cache (font names are page-local in PDF), but
1668 /// reuses previously parsed font objects via `font_object_cache` to avoid
1669 /// re-parsing the same font object across multiple pages.
1670 fn extract_font_resources<R: Read + Seek>(
1671 &mut self,
1672 page: &ParsedPage,
1673 document: &PdfDocument<R>,
1674 ) -> ParseResult<()> {
1675 // Clear per-page name mapping (font names like /F1 are page-local)
1676 self.font_cache.clear();
1677
1678 // Try to get resources manually from page dictionary first
1679 // This is necessary because ParsedPage.get_resources() may not always work
1680 if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
1681 if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
1682 {
1683 self.cache_fonts_from_resources::<R>(&resources, document);
1684 }
1685 } else if let Some(resources) = page.get_resources() {
1686 // Fallback to get_resources() if Resources is not a reference
1687 self.cache_fonts_from_resources::<R>(resources, document);
1688 }
1689
1690 Ok(())
1691 }
1692
1693 /// Cache every font declared in a page's `/Resources` `/Font` dictionary.
1694 ///
1695 /// `/Font` itself may be either an inline dictionary or an indirect
1696 /// reference (`/Font 191 0 R`); both are common in real PDFs (e.g. the
1697 /// ATLAS Higgs paper references it). Resolving the reference is required —
1698 /// otherwise the font cache stays empty, decoding loses ToUnicode, and
1699 /// glyph widths fall back to a flat estimate that scrambles multi-column
1700 /// layout (issue #302).
1701 fn cache_fonts_from_resources<R: Read + Seek>(
1702 &mut self,
1703 resources: &PdfDictionary,
1704 document: &PdfDocument<R>,
1705 ) {
1706 let font_dict = match resources.get("Font") {
1707 Some(PdfObject::Dictionary(dict)) => Some(dict.clone()),
1708 Some(PdfObject::Reference(num, gen)) => match document.get_object(*num, *gen) {
1709 Ok(PdfObject::Dictionary(dict)) => Some(dict),
1710 _ => None,
1711 },
1712 _ => None,
1713 };
1714
1715 if let Some(font_dict) = font_dict {
1716 for (font_name, font_obj) in font_dict.0.iter() {
1717 if let Some(font_ref) = font_obj.as_reference() {
1718 self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
1719 }
1720 }
1721 }
1722 }
1723
1724 /// Cache a font, reusing the persistent object cache when possible.
1725 fn cache_font_by_ref<R: Read + Seek>(
1726 &mut self,
1727 font_name: &str,
1728 font_ref: (u32, u16),
1729 document: &PdfDocument<R>,
1730 ) {
1731 // Check persistent object cache first — avoids re-parsing across pages
1732 if let Some(cached) = self.font_object_cache.get(&font_ref) {
1733 self.font_cache
1734 .insert(font_name.to_string(), cached.clone());
1735 tracing::debug!(
1736 "Reused cached font object ({}, {}): {} (ToUnicode: {})",
1737 font_ref.0,
1738 font_ref.1,
1739 font_name,
1740 cached.to_unicode.is_some()
1741 );
1742 return;
1743 }
1744
1745 // Parse font object
1746 if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(font_ref.0, font_ref.1) {
1747 let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
1748 if let Ok(font_info) = cmap_extractor.extract_font_info(&font_dict, document) {
1749 let has_to_unicode = font_info.to_unicode.is_some();
1750 // Store in persistent cache
1751 self.font_object_cache.insert(font_ref, font_info.clone());
1752 // Store in per-page name cache
1753 self.font_cache.insert(font_name.to_string(), font_info);
1754 tracing::debug!(
1755 "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
1756 font_ref.0,
1757 font_ref.1,
1758 font_name,
1759 has_to_unicode
1760 );
1761 }
1762 }
1763 }
1764
1765 /// Decode text using the current font encoding and ToUnicode mapping
1766 fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
1767 use crate::text::encoding::TextEncoding;
1768
1769 // First, try to use cached font information with ToUnicode CMap
1770 if let Some(ref font_name) = state.font_name {
1771 if let Some(font_info) = self.font_cache.get(font_name) {
1772 // Try CMap-based decoding first (free function — no allocation)
1773 if let Ok(decoded) =
1774 crate::text::extraction_cmap::decode_text_with_font(text, font_info)
1775 {
1776 // Only accept if we got meaningful text (not all null bytes or garbage)
1777 if !decoded.trim().is_empty()
1778 && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
1779 {
1780 // Apply sanitization to remove control characters (Issue #116)
1781 let sanitized = sanitize_extracted_text(&decoded);
1782 tracing::debug!(
1783 "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
1784 font_name,
1785 text,
1786 sanitized
1787 );
1788 return Ok(sanitized);
1789 }
1790 }
1791
1792 tracing::debug!(
1793 "CMap decoding failed or produced garbage for font {}, falling back to encoding",
1794 font_name
1795 );
1796 }
1797 }
1798
1799 // Fall back to encoding-based decoding
1800 let encoding = if let Some(ref font_name) = state.font_name {
1801 match font_name.to_lowercase().as_str() {
1802 name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
1803 name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
1804 name if name.contains("standard") => TextEncoding::StandardEncoding,
1805 name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
1806 _ => {
1807 // Default based on common patterns
1808 if font_name.starts_with("Times")
1809 || font_name.starts_with("Helvetica")
1810 || font_name.starts_with("Courier")
1811 {
1812 TextEncoding::WinAnsiEncoding // Most common for standard fonts
1813 } else {
1814 TextEncoding::PdfDocEncoding // Safe default
1815 }
1816 }
1817 }
1818 } else {
1819 TextEncoding::WinAnsiEncoding // Default for most PDFs
1820 };
1821
1822 let fallback_result = encoding.decode(text);
1823 // Apply sanitization to remove control characters (Issue #116)
1824 let sanitized = sanitize_extracted_text(&fallback_result);
1825 tracing::debug!(
1826 "Fallback encoding decoding: {:?} -> \"{}\"",
1827 text,
1828 sanitized
1829 );
1830 Ok(sanitized)
1831 }
1832}
1833
1834impl Default for TextExtractor {
1835 fn default() -> Self {
1836 Self::new()
1837 }
1838}
1839
/// Whether the current marked-content stack should suppress text emission.
///
/// Returns `true` when `include_artifacts` is `false` and any entry on
/// `state.mc_stack` is flagged `is_artifact`.
///
/// Mirrors the gate inside [`emit_text_fragment`]: when an ancestor in the
/// stack is `/Artifact` and the caller has not opted into artifact content
/// via `include_artifacts`, neither `.text` nor `.fragments` should receive
/// the run. Used by the four show-text operator arms to keep `extracted_text`
/// and `fragments` symmetric — a page whose entire content is an
/// `/Artifact BMC … EMC` scope (the common pattern for screen-reader-skipped
/// disclaimers / footers / decorative tagged-PDF content) used to surface
/// text in `.text` while leaving `.fragments` empty, silently dropping the
/// page from `partition_with(...)` / `rag_chunks(...)` (issue #330).
fn skip_artifact_text(state: &TextState, include_artifacts: bool) -> bool {
    !include_artifacts && state.mc_stack.iter().any(|e| e.is_artifact)
}

/// Emit a `TextFragment` for one decoded text-show event under `preserve_layout`.
///
/// Encapsulates the style-derivation + push sequence shared by every
/// text-show operator handler in `process_operations` (`Tj`, `TJ`, `'`,
/// `"`). The caller supplies the pen origin `(x, y)` already mapped to
/// user space (typically via `text_origin(&state)`); doing so avoids the
/// double `multiply_matrix + transform_point` that prior versions did
/// (handler computed it for `last_x`/`last_y`, then this fn recomputed
/// it on the same `state`).
///
/// Skips emission when an ancestor in the marked-content stack is `/Artifact`
/// and `include_artifacts` is false. When a pending ActualText run is
/// active in the current scope, accumulates the text-width contribution and
/// records the first origin instead of pushing a fragment (the run is flushed
/// once on EMC, see the `EndMarkedContent` handler).
///
/// `mcid` and `struct_tag` come from the innermost ancestor on the stack that
/// declared `/MCID`; non-tagged content leaves both as `None`.
1873fn emit_text_fragment(
1874 fragments: &mut Vec<TextFragment>,
1875 decoded: &str,
1876 text_width: f64,
1877 x: f64,
1878 y: f64,
1879 state: &mut TextState,
1880 include_artifacts: bool,
1881) {
1882 if decoded.is_empty() {
1883 return;
1884 }
1885
1886 // Artifact filter (default: skip emission for Artifact subtrees).
1887 if !include_artifacts && state.mc_stack.iter().any(|e| e.is_artifact) {
1888 return;
1889 }
1890
1891 let (is_bold, is_italic) = state
1892 .font_name
1893 .as_ref()
1894 .map(|name| parse_font_style(name))
1895 .unwrap_or((false, false));
1896
1897 // Issue #262: font_size, height, and width must be in page space so that
1898 // downstream heuristics (line/paragraph reconstruction, header/footer zone
1899 // detection, table detection) reason about real geometry. `x` and `y` are
1900 // already page-space (caller transforms via `text_origin`); we still need
1901 // to scale the size/width fields by the combined `text_matrix × CTM`.
1902 let combined = multiply_matrix(&state.text_matrix, &state.ctm);
1903 let x_scale = (combined[0] * combined[0] + combined[1] * combined[1]).sqrt();
1904 let y_scale = (combined[2] * combined[2] + combined[3] * combined[3]).sqrt();
1905 let effective_width = text_width * x_scale;
1906 let effective_size = state.font_size * y_scale;
1907
1908 // If a pending ActualText run is active in the current scope, accumulate
1909 // into it instead of emitting a fragment now. The run is flushed on the
1910 // matching EMC by the EndMarkedContent arm (Task 8).
1911 // Hoist font_name/fill_color reads before taking &mut on pending_actualtext
1912 // to avoid borrow-checker conflicts with the disjoint fields.
1913 let local_font_name = state.font_name.clone();
1914 let local_fill_color = state.fill_color;
1915 if let Some(pending) = state.pending_actualtext.as_mut() {
1916 if !pending.populated {
1917 pending.first_x = x;
1918 pending.first_y = y;
1919 pending.font_size = effective_size;
1920 pending.font_name = local_font_name;
1921 pending.is_bold = is_bold;
1922 pending.is_italic = is_italic;
1923 pending.color = local_fill_color;
1924 pending.populated = true;
1925 }
1926 pending.width += effective_width;
1927 return;
1928 }
1929
1930 let (mcid, struct_tag) = innermost_mc_tag(&state.mc_stack);
1931
1932 fragments.push(TextFragment {
1933 text: decoded.to_owned(),
1934 x,
1935 y,
1936 width: effective_width,
1937 height: effective_size,
1938 font_size: effective_size,
1939 font_name: state.font_name.clone(),
1940 is_bold,
1941 is_italic,
1942 color: state.fill_color,
1943 space_decisions: Vec::new(),
1944 mcid,
1945 struct_tag,
1946 });
1947}
1948
1949/// Pen origin (user-space coordinates) of the next glyph in the current
1950/// text state.
1951///
1952/// Per ISO 32000-1 §8.3.4, the text rendering matrix is `Tm × CTM` (row-vector
1953/// convention). `multiply_matrix(a, b)` returns the matrix that applies `a`
1954/// first and then `b`, so the correct composition is
1955/// `multiply_matrix(text_matrix, ctm)`. Prior to issue #262 this used the
1956/// reverse order which gave correct results only when the CTM was an identity
1957/// or pure-translation matrix; non-uniform CTM scaling produced wrong origins.
1958fn text_origin(state: &TextState) -> (f64, f64) {
1959 let combined = multiply_matrix(&state.text_matrix, &state.ctm);
1960 transform_point(0.0, 0.0, &combined)
1961}
1962
/// Multiply two affine transformation matrices given in PDF six-element
/// form `[a, b, c, d, e, f]`.
///
/// The result applies `a` first and then `b` (row-vector convention): the
/// linear 2×2 parts are multiplied and `a`'s translation is pushed through
/// `b`'s linear part before `b`'s own translation is added.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a11, a12, a21, a22, a_tx, a_ty] = *a;
    let [b11, b12, b21, b22, b_tx, b_ty] = *b;
    [
        a11 * b11 + a12 * b21,
        a11 * b12 + a12 * b22,
        a21 * b11 + a22 * b21,
        a21 * b12 + a22 * b22,
        a_tx * b11 + a_ty * b21 + b_tx,
        a_tx * b12 + a_ty * b22 + b_ty,
    ]
}
1974
/// Decode a PDF string operand into Rust `String`.
///
/// PDF strings inside marked-content properties (notably `/ActualText`)
/// may be encoded as:
///
/// - **UTF-16BE with BOM**: leading `0xFE 0xFF`, then big-endian 16-bit
///   code units. This is the canonical encoding for non-ASCII ActualText
///   (e.g. `fi` ligature, Greek/math symbols). Decoded via
///   `String::from_utf16_lossy` so invalid surrogate pairs become `U+FFFD`
///   rather than panicking. A dangling odd trailing byte is ignored.
/// - **UTF-8 with BOM** (PDF 2.0): leading `0xEF 0xBB 0xBF`, then UTF-8.
///   Decoded via `String::from_utf8_lossy`, so malformed sequences become
///   `U+FFFD`; the BOM itself is not part of the result.
/// - **PDFDocEncoding** (the catch-all for non-BOM bytes). For the ASCII
///   subset (0x20-0x7E) PDFDocEncoding is identical to Latin-1. We
///   conservatively map byte-by-byte to `char`. A future revision can
///   plug in the full PDFDocEncoding table if a real PDF emerges with
///   high-bit characters in ActualText *without* a BOM (rare; most
///   producers emit a BOM when going outside ASCII).
fn decode_pdf_string(bytes: &[u8]) -> String {
    const UTF16_BE_BOM: [u8; 2] = [0xFE, 0xFF];
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    if let Some(payload) = bytes.strip_prefix(&UTF16_BE_BOM) {
        // `chunks_exact` silently drops an unpaired trailing byte.
        let code_units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&code_units)
    } else if let Some(payload) = bytes.strip_prefix(&UTF8_BOM) {
        String::from_utf8_lossy(payload).into_owned()
    } else {
        // Latin-1 style widening: every byte becomes the code point of the
        // same numeric value.
        bytes.iter().map(|&b| char::from(b)).collect()
    }
}
2003
2004/// Resolve a `MarkedContentProps` to `(mcid, actual_text)`.
2005///
2006/// For `Inline` props, walk the map: `/MCID` (Integer, must fit in `u32`)
2007/// becomes `mcid`; `/ActualText` (String) is decoded via `decode_pdf_string`.
2008///
2009/// For `ResourceRef(name)`, look up `properties.get(name)`. If found and
2010/// it's a Dictionary, extract `/MCID` and `/ActualText` from there. If
2011/// not found (or the named entry is not a dict), return `(None, None)`
2012/// — a malformed reference must not abort extraction.
2013fn resolve_props(
2014 props: &crate::parser::content::MarkedContentProps,
2015 properties: Option<&crate::parser::objects::PdfDictionary>,
2016) -> (Option<u32>, Option<String>) {
2017 use crate::parser::content::{MarkedContentProps, MarkedContentValue};
2018
2019 let map_mcid_actual =
2020 |map: &std::collections::HashMap<String, MarkedContentValue>| -> (Option<u32>, Option<String>) {
2021 let mcid = match map.get("MCID") {
2022 Some(MarkedContentValue::Integer(n)) if *n >= 0 && *n <= u32::MAX as i64 => {
2023 Some(*n as u32)
2024 }
2025 _ => None,
2026 };
2027 let actual = match map.get("ActualText") {
2028 Some(MarkedContentValue::String(bytes)) => Some(decode_pdf_string(bytes)),
2029 _ => None,
2030 };
2031 (mcid, actual)
2032 };
2033
2034 match props {
2035 MarkedContentProps::Inline(map) => map_mcid_actual(map),
2036 MarkedContentProps::ResourceRef(name) => {
2037 let Some(properties) = properties else {
2038 return (None, None);
2039 };
2040 let Some(entry) = properties.get(name) else {
2041 return (None, None);
2042 };
2043 let crate::parser::objects::PdfObject::Dictionary(dict) = entry else {
2044 return (None, None);
2045 };
2046 let mcid = dict.get("MCID").and_then(|o| match o {
2047 crate::parser::objects::PdfObject::Integer(n)
2048 if *n >= 0 && *n <= u32::MAX as i64 =>
2049 {
2050 Some(*n as u32)
2051 }
2052 _ => None,
2053 });
2054 let actual_text = dict.get("ActualText").and_then(|o| match o {
2055 crate::parser::objects::PdfObject::String(s) => {
2056 Some(decode_pdf_string(s.as_bytes()))
2057 }
2058 _ => None,
2059 });
2060 (mcid, actual_text)
2061 }
2062 }
2063}
2064
2065/// Walk the marked-content stack from innermost (top) outward, returning the
2066/// first entry's `(mcid, tag)` pair whose `mcid` is `Some`. Returns
2067/// `(None, None)` when no ancestor declared an MCID — typical of non-tagged
2068/// PDFs, in which case the `None == None` grouping-key invariant preserves
2069/// legacy behaviour.
2070fn innermost_mc_tag(stack: &[MarkedContentEntry]) -> (Option<u32>, Option<String>) {
2071 stack
2072 .iter()
2073 .rev()
2074 .find(|e| e.mcid.is_some())
2075 .map_or((None, None), |e| (e.mcid, Some(e.tag.clone())))
2076}
2077
/// Map the point `(x, y)` through a six-element affine matrix
/// `[a, b, c, d, e, f]`, returning `(a·x + c·y + e, b·x + d·y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
2084
2085/// Calculate text width using actual font metrics (including kerning)
2086fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
2087 // If we have font metrics, use them for accurate width calculation
2088 if let Some(font) = font_info {
2089 if let Some(ref widths) = font.metrics.widths {
2090 let first_char = font.metrics.first_char.unwrap_or(0);
2091 let last_char = font.metrics.last_char.unwrap_or(255);
2092 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
2093
2094 let mut total_width = 0.0;
2095 let mut chars = text.chars().peekable();
2096
2097 while let Some(ch) = chars.next() {
2098 let char_code = ch as u32;
2099
2100 // Get width from Widths array or use missing_width
2101 let width = if char_code >= first_char && char_code <= last_char {
2102 let index = (char_code - first_char) as usize;
2103 widths.get(index).copied().unwrap_or(missing_width)
2104 } else {
2105 missing_width
2106 };
2107
2108 // Convert from glyph space (1/1000 units) to user space
2109 total_width += width / 1000.0 * font_size;
2110
2111 // Apply kerning if available (for character pairs)
2112 if let Some(ref kerning) = font.metrics.kerning {
2113 if let Some(&next_ch) = chars.peek() {
2114 let next_char = next_ch as u32;
2115 if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
2116 // Kerning is in FUnits (1/1000), convert to user space
2117 total_width += kern_value / 1000.0 * font_size;
2118 }
2119 }
2120 }
2121 }
2122
2123 return total_width;
2124 }
2125 }
2126
2127 // Fallback to simplified calculation if no metrics available
2128 text.len() as f64 * font_size * 0.5
2129}
2130
2131/// Compute advance width from the original character **codes**, not the decoded
2132/// Unicode text.
2133///
2134/// A simple font's `Widths` array is indexed by character code (`first_char..=
2135/// last_char`), i.e. the byte value in the content stream — not by the Unicode
2136/// codepoint the code decodes to. [`calculate_text_width`] indexes by the decoded
2137/// codepoint (`ch as u32`), which is correct only when code == codepoint (ASCII /
2138/// WinAnsi fonts). For custom-encoded fonts (Type1 with `Differences`, embedded
2139/// Computer Modern in LaTeX PDFs, ToUnicode remaps) the codepoint diverges from
2140/// the code, so the wrong slot — or `missing_width` — is read, desyncing glyph
2141/// advance and scrambling word order once fragments are sorted by position
2142/// (issue #302).
2143///
2144/// `decoded` is the already-decoded text for this run; it is only consulted for
2145/// composite (Type0) fonts, whose multi-byte codes cannot be indexed byte-wise
2146/// and whose width path is unchanged here to avoid regressing CJK extraction.
2147fn calculate_text_width_from_codes(
2148 codes: &[u8],
2149 decoded: &str,
2150 font_size: f64,
2151 font_info: Option<&FontInfo>,
2152) -> f64 {
2153 // Composite (Type0) fonts use multi-byte codes; a single byte is not a code,
2154 // so byte-indexed width lookup is invalid. Preserve the existing decoded-based
2155 // behavior for them.
2156 let is_composite =
2157 font_info.is_some_and(|f| f.font_type == "Type0" || f.descendant_font.is_some());
2158 if is_composite {
2159 return calculate_text_width(decoded, font_size, font_info);
2160 }
2161
2162 if let Some(font) = font_info {
2163 if let Some(ref widths) = font.metrics.widths {
2164 let first_char = font.metrics.first_char.unwrap_or(0);
2165 let last_char = font.metrics.last_char.unwrap_or(255);
2166 let missing_width = font.metrics.missing_width.unwrap_or(500.0);
2167
2168 let mut total_width = 0.0;
2169 let mut iter = codes.iter().peekable();
2170 while let Some(&byte) = iter.next() {
2171 let code = byte as u32;
2172 let width = if code >= first_char && code <= last_char {
2173 widths
2174 .get((code - first_char) as usize)
2175 .copied()
2176 .unwrap_or(missing_width)
2177 } else {
2178 missing_width
2179 };
2180 total_width += width / 1000.0 * font_size;
2181
2182 // Kerning is keyed by code pair, consistent with code-based widths.
2183 if let Some(ref kerning) = font.metrics.kerning {
2184 if let Some(&next_byte) = iter.peek() {
2185 if let Some(&kern_value) = kerning.get(&(code, *next_byte as u32)) {
2186 total_width += kern_value / 1000.0 * font_size;
2187 }
2188 }
2189 }
2190 }
2191
2192 return total_width;
2193 }
2194 }
2195
2196 // No metrics: one fallback width per code (byte), the simple-font glyph count.
2197 codes.len() as f64 * font_size * 0.5
2198}
2199
/// Sanitize extracted text by removing or replacing control characters.
///
/// Addresses Issue #116, where extracted text contains NUL bytes (`\0`) and
/// ETX characters (`\u{3}`) in places where spaces should appear.
///
/// # Behavior
///
/// - A NUL (`\0`) becomes a space; an ETX (`\u{3}`) is dropped, so the common
///   `\0\u{3}` word-separator pair yields exactly one space
/// - Tab (0x09), line feed (0x0A) and carriage return (0x0D) are kept
/// - Every other ASCII control character (0x01-0x1F and 0x7F) is removed
/// - Runs of spaces collapse to a single space; a space directly after a tab
///   is also dropped, while `\n` / `\r` restart the run
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::sanitize_extracted_text;
///
/// // Issue #116 pattern: NUL+ETX as word separator
/// let dirty = "a\0\u{3}sergeant\0\u{3}and";
/// assert_eq!(sanitize_extracted_text(dirty), "a sergeant and");
///
/// // Standalone NUL becomes space
/// let with_nul = "word\0another";
/// assert_eq!(sanitize_extracted_text(with_nul), "word another");
///
/// // Clean text passes through unchanged
/// let clean = "Normal text";
/// assert_eq!(sanitize_extracted_text(clean), "Normal text");
/// ```
pub fn sanitize_extracted_text(text: &str) -> String {
    // The output never exceeds the input: each char is kept, dropped, or
    // (for NUL) swapped for an equally sized space.
    let mut cleaned = String::with_capacity(text.len());
    // True while a further space would be redundant.
    let mut suppress_space = false;

    for ch in text.chars() {
        match ch {
            // NUL acts as a space; both take part in space collapsing.
            '\0' | ' ' => {
                if !suppress_space {
                    cleaned.push(' ');
                    suppress_space = true;
                }
            }
            // Allowed whitespace; only a tab keeps suppressing a following space.
            '\t' | '\n' | '\r' => {
                cleaned.push(ch);
                suppress_space = ch == '\t';
            }
            // Remaining control characters (ETX included) vanish without
            // touching the space-collapsing state.
            c if c.is_ascii_control() => {}
            printable => {
                cleaned.push(printable);
                suppress_space = false;
            }
        }
    }

    cleaned
}
2292
2293/// Assign a logical row identifier to each fragment based on Y-up-jumps in
2294/// emission order. Used by `merge_into_lines` to distinguish columns in
2295/// multi-column layouts where a single outer BDC scope makes mcid uniform.
2296///
2297/// Increments `row_id` whenever the next fragment's Y exceeds the previous
2298/// by more than `max(font_size * 0.5, 2.0)`. Superscripts (small positive
2299/// deltas) and normal line descents (negative deltas) leave `row_id`
2300/// unchanged. See `docs/superpowers/specs/2026-05-23-issue-265-line-interleaving-design.md`.
2301///
2302/// # Invariants
2303/// Returns a `Vec<u32>` with exactly `fragments.len()` elements — one
2304/// row id per input fragment, in input order. Callers may safely `.zip(fragments)`.
2305fn assign_row_ids(fragments: &[TextFragment]) -> Vec<u32> {
2306 let mut result = Vec::with_capacity(fragments.len());
2307 let mut row_id: u32 = 0;
2308 let mut prev_y: Option<f64> = None;
2309 for frag in fragments {
2310 if let Some(py) = prev_y {
2311 let delta = frag.y - py;
2312 // Threshold anchored to the arriving fragment's font_size; for the
2313 // symmetric same-font case (body→body, same font) this is equivalent
2314 // to anchoring to the previous fragment.
2315 let threshold = (frag.font_size * 0.5).max(2.0);
2316 if delta > threshold {
2317 row_id += 1;
2318 }
2319 }
2320 result.push(row_id);
2321 prev_y = Some(frag.y);
2322 }
2323 debug_assert_eq!(
2324 result.len(),
2325 fragments.len(),
2326 "assign_row_ids: output length must equal input length"
2327 );
2328 result
2329}
2330
2331/// Decide whether a single visual line should be read in emission order.
2332///
2333/// `line` holds `(emission_index, fragment)` pairs for one visual line in any
2334/// order. Returns `true` when, walked in emission order, the line has no
2335/// DISJOINT backward x-step — i.e. no fragment lands entirely to the LEFT of
2336/// everything emitted so far on the line. Such a left jump is the signature of
2337/// a genuinely scrambled stream (right-to-left / random generators), for which
2338/// x-order is authoritative.
2339///
2340/// The comparison is against the line's running left edge, not the immediately
2341/// preceding fragment: dense bodies are split into sub-word glyph runs, so a
2342/// run that legitimately backfills the line (a font-switched math symbol, or a
2343/// word whose run starts left of the previous short run — #302 symptom 1 /
2344/// #305) overlaps the *covered span* even when it does not overlap the single
2345/// fragment right before it. As long as it does not jump past the line's left
2346/// edge, emission order is preserved. Lines that are already x-monotone in
2347/// emission satisfy this trivially and decode identically under either policy.
2348fn line_prefers_emission_order(line: &[(usize, &TextFragment)]) -> bool {
2349 if line.len() < 2 {
2350 return true;
2351 }
2352 let mut em: Vec<&(usize, &TextFragment)> = line.iter().collect();
2353 em.sort_by_key(|&&(idx, _)| idx);
2354 let mut min_start = em[0].1.x;
2355 for &&(_, f) in &em[1..] {
2356 let end = f.x + f.width;
2357 // A fragment whose right edge is at or left of the leftmost glyph seen
2358 // so far is a true backward jump — emission order is not reading order.
2359 if end <= min_start {
2360 return false;
2361 }
2362 min_start = min_start.min(f.x);
2363 }
2364 true
2365}
2366
/// Space-glyph advance width (1000-em units) for the Adobe Core-14 base fonts,
/// keyed by `/BaseFont`.
///
/// Everything up to and including the last `+` (a subset prefix such as
/// `ABCDEF+`) is discarded and the remainder is matched case-insensitively.
/// Common substitute names (Arial→Helvetica, TimesNewRoman→Times,
/// CourierNew→Courier) match their metric-compatible base through substring
/// tests; `Symbol` must match exactly. Returns `None` for unknown fonts,
/// which leaves the caller on its fixed-fraction fallback. These fonts
/// legitimately ship no `/Widths` array, so their space metric is only
/// available here.
fn standard_14_space_width(base_font: &str) -> Option<f64> {
    // Substring rules, tested in order; the first hit wins.
    const FAMILY_RULES: [(&str, f64); 4] = [
        ("courier", 600.0),
        ("helvetica", 278.0),
        ("arial", 278.0),
        ("times", 250.0),
    ];

    let family = match base_font.rfind('+') {
        Some(plus) => &base_font[plus + 1..],
        None => base_font,
    }
    .to_ascii_lowercase();

    if let Some(&(_, width)) = FAMILY_RULES
        .iter()
        .find(|&&(needle, _)| family.contains(needle))
    {
        return Some(width);
    }
    if family == "symbol" {
        return Some(250.0);
    }
    // "zapfdingbats" contains "dingbats", so one substring test covers both.
    family.contains("dingbats").then_some(278.0)
}
2390
2391#[cfg(test)]
2392mod tests {
2393 use super::*;
2394
2395 #[test]
2396 fn test_matrix_multiplication() {
2397 let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2398 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
2399
2400 let result = multiply_matrix(&identity, &translation);
2401 assert_eq!(result, translation);
2402
2403 let result2 = multiply_matrix(&translation, &identity);
2404 assert_eq!(result2, translation);
2405 }
2406
2407 #[test]
2408 fn test_transform_point() {
2409 let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
2410 let (x, y) = transform_point(5.0, 5.0, &translation);
2411 assert_eq!(x, 15.0);
2412 assert_eq!(y, 25.0);
2413 }
2414
2415 #[test]
2416 fn test_extraction_options_default() {
2417 let options = ExtractionOptions::default();
2418 assert!(!options.preserve_layout);
2419 assert_eq!(options.space_threshold, 0.3);
2420 assert_eq!(options.newline_threshold, 10.0);
2421 assert!(options.sort_by_position);
2422 assert!(!options.detect_columns);
2423 assert_eq!(options.column_threshold, 50.0);
2424 assert!(options.merge_hyphenated);
2425 }
2426
2427 #[test]
2428 fn test_extraction_options_custom() {
2429 let options = ExtractionOptions {
2430 preserve_layout: true,
2431 space_threshold: 0.5,
2432 tj_space_threshold: 0.15,
2433 newline_threshold: 15.0,
2434 sort_by_position: false,
2435 detect_columns: true,
2436 column_threshold: 75.0,
2437 merge_hyphenated: false,
2438 track_space_decisions: false,
2439 reconstruct_paragraphs: false,
2440 include_artifacts: false,
2441 };
2442 assert!(options.preserve_layout);
2443 assert_eq!(options.space_threshold, 0.5);
2444 assert_eq!(options.tj_space_threshold, 0.15);
2445 assert_eq!(options.newline_threshold, 15.0);
2446 assert!(!options.sort_by_position);
2447 assert!(options.detect_columns);
2448 assert_eq!(options.column_threshold, 75.0);
2449 assert!(!options.merge_hyphenated);
2450 }
2451
2452 #[test]
2453 fn test_parse_font_style_bold() {
2454 // PostScript style
2455 assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
2456 assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
2457
2458 // TrueType style
2459 assert_eq!(parse_font_style("Arial Bold"), (true, false));
2460 assert_eq!(parse_font_style("Calibri Bold"), (true, false));
2461
2462 // Short form
2463 assert_eq!(parse_font_style("Helvetica-B"), (true, false));
2464 }
2465
2466 #[test]
2467 fn test_parse_font_style_italic() {
2468 // PostScript style
2469 assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
2470 assert_eq!(parse_font_style("Times-Oblique"), (false, true));
2471
2472 // TrueType style
2473 assert_eq!(parse_font_style("Arial Italic"), (false, true));
2474 assert_eq!(parse_font_style("Courier Oblique"), (false, true));
2475
2476 // Short form
2477 assert_eq!(parse_font_style("Helvetica-I"), (false, true));
2478 }
2479
2480 #[test]
2481 fn test_parse_font_style_bold_italic() {
2482 assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
2483 assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
2484 assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
2485 }
2486
2487 #[test]
2488 fn test_parse_font_style_regular() {
2489 assert_eq!(parse_font_style("Helvetica"), (false, false));
2490 assert_eq!(parse_font_style("Times-Roman"), (false, false));
2491 assert_eq!(parse_font_style("Courier"), (false, false));
2492 assert_eq!(parse_font_style("Arial"), (false, false));
2493 }
2494
2495 #[test]
2496 fn test_parse_font_style_edge_cases() {
2497 // Empty and unusual cases
2498 assert_eq!(parse_font_style(""), (false, false));
2499 assert_eq!(parse_font_style("UnknownFont"), (false, false));
2500
2501 // Case insensitive
2502 assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
2503 assert_eq!(parse_font_style("times-ITALIC"), (false, true));
2504 }
2505
2506 #[test]
2507 fn test_text_fragment() {
2508 let fragment = TextFragment {
2509 text: "Hello".to_string(),
2510 x: 100.0,
2511 y: 200.0,
2512 width: 50.0,
2513 height: 12.0,
2514 font_size: 10.0,
2515 font_name: None,
2516 is_bold: false,
2517 is_italic: false,
2518 color: None,
2519 space_decisions: Vec::new(),
2520 mcid: None,
2521 struct_tag: None,
2522 };
2523 assert_eq!(fragment.text, "Hello");
2524 assert_eq!(fragment.x, 100.0);
2525 assert_eq!(fragment.y, 200.0);
2526 assert_eq!(fragment.width, 50.0);
2527 assert_eq!(fragment.height, 12.0);
2528 assert_eq!(fragment.font_size, 10.0);
2529 }
2530
2531 #[test]
2532 fn test_extracted_text() {
2533 let fragments = vec![
2534 TextFragment {
2535 text: "Hello".to_string(),
2536 x: 100.0,
2537 y: 200.0,
2538 width: 50.0,
2539 height: 12.0,
2540 font_size: 10.0,
2541 font_name: None,
2542 is_bold: false,
2543 is_italic: false,
2544 color: None,
2545 space_decisions: Vec::new(),
2546 mcid: None,
2547 struct_tag: None,
2548 },
2549 TextFragment {
2550 text: "World".to_string(),
2551 x: 160.0,
2552 y: 200.0,
2553 width: 50.0,
2554 height: 12.0,
2555 font_size: 10.0,
2556 font_name: None,
2557 is_bold: false,
2558 is_italic: false,
2559 color: None,
2560 space_decisions: Vec::new(),
2561 mcid: None,
2562 struct_tag: None,
2563 },
2564 ];
2565
2566 let extracted = ExtractedText {
2567 text: "Hello World".to_string(),
2568 fragments: fragments,
2569 };
2570
2571 assert_eq!(extracted.text, "Hello World");
2572 assert_eq!(extracted.fragments.len(), 2);
2573 assert_eq!(extracted.fragments[0].text, "Hello");
2574 assert_eq!(extracted.fragments[1].text, "World");
2575 }
2576
2577 #[test]
2578 fn test_text_state_default() {
2579 let state = TextState::default();
2580 assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
2581 assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
2582 assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
2583 assert_eq!(state.leading, 0.0);
2584 assert_eq!(state.char_space, 0.0);
2585 assert_eq!(state.word_space, 0.0);
2586 assert_eq!(state.horizontal_scale, 100.0);
2587 assert_eq!(state.text_rise, 0.0);
2588 assert_eq!(state.font_size, 0.0);
2589 assert!(state.font_name.is_none());
2590 assert_eq!(state.render_mode, 0);
2591 }
2592
2593 #[test]
2594 fn test_matrix_operations() {
2595 // Test rotation matrix
2596 let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
2597 let (x, y) = transform_point(1.0, 0.0, &rotation);
2598 assert_eq!(x, 0.0);
2599 assert_eq!(y, 1.0);
2600
2601 // Test scaling matrix
2602 let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
2603 let (x, y) = transform_point(5.0, 5.0, &scale);
2604 assert_eq!(x, 10.0);
2605 assert_eq!(y, 15.0);
2606
2607 // Test complex transformation
2608 let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
2609 let (x, y) = transform_point(1.0, 1.0, &complex);
2610 assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
2611 assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
2612 }
2613
2614 #[test]
2615 fn test_text_extractor_new() {
2616 let extractor = TextExtractor::new();
2617 let options = extractor.options;
2618 assert!(!options.preserve_layout);
2619 assert_eq!(options.space_threshold, 0.3);
2620 assert_eq!(options.newline_threshold, 10.0);
2621 assert!(options.sort_by_position);
2622 assert!(!options.detect_columns);
2623 assert_eq!(options.column_threshold, 50.0);
2624 assert!(options.merge_hyphenated);
2625 }
2626
2627 #[test]
2628 fn test_text_extractor_with_options() {
2629 let options = ExtractionOptions {
2630 preserve_layout: true,
2631 space_threshold: 0.3,
2632 tj_space_threshold: 0.2,
2633 newline_threshold: 12.0,
2634 sort_by_position: false,
2635 detect_columns: true,
2636 column_threshold: 60.0,
2637 merge_hyphenated: false,
2638 track_space_decisions: false,
2639 reconstruct_paragraphs: false,
2640 include_artifacts: false,
2641 };
2642 let extractor = TextExtractor::with_options(options.clone());
2643 assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
2644 assert_eq!(extractor.options.space_threshold, options.space_threshold);
2645 assert_eq!(
2646 extractor.options.newline_threshold,
2647 options.newline_threshold
2648 );
2649 assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
2650 assert_eq!(extractor.options.detect_columns, options.detect_columns);
2651 assert_eq!(extractor.options.column_threshold, options.column_threshold);
2652 assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
2653 }
2654
2655 // =========================================================================
2656 // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
2657 // =========================================================================
2658
2659 #[test]
2660 fn test_calculate_text_width_with_no_font_info() {
2661 // Test fallback: should use simplified calculation
2662 let width = calculate_text_width("Hello", 12.0, None);
2663
2664 // Expected: 5 chars * 12.0 * 0.5 = 30.0
2665 assert_eq!(
2666 width, 30.0,
2667 "Without font info, should use simplified calculation: len * font_size * 0.5"
2668 );
2669 }
2670
2671 #[test]
2672 fn test_calculate_text_width_with_empty_metrics() {
2673 use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2674
2675 // Font with no widths array
2676 let font_info = FontInfo {
2677 name: "TestFont".to_string(),
2678 font_type: "Type1".to_string(),
2679 encoding: None,
2680 to_unicode: None,
2681 differences: None,
2682 descendant_font: None,
2683 cid_to_gid_map: None,
2684 cid_ordering: None,
2685 metrics: FontMetrics {
2686 first_char: None,
2687 last_char: None,
2688 widths: None,
2689 missing_width: Some(500.0),
2690 kerning: None,
2691 },
2692 cid_encoding: None,
2693 };
2694
2695 let width = calculate_text_width("Hello", 12.0, Some(&font_info));
2696
2697 // Should fall back to simplified calculation
2698 assert_eq!(
2699 width, 30.0,
2700 "Without widths array, should fall back to simplified calculation"
2701 );
2702 }
2703
// With a widths array covering codes 32..=126, the width of "Hello" must be
// the sum of the per-glyph widths scaled by font_size / 1000.
#[test]
fn test_calculate_text_width_with_complete_metrics() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // 95 entries for codes 32..=126; only the glyphs of "Hello" are non-zero
    // (Helvetica-like values in 1/1000 units).
    let mut table = vec![0.0; 95];
    table['H' as usize - 32] = 722.0;
    table['e' as usize - 32] = 556.0;
    table['l' as usize - 32] = 278.0;
    table['o' as usize - 32] = 611.0;

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(table),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
        cid_encoding: None,
    };

    let width = calculate_text_width("Hello", 12.0, Some(&font));

    // H + e + l + l + o = 722 + 556 + 278 + 278 + 611 = 2445 glyph units,
    // i.e. 2445 / 1000 * 12 = 29.34 at 12pt.
    let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
    let delta = (width - expected).abs();
    assert!(
        delta < 0.0001,
        "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
        expected,
        width,
        (width - expected).abs()
    );

    // The metrics-based result must not coincide with the len * size * 0.5
    // estimate (5 * 12 * 0.5 = 30.0).
    let simplified = 5.0 * 12.0 * 0.5;
    assert_ne!(
        width, simplified,
        "Metrics-based calculation should differ from simplified (30.0)"
    );
}
2763
// Issue #302 regression: on a custom-encoded simple font the advance width
// must be looked up by the raw character CODE, not by the Unicode codepoint
// the code decodes to.
#[test]
fn width_from_codes_uses_char_code_not_decoded_unicode() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Widths are indexed by code: code 1 -> 1000, code 2 -> 100. The decoded
    // text for those codes is "mi" (U+006D, U+0069), whose codepoints 109 and
    // 105 lie far outside the [1, 2] code range.
    let metrics = FontMetrics {
        first_char: Some(1),
        last_char: Some(2),
        widths: Some(vec![1000.0, 100.0]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("F1"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
        cid_encoding: None,
    };

    let codes = [1u8, 2u8];
    let decoded = "mi"; // the text these two codes decode to

    // Code-indexed lookup: (1000 + 100) / 1000 * 10 = 11.0
    let expected = (1000.0 + 100.0) / 1000.0 * 10.0;
    let width = calculate_text_width_from_codes(&codes, decoded, 10.0, Some(&font));
    assert!(
        (width - expected).abs() < 1e-6,
        "width must come from char codes: expected {expected}, got {width}"
    );

    // Unicode-indexed lookup (the bug): both codepoints miss the range and
    // take missing_width, giving (500 + 500) / 1000 * 10 = 10.0.
    let buggy = calculate_text_width(decoded, 10.0, Some(&font));
    assert_eq!(buggy, 10.0);
    assert_ne!(
        width, buggy,
        "code-based width must differ from the Unicode-indexed bug"
    );
}
2812
// A character whose code falls outside [first_char, last_char] must be
// measured with `missing_width`, while in-range characters use the widths
// array.
#[test]
fn test_calculate_text_width_character_outside_range() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Font with narrow range (only covers 'A'-'Z')
    let widths = vec![722.0; 26]; // All uppercase letters same width

    let font_info = FontInfo {
        name: "TestFont".to_string(),
        font_type: "Type1".to_string(),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: FontMetrics {
            first_char: Some(65), // 'A'
            last_char: Some(90),  // 'Z'
            widths: Some(widths),
            missing_width: Some(500.0),
            kerning: None,
        },
        cid_encoding: None,
    };

    // Test with character outside range
    let width = calculate_text_width("A1", 10.0, Some(&font_info));

    // Expected:
    // 'A' (65) is in range: 722/1000 * 10 = 7.22
    // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
    // Total: 12.22
    let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);

    // Compare with a tolerance rather than exact equality: the total is a sum
    // of scaled floats, so its last bit depends on the order of operations in
    // the implementation. The sibling metrics and kerning tests do the same.
    let tolerance = 0.0001;
    assert!(
        (width - expected).abs() < tolerance,
        "Should use missing_width for characters outside range: expected {}, got {}",
        expected,
        width
    );
}
2852
// An in-range character whose widths-array entry is 0.0 must keep that 0.0;
// `missing_width` is not substituted for an explicit zero width.
#[test]
fn test_calculate_text_width_missing_width_in_array() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Codes 32..=126 all 500 wide, except index 10 (code 32 + 10 = 42, '*').
    let mut table = vec![500.0; 95];
    table[10] = 0.0;

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(table),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
        cid_encoding: None,
    };

    // Code 42 is '*'.
    let star = char::from(42u8).to_string();
    let width = calculate_text_width(star.as_str(), 10.0, Some(&font));

    // 0.0 is treated as a legitimate width (zero-width glyph), so the result
    // must be 0.0 and not 600 / 1000 * 10.
    assert_eq!(
        width, 0.0,
        "Should use 0.0 width from array, not missing_width"
    );
}
2892
// The empty string measures 0.0 both with and without font information.
#[test]
fn test_calculate_text_width_empty_string() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
        cid_encoding: None,
    };

    // With font metrics available.
    let with_font = calculate_text_width("", 12.0, Some(&font));
    assert_eq!(with_font, 0.0, "Empty string should have zero width");

    // Without any font information.
    let without_font = calculate_text_width("", 12.0, None);
    assert_eq!(
        without_font, 0.0,
        "Empty string should have zero width (no font)"
    );
}
2926
// A non-ASCII character whose codepoint lies beyond `last_char` is measured
// with `missing_width`.
#[test]
fn test_calculate_text_width_unicode_characters() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Widths only cover the printable ASCII codes 32..=126.
    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(vec![500.0; 95]),
        missing_width: Some(600.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
        cid_encoding: None,
    };

    // 'Ñ' is U+00D1 (209), above the 126 upper bound.
    let measured = calculate_text_width("Ñ", 10.0, Some(&font));

    // missing_width: 600 / 1000 * 10 = 6.0
    assert_eq!(
        measured, 6.0,
        "Unicode character outside range should use missing_width"
    );
}
2961
// The measured width of a glyph scales linearly with the font size.
#[test]
fn test_calculate_text_width_different_font_sizes() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Single-glyph font: only 'A' (code 65) has a width entry.
    let metrics = FontMetrics {
        first_char: Some(65),
        last_char: Some(65),
        widths: Some(vec![722.0]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let font = FontInfo {
        name: String::from("TestFont"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
        cid_encoding: None,
    };

    // Same glyph at 10pt and at 20pt.
    let at_10pt = calculate_text_width("A", 10.0, Some(&font));
    let at_20pt = calculate_text_width("A", 20.0, Some(&font));

    // Each size matches width / 1000 * font_size ...
    assert_eq!(at_10pt, 722.0 / 1000.0 * 10.0);
    assert_eq!(at_20pt, 722.0 / 1000.0 * 20.0);
    // ... and doubling the size doubles the width.
    assert_eq!(
        at_20pt,
        at_10pt * 2.0,
        "Width should scale linearly with font size"
    );
}
2998
// The same character measured against a proportional and a monospace widths
// table must come out narrower in the proportional font.
#[test]
fn test_calculate_text_width_proportional_vs_monospace() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};

    // Proportional table for codes 105..=107 ('i', 'j', 'k'); only the first
    // entry (278 for 'i') is exercised below.
    let proportional_metrics = FontMetrics {
        first_char: Some(105),
        last_char: Some(107),
        widths: Some(vec![278.0, 556.0, 722.0]),
        missing_width: Some(500.0),
        kerning: None,
    };
    let proportional_font = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: proportional_metrics,
        cid_encoding: None,
    };

    // Monospace table for the same code range: every glyph is 600 wide.
    let monospace_metrics = FontMetrics {
        first_char: Some(105),
        last_char: Some(107),
        widths: Some(vec![600.0, 600.0, 600.0]),
        missing_width: Some(600.0),
        kerning: None,
    };
    let monospace_font = FontInfo {
        name: String::from("Courier"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics: monospace_metrics,
        cid_encoding: None,
    };

    let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
    let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));

    // 278-unit 'i' versus 600-unit 'i'.
    assert!(
        prop_width < mono_width,
        "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
        prop_width,
        mono_width
    );
}
3056
3057 // =========================================================================
3058 // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
3059 // =========================================================================
3060
// Kerning pairs present in the font metrics must be added to the summed glyph
// widths; pairs that are absent must leave the width untouched.
#[test]
fn test_calculate_text_width_with_kerning() {
    use crate::text::extraction_cmap::{FontInfo, FontMetrics};
    use std::collections::HashMap;

    // Widths for codes 32..=126, defaulting to 500 with three overrides.
    let mut table = vec![500.0; 95];
    table['A' as usize - 32] = 722.0;
    table['V' as usize - 32] = 722.0;
    table['W' as usize - 32] = 944.0;

    // Kerning adjustments in 1/1000 units, keyed by (left code, right code):
    // 'A'+'V' tightens by 50, 'A'+'W' tightens by 40.
    let pairs = HashMap::from([((65, 86), -50.0), ((65, 87), -40.0)]);

    let metrics = FontMetrics {
        first_char: Some(32),
        last_char: Some(126),
        widths: Some(table),
        missing_width: Some(500.0),
        kerning: Some(pairs),
    };
    let font = FontInfo {
        name: String::from("Helvetica"),
        font_type: String::from("Type1"),
        encoding: None,
        to_unicode: None,
        differences: None,
        descendant_font: None,
        cid_to_gid_map: None,
        cid_ordering: None,
        metrics,
        cid_encoding: None,
    };

    let tolerance = 0.0001;

    // "AV": (722 + 722) / 1000 * 12 - 50 / 1000 * 12 = 17.328 - 0.6 = 16.728
    let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
    let width_av = calculate_text_width("AV", 12.0, Some(&font));
    assert!(
        (width_av - expected_av).abs() < tolerance,
        "AV with kerning: expected {}, got {}, diff {}",
        expected_av,
        width_av,
        (width_av - expected_av).abs()
    );

    // "AW": (722 + 944) / 1000 * 12 - 40 / 1000 * 12 = 19.992 - 0.48 = 19.512
    let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
    let width_aw = calculate_text_width("AW", 12.0, Some(&font));
    assert!(
        (width_aw - expected_aw).abs() < tolerance,
        "AW with kerning: expected {}, got {}, diff {}",
        expected_aw,
        width_aw,
        (width_aw - expected_aw).abs()
    );

    // "VA": the pair is not in the map, so no adjustment applies:
    // (722 + 722) / 1000 * 12 = 17.328
    let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
    let width_va = calculate_text_width("VA", 12.0, Some(&font));
    assert!(
        (width_va - expected_va).abs() < tolerance,
        "VA without kerning: expected {}, got {}, diff {}",
        expected_va,
        width_va,
        (width_va - expected_va).abs()
    );

    // The kerned pair must be measurably narrower than the unkerned one.
    assert!(
        width_av < width_va,
        "AV with kerning ({}) should be narrower than VA without kerning ({})",
        width_av,
        width_va
    );
}
3143
// Hand-assembles a 128-byte TrueType file containing a 'head' table and a
// format-0 'kern' table with two pairs, then checks that
// `parse_truetype_kern_table` returns exactly those two pairs.
#[test]
fn test_parse_truetype_kern_table_minimal() {
    use crate::text::extraction_cmap::parse_truetype_kern_table;

    // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
    // Structure:
    // 1. Offset table (12 bytes)
    // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
    // 3. 'head' table data (54 bytes)
    // 4. 'kern' table data (30 bytes)
    // Total: 128 bytes
    let mut ttf_data = vec![
        // Offset table
        0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
        0x00, 0x02, // numTables: 2
        0x00, 0x20, // searchRange: 32
        0x00, 0x01, // entrySelector: 1
        0x00, 0x00, // rangeShift: 0
    ];

    // Table directory entry 1: 'head' table
    ttf_data.extend_from_slice(b"head"); // tag
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54

    // Table directory entry 2: 'kern' table
    ttf_data.extend_from_slice(b"kern"); // tag
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
    ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)

    // 'head' table data (54 bytes of zeros - minimal valid head table)
    ttf_data.extend_from_slice(&[0u8; 54]);

    // 'kern' table data (30 bytes: 4 table header + 6 subtable header
    // + 8 format-0 header + 2 pairs * 6)
    // NOTE(review): the OpenType `kern` spec places the subtable format in
    // bits 8-15 of `coverage` and the horizontal flag in bit 0, and defines
    // searchRange/entrySelector/rangeShift from nPairs; the values below do
    // not follow that exactly — presumably the parser ignores these fields,
    // confirm against `parse_truetype_kern_table`.
    ttf_data.extend_from_slice(&[
        // Kern table header
        0x00, 0x00, // version: 0
        0x00, 0x01, // nTables: 1
        // Subtable header
        0x00, 0x00, // version: 0
        0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
        0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
        0x00, 0x02, // nPairs: 2
        0x00, 0x08, // searchRange: 8
        0x00, 0x00, // entrySelector: 0
        0x00, 0x04, // rangeShift: 4
        // Kerning pair 1: A + V → -50
        0x00, 0x41, // left glyph: 65 ('A')
        0x00, 0x56, // right glyph: 86 ('V')
        0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
        // Kerning pair 2: A + W → -40
        0x00, 0x41, // left glyph: 65 ('A')
        0x00, 0x57, // right glyph: 87 ('W')
        0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
    ]);

    // Parsing must succeed before the map contents are inspected.
    let result = parse_truetype_kern_table(&ttf_data);
    assert!(
        result.is_ok(),
        "Should parse minimal kern table successfully: {:?}",
        result.err()
    );

    let kerning_map = result.unwrap();
    assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");

    // Verify pair 1: A + V → -50
    assert_eq!(
        kerning_map.get(&(65, 86)),
        Some(&-50.0),
        "Should have A+V kerning pair with value -50"
    );

    // Verify pair 2: A + W → -40
    assert_eq!(
        kerning_map.get(&(65, 87)),
        Some(&-40.0),
        "Should have A+W kerning pair with value -40"
    );
}
3226
// A TrueType file whose table directory has no 'kern' entry must be handled
// without error and produce an empty kerning map.
#[test]
fn test_parse_kern_table_no_kern_table() {
    use crate::text::extraction_cmap::extract_truetype_kerning;

    // Layout: 12-byte offset table, one 16-byte directory entry ('head'),
    // then 54 zero bytes standing in for the 'head' table itself.
    let mut font_bytes: Vec<u8> = vec![
        0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
        0x00, 0x01, // numTables: 1
        0x00, 0x10, // searchRange: 16
        0x00, 0x00, // entrySelector: 0
        0x00, 0x00, // rangeShift: 0
    ];

    // Directory entry for 'head' (deliberately not 'kern').
    font_bytes.extend_from_slice(b"head"); // tag
    font_bytes.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
    font_bytes.extend_from_slice(&[0x00, 0x00, 0x00, 0x1C]); // offset: 28
    font_bytes.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54

    // Mock 'head' table data.
    font_bytes.extend_from_slice(&[0u8; 54]);

    let outcome = extract_truetype_kerning(&font_bytes);
    assert!(
        outcome.is_ok(),
        "Should gracefully handle missing kern table"
    );

    let pairs = outcome.unwrap();
    assert!(
        pairs.is_empty(),
        "Should return empty HashMap when no kern table exists"
    );
}
3266
3267 // Helper for paragraph-reconstruction unit tests. TextFragment has 11
3268 // fields so a helper keeps the test bodies focused on geometry.
3269 fn tf(text: &str, x: f64, y: f64, width: f64, font_size: f64) -> TextFragment {
3270 TextFragment {
3271 text: text.to_string(),
3272 x,
3273 y,
3274 width,
3275 height: font_size,
3276 font_size,
3277 font_name: None,
3278 is_bold: false,
3279 is_italic: false,
3280 color: None,
3281 space_decisions: Vec::new(),
3282 mcid: None,
3283 struct_tag: None,
3284 }
3285 }
3286
// Fragments sharing a baseline are merged into a single line; a different
// baseline starts a new line.
#[test]
fn merge_into_lines_groups_same_baseline_fragments() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // Three fragments at y=400 followed by two at y=386.
    let fragments = vec![
        tf("Hello", 50.0, 400.0, 30.0, 12.0),
        tf("world", 90.0, 400.0, 30.0, 12.0),
        tf("now.", 130.0, 400.0, 25.0, 12.0),
        tf("Next", 50.0, 386.0, 30.0, 12.0),
        tf("line.", 90.0, 386.0, 25.0, 12.0),
    ];

    let merged = extractor.merge_into_lines(&fragments);
    assert_eq!(
        merged.len(),
        2,
        "two distinct baselines must produce two line fragments"
    );
    assert_eq!(
        merged[0].text, "Hello world now.",
        "first line concatenated with spaces"
    );
    assert_eq!(merged[1].text, "Next line.", "second line concatenated");
}
3312
// A space is inserted between two same-line fragments only when the
// horizontal gap, relative to the font size, exceeds `space_threshold`.
#[test]
fn merge_into_lines_inserts_space_only_when_gap_exceeds_threshold() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        space_threshold: 0.3,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // Merges the given fragments and returns the text of the first line.
    let first_line_text =
        |fragments: Vec<TextFragment>| extractor.merge_into_lines(&fragments)[0].text.clone();

    // "AB" ends at x=60; "CD" starts at 64 -> 4pt gap = 0.33 x font size,
    // above the 0.3 threshold.
    let wide = first_line_text(vec![
        tf("AB", 50.0, 400.0, 10.0, 12.0),
        tf("CD", 64.0, 400.0, 10.0, 12.0),
    ]);
    assert_eq!(wide, "AB CD", "gap above threshold must insert space");

    // "CD" starts at 61 -> 1pt gap = 0.083 x font size, below the threshold.
    let narrow = first_line_text(vec![
        tf("AB", 50.0, 400.0, 10.0, 12.0),
        tf("CD", 61.0, 400.0, 10.0, 12.0),
    ]);
    assert_eq!(narrow, "ABCD", "tight gap must NOT insert space");
}
3339
// `standard_14_space_width` returns the space advance for the Core-14 base
// fonts, for subset-prefixed names and for metric-compatible substitutes, and
// `None` for anything else (#302 symptom 2).
#[test]
fn standard_14_space_width_maps_base_fonts_and_substitutes() {
    let cases = [
        // Core-14 base fonts
        ("Times-Roman", Some(250.0)),
        ("Times-BoldItalic", Some(250.0)),
        ("Helvetica", Some(278.0)),
        ("Courier-Bold", Some(600.0)),
        ("Symbol", Some(250.0)),
        ("ZapfDingbats", Some(278.0)),
        // subset prefix stripped
        ("ABCDEF+Times-Roman", Some(250.0)),
        // metric-compatible substitutes
        ("Arial-BoldMT", Some(278.0)),
        ("TimesNewRomanPSMT", Some(250.0)),
        ("CourierNewPSMT", Some(600.0)),
        // unknown / embedded fonts fall through to the caller's fallback
        ("Poppins-Regular", None),
        ("VUNXGH+Calibri", None),
    ];

    for (font_name, expected) in cases {
        assert_eq!(super::standard_14_space_width(font_name), expected);
    }
}
3372
// #302 symptom 1. A font-switched glyph (the italic "Z" of "to the Z boson")
// can be placed with an x-origin INSIDE the span of the roman run emitted
// before it ("to the"), even though the content stream delivers it in reading
// order. Sorting the row by x-origin alone would yield "Zto the". When the
// only backward steps in a row are span overlaps rather than disjoint jumps,
// the emission order has to win.
#[test]
fn merge_into_lines_keeps_emission_order_for_font_switch_overlap() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // Listed in emission (= reading) order.
    let fragments = vec![
        tf("to t", 455.5, 400.0, 12.0, 10.0), // spans 455.5 .. 467.5
        tf("he", 467.5, 400.0, 10.0, 10.0),   // spans 467.5 .. 477.5
        tf("Z", 455.3, 400.0, 23.0, 10.0),    // spans 455.3 .. 478.3, overlapping both
    ];

    let merged = extractor.merge_into_lines(&fragments);
    assert_eq!(merged.len(), 1);
    assert_eq!(
        merged[0].text, "to theZ",
        "overlapping font-switch fragment must keep emission (reading) order"
    );
}
3400
// #305. Dense justified text arrives as sub-word fragments. Here a later word
// ("described", x 492..537) starts at a backward x-origin that lies inside
// the span the line has already covered ("selections", 479..521) yet does not
// overlap the short fragment emitted immediately before it ("s", 517..521).
// Emission order is still reading order, so the overlap check must look at
// the line's running extent and not only at the previous fragment.
// (Real-world case: Higgs p5 "kinematic selections described in".)
#[test]
fn merge_into_lines_keeps_emission_when_run_backfills_covered_span() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    let fragments = vec![
        tf("selection", 479.0, 400.0, 38.0, 8.0), // spans 479..517
        tf("s", 517.0, 400.0, 4.0, 8.0),          // spans 517..521, short predecessor
        tf("d", 492.0, 400.0, 4.0, 8.0),          // spans 492..496, backfill clear of "s"
        tf("escribed", 496.0, 400.0, 41.0, 8.0),  // spans 496..537
    ];

    let merged = extractor.merge_into_lines(&fragments);
    assert_eq!(
        merged[0].text, "selectionsdescribed",
        "a run that backfills the line's covered span must keep emission order"
    );
}
3427
// Counterpart guard to the emission-order tests: when a fragment is emitted
// later but sits at a DISJOINT position to the left (a genuinely scrambled,
// untagged stream), the row must still be reordered by x.
#[test]
fn merge_into_lines_uses_x_order_for_disjoint_backward_jump() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // "the" is emitted after "boson" and shares no x-span with it.
    let fragments = vec![
        tf("boson", 100.0, 400.0, 28.0, 10.0), // spans 100 .. 128
        tf("the", 80.0, 400.0, 15.0, 10.0),    // spans 80 .. 95, entirely left of "boson"
    ];

    let merged = extractor.merge_into_lines(&fragments);
    assert_eq!(merged.len(), 1);
    assert_eq!(
        merged[0].text, "the boson",
        "disjoint backward emission jump must be reordered by x"
    );
}
3449
// The merged line's x/width must cover the union of its fragments' extents.
#[test]
fn merge_into_lines_unioned_bounding_box() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // "A" spans 50..60 and "B" spans 100..110 on the same baseline.
    let fragments = vec![
        tf("A", 50.0, 400.0, 10.0, 12.0),
        tf("B", 100.0, 400.0, 10.0, 12.0),
    ];

    let merged = extractor.merge_into_lines(&fragments);
    assert_eq!(merged.len(), 1);

    let line = &merged[0];
    assert!((line.x - 50.0).abs() < 0.01);
    assert!(
        (line.width - 60.0).abs() < 0.01,
        "width must span 50->110"
    );
}
3468
// Fragments whose y only ever decreases all stay in row 0.
#[test]
fn assign_row_ids_monotone_y_descending_keeps_zero() {
    let fragments = vec![
        tf("A", 50.0, 400.0, 10.0, 9.0),
        tf("B", 50.0, 395.0, 10.0, 9.0),
        tf("C", 50.0, 390.0, 10.0, 9.0),
    ];

    let ids = super::assign_row_ids(&fragments);
    assert_eq!(ids, vec![0u32; 3]);
}
3479
// An upward y jump larger than the threshold starts a new row id.
#[test]
fn assign_row_ids_increments_on_y_up_jump_above_threshold() {
    // At font_size 9 the threshold is max(9 * 0.5, 2.0) = 4.5.
    // Steps: 400 -> 395 is -5 (downward), 395 -> 420 is +25 (> 4.5).
    let fragments = vec![
        tf("A", 50.0, 400.0, 10.0, 9.0),
        tf("B", 50.0, 395.0, 10.0, 9.0),
        tf("C", 50.0, 420.0, 10.0, 9.0),
    ];

    let ids = super::assign_row_ids(&fragments);
    assert_eq!(ids, vec![0u32, 0, 1]);
}
3492
// A small upward step (superscript-sized) below the threshold must not start
// a new row.
#[test]
fn assign_row_ids_ignores_superscript_within_threshold() {
    // At font_size 9 the threshold is 4.5; the +2.5 step to the "^2" fragment
    // stays under it.
    let fragments = vec![
        tf("A", 50.0, 400.0, 10.0, 9.0),
        tf("^2", 60.0, 402.5, 5.0, 9.0),
        tf("B", 65.0, 395.0, 10.0, 9.0),
    ];

    let ids = super::assign_row_ids(&fragments);
    assert_eq!(ids, vec![0u32; 3]);
}
3504
// For tiny fonts the threshold is floored at 2.0pt instead of font_size * 0.5.
#[test]
fn assign_row_ids_floor_2pt_for_small_fonts() {
    // font_size 3 -> 3 * 0.5 = 1.5, lifted to the 2.0 floor. The +2.5 step
    // exceeds 2.0 and must therefore open a new row.
    let fragments = vec![
        tf("A", 50.0, 100.0, 10.0, 3.0),
        tf("B", 50.0, 102.5, 10.0, 3.0),
    ];

    let ids = super::assign_row_ids(&fragments);
    assert_eq!(ids, vec![0u32, 1]);
}
3516
// No fragments in, no row ids out.
#[test]
fn assign_row_ids_empty_slice_returns_empty() {
    let no_fragments = Vec::<TextFragment>::new();
    let ids = super::assign_row_ids(&no_fragments);
    assert!(ids.is_empty(), "empty input must yield empty output");
}
3523
// Two columns emitted one after the other must not be interleaved even when
// their lines sit at nearly the same y: the upward y jump between the columns
// separates them.
#[test]
fn merge_into_lines_splits_two_columns_emitted_sequentially() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // Emission order: column 1 top-down (400, 395), then column 2 top-down
    // (405, 400). The 395 -> 405 step is +10, above the 5pt threshold of a
    // 10pt font.
    let fragments = vec![
        tf("col1-top", 50.0, 400.0, 80.0, 10.0),
        tf("col1-bot", 50.0, 395.0, 80.0, 10.0),
        tf("col2-top", 200.0, 405.0, 80.0, 10.0),
        tf("col2-bot", 200.0, 400.0, 80.0, 10.0),
    ];

    let merged = extractor.merge_into_lines(&fragments);
    assert_eq!(
        merged.len(),
        4,
        "two columns at near-identical Y must split into 4 lines"
    );

    // Column 1 (row_id 0) comes first, then column 2 (row_id 1); inside each
    // column the lines are ordered by descending y.
    let expected = [
        ("col1-top", 400.0),
        ("col1-bot", 395.0),
        ("col2-top", 405.0),
        ("col2-bot", 400.0),
    ];
    for (line, (text, y)) in merged.iter().zip(expected) {
        assert_eq!(line.text, text);
        assert_eq!(line.y, y);
    }
}
3554
// A plain single column (two fragments on one baseline, then one on the next
// baseline down) collapses to exactly two lines.
#[test]
fn merge_into_lines_preserves_single_column_continuation() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // x grows along the first baseline, then the text drops to y=395.
    let fragments = vec![
        tf("Hello", 50.0, 400.0, 30.0, 10.0),
        tf("world", 90.0, 400.0, 30.0, 10.0),
        tf("next-line", 50.0, 395.0, 70.0, 10.0),
    ];

    let merged = extractor.merge_into_lines(&fragments);
    assert_eq!(
        merged.len(),
        2,
        "single column continuation must collapse to 2 lines"
    );

    let first = &merged[0].text;
    assert!(first.contains("Hello"));
    assert!(first.contains("world"));
    assert_eq!(merged[1].text, "next-line");
}
3577
// Regression guard for the #265 root cause: on NCSC page 12 one outer BDC
// wraps everything, so every fragment carries mcid=Some(0). The column split
// therefore has to come from row_id alone and must not rely on mcid.
#[test]
fn merge_into_lines_splits_columns_with_uniform_mcid() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // Same two-column layout as the sequential-columns test ...
    let mut fragments = vec![
        tf("col1-top", 50.0, 400.0, 80.0, 10.0),
        tf("col1-bot", 50.0, 395.0, 80.0, 10.0),
        tf("col2-top", 200.0, 405.0, 80.0, 10.0),
        tf("col2-bot", 200.0, 400.0, 80.0, 10.0),
    ];
    // ... but with an identical mcid stamped on every fragment.
    fragments
        .iter_mut()
        .for_each(|fragment| fragment.mcid = Some(0));

    let merged = extractor.merge_into_lines(&fragments);
    assert_eq!(
        merged.len(),
        4,
        "uniform mcid must not prevent row_id-based column split (NCSC root cause)"
    );

    let expected = ["col1-top", "col1-bot", "col2-top", "col2-bot"];
    for (line, text) in merged.iter().zip(expected) {
        assert_eq!(line.text, text);
    }
}
3607
// With `reconstruct_paragraphs` enabled, a slightly raised superscript that
// follows the body text closely is merged into the body fragment.
#[test]
fn merge_close_fragments_superscript_merges_when_reconstruct_paragraphs() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // Body text on y=400 ending at x=75; citation digit raised to y=403.5 and
    // starting at x=79. For a 10pt font the vertical tolerance is
    // 0.5 * 10 = 5.0 (> 3.5) and the 4pt horizontal gap is under
    // 10 * 0.5 = 5pt, so both merge conditions hold.
    let fragments = vec![
        tf("body-text", 50.0, 400.0, 25.0, 10.0),
        tf("1", 79.0, 403.5, 4.0, 10.0),
    ];

    let merged = extractor.merge_close_fragments(&fragments);
    assert_eq!(
        merged.len(),
        1,
        "superscript within 5pt of baseline must merge in reconstruct path"
    );

    let combined = &merged[0].text;
    assert!(combined.contains("body-text"));
    assert!(combined.contains("1"));
}
3631
// With `reconstruct_paragraphs` disabled (legacy path) the same raised
// superscript stays a separate fragment.
#[test]
fn merge_close_fragments_superscript_does_not_merge_in_legacy_path() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: false,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // The legacy path uses a fixed 1.0pt vertical tolerance, which the 3.5pt
    // rise of the digit exceeds.
    let fragments = vec![
        tf("body-text", 50.0, 400.0, 25.0, 10.0),
        tf("1", 79.0, 403.5, 4.0, 10.0),
    ];

    let merged = extractor.merge_close_fragments(&fragments);
    assert_eq!(
        merged.len(),
        2,
        "3.5pt Y delta exceeds legacy 1.0pt threshold; superscript stays separate"
    );
}
3650
// Evenly spaced consecutive lines form one paragraph, joined with newlines.
#[test]
fn merge_into_paragraphs_groups_consecutive_lines() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..ExtractionOptions::default()
    };
    let extractor = TextExtractor::with_options(options);

    // Baselines at 400 / 386 / 372: 14pt leading for 12pt text (2pt gap).
    let line_fragments = vec![
        tf("Line one.", 50.0, 400.0, 60.0, 12.0),
        tf("Line two.", 50.0, 386.0, 60.0, 12.0),
        tf("Line three.", 50.0, 372.0, 70.0, 12.0),
    ];

    let merged = extractor.merge_into_paragraphs(&line_fragments);
    assert_eq!(merged.len(), 1);
    assert_eq!(merged[0].text, "Line one.\nLine two.\nLine three.");
}
3667
/// A large vertical jump between lines must start a new paragraph.
#[test]
fn merge_into_paragraphs_splits_on_large_vertical_gap() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        ..Default::default()
    };
    let extractor = TextExtractor::with_options(options);

    // The first two lines are 14pt apart; the third sits 86pt below the
    // second, which is expected to be treated as a paragraph break.
    let p1_line1 = tf("P1L1.", 50.0, 400.0, 40.0, 12.0);
    let p1_line2 = tf("P1L2.", 50.0, 386.0, 40.0, 12.0);
    let p2_line1 = tf("P2L1.", 50.0, 300.0, 40.0, 12.0);
    let lines = vec![p1_line1, p1_line2, p2_line1];

    let paragraphs = extractor.merge_into_paragraphs(&lines);
    assert_eq!(paragraphs.len(), 2);

    let (head, tail) = (&paragraphs[0], &paragraphs[1]);
    assert_eq!(head.text, "P1L1.\nP1L2.");
    assert_eq!(tail.text, "P2L1.");
}
3684
/// With `merge_hyphenated` set, a line-ending hyphen is removed and the word
/// halves are joined without an intervening newline.
#[test]
fn merge_into_paragraphs_drops_hyphen_when_merge_hyphenated() {
    let options = ExtractionOptions {
        reconstruct_paragraphs: true,
        merge_hyphenated: true,
        ..Default::default()
    };
    let extractor = TextExtractor::with_options(options);

    // One word broken across two consecutive lines with a trailing hyphen.
    let word_head = tf("Kryp-", 50.0, 400.0, 30.0, 12.0);
    let word_tail = tf("tographie", 50.0, 386.0, 60.0, 12.0);
    let lines = vec![word_head, word_tail];

    let paragraphs = extractor.merge_into_paragraphs(&lines);
    assert_eq!(paragraphs.len(), 1);

    let joined = &paragraphs[0];
    assert_eq!(
        joined.text, "Kryptographie",
        "hyphen elided, no newline inserted"
    );
}
3703
/// A string starting with the FE FF byte-order mark is decoded as UTF-16BE.
#[test]
fn decode_pdf_string_utf16be_bom_decodes_fi_ligature() {
    // BOM (FE FF) followed by the code units U+0066 'f' and U+0069 'i'.
    let decoded = super::decode_pdf_string(&[0xFE, 0xFF, 0x00, 0x66, 0x00, 0x69]);
    assert_eq!(decoded, "fi");
}
3709
/// Plain ASCII bytes without a BOM come back unchanged.
#[test]
fn decode_pdf_string_ascii_pdfdocencoding_passthrough() {
    let decoded = super::decode_pdf_string(b"page 12");
    assert_eq!(decoded, "page 12");
}
3715
/// Decoding zero bytes yields the empty string.
#[test]
fn decode_pdf_string_empty_input_returns_empty() {
    let decoded = super::decode_pdf_string(&[]);
    assert_eq!(decoded, "");
}
3720
/// A byte-order mark with no code units after it decodes to the empty string.
#[test]
fn decode_pdf_string_lone_bom_returns_empty() {
    // Only the two BOM bytes; there is no payload to decode.
    let decoded = super::decode_pdf_string(&[0xFE, 0xFF]);
    assert_eq!(decoded, "");
}
3726
/// An inline property list with an integer `MCID` yields that MCID and no
/// ActualText.
#[test]
fn resolve_props_extracts_integer_mcid() {
    use crate::parser::content::{MarkedContentProps, MarkedContentValue};
    use std::collections::HashMap;

    // Single-entry inline property dictionary: { MCID: 7 }.
    let entries = HashMap::from([("MCID".to_string(), MarkedContentValue::Integer(7))]);
    let props = MarkedContentProps::Inline(entries);

    let resolved = super::resolve_props(&props, None);
    assert_eq!(resolved, (Some(7), None));
}
3739
/// An inline `ActualText` string carrying a UTF-16BE BOM is decoded to text,
/// and no MCID is reported when the key is absent.
#[test]
fn resolve_props_decodes_utf16be_actualtext() {
    use crate::parser::content::{MarkedContentProps, MarkedContentValue};
    use std::collections::HashMap;

    // BOM (FE FF) followed by U+0066 'f' and U+0069 'i'.
    let actual_text_bytes = vec![0xFE, 0xFF, 0x00, 0x66, 0x00, 0x69];
    let entries = HashMap::from([(
        "ActualText".to_string(),
        MarkedContentValue::String(actual_text_bytes),
    )]);
    let props = MarkedContentProps::Inline(entries);

    let (mcid, actual_text) = super::resolve_props(&props, None);
    assert_eq!(mcid, None);
    assert_eq!(actual_text.as_deref(), Some("fi"));
}
3755
/// A resource reference cannot be resolved when no properties dictionary is
/// supplied, so both results must be `None`.
#[test]
fn resolve_props_returns_none_for_unresolvable_resource_ref() {
    use crate::parser::content::MarkedContentProps;

    let name = "PropsName".to_string();
    let props = MarkedContentProps::ResourceRef(name);

    let resolved = super::resolve_props(&props, None);
    assert_eq!(resolved, (None, None));
}
3763
/// A negative inline `MCID` is malformed and must not be reported.
#[test]
fn resolve_props_negative_mcid_rejected() {
    use crate::parser::content::{MarkedContentProps, MarkedContentValue};
    use std::collections::HashMap;

    // MCID is unsigned per ISO 32000-1, so -1 can never be a valid value.
    let entries = HashMap::from([("MCID".to_string(), MarkedContentValue::Integer(-1))]);
    let props = MarkedContentProps::Inline(entries);

    let resolved = super::resolve_props(&props, None);
    assert_eq!(resolved.0, None);
}
3776
/// An out-of-range `MCID` reached through a resource reference must be
/// rejected rather than truncated.
#[test]
fn resolve_props_resource_ref_overflow_mcid_rejected() {
    use crate::parser::content::MarkedContentProps;
    use crate::parser::objects::{PdfDictionary, PdfObject};

    // Per ISO 32000-1 §14.7.4 an MCID is an unsigned 32-bit integer, while
    // `PdfObject::Integer` stores an i64. A malformed PDF can therefore carry
    // a value that does not fit; the ResourceRef path has to drop it instead
    // of wrapping silently via `as u32`. This is the counterpart of the
    // Inline-path guard exercised by `resolve_props_negative_mcid_rejected`.
    let mut named_props = PdfDictionary::new();
    named_props.insert("MCID".to_string(), PdfObject::Integer(i64::MAX));

    // Properties dictionary mapping the referenced name to the dictionary
    // that holds the oversized MCID.
    let mut properties = PdfDictionary::new();
    properties.insert("PropsName".to_string(), PdfObject::Dictionary(named_props));

    let props = MarkedContentProps::ResourceRef("PropsName".to_string());
    let resolved = super::resolve_props(&props, Some(&properties));
    assert_eq!(resolved.0, None);
}
3797}