dongler_core/
pdf.rs

1use std::borrow::Cow;
2use std::collections::HashMap;
3use std::io::Read;
4use std::sync::Arc;
5
6use flate2::read::ZlibDecoder;
7#[cfg(feature = "parallel")]
8use rayon::prelude::*;
9use sha2::{Digest, Sha256};
10
11use crate::engine::ExtractionEngine;
12use crate::error::{DonglerError, Result};
13use crate::ir::{
14    Asset, BBox, Block, Confidence, Document, FigureBlock, ImageObject, Line, Metadata, Page,
15    SourceAnchor, Span, TableBlock, TableCell, TextBlock, Warning, SCHEMA_VERSION,
16};
17use crate::source::Source;
18
/// Extraction engine that parses PDFs with this module's own scanner.
///
/// The type carries no state; its [`ExtractionEngine`] implementation reports
/// the engine name `"pdf-native"` and delegates to [`extract_pdf`].
#[derive(Debug, Default, Clone, Copy)]
pub struct PdfEngine;
21
22impl ExtractionEngine for PdfEngine {
23    fn name(&self) -> &'static str {
24        "pdf-native"
25    }
26
27    fn extract(&self, source: &Source) -> Result<Document> {
28        let bytes = source.bytes.as_deref().unwrap_or(source.content.as_bytes());
29        extract_pdf(bytes, source, self.name())
30    }
31}
32
/// One indirect object scanned from the PDF by `parse_indirect_objects`.
#[derive(Debug, Clone)]
struct PdfObject {
    /// Object number; the key used for lookups in the object map.
    object_number: u32,
    /// Generation number, used when formatting `"N G R"` reference strings.
    generation: u16,
    /// Raw bytes of the object.
    /// NOTE(review): presumably everything between `obj` and `endobj`, stream
    /// data included — confirm in `parse_indirect_objects`.
    body: Vec<u8>,
}
39
/// A page object found in the file, before its content is interpreted.
#[derive(Debug, Clone)]
struct PageSeed {
    /// 1-based page number, assigned in discovery order by `parse_pdf_pages`.
    number: usize,
    /// Page dictionary text; searched for `/MediaBox`, `/Rotate`, `/Contents`
    /// and resource references during extraction.
    body: String,
}
45
/// Everything `extract_page` produces for a single page.
#[derive(Debug, Clone)]
struct PageExtraction {
    /// The assembled IR page (blocks, images, assets, warnings).
    page: Page,
    /// Plain text of the page: the non-empty block texts joined with `\n`.
    text: String,
    /// Non-blank text runs with their geometry, captured before block assembly.
    spans: Vec<SpanGeom>,
}
52
/// A single text-layer fragment with geometry, in PDF user space (y-up). Exposed
/// (via [`extract_pdf_spans`]) so the hybrid pipeline can snap model-detected
/// regions/cells to real text without re-parsing the PDF. Independent of block
/// assembly, so spans consumed by table detection are still present here.
///
/// On pages with a non-zero `/Rotate`, the boxes are taken after the page
/// rotation has been applied to the text runs.
#[derive(Debug, Clone, PartialEq)]
pub struct SpanGeom {
    /// Bounding box of the fragment.
    pub bbox: BBox,
    /// Decoded text of the fragment. Spans produced by this module are never
    /// whitespace-only.
    pub text: String,
}
62
/// All text-layer spans for one page, with the page's dimensions.
#[derive(Debug, Clone, PartialEq)]
pub struct PageSpans {
    /// 1-based page number.
    pub page_number: usize,
    /// Page width after `/Rotate` is applied; `0.0` when the page has none recorded.
    pub width: f32,
    /// Page height after `/Rotate` is applied; `0.0` when the page has none recorded.
    pub height: f32,
    /// Non-blank text fragments found on the page.
    pub spans: Vec<SpanGeom>,
}
71
/// One positioned piece of text emitted by a single text-showing operator
/// (`Tj`, `TJ`, `'` or `"`), together with the font state it was drawn in.
#[derive(Debug, Clone)]
struct TextRun {
    /// Decoded text of the run; never whitespace-only (blank runs only advance
    /// the text matrix and are not recorded).
    text: String,
    /// Page-space box spanning the run's advance horizontally and the font's
    /// ascent..descent vertically.
    bbox: BBox,
    /// Page-space y of the text baseline, kept separate from `bbox` (which
    /// spans ascent..descent) so super/subscript detection stays baseline-based.
    baseline_y: f32,
    /// Resource name of the font selected by `Tf`, if any.
    font: Option<String>,
    /// Font size from `Tf`, in text-space units (not scaled by any matrix).
    size: f32,
    /// Page-space advance of a single space glyph in this run's font/size, used to
    /// decide whether a horizontal gap to the next run is a word break. Producers
    /// often position fragments with `Td`/`TJ` and omit the space character, so the
    /// gap is the only signal; sizing the threshold to the actual space width keeps
    /// word segmentation correct across fonts and zoom levels.
    space_width: f32,
    /// Bold flag copied from the run's font decoder (`false` without one).
    bold: bool,
    /// Italic flag copied from the run's font decoder (`false` without one).
    italic: bool,
    /// `"N G R"` ids of the content stream object(s) the run came from.
    source_object_ids: Vec<String>,
}
91
/// A group of text runs treated as one visual line, as produced by
/// `group_text_runs`.
#[derive(Debug, Clone)]
struct TextLine {
    /// The runs making up the line.
    /// NOTE(review): presumably ordered left-to-right — confirm in `group_text_runs`.
    runs: Vec<TextRun>,
    /// Box for the whole line.
    /// NOTE(review): presumably the union of the run boxes — confirm.
    bbox: BBox,
    /// Baseline y shared by the line.
    baseline_y: f32,
}
98
/// A table recognised during block assembly.
#[derive(Debug, Clone)]
struct DetectedTable {
    /// The resulting table block.
    table: TableBlock,
    /// Indices of the text lines belonging to the table.
    /// NOTE(review): presumably indices into the page's line list so those
    /// lines are not emitted again as paragraphs — confirm at the use site.
    line_indices: Vec<usize>,
}
104
/// A text line considered as a possible table row.
#[derive(Debug, Clone)]
struct TableRowCandidate {
    /// Index of the originating line.
    /// NOTE(review): presumably into the page's line list — confirm.
    line_index: usize,
    /// The runs that would form the row's cells, one per cell.
    cells: Vec<TextRun>,
}
110
/// A straight path segment from `(x0, y0)` to `(x1, y1)`, with both endpoints
/// already transformed by the CTM in effect when the segment was constructed.
#[derive(Debug, Clone, Copy)]
struct GraphicEdge {
    /// Start point, x.
    x0: f32,
    /// Start point, y.
    y0: f32,
    /// End point, x.
    x1: f32,
    /// End point, y.
    y1: f32,
}
118
/// Vertical script position of a run relative to its line.
/// NOTE(review): presumably decided by comparing run and line baselines —
/// confirm at the detection site, which is outside this section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScriptKind {
    /// Raised text.
    Superscript,
    /// Lowered text.
    Subscript,
}
124
/// Lines of a page split into column groups; borrows the lines it arranges.
/// NOTE(review): the field meanings below are inferred from the names —
/// confirm against the column-detection code.
#[derive(Debug, Clone)]
struct ColumnLayout<'a> {
    /// Lines placed before the columns (presumably full-width lines above them).
    leading: Vec<&'a TextLine>,
    /// One list of lines per detected column.
    columns: Vec<Vec<&'a TextLine>>,
    /// Lines placed after the columns (presumably full-width lines below them).
    trailing: Vec<&'a TextLine>,
}
131
/// Accumulated output of interpreting one or more content streams.
#[derive(Debug, Clone)]
struct ContentExtraction {
    /// Non-blank text runs in the order they were shown.
    text_runs: Vec<TextRun>,
    /// Path segments committed by a stroke operator.
    edges: Vec<GraphicEdge>,
    /// Image XObjects placed with `Do`.
    images: Vec<ImageObject>,
    /// One `"image"` asset per entry in `images`, sharing its id and geometry.
    assets: Vec<Asset>,
    /// Warnings raised while interpreting the content.
    warnings: Vec<Warning>,
}
140
/// Per-font data needed to decode shown strings and measure their glyphs.
#[derive(Debug, Clone, Default)]
struct FontDecoder {
    /// Character-code bytes → decoded text.
    /// NOTE(review): presumably loaded from the font's `/ToUnicode` CMap —
    /// confirm in `font_decoder`.
    cmap: HashMap<Vec<u8>, String>,
    /// Single-byte code → decoded text; consulted by `decode_byte`.
    encoding: HashMap<u8, String>,
    /// Glyph advance in 1/1000 em, keyed by the decoded character.
    widths: HashMap<char, f32>,
    /// NOTE(review): presumably the longest code length (in bytes) present in
    /// `cmap` — confirm in `font_decoder`.
    max_code_len: usize,
    /// Whether runs in this font are reported as bold.
    bold: bool,
    /// Whether runs in this font are reported as italic.
    italic: bool,
    /// Ascent relative to the baseline, as a fraction of the font size.
    ascent: f32,
    /// Descent relative to the baseline, as a fraction of the font size
    /// (the no-font fallback used by callers is negative: `-0.25`).
    descent: f32,
}
152
153impl FontDecoder {
154    fn decode_byte(&self, byte: u8) -> String {
155        self.encoding
156            .get(&byte)
157            .cloned()
158            .unwrap_or_else(|| (byte as char).to_string())
159    }
160}
161
/// One operand preceding an operator in a content stream.
#[derive(Debug, Clone)]
enum Operand {
    /// Numeric operand (matrix entries, sizes, spacing values, …).
    Number(f32),
    /// Name operand, e.g. the font name for `Tf` or the XObject name for `Do`.
    Name(String),
    /// String operand bytes.
    /// NOTE(review): presumably a `( … )` literal string — confirm in the parser.
    Literal(Vec<u8>),
    /// String operand bytes.
    /// NOTE(review): presumably a `< … >` hex string, already decoded — confirm.
    Hex(Vec<u8>),
    /// Array operand, e.g. the argument of `TJ`.
    Array(Vec<Operand>),
    /// Any operand the interpreter does not need to inspect.
    Other,
}
171
/// A content-stream operator together with the operands that preceded it.
#[derive(Debug, Clone)]
struct ContentOp {
    /// Operands in the order they appeared before the operator.
    operands: Vec<Operand>,
    /// Operator keyword, e.g. `"Tj"`, `"cm"`, `"re"`.
    operator: String,
}
177
/// The subset of PDF graphics/text state tracked while interpreting a content
/// stream. Saved and restored as a whole by `q` / `Q`.
#[derive(Debug, Clone)]
struct GraphicsState {
    /// Current transformation matrix, updated by `cm`.
    ctm: Matrix,
    /// Text matrix: reset by `BT`, set by `Tm`/`Td`/`TD`/`T*`, and advanced
    /// after every shown string.
    text_matrix: Matrix,
    /// Text line matrix: start of the current line, moved by `Td`/`TD`/`T*`/`Tm`.
    line_matrix: Matrix,
    /// Resource name of the font selected by `Tf`.
    font_name: Option<String>,
    /// Font size selected by `Tf`.
    font_size: f32,
    /// Line leading used by `T*`, `'` and `"`; set by `TL`, `TD` and `Tf`.
    leading: f32,
    /// Extra advance per glyph (`Tc`).
    char_spacing: f32,
    /// Extra advance per space character (`Tw`).
    word_spacing: f32,
    /// Horizontal scaling as a factor (`Tz` value / 100, floored at 0.01).
    horizontal_scaling: f32,
    /// Vertical offset of the baseline (`Ts`).
    text_rise: f32,
}
191
192impl Default for GraphicsState {
193    fn default() -> Self {
194        Self {
195            ctm: Matrix::identity(),
196            text_matrix: Matrix::identity(),
197            line_matrix: Matrix::identity(),
198            font_name: None,
199            font_size: 12.0,
200            leading: 12.0,
201            char_spacing: 0.0,
202            word_spacing: 0.0,
203            horizontal_scaling: 1.0,
204            text_rise: 0.0,
205        }
206    }
207}
208
/// A 2D affine transform in PDF order `[a b c d e f]`.
///
/// A point `(x, y)` maps to `(a·x + c·y + e, b·x + d·y + f)`.
#[derive(Debug, Clone, Copy)]
struct Matrix {
    /// x-scale term (x contribution to x').
    a: f32,
    /// x contribution to y'.
    b: f32,
    /// y contribution to x'.
    c: f32,
    /// y-scale term (y contribution to y').
    d: f32,
    /// x translation.
    e: f32,
    /// y translation.
    f: f32,
}
218
219impl Matrix {
220    fn identity() -> Self {
221        Self {
222            a: 1.0,
223            b: 0.0,
224            c: 0.0,
225            d: 1.0,
226            e: 0.0,
227            f: 0.0,
228        }
229    }
230
231    fn multiply(self, other: Self) -> Self {
232        Self {
233            a: self.a * other.a + self.b * other.c,
234            b: self.a * other.b + self.b * other.d,
235            c: self.c * other.a + self.d * other.c,
236            d: self.c * other.b + self.d * other.d,
237            e: self.e * other.a + self.f * other.c + other.e,
238            f: self.e * other.b + self.f * other.d + other.f,
239        }
240    }
241
242    fn point(self, x: f32, y: f32) -> (f32, f32) {
243        (
244            self.a * x + self.c * y + self.e,
245            self.b * x + self.d * y + self.f,
246        )
247    }
248
249    fn translate(self, x: f32, y: f32) -> Self {
250        Self {
251            e: self.e + self.a * x + self.c * y,
252            f: self.f + self.b * x + self.d * y,
253            ..self
254        }
255    }
256
257    fn bbox(self) -> BBox {
258        BBox {
259            x: self.e,
260            y: self.f,
261            width: self.a.abs(),
262            height: self.d.abs(),
263        }
264    }
265}
266
/// The shared result of parsing a PDF into per-page extractions, before the
/// Document/spans views are built from it.
struct ParsedPdf {
    /// One entry per page, in page-number order.
    page_extractions: Vec<PageExtraction>,
    /// Document-level warnings (encryption declared, object streams expanded).
    document_warnings: Vec<crate::ir::Warning>,
    /// Value of the `Title` info string, when one was found.
    title: Option<String>,
    /// Whether the file contains an `/Encrypt` name.
    encrypted: bool,
}
275
276pub fn extract_pdf(bytes: &[u8], source: &Source, engine_name: &str) -> Result<Document> {
277    let parsed = parse_pdf_pages(bytes)?;
278    let ParsedPdf {
279        page_extractions,
280        document_warnings,
281        title,
282        encrypted,
283    } = parsed;
284
285    let mut pages = Vec::with_capacity(page_extractions.len());
286    let mut all_text = String::new();
287    let mut assets = Vec::new();
288
289    for extraction in page_extractions {
290        all_text.push_str(&extraction.text);
291        all_text.push('\n');
292        assets.extend(extraction.page.assets.clone());
293        pages.push(extraction.page);
294    }
295
296    Ok(Document {
297        schema_version: SCHEMA_VERSION.to_owned(),
298        metadata: Metadata {
299            format: "pdf".to_owned(),
300            engine: engine_name.to_owned(),
301            source: source.path.clone(),
302            title,
303            character_count: all_text.chars().count(),
304            word_count: all_text.split_whitespace().count(),
305            block_count: pages.iter().map(|page| page.blocks.len()).sum(),
306            file_size_bytes: Some(bytes.len() as u64),
307            pdf_version: pdf_version(bytes),
308            encrypted,
309        },
310        pages,
311        assets,
312        warnings: document_warnings,
313    })
314}
315
316/// Extract every text-layer span (with geometry, in PDF user space) per page.
317/// Unlike [`extract_pdf`], this exposes spans that block assembly later folds into
318/// tables/paragraphs — the raw input the hybrid pipeline snaps model regions to.
319pub fn extract_pdf_spans(bytes: &[u8]) -> Result<Vec<PageSpans>> {
320    let parsed = parse_pdf_pages(bytes)?;
321    Ok(parsed
322        .page_extractions
323        .into_iter()
324        .map(|e| PageSpans {
325            page_number: e.page.number,
326            width: e.page.width.unwrap_or(0.0),
327            height: e.page.height.unwrap_or(0.0),
328            spans: e.spans,
329        })
330        .collect())
331}
332
/// Parses `bytes` into per-page extractions plus document-level facts.
///
/// Steps, in order: check the `%PDF-` header, scan indirect objects and expand
/// object streams, read the `Title` info string, locate page objects (numbered
/// from 1 in discovery order), record encryption / object-stream warnings,
/// decode every font referenced by any page once, then extract each page.
/// Font decoding and page extraction run in parallel when the `parallel`
/// feature is enabled.
///
/// # Errors
/// Returns a PDF error when the header is missing, when no indirect objects
/// are found, or when no page objects are found.
fn parse_pdf_pages(bytes: &[u8]) -> Result<ParsedPdf> {
    if !bytes.starts_with(b"%PDF-") {
        return Err(DonglerError::pdf("missing %PDF header"));
    }

    let mut objects = parse_indirect_objects(bytes);
    expand_object_streams(&mut objects);
    if objects.is_empty() {
        return Err(DonglerError::pdf("no indirect objects found"));
    }

    // The title is read while `objects` is still a plain list, before the
    // objects are moved behind `Arc`s below.
    let title = extract_info_string(&objects, "Title");
    // Share each parsed object behind a single Arc between the ordered list
    // (which preserves page order and any duplicate object numbers exactly) and
    // the lookup map, so object bodies are stored once instead of copied per
    // map entry.
    let objects: Vec<Arc<PdfObject>> = objects.into_iter().map(Arc::new).collect();
    // When an object number occurs more than once, the later entry replaces
    // the earlier one in the map.
    let object_map: HashMap<u32, Arc<PdfObject>> = objects
        .iter()
        .map(|object| (object.object_number, Arc::clone(object)))
        .collect();
    // Pages are numbered 1.. in the order their objects appear in `objects`.
    let page_seeds = objects
        .iter()
        .filter_map(|object| page_seed(object.as_ref(), &object_map))
        .enumerate()
        .map(|(index, mut seed)| {
            seed.number = index + 1;
            seed
        })
        .collect::<Vec<_>>();

    if page_seeds.is_empty() {
        return Err(DonglerError::pdf("no page objects found"));
    }

    let mut document_warnings = Vec::new();
    // Encryption is only detected and reported; nothing here decrypts.
    let encrypted = contains_name(bytes, b"/Encrypt");
    if encrypted {
        document_warnings.push(warning(
            "pdf.encrypted",
            "warning",
            "document declares encryption; extraction may be incomplete",
            None,
        ));
    }
    if contains_name(bytes, b"/ObjStm") {
        document_warnings.push(warning(
            "pdf.object_stream",
            "info",
            "object streams detected and expanded by the native scanner",
            None,
        ));
    }

    // Decode each font once per document. Fonts (and their compressed ToUnicode
    // CMaps) are shared resources referenced by most pages, so decoding them in
    // every page re-inflates the same streams pages*fonts times.
    let mut font_object_numbers: Vec<u32> = page_seeds
        .iter()
        .flat_map(|seed| {
            let resource_body = resolve_resource_body(&seed.body, &object_map);
            let resource_text = resource_body.as_deref().unwrap_or(&seed.body);
            resolve_named_resource_refs(resource_text, "/Font", &object_map)
                .into_values()
                .collect::<Vec<_>>()
        })
        .collect();
    // Sort + dedup so each font object is decoded exactly once.
    font_object_numbers.sort_unstable();
    font_object_numbers.dedup();
    // Font numbers with no matching object are skipped (`filter_map` below).
    let decode_font = |number: u32| {
        object_map
            .get(&number)
            .map(|font| (number, Arc::new(font_decoder(font.as_ref(), &object_map))))
    };
    #[cfg(feature = "parallel")]
    let font_cache: HashMap<u32, Arc<FontDecoder>> = font_object_numbers
        .into_par_iter()
        .filter_map(decode_font)
        .collect();
    #[cfg(not(feature = "parallel"))]
    let font_cache: HashMap<u32, Arc<FontDecoder>> = font_object_numbers
        .into_iter()
        .filter_map(decode_font)
        .collect();

    // Pages only read the shared maps, so they can be extracted independently;
    // both variants collect results in page order.
    let extract_one = |seed: &PageSeed| extract_page(seed, &object_map, &font_cache);
    #[cfg(feature = "parallel")]
    let page_extractions = page_seeds.par_iter().map(extract_one).collect::<Vec<_>>();
    #[cfg(not(feature = "parallel"))]
    let page_extractions = page_seeds.iter().map(extract_one).collect::<Vec<_>>();

    Ok(ParsedPdf {
        page_extractions,
        document_warnings,
        title,
        encrypted,
    })
}
431
/// Extracts one page: geometry, content, text lines, blocks and raw spans.
///
/// * `seed` – the page's number and dictionary text.
/// * `object_map` – all parsed objects, keyed by object number.
/// * `font_cache` – fonts already decoded for the whole document.
///
/// The page size comes from `/MediaBox` (612×792 when absent). Every object
/// referenced by `/Contents` is decoded and interpreted; a missing or
/// undecodable stream adds a page warning instead of failing. Geometry is then
/// rotated by `/Rotate`, runs are grouped into lines, and blocks are built.
/// Image figure blocks are emitted only when no other block was produced.
fn extract_page(
    seed: &PageSeed,
    object_map: &HashMap<u32, Arc<PdfObject>>,
    font_cache: &HashMap<u32, Arc<FontDecoder>>,
) -> PageExtraction {
    // MediaBox is [x0 y0 x1 y1]; width/height are the differences.
    let media_box = parse_number_array_after(&seed.body, "/MediaBox")
        .unwrap_or_else(|| vec![0.0, 0.0, 612.0, 792.0]);
    let width =
        media_box.get(2).copied().unwrap_or(612.0) - media_box.first().copied().unwrap_or(0.0);
    let height =
        media_box.get(3).copied().unwrap_or(792.0) - media_box.get(1).copied().unwrap_or(0.0);
    let rotation = parse_number_after(&seed.body, "/Rotate").map(|value| value as i32);
    let contents = parse_refs_after_key(&seed.body, "/Contents");
    // Resources may live in a separate object; fall back to the page body.
    let resource_body = resolve_resource_body(&seed.body, object_map);
    let resource_text = resource_body.as_deref().unwrap_or(&seed.body);
    let xobjects = resolve_named_resource_refs(resource_text, "/XObject", object_map);
    let fonts = load_font_decoders(resource_text, object_map, font_cache);

    // `warnings` collects page-level problems first; warnings raised inside
    // the content streams are appended after the loop.
    let mut warnings = Vec::new();
    let mut extraction = ContentExtraction {
        text_runs: Vec::new(),
        edges: Vec::new(),
        images: Vec::new(),
        assets: Vec::new(),
        warnings: Vec::new(),
    };

    // Each content stream is interpreted with a fresh graphics state and its
    // output is concatenated in `/Contents` order.
    for content_ref in contents {
        match object_map
            .get(&(content_ref as u32))
            .map(|object| decode_stream_object(object.as_ref()))
        {
            Some(Ok(Some(stream))) => {
                // NOTE(review): the generation is written as 0 here rather than
                // taken from the object — confirm that is intended.
                let object_id = format!("{content_ref} 0 R");
                let mut content = interpret_content_stream(
                    &stream,
                    seed.number,
                    &[object_id],
                    &xobjects,
                    &fonts,
                    object_map,
                );
                extraction.text_runs.append(&mut content.text_runs);
                extraction.edges.append(&mut content.edges);
                extraction.images.append(&mut content.images);
                extraction.assets.append(&mut content.assets);
                extraction.warnings.append(&mut content.warnings);
            }
            // Referenced object is absent, or present but carries no stream.
            Some(Ok(None)) | None => warnings.push(warning(
                "pdf.missing_content",
                "warning",
                "page content stream is missing",
                Some(seed.number),
            )),
            Some(Err(error)) => warnings.push(warning(
                "pdf.stream_decode",
                "warning",
                &error.to_string(),
                Some(seed.number),
            )),
        }
    }

    warnings.append(&mut extraction.warnings);

    // Apply the page /Rotate so line grouping and reading order run in the
    // orientation a reader sees. Display dimensions swap for 90/270.
    // NOTE(review): only `bbox` is rotated for text runs; `baseline_y` keeps
    // its unrotated value, and `assets` boxes are not rotated either — confirm
    // that downstream code on rotated pages does not rely on them.
    let normalized_rotation = rotation.map(|value| value.rem_euclid(360)).unwrap_or(0);
    if normalized_rotation != 0 {
        for run in &mut extraction.text_runs {
            run.bbox = rotate_bbox(run.bbox, normalized_rotation, width, height);
        }
        for image in &mut extraction.images {
            if let Some(bbox) = image.bbox {
                image.bbox = Some(rotate_bbox(bbox, normalized_rotation, width, height));
            }
        }
        for edge in &mut extraction.edges {
            let (x0, y0) = rotate_point(edge.x0, edge.y0, normalized_rotation, width, height);
            let (x1, y1) = rotate_point(edge.x1, edge.y1, normalized_rotation, width, height);
            edge.x0 = x0;
            edge.y0 = y0;
            edge.x1 = x1;
            edge.y1 = y1;
        }
    }
    let (page_width, page_height) = if matches!(normalized_rotation, 90 | 270) {
        (height, width)
    } else {
        (width, height)
    };
    // The MediaBox origin is kept only for unrotated pages; rotated pages are
    // reported with their box at (0, 0).
    let (page_x, page_y) = if normalized_rotation == 0 {
        (
            media_box.first().copied().unwrap_or(0.0),
            media_box.get(1).copied().unwrap_or(0.0),
        )
    } else {
        (0.0, 0.0)
    };

    let lines = group_text_runs(extraction.text_runs);

    // Raw text-layer spans (one per positioned run), captured before block
    // assembly folds/consumes them — the hybrid pipeline snaps model regions
    // to these (see `extract_pdf_spans`). The boxes are in PDF user space on
    // unrotated pages; on rotated pages they are the rotated boxes from above.
    let spans: Vec<SpanGeom> = lines
        .iter()
        .flat_map(|line| line.runs.iter())
        .filter(|run| !run.text.trim().is_empty())
        .map(|run| SpanGeom {
            bbox: run.bbox,
            text: run.text.clone(),
        })
        .collect();

    let mut blocks = build_blocks(seed.number, &lines, &extraction.edges);
    // Image-only pages: surface the images as figure blocks so the page is
    // not reported as empty.
    if blocks.is_empty() && !extraction.images.is_empty() {
        blocks.extend(image_figure_blocks(seed.number, &extraction.images));
    }
    let text = blocks
        .iter()
        .map(block_text)
        .filter(|text| !text.is_empty())
        .collect::<Vec<_>>()
        .join("\n");

    let page = Page {
        number: seed.number,
        width: Some(page_width),
        height: Some(page_height),
        // The raw `/Rotate` value is reported, not the normalized one.
        rotation,
        bbox: Some(BBox {
            x: page_x,
            y: page_y,
            width: page_width,
            height: page_height,
        }),
        blocks,
        images: extraction.images,
        assets: extraction.assets,
        warnings, ..Default::default()
    };

    PageExtraction { page, text, spans }
}
577
578fn interpret_content_stream(
579    bytes: &[u8],
580    page_number: usize,
581    source_object_ids: &[String],
582    xobjects: &HashMap<String, u32>,
583    fonts: &HashMap<String, Arc<FontDecoder>>,
584    object_map: &HashMap<u32, Arc<PdfObject>>,
585) -> ContentExtraction {
586    let mut state = GraphicsState::default();
587    let mut graphics_stack = Vec::new();
588    let mut current_path_point: Option<(f32, f32)> = None;
589    let mut pending_edges = Vec::new();
590    let mut extraction = ContentExtraction {
591        text_runs: Vec::new(),
592        edges: Vec::new(),
593        images: Vec::new(),
594        assets: Vec::new(),
595        warnings: Vec::new(),
596    };
597
598    for op in parse_content_ops(bytes) {
599        match op.operator.as_str() {
600            "q" => graphics_stack.push(state.clone()),
601            "Q" => {
602                if let Some(previous) = graphics_stack.pop() {
603                    state = previous;
604                }
605            }
606            "cm" => {
607                if let Some(values) = numbers(&op.operands, 6) {
608                    state.ctm = state.ctm.multiply(Matrix {
609                        a: values[0],
610                        b: values[1],
611                        c: values[2],
612                        d: values[3],
613                        e: values[4],
614                        f: values[5],
615                    });
616                }
617            }
618            "BT" => {
619                state.text_matrix = Matrix::identity();
620                state.line_matrix = Matrix::identity();
621            }
622            "Tf" => {
623                if let [Operand::Name(name), Operand::Number(size)] = op.operands.as_slice() {
624                    state.font_name = Some(name.clone());
625                    state.font_size = *size;
626                    state.leading = *size * 1.2;
627                }
628            }
629            "Tc" => {
630                if let Some(values) = numbers(&op.operands, 1) {
631                    state.char_spacing = values[0];
632                }
633            }
634            "Tw" => {
635                if let Some(values) = numbers(&op.operands, 1) {
636                    state.word_spacing = values[0];
637                }
638            }
639            "Tz" => {
640                if let Some(values) = numbers(&op.operands, 1) {
641                    state.horizontal_scaling = (values[0] / 100.0).max(0.01);
642                }
643            }
644            "TL" => {
645                if let Some(values) = numbers(&op.operands, 1) {
646                    state.leading = values[0];
647                }
648            }
649            "Ts" => {
650                if let Some(values) = numbers(&op.operands, 1) {
651                    state.text_rise = values[0];
652                }
653            }
654            "Td" | "TD" => {
655                if let Some(values) = numbers(&op.operands, 2) {
656                    let next_line = state.line_matrix.translate(values[0], values[1]);
657                    state.line_matrix = next_line;
658                    state.text_matrix = next_line;
659                    if op.operator == "TD" {
660                        state.leading = -values[1];
661                    }
662                }
663            }
664            "Tm" => {
665                if let Some(values) = numbers(&op.operands, 6) {
666                    let matrix = Matrix {
667                        a: values[0],
668                        b: values[1],
669                        c: values[2],
670                        d: values[3],
671                        e: values[4],
672                        f: values[5],
673                    };
674                    state.line_matrix = matrix;
675                    state.text_matrix = matrix;
676                }
677            }
678            "T*" => {
679                move_to_next_text_line(&mut state);
680            }
681            "Tj" => {
682                if let Some(text) = first_text_operand(&op.operands, &state, fonts) {
683                    push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
684                }
685            }
686            "TJ" => {
687                if let Some(Operand::Array(items)) = op.operands.first() {
688                    let text = text_from_array(items, &state, fonts);
689                    push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
690                }
691            }
692            "'" => {
693                move_to_next_text_line(&mut state);
694                if let Some(text) = first_text_operand(&op.operands, &state, fonts) {
695                    push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
696                }
697            }
698            "\"" => {
699                if let [Operand::Number(word_spacing), Operand::Number(char_spacing), ..] =
700                    op.operands.as_slice()
701                {
702                    state.word_spacing = *word_spacing;
703                    state.char_spacing = *char_spacing;
704                }
705                move_to_next_text_line(&mut state);
706                if let Some(text) = op
707                    .operands
708                    .last()
709                    .and_then(|operand| operand_text(operand, &state, fonts))
710                {
711                    push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
712                }
713            }
714            "Do" => {
715                if let Some(Operand::Name(name)) = op.operands.first() {
716                    if let Some(object_number) = xobjects.get(name) {
717                        if let Some(object) = object_map.get(object_number) {
718                            let object_body = lossy(&object.body);
719                            if object_body.contains("/Subtype /Image") {
720                                let bbox = state.ctm.bbox();
721                                let id = format!("image-{}-{name}", page_number);
722                                let object_id = Some(format!(
723                                    "{} {} R",
724                                    object.object_number, object.generation
725                                ));
726                                let width = parse_number_after(&object_body, "/Width")
727                                    .map(|value| value as u32);
728                                let height = parse_number_after(&object_body, "/Height")
729                                    .map(|value| value as u32);
730
731                                extraction.images.push(ImageObject {
732                                    id: id.clone(),
733                                    object_id: object_id.clone(),
734                                    bbox: Some(bbox),
735                                    width,
736                                    height,
737                                });
738                                extraction.assets.push(Asset {
739                                    id,
740                                    kind: "image".to_owned(),
741                                    object_id,
742                                    bbox: Some(bbox),
743                                    width,
744                                    height,
745                                });
746                            }
747                        }
748                    }
749                }
750            }
751            "m" => {
752                if let Some(values) = numbers(&op.operands, 2) {
753                    current_path_point = Some((values[0], values[1]));
754                }
755            }
756            "l" => {
757                if let (Some(start), Some(values)) = (current_path_point, numbers(&op.operands, 2))
758                {
759                    let end = (values[0], values[1]);
760                    pending_edges.push(graphic_edge_from_points(state.ctm, start, end));
761                    current_path_point = Some(end);
762                }
763            }
764            "re" => {
765                if let Some(values) = numbers(&op.operands, 4) {
766                    pending_edges.extend(graphic_edges_from_rect(
767                        state.ctm, values[0], values[1], values[2], values[3],
768                    ));
769                    current_path_point = Some((values[0], values[1]));
770                }
771            }
772            "S" | "s" => {
773                extraction.edges.append(&mut pending_edges);
774                current_path_point = None;
775            }
776            "n" => {
777                pending_edges.clear();
778                current_path_point = None;
779            }
780            _ => {}
781        }
782    }
783
784    extraction
785}
786
787fn graphic_edge_from_points(matrix: Matrix, start: (f32, f32), end: (f32, f32)) -> GraphicEdge {
788    let (x0, y0) = matrix.point(start.0, start.1);
789    let (x1, y1) = matrix.point(end.0, end.1);
790    GraphicEdge { x0, y0, x1, y1 }
791}
792
793fn graphic_edges_from_rect(
794    matrix: Matrix,
795    x: f32,
796    y: f32,
797    width: f32,
798    height: f32,
799) -> Vec<GraphicEdge> {
800    let right = x + width;
801    let top = y + height;
802    vec![
803        graphic_edge_from_points(matrix, (x, y), (right, y)),
804        graphic_edge_from_points(matrix, (right, y), (right, top)),
805        graphic_edge_from_points(matrix, (right, top), (x, top)),
806        graphic_edge_from_points(matrix, (x, top), (x, y)),
807    ]
808}
809
810fn move_to_next_text_line(state: &mut GraphicsState) {
811    let next_line = state.line_matrix.translate(0.0, -state.leading);
812    state.line_matrix = next_line;
813    state.text_matrix = next_line;
814}
815
816fn push_text_run(
817    extraction: &mut ContentExtraction,
818    state: &mut GraphicsState,
819    source_object_ids: &[String],
820    text: String,
821    fonts: &HashMap<String, Arc<FontDecoder>>,
822) {
823    let advance = text_advance_width(&text, state, fonts);
824    if text.trim().is_empty() {
825        state.text_matrix = state.text_matrix.translate(advance, 0.0);
826        return;
827    }
828
829    let font = state.font_name.as_ref().and_then(|name| fonts.get(name));
830    let (bold, italic) = font
831        .map(|font| (font.bold, font.italic))
832        .unwrap_or((false, false));
833    let (ascent, descent) = font
834        .map(|font| (font.ascent, font.descent))
835        .unwrap_or((0.75, -0.25));
836    let bbox = text_run_bbox(state, advance, ascent, descent);
837    let (base_x, base_y) = state.text_matrix.point(0.0, state.text_rise);
838    let (_, baseline_y) = state.ctm.point(base_x, base_y);
839    let space_width = space_advance_width(state, fonts);
840    extraction.text_runs.push(TextRun {
841        text,
842        bbox,
843        baseline_y,
844        font: state.font_name.clone(),
845        size: state.font_size,
846        space_width,
847        bold,
848        italic,
849        source_object_ids: source_object_ids.to_vec(),
850    });
851    state.text_matrix = state.text_matrix.translate(advance, 0.0);
852}
853
854fn text_advance_width(
855    text: &str,
856    state: &GraphicsState,
857    fonts: &HashMap<String, Arc<FontDecoder>>,
858) -> f32 {
859    let glyphs = text.chars().count() as f32;
860    if glyphs == 0.0 {
861        return 0.0;
862    }
863    let spaces = text.chars().filter(|character| *character == ' ').count() as f32;
864    let font = state
865        .font_name
866        .as_ref()
867        .and_then(|font_name| fonts.get(font_name));
868    let base = text
869        .chars()
870        .map(|character| {
871            font.and_then(|font| font.widths.get(&character).copied())
872                .unwrap_or_else(|| default_glyph_width(character))
873                / 1000.0
874                * state.font_size
875        })
876        .sum::<f32>();
877    let spacing = glyphs * state.char_spacing + spaces * state.word_spacing;
878    ((base + spacing) * state.horizontal_scaling).max(0.0)
879}
880
/// Fallback advance, in 1/1000 em, for a glyph the font supplies no width for.
///
/// The values approximate Helvetica by width class instead of using one flat
/// half-em: narrow punctuation and stems are 250, slightly wider glyphs 333,
/// digits 556, capitals and wide symbols 650, the widest letters 850, and
/// everything else 500. Realistic advances keep gap-based word segmentation
/// usable for fonts that omit `/Widths` (some subset and OCR-layer fonts).
fn default_glyph_width(character: char) -> f32 {
    // Classes are tested in order, so `I`, `M` and `W` are classified before
    // the general capital-letter rule is reached.
    const NARROW: &str = " !,./:;I[\\]ijl|'";
    const SEMI_NARROW: &str = "\"()*`-frt{}";
    const EXTRA_WIDE: &str = "mMWw@";
    const WIDE_SYMBOLS: &str = "$+<=>?_~";

    if NARROW.contains(character) {
        250.0
    } else if SEMI_NARROW.contains(character) {
        333.0
    } else if EXTRA_WIDE.contains(character) {
        850.0
    } else if character.is_ascii_digit() {
        556.0
    } else if character.is_ascii_uppercase() || WIDE_SYMBOLS.contains(character) {
        650.0
    } else {
        500.0
    }
}
897
898/// Page-space advance of one space glyph in the current font/size, scaled by the
899/// horizontal scaling. Falls back to a quarter-em when the font has no space-glyph
900/// metric, which is the typical width of a space across text fonts.
901fn space_advance_width(state: &GraphicsState, fonts: &HashMap<String, Arc<FontDecoder>>) -> f32 {
902    let from_font = state
903        .font_name
904        .as_ref()
905        .and_then(|font_name| fonts.get(font_name))
906        .and_then(|font| font.widths.get(&' ').copied())
907        .filter(|width| *width > 0.0)
908        .map(|width| width / 1000.0 * state.font_size);
909    let width = from_font.unwrap_or_else(|| default_glyph_width(' ') / 1000.0 * state.font_size);
910    (width * state.horizontal_scaling).max(0.0)
911}
912
913fn text_run_bbox(state: &GraphicsState, advance: f32, ascent: f32, descent: f32) -> BBox {
914    // Vertical extent from the font's ascent/descent (em-relative to the
915    // baseline) rather than a flat font-size box, so glyph boxes are tight and
916    // baseline-correct under scaling/rotation.
917    let bottom = state.text_rise + descent * state.font_size;
918    let top = state.text_rise + ascent * state.font_size;
919    let corners = [
920        (0.0, bottom),
921        (advance, bottom),
922        (0.0, top),
923        (advance, top),
924    ];
925    let points = corners
926        .into_iter()
927        .map(|(x, y)| {
928            let (text_x, text_y) = state.text_matrix.point(x, y);
929            state.ctm.point(text_x, text_y)
930        })
931        .collect::<Vec<_>>();
932    let min_x = points.iter().map(|(x, _)| *x).fold(f32::INFINITY, f32::min);
933    let min_y = points.iter().map(|(_, y)| *y).fold(f32::INFINITY, f32::min);
934    let max_x = points
935        .iter()
936        .map(|(x, _)| *x)
937        .fold(f32::NEG_INFINITY, f32::max);
938    let max_y = points
939        .iter()
940        .map(|(_, y)| *y)
941        .fold(f32::NEG_INFINITY, f32::max);
942    BBox {
943        x: min_x,
944        y: min_y,
945        width: (max_x - min_x).max(state.font_size * 0.25),
946        height: (max_y - min_y).max(state.font_size * 0.25),
947    }
948}
949
950fn build_blocks(page_number: usize, lines: &[TextLine], edges: &[GraphicEdge]) -> Vec<Block> {
951    let body_size = page_body_size(lines);
952    let tables = detect_page_tables(page_number, lines, edges);
953
954    if tables.is_empty() {
955        let split_lines = split_wide_text_lines(lines);
956        let text_blocks = text_lines_in_reading_order(&split_lines)
957            .into_iter()
958            .filter_map(|line| text_block_from_line(page_number, line, body_size))
959            .collect::<Vec<_>>();
960        return merge_wrapped_text_blocks(text_blocks)
961            .into_iter()
962            .map(Block::Text)
963            .collect();
964    }
965
966    build_blocks_with_tables(page_number, lines, tables, body_size)
967}
968
969/// Detect *every* table on the page, not just the first. A page commonly stacks
970/// two or three statements/schedules; each pass consumes its lines and re-runs
971/// detection on what is left, so a second or third table is recovered instead of
972/// being shredded into loose numeric lines by the prose column reader. Entirely
973/// geometric and document-agnostic — the same detectors, applied repeatedly.
974fn detect_page_tables(
975    page_number: usize,
976    lines: &[TextLine],
977    edges: &[GraphicEdge],
978) -> Vec<DetectedTable> {
979    let mut tables: Vec<DetectedTable> = Vec::new();
980    let mut consumed = vec![false; lines.len()];
981    // A page has only so many tables; the cap is a guard against a detector that
982    // would otherwise keep re-claiming the same sliver and never make progress.
983    while tables.len() < 8 {
984        let mapping: Vec<usize> = (0..lines.len()).filter(|&index| !consumed[index]).collect();
985        if mapping.len() < 2 {
986            break;
987        }
988        let subset: Vec<TextLine> = mapping.iter().map(|&index| lines[index].clone()).collect();
989        let Some(mut detected) = detect_table(page_number, &subset, edges) else {
990            break;
991        };
992        // `line_indices` index into `subset`; map them back to the page's lines.
993        let original: Vec<usize> = detected
994            .line_indices
995            .iter()
996            .filter_map(|&subset_index| mapping.get(subset_index).copied())
997            .collect();
998        if original.is_empty() {
999            break;
1000        }
1001        for &index in &original {
1002            consumed[index] = true;
1003        }
1004        detected.line_indices = original;
1005        tables.push(detected);
1006    }
1007    tables
1008}
1009
1010fn build_blocks_with_tables(
1011    page_number: usize,
1012    lines: &[TextLine],
1013    mut tables: Vec<DetectedTable>,
1014    body_size: f32,
1015) -> Vec<Block> {
1016    let mut consumed = vec![false; lines.len()];
1017    for table in &tables {
1018        for &index in &table.line_indices {
1019            if let Some(slot) = consumed.get_mut(index) {
1020                *slot = true;
1021            }
1022        }
1023    }
1024    let remaining_lines = lines
1025        .iter()
1026        .enumerate()
1027        .filter(|(line_index, _)| !consumed[*line_index])
1028        .map(|(_, line)| line.clone())
1029        .collect::<Vec<_>>();
1030    let split_lines = split_wide_text_lines(&remaining_lines);
1031    let text_blocks = merge_wrapped_text_blocks(
1032        text_lines_in_reading_order(&split_lines)
1033            .into_iter()
1034            .filter_map(|line| text_block_from_line(page_number, line, body_size))
1035            .collect(),
1036    );
1037
1038    // Interleave tables among the text blocks by vertical position: a table is
1039    // emitted just before the first text block that sits below its top edge. Text
1040    // blocks keep their reading order (which may be column-aware), so this matches
1041    // the single-table behaviour exactly when there is only one table.
1042    let table_top = |table: &DetectedTable| {
1043        table
1044            .table
1045            .bbox
1046            .map(|bbox| bbox.y + bbox.height)
1047            .unwrap_or(f32::NEG_INFINITY)
1048    };
1049    tables.sort_by(|left, right| table_top(right).total_cmp(&table_top(left)));
1050
1051    let mut blocks = Vec::new();
1052    let mut next_table = 0usize;
1053    for text_block in text_blocks {
1054        let block_top = text_block
1055            .bbox
1056            .map(|bbox| bbox.y + bbox.height)
1057            .unwrap_or(f32::NEG_INFINITY);
1058        while next_table < tables.len() && table_top(&tables[next_table]) > block_top {
1059            blocks.push(Block::Table(tables[next_table].table.clone()));
1060            next_table += 1;
1061        }
1062        blocks.push(Block::Text(text_block));
1063    }
1064    for table in tables.into_iter().skip(next_table) {
1065        blocks.push(Block::Table(table.table));
1066    }
1067
1068    blocks
1069}
1070
1071fn image_figure_blocks(page_number: usize, images: &[ImageObject]) -> Vec<Block> {
1072    images
1073        .iter()
1074        .map(|image| {
1075            Block::Figure(FigureBlock {
1076                alt_text: Some(format!("Image {}", image.id)),
1077                caption: None,
1078                bbox: image.bbox,
1079                image_ref: Some(image.id.clone()),
1080                source_anchors: vec![anchor(
1081                    page_number,
1082                    image.bbox,
1083                    image.object_id.clone().into_iter().collect(),
1084                )],
1085                confidence: Some(Confidence {
1086                    score: 0.6,
1087                    calibrated: false,
1088                }), ..Default::default()
1089            })
1090        })
1091        .collect()
1092}
1093
1094fn split_wide_text_lines(lines: &[TextLine]) -> Vec<TextLine> {
1095    let enable_tight_column_band = has_repeated_tight_column_band_evidence(lines);
1096    let mut split_lines = Vec::new();
1097    for line in lines {
1098        match split_text_line_at_wide_gap(line, enable_tight_column_band) {
1099            Some((left, right)) => {
1100                split_lines.push(left);
1101                split_lines.push(right);
1102            }
1103            None => split_lines.push(line.clone()),
1104        }
1105    }
1106    split_lines
1107}
1108
1109/// True when a line's runs are already ordered left-to-right by x.
1110fn line_runs_x_sorted(runs: &[TextRun]) -> bool {
1111    runs.windows(2).all(|pair| pair[0].bbox.x <= pair[1].bbox.x)
1112}
1113
1114/// Runs of a line ordered left-to-right by x. Borrows when already sorted — the
1115/// common case, since `group_text_runs` keeps each line x-sorted — and clones +
1116/// sorts only when a reorder is actually required, avoiding a deep
1117/// `Vec<TextRun>` clone on every column/word pass.
1118fn runs_sorted_by_x(line: &TextLine) -> Cow<'_, [TextRun]> {
1119    if line_runs_x_sorted(&line.runs) {
1120        Cow::Borrowed(&line.runs)
1121    } else {
1122        let mut runs = line.runs.clone();
1123        runs.sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
1124        Cow::Owned(runs)
1125    }
1126}
1127
1128fn split_text_line_at_wide_gap(
1129    line: &TextLine,
1130    enable_tight_column_band: bool,
1131) -> Option<(TextLine, TextLine)> {
1132    if line.runs.len() < 2 {
1133        return None;
1134    }
1135    let runs = runs_sorted_by_x(line);
1136    let contains_math = runs
1137        .iter()
1138        .any(|run| looks_like_pdf_math_notation(&normalize_pdf_token(&run.text)));
1139    let tight_column_split_index = enable_tight_column_band
1140        .then(|| tight_column_band_split_index_for_runs(&runs[..]))
1141        .flatten();
1142    let largest_gap_split = largest_run_gap(&runs[..]);
1143    if contains_math && tight_column_split_index.is_none() {
1144        return None;
1145    }
1146    let split_index = match (tight_column_split_index, largest_gap_split) {
1147        (Some(tight_index), Some((wide_index, gap, x_jump)))
1148            if prefers_wide_gap_before_tight_band(&runs[..], wide_index, tight_index, gap, x_jump) =>
1149        {
1150            wide_index
1151        }
1152        (Some(tight_index), _) => tight_index,
1153        (None, Some((wide_index, _, _))) => wide_index,
1154        (None, None) => return None,
1155    };
1156    let left_runs = runs[..split_index].to_vec();
1157    let right_runs = runs[split_index..].to_vec();
1158    if left_runs.is_empty() || right_runs.is_empty() {
1159        return None;
1160    }
1161    // A wide gap between a row label and its right-aligned figures is a TABLE ROW,
1162    // not a two-column page split: the right side is a cluster of numeric values
1163    // that belong with the label (financial statements often set a wide leader gap
1164    // between the line item and its columns). Splitting it strands the figures —
1165    // the reading-order reader then emits every label, then every value, and the
1166    // table is destroyed. Keep such a row whole so the table detectors can pair the
1167    // label with its figures. Genuine two-column prose has text (not a value
1168    // cluster) on the right, so it still splits.
1169    // Strict: *every* non-blank run on the right is a figure, currency symbol, or
1170    // bracket — a pure right-aligned value cluster — AND the gap to it is a genuine
1171    // wide *leader* gap (financial statements set ~100–360pt between a line item
1172    // and its columns; a two-column page's gutter is far narrower, ~30–50pt). A
1173    // prose right column (words) or a mere column gutter still splits; only a
1174    // financial row's figure block after a leader gap is kept whole.
1175    let right_value_cells = right_runs
1176        .iter()
1177        .filter(|run| is_numeric_value(&run.text))
1178        .count();
1179    let right_all_figures = right_runs.iter().all(|run| {
1180        let text = run.text.trim();
1181        text.is_empty()
1182            || is_value_cell(text)
1183            || matches!(text, "$" | "€" | "£" | "¥" | "(" | ")" | "($")
1184    });
1185    let leader_gap = right_runs.first().map_or(0.0, |run| run.bbox.x)
1186        - left_runs
1187            .last()
1188            .map_or(0.0, |run| run.bbox.x + run.bbox.width);
1189    if right_value_cells >= 3 && right_all_figures && leader_gap >= 100.0 {
1190        return None;
1191    }
1192    Some((
1193        text_line_from_runs(left_runs)?,
1194        text_line_from_runs(right_runs)?,
1195    ))
1196}
1197
1198fn has_repeated_tight_column_band_evidence(lines: &[TextLine]) -> bool {
1199    lines
1200        .iter()
1201        .filter(|line| {
1202            let runs = runs_sorted_by_x(line);
1203            tight_column_band_split_index_for_runs(&runs[..]).is_some()
1204        })
1205        .take(2)
1206        .count()
1207        >= 2
1208}
1209
1210fn tight_column_band_split_index_for_runs(runs: &[TextRun]) -> Option<usize> {
1211    let split_index = right_column_band_split_index(runs)?;
1212    let contains_math = runs
1213        .iter()
1214        .any(|run| looks_like_pdf_math_notation(&normalize_pdf_token(&run.text)));
1215    if contains_math && !allows_math_column_split(&runs[..split_index]) {
1216        return None;
1217    }
1218    Some(split_index)
1219}
1220
1221fn right_column_band_split_index(runs: &[TextRun]) -> Option<usize> {
1222    if runs.len() < 3 || runs.first()?.bbox.x > 120.0 {
1223        return None;
1224    }
1225
1226    for index in 1..runs.len() {
1227        if index < 2 {
1228            continue;
1229        }
1230        let algorithm_like_left = allows_math_column_split(&runs[..index]);
1231        let right_x = runs[index].bbox.x;
1232        let in_standard_column_band = (300.0..=340.0).contains(&right_x);
1233        let in_algorithm_column_band = algorithm_like_left && (280.0..=340.0).contains(&right_x);
1234        if !in_standard_column_band && !in_algorithm_column_band {
1235            continue;
1236        }
1237        if runs.len() - index < 2 && !algorithm_like_left {
1238            continue;
1239        }
1240
1241        let previous = &runs[index - 1].bbox;
1242        let gap = right_x - (previous.x + previous.width);
1243        if gap < -35.0 {
1244            continue;
1245        }
1246
1247        let right_text_len = runs[index..]
1248            .iter()
1249            .map(|run| run.text.trim().len())
1250            .sum::<usize>();
1251        if right_text_len < 18 {
1252            continue;
1253        }
1254
1255        return Some(index);
1256    }
1257
1258    None
1259}
1260
1261fn allows_math_column_split(left_runs: &[TextRun]) -> bool {
1262    let text = left_runs
1263        .iter()
1264        .map(|run| run.text.trim())
1265        .filter(|text| !text.is_empty())
1266        .collect::<Vec<_>>()
1267        .join(" ");
1268    let trimmed = text.trim_start();
1269    starts_with_numbered_step(trimmed)
1270        || trimmed.starts_with("Require:")
1271        || trimmed.starts_with("Ensure:")
1272        || trimmed.starts_with("Algorithm ")
1273}
1274
1275fn largest_run_gap(runs: &[TextRun]) -> Option<(usize, f32, f32)> {
1276    runs.windows(2)
1277        .enumerate()
1278        .filter_map(|(index, window)| {
1279            let left = &window[0].bbox;
1280            let right = &window[1].bbox;
1281            let gap = right.x - (left.x + left.width);
1282            let x_jump = right.x - left.x;
1283            is_likely_column_split_gap(&window[0].bbox, &window[1].bbox, gap, x_jump).then_some((
1284                index + 1,
1285                gap,
1286                x_jump,
1287            ))
1288        })
1289        .max_by(|left, right| left.1.max(left.2).total_cmp(&right.1.max(right.2)))
1290}
1291
1292fn is_likely_column_split_gap(left: &BBox, right: &BBox, gap: f32, x_jump: f32) -> bool {
1293    if gap >= 18.0 {
1294        return true;
1295    }
1296
1297    x_jump >= 110.0 && left.x < 280.0 && right.x > 280.0
1298}
1299
1300/// Whether a candidate column split has a genuinely clear gutter at `midpoint`.
1301/// A real two-column layout never has a line crossing the gutter; a single column
1302/// falsely paired (its lines start at the left margin and extend across the page
1303/// centre, as happens when a per-glyph PDF splits a line mid-way) has many lines
1304/// straddling it. Reject when more than a quarter of the region's lines cross.
1305fn column_gutter_is_clear(lines: &[TextLine], midpoint: f32, min_y: f32, max_y: f32) -> bool {
1306    let band = 4.0;
1307    let mut region = 0usize;
1308    let mut crossing = 0usize;
1309    for line in lines {
1310        if line.bbox.y < min_y - line.bbox.height || line.bbox.y > max_y + line.bbox.height {
1311            continue;
1312        }
1313        region += 1;
1314        if line.bbox.x < midpoint - band && line.bbox.x + line.bbox.width > midpoint + band {
1315            crossing += 1;
1316        }
1317    }
1318    region == 0 || (crossing as f32) <= (region as f32) * 0.25
1319}
1320
1321fn text_line_from_runs(runs: Vec<TextRun>) -> Option<TextLine> {
1322    let bbox = union_boxes(runs.iter().map(|run| run.bbox))?;
1323    let baseline_y = runs.iter().map(|run| run.baseline_y).sum::<f32>() / runs.len() as f32;
1324    Some(TextLine {
1325        runs,
1326        bbox,
1327        baseline_y,
1328    })
1329}
1330
1331fn prefers_wide_gap_before_tight_band(
1332    runs: &[TextRun],
1333    wide_index: usize,
1334    tight_index: usize,
1335    gap: f32,
1336    x_jump: f32,
1337) -> bool {
1338    if wide_index == 0 || wide_index >= tight_index || tight_index > runs.len() {
1339        return false;
1340    }
1341
1342    let left = &runs[wide_index - 1].bbox;
1343    let right = &runs[wide_index].bbox;
1344    let stranded_right_glyphs = runs[wide_index..tight_index]
1345        .iter()
1346        .all(|run| run.bbox.x >= 280.0 && run.text.trim().chars().count() <= 2);
1347
1348    stranded_right_glyphs && left.x < 280.0 && right.x >= 280.0 && x_jump >= 110.0 && gap >= -160.0
1349}
1350
1351fn text_lines_in_reading_order(lines: &[TextLine]) -> Vec<&TextLine> {
1352    if let Some(layout) = detect_paired_text_columns(lines) {
1353        return order_column_layout(layout);
1354    }
1355    if let Some(mut columns) = detect_text_columns(lines) {
1356        columns.sort_by(|left, right| column_x(left).total_cmp(&column_x(right)));
1357        return columns
1358            .into_iter()
1359            .flat_map(|mut column| {
1360                column.sort_by(|left, right| {
1361                    right
1362                        .bbox
1363                        .y
1364                        .total_cmp(&left.bbox.y)
1365                        .then(left.bbox.x.total_cmp(&right.bbox.x))
1366                });
1367                column
1368            })
1369            .collect();
1370    }
1371    lines.iter().collect()
1372}
1373
1374fn order_column_layout(mut layout: ColumnLayout<'_>) -> Vec<&TextLine> {
1375    let mut ordered = Vec::new();
1376    sort_lines_top_down(&mut layout.leading);
1377    ordered.extend(layout.leading);
1378    layout
1379        .columns
1380        .sort_by(|left, right| column_x(left).total_cmp(&column_x(right)));
1381    for mut column in layout.columns {
1382        sort_lines_top_down(&mut column);
1383        ordered.extend(column);
1384    }
1385    sort_lines_top_down(&mut layout.trailing);
1386    ordered.extend(layout.trailing);
1387    ordered
1388}
1389
1390fn sort_lines_top_down(lines: &mut [&TextLine]) {
1391    lines.sort_by(|left, right| {
1392        right
1393            .bbox
1394            .y
1395            .total_cmp(&left.bbox.y)
1396            .then(left.bbox.x.total_cmp(&right.bbox.x))
1397    });
1398}
1399
1400fn detect_paired_text_columns(lines: &[TextLine]) -> Option<ColumnLayout<'_>> {
1401    if lines.len() < 4 {
1402        return None;
1403    }
1404
1405    let mut left_seed_indices = Vec::new();
1406    let mut right_seed_indices = Vec::new();
1407    for (left_index, left) in lines.iter().enumerate() {
1408        for (right_index, right) in lines.iter().enumerate() {
1409            if left_index == right_index || left.bbox.x >= right.bbox.x {
1410                continue;
1411            }
1412            if (left.bbox.y - right.bbox.y).abs() > column_pair_y_tolerance(left, right) {
1413                continue;
1414            }
1415            let gap = right.bbox.x - (left.bbox.x + left.bbox.width);
1416            let x_jump = right.bbox.x - left.bbox.x;
1417            if !is_likely_column_split_gap(&left.bbox, &right.bbox, gap, x_jump) {
1418                continue;
1419            }
1420            left_seed_indices.push(left_index);
1421            right_seed_indices.push(right_index);
1422        }
1423    }
1424    dedupe_indices(&mut left_seed_indices);
1425    dedupe_indices(&mut right_seed_indices);
1426    if left_seed_indices.len() < 2 || right_seed_indices.len() < 2 {
1427        return None;
1428    }
1429
1430    let left_x = average_x(lines, &left_seed_indices)?;
1431    let right_x = average_x(lines, &right_seed_indices)?;
1432    if right_x - left_x < 90.0 {
1433        return None;
1434    }
1435    let column_min_y = left_seed_indices
1436        .iter()
1437        .chain(&right_seed_indices)
1438        .map(|index| lines[*index].bbox.y)
1439        .reduce(f32::min)?;
1440    let column_max_y = left_seed_indices
1441        .iter()
1442        .chain(&right_seed_indices)
1443        .map(|index| lines[*index].bbox.y)
1444        .reduce(f32::max)?;
1445    let abstract_y = abstract_heading_y(lines);
1446    let midpoint = (left_x + right_x) / 2.0;
1447    // Reject an illusory gutter: single-column prose whose lines start at the left
1448    // margin and run across the page centre would otherwise be torn into two
1449    // false columns and read left-halves-then-right-halves.
1450    if !column_gutter_is_clear(lines, midpoint, column_min_y, column_max_y) {
1451        return None;
1452    }
1453    let mut leading = Vec::new();
1454    let mut trailing = Vec::new();
1455    let mut left_column = Vec::new();
1456    let mut right_column = Vec::new();
1457
1458    for line in lines {
1459        if is_likely_front_matter_line(line, abstract_y)
1460            || line.bbox.y > column_max_y + line.bbox.height
1461        {
1462            leading.push(line);
1463        } else if line.bbox.y < column_min_y - line.bbox.height * 1.8
1464            && (is_likely_page_number_line(line) || is_likely_bottom_footnote_line(line))
1465        {
1466            trailing.push(line);
1467        } else if line.bbox.x < midpoint {
1468            left_column.push(line);
1469        } else {
1470            right_column.push(line);
1471        }
1472    }
1473
1474    if left_column.len() < 2 || right_column.len() < 2 {
1475        return None;
1476    }
1477
1478    Some(ColumnLayout {
1479        leading,
1480        columns: vec![left_column, right_column],
1481        trailing,
1482    })
1483}
1484
1485fn column_pair_y_tolerance(left: &TextLine, right: &TextLine) -> f32 {
1486    left.bbox.height.max(right.bbox.height) * 0.45
1487}
1488
1489fn abstract_heading_y(lines: &[TextLine]) -> Option<f32> {
1490    lines
1491        .iter()
1492        .find(|line| text_line_plain_text(line).eq_ignore_ascii_case("abstract"))
1493        .map(|line| line.bbox.y)
1494}
1495
1496fn is_likely_front_matter_line(line: &TextLine, abstract_y: Option<f32>) -> bool {
1497    abstract_y.is_some_and(|y| line.bbox.y > y + 36.0)
1498}
1499
1500fn is_likely_bottom_footnote_line(line: &TextLine) -> bool {
1501    average_run_size(line) <= 10.0 && text_line_plain_text(line).len() > 4
1502}
1503
1504fn average_run_size(line: &TextLine) -> f32 {
1505    if line.runs.is_empty() {
1506        return line.bbox.height;
1507    }
1508    line.runs.iter().map(|run| run.size).sum::<f32>() / line.runs.len() as f32
1509}
1510
1511fn is_likely_page_number_line(line: &TextLine) -> bool {
1512    let text = text_line_plain_text(line);
1513    !text.is_empty() && text.len() <= 4 && text.chars().all(|character| character.is_ascii_digit())
1514}
1515
1516fn text_line_plain_text(line: &TextLine) -> String {
1517    // Geometry-aware join so callers (table-label checks, wrapped-label detection,
1518    // header assembly) see real words rather than the letter-spaced output the old
1519    // `trim().join(" ")` produced on glyph-by-glyph PDFs.
1520    join_runs_spaced(&runs_sorted_by_x(line)).trim().to_owned()
1521}
1522
/// Sort the indices ascending and drop duplicates, in place.
fn dedupe_indices(indices: &mut Vec<usize>) {
    indices.sort_unstable();
    indices.dedup_by_key(|index| *index);
}
1527
1528fn average_x(lines: &[TextLine], indices: &[usize]) -> Option<f32> {
1529    if indices.is_empty() {
1530        return None;
1531    }
1532    Some(
1533        indices
1534            .iter()
1535            .map(|index| lines[*index].bbox.x)
1536            .sum::<f32>()
1537            / indices.len() as f32,
1538    )
1539}
1540
1541fn detect_text_columns(lines: &[TextLine]) -> Option<Vec<Vec<&TextLine>>> {
1542    if lines.len() < 4 {
1543        return None;
1544    }
1545
1546    let mut centers = lines
1547        .iter()
1548        .enumerate()
1549        .map(|(index, line)| (index, line.bbox.x + line.bbox.width / 2.0))
1550        .collect::<Vec<_>>();
1551    centers.sort_by(|left, right| left.1.total_cmp(&right.1));
1552
1553    let (split_index, largest_gap) = centers
1554        .windows(2)
1555        .enumerate()
1556        .map(|(index, window)| (index + 1, window[1].1 - window[0].1))
1557        .max_by(|left, right| left.1.total_cmp(&right.1))?;
1558    if largest_gap < 90.0 {
1559        return None;
1560    }
1561
1562    let (left_indices, right_indices) = centers.split_at(split_index);
1563    if left_indices.len() < 2 || right_indices.len() < 2 {
1564        return None;
1565    }
1566
1567    let left = left_indices
1568        .iter()
1569        .map(|(index, _)| &lines[*index])
1570        .collect::<Vec<_>>();
1571    let right = right_indices
1572        .iter()
1573        .map(|(index, _)| &lines[*index])
1574        .collect::<Vec<_>>();
1575
1576    let overlap = y_overlap(&left, &right)?;
1577    let average_height = average_line_height(lines);
1578    if overlap < average_height {
1579        return None;
1580    }
1581
1582    // A large gap between column *centres* is not enough: a single column whose
1583    // lines were split mid-way has two centre clusters but the halves abut (the
1584    // left half's right edge meets the right half's left edge). Require a genuine
1585    // gutter between the columns' edges — contiguous halves are one wrapped line.
1586    let left_right_edge = left
1587        .iter()
1588        .map(|line| line.bbox.x + line.bbox.width)
1589        .fold(f32::MIN, f32::max);
1590    let right_left_edge = right.iter().map(|line| line.bbox.x).fold(f32::MAX, f32::min);
1591    if right_left_edge - left_right_edge < 15.0 {
1592        return None;
1593    }
1594
1595    Some(vec![left, right])
1596}
1597
1598fn column_x(lines: &[&TextLine]) -> f32 {
1599    if lines.is_empty() {
1600        return 0.0;
1601    }
1602    lines.iter().map(|line| line.bbox.x).sum::<f32>() / lines.len() as f32
1603}
1604
1605fn y_overlap(left: &[&TextLine], right: &[&TextLine]) -> Option<f32> {
1606    let left_min = left.iter().map(|line| line.bbox.y).reduce(f32::min)?;
1607    let left_max = left
1608        .iter()
1609        .map(|line| line.bbox.y + line.bbox.height)
1610        .reduce(f32::max)?;
1611    let right_min = right.iter().map(|line| line.bbox.y).reduce(f32::min)?;
1612    let right_max = right
1613        .iter()
1614        .map(|line| line.bbox.y + line.bbox.height)
1615        .reduce(f32::max)?;
1616    Some((left_max.min(right_max) - left_min.max(right_min)).max(0.0))
1617}
1618
1619fn average_line_height(lines: &[TextLine]) -> f32 {
1620    let total = lines.iter().map(|line| line.bbox.height).sum::<f32>();
1621    total / lines.len() as f32
1622}
1623
1624fn text_block_from_line(page_number: usize, line: &TextLine, body_size: f32) -> Option<TextBlock> {
1625    let text = text_from_line_runs(line);
1626    let text = clean_pdf_line_text(&text);
1627    if text.is_empty() {
1628        return None;
1629    }
1630
1631    Some(TextBlock {
1632        text: text.clone(),
1633        kind: classify_text_line(&text, line_dominant_size(line), body_size),
1634        bbox: Some(line.bbox),
1635        lines: vec![Line {
1636            text,
1637            bbox: Some(line.bbox),
1638            spans: line
1639                .runs
1640                .iter()
1641                .filter_map(|run| {
1642                    let text = clean_pdf_span_text(&run.text);
1643                    (!text.is_empty()).then(|| Span {
1644                        text,
1645                        bbox: Some(run.bbox),
1646                        font: run.font.clone(),
1647                        size: Some(run.size),
1648                        bold: run.bold,
1649                        italic: run.italic,
1650                    })
1651                })
1652                .collect(),
1653        }],
1654        source_anchors: vec![anchor(
1655            page_number,
1656            Some(line.bbox),
1657            source_ids_for_line(line),
1658        )],
1659        confidence: Some(Confidence {
1660            score: 0.82,
1661            calibrated: false,
1662        }), ..Default::default()
1663    })
1664}
1665
1666/// Assemble a line's text from its x-sorted runs. A space is placed between two
1667/// runs only when the producer already encoded one (a space at the boundary) or
1668/// the horizontal gap is wide enough to be a word break, sized to the font's own
1669/// space-glyph width. Run-internal spaces are preserved verbatim — only the
1670/// inter-run boundary is decided here. This replaces the old `trim().join(" ")`,
1671/// which both dropped producer spaces (joining words: "Netincome") and inserted
1672/// spurious ones (splitting fragmented words: "Y ear", "2 0 5 4 9").
1673/// Per-line space threshold for a run of single glyphs, adapted to the line's own
1674/// gap distribution. PDFs that place every glyph individually encode spacing only
1675/// in the inter-glyph gaps, and the magnitude differs wildly by context: tight
1676/// body text glues words under a fixed threshold, while a letter-spaced ("tracked")
1677/// table header splits into "P r o d u c t i v i t y" under the same one. Anchoring
1678/// the threshold to the median gap of the line — tight lines get a low bar (word
1679/// spaces recovered), tracked lines a capped high bar (letters stay joined) —
1680/// handles both. Returns `None` when there are too few gaps to judge.
1681fn adaptive_single_glyph_gap(runs: &[TextRun]) -> Option<f32> {
1682    let mut gaps: Vec<f32> = Vec::new();
1683    let mut space_w = 0.0f32;
1684    let mut prev_end: Option<f32> = None;
1685    for run in runs {
1686        if run.text.is_empty() {
1687            continue;
1688        }
1689        space_w = space_w.max(run.space_width);
1690        if let Some(end) = prev_end {
1691            let gap = run.bbox.x - end;
1692            if gap.is_finite() && gap > 0.0 {
1693                gaps.push(gap);
1694            }
1695        }
1696        prev_end = Some(run.bbox.x + run.bbox.width);
1697    }
1698    if gaps.len() < 3 || space_w <= 0.0 {
1699        return None;
1700    }
1701    gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1702    let median = gaps[gaps.len() / 2];
1703    // Sit the bar above the line's typical gap so word spaces (much larger than
1704    // the intra-word gap) clear it. The ceiling is the static single-glyph default
1705    // (0.4 of a space width): the adaptive bar may only *lower* the threshold to
1706    // recover word spaces on tight or letter-spaced lines, never raise it — a
1707    // higher bar would glue words on loosely-set text whose median gap is moderate.
1708    Some((median * 1.8).clamp(space_w * 0.08, space_w * 0.4))
1709}
1710
1711fn join_runs_spaced(runs: &[TextRun]) -> String {
1712    let mut out = String::new();
1713    // Per-line adaptive bar for single-glyph sequences (see fn docs).
1714    let adaptive_glyph_gap = adaptive_single_glyph_gap(runs);
1715    // (end_x, space_width, baseline_y, multi_char)
1716    let mut previous: Option<(f32, f32, f32, bool)> = None;
1717    for run in runs {
1718        if run.text.is_empty() {
1719            continue;
1720        }
1721        let multi_char = run.text.trim().chars().count() >= 2;
1722        if let Some((prev_end_x, prev_space_width, prev_baseline_y, prev_multi)) = previous {
1723            let boundary_has_space = out.ends_with(char::is_whitespace)
1724                || run.text.starts_with(char::is_whitespace);
1725            let gap = run.bbox.x - prev_end_x;
1726            // Two complete (multi-char) tokens are separate words, so even a tight
1727            // gap is a word break; a sequence of single glyphs may be a
1728            // letter-spaced word, so only a clear gap separates them. This is what
1729            // distinguishes "It occurs" (two words, ~2pt apart) from a fragmented
1730            // or letter-spaced "U N I T E D" that should read "UNITED".
1731            // A digit-to-digit boundary, though, is a single number split mid-way
1732            // ("79,1" + "13" = "79,113"): treat it like a letter-spaced
1733            // continuation (the wider threshold) so a number is not torn at an
1734            // internal gap, while a real column gap still separates two figures.
1735            let numeric_continuation = out.trim_end().ends_with(|c: char| c.is_ascii_digit())
1736                && run.text.trim_start().starts_with(|c: char| c.is_ascii_digit());
1737            let tokens_separate = (prev_multi || multi_char) && !numeric_continuation;
1738            // Single-glyph boundaries use the per-line adaptive bar when available;
1739            // multi-char tokens keep the tight word-break threshold.
1740            let threshold = match adaptive_glyph_gap {
1741                Some(adaptive) if !tokens_separate => adaptive,
1742                _ => word_gap_threshold(prev_space_width, run.space_width, run.size, tokens_separate),
1743            };
1744            // A meaningful baseline shift means the adjacent run sits on a
1745            // different line of text (a super/subscript or a stacked cell being
1746            // flattened); keep those tokens apart even when they abut horizontally.
1747            let baseline_break =
1748                (prev_baseline_y - run.baseline_y).abs() >= run.size.max(1.0) * 0.18;
1749            // Two complete tokens that appear to *overlap* by more than half a space
1750            // width are separate words whose advance was over-estimated (common with
1751            // fallback metrics), not a continuation — a real word never overlaps the
1752            // next. A near-zero gap stays joined, so a ligature fragment that abuts
1753            // ("fi" + "scal") is unaffected.
1754            let overlap_break =
1755                tokens_separate && gap <= -(prev_space_width.max(run.space_width) * 0.6).max(0.5);
1756            if !out.is_empty()
1757                && !boundary_has_space
1758                && (gap >= threshold || baseline_break || overlap_break)
1759            {
1760                out.push(' ');
1761            }
1762        }
1763        out.push_str(&run.text);
1764        previous = Some((
1765            run.bbox.x + run.bbox.width,
1766            run.space_width,
1767            run.baseline_y,
1768            multi_char,
1769        ));
1770    }
1771    out
1772}
1773
/// Smallest horizontal gap (page units) between two runs that counts as a word
/// break.
///
/// The reference width is the largest of: the left run's space width, the right
/// run's space width, a quarter of `size`, and `0.1`. Separate tokens
/// (`tokens_separate == true`) need only 10% of that width; single-glyph
/// boundaries need 40%, so a letter-spaced word is not torn apart.
fn word_gap_threshold(
    left_space_width: f32,
    right_space_width: f32,
    size: f32,
    tokens_separate: bool,
) -> f32 {
    let mut reference_space = left_space_width;
    for candidate in [right_space_width, size * 0.25, 0.1] {
        reference_space = reference_space.max(candidate);
    }
    let fraction = if tokens_separate { 0.1 } else { 0.4 };
    reference_space * fraction
}
1791
1792fn text_from_line_runs(line: &TextLine) -> String {
1793    let runs = runs_sorted_by_x(line);
1794    if !line_has_math_script_context(&runs[..]) {
1795        return join_runs_spaced(&runs[..]);
1796    }
1797
1798    let Some(baseline_y) = dominant_baseline_y(&runs[..]) else {
1799        return join_runs_spaced(&runs[..]);
1800    };
1801    let mut pieces: Vec<String> = Vec::new();
1802
1803    for run in runs.iter() {
1804        let token = run.text.trim();
1805        if token.is_empty() {
1806            continue;
1807        }
1808
1809        if let Some(script) = script_kind_for_run(run, baseline_y) {
1810            if let Some(previous) = pieces.last_mut() {
1811                if can_attach_math_script(previous, token) {
1812                    previous.push_str(&format_math_script(script, token));
1813                    continue;
1814                }
1815            }
1816        }
1817
1818        pieces.push(token.to_owned());
1819    }
1820
1821    pieces.join(" ")
1822}
1823
1824fn dominant_baseline_y(runs: &[TextRun]) -> Option<f32> {
1825    let max_size = runs
1826        .iter()
1827        .map(|run| run.size)
1828        .reduce(f32::max)
1829        .filter(|size| *size > 0.0)?;
1830    let mut baselines = runs
1831        .iter()
1832        .filter(|run| run.size >= max_size * 0.8)
1833        .map(|run| run.baseline_y)
1834        .collect::<Vec<_>>();
1835    if baselines.is_empty() {
1836        baselines = runs.iter().map(|run| run.baseline_y).collect();
1837    }
1838    baselines.sort_by(|left, right| left.total_cmp(right));
1839    baselines.get(baselines.len() / 2).copied()
1840}
1841
1842fn script_kind_for_run(run: &TextRun, baseline_y: f32) -> Option<ScriptKind> {
1843    let delta = run.baseline_y - baseline_y;
1844    let threshold = (run.size * 0.25).clamp(2.0, 4.0);
1845    if delta >= threshold {
1846        Some(ScriptKind::Superscript)
1847    } else if delta <= -threshold {
1848        Some(ScriptKind::Subscript)
1849    } else {
1850        None
1851    }
1852}
1853
1854fn line_has_math_script_context(runs: &[TextRun]) -> bool {
1855    let joined = runs
1856        .iter()
1857        .map(|run| run.text.as_str())
1858        .collect::<Vec<_>>()
1859        .join(" ");
1860    joined.chars().any(|character| {
1861        matches!(
1862            character,
1863            // ASCII '-' is excluded: it is overwhelmingly a hyphen in prose
1864            // ("non-trade", "well-known"), so triggering math assembly on it
1865            // mangles hyphenated words. The real math minus is U+2212 ('−').
1866            '=' | '+'
1867                | '−'
1868                | '×'
1869                | '*'
1870                | '^'
1871                | '_'
1872                | '∈'
1873                | '≤'
1874                | '≥'
1875                | '≠'
1876                | 'λ'
1877                | 'θ'
1878                | 'ρ'
1879                | 'τ'
1880                | 'Σ'
1881                | '∑'
1882        )
1883    }) || runs.windows(2).any(|window| {
1884        let left = window[0].text.trim();
1885        let right = window[1].text.trim();
1886        // Require an actual baseline offset: a super/subscript sits visibly above
1887        // or below its base. Without this the predicate fires on ordinary
1888        // glyph-by-glyph prose (every letter is a single alphanumeric "base"
1889        // followed by another "script"), which is the norm in Chrome/Skia PDFs,
1890        // wrongly routing plain text through the script-assembly path.
1891        let baseline_delta = (window[0].baseline_y - window[1].baseline_y).abs();
1892        let script_offset = window[0].size.max(window[1].size) * 0.2;
1893        baseline_delta >= script_offset
1894            && is_math_script_base(left)
1895            && is_math_script_text(right)
1896    })
1897}
1898
1899fn can_attach_math_script(previous: &str, token: &str) -> bool {
1900    !previous.ends_with('^')
1901        && !previous.ends_with('_')
1902        && is_math_script_text(token)
1903        && previous_has_math_script_base(previous)
1904}
1905
/// Whether `token` can carry a script: after stripping `(`, `[` and `{` from
/// both ends it is either a single alphanumeric character or starts with a
/// backslash.
fn is_math_script_base(token: &str) -> bool {
    let core = token.trim_matches(&['(', '[', '{'][..]);
    if core.starts_with('\\') {
        return true;
    }
    let mut glyphs = core.chars();
    match (glyphs.next(), glyphs.next()) {
        (Some(only), None) => only.is_alphanumeric(),
        _ => false,
    }
}
1912
/// Whether the text assembled so far ends in something a script can attach to.
///
/// If it ends with a closing bracket (`}`, `]`, `)`), it qualifies only when it
/// contains a backslash, `_` or `^` somewhere. Otherwise the last character
/// that is not `*`, `'` or `′` must be alphabetic or a backslash. Trailing
/// whitespace is ignored.
fn previous_has_math_script_base(previous: &str) -> bool {
    let tail = previous.trim_end();
    if tail.ends_with(&['}', ']', ')'][..]) {
        return tail.contains(&['\\', '_', '^'][..]);
    }
    tail.trim_end_matches(&['*', '\'', '′'][..])
        .chars()
        .next_back()
        .is_some_and(|glyph| glyph.is_alphabetic() || glyph == '\\')
}
1924
/// Whether `token` looks like the content of a super/subscript: after
/// stripping round and square brackets from both ends it is non-empty and made
/// only of alphanumerics and the characters `+ - − = , . \`.
fn is_math_script_text(token: &str) -> bool {
    const SCRIPT_PUNCTUATION: &[char] = &['+', '-', '−', '=', ',', '.', '\\'];

    let core = token.trim_matches(&['(', ')', '[', ']'][..]);
    if core.is_empty() {
        return false;
    }
    core.chars()
        .all(|glyph| glyph.is_alphanumeric() || SCRIPT_PUNCTUATION.contains(&glyph))
}
1933
1934fn format_math_script(kind: ScriptKind, token: &str) -> String {
1935    let marker = match kind {
1936        ScriptKind::Superscript => '^',
1937        ScriptKind::Subscript => '_',
1938    };
1939    let cleaned = token.trim();
1940    if cleaned.chars().count() == 1
1941        || cleaned
1942            .chars()
1943            .all(|character| character.is_ascii_alphanumeric())
1944    {
1945        format!("{marker}{cleaned}")
1946    } else {
1947        format!("{marker}{{{cleaned}}}")
1948    }
1949}
1950
1951fn merge_wrapped_text_blocks(blocks: Vec<TextBlock>) -> Vec<TextBlock> {
1952    let mut merged: Vec<TextBlock> = Vec::new();
1953    for block in blocks {
1954        if let Some(previous) = merged.last_mut() {
1955            if should_merge_text_blocks(previous, &block) {
1956                merge_text_block(previous, block);
1957                continue;
1958            }
1959        }
1960        merged.push(block);
1961    }
1962    merged
1963}
1964
1965fn should_merge_text_blocks(previous: &TextBlock, next: &TextBlock) -> bool {
1966    let Some(previous_bbox) = previous.bbox else {
1967        return false;
1968    };
1969    let Some(next_bbox) = next.bbox else {
1970        return false;
1971    };
1972    let baseline_gap = previous_bbox.y - next_bbox.y;
1973    if baseline_gap <= 0.0 || baseline_gap > previous_bbox.height.max(next_bbox.height) * 1.8 {
1974        return false;
1975    }
1976    let x_aligned = (previous_bbox.x - next_bbox.x).abs() <= 18.0;
1977    let hyphenated = previous.text.ends_with('-') && starts_with_lowercase(&next.text);
1978    if x_aligned && hyphenated {
1979        return true;
1980    }
1981    if starts_with_numbered_step(&previous.text) && starts_with_numbered_step(&next.text) {
1982        return false;
1983    }
1984    if previous.kind != "paragraph" || next.kind != "paragraph" {
1985        return false;
1986    }
1987    let lowercase_continuation =
1988        starts_with_lowercase(&next.text) && !ends_sentence(&previous.text);
1989    x_aligned && (hyphenated || lowercase_continuation)
1990}
1991
1992fn merge_text_block(previous: &mut TextBlock, next: TextBlock) {
1993    previous.text = join_wrapped_text(&previous.text, &next.text);
1994    previous.bbox = union_boxes(previous.bbox.into_iter().chain(next.bbox)).or(previous.bbox);
1995    previous.lines.extend(next.lines);
1996    for anchor in next.source_anchors {
1997        previous.source_anchors.push(anchor);
1998    }
1999}
2000
/// Joins two wrapped lines of text.
///
/// When `previous` ends with a hyphen, the hyphen is dropped and `next` (minus
/// leading whitespace) is glued on directly. Otherwise the two are joined with
/// exactly one space between the trimmed edges.
fn join_wrapped_text(previous: &str, next: &str) -> String {
    let continuation = next.trim_start();
    match previous.strip_suffix('-') {
        Some(stem) => [stem, continuation].concat(),
        None => [previous.trim_end(), " ", continuation].concat(),
    }
}
2008
/// Whether the first alphabetic character of `text` is lowercase. Leading
/// non-alphabetic characters are skipped; text with no letters yields `false`.
fn starts_with_lowercase(text: &str) -> bool {
    text.chars()
        .skip_while(|glyph| !glyph.is_alphabetic())
        .next()
        .map_or(false, |first_letter| first_letter.is_lowercase())
}
2014
/// Whether `text` begins (after leading whitespace) with one or more ASCII
/// digits immediately followed by `:` or `.`, as in "1." or "12:".
fn starts_with_numbered_step(text: &str) -> bool {
    let body = text.trim_start();
    let after_digits = body.trim_start_matches(|glyph: char| glyph.is_ascii_digit());
    let has_digits = after_digits.len() < body.len();
    has_digits && after_digits.starts_with(&[':', '.'][..])
}
2027
/// Whether `text`, ignoring trailing whitespace, ends with `.`, `!` or `?`.
fn ends_sentence(text: &str) -> bool {
    text.trim_end().ends_with(&['.', '!', '?'][..])
}
2034
2035fn clean_pdf_line_text(text: &str) -> String {
2036    let text = repair_windows_1252_ellipsis_before_tokenizing(text);
2037    let tokens = text
2038        .split_whitespace()
2039        .map(normalize_pdf_token)
2040        .filter(|token| !token.is_empty())
2041        .collect::<Vec<_>>();
2042    let mut cleaned: Vec<String> = Vec::new();
2043    let mut index = 0;
2044    while index < tokens.len() {
2045        let token = tokens[index].as_str();
2046        if is_closing_punctuation_token(token) && !cleaned.is_empty() {
2047            let previous = cleaned.last_mut().expect("checked non-empty");
2048            previous.push_str(token);
2049            index += 1;
2050            continue;
2051        }
2052        if is_joining_apostrophe(token) && !cleaned.is_empty() && index + 1 < tokens.len() {
2053            let next = tokens[index + 1].as_str();
2054            if is_word_piece(next) {
2055                let previous = cleaned.last_mut().expect("checked non-empty");
2056                previous.push('\'');
2057                previous.push_str(next);
2058                index += 2;
2059                continue;
2060            }
2061        }
2062        if is_joining_hyphen(token) && !cleaned.is_empty() && index + 1 < tokens.len() {
2063            let next = tokens[index + 1].as_str();
2064            if is_word_piece(next) {
2065                let previous = cleaned.last_mut().expect("checked non-empty");
2066                previous.push('-');
2067                previous.push_str(next);
2068                index += 2;
2069                continue;
2070            }
2071        }
2072        if let Some(previous) = cleaned.last_mut() {
2073            if should_join_after_trailing_hyphen(previous, token) {
2074                previous.push_str(token);
2075                index += 1;
2076                continue;
2077            }
2078            if should_join_pdf_word_piece(previous, token) {
2079                previous.push_str(token);
2080                index += 1;
2081                continue;
2082            }
2083        }
2084        if is_letter_fragment(token) {
2085            let mut merged = String::new();
2086            let mut end = index;
2087            while end < tokens.len() && is_letter_fragment(tokens[end].as_str()) {
2088                merged.push_str(tokens[end].as_str());
2089                end += 1;
2090            }
2091            if end - index >= 2 {
2092                cleaned.push(merged);
2093                index = end;
2094                continue;
2095            }
2096        }
2097        cleaned.push(token.to_owned());
2098        index += 1;
2099    }
2100    repair_pdf_math_notation(&repair_pdf_word_fragment_phrases(&cleaned.join(" ")))
2101}
2102
2103fn clean_pdf_span_text(text: &str) -> String {
2104    repair_pdf_math_notation(&normalize_pdf_token(text))
2105}
2106
/// Repairs a fixed list of words that specific PDFs split with a stray space
/// ("th e" -> "the", "speake rs" -> "speakers").
///
/// Each broken phrase is replaced only where it stands on its own, i.e. where
/// it is not directly preceded or followed by an alphanumeric character. A
/// plain substring replacement would also rewrite ordinary prose that merely
/// contains the same character sequence across a word boundary ("with each" ->
/// "witheach", "can do" -> "cando", "turn signal" -> "turnsignal").
///
/// Replacements are applied in table order, each on the result of the previous.
fn repair_pdf_word_fragment_phrases(text: &str) -> String {
    const SPLIT_WORD_REPAIRS: &[(&str, &str)] = &[
        ("a c onversatio n", "a conversation"),
        ("ac onversatio n", "a conversation"),
        ("an other", "another"),
        ("ce nters", "centers"),
        ("prod uction", "production"),
        ("de mands", "demands"),
        ("turn s", "turns"),
        ("coordinate s", "coordinates"),
        ("coordinat e", "coordinate"),
        ("facilitat e", "facilitate"),
        ("speake rs", "speakers"),
        ("listener s'", "listeners'"),
        ("th e", "the"),
        ("p resent", "present"),
        ("linguisti c", "linguistic"),
        ("an d", "and"),
        ("inferen ces", "inferences"),
        ("attentio n", "attention"),
        ("B eyond", "Beyond"),
        ("variabilit y", "variability"),
        ("l essons", "lessons"),
        ("re peating", "repeating"),
        ("import ant", "important"),
        ("sp ecified", "specified"),
    ];

    let mut repaired = text.to_owned();
    for &(broken, fixed) in SPLIT_WORD_REPAIRS {
        // Skip the rebuild for the common case where the phrase is absent.
        if repaired.contains(broken) {
            repaired = replace_standalone_phrase(&repaired, broken, fixed);
        }
    }
    repaired
}

/// Replaces every occurrence of `broken` in `text` with `fixed`, except where
/// the occurrence is glued to an alphanumeric character on either side.
fn replace_standalone_phrase(text: &str, broken: &str, fixed: &str) -> String {
    let mut output = String::with_capacity(text.len());
    let mut copied_up_to = 0;
    for (start, matched) in text.match_indices(broken) {
        let end = start + matched.len();
        let glued_before = text[..start]
            .chars()
            .next_back()
            .is_some_and(char::is_alphanumeric);
        let glued_after = text[end..]
            .chars()
            .next()
            .is_some_and(char::is_alphanumeric);
        if glued_before || glued_after {
            // Part of a longer word on one side: leave the text as it is.
            continue;
        }
        output.push_str(&text[copied_up_to..start]);
        output.push_str(fixed);
        copied_up_to = end;
    }
    output.push_str(&text[copied_up_to..]);
    output
}
2139
2140fn normalize_pdf_token(token: &str) -> String {
2141    let normalized = token
2142        .replace("â\u{80}\u{98}", "'")
2143        .replace("â\u{80}\u{99}", "'")
2144        .replace("Â·", "·")
2145        .replace("â\u{84}\u{93}", "ℓ")
2146        .replace("Î“", "Γ")
2147        .replace("Î˜", "Θ")
2148        .replace("Î›", "Λ")
2149        .replace("Î\u{a0}", "Π")
2150        .replace("Î£", "Σ")
2151        .replace("Î¦", "Φ")
2152        .replace("Î©", "Ω")
2153        .replace("Î»", "λ")
2154        .replace("Ï\u{84}", "τ")
2155        .replace("Ã\u{97}", "×")
2156        .replace("â\u{86}\u{92}", "→")
2157        .replace("â\u{89}¥", "≥")
2158        .replace("â\u{89}¤", "≤")
2159        .replace("â\u{88}\u{88}", "∈")
2160        .replace("â\u{88}\u{91}", "∑")
2161        .replace(['‘', '’'], "'")
2162        .replace(['“', '”'], "\"");
2163    let normalized = expand_latin_ligatures(&normalized);
2164    let normalized = repair_windows_1252_control_punctuation(&normalized);
2165    repair_embedded_pdf_control_glyphs(&normalized)
2166}
2167
/// Replaces the Unicode Latin presentation-form ligatures (U+FB00-U+FB06) with
/// their component ASCII letters; all other characters pass through unchanged.
///
/// Some PDF producers map a ligature glyph to the precomposed codepoint, which
/// hurts downstream search and matching. NFC/NFD do not decompose these
/// characters, hence the explicit table.
fn expand_latin_ligatures(text: &str) -> String {
    /// Letters spelled by a ligature codepoint, or `None` for anything else.
    fn spelled_out(glyph: char) -> Option<&'static str> {
        match glyph {
            '\u{FB00}' => Some("ff"),
            '\u{FB01}' => Some("fi"),
            '\u{FB02}' => Some("fl"),
            '\u{FB03}' => Some("ffi"),
            '\u{FB04}' => Some("ffl"),
            '\u{FB05}' | '\u{FB06}' => Some("st"),
            _ => None,
        }
    }

    let mut expanded = String::with_capacity(text.len());
    for glyph in text.chars() {
        match spelled_out(glyph) {
            Some(letters) => expanded.push_str(letters),
            None => expanded.push(glyph),
        }
    }
    expanded
}
2191
/// Replaces characters in the U+0080-U+009F range with the text of the
/// Windows-1252 glyph at the same byte value (or an ASCII stand-in for it).
///
/// Codepoints in that range without an entry below, and every character
/// outside it, are copied through unchanged.
fn repair_windows_1252_control_punctuation(text: &str) -> String {
    /// Replacement text for one control-range codepoint, if it has one.
    fn replacement(glyph: char) -> Option<&'static str> {
        let text = match glyph {
            '\u{80}' => "EUR",
            '\u{82}' => ",",
            '\u{83}' => "f",
            '\u{84}' => "\"",
            '\u{85}' => "...",
            '\u{86}' => "†",
            '\u{87}' => "‡",
            '\u{88}' => "^",
            '\u{89}' => "‰",
            '\u{8a}' => "Š",
            '\u{8b}' => "<",
            '\u{8c}' => "OE",
            '\u{8e}' => "Ž",
            '\u{91}' | '\u{92}' => "'",
            '\u{93}' | '\u{94}' => "\"",
            '\u{95}' => "*",
            '\u{96}' => "–",
            '\u{97}' => "—",
            '\u{98}' => "~",
            '\u{99}' => "(TM)",
            '\u{9a}' => "š",
            '\u{9b}' => ">",
            '\u{9c}' => "oe",
            '\u{9e}' => "ž",
            '\u{9f}' => "Ÿ",
            _ => return None,
        };
        Some(text)
    }

    let mut repaired = String::with_capacity(text.len());
    for glyph in text.chars() {
        match replacement(glyph) {
            Some(stand_in) => repaired.push_str(stand_in),
            None => repaired.push(glyph),
        }
    }
    repaired
}
2228
/// Rewrites every U+0085 (the Windows-1252 ellipsis byte decoded as a control
/// character) as three ASCII dots. `char::is_whitespace` is true for U+0085,
/// so whitespace tokenization would otherwise consume it as a separator.
fn repair_windows_1252_ellipsis_before_tokenizing(text: &str) -> String {
    text.split('\u{85}').collect::<Vec<_>>().join("...")
}
2232
/// Resolves the control characters U+0002 and U+0003 inside a token.
///
/// Directly before an alphabetic character, U+0002 becomes "fi" and U+0003
/// becomes "fl". Elsewhere U+0002 is dropped, while U+0003 is kept as is. All
/// other characters are copied through.
fn repair_embedded_pdf_control_glyphs(token: &str) -> String {
    let mut repaired = String::with_capacity(token.len());
    let mut glyphs = token.chars().peekable();
    while let Some(glyph) = glyphs.next() {
        let before_letter = glyphs.peek().is_some_and(|next| next.is_alphabetic());
        match (glyph, before_letter) {
            ('\u{2}', true) => repaired.push_str("fi"),
            ('\u{2}', false) => {}
            ('\u{3}', true) => repaired.push_str("fl"),
            _ => repaired.push(glyph),
        }
    }
    repaired
}
2250
/// Whether `characters[index]` exists and is alphabetic. An out-of-range index
/// yields `false`.
fn has_following_alphabetic(characters: &[char], index: usize) -> bool {
    matches!(characters.get(index), Some(glyph) if glyph.is_alphabetic())
}
2256
/// Whether `token` is exactly one closing punctuation mark: one of
/// `. , : ; ! ? ) ] }`.
fn is_closing_punctuation_token(token: &str) -> bool {
    const CLOSERS: [&str; 9] = [".", ",", ":", ";", "!", "?", ")", "]", "}"];
    CLOSERS.contains(&token)
}
2260
/// Whether `token` should be glued onto `previous` because `previous` ends in
/// a hyphen: `token` must start with an ASCII alphanumeric character and
/// `previous` must contain at least one.
fn should_join_after_trailing_hyphen(previous: &str, token: &str) -> bool {
    if !previous.ends_with('-') {
        return false;
    }
    let token_starts_word = token.starts_with(|glyph: char| glyph.is_ascii_alphanumeric());
    let previous_has_word = previous.contains(|glyph: char| glyph.is_ascii_alphanumeric());
    token_starts_word && previous_has_word
}
2271
/// Whether `previous` and `token` are the two halves of one of the known split
/// words ("coordina" + "ting", "de" + "scribe", "foc" + "i", "pro" + "posed").
///
/// Every listed pair is purely alphabetic and lowercase at the join, so the
/// table lookup alone decides the result.
fn should_join_pdf_word_piece(previous: &str, token: &str) -> bool {
    const KNOWN_SPLITS: [(&str, &str); 4] = [
        ("coordina", "ting"),
        ("de", "scribe"),
        ("foc", "i"),
        ("pro", "posed"),
    ];
    KNOWN_SPLITS.contains(&(previous, token))
}
2290
/// Whether `token` is non-empty and consists only of alphabetic characters.
fn is_alphabetic_word(token: &str) -> bool {
    match token {
        "" => false,
        word => word.chars().all(char::is_alphabetic),
    }
}
2294
2295fn repair_pdf_math_notation(text: &str) -> String {
2296    let normalized = text.replace("Â·", "·").replace("â\u{84}\u{93}", "ℓ");
2297    if !looks_like_pdf_math_notation(&normalized) {
2298        return strip_pdf_control_glyphs(&normalized);
2299    }
2300
2301    let normalized = repair_combining_math_operator_sequences(&normalized);
2302    let symbols = replace_math_symbols(&normalized);
2303    strip_pdf_control_glyphs(&repair_math_subscript_spacing(&symbols))
2304}
2305
/// Collapses an equals sign paired with U+0338 (combining long solidus overlay)
/// into the single character `≠`. Three spellings are handled, in this order:
/// overlay + space + `=`, overlay + `=`, and `=` + overlay.
fn repair_combining_math_operator_sequences(text: &str) -> String {
    const NEGATED_EQUALS: [&str; 3] = ["\u{338} =", "\u{338}=", "=\u{338}"];

    let mut repaired = text.to_owned();
    for spelling in NEGATED_EQUALS {
        repaired = repaired.replace(spelling, "≠");
    }
    repaired
}
2311
2312fn looks_like_pdf_math_notation(text: &str) -> bool {
2313    text.chars().any(|character| {
2314        matches!(
2315            character,
2316            'ℓ' | 'λ'
2317                | 'θ'
2318                | 'ρ'
2319                | 'τ'
2320                | '∆'
2321                | 'Δ'
2322                | '≤'
2323                | '≥'
2324                | '∈'
2325                | '∪'
2326                | '∑'
2327                | '∅'
2328                | '·'
2329                | '−'
2330                | '±'
2331                | '⊆'
2332                | '∼'
2333                | '≠'
2334                | '→'
2335        )
2336    }) || has_math_ellipsis_context(text)
2337        || text.contains("Fq")
2338        || text.contains(" 6 =")
2339}
2340
/// Whether `text` contains "..." used as a math ellipsis rather than as prose
/// punctuation.
///
/// Requires a literal "..." and, in addition, either a math operator or Greek
/// letter anywhere in the text, or (ignoring all whitespace) one of the
/// list-like shapes `,...,`, `),...` or `...,(`.
fn has_math_ellipsis_context(text: &str) -> bool {
    const MATH_HINTS: &[char] = &[
        '=', '+', '_', '^', '\\', '∈', '≤', '≥', '≠', 'λ', 'θ', 'ρ', 'τ',
    ];
    const LIST_SHAPES: [&str; 3] = [",...,", "),...", "...,("];

    if !text.contains("...") {
        return false;
    }
    if text.contains(MATH_HINTS) {
        return true;
    }
    let compact: String = text.split_whitespace().collect();
    LIST_SHAPES.iter().any(|shape| compact.contains(*shape))
}
2357
/// Rewrites math symbols and a few extraction artifacts as LaTeX-style
/// commands.
///
/// Multi-character sequences are handled first, in this order: `· · ·` ->
/// `\cdots`, `...` -> `\ldots`, the artifact `6 =` -> `\neq`, and `Fq` ->
/// `\mathbb{F}_q`. Then individual symbols (Greek letters, relations,
/// operators) are mapped one character at a time; U+2212 becomes ASCII `-` and
/// the control character U+0003 becomes `\Lambda`.
///
/// The `6 =` rewrite is skipped when the `6` is the tail of a number (preceded
/// by a digit, or by a decimal point that follows a digit). Otherwise an
/// ordinary equation such as "16 = 4 × 4" would turn into "1\neq 4 \times 4".
fn replace_math_symbols(text: &str) -> String {
    let ellipses = text.replace("· · ·", r"\cdots").replace("...", r"\ldots");
    let collapsed = replace_negated_equals_artifact(&ellipses).replace("Fq", r"\mathbb{F}_q");
    let mut output = String::with_capacity(collapsed.len());

    for character in collapsed.chars() {
        match character {
            '\u{3}' => output.push_str(r"\Lambda"),
            'Γ' => output.push_str(r"\Gamma"),
            'Θ' => output.push_str(r"\Theta"),
            'ℓ' => output.push_str(r"\ell"),
            'λ' => output.push_str(r"\lambda"),
            'Λ' => output.push_str(r"\Lambda"),
            'Π' => output.push_str(r"\Pi"),
            'Σ' => output.push_str(r"\Sigma"),
            'Φ' => output.push_str(r"\Phi"),
            'Ω' => output.push_str(r"\Omega"),
            'θ' => output.push_str(r"\theta"),
            'ρ' => output.push_str(r"\rho"),
            'τ' => output.push_str(r"\tau"),
            '∆' | 'Δ' => output.push_str(r"\Delta"),
            '≤' => output.push_str(r"\leq"),
            '≥' => output.push_str(r"\geq"),
            '∈' => output.push_str(r"\in"),
            '∪' => output.push_str(r"\cup"),
            '∑' => output.push_str(r"\sum"),
            '∅' => output.push_str(r"\varnothing"),
            '−' => output.push('-'),
            '±' => output.push_str(r"\pm"),
            '⊆' => output.push_str(r"\subseteq"),
            '∼' => output.push_str(r"\sim"),
            '≠' => output.push_str(r"\neq"),
            '×' => output.push_str(r"\times"),
            '→' => output.push_str(r"\to"),
            '·' => output.push_str(r"\cdot"),
            _ => output.push(character),
        }
    }

    output
}

/// Replaces the extraction artifact `6 =` with `\neq`, leaving occurrences
/// where the `6` ends a number ("16 =", "1.6 =") untouched.
fn replace_negated_equals_artifact(text: &str) -> String {
    const ARTIFACT: &str = "6 =";

    let mut output = String::with_capacity(text.len());
    let mut copied_up_to = 0;
    for (start, _) in text.match_indices(ARTIFACT) {
        let mut before = text[..start].chars().rev();
        let ends_a_number = match before.next() {
            Some(glyph) if glyph.is_ascii_digit() => true,
            Some('.') => before.next().is_some_and(|glyph| glyph.is_ascii_digit()),
            _ => false,
        };
        if ends_a_number {
            continue;
        }
        output.push_str(&text[copied_up_to..start]);
        output.push_str(r"\neq");
        copied_up_to = start + ARTIFACT.len();
    }
    output.push_str(&text[copied_up_to..]);
    output
}
2402
2403fn strip_pdf_control_glyphs(text: &str) -> String {
2404    let mut sanitized = String::with_capacity(text.len());
2405    let mut last_was_space = false;
2406
2407    for character in text.chars() {
2408        if is_nonprinting_pdf_control(character) {
2409            if !last_was_space {
2410                sanitized.push(' ');
2411                last_was_space = true;
2412            }
2413            continue;
2414        }
2415
2416        sanitized.push(character);
2417        last_was_space = character.is_whitespace();
2418    }
2419
2420    sanitized.split_whitespace().collect::<Vec<_>>().join(" ")
2421}
2422
/// Whether `character` is a control character other than newline, carriage
/// return and tab.
fn is_nonprinting_pdf_control(character: char) -> bool {
    const LAYOUT_CONTROLS: [char; 3] = ['\n', '\r', '\t'];
    character.is_control() && !LAYOUT_CONTROLS.contains(&character)
}
2426
2427fn repair_math_subscript_spacing(text: &str) -> String {
2428    let tokens = text.split_whitespace().collect::<Vec<_>>();
2429    let mut repaired = Vec::with_capacity(tokens.len());
2430    let mut index = 0;
2431
2432    while index < tokens.len() {
2433        let token = tokens[index];
2434        if is_math_base_token(token) && index + 1 < tokens.len() {
2435            if tokens[index + 1].starts_with('_') {
2436                repaired.push(format!("{}{}", token, tokens[index + 1]));
2437                index += 2;
2438                continue;
2439            }
2440            if let Some((subscript, suffix)) = split_math_subscript_token(tokens[index + 1]) {
2441                repaired.push(format!(
2442                    "{}{}{}",
2443                    token,
2444                    format_math_subscript(subscript),
2445                    suffix
2446                ));
2447                index += 2;
2448                continue;
2449            }
2450        }
2451
2452        repaired.push(repair_compact_math_subscript(token));
2453        index += 1;
2454    }
2455
2456    repaired.join(" ")
2457}
2458
2459fn repair_compact_math_subscript(token: &str) -> String {
2460    if token.chars().count() > 2 && token.chars().all(|character| character.is_alphabetic()) {
2461        return token.to_owned();
2462    }
2463
2464    for base in ["m", "n", "N", "T", "V", "C", "x", "t", "i", "k", "h", "g"] {
2465        if let Some(rest) = token.strip_prefix(base) {
2466            if rest.is_empty() || rest.starts_with('_') {
2467                continue;
2468            }
2469            if let Some((subscript, suffix)) = split_math_subscript_token(rest) {
2470                return format!("{}{}{}", base, format_math_subscript(subscript), suffix);
2471            }
2472        }
2473    }
2474
2475    for base in [r"\lambda", r"\theta", r"\rho"] {
2476        if let Some(rest) = token.strip_prefix(base) {
2477            if rest.is_empty() || rest.starts_with('_') {
2478                continue;
2479            }
2480            if let Some((subscript, suffix)) = split_math_subscript_token(rest) {
2481                return format!("{}{}{}", base, format_math_subscript(subscript), suffix);
2482            }
2483        }
2484    }
2485
2486    token.to_owned()
2487}
2488
/// Whether `token` is exactly one of the symbols treated as a subscript base:
/// a fixed set of single letters, or `\lambda`, `\theta`, `\rho`.
fn is_math_base_token(token: &str) -> bool {
    const BASES: [&str; 15] = [
        "m", "n", "N", "T", "V", "C", "x", "t", "i", "k", "h", "g", r"\lambda", r"\theta",
        r"\rho",
    ];
    BASES.contains(&token)
}
2508
/// Splits `token` into a leading subscript and the remaining suffix.
///
/// The subscript is recognized as the first match of:
/// 1. one of the commands `\ell`, `\lambda`, `\theta`, `\rho`, or the words
///    `init`, `cl`;
/// 2. a run of one or more ASCII digits;
/// 3. a single index letter out of `i j k l n r s`.
/// Returns `None` when the token starts with none of these (including when it
/// is empty).
fn split_math_subscript_token(token: &str) -> Option<(&str, &str)> {
    const NAMED_SUBSCRIPTS: [&str; 6] = [r"\ell", r"\lambda", r"\theta", r"\rho", "init", "cl"];

    for name in NAMED_SUBSCRIPTS {
        if let Some(suffix) = token.strip_prefix(name) {
            return Some((name, suffix));
        }
    }

    // Byte offset of the first non-digit; the whole token if it is all digits.
    let digits_end = token
        .find(|glyph: char| !glyph.is_ascii_digit())
        .unwrap_or(token.len());
    if digits_end > 0 {
        return Some(token.split_at(digits_end));
    }

    let first = token.chars().next()?;
    if matches!(first, 'i' | 'j' | 'k' | 'l' | 'n' | 'r' | 's') {
        Some(token.split_at(first.len_utf8()))
    } else {
        None
    }
}
2541
/// Render a subscript as LaTeX. The word `init` becomes an upright
/// `_{\text{init}}`; anything else is emitted as `_` followed by the
/// subscript verbatim (no braces are added).
fn format_math_subscript(subscript: &str) -> String {
    if subscript == "init" {
        return String::from(r"_{\text{init}}");
    }
    let mut rendered = String::with_capacity(subscript.len() + 1);
    rendered.push('_');
    rendered.push_str(subscript);
    rendered
}
2548
/// Whether `token` is a lone ASCII letter, optionally followed by a single
/// ASCII hyphen-minus (`"a"` or `"a-"`).
fn is_letter_fragment(token: &str) -> bool {
    let mut rest = token.chars();
    let Some(letter) = rest.next() else {
        return false;
    };
    if !letter.is_ascii_alphabetic() {
        return false;
    }
    // After the letter: either nothing, or exactly one '-'.
    matches!((rest.next(), rest.next()), (None, _) | (Some('-'), None))
}
2554
/// Whether `token` contains at least one alphabetic character (Unicode-aware).
fn is_word_piece(token: &str) -> bool {
    token.contains(char::is_alphabetic)
}
2558
/// Whether `token` is exactly one apostrophe: the ASCII `'` or the
/// typographic right single quote.
fn is_joining_apostrophe(token: &str) -> bool {
    ["'", "’"].contains(&token)
}
2562
/// Whether `token` is exactly one of the three hyphen characters accepted as a
/// joiner (the ASCII hyphen-minus and two typographic hyphens).
fn is_joining_hyphen(token: &str) -> bool {
    ["-", "‐", "‑"].contains(&token)
}
2566
2567fn detect_table(
2568    page_number: usize,
2569    lines: &[TextLine],
2570    edges: &[GraphicEdge],
2571) -> Option<DetectedTable> {
2572    detect_ruled_grid_table(page_number, lines, edges)
2573        .or_else(|| detect_exact_run_table(page_number, lines))
2574        .or_else(|| detect_columnar_numeric_table(page_number, lines))
2575        .or_else(|| detect_implied_alignment_table(page_number, lines))
2576}
2577
2578/// Detect a table by anchoring on the columns themselves rather than on a run of
2579/// identically-shaped rows. Numeric cells across the page are clustered by their
2580/// right edge into stable columns (numbers are right-aligned), then *every* line
2581/// in the table's vertical span is assigned to those columns — so section headers
2582/// and subtotals ("Operating activities:", "Cash generated by operating
2583/// activities") become full-width label rows instead of breaking the table apart.
2584/// This is what lets a whole multi-section financial statement extract as one
2585/// table. Entirely geometric and document-agnostic — no financial-specific rules.
2586fn detect_columnar_numeric_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
2587    let line_cells: Vec<Vec<TextRun>> = lines
2588        .iter()
2589        .map(|line| coalesce_currency_prefixes(implied_table_cells(line)))
2590        .collect();
2591
2592    // Right edges of value cells (figures *and* dash placeholders), from lines that
2593    // already look like data rows (>= 2 value cells), so prose with an incidental
2594    // figure does not vote. Counting "—" placeholders lets a sparse column — common
2595    // in wide segment/equity tables where most rows are blank — still be detected.
2596    let mut right_edges: Vec<f32> = Vec::new();
2597    let mut data_rows = 0usize;
2598    for cells in &line_cells {
2599        // A prose sentence near the table (a caption like "The following table shows
2600        // … for 2024, 2023 and 2022 …") carries figures but is not a data row;
2601        // letting it vote scatters phantom columns. Skip lines with a many-word cell.
2602        if cells_contain_prose(cells) {
2603            continue;
2604        }
2605        let values = cells.iter().filter(|cell| is_value_cell(&cell.text)).count();
2606        if values >= 2 {
2607            data_rows += 1;
2608            for cell in cells.iter().filter(|cell| is_value_cell(&cell.text)) {
2609                right_edges.push(cell.bbox.x + cell.bbox.width);
2610            }
2611        }
2612    }
2613    if data_rows < 4 {
2614        return None;
2615    }
2616
2617    let min_support = ((data_rows as f32) * 0.35).ceil().max(3.0) as usize;
2618    let all_clusters = cluster_column_right_edges_with_support(&right_edges, 8.0);
2619    let mut columns: Vec<f32> = all_clusters
2620        .iter()
2621        .filter(|(_, support)| *support >= min_support)
2622        .map(|(position, _)| *position)
2623        .collect();
2624    // Recover sparse-but-periodic sub-columns (paired Shares/Amount, fair-value
2625    // Level 1/2/3) that the support vote drops; a no-op for plain N-year tables.
2626    columns.extend(rescue_periodic_subcolumns(
2627        &all_clusters,
2628        &columns,
2629        min_support,
2630        data_rows,
2631    ));
2632    columns.sort_by(f32::total_cmp);
2633    if columns.len() < 2 {
2634        return None;
2635    }
2636    // Boundary between the label column and the first numeric column. Sit it well
2637    // left of the figures (a couple of cell widths, but no further left than half
2638    // way to the next column) so a wide right-aligned header date counts as a
2639    // column entry while a left-anchored row label does not.
2640    let cell_width = column_cell_width(&line_cells, columns[0]);
2641    let half_gap = columns
2642        .get(1)
2643        .map_or(cell_width * 2.5, |next| (next - columns[0]) / 2.0);
2644    let first_column_left = columns[0] - (cell_width * 2.5).min(half_gap.max(cell_width * 1.5));
2645    let table_right = columns.last().copied().unwrap_or_default();
2646
2647    // Lines whose cells land on the detected columns are the table's rows.
2648    let aligned: Vec<usize> = (0..lines.len())
2649        .filter(|&index| {
2650            line_cells[index]
2651                .iter()
2652                .filter(|cell| is_value_cell(&cell.text))
2653                .any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some())
2654        })
2655        .collect();
2656    let (first, last) = (*aligned.first()?, *aligned.last()?);
2657
2658    // Walk the span; keep contiguous table rows (data rows + interleaved label-only
2659    // rows) and stop at a clear break — a non-aligned numeric line (a different
2660    // table) or a large vertical gap.
2661    let mut row_indices: Vec<usize> = Vec::new();
2662    let mut previous_y: Option<f32> = None;
2663    for index in first..=last {
2664        let line = &lines[index];
2665        let cells = &line_cells[index];
2666        let aligned_here = cells
2667            .iter()
2668                .filter(|cell| is_value_cell(&cell.text))
2669                .any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some());
2670        let numeric_here = cells.iter().any(|cell| is_numeric_value(&cell.text));
2671        let label_only = !numeric_here && line.bbox.x <= table_right;
2672        if !aligned_here && !label_only {
2673            break;
2674        }
2675        if let Some(prev) = previous_y {
2676            if (prev - line.bbox.y).abs() > average_run_size(line).max(line.bbox.height) * 3.5 {
2677                break;
2678            }
2679        }
2680        row_indices.push(index);
2681        previous_y = Some(line.bbox.y);
2682    }
2683    let aligned_in_span = row_indices
2684        .iter()
2685        .filter(|&&index| {
2686            line_cells[index]
2687                .iter()
2688                .filter(|cell| is_value_cell(&cell.text))
2689                .any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some())
2690        })
2691        .count();
2692    if aligned_in_span < 4 {
2693        return None;
2694    }
2695
2696    build_columnar_table(page_number, lines, &line_cells, &columns, first_column_left, &row_indices)
2697}
2698
2699/// Merge a lone currency symbol cell into the figure that follows it. Financial
2700/// statements left-align the `$` at the column edge and right-align the number, so
2701/// the splitter sees two cells ("$", "30,737"); rejoined the `$` belongs to the
2702/// number on its right ("$30,737"), not the column on its left.
2703fn coalesce_currency_prefixes(cells: Vec<TextRun>) -> Vec<TextRun> {
2704    const SYMBOLS: [char; 4] = ['$', '€', '£', '¥'];
2705    let mut out: Vec<TextRun> = Vec::with_capacity(cells.len());
2706    let mut pending: Option<TextRun> = None;
2707    for mut cell in cells {
2708        let mut text = cell.text.trim().to_string();
2709        if let Some(prefix) = pending.take() {
2710            cell.bbox = union_boxes([prefix.bbox, cell.bbox]).unwrap_or(cell.bbox);
2711            text = format!("{}{}", prefix.text.trim(), text);
2712        }
2713        // A lone symbol carries to the next figure (left-aligned column `$`).
2714        if text.chars().count() == 1 && text.chars().all(|c| SYMBOLS.contains(&c)) {
2715            cell.text = text;
2716            pending = Some(cell);
2717            continue;
2718        }
2719        // A trailing symbol belongs to the *next* column's figure: the splitter
2720        // groups each column's `$` with the preceding number ("30,737 $").
2721        if let Some(last) = text.chars().last() {
2722            if SYMBOLS.contains(&last) {
2723                let stripped = text[..text.len() - last.len_utf8()].trim_end();
2724                if !stripped.is_empty() {
2725                    let mut carry = cell.clone();
2726                    carry.text = last.to_string();
2727                    text = stripped.to_string();
2728                    pending = Some(carry);
2729                }
2730            }
2731        }
2732        cell.text = text;
2733        out.push(cell);
2734    }
2735    if let Some(prefix) = pending {
2736        out.push(prefix);
2737    }
2738    out
2739}
2740
/// Whether `text` is a figure: after trimming it holds at least one ASCII
/// digit and otherwise only `$ ( ) , . % - +`, spaces, or an em/en dash.
/// Any other character (a letter, a footnote glyph, a tab) disqualifies it, so
/// prose never passes.
fn is_numeric_value(text: &str) -> bool {
    const DECORATION: [char; 11] = [
        '$', '(', ')', ',', '.', '%', '-', '+', ' ', '\u{2014}', '\u{2013}',
    ];
    let figure = text.trim();
    let only_figure_chars = figure
        .chars()
        .all(|character| character.is_ascii_digit() || DECORATION.contains(&character));
    only_figure_chars && figure.chars().any(|character| character.is_ascii_digit())
}
2759
2760/// A cell that occupies a value column — a figure or a dash placeholder ("—",
2761/// the financial "zero/none"). Used for column detection so a column that is
2762/// mostly blank still registers.
2763fn is_value_cell(text: &str) -> bool {
2764    is_numeric_value(text) || matches!(text.trim(), "—" | "–")
2765}
2766
2767/// Whether any cell on the line is a prose sentence (a long run of words) rather
2768/// than a label or a figure. Table captions and intro sentences sit near tables
2769/// and carry years/figures, but must not vote for columns or join the header.
2770fn cells_contain_prose(cells: &[TextRun]) -> bool {
2771    // A real data row — even one with a long wrapped label ("Effect of exchange
2772    // rate changes on cash and cash equivalents and restricted cash") — carries
2773    // its figures in two or more *separate* aligned value cells. A prose caption
2774    // ("The following table presents … for 2024, 2023 …") keeps its numbers inline
2775    // in one many-word cell, so after splitting it has at most one value cell.
2776    // Only the latter is prose; never drop a multi-figure data row as a caption.
2777    if cells.iter().filter(|cell| is_value_cell(&cell.text)).count() >= 2 {
2778        return false;
2779    }
2780    cells.iter().any(|cell| {
2781        cell.text
2782            .split_whitespace()
2783            .filter(|word| word.chars().any(|c| c.is_alphabetic()))
2784            .count()
2785            > 12
2786    })
2787}
2788
/// Group right-edge positions into clusters and report each cluster's support.
///
/// Values are sorted, then split wherever two neighbours are more than `tol`
/// apart. Each cluster is returned as `(representative, count)`, left to right,
/// where the representative is the element at index `len / 2` of the sorted
/// cluster (the upper median). Callers can then keep well-supported columns and
/// selectively rescue sparse ones instead of applying one hard threshold.
fn cluster_column_right_edges_with_support(values: &[f32], tol: f32) -> Vec<(f32, usize)> {
    let mut ordered = values.to_vec();
    ordered.sort_by(f32::total_cmp);

    let mut clusters: Vec<(f32, usize)> = Vec::new();
    let mut begin = 0usize;
    while begin < ordered.len() {
        // Grow the cluster while the next gap is not strictly greater than `tol`.
        // Written as a negated `>` so NaN gaps behave exactly like a non-split.
        let mut end = begin + 1;
        while end < ordered.len() && !(ordered[end] - ordered[end - 1] > tol) {
            end += 1;
        }
        let members = &ordered[begin..end];
        clusters.push((members[members.len() / 2], members.len()));
        begin = end;
    }
    clusters
}
2809
/// Bring back sparse sub-columns that the support vote rejected, but only when
/// they repeat periodically across the column groups.
///
/// The group pitch is the (upper) median gap between the `kept` columns.
/// Clusters with support in `floor..min_support` — `floor` being 15% of
/// `data_rows`, at least 3 — that lie within one pitch of the kept range are
/// candidates. Candidates are bucketed by their offset within the pitch
/// (cyclically, within 8 units); a bucket is rescued when it has two or more
/// members spread over two or more distinct groups. One-off clusters are
/// therefore never revived.
///
/// Returns an empty vector when fewer than two columns were kept, when `floor`
/// is not below `min_support`, or when the pitch is not positive.
fn rescue_periodic_subcolumns(
    all_clusters: &[(f32, usize)],
    kept: &[f32],
    min_support: usize,
    data_rows: usize,
) -> Vec<f32> {
    if kept.len() < 2 {
        return Vec::new();
    }
    let floor = ((data_rows as f32) * 0.15).ceil().max(3.0) as usize;
    if floor >= min_support {
        return Vec::new();
    }

    let mut gaps: Vec<f32> = kept.windows(2).map(|pair| pair[1] - pair[0]).collect();
    gaps.sort_by(f32::total_cmp);
    let pitch = gaps[gaps.len() / 2];
    if pitch <= 0.0 {
        return Vec::new();
    }
    let origin = kept[0];
    let rightmost = kept[kept.len() - 1];

    // Sparse-but-not-noise clusters sitting inside (or one pitch around) the grid.
    let sparse: Vec<f32> = all_clusters
        .iter()
        .filter_map(|&(position, support)| {
            let sparse_support = support >= floor && support < min_support;
            let inside_grid = position >= origin - pitch && position <= rightmost + pitch;
            (sparse_support && inside_grid).then_some(position)
        })
        .collect();

    // Offset of each candidate within its pitch, and the group it falls in.
    let residues: Vec<f32> = sparse
        .iter()
        .map(|position| ((position - origin) % pitch + pitch) % pitch)
        .collect();
    let group_ids: Vec<i32> = sparse
        .iter()
        .map(|position| ((position - origin) / pitch).round() as i32)
        .collect();

    let mut claimed = vec![false; sparse.len()];
    let mut rescued = Vec::new();
    for lead in 0..sparse.len() {
        if claimed[lead] {
            continue;
        }
        // The lead plus every later, still-unclaimed candidate whose residue is
        // cyclically within 8 units of the lead's.
        let followers = ((lead + 1)..sparse.len()).filter(|&other| {
            if claimed[other] {
                return false;
            }
            let delta = (residues[lead] - residues[other]).abs();
            delta.min(pitch - delta) <= 8.0
        });
        let members: Vec<usize> = std::iter::once(lead).chain(followers).collect();
        let distinct_groups: std::collections::HashSet<i32> =
            members.iter().map(|&member| group_ids[member]).collect();
        if members.len() >= 2 && distinct_groups.len() >= 2 {
            for member in members {
                claimed[member] = true;
                rescued.push(sparse[member]);
            }
        }
    }
    rescued
}
2884
/// Index of the column whose right edge is closest to `right_edge`, provided it
/// is no more than 14 units away; `None` when no column is that close. On an
/// exact tie the earlier column wins.
fn nearest_column(right_edge: f32, columns: &[f32]) -> Option<usize> {
    const TOLERANCE: f32 = 14.0;
    let mut best: Option<(usize, f32)> = None;
    for (index, &edge) in columns.iter().enumerate() {
        let distance = (right_edge - edge).abs();
        // Negated `<=` so a NaN distance is rejected, never accepted.
        if !(distance <= TOLERANCE) {
            continue;
        }
        let closer = match best {
            Some((_, closest)) => distance.total_cmp(&closest).is_lt(),
            None => true,
        };
        if closer {
            best = Some((index, distance));
        }
    }
    best.map(|(index, _)| index)
}
2895
2896/// Typical width of the cells feeding the first column, used to place the
2897/// label/number boundary just left of that column.
2898fn column_cell_width(line_cells: &[Vec<TextRun>], first_column: f32) -> f32 {
2899    let widths: Vec<f32> = line_cells
2900        .iter()
2901        .flat_map(|cells| cells.iter())
2902        .filter(|cell| is_numeric_value(&cell.text))
2903        .filter(|cell| ((cell.bbox.x + cell.bbox.width) - first_column).abs() <= 14.0)
2904        .map(|cell| cell.bbox.width)
2905        .collect();
2906    if widths.is_empty() {
2907        return 40.0;
2908    }
2909    let mut sorted = widths.clone();
2910    sorted.sort_by(f32::total_cmp);
2911    sorted[sorted.len() / 2].max(20.0)
2912}
2913
2914/// Label-only continuation lines directly above a data row that wrap its label —
2915/// a long row label that overflowed onto the previous line(s). A continuation sits
2916/// at the same left indent, carries no figures, and does not end in ":" (which
2917/// marks a section header, not a wrap). Returned top-to-bottom so the text can
2918/// prefix the row's own label.
2919fn wrapped_label_above(
2920    lines: &[TextLine],
2921    line_cells: &[Vec<TextRun>],
2922    row_index: usize,
2923    first_column_left: f32,
2924    used: &[usize],
2925) -> Vec<usize> {
2926    let label_x = lines[row_index].bbox.x;
2927    let line_height = average_run_size(&lines[row_index]).max(lines[row_index].bbox.height);
2928    let mut result: Vec<usize> = Vec::new();
2929    let mut current_y = lines[row_index].bbox.y;
2930    loop {
2931        let above = (0..lines.len())
2932            .filter(|&index| {
2933                index != row_index
2934                    && !used.contains(&index)
2935                    && !result.contains(&index)
2936                    && lines[index].bbox.y > current_y
2937            })
2938            .min_by(|&left, &right| lines[left].bbox.y.total_cmp(&lines[right].bbox.y));
2939        let Some(above) = above else { break };
2940        let line = &lines[above];
2941        let text = text_line_plain_text(line);
2942        // A wrapped label line: vertically adjacent, roughly the same indent
2943        // (continuations are often hanging-indented), no figures, no trailing ":",
2944        // and — crucially — long. A label wraps because it ran the width of the
2945        // label column, which distinguishes it from a short section header like
2946        // "Assets" or a one-word heading.
2947        let long_enough = text.chars().count() >= 28
2948            || line.bbox.x + line.bbox.width >= first_column_left - 12.0;
2949        // An all-caps line is a section heading ("CASH FLOWS FROM FINANCING
2950        // ACTIVITIES"), not a wrapped sentence fragment, even when it is long.
2951        let all_caps_heading = text.chars().any(char::is_alphabetic)
2952            && text.chars().filter(|c| c.is_alphabetic()).all(char::is_uppercase);
2953        if line.bbox.y - current_y > line_height * 1.8
2954            || (line.bbox.x - label_x).abs() > 16.0
2955            || !long_enough
2956            || all_caps_heading
2957            || text.trim().is_empty()
2958            || text.trim_end().ends_with(':')
2959            || line_cells[above].iter().any(|cell| is_numeric_value(&cell.text))
2960        {
2961            break;
2962        }
2963        result.push(above);
2964        current_y = line.bbox.y;
2965    }
2966    result.reverse();
2967    result
2968}
2969
/// Whether a row's figure columns are all four-digit years (e.g.
/// "2025 2024 2023").
///
/// Such a row is a period header — column titles, not values — so it belongs
/// in the header even when it also carries a label like "Year Ended …".
///
/// `row[0]` is the label column and is ignored. The remaining cells are
/// trimmed and blank ones skipped; the row qualifies when at least one cell
/// remains and every remaining cell is exactly four ASCII digits forming a year
/// in `1900..=2100`. A row with no figure cells at all — including an empty
/// slice, which previously panicked on `row[1..]` — is not a period header.
fn is_period_header_row(row: &[String]) -> bool {
    // `skip(1)` instead of `row[1..]`: slicing past the end of an empty row panics.
    let mut figures = row
        .iter()
        .skip(1)
        .map(|cell| cell.trim())
        .filter(|cell| !cell.is_empty())
        .peekable();
    figures.peek().is_some()
        && figures.all(|cell| {
            cell.len() == 4
                && cell.chars().all(|c| c.is_ascii_digit())
                && cell
                    .parse::<i32>()
                    .is_ok_and(|year| (1900..=2100).contains(&year))
        })
}
2986
/// Assemble a `DetectedTable` from the lines in `row_indices`, laid out on the
/// numeric `columns` (right-edge positions) plus one leading label column.
///
/// Steps, in order:
/// 1. Collect header lines: nearby lines above the span that put text over the
///    numeric columns, plus leading span rows with an empty label or all-year
///    figures.
/// 2. For short-label figure rows, pull a wrapped label from the line(s) above.
/// 3. Emit body rows, skipping lines consumed as wraps and prose captions.
/// 4. Accept the result only for multi-section or wide tables (see the gate
///    below); otherwise return `None` so other detectors handle the page.
///
/// Returns `None` when `row_indices` is empty, no body rows remain, the gate
/// rejects the table, or `union_boxes` yields no bounding box. On success the
/// table carries an uncalibrated confidence of 0.7 and `line_indices` lists
/// every line claimed (rows, header lines, and consumed wraps, minus skipped
/// prose), sorted and de-duplicated.
fn build_columnar_table(
    page_number: usize,
    lines: &[TextLine],
    line_cells: &[Vec<TextRun>],
    columns: &[f32],
    first_column_left: f32,
    row_indices: &[usize],
) -> Option<DetectedTable> {
    let column_count = columns.len() + 1; // label column + one per numeric column
    // Distribute one line's cells over the table columns; several cells landing
    // in the same column are joined with a space.
    let assign_row = |index: usize| -> Vec<String> {
        let mut row = vec![String::new(); column_count];
        for cell in &line_cells[index] {
            let column = assign_cell_column(cell, columns, first_column_left);
            push_table_cell_text(&mut row[column], &cell.text);
        }
        row
    };

    // The header is everything above the first *labelled* row: period/column titles
    // sitting over the numeric columns (lines above the span) plus any leading rows
    // whose label column is empty (a bare "2024 2023 2022" year row). The first row
    // carrying label-column text begins the body.
    let span_top_y = lines[*row_indices.first()?].bbox.y;
    let mut header_indices: Vec<usize> = (0..lines.len())
        .filter(|&index| {
            let line = &lines[index];
            !row_indices.contains(&index)
                // PDF space is y-up: a larger y is higher on the page.
                && line.bbox.y > span_top_y
                // No more than five line heights above the top row.
                && line.bbox.y - span_top_y
                    <= average_run_size(line).max(line.bbox.height) * 5.0
                // Must reach (almost) as far right as the first numeric column.
                && line.bbox.x + line.bbox.width >= first_column_left - 24.0
                && !text_line_plain_text(line).to_ascii_lowercase().starts_with("table ")
                && !line_is_data_row(line, column_count)
                && !cells_contain_prose(&line_cells[index])
                // A real column header sits *over the numeric columns*; a line whose
                // content all falls in the label column is a statement title or a
                // "(in millions)" note centered above the table, not a header.
                && assign_row(index)[1..].iter().any(|cell| !cell.trim().is_empty())
        })
        .collect();

    // Position within `row_indices` where the body starts. If every row turns out
    // to be a header row this ends up equal to `row_indices.len()`.
    let mut data_start = 0usize;
    for (position, &index) in row_indices.iter().enumerate() {
        let row = assign_row(index);
        // A leading row is part of the header when its label column is empty (a bare
        // "2024 2023 2022" line) or its figure cells are all years/periods (a
        // "Year Ended June 30, | 2025 | 2024 | 2023" line) — the body begins at the
        // first row carrying real figures.
        if row[0].trim().is_empty() || is_period_header_row(&row) {
            header_indices.push(index);
            data_start = position + 1;
        } else {
            data_start = position;
            break;
        }
    }
    // Descending y = top-to-bottom reading order.
    header_indices.sort_by(|left, right| lines[*right].bbox.y.total_cmp(&lines[*left].bbox.y));

    // Merge all header lines into one header row, column by column.
    let mut header_cells: Vec<String> = vec![String::new(); column_count];
    for &index in &header_indices {
        for (column, text) in assign_row(index).into_iter().enumerate() {
            push_table_cell_text(&mut header_cells[column], &text);
        }
    }
    let header_has_text = header_cells.iter().any(|cell| !cell.is_empty());

    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut cell_records: Vec<TableCell> = Vec::new();
    if header_has_text {
        // The header occupies table row 0; body rows are shifted down by one.
        for (column, text) in header_cells.iter().enumerate() {
            cell_records.push(table_cell(0, column, text.clone(), true));
        }
    }

    // Pull a wrapped label up into the data row it belongs to: a long row label
    // can overflow onto the previous line, leaving the figure row with only the
    // label's tail ("balances" instead of "Cash …, beginning balances").
    let mut consumed: Vec<usize> = Vec::new();
    let mut prefixes: Vec<(usize, String)> = Vec::new();
    for &index in &row_indices[data_start..] {
        // Only rows that carry a figure can own a wrapped label.
        if !line_cells[index].iter().any(|cell| is_numeric_value(&cell.text)) {
            continue;
        }
        // Only a *short tail* row pulls a wrap up: "balances", "equivalents". A row
        // that already carries a full label ("Net earnings", "Additions to …") is a
        // section's own item, and the long line above it is that section's heading,
        // not a wrap — merging there would corrupt the table.
        if assign_row(index)[0].trim().chars().count() > 11 {
            continue;
        }
        // Header lines and lines already consumed by an earlier row are off limits.
        let mut search_used = header_indices.clone();
        search_used.extend_from_slice(&consumed);
        let chain = wrapped_label_above(lines, line_cells, index, first_column_left, &search_used);
        if !chain.is_empty() {
            let prefix = chain
                .iter()
                .map(|&line| text_line_plain_text(&lines[line]))
                .collect::<Vec<_>>()
                .join(" ");
            prefixes.push((index, prefix));
            consumed.extend(chain);
        }
    }

    let mut prose_skipped: Vec<usize> = Vec::new();
    for &index in &row_indices[data_start..] {
        // A line merged into another row's label is not a row of its own.
        if consumed.contains(&index) {
            continue;
        }
        // A prose caption that landed inside the table span is not a row; drop it
        // here and let it render as its own paragraph rather than a stray table row.
        if cells_contain_prose(&line_cells[index]) {
            prose_skipped.push(index);
            continue;
        }
        let mut row = assign_row(index);
        if let Some((_, prefix)) = prefixes.iter().find(|(line, _)| *line == index) {
            row[0] = if row[0].trim().is_empty() {
                prefix.clone()
            } else {
                format!("{prefix} {}", row[0])
            };
        }
        if row.iter().all(|cell| cell.is_empty()) {
            continue;
        }
        let table_row = rows.len() + usize::from(header_has_text);
        for (column, text) in row.iter().enumerate() {
            cell_records.push(table_cell(table_row, column, text.clone(), false));
        }
        rows.push(row);
    }
    if rows.is_empty() {
        return None;
    }

    // Row statistics for the acceptance gate. Note that `value_rows` counts rows
    // with a non-empty *label* (whether or not they carry figures),
    // `label_only_rows` counts labelled rows with no figures at all, and
    // `data_with_figures` counts rows with text in any numeric column.
    let value_rows = rows.iter().filter(|row| !row[0].trim().is_empty()).count();
    let label_only_rows = rows
        .iter()
        .filter(|row| !row[0].trim().is_empty() && row[1..].iter().all(|cell| cell.trim().is_empty()))
        .count();
    let data_with_figures = rows
        .iter()
        .filter(|row| row[1..].iter().any(|cell| !cell.trim().is_empty()))
        .count();
    // Take over from the simpler detectors only where this method earns its keep.
    // Two cases qualify: a multi-section statement (section-header rows interleaved
    // with data, which fragments the other detectors), or a genuinely wide table
    // (>= 5 numeric columns — segment, equity, geography breakdowns) that the
    // exact/implied detectors cannot assemble at all. A small uniform grid is left
    // to those detectors so we do not merely re-shape what they already get right.
    let multi_section = label_only_rows >= 2 && value_rows >= 8;
    let wide_table = columns.len() >= 5 && value_rows >= 6;
    if data_with_figures < 6 || !(multi_section || wide_table) {
        return None;
    }

    let mut line_index_set: Vec<usize> = row_indices.to_vec();
    line_index_set.extend(header_indices.iter().copied());
    line_index_set.extend(consumed.iter().copied());
    // Prose captions dropped from the body stay out of the table's claimed lines so
    // they are emitted as their own text blocks.
    line_index_set.retain(|index| !prose_skipped.contains(index));
    line_index_set.sort_unstable();
    line_index_set.dedup();
    // The table's bounding box is the union of every claimed line.
    let bbox = union_boxes(line_index_set.iter().map(|&index| lines[index].bbox))?;

    Some(DetectedTable {
        table: TableBlock {
            headers: if header_has_text {
                header_cells
            } else {
                Vec::new()
            },
            rows,
            caption: None,
            bbox: Some(bbox),
            cells: cell_records,
            source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
            confidence: Some(Confidence {
                score: 0.7,
                calibrated: false,
            }), ..Default::default()
        },
        line_indices: line_index_set,
    })
}
3179
3180/// Column a cell belongs to (0 = label, 1..=N = numeric columns). Right-aligned
3181/// figures match a column by their right edge; a header title or a centered/narrow
3182/// year that no right edge matches falls to the column band its center sits in;
3183/// a non-numeric cell that *starts* in the label region (a row label, however long)
3184/// stays in column 0.
3185fn assign_cell_column(cell: &TextRun, columns: &[f32], first_column_left: f32) -> usize {
3186    if is_numeric_value(&cell.text) {
3187        if let Some(column) = nearest_column(cell.bbox.x + cell.bbox.width, columns) {
3188            return column + 1;
3189        }
3190    }
3191    // A left-anchored row label, however long, keeps its center well left of the
3192    // columns, so the band naturally returns 0 for it; a header title or year
3193    // centered over a column lands on that column.
3194    column_band(cell, columns, first_column_left)
3195}
3196
3197/// Numeric column (1..=N) whose horizontal band contains the cell's center, or 0
3198/// when the center is left of the first column. Band boundaries are the midpoints
3199/// between adjacent column right edges.
3200fn column_band(cell: &TextRun, columns: &[f32], first_column_left: f32) -> usize {
3201    let center = cell.bbox.x + cell.bbox.width / 2.0;
3202    if center < first_column_left {
3203        return 0;
3204    }
3205    for index in 0..columns.len() {
3206        let upper = columns
3207            .get(index + 1)
3208            .map_or(f32::INFINITY, |next| (columns[index] + next) / 2.0);
3209        if center <= upper {
3210            return index + 1;
3211        }
3212    }
3213    columns.len()
3214}
3215
/// Append `text` (trimmed) to a table cell, separating it from any existing
/// content with a single space. Blank `text` leaves the cell untouched.
fn push_table_cell_text(target: &mut String, text: &str) {
    match (target.is_empty(), text.trim()) {
        (_, "") => {}
        (true, piece) => target.push_str(piece),
        (false, piece) => {
            target.push(' ');
            target.push_str(piece);
        }
    }
}
3226
3227fn table_cell(row: usize, column: usize, text: String, is_header: bool) -> TableCell {
3228    TableCell {
3229        row,
3230        column,
3231        text,
3232        bbox: None,
3233        is_header,
3234        col_span: 1,
3235        row_span: 1,
3236    }
3237}
3238
3239/// Order a cell's runs top-to-bottom by text line (PDF space is y-up, so the
3240/// visually-top line has the larger baseline), then left-to-right — so a cell
3241/// holding several wrapped lines reads in order rather than interleaving glyphs.
3242fn sort_runs_reading_order(runs: &mut [TextRun]) {
3243    runs.sort_by(|a, b| {
3244        let line_a = (a.baseline_y / 3.0).round();
3245        let line_b = (b.baseline_y / 3.0).round();
3246        line_b
3247            .total_cmp(&line_a)
3248            .then(a.bbox.x.total_cmp(&b.bbox.x))
3249    });
3250}
3251
3252/// Whether a grid row is really a prose paragraph (a note between data rows)
3253/// rather than a row of discrete cells. Prose leaves one long cell or, when
3254/// sliced by the columns, spreads non-numeric text across many of them.
3255fn row_is_prose(cells: &[String]) -> bool {
3256    let word_counts: Vec<usize> = cells.iter().map(|c| c.split_whitespace().count()).collect();
3257    if word_counts.iter().copied().max().unwrap_or(0) >= 12 {
3258        return true;
3259    }
3260    let nonempty = cells.iter().filter(|c| !c.trim().is_empty()).count();
3261    let total_words: usize = word_counts.iter().sum();
3262    let numeric = cells.iter().filter(|c| is_value_cell(c)).count();
3263    nonempty >= 5 && total_words >= 25 && (numeric as f32) < nonempty as f32 * 0.3
3264}
3265
3266fn detect_ruled_grid_table(
3267    page_number: usize,
3268    lines: &[TextLine],
3269    edges: &[GraphicEdge],
3270) -> Option<DetectedTable> {
3271    let verticals = grid_axis_values(edges, EdgeOrientation::Vertical);
3272    let horizontals = grid_axis_values(edges, EdgeOrientation::Horizontal);
3273    if verticals.len() < 2 || horizontals.len() < 2 {
3274        return None;
3275    }
3276
3277    let columns = verticals.len() - 1;
3278    let rows = horizontals.len() - 1;
3279    if columns < 2 || rows < 2 {
3280        return None;
3281    }
3282    if !has_nearby_ruled_table_label(lines, &verticals, &horizontals)
3283        && !has_multirow_ruled_grid_evidence(columns, rows)
3284    {
3285        return None;
3286    }
3287
3288    // Collect the runs that fall in each grid cell, then assemble the cell text
3289    // with the gap-aware joiner. Appending run text glyph-by-glyph (the old path)
3290    // inserted a space between every run, which on a per-glyph PDF rendered
3291    // "P r o d u c t i v i t y" — the same letter-spacing the prose path avoids.
3292    let mut grid_runs: Vec<Vec<Vec<TextRun>>> = vec![vec![Vec::new(); columns]; rows];
3293    let mut cell_boxes = vec![vec![None; columns]; rows];
3294    let mut line_indices = Vec::new();
3295
3296    for (line_index, line) in lines.iter().enumerate() {
3297        let mut used_line = false;
3298        for run in &line.runs {
3299            let center_x = run.bbox.x + run.bbox.width / 2.0;
3300            let center_y = run.bbox.y + run.bbox.height / 2.0;
3301            let Some(column) = grid_column_for(center_x, &verticals) else {
3302                continue;
3303            };
3304            let Some(row) = grid_row_for(center_y, &horizontals) else {
3305                continue;
3306            };
3307            grid_runs[row][column].push(run.clone());
3308            cell_boxes[row][column] = Some(
3309                cell_boxes[row][column]
3310                    .and_then(|bbox| union_boxes([bbox, run.bbox]))
3311                    .unwrap_or(run.bbox),
3312            );
3313            used_line = true;
3314        }
3315        if used_line {
3316            line_indices.push(line_index);
3317        }
3318    }
3319
3320    let mut grid = vec![vec![String::new(); columns]; rows];
3321    let mut prose_rows = vec![false; rows];
3322    for row in 0..rows {
3323        let mut cell_texts = vec![String::new(); columns];
3324        for column in 0..columns {
3325            if grid_runs[row][column].is_empty() {
3326                continue;
3327            }
3328            let mut runs = grid_runs[row][column].clone();
3329            sort_runs_reading_order(&mut runs);
3330            cell_texts[column] = clean_pdf_line_text(&join_runs_spaced(&runs));
3331        }
3332        // A row that is really a prose paragraph (a note set between data rows)
3333        // gets sliced across the columns into scattered fragments. Detect it and
3334        // merge the whole row — re-assembled in reading order — into one
3335        // full-width cell instead of shredding the sentence.
3336        if row_is_prose(&cell_texts) {
3337            prose_rows[row] = true;
3338            let mut all: Vec<TextRun> = grid_runs[row].iter().flatten().cloned().collect();
3339            sort_runs_reading_order(&mut all);
3340            grid[row][0] = clean_pdf_line_text(&join_runs_spaced(&all));
3341        } else {
3342            grid[row] = cell_texts;
3343        }
3344    }
3345
3346    if grid
3347        .iter()
3348        .flatten()
3349        .filter(|text| !text.trim().is_empty())
3350        .count()
3351        < 3
3352    {
3353        return None;
3354    }
3355
3356    let headers = grid[0].clone();
3357    let body_rows = grid.iter().skip(1).cloned().collect::<Vec<_>>();
3358    if headers.iter().all(|text| text.trim().is_empty())
3359        || body_rows
3360            .iter()
3361            .flatten()
3362            .all(|text| text.trim().is_empty())
3363    {
3364        return None;
3365    }
3366
3367    // Merged cells: a cell whose content overruns a ruled column boundary into an
3368    // empty neighbour band spans it. The grid text stays rectangular so renderers
3369    // are unchanged; only `cells` carries the span topology.
3370    let (mut col_span, mut covered) = merged_cell_col_spans(&cell_boxes, &verticals);
3371    // A merged prose row occupies one full-width spanning cell.
3372    for row in 0..rows {
3373        if prose_rows[row] {
3374            covered[row][0] = false;
3375            col_span[row][0] = columns;
3376            for column in 1..columns {
3377                covered[row][column] = true;
3378            }
3379        }
3380    }
3381
3382    let mut cells = Vec::new();
3383    for row in 0..rows {
3384        for column in 0..columns {
3385            if covered[row][column] {
3386                continue;
3387            }
3388            cells.push(TableCell {
3389                row,
3390                column,
3391                text: grid[row][column].clone(),
3392                bbox: cell_boxes[row][column],
3393                is_header: row == 0,
3394                col_span: col_span[row][column],
3395                row_span: 1,
3396            });
3397        }
3398    }
3399
3400    let bbox = BBox {
3401        x: *verticals.first()?,
3402        y: *horizontals.first()?,
3403        width: *verticals.last()? - *verticals.first()?,
3404        height: *horizontals.last()? - *horizontals.first()?,
3405    };
3406
3407    Some(DetectedTable {
3408        table: TableBlock {
3409            headers,
3410            rows: body_rows,
3411            caption: None,
3412            bbox: Some(bbox),
3413            cells,
3414            source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
3415            confidence: Some(Confidence {
3416                score: 0.7,
3417                calibrated: false,
3418            }), ..Default::default()
3419        },
3420        line_indices,
3421    })
3422}
3423
3424/// Detect horizontally merged cells (column spans) in a ruled grid.
3425///
3426/// A non-empty cell whose content bbox overruns its ruled column boundary into
3427/// an adjacent *empty* band (by more than `SPAN_MARGIN`) is treated as spanning
3428/// it — the natural signature of a grouped column header, whose label is
3429/// physically wider than one column. Returns the per-cell `col_span` grid plus a
3430/// `covered` mask of the spanned-over continuation positions, which the caller
3431/// omits from `cells`.
3432///
3433/// Spans are scanned rightward from the anchoring cell, so a centred merged
3434/// header must lean into its left band (the common case). Row spans are not
3435/// inferred here: a vertically merged cell is usually a single line centred in a
3436/// tall region whose bbox does not overflow the row rule, so it needs
3437/// rule-segment analysis rather than content overflow.
3438fn merged_cell_col_spans(
3439    cell_boxes: &[Vec<Option<BBox>>],
3440    verticals: &[f32],
3441) -> (Vec<Vec<usize>>, Vec<Vec<bool>>) {
3442    const SPAN_MARGIN: f32 = 2.0;
3443    let rows = cell_boxes.len();
3444    let columns = cell_boxes.first().map_or(0, Vec::len);
3445    let mut col_span = vec![vec![1usize; columns]; rows];
3446    let mut covered = vec![vec![false; columns]; rows];
3447
3448    for row in 0..rows {
3449        for column in 0..columns {
3450            if covered[row][column] {
3451                continue;
3452            }
3453            let Some(bbox) = cell_boxes[row][column] else {
3454                continue;
3455            };
3456
3457            let content_right = bbox.x + bbox.width;
3458            let mut next_column = column + 1;
3459            while next_column < columns
3460                && cell_boxes[row][next_column].is_none()
3461                && !covered[row][next_column]
3462                && verticals
3463                    .get(next_column)
3464                    .is_some_and(|edge| content_right > edge + SPAN_MARGIN)
3465            {
3466                covered[row][next_column] = true;
3467                next_column += 1;
3468            }
3469            col_span[row][column] = next_column - column;
3470        }
3471    }
3472
3473    (col_span, covered)
3474}
3475
3476fn has_nearby_ruled_table_label(
3477    lines: &[TextLine],
3478    verticals: &[f32],
3479    horizontals: &[f32],
3480) -> bool {
3481    let Some(left) = verticals.first().copied() else {
3482        return false;
3483    };
3484    let Some(right) = verticals.last().copied() else {
3485        return false;
3486    };
3487    let Some(top) = horizontals.last().copied() else {
3488        return false;
3489    };
3490
3491    lines.iter().any(|line| {
3492        let text = text_line_plain_text(line).to_ascii_lowercase();
3493        text.starts_with("table")
3494            && line.bbox.y >= top
3495            && line.bbox.y <= top + 96.0
3496            && line.bbox.x <= right + 24.0
3497            && line.bbox.x + line.bbox.width >= left - 24.0
3498    })
3499}
3500
/// Whether a ruled grid is large enough to be accepted as a table without a
/// caption: at least 2 columns and at least 4 rows.
fn has_multirow_ruled_grid_evidence(columns: usize, rows: usize) -> bool {
    const MIN_COLUMNS: usize = 2;
    const MIN_ROWS: usize = 4;
    !(columns < MIN_COLUMNS || rows < MIN_ROWS)
}
3504
/// Which family of ruling lines `grid_axis_values` should extract.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EdgeOrientation {
    /// Select horizontal edges; each contributes its mean y coordinate.
    Horizontal,
    /// Select vertical edges; each contributes its mean x coordinate.
    Vertical,
}
3510
3511fn grid_axis_values(edges: &[GraphicEdge], orientation: EdgeOrientation) -> Vec<f32> {
3512    let mut values = edges
3513        .iter()
3514        .filter_map(|edge| match orientation {
3515            EdgeOrientation::Horizontal if is_horizontal_edge(edge) => {
3516                Some((edge.y0 + edge.y1) / 2.0)
3517            }
3518            EdgeOrientation::Vertical if is_vertical_edge(edge) => Some((edge.x0 + edge.x1) / 2.0),
3519            _ => None,
3520        })
3521        .collect::<Vec<_>>();
3522    values.sort_by(f32::total_cmp);
3523    dedup_axis_values(values, 2.0)
3524}
3525
3526fn is_horizontal_edge(edge: &GraphicEdge) -> bool {
3527    (edge.y0 - edge.y1).abs() <= 1.0 && (edge.x0 - edge.x1).abs() >= 12.0
3528}
3529
3530fn is_vertical_edge(edge: &GraphicEdge) -> bool {
3531    (edge.x0 - edge.x1).abs() <= 1.0 && (edge.y0 - edge.y1).abs() >= 12.0
3532}
3533
/// Collapse near-duplicate positions in a list the caller has already sorted.
///
/// Values are visited in order. A value within `tolerance` of the most recently
/// kept entry is folded into it by replacing that entry with the average of the
/// two; any other value starts a new entry. Because the kept entry moves as
/// values are folded in, each comparison is against the running merged value.
fn dedup_axis_values(values: Vec<f32>, tolerance: f32) -> Vec<f32> {
    let mut merged: Vec<f32> = Vec::new();
    for value in values {
        let absorbed = merged.last_mut().is_some_and(|last| {
            let close = (value - *last).abs() <= tolerance;
            if close {
                *last = (*last + value) / 2.0;
            }
            close
        });
        if !absorbed {
            merged.push(value);
        }
    }
    merged
}
3547
/// Index of the first column band containing `x`, or `None` if there is none.
///
/// Band `i` lies between `verticals[i]` and `verticals[i + 1]`, widened by 1
/// unit on each side, so a point sitting on a shared rule resolves to the
/// earlier band.
fn grid_column_for(x: f32, verticals: &[f32]) -> Option<usize> {
    verticals
        .iter()
        .zip(verticals.iter().skip(1))
        .position(|(&left, &right)| left - 1.0 <= x && x <= right + 1.0)
}
3553
/// Row index for the band containing `y`, numbered from the band with the
/// largest coordinates (row 0) down to the band with the smallest.
///
/// Band `i` lies between `horizontals[i]` and `horizontals[i + 1]`, widened by
/// 1 unit on each side; the first matching band wins. Returns `None` when `y`
/// falls in no band.
fn grid_row_for(y: f32, horizontals: &[f32]) -> Option<usize> {
    let band = horizontals
        .iter()
        .zip(horizontals.iter().skip(1))
        .position(|(&lower, &upper)| lower - 1.0 <= y && y <= upper + 1.0)?;
    // Flip the band index so the last band becomes row 0.
    let last_band = horizontals.len().saturating_sub(2);
    Some(last_band.saturating_sub(band))
}
3560
3561
3562fn detect_exact_run_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
3563    let candidate_lines = lines
3564        .iter()
3565        .enumerate()
3566        .filter(|(_, line)| line.runs.len() >= 2)
3567        .collect::<Vec<_>>();
3568    if candidate_lines.len() < 2 {
3569        return None;
3570    }
3571
3572    let width = candidate_lines[0].1.runs.len();
3573    if !candidate_lines.iter().all(|(_, line)| {
3574        line.runs.len() == width && columns_align(&candidate_lines[0].1.runs, &line.runs)
3575    }) {
3576        return None;
3577    }
3578    if !has_table_evidence(&candidate_lines) {
3579        return None;
3580    }
3581
3582    let headers = candidate_lines[0]
3583        .1
3584        .runs
3585        .iter()
3586        .map(|run| run.text.trim().to_owned())
3587        .collect::<Vec<_>>();
3588    let rows = candidate_lines
3589        .iter()
3590        .skip(1)
3591        .map(|(_, line)| {
3592            line.runs
3593                .iter()
3594                .map(|run| run.text.trim().to_owned())
3595                .collect::<Vec<_>>()
3596        })
3597        .collect::<Vec<_>>();
3598    let bbox = union_boxes(candidate_lines.iter().map(|(_, line)| line.bbox))?;
3599    let mut cells = Vec::new();
3600
3601    for (row_index, (_, line)) in candidate_lines.iter().enumerate() {
3602        for (column_index, run) in line.runs.iter().enumerate() {
3603            cells.push(TableCell {
3604                row: row_index,
3605                column: column_index,
3606                text: run.text.clone(),
3607                bbox: Some(run.bbox),
3608                is_header: row_index == 0,
3609                col_span: 1,
3610                row_span: 1,
3611            });
3612        }
3613    }
3614
3615    Some(DetectedTable {
3616        table: TableBlock {
3617            headers,
3618            rows,
3619            caption: None,
3620            bbox: Some(bbox),
3621            cells,
3622            source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
3623            confidence: Some(Confidence {
3624                score: 0.72,
3625                calibrated: false,
3626            }), ..Default::default()
3627        },
3628        line_indices: candidate_lines
3629            .iter()
3630            .map(|(line_index, _)| *line_index)
3631            .collect(),
3632    })
3633}
3634
3635fn detect_implied_alignment_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
3636    let row_candidates = lines
3637        .iter()
3638        .enumerate()
3639        .filter_map(|(line_index, line)| {
3640            let cells = implied_table_cells(line);
3641            (cells.len() >= 3 && row_has_numeric_table_evidence(&cells))
3642                .then_some(TableRowCandidate { line_index, cells })
3643        })
3644        .collect::<Vec<_>>();
3645    let group = best_aligned_table_row_group(&row_candidates)?;
3646    // A nearby "Table N" caption confirms an implied table, but most real tables
3647    // (financial statements, schedules) have no such caption. Accept those when the
3648    // aligned group is strong enough on its own — many rows of consistently aligned
3649    // numeric columns — mirroring the ruled-grid detector's multi-row evidence path.
3650    if !has_nearby_table_label(lines, &group) && !has_strong_numeric_table_evidence(&group) {
3651        return None;
3652    }
3653    build_implied_alignment_table(page_number, lines, &group)
3654}
3655
3656/// Whether an aligned row group is, by itself, strong evidence of a table: at
3657/// least four rows of three or more columns where most rows carry numeric values
3658/// in their non-label cells. Deliberately conservative so prose with incidental
3659/// numbers is not promoted to a table.
3660fn has_strong_numeric_table_evidence(rows: &[TableRowCandidate]) -> bool {
3661    let columns = rows.first().map_or(0, |row| row.cells.len());
3662    if rows.len() < 4 || columns < 3 {
3663        return false;
3664    }
3665    let numeric_rows = rows
3666        .iter()
3667        .filter(|row| row_has_numeric_table_evidence(&row.cells))
3668        .count();
3669    numeric_rows * 4 >= rows.len() * 3
3670}
3671
3672fn has_nearby_table_label(lines: &[TextLine], rows: &[TableRowCandidate]) -> bool {
3673    let Some(first_row) = rows.first() else {
3674        return false;
3675    };
3676    let first_y = first_row
3677        .cells
3678        .iter()
3679        .map(|cell| cell.bbox.y)
3680        .reduce(f32::max)
3681        .unwrap_or_default();
3682    let table_left = first_row
3683        .cells
3684        .iter()
3685        .map(|cell| cell.bbox.x)
3686        .reduce(f32::min)
3687        .unwrap_or_default();
3688    let table_right = first_row
3689        .cells
3690        .iter()
3691        .map(|cell| cell.bbox.x + cell.bbox.width)
3692        .reduce(f32::max)
3693        .unwrap_or_default();
3694
3695    lines.iter().any(|line| {
3696        let text = text_line_plain_text(line).to_ascii_lowercase();
3697        text.starts_with("table")
3698            && line.bbox.y >= first_y
3699            && line.bbox.y <= first_y + 96.0
3700            && line.bbox.x <= table_right + 24.0
3701            && line.bbox.x + line.bbox.width >= table_left - 24.0
3702    })
3703}
3704
3705fn implied_table_cells(line: &TextLine) -> Vec<TextRun> {
3706    if line.runs.len() < 2 {
3707        return line.runs.clone();
3708    }
3709
3710    let mut runs = line.runs.clone();
3711    runs.sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
3712    let threshold = implied_cell_gap_threshold(line);
3713    let mut groups: Vec<Vec<TextRun>> = Vec::new();
3714    let mut current: Vec<TextRun> = Vec::new();
3715
3716    for run in runs {
3717        if let Some(previous) = current.last() {
3718            let gap = run.bbox.x - (previous.bbox.x + previous.bbox.width);
3719            // A `$` is a column-leading currency marker: a financial statement's
3720            // total rows print each value column as a flush-left `$` with a
3721            // right-aligned number, so the gap from the previous column's number to
3722            // this `$` is small and would otherwise merge two columns into one cell
3723            // (`$286,004 $—`) — a row of merged cells then fails to align to the
3724            // detected columns and drops out as loose numbers. Force a cell boundary
3725            // before any `$`-led run that follows a genuine preceding column.
3726            let starts_currency = run.text.trim_start().starts_with('$');
3727            // …unless the previous run is a lone marker this `$` completes: `$` +
3728            // `30,737` is one value, and `(` + `$11,829)` is one negative value
3729            // `($11,829)` — don't strand the opening paren in the previous cell.
3730            let previous_attaches_currency = matches!(previous.text.trim(), "$" | "(" | "($");
3731            if gap >= threshold || (starts_currency && !previous_attaches_currency) {
3732                groups.push(std::mem::take(&mut current));
3733            }
3734        }
3735        current.push(run);
3736    }
3737    if !current.is_empty() {
3738        groups.push(current);
3739    }
3740
3741    groups
3742        .into_iter()
3743        .filter_map(|runs| text_run_from_cell_runs(&runs))
3744        .collect()
3745}
3746
3747fn implied_cell_gap_threshold(line: &TextLine) -> f32 {
3748    let height = average_run_size(line).max(line.bbox.height);
3749    (height * 1.5).clamp(10.0, 18.0)
3750}
3751
3752fn text_run_from_cell_runs(runs: &[TextRun]) -> Option<TextRun> {
3753    let bbox = union_boxes(runs.iter().map(|run| run.bbox))?;
3754    let text = clean_pdf_line_text(&join_runs_spaced(runs));
3755    if text.is_empty() {
3756        return None;
3757    }
3758
3759    Some(TextRun {
3760        text,
3761        bbox,
3762        baseline_y: runs.iter().map(|run| run.baseline_y).sum::<f32>() / runs.len() as f32,
3763        font: runs.iter().find_map(|run| run.font.clone()),
3764        size: runs.iter().map(|run| run.size).sum::<f32>() / runs.len() as f32,
3765        space_width: runs.iter().map(|run| run.space_width).fold(0.0, f32::max),
3766        bold: !runs.is_empty() && runs.iter().all(|run| run.bold),
3767        italic: !runs.is_empty() && runs.iter().all(|run| run.italic),
3768        source_object_ids: source_ids_for_runs(runs),
3769    })
3770}
3771
3772fn row_has_numeric_table_evidence(cells: &[TextRun]) -> bool {
3773    cells.iter().skip(1).any(|cell| {
3774        cell.text
3775            .chars()
3776            .any(|character| character.is_ascii_digit())
3777    })
3778}
3779
3780fn best_aligned_table_row_group(rows: &[TableRowCandidate]) -> Option<Vec<TableRowCandidate>> {
3781    let mut best: Option<Vec<TableRowCandidate>> = None;
3782    let mut current: Vec<TableRowCandidate> = Vec::new();
3783
3784    for row in rows {
3785        if current.is_empty() {
3786            current.push(row.clone());
3787            continue;
3788        }
3789
3790        let compatible = current
3791            .first()
3792            .is_some_and(|first| table_rows_align(first, row))
3793            && current
3794                .last()
3795                .is_some_and(|previous| table_row_vertical_gap(previous, row) <= 28.0);
3796        if compatible {
3797            current.push(row.clone());
3798        } else {
3799            record_table_row_group(&mut best, &current);
3800            current.clear();
3801            current.push(row.clone());
3802        }
3803    }
3804    record_table_row_group(&mut best, &current);
3805    best
3806}
3807
3808fn record_table_row_group(
3809    best: &mut Option<Vec<TableRowCandidate>>,
3810    candidate: &[TableRowCandidate],
3811) {
3812    if candidate.len() < 2 {
3813        return;
3814    }
3815    let Some(width) = candidate.first().map(|row| row.cells.len()) else {
3816        return;
3817    };
3818    if width < 3 {
3819        return;
3820    }
3821    let score = candidate.len() * width;
3822    let best_score = best
3823        .as_ref()
3824        .and_then(|rows| rows.first().map(|row| rows.len() * row.cells.len()))
3825        .unwrap_or_default();
3826    if score > best_score {
3827        *best = Some(candidate.to_vec());
3828    }
3829}
3830
3831fn table_rows_align(first: &TableRowCandidate, next: &TableRowCandidate) -> bool {
3832    first.cells.len() == next.cells.len()
3833        && first
3834            .cells
3835            .iter()
3836            .zip(&next.cells)
3837            .all(|(left, right)| cells_column_aligned(left, right))
3838}
3839
3840/// Two cells share a column when their left edges line up (left-aligned text) or
3841/// their right edges line up (right-aligned numeric columns — the norm in
3842/// financial statements, where the left edge slides with the number's width).
3843fn cells_column_aligned(left: &TextRun, right: &TextRun) -> bool {
3844    let left_edge = (left.bbox.x - right.bbox.x).abs() <= 14.0;
3845    let right_edge =
3846        ((left.bbox.x + left.bbox.width) - (right.bbox.x + right.bbox.width)).abs() <= 14.0;
3847    left_edge || right_edge
3848}
3849
3850fn table_row_vertical_gap(previous: &TableRowCandidate, next: &TableRowCandidate) -> f32 {
3851    let previous_y = previous
3852        .cells
3853        .iter()
3854        .map(|cell| cell.bbox.y)
3855        .reduce(f32::max)
3856        .unwrap_or_default();
3857    let next_y = next
3858        .cells
3859        .iter()
3860        .map(|cell| cell.bbox.y)
3861        .reduce(f32::max)
3862        .unwrap_or_default();
3863    (previous_y - next_y).abs()
3864}
3865
3866fn build_implied_alignment_table(
3867    page_number: usize,
3868    lines: &[TextLine],
3869    rows: &[TableRowCandidate],
3870) -> Option<DetectedTable> {
3871    let columns = rows.first()?.cells.len();
3872    let bbox = union_boxes(
3873        rows.iter()
3874            .flat_map(|row| row.cells.iter().map(|cell| cell.bbox)),
3875    )?;
3876    let header = implied_table_header(lines, rows, columns);
3877    let has_explicit_header = header.has_text();
3878    let mut line_indices = rows.iter().map(|row| row.line_index).collect::<Vec<_>>();
3879    line_indices.extend(header.line_indices.iter().copied());
3880    line_indices.sort_unstable();
3881    line_indices.dedup();
3882
3883    let (headers, body_rows, header_cells) = if has_explicit_header {
3884        (
3885            header
3886                .cells
3887                .iter()
3888                .map(|cell| {
3889                    cell.as_ref()
3890                        .map(|cell| cell.text.clone())
3891                        .unwrap_or_default()
3892                })
3893                .collect::<Vec<_>>(),
3894            rows.iter()
3895                .map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
3896                .collect::<Vec<Vec<_>>>(),
3897            header.cells,
3898        )
3899    } else {
3900        (
3901            rows.first()?
3902                .cells
3903                .iter()
3904                .map(|cell| cell.text.clone())
3905                .collect::<Vec<_>>(),
3906            rows.iter()
3907                .skip(1)
3908                .map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
3909                .collect::<Vec<Vec<_>>>(),
3910            rows.first()?.cells.iter().cloned().map(Some).collect(),
3911        )
3912    };
3913
3914    let mut cells = Vec::new();
3915    for (column, cell) in header_cells.into_iter().enumerate() {
3916        let text = headers.get(column).cloned().unwrap_or_default();
3917        cells.push(TableCell {
3918            row: 0,
3919            column,
3920            text,
3921            bbox: cell.map(|cell| cell.bbox),
3922            is_header: true,
3923            col_span: 1,
3924            row_span: 1,
3925        });
3926    }
3927    for (row_index, row) in rows.iter().enumerate() {
3928        let table_row = if has_explicit_header {
3929            row_index + 1
3930        } else {
3931            row_index
3932        };
3933        if !has_explicit_header && row_index == 0 {
3934            continue;
3935        }
3936        for (column, cell) in row.cells.iter().enumerate() {
3937            cells.push(TableCell {
3938                row: table_row,
3939                column,
3940                text: cell.text.clone(),
3941                bbox: Some(cell.bbox),
3942                is_header: false,
3943                col_span: 1,
3944                row_span: 1,
3945            });
3946        }
3947    }
3948
3949    Some(DetectedTable {
3950        table: TableBlock {
3951            headers,
3952            rows: body_rows,
3953            caption: None,
3954            bbox: Some(bbox),
3955            cells,
3956            source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
3957            confidence: Some(Confidence {
3958                score: 0.68,
3959                calibrated: false,
3960            }), ..Default::default()
3961        },
3962        line_indices,
3963    })
3964}
3965
3966#[derive(Debug, Clone)]
3967struct ImpliedTableHeader {
3968    cells: Vec<Option<TextRun>>,
3969    line_indices: Vec<usize>,
3970}
3971
3972impl ImpliedTableHeader {
3973    fn has_text(&self) -> bool {
3974        self.cells
3975            .iter()
3976            .any(|cell| cell.as_ref().is_some_and(|cell| !cell.text.is_empty()))
3977    }
3978}
3979
3980fn implied_table_header(
3981    lines: &[TextLine],
3982    rows: &[TableRowCandidate],
3983    columns: usize,
3984) -> ImpliedTableHeader {
3985    let mut header = ImpliedTableHeader {
3986        cells: vec![None; columns],
3987        line_indices: Vec::new(),
3988    };
3989    let Some(first_row) = rows.first() else {
3990        return header;
3991    };
3992    let first_y = first_row
3993        .cells
3994        .iter()
3995        .map(|cell| cell.bbox.y)
3996        .reduce(f32::max)
3997        .unwrap_or_default();
3998    let table_left = first_row
3999        .cells
4000        .iter()
4001        .map(|cell| cell.bbox.x)
4002        .reduce(f32::min)
4003        .unwrap_or_default();
4004    let table_right = first_row
4005        .cells
4006        .iter()
4007        .map(|cell| cell.bbox.x + cell.bbox.width)
4008        .reduce(f32::max)
4009        .unwrap_or_default();
4010    let column_refs = first_row
4011        .cells
4012        .iter()
4013        .map(|cell| (cell.bbox.x, cell.bbox.x + cell.bbox.width))
4014        .collect::<Vec<_>>();
4015
4016    let mut candidates = lines
4017        .iter()
4018        .enumerate()
4019        .filter(|(line_index, line)| {
4020            !rows.iter().any(|row| row.line_index == *line_index)
4021                && line.bbox.y > first_y
4022                && line.bbox.y <= first_y + 80.0
4023                && line.bbox.x <= table_right + 12.0
4024                && line.bbox.x + line.bbox.width >= table_left - 12.0
4025                && !text_line_plain_text(line)
4026                    .to_ascii_lowercase()
4027                    .starts_with("table ")
4028                // Skip lines that are themselves full data rows (a labelled row of
4029                // numeric columns, e.g. a "$"-prefixed opening balance): those
4030                // belong in the body, not merged into the column header.
4031                && !line_is_data_row(line, columns)
4032        })
4033        .collect::<Vec<_>>();
4034    candidates.sort_by(|left, right| right.1.bbox.y.total_cmp(&left.1.bbox.y));
4035
4036    for (line_index, line) in candidates {
4037        let mut used_line = false;
4038        for cell in implied_table_cells(line) {
4039            if cell.text.chars().count() > 40 {
4040                continue;
4041            }
4042            let Some(column) = nearest_table_column(&cell, &column_refs) else {
4043                continue;
4044            };
4045            append_header_cell(&mut header.cells[column], cell);
4046            used_line = true;
4047        }
4048        if used_line {
4049            header.line_indices.push(line_index);
4050        }
4051    }
4052
4053    header
4054}
4055
4056/// A line that looks like a full body row — at least as many cells as the table
4057/// has columns, with numeric values in the non-label cells. Used to keep opening
4058/// balances and similar `$`-prefixed rows out of the inferred header.
4059fn line_is_data_row(line: &TextLine, columns: usize) -> bool {
4060    let cells = implied_table_cells(line);
4061    cells.len() >= columns && row_has_numeric_table_evidence(&cells)
4062}
4063
4064/// Assign a header fragment to the column whose horizontal span it overlaps (or is
4065/// nearest in center). Center matching, rather than left-edge matching, is what
4066/// lets a left-aligned header word line up with a right-aligned numeric column.
4067fn nearest_table_column(cell: &TextRun, column_refs: &[(f32, f32)]) -> Option<usize> {
4068    let cell_center = cell.bbox.x + cell.bbox.width / 2.0;
4069    let (column, distance) = column_refs
4070        .iter()
4071        .enumerate()
4072        .map(|(index, (left, right))| {
4073            let column_center = (left + right) / 2.0;
4074            (index, (cell_center - column_center).abs())
4075        })
4076        .min_by(|left, right| left.1.total_cmp(&right.1))?;
4077    let (left, right) = column_refs[column];
4078    let tolerance = ((right - left) / 2.0 + 18.0).max(24.0);
4079    (distance <= tolerance).then_some(column)
4080}
4081
4082fn append_header_cell(target: &mut Option<TextRun>, fragment: TextRun) {
4083    if let Some(existing) = target {
4084        if !existing.text.is_empty() {
4085            existing.text.push(' ');
4086        }
4087        existing.text.push_str(&fragment.text);
4088        existing.bbox = union_boxes([existing.bbox, fragment.bbox]).unwrap_or(existing.bbox);
4089        for id in fragment.source_object_ids {
4090            if !existing.source_object_ids.contains(&id) {
4091                existing.source_object_ids.push(id);
4092            }
4093        }
4094    } else {
4095        *target = Some(fragment);
4096    }
4097}
4098
4099fn has_table_evidence(candidate_lines: &[(usize, &TextLine)]) -> bool {
4100    if candidate_lines.len() >= 3 {
4101        return true;
4102    }
4103    candidate_lines
4104        .iter()
4105        .skip(1)
4106        .flat_map(|(_, line)| line.runs.iter())
4107        .any(|run| run.text.chars().any(|character| character.is_ascii_digit()))
4108}
4109
4110fn columns_align(first: &[TextRun], next: &[TextRun]) -> bool {
4111    first
4112        .iter()
4113        .zip(next)
4114        .all(|(left, right)| (left.bbox.x - right.bbox.x).abs() <= 6.0)
4115}
4116
/// Maps a point from unrotated page space into the displayed
/// (clockwise-rotated) frame for a `/Rotate` of 90, 180 or 270
/// (ISO 32000-1 §7.7.3.3).
///
/// `width` and `height` are the unrotated page dimensions and the page origin
/// is assumed to be at (0, 0). `rotation` is normalised modulo 360, so negative
/// angles work; any value that is not a quarter turn leaves the point unchanged.
fn rotate_point(x: f32, y: f32, rotation: i32, width: f32, height: f32) -> (f32, f32) {
    let normalized = rotation.rem_euclid(360);
    if normalized == 90 {
        (y, width - x)
    } else if normalized == 180 {
        (width - x, height - y)
    } else if normalized == 270 {
        (height - y, x)
    } else {
        (x, y)
    }
}
4128
4129/// Rotate an axis-aligned bbox into the displayed frame (90/180/270 keep it
4130/// axis-aligned), recomputing width/height from the transformed corners.
4131fn rotate_bbox(bbox: BBox, rotation: i32, width: f32, height: f32) -> BBox {
4132    if rotation.rem_euclid(360) == 0 {
4133        return bbox;
4134    }
4135    let (x0, y0) = rotate_point(bbox.x, bbox.y, rotation, width, height);
4136    let (x1, y1) = rotate_point(bbox.x + bbox.width, bbox.y + bbox.height, rotation, width, height);
4137    BBox {
4138        x: x0.min(x1),
4139        y: y0.min(y1),
4140        width: (x1 - x0).abs(),
4141        height: (y1 - y0).abs(),
4142    }
4143}
4144
4145fn group_text_runs(mut runs: Vec<TextRun>) -> Vec<TextLine> {
4146    runs.sort_by(|left, right| {
4147        right
4148            .baseline_y
4149            .total_cmp(&left.baseline_y)
4150            .then(left.bbox.x.total_cmp(&right.bbox.x))
4151    });
4152
4153    let mut lines: Vec<TextLine> = Vec::new();
4154    for run in runs {
4155        // Group by text baseline, not the visual bbox top, so a smaller-font
4156        // super/subscript stays on its line even though its box (ascent/descent)
4157        // differs from the body text.
4158        if let Some(line) = lines
4159            .iter_mut()
4160            .find(|line| (line.baseline_y - run.baseline_y).abs() <= 3.0)
4161        {
4162            line.bbox = union_boxes([line.bbox, run.bbox]).unwrap_or(line.bbox);
4163            // Drift the line anchor toward the lowest baseline, matching the old
4164            // union-of-boxes behavior, so following runs match the body baseline
4165            // rather than a leading super/subscript.
4166            line.baseline_y = line.baseline_y.min(run.baseline_y);
4167            line.runs.push(run);
4168        } else {
4169            lines.push(TextLine {
4170                baseline_y: run.baseline_y,
4171                bbox: run.bbox,
4172                runs: vec![run],
4173            });
4174        }
4175    }
4176
4177    // Sort each line's runs left-to-right once at the end, instead of re-sorting
4178    // the whole line on every insert (which was O(k^2 log k) per line).
4179    for line in &mut lines {
4180        line.runs
4181            .sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
4182    }
4183
4184    lines
4185}
4186
4187fn parse_content_ops(bytes: &[u8]) -> Vec<ContentOp> {
4188    let mut parser = ContentParser::new(bytes);
4189    let mut stack = Vec::new();
4190    let mut ops = Vec::new();
4191
4192    while let Some(token) = parser.next_operand_or_operator() {
4193        match token {
4194            ContentToken::Operand(operand) => stack.push(operand),
4195            ContentToken::Operator(operator) => {
4196                ops.push(ContentOp {
4197                    operands: std::mem::take(&mut stack),
4198                    operator,
4199                });
4200            }
4201        }
4202    }
4203
4204    ops
4205}
4206
/// One lexical unit produced by [`ContentParser`]: either a value or an
/// operator keyword.
#[derive(Debug)]
enum ContentToken {
    /// A value (name, string, array, number, ...) that precedes an operator.
    Operand(Operand),
    /// A bare word read from the stream, treated as an operator.
    Operator(String),
}
4212
/// Cursor-based tokenizer over the raw bytes of a content stream.
struct ContentParser<'a> {
    /// The bytes being tokenized.
    bytes: &'a [u8],
    /// Index of the next unread byte in `bytes`.
    pos: usize,
}
4217
4218impl<'a> ContentParser<'a> {
4219    fn new(bytes: &'a [u8]) -> Self {
4220        Self { bytes, pos: 0 }
4221    }
4222
4223    fn next_operand_or_operator(&mut self) -> Option<ContentToken> {
4224        self.skip_ws_and_comments();
4225        if self.pos >= self.bytes.len() {
4226            return None;
4227        }
4228
4229        let byte = self.bytes[self.pos];
4230        match byte {
4231            b'/' => Some(ContentToken::Operand(Operand::Name(self.read_name()))),
4232            b'(' => Some(ContentToken::Operand(Operand::Literal(self.read_literal()))),
4233            b'[' => Some(ContentToken::Operand(Operand::Array(self.read_array()))),
4234            b'<' if self.peek(1) != Some(b'<') => {
4235                Some(ContentToken::Operand(Operand::Hex(self.read_hex_string())))
4236            }
4237            b'+' | b'-' | b'.' | b'0'..=b'9' => self
4238                .read_number()
4239                .map(|number| ContentToken::Operand(Operand::Number(number))),
4240            _ => {
4241                let word = self.read_word();
4242                if word.is_empty() {
4243                    self.pos += 1;
4244                    Some(ContentToken::Operand(Operand::Other))
4245                } else {
4246                    Some(ContentToken::Operator(word))
4247                }
4248            }
4249        }
4250    }
4251
4252    fn read_array(&mut self) -> Vec<Operand> {
4253        self.pos += 1;
4254        let mut items = Vec::new();
4255        loop {
4256            self.skip_ws_and_comments();
4257            if self.pos >= self.bytes.len() || self.bytes[self.pos] == b']' {
4258                self.pos = (self.pos + 1).min(self.bytes.len());
4259                break;
4260            }
4261
4262            match self.next_operand_or_operator() {
4263                Some(ContentToken::Operand(operand)) => items.push(operand),
4264                Some(ContentToken::Operator(_)) | None => {}
4265            }
4266        }
4267        items
4268    }
4269
4270    fn read_name(&mut self) -> String {
4271        self.pos += 1;
4272        let start = self.pos;
4273        while self.pos < self.bytes.len() && !is_delimiter_or_ws(self.bytes[self.pos]) {
4274            self.pos += 1;
4275        }
4276        lossy(&self.bytes[start..self.pos])
4277    }
4278
4279    fn read_literal(&mut self) -> Vec<u8> {
4280        self.pos += 1;
4281        let mut depth = 1;
4282        let mut output = Vec::new();
4283
4284        while self.pos < self.bytes.len() && depth > 0 {
4285            let byte = self.bytes[self.pos];
4286            self.pos += 1;
4287            match byte {
4288                b'\\' => {
4289                    if self.pos < self.bytes.len() {
4290                        match self.bytes[self.pos] {
4291                            b'n' => {
4292                                output.push(b'\n');
4293                                self.pos += 1;
4294                            }
4295                            b'r' => {
4296                                output.push(b'\r');
4297                                self.pos += 1;
4298                            }
4299                            b't' => {
4300                                output.push(b'\t');
4301                                self.pos += 1;
4302                            }
4303                            b'b' => {
4304                                output.push(0x08);
4305                                self.pos += 1;
4306                            }
4307                            b'f' => {
4308                                output.push(0x0c);
4309                                self.pos += 1;
4310                            }
4311                            b'\n' => {
4312                                self.pos += 1;
4313                            }
4314                            b'\r' => {
4315                                self.pos += 1;
4316                                if self.bytes.get(self.pos) == Some(&b'\n') {
4317                                    self.pos += 1;
4318                                }
4319                            }
4320                            b'0'..=b'7' => output.push(self.read_octal_escape()),
4321                            other => {
4322                                output.push(other);
4323                                self.pos += 1;
4324                            }
4325                        }
4326                    }
4327                }
4328                b'(' => {
4329                    depth += 1;
4330                    output.push(byte);
4331                }
4332                b')' => {
4333                    depth -= 1;
4334                    if depth > 0 {
4335                        output.push(byte);
4336                    }
4337                }
4338                _ => output.push(byte),
4339            }
4340        }
4341
4342        output
4343    }
4344
4345    fn read_octal_escape(&mut self) -> u8 {
4346        let mut value = 0u16;
4347        let mut digits = 0;
4348        while self.pos < self.bytes.len()
4349            && digits < 3
4350            && matches!(self.bytes[self.pos], b'0'..=b'7')
4351        {
4352            value = (value << 3) + u16::from(self.bytes[self.pos] - b'0');
4353            self.pos += 1;
4354            digits += 1;
4355        }
4356        value.min(u16::from(u8::MAX)) as u8
4357    }
4358
4359    fn read_hex_string(&mut self) -> Vec<u8> {
4360        self.pos += 1;
4361        let start = self.pos;
4362        while self.pos < self.bytes.len() && self.bytes[self.pos] != b'>' {
4363            self.pos += 1;
4364        }
4365        let raw = self.bytes[start..self.pos].to_vec();
4366        self.pos = (self.pos + 1).min(self.bytes.len());
4367        decode_hex(&raw)
4368    }
4369
4370    fn read_number(&mut self) -> Option<f32> {
4371        let start = self.pos;
4372        while self.pos < self.bytes.len()
4373            && matches!(self.bytes[self.pos], b'+' | b'-' | b'.' | b'0'..=b'9')
4374        {
4375            self.pos += 1;
4376        }
4377        std::str::from_utf8(&self.bytes[start..self.pos])
4378            .ok()
4379            .and_then(|text| text.parse().ok())
4380    }
4381
4382    fn read_word(&mut self) -> String {
4383        let start = self.pos;
4384        while self.pos < self.bytes.len() && !is_delimiter_or_ws(self.bytes[self.pos]) {
4385            self.pos += 1;
4386        }
4387        lossy(&self.bytes[start..self.pos])
4388    }
4389
4390    fn skip_ws_and_comments(&mut self) {
4391        loop {
4392            while self.pos < self.bytes.len() && is_ws(self.bytes[self.pos]) {
4393                self.pos += 1;
4394            }
4395            if self.pos < self.bytes.len() && self.bytes[self.pos] == b'%' {
4396                while self.pos < self.bytes.len() && !matches!(self.bytes[self.pos], b'\n' | b'\r')
4397                {
4398                    self.pos += 1;
4399                }
4400            } else {
4401                break;
4402            }
4403        }
4404    }
4405
4406    fn peek(&self, offset: usize) -> Option<u8> {
4407        self.bytes.get(self.pos + offset).copied()
4408    }
4409}
4410
4411fn parse_indirect_objects(bytes: &[u8]) -> Vec<PdfObject> {
4412    let mut objects = Vec::new();
4413    let mut pos = 0;
4414
4415    while pos < bytes.len() {
4416        if !is_ws_or_line_start(bytes, pos) && pos != 0 {
4417            pos += 1;
4418            continue;
4419        }
4420
4421        let Some((object_number, after_object_number)) = parse_unsigned_at(bytes, pos) else {
4422            pos += 1;
4423            continue;
4424        };
4425        let Some(after_space) = skip_required_ws(bytes, after_object_number) else {
4426            pos += 1;
4427            continue;
4428        };
4429        let Some((generation, after_generation)) = parse_unsigned_at(bytes, after_space) else {
4430            pos += 1;
4431            continue;
4432        };
4433        let Some(after_space) = skip_required_ws(bytes, after_generation) else {
4434            pos += 1;
4435            continue;
4436        };
4437        if !bytes[after_space..].starts_with(b"obj") {
4438            pos += 1;
4439            continue;
4440        }
4441
4442        let body_start = after_space + 3;
4443        if let Some(relative_end) = find_subslice(&bytes[body_start..], b"endobj") {
4444            let body_end = body_start + relative_end;
4445            objects.push(PdfObject {
4446                object_number: object_number as u32,
4447                generation: generation as u16,
4448                body: bytes[body_start..body_end].to_vec(),
4449            });
4450            pos = body_end + b"endobj".len();
4451        } else {
4452            break;
4453        }
4454    }
4455
4456    objects
4457}
4458
4459fn expand_object_streams(objects: &mut Vec<PdfObject>) {
4460    let object_streams = objects
4461        .iter()
4462        .filter(|object| {
4463            lossy(&object.body)
4464                .split_whitespace()
4465                .collect::<String>()
4466                .contains("/Type/ObjStm")
4467        })
4468        .cloned()
4469        .collect::<Vec<_>>();
4470    let existing = objects
4471        .iter()
4472        .map(|object| object.object_number)
4473        .collect::<std::collections::HashSet<_>>();
4474    let mut expanded = Vec::new();
4475
4476    for object_stream in object_streams {
4477        let object_body = lossy(&object_stream.body);
4478        let Some(count) = parse_number_after(&object_body, "/N").map(|value| value as usize) else {
4479            continue;
4480        };
4481        let Some(first) = parse_number_after(&object_body, "/First").map(|value| value as usize)
4482        else {
4483            continue;
4484        };
4485        let Ok(Some(decoded)) = decode_stream_object(&object_stream) else {
4486            continue;
4487        };
4488        if first > decoded.len() {
4489            continue;
4490        }
4491
4492        let header = lossy(&decoded[..first]);
4493        let header_numbers = header
4494            .split_whitespace()
4495            .filter_map(|part| part.parse::<usize>().ok())
4496            .collect::<Vec<_>>();
4497        let mut entries = Vec::new();
4498        for pair in header_numbers.chunks_exact(2).take(count) {
4499            entries.push((pair[0] as u32, pair[1]));
4500        }
4501
4502        for (index, (object_number, offset)) in entries.iter().enumerate() {
4503            if existing.contains(object_number) {
4504                continue;
4505            }
4506            let next_offset = entries
4507                .get(index + 1)
4508                .map(|(_, next_offset)| *next_offset)
4509                .unwrap_or(decoded.len() - first);
4510            if *offset > next_offset || first + next_offset > decoded.len() {
4511                continue;
4512            }
4513            expanded.push(PdfObject {
4514                object_number: *object_number,
4515                generation: 0,
4516                body: decoded[first + *offset..first + next_offset].to_vec(),
4517            });
4518        }
4519    }
4520
4521    objects.extend(expanded);
4522}
4523
4524fn page_seed(object: &PdfObject, object_map: &HashMap<u32, Arc<PdfObject>>) -> Option<PageSeed> {
4525    let body = lossy(&object.body);
4526    let compact = body.split_whitespace().collect::<String>();
4527    if compact.contains("/Type/Page") && !compact.contains("/Type/Pages") {
4528        Some(PageSeed {
4529            number: 0,
4530            body: body_with_inherited_page_tree_entries(&body, object_map),
4531        })
4532    } else {
4533        None
4534    }
4535}
4536
4537fn body_with_inherited_page_tree_entries(
4538    page_body: &str,
4539    object_map: &HashMap<u32, Arc<PdfObject>>,
4540) -> String {
4541    let mut body = page_body.to_owned();
4542    append_parent_page_tree_entries(page_body, object_map, &mut body, 0);
4543    body
4544}
4545
4546fn append_parent_page_tree_entries(
4547    body: &str,
4548    object_map: &HashMap<u32, Arc<PdfObject>>,
4549    output: &mut String,
4550    depth: usize,
4551) {
4552    if depth >= 16 {
4553        return;
4554    }
4555    let Some(parent_ref) = parse_direct_ref_after_key(body, "/Parent") else {
4556        return;
4557    };
4558    let Some(parent) = object_map.get(&(parent_ref as u32)) else {
4559        return;
4560    };
4561    let parent_body = lossy(&parent.body);
4562    output.push('\n');
4563    output.push_str(&parent_body);
4564    append_parent_page_tree_entries(&parent_body, object_map, output, depth + 1);
4565}
4566
4567fn decode_stream_object(object: &PdfObject) -> Result<Option<Vec<u8>>> {
4568    let Some(stream_marker) = find_subslice(&object.body, b"stream") else {
4569        return Ok(None);
4570    };
4571    let Some(end_marker) = find_subslice(&object.body, b"endstream") else {
4572        return Err(DonglerError::pdf("stream is missing endstream marker"));
4573    };
4574    if end_marker <= stream_marker {
4575        return Err(DonglerError::pdf("stream markers are malformed"));
4576    }
4577
4578    let dict = lossy(&object.body[..stream_marker]);
4579    let mut stream = object.body[stream_marker + b"stream".len()..end_marker].to_vec();
4580    trim_stream_edges(&mut stream);
4581
4582    for filter in stream_filters(&dict) {
4583        stream = decode_stream_filter(&filter, &stream)?;
4584    }
4585    Ok(Some(stream))
4586}
4587
4588fn decode_stream_filter(filter: &str, stream: &[u8]) -> Result<Vec<u8>> {
4589    match filter {
4590        "FlateDecode" | "Fl" => {
4591            let mut decoder = ZlibDecoder::new(stream);
4592            let mut decoded = Vec::new();
4593            decoder
4594                .read_to_end(&mut decoded)
4595                .map_err(|error| DonglerError::pdf(format!("FlateDecode failed: {error}")))?;
4596            Ok(decoded)
4597        }
4598        "ASCII85Decode" | "A85" => ascii85_decode(stream),
4599        other => Err(DonglerError::pdf(format!(
4600            "unsupported stream filter: {other}"
4601        ))),
4602    }
4603}
4604
4605fn stream_filters(dict: &str) -> Vec<String> {
4606    let Some(mut index) = dict.find("/Filter").map(|index| index + "/Filter".len()) else {
4607        return Vec::new();
4608    };
4609    let bytes = dict.as_bytes();
4610    skip_pdf_whitespace(bytes, &mut index);
4611    if bytes.get(index) == Some(&b'[') {
4612        index += 1;
4613        let mut filters = Vec::new();
4614        while index < bytes.len() && bytes[index] != b']' {
4615            skip_pdf_whitespace(bytes, &mut index);
4616            if bytes.get(index) == Some(&b']') {
4617                break;
4618            }
4619            if bytes.get(index) == Some(&b'/') {
4620                index += 1;
4621                let start = index;
4622                while index < bytes.len() && !is_pdf_name_delimiter(bytes[index]) {
4623                    index += 1;
4624                }
4625                if start < index {
4626                    filters.push(dict[start..index].to_owned());
4627                }
4628            } else {
4629                index += 1;
4630            }
4631        }
4632        filters
4633    } else if bytes.get(index) == Some(&b'/') {
4634        index += 1;
4635        let start = index;
4636        while index < bytes.len() && !is_pdf_name_delimiter(bytes[index]) {
4637            index += 1;
4638        }
4639        (start < index)
4640            .then(|| vec![dict[start..index].to_owned()])
4641            .unwrap_or_default()
4642    } else {
4643        Vec::new()
4644    }
4645}
4646
/// Advances `index` past any run of PDF whitespace bytes (NUL, tab, line feed,
/// form feed, carriage return, space) starting at `*index`.
///
/// `index` is left unchanged when it is already at or beyond the end of `bytes`.
fn skip_pdf_whitespace(bytes: &[u8], index: &mut usize) {
    let remaining = bytes.get(*index..).unwrap_or_default();
    let run = remaining
        .iter()
        .take_while(|byte| matches!(**byte, 0x00 | 0x09 | 0x0a | 0x0c | 0x0d | 0x20))
        .count();
    *index += run;
}
4655
/// Returns `true` for bytes that end a PDF name: the six whitespace bytes
/// (NUL, tab, line feed, form feed, carriage return, space) and the delimiter
/// characters `( ) < > [ ] { } / %`.
fn is_pdf_name_delimiter(byte: u8) -> bool {
    const TERMINATORS: &[u8] = b"\0\t\n\x0c\r ()<>[]{}/%";
    TERMINATORS.contains(&byte)
}
4677
4678fn ascii85_decode(bytes: &[u8]) -> Result<Vec<u8>> {
4679    let mut output = Vec::new();
4680    let mut group = Vec::new();
4681    let mut index = 0;
4682    while index < bytes.len() {
4683        let byte = bytes[index];
4684        match byte {
4685            b'\0' | b'\t' | b'\n' | b'\x0c' | b'\r' | b' ' => {}
4686            b'<' if bytes.get(index + 1) == Some(&b'~') => {
4687                index += 1;
4688            }
4689            b'~' if bytes.get(index + 1) == Some(&b'>') => break,
4690            b'z' if group.is_empty() => output.extend_from_slice(&[0, 0, 0, 0]),
4691            b'!'..=b'u' => {
4692                group.push(byte - b'!');
4693                if group.len() == 5 {
4694                    output.extend_from_slice(&ascii85_group_to_bytes(&group)?);
4695                    group.clear();
4696                }
4697            }
4698            _ => {
4699                return Err(DonglerError::pdf(format!(
4700                    "ASCII85Decode failed: invalid byte 0x{byte:02x}"
4701                )));
4702            }
4703        }
4704        index += 1;
4705    }
4706
4707    if !group.is_empty() {
4708        if group.len() == 1 {
4709            return Err(DonglerError::pdf(
4710                "ASCII85Decode failed: dangling single digit",
4711            ));
4712        }
4713        let output_len = group.len() - 1;
4714        while group.len() < 5 {
4715            group.push(b'u' - b'!');
4716        }
4717        output.extend_from_slice(&ascii85_group_to_bytes(&group)?[..output_len]);
4718    }
4719
4720    Ok(output)
4721}
4722
4723fn ascii85_group_to_bytes(group: &[u8]) -> Result<[u8; 4]> {
4724    let mut value = 0u64;
4725    for digit in group {
4726        value = value * 85 + u64::from(*digit);
4727    }
4728    if value > u64::from(u32::MAX) {
4729        return Err(DonglerError::pdf("ASCII85Decode failed: invalid group"));
4730    }
4731    Ok((value as u32).to_be_bytes())
4732}
4733
/// Removes the end-of-line framing around stream data taken from between the
/// `stream` and `endstream` keywords.
///
/// At most one end-of-line marker (`\r\n`, `\n` or `\r`) is removed from each
/// end. ISO 32000-1 §7.3.8.1 puts a single EOL after `stream` and a single EOL
/// before `endstream`; any further CR/LF bytes are payload. Binary data may
/// legitimately start or end with 0x0A/0x0D, so stripping more than one marker
/// would corrupt it.
fn trim_stream_edges(stream: &mut Vec<u8>) {
    let leading = if stream.starts_with(b"\r\n") {
        2
    } else if matches!(stream.first(), Some(b'\n' | b'\r')) {
        1
    } else {
        0
    };
    // A single drain shifts the buffer once, however long it is.
    stream.drain(..leading);

    let trailing = if stream.ends_with(b"\r\n") {
        2
    } else if matches!(stream.last(), Some(b'\n' | b'\r')) {
        1
    } else {
        0
    };
    stream.truncate(stream.len() - trailing);
}
4742
4743fn parse_refs_after_key(text: &str, key: &str) -> Vec<usize> {
4744    let Some(start) = text.find(key) else {
4745        return Vec::new();
4746    };
4747    let rest = &text[start + key.len()..];
4748    if let Some(array_start) = rest.find('[') {
4749        let before_array = rest[..array_start].trim();
4750        if before_array.is_empty() {
4751            if let Some(array_end) = rest[array_start..].find(']') {
4752                return parse_refs(&rest[array_start..array_start + array_end]);
4753            }
4754        }
4755    }
4756    parse_refs(rest).into_iter().take(1).collect()
4757}
4758
4759fn parse_direct_ref_after_key(text: &str, key: &str) -> Option<usize> {
4760    let start = text.find(key)?;
4761    let bytes = text.as_bytes();
4762    let mut pos = start + key.len();
4763    while pos < bytes.len() && is_ws(bytes[pos]) {
4764        pos += 1;
4765    }
4766    let (object, after_object) = parse_unsigned_at(bytes, pos)?;
4767    let after_space = skip_required_ws(bytes, after_object)?;
4768    let (_generation, after_generation) = parse_unsigned_at(bytes, after_space)?;
4769    let after_space = skip_required_ws(bytes, after_generation)?;
4770    if bytes.get(after_space) == Some(&b'R') {
4771        Some(object)
4772    } else {
4773        None
4774    }
4775}
4776
4777fn parse_resource_refs(text: &str, key: &str) -> HashMap<String, u32> {
4778    let Some(start) = text.find(key) else {
4779        return HashMap::new();
4780    };
4781    let rest = &text[start + key.len()..];
4782    let Some(dict_start) = rest.find("<<") else {
4783        return HashMap::new();
4784    };
4785    let Some(dict_end) = rest[dict_start + 2..].find(">>") else {
4786        return HashMap::new();
4787    };
4788    let dict = &rest[dict_start + 2..dict_start + 2 + dict_end];
4789    parse_named_refs(dict)
4790}
4791
4792fn resolve_resource_body(page_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> Option<String> {
4793    let resource_ref = parse_direct_ref_after_key(page_body, "/Resources")?;
4794    object_map
4795        .get(&(resource_ref as u32))
4796        .map(|object| lossy(&object.body))
4797}
4798
4799fn load_font_decoders(
4800    resource_text: &str,
4801    object_map: &HashMap<u32, Arc<PdfObject>>,
4802    font_cache: &HashMap<u32, Arc<FontDecoder>>,
4803) -> HashMap<String, Arc<FontDecoder>> {
4804    resolve_named_resource_refs(resource_text, "/Font", object_map)
4805        .into_iter()
4806        .map(|(name, object_number)| {
4807            let decoder = font_cache.get(&object_number).cloned().unwrap_or_else(|| {
4808                Arc::new(
4809                    object_map
4810                        .get(&object_number)
4811                        .map(|font| font_decoder(font.as_ref(), object_map))
4812                        .unwrap_or_default(),
4813                )
4814            });
4815            (name, decoder)
4816        })
4817        .collect()
4818}
4819
4820fn resolve_named_resource_refs(
4821    resource_text: &str,
4822    key: &str,
4823    object_map: &HashMap<u32, Arc<PdfObject>>,
4824) -> HashMap<String, u32> {
4825    let direct = parse_resource_refs(resource_text, key);
4826    if !direct.is_empty() {
4827        return direct;
4828    }
4829
4830    parse_direct_ref_after_key(resource_text, key)
4831        .and_then(|object_number| object_map.get(&(object_number as u32)))
4832        .map(|object| parse_named_refs(&lossy(&object.body)))
4833        .unwrap_or_default()
4834}
4835
4836fn font_decoder(font: &PdfObject, object_map: &HashMap<u32, Arc<PdfObject>>) -> FontDecoder {
4837    let font_body = lossy(&font.body);
4838    let encoding = font_encoding_differences(&font_body, object_map);
4839    let widths = font_widths(&font_body, &encoding);
4840    let (bold, italic) = font_style(&font_body, object_map);
4841    let (ascent, descent) = font_vertical_metrics(&font_body, object_map);
4842    let Some(to_unicode_ref) = parse_refs_after_key(&font_body, "/ToUnicode")
4843        .into_iter()
4844        .next()
4845    else {
4846        return FontDecoder {
4847            cmap: HashMap::new(),
4848            encoding,
4849            widths,
4850            max_code_len: 1,
4851            bold,
4852            italic,
4853            ascent,
4854            descent,
4855        };
4856    };
4857    let Some(to_unicode) = object_map.get(&(to_unicode_ref as u32)) else {
4858        return FontDecoder {
4859            cmap: HashMap::new(),
4860            encoding,
4861            widths,
4862            max_code_len: 1,
4863            bold,
4864            italic,
4865            ascent,
4866            descent,
4867        };
4868    };
4869    let Ok(Some(cmap_stream)) = decode_stream_object(to_unicode.as_ref()) else {
4870        return FontDecoder {
4871            cmap: HashMap::new(),
4872            encoding,
4873            widths,
4874            max_code_len: 1,
4875            bold,
4876            italic,
4877            ascent,
4878            descent,
4879        };
4880    };
4881
4882    let mut decoder = parse_to_unicode_cmap(&lossy(&cmap_stream));
4883    decoder.encoding = encoding;
4884    decoder.widths = if widths.is_empty() {
4885        cid_char_widths(&decoder.cmap, &font_cid_widths(&font_body, object_map))
4886    } else {
4887        widths
4888    };
4889    decoder.bold = bold;
4890    decoder.italic = italic;
4891    decoder.ascent = ascent;
4892    decoder.descent = descent;
4893    decoder
4894}
4895
4896/// Font ascent/descent in em units (text-space fractions of the font size),
4897/// from `/FontDescriptor` `/Ascent` and `/Descent` (glyph space, /1000). Falls
4898/// back to typical Latin metrics when the descriptor is absent.
4899fn font_vertical_metrics(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> (f32, f32) {
4900    let mut ascent = 0.75;
4901    let mut descent = -0.25;
4902    if let Some(descriptor_ref) = parse_direct_ref_after_key(font_body, "/FontDescriptor") {
4903        if let Some(object) = object_map.get(&(descriptor_ref as u32)) {
4904            let body = lossy(&object.body);
4905            if let Some(value) = parse_number_after(&body, "/Ascent") {
4906                if value != 0.0 {
4907                    ascent = value / 1000.0;
4908                }
4909            }
4910            if let Some(value) = parse_number_after(&body, "/Descent") {
4911                if value != 0.0 {
4912                    descent = value / 1000.0;
4913                }
4914            }
4915        }
4916    }
4917    (ascent, descent)
4918}
4919
4920/// Detect bold/italic for a font from its `/BaseFont` name (after stripping the
4921/// subset prefix) and, when present, its `/FontDescriptor` `/Flags` (bit 7
4922/// Italic, bit 19 ForceBold) and `/ItalicAngle`.
4923fn font_style(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> (bool, bool) {
4924    let mut bold = false;
4925    let mut italic = false;
4926    if let Some(name) = parse_name_after(font_body, "/BaseFont") {
4927        let bare = name.rsplit('+').next().unwrap_or(name.as_str()).to_ascii_lowercase();
4928        bold |= ["bold", "black", "heavy", "semibold", "demibold", "-bd", "demi"]
4929            .iter()
4930            .any(|needle| bare.contains(needle));
4931        italic |= ["italic", "oblique", "-it"]
4932            .iter()
4933            .any(|needle| bare.contains(needle));
4934    }
4935    if let Some(descriptor_ref) = parse_direct_ref_after_key(font_body, "/FontDescriptor") {
4936        if let Some(object) = object_map.get(&(descriptor_ref as u32)) {
4937            let body = lossy(&object.body);
4938            if let Some(flags) = parse_number_after(&body, "/Flags") {
4939                let flags = flags as i64;
4940                italic |= flags & 64 != 0;
4941                bold |= flags & 262_144 != 0;
4942            }
4943            if let Some(angle) = parse_number_after(&body, "/ItalicAngle") {
4944                italic |= angle.abs() > f32::EPSILON;
4945            }
4946        }
4947    }
4948    (bold, italic)
4949}
4950
/// Parse the PDF name (`/Name`) that follows the first occurrence of `key`.
///
/// Whitespace between the key and the name is skipped. The name ends at the
/// first whitespace character or one of `/ [ ] < > ( )`. Returns `None` when
/// `key` is absent, the next token does not start with `/`, or the name is
/// empty.
fn parse_name_after(text: &str, key: &str) -> Option<String> {
    let (_, after_key) = text.split_once(key)?;
    let candidate = after_key.trim_start().strip_prefix('/')?;
    let end = candidate
        .find(|character: char| {
            character.is_whitespace()
                || matches!(character, '/' | '[' | ']' | '<' | '>' | '(' | ')')
        })
        .unwrap_or(candidate.len());
    let name = &candidate[..end];
    if name.is_empty() {
        None
    } else {
        Some(name.to_owned())
    }
}
4967
4968fn font_widths(font_body: &str, encoding: &HashMap<u8, String>) -> HashMap<char, f32> {
4969    let Some(first_char) = parse_number_after(font_body, "/FirstChar").map(|value| value as u8)
4970    else {
4971        return HashMap::new();
4972    };
4973    let Some(widths) = parse_number_array_after(font_body, "/Widths") else {
4974        return HashMap::new();
4975    };
4976
4977    widths
4978        .into_iter()
4979        .enumerate()
4980        .filter_map(|(index, width)| {
4981            let code = first_char.wrapping_add(index as u8);
4982            let text = encoding
4983                .get(&code)
4984                .cloned()
4985                .unwrap_or_else(|| (code as char).to_string());
4986            let mut chars = text.chars();
4987            let character = chars.next()?;
4988            chars.next().is_none().then_some((character, width))
4989        })
4990        .collect()
4991}
4992
4993/// Glyph widths for a Type0 (composite) font, read from its descendant CIDFont's
4994/// `/W` array and keyed by CID. Simple fonts carry `/FirstChar`+`/Widths`, but
4995/// composite fonts — the norm for born-digital PDFs from Chrome/Skia, LaTeX, and
4996/// modern Office exporters — keep per-CID widths in `/DescendantFonts[0]/W`.
4997/// Without these every glyph falls back to a flat half-em, which destroys gap-based
4998/// word segmentation. The `/W` array mixes two run encodings: `c [w1 w2 …]` (widths
4999/// for consecutive CIDs starting at `c`) and `c_first c_last w` (one width for a
5000/// CID range). Returns CID → width in 1/1000 em.
5001fn font_cid_widths(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> HashMap<u32, f32> {
5002    let mut widths = HashMap::new();
5003    if parse_name_after(font_body, "/Subtype").as_deref() != Some("Type0") {
5004        return widths;
5005    }
5006    let Some(descendant) = parse_refs_after_key(font_body, "/DescendantFonts")
5007        .into_iter()
5008        .next()
5009    else {
5010        return widths;
5011    };
5012    let Some(cidfont) = object_map.get(&(descendant as u32)) else {
5013        return widths;
5014    };
5015    let body = lossy(&cidfont.body);
5016    let Some((open, close)) = find_w_array(&body) else {
5017        return widths;
5018    };
5019    let mut parser = ContentParser::new(&body.as_bytes()[open..=close]);
5020    let Some(ContentToken::Operand(Operand::Array(items))) = parser.next_operand_or_operator() else {
5021        return widths;
5022    };
5023
5024    let mut index = 0;
5025    while index < items.len() {
5026        match (&items[index], items.get(index + 1)) {
5027            (Operand::Number(first), Some(Operand::Array(list))) => {
5028                let base = *first as i64;
5029                for (offset, width) in list.iter().enumerate() {
5030                    if let Operand::Number(width) = width {
5031                        let cid = base + offset as i64;
5032                        if cid >= 0 {
5033                            widths.insert(cid as u32, *width);
5034                        }
5035                    }
5036                }
5037                index += 2;
5038            }
5039            (Operand::Number(first), Some(Operand::Number(last))) => {
5040                if let Some(Operand::Number(width)) = items.get(index + 2) {
5041                    let (lo, hi) = (*first as i64, *last as i64);
5042                    if lo >= 0 && hi >= lo && hi - lo < 70_000 {
5043                        for cid in lo..=hi {
5044                            widths.insert(cid as u32, *width);
5045                        }
5046                    }
5047                    index += 3;
5048                } else {
5049                    index += 1;
5050                }
5051            }
5052            _ => index += 1,
5053        }
5054    }
5055    widths
5056}
5057
5058/// Locate the `/W` array of a CIDFont, returning the byte span of its `[ … ]`.
5059/// Distinguishes the `/W` key from look-alikes (`/WMode`, `/Widths`) by requiring
5060/// whitespace or `[` immediately after.
5061fn find_w_array(body: &str) -> Option<(usize, usize)> {
5062    let bytes = body.as_bytes();
5063    let mut search = 0;
5064    while let Some(rel) = body[search..].find("/W") {
5065        let key_end = search + rel + 2;
5066        if matches!(bytes.get(key_end), Some(byte) if is_ws(*byte) || *byte == b'[') {
5067            let mut pos = key_end;
5068            while pos < bytes.len() && is_ws(bytes[pos]) {
5069                pos += 1;
5070            }
5071            if bytes.get(pos) == Some(&b'[') {
5072                if let Some(close) = matching_array_close(body, pos) {
5073                    return Some((pos, close));
5074                }
5075            }
5076        }
5077        search = key_end;
5078    }
5079    None
5080}
5081
/// Re-key CID widths by character using the font's ToUnicode cmap.
///
/// Each cmap code of 1–4 bytes is read as a big-endian integer and used as the
/// CID (for Identity-H the CID is the numeric value of the code, which is the
/// cmap key). Only codes that map to exactly one character and have an entry in
/// `cid_widths` produce a `char → width` pair.
fn cid_char_widths(
    cmap: &HashMap<Vec<u8>, String>,
    cid_widths: &HashMap<u32, f32>,
) -> HashMap<char, f32> {
    if cid_widths.is_empty() {
        return HashMap::new();
    }

    cmap.iter()
        .filter(|(code, _)| (1..=4).contains(&code.len()))
        .filter_map(|(code, text)| {
            let mut chars = text.chars();
            let character = chars.next()?;
            if chars.next().is_some() {
                return None;
            }
            // Right-align the code in four bytes to read it as big-endian.
            let mut padded = [0u8; 4];
            padded[4 - code.len()..].copy_from_slice(code);
            let cid = u32::from_be_bytes(padded);
            cid_widths.get(&cid).map(|width| (character, *width))
        })
        .collect()
}
5109
5110fn font_encoding_differences(
5111    font_body: &str,
5112    object_map: &HashMap<u32, Arc<PdfObject>>,
5113) -> HashMap<u8, String> {
5114    if let Some(encoding_ref) = parse_direct_ref_after_key(font_body, "/Encoding") {
5115        if let Some(object) = object_map.get(&(encoding_ref as u32)) {
5116            let differences = parse_encoding_differences(&lossy(&object.body));
5117            if !differences.is_empty() {
5118                return differences;
5119            }
5120        }
5121    }
5122    parse_encoding_differences(font_body)
5123}
5124
5125fn parse_encoding_differences(text: &str) -> HashMap<u8, String> {
5126    let Some(start) = text.find("/Differences") else {
5127        return HashMap::new();
5128    };
5129    let rest = &text[start + "/Differences".len()..];
5130    let Some(open) = rest.find('[') else {
5131        return HashMap::new();
5132    };
5133    let Some(close) = matching_array_close(rest, open) else {
5134        return HashMap::new();
5135    };
5136    let mut parser = ContentParser::new(rest[open..=close].as_bytes());
5137    let Some(ContentToken::Operand(Operand::Array(items))) = parser.next_operand_or_operator()
5138    else {
5139        return HashMap::new();
5140    };
5141
5142    let mut differences = HashMap::new();
5143    let mut code: Option<u16> = None;
5144    for item in items {
5145        match item {
5146            Operand::Number(value) if value >= 0.0 => {
5147                code = Some(value as u16);
5148            }
5149            Operand::Name(name) => {
5150                let Some(current_code) = code else {
5151                    continue;
5152                };
5153                if current_code <= u16::from(u8::MAX) {
5154                    if let Some(text) = glyph_name_to_text(&name) {
5155                        differences.insert(current_code as u8, text);
5156                    }
5157                }
5158                code = current_code.checked_add(1);
5159            }
5160            _ => {}
5161        }
5162    }
5163    differences
5164}
5165
/// Return the byte offset of the `]` that balances the brackets seen from
/// `open` onwards.
///
/// Scanning starts at byte `open`; every `[` raises the nesting level and
/// every `]` lowers it. The offset of the `]` that brings the level back to
/// zero is returned. Returns `None` when a `]` appears before any `[`, when the
/// text ends first, or when `open` is past the end of `text`.
fn matching_array_close(text: &str, open: usize) -> Option<usize> {
    let tail = text.as_bytes().get(open..)?;
    let mut nesting = 0usize;
    for (index, byte) in tail.iter().enumerate() {
        if *byte == b'[' {
            nesting += 1;
        } else if *byte == b']' {
            nesting = nesting.checked_sub(1)?;
            if nesting == 0 {
                return Some(open + index);
            }
        }
    }
    None
}
5182
5183fn parse_to_unicode_cmap(text: &str) -> FontDecoder {
5184    let mut cmap = HashMap::new();
5185    let mut in_bfchar = false;
5186    let mut in_bfrange = false;
5187    let mut bfrange_array_entry = String::new();
5188    let mut bfrange_array_depth = 0i32;
5189
5190    for line in text.lines() {
5191        let trimmed = line.trim();
5192        match trimmed {
5193            value if value.ends_with("beginbfchar") => {
5194                in_bfchar = true;
5195                continue;
5196            }
5197            "endbfchar" => {
5198                in_bfchar = false;
5199                continue;
5200            }
5201            value if value.ends_with("beginbfrange") => {
5202                in_bfrange = true;
5203                continue;
5204            }
5205            "endbfrange" => {
5206                in_bfrange = false;
5207                bfrange_array_entry.clear();
5208                bfrange_array_depth = 0;
5209                continue;
5210            }
5211            _ => {}
5212        }
5213
5214        if in_bfrange {
5215            if bfrange_array_depth > 0 {
5216                bfrange_array_entry.push(' ');
5217                bfrange_array_entry.push_str(trimmed);
5218                bfrange_array_depth += bracket_delta(trimmed);
5219                if bfrange_array_depth <= 0 {
5220                    add_bfrange_entry(&mut cmap, &bfrange_array_entry);
5221                    bfrange_array_entry.clear();
5222                    bfrange_array_depth = 0;
5223                }
5224                continue;
5225            }
5226
5227            let depth = bracket_delta(trimmed);
5228            if depth > 0 {
5229                bfrange_array_entry.clear();
5230                bfrange_array_entry.push_str(trimmed);
5231                bfrange_array_depth = depth;
5232                continue;
5233            }
5234
5235            add_bfrange_entry(&mut cmap, trimmed);
5236            continue;
5237        }
5238
5239        let hexes = hex_strings_in_line(trimmed);
5240        if in_bfchar && hexes.len() >= 2 {
5241            cmap.insert(
5242                hexes[0].clone(),
5243                cmap_text_for_mapping(&hexes[0], &hexes[1]),
5244            );
5245        }
5246    }
5247
5248    let max_code_len = cmap.keys().map(Vec::len).max().unwrap_or(1);
5249    FontDecoder {
5250        cmap,
5251        encoding: HashMap::new(),
5252        widths: HashMap::new(),
5253        max_code_len,
5254        bold: false,
5255        italic: false,
5256        ascent: 0.75,
5257        descent: -0.25,
5258    }
5259}
5260
/// Net bracket balance of `text`: the number of `[` minus the number of `]`.
fn bracket_delta(text: &str) -> i32 {
    let mut balance = 0;
    // Both brackets are ASCII, so scanning bytes matches scanning chars.
    for byte in text.bytes() {
        match byte {
            b'[' => balance += 1,
            b']' => balance -= 1,
            _ => {}
        }
    }
    balance
}
5268
5269fn add_bfrange_entry(cmap: &mut HashMap<Vec<u8>, String>, line: &str) {
5270    let hexes = hex_strings_in_line(line);
5271    if hexes.len() < 3 {
5272        return;
5273    }
5274    if line.contains('[') {
5275        add_bfrange_array(cmap, &hexes);
5276    } else {
5277        add_bfrange(cmap, &hexes);
5278    }
5279}
5280
5281fn add_bfrange(cmap: &mut HashMap<Vec<u8>, String>, hexes: &[Vec<u8>]) {
5282    let Some(start) = hex_to_u32(&hexes[0]) else {
5283        return;
5284    };
5285    let Some(end) = hex_to_u32(&hexes[1]) else {
5286        return;
5287    };
5288    let Some(destination) = hex_to_u32(&hexes[2]) else {
5289        return;
5290    };
5291    let source_len = hexes[0].len();
5292
5293    for offset in 0..=(end.saturating_sub(start)).min(512) {
5294        let source = start + offset;
5295        let destination = destination + offset;
5296        cmap.insert(
5297            number_to_be_bytes(source, source_len),
5298            cmap_text_for_codes(source, destination),
5299        );
5300    }
5301}
5302
5303fn add_bfrange_array(cmap: &mut HashMap<Vec<u8>, String>, hexes: &[Vec<u8>]) {
5304    let Some(start) = hex_to_u32(&hexes[0]) else {
5305        return;
5306    };
5307    let Some(end) = hex_to_u32(&hexes[1]) else {
5308        return;
5309    };
5310    let source_len = hexes[0].len();
5311    let range_len = end.saturating_sub(start).saturating_add(1) as usize;
5312
5313    for (offset, destination) in hexes.iter().skip(2).take(range_len.min(512)).enumerate() {
5314        let source = start + offset as u32;
5315        let source_bytes = number_to_be_bytes(source, source_len);
5316        cmap.insert(
5317            source_bytes.clone(),
5318            cmap_text_for_mapping(&source_bytes, destination),
5319        );
5320    }
5321}
5322
5323fn cmap_text_for_mapping(source: &[u8], destination: &[u8]) -> String {
5324    if destination.len() > 2 {
5325        return utf16be_hex_to_string(destination);
5326    }
5327    let Some(source_code) = hex_to_u32(source) else {
5328        return utf16be_hex_to_string(destination);
5329    };
5330    let Some(destination_code) = hex_to_u32(destination) else {
5331        return utf16be_hex_to_string(destination);
5332    };
5333    cmap_text_for_codes(source_code, destination_code)
5334}
5335
5336fn cmap_text_for_codes(source: u32, destination: u32) -> String {
5337    if is_private_use_text_code(destination) {
5338        if let Some(character) = private_use_source_ascii(source) {
5339            return character.to_string();
5340        }
5341    }
5342    char::from_u32(destination)
5343        .map(|character| character.to_string())
5344        .unwrap_or_default()
5345}
5346
/// Whether `code` lies in the range U+E000..=U+F8FF.
fn is_private_use_text_code(code: u32) -> bool {
    matches!(code, 0xe000..=0xf8ff)
}
5350
/// ASCII stand-in for a source code whose destination is a private-use code
/// point: the character at `source + 28`, when that is printable ASCII
/// (0x20..=0x7E).
///
/// Returns `None` for every other source code, including those so large that
/// `source + 28` does not fit in a `u32` (four-byte source codes can reach
/// `u32::MAX`).
fn private_use_source_ascii(source: u32) -> Option<char> {
    let ascii = source.checked_add(28)?;
    if (0x20..=0x7e).contains(&ascii) {
        char::from_u32(ascii)
    } else {
        None
    }
}
5358
5359fn hex_strings_in_line(line: &str) -> Vec<Vec<u8>> {
5360    let bytes = line.as_bytes();
5361    let mut hexes = Vec::new();
5362    let mut pos = 0;
5363
5364    while pos < bytes.len() {
5365        if bytes[pos] == b'<' && bytes.get(pos + 1) != Some(&b'<') {
5366            let start = pos + 1;
5367            if let Some(end) = bytes[start..].iter().position(|byte| *byte == b'>') {
5368                hexes.push(decode_hex(&bytes[start..start + end]));
5369                pos = start + end + 1;
5370                continue;
5371            }
5372        }
5373        pos += 1;
5374    }
5375
5376    hexes
5377}
5378
/// Decode `bytes` as UTF-16BE text.
///
/// Inputs of two or more bytes are split into big-endian 16-bit units (a
/// trailing odd byte is dropped) and decoded lossily. Shorter inputs are
/// decoded byte-by-byte, each byte becoming the `char` with that value.
fn utf16be_hex_to_string(bytes: &[u8]) -> String {
    match bytes {
        [] | [_] => bytes.iter().copied().map(char::from).collect(),
        _ => {
            let units: Vec<u16> = bytes
                .chunks_exact(2)
                .map(|pair| (u16::from(pair[0]) << 8) | u16::from(pair[1]))
                .collect();
            String::from_utf16_lossy(&units)
        }
    }
}
5390
/// Interpret `bytes` as a big-endian unsigned integer.
///
/// Always returns `Some`. An empty slice gives 0, and when more than four
/// bytes are supplied only the last four contribute to the value.
fn hex_to_u32(bytes: &[u8]) -> Option<u32> {
    let value = bytes
        .iter()
        .fold(0u32, |accumulated, byte| (accumulated << 8) | u32::from(*byte));
    Some(value)
}
5398
/// Encode `value` as exactly `len` big-endian bytes.
///
/// When `len` is less than four the high-order bytes are dropped; when it is
/// more than four the result is left-padded with zeros. Using `checked_shr`
/// keeps the padding case well defined: a plain `>>` by 32 bits or more panics
/// when overflow checks are on and yields wrong bytes otherwise.
fn number_to_be_bytes(value: u32, len: usize) -> Vec<u8> {
    (0..len)
        .rev()
        .map(|byte_index| {
            u32::try_from(byte_index)
                .ok()
                .and_then(|index| index.checked_mul(8))
                .and_then(|bits| value.checked_shr(bits))
                .map_or(0, |shifted| (shifted & 0xff) as u8)
        })
        .collect()
}
5405
5406fn parse_named_refs(text: &str) -> HashMap<String, u32> {
5407    let mut refs = HashMap::new();
5408    let bytes = text.as_bytes();
5409    let mut pos = 0;
5410
5411    while pos < bytes.len() {
5412        if bytes[pos] != b'/' || bytes.get(pos + 1) == Some(&b'/') {
5413            pos += 1;
5414            continue;
5415        }
5416        pos += 1;
5417        let name_start = pos;
5418        while pos < bytes.len() && !is_delimiter_or_ws(bytes[pos]) {
5419            pos += 1;
5420        }
5421        let name = lossy(&bytes[name_start..pos]);
5422        while pos < bytes.len() && is_ws(bytes[pos]) {
5423            pos += 1;
5424        }
5425        let Some((object, after_object)) = parse_unsigned_at(bytes, pos) else {
5426            continue;
5427        };
5428        let Some(after_space) = skip_required_ws(bytes, after_object) else {
5429            pos += 1;
5430            continue;
5431        };
5432        let Some((_generation, after_generation)) = parse_unsigned_at(bytes, after_space) else {
5433            pos += 1;
5434            continue;
5435        };
5436        let Some(after_space) = skip_required_ws(bytes, after_generation) else {
5437            pos += 1;
5438            continue;
5439        };
5440        if bytes.get(after_space) == Some(&b'R') {
5441            refs.insert(name, object as u32);
5442            pos = after_space + 1;
5443        }
5444    }
5445
5446    refs
5447}
5448
5449fn parse_refs(text: &str) -> Vec<usize> {
5450    let mut refs = Vec::new();
5451    let bytes = text.as_bytes();
5452    let mut pos = 0;
5453
5454    while pos < bytes.len() {
5455        let Some((object, after_object)) = parse_unsigned_at(bytes, pos) else {
5456            pos += 1;
5457            continue;
5458        };
5459        let Some(after_space) = skip_required_ws(bytes, after_object) else {
5460            pos += 1;
5461            continue;
5462        };
5463        let Some((_generation, after_generation)) = parse_unsigned_at(bytes, after_space) else {
5464            pos += 1;
5465            continue;
5466        };
5467        let Some(after_space) = skip_required_ws(bytes, after_generation) else {
5468            pos += 1;
5469            continue;
5470        };
5471        if bytes.get(after_space) == Some(&b'R') {
5472            refs.push(object);
5473            pos = after_space + 1;
5474        } else {
5475            pos += 1;
5476        }
5477    }
5478
5479    refs
5480}
5481
5482fn parse_number_array_after(text: &str, key: &str) -> Option<Vec<f32>> {
5483    let start = text.find(key)?;
5484    let rest = &text[start + key.len()..];
5485    let open = rest.find('[')?;
5486    let close = rest[open + 1..].find(']')?;
5487    Some(
5488        rest[open + 1..open + 1 + close]
5489            .split_whitespace()
5490            .filter_map(|part| part.parse::<f32>().ok())
5491            .collect(),
5492    )
5493}
5494
5495fn parse_number_after(text: &str, key: &str) -> Option<f32> {
5496    let start = text.find(key)?;
5497    let bytes = text.as_bytes();
5498    let mut pos = start + key.len();
5499    while pos < bytes.len() && (is_ws(bytes[pos]) || matches!(bytes[pos], b'[' | b']')) {
5500        pos += 1;
5501    }
5502    let number_start = pos;
5503    while pos < bytes.len() && matches!(bytes[pos], b'+' | b'-' | b'.' | b'0'..=b'9') {
5504        pos += 1;
5505    }
5506    if pos == number_start {
5507        return None;
5508    }
5509    text[number_start..pos].parse().ok()
5510}
5511
5512fn first_text_operand(
5513    operands: &[Operand],
5514    state: &GraphicsState,
5515    fonts: &HashMap<String, Arc<FontDecoder>>,
5516) -> Option<String> {
5517    operands
5518        .first()
5519        .and_then(|operand| operand_text(operand, state, fonts))
5520}
5521
5522fn operand_text(
5523    operand: &Operand,
5524    state: &GraphicsState,
5525    fonts: &HashMap<String, Arc<FontDecoder>>,
5526) -> Option<String> {
5527    match operand {
5528        Operand::Literal(bytes) | Operand::Hex(bytes) => Some(decode_pdf_text(
5529            bytes,
5530            state
5531                .font_name
5532                .as_ref()
5533                .and_then(|font_name| fonts.get(font_name))
5534                .map(|font| font.as_ref()),
5535        )),
5536        _ => None,
5537    }
5538}
5539
5540fn text_from_array(
5541    items: &[Operand],
5542    state: &GraphicsState,
5543    fonts: &HashMap<String, Arc<FontDecoder>>,
5544) -> String {
5545    // A `TJ` number displaces the next glyphs by `-value/1000 * font_size` (text
5546    // space): a *negative* value opens a rightward gap, a *positive* value tightens
5547    // (kerning). When the gap is a meaningful fraction of the font's own space
5548    // width it is a word space the producer encoded as positioning rather than a
5549    // space glyph — the dominant cause of glued words in professionally typeset
5550    // PDFs. Scaling to the actual space width (not a fixed 120/1000-em cutoff) and
5551    // honoring the sign recovers those spaces without splitting kerned pairs.
5552    let space_width = space_advance_width(state, fonts).max(state.font_size * 0.04);
5553    let gap_threshold = space_width * SPACE_GAP_FRACTION;
5554    let mut text = String::new();
5555    for item in items {
5556        match item {
5557            Operand::Number(value) => {
5558                let gap = -value / 1000.0 * state.font_size * state.horizontal_scaling;
5559                if gap >= gap_threshold && !text.ends_with(' ') {
5560                    text.push(' ');
5561                }
5562            }
5563            _ => {
5564                if let Some(part) = operand_text(item, state, fonts) {
5565                    text.push_str(&part);
5566                }
5567            }
5568        }
5569    }
5570    text
5571}
5572
5573/// Fraction of a font's space-glyph advance that a `TJ` rightward gap must reach
5574/// to read as a word space. Below this it is intra-word kerning. Tuned to sit
5575/// well above typical kerning (~0.05–0.15 em) and below a real inter-word gap.
5576const SPACE_GAP_FRACTION: f32 = 0.3;
5577
5578fn decode_pdf_text(bytes: &[u8], font: Option<&FontDecoder>) -> String {
5579    if let Some(font) = font {
5580        if !font.cmap.is_empty() {
5581            return decode_with_cmap(bytes, font);
5582        }
5583        if !font.encoding.is_empty() {
5584            return bytes.iter().map(|byte| font.decode_byte(*byte)).collect();
5585        }
5586    }
5587
5588    if bytes.starts_with(&[0xfe, 0xff]) {
5589        let utf16 = bytes[2..]
5590            .chunks_exact(2)
5591            .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
5592            .collect::<Vec<_>>();
5593        String::from_utf16_lossy(&utf16)
5594    } else {
5595        bytes.iter().map(|byte| *byte as char).collect()
5596    }
5597}
5598
5599fn decode_with_cmap(bytes: &[u8], font: &FontDecoder) -> String {
5600    let mut output = String::new();
5601    let mut index = 0;
5602
5603    while index < bytes.len() {
5604        let max_len = font.max_code_len.min(bytes.len() - index).max(1);
5605        let mut matched = false;
5606        for len in (1..=max_len).rev() {
5607            if let Some(text) = font.cmap.get(&bytes[index..index + len]) {
5608                output.push_str(text);
5609                index += len;
5610                matched = true;
5611                break;
5612            }
5613        }
5614        if !matched {
5615            output.push_str(&font.decode_byte(bytes[index]));
5616            index += 1;
5617        }
5618    }
5619
5620    output
5621}
5622
/// Translate a glyph name (as found in an encoding `/Differences` array) into
/// the text it represents.
///
/// Lookup order: the fixed table below (standard names plus TeX math-font
/// variants, several of them normalized to a plain ASCII equivalent), then any
/// single-character name taken as itself, then the `uniXXXX` / `uXXXX[XX]`
/// forms handled by `unicode_glyph_name_to_text`. Returns `None` for names
/// that none of these recognize.
///
/// `reflexsubset` is U+2286 (⊆) and `reflexsuperset` is U+2287 (⊇), matching
/// the Adobe Glyph List.
fn glyph_name_to_text(name: &str) -> Option<String> {
    let text = match name {
        "space" => " ",
        "exclam" => "!",
        "quotedbl" => "\"",
        "numbersign" => "#",
        "dollar" => "$",
        "percent" => "%",
        "ampersand" => "&",
        "quotesingle" | "quoteright" | "quoteleft" => "'",
        "parenleft" | "parenleftbig" | "parenleftBig" | "parenleftbigg" | "parenleftBigg" => "(",
        "parenright" | "parenrightbig" | "parenrightBig" | "parenrightbigg" | "parenrightBigg" => {
            ")"
        }
        "asterisk" | "asteriskmath" => "*",
        "plus" => "+",
        "comma" => ",",
        "hyphen" => "-",
        "period" => ".",
        "slash" => "/",
        "zero" => "0",
        "one" => "1",
        "two" => "2",
        "three" => "3",
        "four" => "4",
        "five" => "5",
        "six" => "6",
        "seven" => "7",
        "eight" => "8",
        "nine" => "9",
        "colon" => ":",
        "semicolon" => ";",
        "less" => "<",
        "equal" => "=",
        "greater" => ">",
        "question" => "?",
        "at" => "@",
        "bracketleft" => "[",
        "backslash" => "\\",
        "bracketright" => "]",
        "asciicircum" | "circumflex" | "hatwide" | "hatwider" | "hatwidest" => "^",
        "underscore" => "_",
        "braceleft" | "braceleftBig" | "braceleftBigg" | "bracelefttp" | "braceleftbt"
        | "braceleftmid" => "{",
        "bar" | "vextendsingle" | "braceex" => "|",
        "braceright" | "bracerightBig" => "}",
        "asciitilde" | "tilde" | "tildewide" => "~",
        "ff" => "ff",
        "fi" => "fi",
        "fl" => "fl",
        "ffi" => "ffi",
        "ffl" => "ffl",
        "Gamma" => "Γ",
        "Theta" => "Θ",
        "Lambda" => "Λ",
        "Pi" => "Π",
        "Sigma" => "Σ",
        "Phi" => "Φ",
        "Omega" => "Ω",
        "alpha" => "α",
        "beta" => "β",
        "gamma" => "γ",
        "delta" => "δ",
        "epsilon" => "ε",
        "zeta" => "ζ",
        "lambda" => "λ",
        "mu" => "μ",
        "pi" | "pi1" => "π",
        "rho" => "ρ",
        "sigma" => "σ",
        "tau" => "τ",
        "phi" => "φ",
        "chi" => "χ",
        "omega" => "ω",
        "partialdiff" => "∂",
        "minus" => "−",
        "periodcentered" => "·",
        "multiply" => "×",
        "plusminus" => "±",
        "circlemultiply" => "⊗",
        "openbullet" | "bullet" => "•",
        "lessequal" => "≤",
        "greaterequal" => "≥",
        "similar" => "∼",
        "arrowright" => "→",
        "mapsto" => "↦",
        "prime" => "′",
        "infinity" => "∞",
        "element" => "∈",
        "universal" => "∀",
        "union" | "uniontext" | "uniondisplay" => "∪",
        "intersection" | "intersectiontext" | "intersectiondisplay" => "∩",
        // Written as escapes so the two look-alike glyphs cannot be confused.
        "reflexsubset" => "\u{2286}",   // ⊆ SUBSET OF OR EQUAL TO
        "reflexsuperset" => "\u{2287}", // ⊇ SUPERSET OF OR EQUAL TO
        "summationtext" | "summationdisplay" => "∑",
        "productdisplay" => "∏",
        "integraldisplay" => "∫",
        "circleplusdisplay" => "⊕",
        "unionsqdisplay" => "⊔",
        "negationslash" => "̸",
        _ if name.chars().count() == 1 => name,
        _ => return unicode_glyph_name_to_text(name),
    };
    Some(text.to_owned())
}

/// Decode a glyph name that spells out its own code points.
///
/// Two forms are recognized: `uni` followed by one or more groups of four hex
/// digits (one character per group), and `u` followed by four to six hex
/// digits (a single character). Returns `None` when the name has neither
/// form, the digits are not valid hex, or a value is not a valid `char`.
fn unicode_glyph_name_to_text(name: &str) -> Option<String> {
    if let Some(hex) = name.strip_prefix("uni") {
        if hex.len() >= 4 && hex.len() % 4 == 0 {
            let mut output = String::new();
            for chunk in hex.as_bytes().chunks(4) {
                // A chunk that splits a multi-byte character is not valid
                // UTF-8 and rejects the whole name.
                let chunk = std::str::from_utf8(chunk).ok()?;
                let code = u32::from_str_radix(chunk, 16).ok()?;
                output.push(char::from_u32(code)?);
            }
            return Some(output);
        }
    }
    if let Some(hex) = name.strip_prefix('u') {
        if (4..=6).contains(&hex.len()) {
            let code = u32::from_str_radix(hex, 16).ok()?;
            return char::from_u32(code).map(|character| character.to_string());
        }
    }
    None
}
5749
5750fn numbers(operands: &[Operand], count: usize) -> Option<Vec<f32>> {
5751    if operands.len() < count {
5752        return None;
5753    }
5754    let values = operands[operands.len() - count..]
5755        .iter()
5756        .map(|operand| match operand {
5757            Operand::Number(value) => Some(*value),
5758            _ => None,
5759        })
5760        .collect::<Option<Vec<_>>>()?;
5761    Some(values)
5762}
5763
5764fn block_text(block: &Block) -> String {
5765    match block {
5766        Block::Text(text) => text.text.clone(),
5767        Block::Table(table) => {
5768            let mut rows = Vec::new();
5769            if !table.headers.is_empty() {
5770                rows.push(table.headers.join(" "));
5771            }
5772            rows.extend(table.rows.iter().map(|row| row.join(" ")));
5773            rows.join("\n")
5774        }
5775        Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
5776    }
5777}
5778
/// Label a text line `heading_1`, `heading_2`, `heading_3` or `paragraph`
/// from the ratio of its font size to the page body size.
///
/// Ratios of at least 1.5, 1.3 and 1.12 give heading levels 1, 2 and 3.
/// Everything else is a paragraph, as is any line that is empty, has 200 or
/// more characters (long runs are body copy even when set slightly larger), or
/// comes with a non-positive size. The renderer maps `heading_N` to Markdown
/// `#`*N and LaTeX `\section`/`\subsection`/etc.
fn classify_text_line(text: &str, line_size: f32, body_size: f32) -> String {
    // Minimum size ratio for each heading level, largest first.
    const TIERS: [(f32, &str); 3] = [(1.5, "heading_1"), (1.3, "heading_2"), (1.12, "heading_3")];

    let length = text.chars().count();
    let eligible = (1..200).contains(&length) && !(body_size <= 0.0) && !(line_size <= 0.0);
    let label = if eligible {
        let ratio = line_size / body_size;
        TIERS
            .iter()
            .find(|(minimum, _)| ratio >= *minimum)
            .map_or("paragraph", |(_, label)| *label)
    } else {
        "paragraph"
    };
    label.to_owned()
}
5801
5802/// The font size of the dominant (longest by character count) run in a line.
5803fn line_dominant_size(line: &TextLine) -> f32 {
5804    let mut best_chars = 0usize;
5805    let mut best_size = 0.0f32;
5806    for run in &line.runs {
5807        if run.size <= 0.0 {
5808            continue;
5809        }
5810        let chars = run.text.chars().count();
5811        if chars >= best_chars {
5812            best_chars = chars;
5813            best_size = run.size;
5814        }
5815    }
5816    best_size
5817}
5818
5819/// The page's body font size: the most common run size (in 0.5pt buckets),
5820/// weighted by character count. Used as the baseline for heading detection.
5821fn page_body_size(lines: &[TextLine]) -> f32 {
5822    let mut weights: Vec<(u32, usize)> = Vec::new();
5823    for line in lines {
5824        for run in &line.runs {
5825            if run.size <= 0.0 {
5826                continue;
5827            }
5828            let bucket = (run.size * 2.0).round() as u32;
5829            let chars = run.text.chars().count();
5830            if let Some(entry) = weights.iter_mut().find(|(value, _)| *value == bucket) {
5831                entry.1 += chars;
5832            } else {
5833                weights.push((bucket, chars));
5834            }
5835        }
5836    }
5837    weights
5838        .into_iter()
5839        .max_by_key(|(_, chars)| *chars)
5840        .map(|(bucket, _)| bucket as f32 / 2.0)
5841        .unwrap_or(0.0)
5842}
5843
5844fn source_ids_for_line(line: &TextLine) -> Vec<String> {
5845    source_ids_for_runs(&line.runs)
5846}
5847
5848fn source_ids_for_runs(runs: &[TextRun]) -> Vec<String> {
5849    let mut ids = Vec::new();
5850    for run in runs {
5851        for id in &run.source_object_ids {
5852            if !ids.contains(id) {
5853                ids.push(id.clone());
5854            }
5855        }
5856    }
5857    ids
5858}
5859
5860fn anchor(page_number: usize, bbox: Option<BBox>, pdf_object_ids: Vec<String>) -> SourceAnchor {
5861    SourceAnchor {
5862        page_number,
5863        pdf_object_ids,
5864        bbox,
5865        extraction_method: "native_pdf".to_owned(),
5866    }
5867}
5868
5869fn warning(code: &str, severity: &str, message: &str, page_number: Option<usize>) -> Warning {
5870    Warning {
5871        code: code.to_owned(),
5872        severity: severity.to_owned(),
5873        message: message.to_owned(),
5874        source_anchor: page_number.map(|page_number| anchor(page_number, None, Vec::new())),
5875    }
5876}
5877
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a synthetic run at `(x, y)` with the given font size. The width
    /// is approximated as 0.4 × size per byte of text.
    fn test_run(text: &str, x: f32, y: f32, size: f32) -> TextRun {
        let width = text.len() as f32 * size * 0.4;
        TextRun {
            text: text.to_owned(),
            bbox: BBox {
                x,
                y,
                width,
                height: size,
            },
            baseline_y: y,
            font: None,
            size,
            space_width: size * 0.25,
            bold: false,
            italic: false,
            source_object_ids: Vec::new(),
        }
    }

    /// A smaller, slightly raised "24" sitting between two prose runs must be
    /// joined as ordinary space-separated text.
    #[test]
    fn text_from_line_runs_does_not_treat_slash_prose_page_number_as_script() {
        let runs = vec![
            test_run("Art Cutting / Bates Technical College", 72.0, 720.0, 12.0),
            test_run("24", 300.0, 722.0, 8.0),
            test_run("Core Competencies", 315.0, 720.0, 12.0),
        ];
        let bbox = BBox {
            x: 72.0,
            y: 720.0,
            width: 360.0,
            height: 12.0,
        };
        let line = TextLine {
            runs,
            bbox,
            baseline_y: 720.0,
        };

        let joined = text_from_line_runs(&line);

        assert_eq!(
            joined,
            "Art Cutting / Bates Technical College 24 Core Competencies"
        );
    }
}
5924
5925fn union_boxes(boxes: impl IntoIterator<Item = BBox>) -> Option<BBox> {
5926    let mut iter = boxes.into_iter();
5927    let first = iter.next()?;
5928    let mut min_x = first.x;
5929    let mut min_y = first.y;
5930    let mut max_x = first.x + first.width;
5931    let mut max_y = first.y + first.height;
5932
5933    for bbox in iter {
5934        min_x = min_x.min(bbox.x);
5935        min_y = min_y.min(bbox.y);
5936        max_x = max_x.max(bbox.x + bbox.width);
5937        max_y = max_y.max(bbox.y + bbox.height);
5938    }
5939
5940    Some(BBox {
5941        x: min_x,
5942        y: min_y,
5943        width: max_x - min_x,
5944        height: max_y - min_y,
5945    })
5946}
5947
5948fn extract_info_string(objects: &[PdfObject], key: &str) -> Option<String> {
5949    let needle = format!("/{key}");
5950    objects.iter().find_map(|object| {
5951        let body = lossy(&object.body);
5952        if !(body.contains("/Producer") || body.contains("/Creator") || body.contains("/Author")) {
5953            return None;
5954        }
5955        let start = body.find(&needle)?;
5956        let rest = &object.body[start + needle.len()..];
5957        let open = rest.iter().position(|byte| *byte == b'(')?;
5958        let mut parser = ContentParser::new(&rest[open..]);
5959        match parser.next_operand_or_operator()? {
5960            ContentToken::Operand(Operand::Literal(bytes)) => Some(decode_pdf_text(&bytes, None)),
5961            _ => None,
5962        }
5963    })
5964}
5965
/// Reads the version from a `%PDF-x.y` header.
///
/// Only the first line (up to the first LF or CR) is inspected. It must be
/// valid UTF-8 and start with `%PDF-`; everything after that prefix is
/// returned verbatim. Returns `None` otherwise.
fn pdf_version(bytes: &[u8]) -> Option<String> {
    let header_end = bytes
        .iter()
        .position(|&byte| byte == b'\n' || byte == b'\r')
        .unwrap_or(bytes.len());
    let header = std::str::from_utf8(&bytes[..header_end]).ok()?;
    let version = header.strip_prefix("%PDF-")?;
    Some(version.to_owned())
}
5971
5972fn decode_hex(bytes: &[u8]) -> Vec<u8> {
5973    let hex = bytes
5974        .iter()
5975        .copied()
5976        .filter(|byte| !is_ws(*byte))
5977        .collect::<Vec<_>>();
5978    let mut output = Vec::new();
5979    let mut index = 0;
5980    while index < hex.len() {
5981        let high = hex_value(hex[index]).unwrap_or(0);
5982        let low = hex
5983            .get(index + 1)
5984            .and_then(|byte| hex_value(*byte))
5985            .unwrap_or(0);
5986        output.push((high << 4) | low);
5987        index += 2;
5988    }
5989    output
5990}
5991
/// Converts one ASCII hex digit (`0-9`, `a-f`, `A-F`) to its numeric value.
///
/// Returns `None` for any other byte.
fn hex_value(byte: u8) -> Option<u8> {
    // A radix-16 digit is at most 15, so the narrowing conversion never fails.
    char::from(byte)
        .to_digit(16)
        .and_then(|digit| u8::try_from(digit).ok())
}
6000
/// Parses a run of ASCII decimal digits starting at `pos`.
///
/// Returns the parsed value together with the index just past the last digit.
/// Returns `None` when there is no digit at `pos` (including when `pos` is at
/// or beyond the end of `bytes`) or when the number does not fit in `usize`.
fn parse_unsigned_at(bytes: &[u8], pos: usize) -> Option<(usize, usize)> {
    let tail = bytes.get(pos..)?;
    let digit_count = tail.iter().take_while(|byte| byte.is_ascii_digit()).count();
    if digit_count == 0 {
        return None;
    }

    let digits = std::str::from_utf8(&tail[..digit_count]).ok()?;
    let value = digits.parse::<usize>().ok()?;
    Some((value, pos + digit_count))
}
6015
6016fn skip_required_ws(bytes: &[u8], mut pos: usize) -> Option<usize> {
6017    if pos >= bytes.len() || !is_ws(bytes[pos]) {
6018        return None;
6019    }
6020    while pos < bytes.len() && is_ws(bytes[pos]) {
6021        pos += 1;
6022    }
6023    Some(pos)
6024}
6025
/// Reports whether `pos` sits at the start of a line: either at the very
/// beginning of `bytes` or immediately after a LF or CR.
///
/// Despite the name, other whitespace before `pos` does not count.
///
/// # Panics
///
/// Panics if `pos` is greater than `bytes.len()`.
fn is_ws_or_line_start(bytes: &[u8], pos: usize) -> bool {
    match bytes[..pos].last() {
        None => true,
        Some(previous) => *previous == b'\n' || *previous == b'\r',
    }
}
6029
6030fn is_delimiter_or_ws(byte: u8) -> bool {
6031    is_ws(byte) || matches!(byte, b'[' | b']' | b'<' | b'>' | b'/' | b'(' | b')')
6032}
6033
/// Reports whether `byte` is one of the six whitespace bytes this parser
/// recognises: NUL, horizontal tab, line feed, form feed, carriage return or
/// space.
fn is_ws(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0c, b'\r', b' '];
    WHITESPACE.contains(&byte)
}
6037
/// Finds the first occurrence of `needle` in `haystack`.
///
/// Returns the byte offset of the match, or `None` when `needle` does not
/// occur. An empty `needle` matches at offset 0, mirroring `str::find("")`.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics on a window size of zero, so the empty needle
    // has to be handled before scanning.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
6043
6044fn contains_name(bytes: &[u8], name: &[u8]) -> bool {
6045    find_subslice(bytes, name).is_some()
6046}
6047
/// Decodes `bytes` as UTF-8 into an owned `String`, replacing each invalid
/// sequence with U+FFFD.
fn lossy(bytes: &[u8]) -> String {
    let decoded = String::from_utf8_lossy(bytes);
    String::from(decoded)
}
6051
6052#[allow(dead_code)]
6053fn sha256_hex(bytes: &[u8]) -> String {
6054    let digest = Sha256::digest(bytes);
6055    digest.iter().map(|byte| format!("{byte:02x}")).collect()
6056}
dongler_core/pdf.rs

dongler_core/
pdf.rs