1use std::borrow::Cow;
2use std::collections::HashMap;
3use std::io::Read;
4use std::sync::Arc;
5
6use flate2::read::ZlibDecoder;
7#[cfg(feature = "parallel")]
8use rayon::prelude::*;
9use sha2::{Digest, Sha256};
10
11use crate::engine::ExtractionEngine;
12use crate::error::{DonglerError, Result};
13use crate::ir::{
14 Asset, BBox, Block, Confidence, Document, FigureBlock, ImageObject, Line, Metadata, Page,
15 SourceAnchor, Span, TableBlock, TableCell, TextBlock, Warning, SCHEMA_VERSION,
16};
17use crate::source::Source;
18
/// Native PDF extraction engine.
///
/// A field-less marker type; all work is reached through its
/// [`ExtractionEngine`] implementation.
#[derive(Debug, Default, Clone, Copy)]
pub struct PdfEngine;
21
22impl ExtractionEngine for PdfEngine {
23 fn name(&self) -> &'static str {
24 "pdf-native"
25 }
26
27 fn extract(&self, source: &Source) -> Result<Document> {
28 let bytes = source.bytes.as_deref().unwrap_or(source.content.as_bytes());
29 extract_pdf(bytes, source, self.name())
30 }
31}
32
/// One indirect object found in the PDF file.
#[derive(Debug, Clone)]
struct PdfObject {
    /// Object number; used as the lookup key in the object map.
    object_number: u32,
    /// Generation number; formatted together with `object_number` as
    /// `"<number> <generation> R"` when an object id is reported.
    generation: u16,
    /// Raw bytes of the object body.
    body: Vec<u8>,
}
39
/// A page object discovered during scanning, before its content is
/// interpreted.
#[derive(Debug, Clone)]
struct PageSeed {
    /// 1-based page number, assigned in discovery order by `parse_pdf_pages`.
    number: usize,
    /// Text of the page object body; it is searched for keys such as
    /// `/MediaBox`, `/Rotate` and `/Contents`.
    body: String,
}
45
/// Result of extracting a single page.
#[derive(Debug, Clone)]
struct PageExtraction {
    /// The structured page (blocks, images, assets, warnings).
    page: Page,
    /// Plain text of the page: the non-empty block texts joined with `\n`.
    text: String,
    /// Geometry of every non-blank text run on the page.
    spans: Vec<SpanGeom>,
}
52
/// Bounding box and text of a single non-blank text run.
#[derive(Debug, Clone, PartialEq)]
pub struct SpanGeom {
    /// Bounding box of the run (already rotated when the page declares a
    /// non-zero rotation).
    pub bbox: BBox,
    /// Text of the run, exactly as decoded.
    pub text: String,
}
62
/// All text-run spans of one page, together with the page dimensions.
#[derive(Debug, Clone, PartialEq)]
pub struct PageSpans {
    /// Page number copied from the extracted page.
    pub page_number: usize,
    /// Page width; `0.0` when the extracted page carries no width.
    pub width: f32,
    /// Page height; `0.0` when the extracted page carries no height.
    pub height: f32,
    /// Non-blank text runs of the page.
    pub spans: Vec<SpanGeom>,
}
71
/// A piece of text emitted by one text-showing operator.
#[derive(Debug, Clone)]
struct TextRun {
    /// Decoded text of the run.
    text: String,
    /// Bounds of the run after applying the text matrix and the CTM.
    bbox: BBox,
    /// Y coordinate of the run origin (including text rise) after applying
    /// the text matrix and the CTM.
    baseline_y: f32,
    /// Font resource name active when the run was shown, if any.
    font: Option<String>,
    /// Font size from the graphics state (not scaled by any matrix).
    size: f32,
    /// Advance of a space glyph at the current font size and horizontal
    /// scaling.
    space_width: f32,
    /// Bold flag taken from the font decoder (`false` when no decoder).
    bold: bool,
    /// Italic flag taken from the font decoder (`false` when no decoder).
    italic: bool,
    /// Ids of the content-stream objects the run came from.
    source_object_ids: Vec<String>,
}
91
/// A group of text runs treated as one visual line.
#[derive(Debug, Clone)]
struct TextLine {
    /// Runs belonging to the line; not guaranteed to be sorted by x
    /// (see `runs_sorted_by_x`).
    runs: Vec<TextRun>,
    /// Bounding box of the line (presumably the union of its runs — the
    /// constructor is not in this part of the file; TODO confirm).
    bbox: BBox,
    /// Baseline y coordinate of the line.
    baseline_y: f32,
}
98
/// A detected table plus the text lines it consumed.
#[derive(Debug, Clone)]
struct DetectedTable {
    /// The table block to emit.
    table: TableBlock,
    /// Indices of the lines absorbed by the table. After
    /// `detect_page_tables` these refer to the page's full line slice.
    line_indices: Vec<usize>,
}
104
/// A text line considered as a potential table row.
// NOTE(review): not used in the visible part of the file; field meanings
// below are inferred from names and types only — confirm against the table
// detector.
#[derive(Debug, Clone)]
struct TableRowCandidate {
    /// Index of the source line.
    line_index: usize,
    /// Runs acting as the cells of the row.
    cells: Vec<TextRun>,
}
110
/// A straight path segment, stored by its two end points after the CTM has
/// been applied.
#[derive(Debug, Clone, Copy)]
struct GraphicEdge {
    /// X of the start point.
    x0: f32,
    /// Y of the start point.
    y0: f32,
    /// X of the end point.
    x1: f32,
    /// Y of the end point.
    y1: f32,
}
118
/// Kind of raised or lowered script a run can be classified as.
// NOTE(review): the classifier is outside the visible part of the file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScriptKind {
    /// Text set above the baseline.
    Superscript,
    /// Text set below the baseline.
    Subscript,
}
124
/// Borrowed partition of a page's lines into columns.
// NOTE(review): built and consumed outside the visible part of the file;
// the field descriptions are inferred from names — confirm against the
// reading-order code.
#[derive(Debug, Clone)]
struct ColumnLayout<'a> {
    /// Lines placed before the columns.
    leading: Vec<&'a TextLine>,
    /// Lines of each column, one inner vector per column.
    columns: Vec<Vec<&'a TextLine>>,
    /// Lines placed after the columns.
    trailing: Vec<&'a TextLine>,
}
131
/// Everything collected while interpreting content streams.
#[derive(Debug, Clone)]
struct ContentExtraction {
    /// Text runs in the order they were shown.
    text_runs: Vec<TextRun>,
    /// Stroked path segments.
    edges: Vec<GraphicEdge>,
    /// Image XObjects that were drawn.
    images: Vec<ImageObject>,
    /// Asset records; one is pushed alongside every image.
    assets: Vec<Asset>,
    /// Warnings raised during interpretation.
    warnings: Vec<Warning>,
}
140
/// Decoding tables and metrics for one font resource.
#[derive(Debug, Clone, Default)]
struct FontDecoder {
    /// Maps character-code byte sequences to decoded text.
    cmap: HashMap<Vec<u8>, String>,
    /// Maps single-byte codes to decoded text; consulted by `decode_byte`.
    encoding: HashMap<u8, String>,
    /// Glyph widths keyed by decoded character, in thousandths of the font
    /// size (callers divide by 1000 and multiply by the font size).
    widths: HashMap<char, f32>,
    /// Presumably the longest code length (in bytes) present in `cmap` —
    /// TODO confirm against the decoder.
    max_code_len: usize,
    /// Whether the font is considered bold.
    bold: bool,
    /// Whether the font is considered italic.
    italic: bool,
    /// Ascent as a multiple of the font size.
    ascent: f32,
    /// Descent as a multiple of the font size (callers fall back to a
    /// negative value when no font is available).
    descent: f32,
}
152
153impl FontDecoder {
154 fn decode_byte(&self, byte: u8) -> String {
155 self.encoding
156 .get(&byte)
157 .cloned()
158 .unwrap_or_else(|| (byte as char).to_string())
159 }
160}
161
/// One operand of a content-stream operator.
#[derive(Debug, Clone)]
enum Operand {
    /// Numeric operand.
    Number(f32),
    /// Name operand (for example a font or XObject resource name).
    Name(String),
    /// Bytes of a literal string operand.
    Literal(Vec<u8>),
    /// Bytes of a hexadecimal string operand.
    Hex(Vec<u8>),
    /// Array operand, as used by `TJ`.
    Array(Vec<Operand>),
    /// Any operand the interpreter does not model.
    Other,
}
171
/// A parsed content-stream operation: an operator with its operands.
#[derive(Debug, Clone)]
struct ContentOp {
    /// Operands that belong to `operator`.
    operands: Vec<Operand>,
    /// Operator keyword, e.g. `"Tj"` or `"cm"`.
    operator: String,
}
177
/// Graphics and text state tracked while interpreting a content stream.
///
/// The whole struct is saved by `q` and restored by `Q`.
#[derive(Debug, Clone)]
struct GraphicsState {
    /// Current transformation matrix, updated by `cm`.
    ctm: Matrix,
    /// Text matrix; advanced after every shown run.
    text_matrix: Matrix,
    /// Text line matrix; start of the current text line.
    line_matrix: Matrix,
    /// Font resource name selected by `Tf`.
    font_name: Option<String>,
    /// Font size selected by `Tf`.
    font_size: f32,
    /// Line leading, set by `TL`, `TD` and (as `1.2 * size`) by `Tf`.
    leading: f32,
    /// Extra spacing added per glyph (`Tc`).
    char_spacing: f32,
    /// Extra spacing added per space character (`Tw`).
    word_spacing: f32,
    /// Horizontal scaling as a fraction (`Tz` operand divided by 100).
    horizontal_scaling: f32,
    /// Text rise (`Ts`).
    text_rise: f32,
}
191
192impl Default for GraphicsState {
193 fn default() -> Self {
194 Self {
195 ctm: Matrix::identity(),
196 text_matrix: Matrix::identity(),
197 line_matrix: Matrix::identity(),
198 font_name: None,
199 font_size: 12.0,
200 leading: 12.0,
201 char_spacing: 0.0,
202 word_spacing: 0.0,
203 horizontal_scaling: 1.0,
204 text_rise: 0.0,
205 }
206 }
207}
208
/// A 2D affine transform stored as the six numbers `[a b c d e f]`.
///
/// A point `(x, y)` maps to `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Debug, Clone, Copy)]
struct Matrix {
    /// X contribution of the input x.
    a: f32,
    /// Y contribution of the input x.
    b: f32,
    /// X contribution of the input y.
    c: f32,
    /// Y contribution of the input y.
    d: f32,
    /// X translation.
    e: f32,
    /// Y translation.
    f: f32,
}
218
219impl Matrix {
220 fn identity() -> Self {
221 Self {
222 a: 1.0,
223 b: 0.0,
224 c: 0.0,
225 d: 1.0,
226 e: 0.0,
227 f: 0.0,
228 }
229 }
230
231 fn multiply(self, other: Self) -> Self {
232 Self {
233 a: self.a * other.a + self.b * other.c,
234 b: self.a * other.b + self.b * other.d,
235 c: self.c * other.a + self.d * other.c,
236 d: self.c * other.b + self.d * other.d,
237 e: self.e * other.a + self.f * other.c + other.e,
238 f: self.e * other.b + self.f * other.d + other.f,
239 }
240 }
241
242 fn point(self, x: f32, y: f32) -> (f32, f32) {
243 (
244 self.a * x + self.c * y + self.e,
245 self.b * x + self.d * y + self.f,
246 )
247 }
248
249 fn translate(self, x: f32, y: f32) -> Self {
250 Self {
251 e: self.e + self.a * x + self.c * y,
252 f: self.f + self.b * x + self.d * y,
253 ..self
254 }
255 }
256
257 fn bbox(self) -> BBox {
258 BBox {
259 x: self.e,
260 y: self.f,
261 width: self.a.abs(),
262 height: self.d.abs(),
263 }
264 }
265}
266
/// Intermediate result shared by `extract_pdf` and `extract_pdf_spans`.
struct ParsedPdf {
    /// One extraction per discovered page, in page order.
    page_extractions: Vec<PageExtraction>,
    /// Document-level warnings (encryption, object streams).
    document_warnings: Vec<crate::ir::Warning>,
    /// Value of the `Title` info string, if one was found.
    title: Option<String>,
    /// Whether the file contains an `/Encrypt` name.
    encrypted: bool,
}
275
276pub fn extract_pdf(bytes: &[u8], source: &Source, engine_name: &str) -> Result<Document> {
277 let parsed = parse_pdf_pages(bytes)?;
278 let ParsedPdf {
279 page_extractions,
280 document_warnings,
281 title,
282 encrypted,
283 } = parsed;
284
285 let mut pages = Vec::with_capacity(page_extractions.len());
286 let mut all_text = String::new();
287 let mut assets = Vec::new();
288
289 for extraction in page_extractions {
290 all_text.push_str(&extraction.text);
291 all_text.push('\n');
292 assets.extend(extraction.page.assets.clone());
293 pages.push(extraction.page);
294 }
295
296 Ok(Document {
297 schema_version: SCHEMA_VERSION.to_owned(),
298 metadata: Metadata {
299 format: "pdf".to_owned(),
300 engine: engine_name.to_owned(),
301 source: source.path.clone(),
302 title,
303 character_count: all_text.chars().count(),
304 word_count: all_text.split_whitespace().count(),
305 block_count: pages.iter().map(|page| page.blocks.len()).sum(),
306 file_size_bytes: Some(bytes.len() as u64),
307 pdf_version: pdf_version(bytes),
308 encrypted,
309 },
310 pages,
311 assets,
312 warnings: document_warnings,
313 })
314}
315
316pub fn extract_pdf_spans(bytes: &[u8]) -> Result<Vec<PageSpans>> {
320 let parsed = parse_pdf_pages(bytes)?;
321 Ok(parsed
322 .page_extractions
323 .into_iter()
324 .map(|e| PageSpans {
325 page_number: e.page.number,
326 width: e.page.width.unwrap_or(0.0),
327 height: e.page.height.unwrap_or(0.0),
328 spans: e.spans,
329 })
330 .collect())
331}
332
333fn parse_pdf_pages(bytes: &[u8]) -> Result<ParsedPdf> {
334 if !bytes.starts_with(b"%PDF-") {
335 return Err(DonglerError::pdf("missing %PDF header"));
336 }
337
338 let mut objects = parse_indirect_objects(bytes);
339 expand_object_streams(&mut objects);
340 if objects.is_empty() {
341 return Err(DonglerError::pdf("no indirect objects found"));
342 }
343
344 let title = extract_info_string(&objects, "Title");
349 let objects: Vec<Arc<PdfObject>> = objects.into_iter().map(Arc::new).collect();
350 let object_map: HashMap<u32, Arc<PdfObject>> = objects
351 .iter()
352 .map(|object| (object.object_number, Arc::clone(object)))
353 .collect();
354 let page_seeds = objects
355 .iter()
356 .filter_map(|object| page_seed(object.as_ref(), &object_map))
357 .enumerate()
358 .map(|(index, mut seed)| {
359 seed.number = index + 1;
360 seed
361 })
362 .collect::<Vec<_>>();
363
364 if page_seeds.is_empty() {
365 return Err(DonglerError::pdf("no page objects found"));
366 }
367
368 let mut document_warnings = Vec::new();
369 let encrypted = contains_name(bytes, b"/Encrypt");
370 if encrypted {
371 document_warnings.push(warning(
372 "pdf.encrypted",
373 "warning",
374 "document declares encryption; extraction may be incomplete",
375 None,
376 ));
377 }
378 if contains_name(bytes, b"/ObjStm") {
379 document_warnings.push(warning(
380 "pdf.object_stream",
381 "info",
382 "object streams detected and expanded by the native scanner",
383 None,
384 ));
385 }
386
387 let mut font_object_numbers: Vec<u32> = page_seeds
391 .iter()
392 .flat_map(|seed| {
393 let resource_body = resolve_resource_body(&seed.body, &object_map);
394 let resource_text = resource_body.as_deref().unwrap_or(&seed.body);
395 resolve_named_resource_refs(resource_text, "/Font", &object_map)
396 .into_values()
397 .collect::<Vec<_>>()
398 })
399 .collect();
400 font_object_numbers.sort_unstable();
401 font_object_numbers.dedup();
402 let decode_font = |number: u32| {
403 object_map
404 .get(&number)
405 .map(|font| (number, Arc::new(font_decoder(font.as_ref(), &object_map))))
406 };
407 #[cfg(feature = "parallel")]
408 let font_cache: HashMap<u32, Arc<FontDecoder>> = font_object_numbers
409 .into_par_iter()
410 .filter_map(decode_font)
411 .collect();
412 #[cfg(not(feature = "parallel"))]
413 let font_cache: HashMap<u32, Arc<FontDecoder>> = font_object_numbers
414 .into_iter()
415 .filter_map(decode_font)
416 .collect();
417
418 let extract_one = |seed: &PageSeed| extract_page(seed, &object_map, &font_cache);
419 #[cfg(feature = "parallel")]
420 let page_extractions = page_seeds.par_iter().map(extract_one).collect::<Vec<_>>();
421 #[cfg(not(feature = "parallel"))]
422 let page_extractions = page_seeds.iter().map(extract_one).collect::<Vec<_>>();
423
424 Ok(ParsedPdf {
425 page_extractions,
426 document_warnings,
427 title,
428 encrypted,
429 })
430}
431
432fn extract_page(
433 seed: &PageSeed,
434 object_map: &HashMap<u32, Arc<PdfObject>>,
435 font_cache: &HashMap<u32, Arc<FontDecoder>>,
436) -> PageExtraction {
437 let media_box = parse_number_array_after(&seed.body, "/MediaBox")
438 .unwrap_or_else(|| vec![0.0, 0.0, 612.0, 792.0]);
439 let width =
440 media_box.get(2).copied().unwrap_or(612.0) - media_box.first().copied().unwrap_or(0.0);
441 let height =
442 media_box.get(3).copied().unwrap_or(792.0) - media_box.get(1).copied().unwrap_or(0.0);
443 let rotation = parse_number_after(&seed.body, "/Rotate").map(|value| value as i32);
444 let contents = parse_refs_after_key(&seed.body, "/Contents");
445 let resource_body = resolve_resource_body(&seed.body, object_map);
446 let resource_text = resource_body.as_deref().unwrap_or(&seed.body);
447 let xobjects = resolve_named_resource_refs(resource_text, "/XObject", object_map);
448 let fonts = load_font_decoders(resource_text, object_map, font_cache);
449
450 let mut warnings = Vec::new();
451 let mut extraction = ContentExtraction {
452 text_runs: Vec::new(),
453 edges: Vec::new(),
454 images: Vec::new(),
455 assets: Vec::new(),
456 warnings: Vec::new(),
457 };
458
459 for content_ref in contents {
460 match object_map
461 .get(&(content_ref as u32))
462 .map(|object| decode_stream_object(object.as_ref()))
463 {
464 Some(Ok(Some(stream))) => {
465 let object_id = format!("{content_ref} 0 R");
466 let mut content = interpret_content_stream(
467 &stream,
468 seed.number,
469 &[object_id],
470 &xobjects,
471 &fonts,
472 object_map,
473 );
474 extraction.text_runs.append(&mut content.text_runs);
475 extraction.edges.append(&mut content.edges);
476 extraction.images.append(&mut content.images);
477 extraction.assets.append(&mut content.assets);
478 extraction.warnings.append(&mut content.warnings);
479 }
480 Some(Ok(None)) | None => warnings.push(warning(
481 "pdf.missing_content",
482 "warning",
483 "page content stream is missing",
484 Some(seed.number),
485 )),
486 Some(Err(error)) => warnings.push(warning(
487 "pdf.stream_decode",
488 "warning",
489 &error.to_string(),
490 Some(seed.number),
491 )),
492 }
493 }
494
495 warnings.append(&mut extraction.warnings);
496
497 let normalized_rotation = rotation.map(|value| value.rem_euclid(360)).unwrap_or(0);
500 if normalized_rotation != 0 {
501 for run in &mut extraction.text_runs {
502 run.bbox = rotate_bbox(run.bbox, normalized_rotation, width, height);
503 }
504 for image in &mut extraction.images {
505 if let Some(bbox) = image.bbox {
506 image.bbox = Some(rotate_bbox(bbox, normalized_rotation, width, height));
507 }
508 }
509 for edge in &mut extraction.edges {
510 let (x0, y0) = rotate_point(edge.x0, edge.y0, normalized_rotation, width, height);
511 let (x1, y1) = rotate_point(edge.x1, edge.y1, normalized_rotation, width, height);
512 edge.x0 = x0;
513 edge.y0 = y0;
514 edge.x1 = x1;
515 edge.y1 = y1;
516 }
517 }
518 let (page_width, page_height) = if matches!(normalized_rotation, 90 | 270) {
519 (height, width)
520 } else {
521 (width, height)
522 };
523 let (page_x, page_y) = if normalized_rotation == 0 {
524 (
525 media_box.first().copied().unwrap_or(0.0),
526 media_box.get(1).copied().unwrap_or(0.0),
527 )
528 } else {
529 (0.0, 0.0)
530 };
531
532 let lines = group_text_runs(extraction.text_runs);
533
534 let spans: Vec<SpanGeom> = lines
538 .iter()
539 .flat_map(|line| line.runs.iter())
540 .filter(|run| !run.text.trim().is_empty())
541 .map(|run| SpanGeom {
542 bbox: run.bbox,
543 text: run.text.clone(),
544 })
545 .collect();
546
547 let mut blocks = build_blocks(seed.number, &lines, &extraction.edges);
548 if blocks.is_empty() && !extraction.images.is_empty() {
549 blocks.extend(image_figure_blocks(seed.number, &extraction.images));
550 }
551 let text = blocks
552 .iter()
553 .map(block_text)
554 .filter(|text| !text.is_empty())
555 .collect::<Vec<_>>()
556 .join("\n");
557
558 let page = Page {
559 number: seed.number,
560 width: Some(page_width),
561 height: Some(page_height),
562 rotation,
563 bbox: Some(BBox {
564 x: page_x,
565 y: page_y,
566 width: page_width,
567 height: page_height,
568 }),
569 blocks,
570 images: extraction.images,
571 assets: extraction.assets,
572 warnings, ..Default::default()
573 };
574
575 PageExtraction { page, text, spans }
576}
577
578fn interpret_content_stream(
579 bytes: &[u8],
580 page_number: usize,
581 source_object_ids: &[String],
582 xobjects: &HashMap<String, u32>,
583 fonts: &HashMap<String, Arc<FontDecoder>>,
584 object_map: &HashMap<u32, Arc<PdfObject>>,
585) -> ContentExtraction {
586 let mut state = GraphicsState::default();
587 let mut graphics_stack = Vec::new();
588 let mut current_path_point: Option<(f32, f32)> = None;
589 let mut pending_edges = Vec::new();
590 let mut extraction = ContentExtraction {
591 text_runs: Vec::new(),
592 edges: Vec::new(),
593 images: Vec::new(),
594 assets: Vec::new(),
595 warnings: Vec::new(),
596 };
597
598 for op in parse_content_ops(bytes) {
599 match op.operator.as_str() {
600 "q" => graphics_stack.push(state.clone()),
601 "Q" => {
602 if let Some(previous) = graphics_stack.pop() {
603 state = previous;
604 }
605 }
606 "cm" => {
607 if let Some(values) = numbers(&op.operands, 6) {
608 state.ctm = state.ctm.multiply(Matrix {
609 a: values[0],
610 b: values[1],
611 c: values[2],
612 d: values[3],
613 e: values[4],
614 f: values[5],
615 });
616 }
617 }
618 "BT" => {
619 state.text_matrix = Matrix::identity();
620 state.line_matrix = Matrix::identity();
621 }
622 "Tf" => {
623 if let [Operand::Name(name), Operand::Number(size)] = op.operands.as_slice() {
624 state.font_name = Some(name.clone());
625 state.font_size = *size;
626 state.leading = *size * 1.2;
627 }
628 }
629 "Tc" => {
630 if let Some(values) = numbers(&op.operands, 1) {
631 state.char_spacing = values[0];
632 }
633 }
634 "Tw" => {
635 if let Some(values) = numbers(&op.operands, 1) {
636 state.word_spacing = values[0];
637 }
638 }
639 "Tz" => {
640 if let Some(values) = numbers(&op.operands, 1) {
641 state.horizontal_scaling = (values[0] / 100.0).max(0.01);
642 }
643 }
644 "TL" => {
645 if let Some(values) = numbers(&op.operands, 1) {
646 state.leading = values[0];
647 }
648 }
649 "Ts" => {
650 if let Some(values) = numbers(&op.operands, 1) {
651 state.text_rise = values[0];
652 }
653 }
654 "Td" | "TD" => {
655 if let Some(values) = numbers(&op.operands, 2) {
656 let next_line = state.line_matrix.translate(values[0], values[1]);
657 state.line_matrix = next_line;
658 state.text_matrix = next_line;
659 if op.operator == "TD" {
660 state.leading = -values[1];
661 }
662 }
663 }
664 "Tm" => {
665 if let Some(values) = numbers(&op.operands, 6) {
666 let matrix = Matrix {
667 a: values[0],
668 b: values[1],
669 c: values[2],
670 d: values[3],
671 e: values[4],
672 f: values[5],
673 };
674 state.line_matrix = matrix;
675 state.text_matrix = matrix;
676 }
677 }
678 "T*" => {
679 move_to_next_text_line(&mut state);
680 }
681 "Tj" => {
682 if let Some(text) = first_text_operand(&op.operands, &state, fonts) {
683 push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
684 }
685 }
686 "TJ" => {
687 if let Some(Operand::Array(items)) = op.operands.first() {
688 let text = text_from_array(items, &state, fonts);
689 push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
690 }
691 }
692 "'" => {
693 move_to_next_text_line(&mut state);
694 if let Some(text) = first_text_operand(&op.operands, &state, fonts) {
695 push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
696 }
697 }
698 "\"" => {
699 if let [Operand::Number(word_spacing), Operand::Number(char_spacing), ..] =
700 op.operands.as_slice()
701 {
702 state.word_spacing = *word_spacing;
703 state.char_spacing = *char_spacing;
704 }
705 move_to_next_text_line(&mut state);
706 if let Some(text) = op
707 .operands
708 .last()
709 .and_then(|operand| operand_text(operand, &state, fonts))
710 {
711 push_text_run(&mut extraction, &mut state, source_object_ids, text, fonts);
712 }
713 }
714 "Do" => {
715 if let Some(Operand::Name(name)) = op.operands.first() {
716 if let Some(object_number) = xobjects.get(name) {
717 if let Some(object) = object_map.get(object_number) {
718 let object_body = lossy(&object.body);
719 if object_body.contains("/Subtype /Image") {
720 let bbox = state.ctm.bbox();
721 let id = format!("image-{}-{name}", page_number);
722 let object_id = Some(format!(
723 "{} {} R",
724 object.object_number, object.generation
725 ));
726 let width = parse_number_after(&object_body, "/Width")
727 .map(|value| value as u32);
728 let height = parse_number_after(&object_body, "/Height")
729 .map(|value| value as u32);
730
731 extraction.images.push(ImageObject {
732 id: id.clone(),
733 object_id: object_id.clone(),
734 bbox: Some(bbox),
735 width,
736 height,
737 });
738 extraction.assets.push(Asset {
739 id,
740 kind: "image".to_owned(),
741 object_id,
742 bbox: Some(bbox),
743 width,
744 height,
745 });
746 }
747 }
748 }
749 }
750 }
751 "m" => {
752 if let Some(values) = numbers(&op.operands, 2) {
753 current_path_point = Some((values[0], values[1]));
754 }
755 }
756 "l" => {
757 if let (Some(start), Some(values)) = (current_path_point, numbers(&op.operands, 2))
758 {
759 let end = (values[0], values[1]);
760 pending_edges.push(graphic_edge_from_points(state.ctm, start, end));
761 current_path_point = Some(end);
762 }
763 }
764 "re" => {
765 if let Some(values) = numbers(&op.operands, 4) {
766 pending_edges.extend(graphic_edges_from_rect(
767 state.ctm, values[0], values[1], values[2], values[3],
768 ));
769 current_path_point = Some((values[0], values[1]));
770 }
771 }
772 "S" | "s" => {
773 extraction.edges.append(&mut pending_edges);
774 current_path_point = None;
775 }
776 "n" => {
777 pending_edges.clear();
778 current_path_point = None;
779 }
780 _ => {}
781 }
782 }
783
784 extraction
785}
786
787fn graphic_edge_from_points(matrix: Matrix, start: (f32, f32), end: (f32, f32)) -> GraphicEdge {
788 let (x0, y0) = matrix.point(start.0, start.1);
789 let (x1, y1) = matrix.point(end.0, end.1);
790 GraphicEdge { x0, y0, x1, y1 }
791}
792
793fn graphic_edges_from_rect(
794 matrix: Matrix,
795 x: f32,
796 y: f32,
797 width: f32,
798 height: f32,
799) -> Vec<GraphicEdge> {
800 let right = x + width;
801 let top = y + height;
802 vec![
803 graphic_edge_from_points(matrix, (x, y), (right, y)),
804 graphic_edge_from_points(matrix, (right, y), (right, top)),
805 graphic_edge_from_points(matrix, (right, top), (x, top)),
806 graphic_edge_from_points(matrix, (x, top), (x, y)),
807 ]
808}
809
810fn move_to_next_text_line(state: &mut GraphicsState) {
811 let next_line = state.line_matrix.translate(0.0, -state.leading);
812 state.line_matrix = next_line;
813 state.text_matrix = next_line;
814}
815
816fn push_text_run(
817 extraction: &mut ContentExtraction,
818 state: &mut GraphicsState,
819 source_object_ids: &[String],
820 text: String,
821 fonts: &HashMap<String, Arc<FontDecoder>>,
822) {
823 let advance = text_advance_width(&text, state, fonts);
824 if text.trim().is_empty() {
825 state.text_matrix = state.text_matrix.translate(advance, 0.0);
826 return;
827 }
828
829 let font = state.font_name.as_ref().and_then(|name| fonts.get(name));
830 let (bold, italic) = font
831 .map(|font| (font.bold, font.italic))
832 .unwrap_or((false, false));
833 let (ascent, descent) = font
834 .map(|font| (font.ascent, font.descent))
835 .unwrap_or((0.75, -0.25));
836 let bbox = text_run_bbox(state, advance, ascent, descent);
837 let (base_x, base_y) = state.text_matrix.point(0.0, state.text_rise);
838 let (_, baseline_y) = state.ctm.point(base_x, base_y);
839 let space_width = space_advance_width(state, fonts);
840 extraction.text_runs.push(TextRun {
841 text,
842 bbox,
843 baseline_y,
844 font: state.font_name.clone(),
845 size: state.font_size,
846 space_width,
847 bold,
848 italic,
849 source_object_ids: source_object_ids.to_vec(),
850 });
851 state.text_matrix = state.text_matrix.translate(advance, 0.0);
852}
853
854fn text_advance_width(
855 text: &str,
856 state: &GraphicsState,
857 fonts: &HashMap<String, Arc<FontDecoder>>,
858) -> f32 {
859 let glyphs = text.chars().count() as f32;
860 if glyphs == 0.0 {
861 return 0.0;
862 }
863 let spaces = text.chars().filter(|character| *character == ' ').count() as f32;
864 let font = state
865 .font_name
866 .as_ref()
867 .and_then(|font_name| fonts.get(font_name));
868 let base = text
869 .chars()
870 .map(|character| {
871 font.and_then(|font| font.widths.get(&character).copied())
872 .unwrap_or_else(|| default_glyph_width(character))
873 / 1000.0
874 * state.font_size
875 })
876 .sum::<f32>();
877 let spacing = glyphs * state.char_spacing + spaces * state.word_spacing;
878 ((base + spacing) * state.horizontal_scaling).max(0.0)
879}
880
/// Fallback glyph width, in thousandths of the font size, for characters the
/// font provides no width for.
///
/// Characters fall into coarse classes: narrow (250), medium-narrow (333),
/// extra wide (850), digits (556), capitals and a few symbols (650), and
/// everything else (500). Earlier classes win, so `I`, `M` and `W` are not
/// treated as ordinary capitals.
fn default_glyph_width(character: char) -> f32 {
    const NARROW: &[char] = &[
        ' ', '!', ',', '.', '/', ':', ';', 'I', '[', '\\', ']', 'i', 'j', 'l', '|', '\'',
    ];
    const MEDIUM_NARROW: &[char] = &['"', '(', ')', '*', '`', '-', 'f', 'r', 't', '{', '}'];
    const EXTRA_WIDE: &[char] = &['m', 'M', 'W', 'w', '@'];
    const WIDE_SYMBOLS: &[char] = &['$', '+', '<', '=', '>', '?', '_', '~'];

    if NARROW.contains(&character) {
        return 250.0;
    }
    if MEDIUM_NARROW.contains(&character) {
        return 333.0;
    }
    if EXTRA_WIDE.contains(&character) {
        return 850.0;
    }
    if character.is_ascii_digit() {
        return 556.0;
    }
    if character.is_ascii_uppercase() || WIDE_SYMBOLS.contains(&character) {
        return 650.0;
    }
    500.0
}
897
898fn space_advance_width(state: &GraphicsState, fonts: &HashMap<String, Arc<FontDecoder>>) -> f32 {
902 let from_font = state
903 .font_name
904 .as_ref()
905 .and_then(|font_name| fonts.get(font_name))
906 .and_then(|font| font.widths.get(&' ').copied())
907 .filter(|width| *width > 0.0)
908 .map(|width| width / 1000.0 * state.font_size);
909 let width = from_font.unwrap_or_else(|| default_glyph_width(' ') / 1000.0 * state.font_size);
910 (width * state.horizontal_scaling).max(0.0)
911}
912
913fn text_run_bbox(state: &GraphicsState, advance: f32, ascent: f32, descent: f32) -> BBox {
914 let bottom = state.text_rise + descent * state.font_size;
918 let top = state.text_rise + ascent * state.font_size;
919 let corners = [
920 (0.0, bottom),
921 (advance, bottom),
922 (0.0, top),
923 (advance, top),
924 ];
925 let points = corners
926 .into_iter()
927 .map(|(x, y)| {
928 let (text_x, text_y) = state.text_matrix.point(x, y);
929 state.ctm.point(text_x, text_y)
930 })
931 .collect::<Vec<_>>();
932 let min_x = points.iter().map(|(x, _)| *x).fold(f32::INFINITY, f32::min);
933 let min_y = points.iter().map(|(_, y)| *y).fold(f32::INFINITY, f32::min);
934 let max_x = points
935 .iter()
936 .map(|(x, _)| *x)
937 .fold(f32::NEG_INFINITY, f32::max);
938 let max_y = points
939 .iter()
940 .map(|(_, y)| *y)
941 .fold(f32::NEG_INFINITY, f32::max);
942 BBox {
943 x: min_x,
944 y: min_y,
945 width: (max_x - min_x).max(state.font_size * 0.25),
946 height: (max_y - min_y).max(state.font_size * 0.25),
947 }
948}
949
950fn build_blocks(page_number: usize, lines: &[TextLine], edges: &[GraphicEdge]) -> Vec<Block> {
951 let body_size = page_body_size(lines);
952 let tables = detect_page_tables(page_number, lines, edges);
953
954 if tables.is_empty() {
955 let split_lines = split_wide_text_lines(lines);
956 let text_blocks = text_lines_in_reading_order(&split_lines)
957 .into_iter()
958 .filter_map(|line| text_block_from_line(page_number, line, body_size))
959 .collect::<Vec<_>>();
960 return merge_wrapped_text_blocks(text_blocks)
961 .into_iter()
962 .map(Block::Text)
963 .collect();
964 }
965
966 build_blocks_with_tables(page_number, lines, tables, body_size)
967}
968
969fn detect_page_tables(
975 page_number: usize,
976 lines: &[TextLine],
977 edges: &[GraphicEdge],
978) -> Vec<DetectedTable> {
979 let mut tables: Vec<DetectedTable> = Vec::new();
980 let mut consumed = vec![false; lines.len()];
981 while tables.len() < 8 {
984 let mapping: Vec<usize> = (0..lines.len()).filter(|&index| !consumed[index]).collect();
985 if mapping.len() < 2 {
986 break;
987 }
988 let subset: Vec<TextLine> = mapping.iter().map(|&index| lines[index].clone()).collect();
989 let Some(mut detected) = detect_table(page_number, &subset, edges) else {
990 break;
991 };
992 let original: Vec<usize> = detected
994 .line_indices
995 .iter()
996 .filter_map(|&subset_index| mapping.get(subset_index).copied())
997 .collect();
998 if original.is_empty() {
999 break;
1000 }
1001 for &index in &original {
1002 consumed[index] = true;
1003 }
1004 detected.line_indices = original;
1005 tables.push(detected);
1006 }
1007 tables
1008}
1009
1010fn build_blocks_with_tables(
1011 page_number: usize,
1012 lines: &[TextLine],
1013 mut tables: Vec<DetectedTable>,
1014 body_size: f32,
1015) -> Vec<Block> {
1016 let mut consumed = vec![false; lines.len()];
1017 for table in &tables {
1018 for &index in &table.line_indices {
1019 if let Some(slot) = consumed.get_mut(index) {
1020 *slot = true;
1021 }
1022 }
1023 }
1024 let remaining_lines = lines
1025 .iter()
1026 .enumerate()
1027 .filter(|(line_index, _)| !consumed[*line_index])
1028 .map(|(_, line)| line.clone())
1029 .collect::<Vec<_>>();
1030 let split_lines = split_wide_text_lines(&remaining_lines);
1031 let text_blocks = merge_wrapped_text_blocks(
1032 text_lines_in_reading_order(&split_lines)
1033 .into_iter()
1034 .filter_map(|line| text_block_from_line(page_number, line, body_size))
1035 .collect(),
1036 );
1037
1038 let table_top = |table: &DetectedTable| {
1043 table
1044 .table
1045 .bbox
1046 .map(|bbox| bbox.y + bbox.height)
1047 .unwrap_or(f32::NEG_INFINITY)
1048 };
1049 tables.sort_by(|left, right| table_top(right).total_cmp(&table_top(left)));
1050
1051 let mut blocks = Vec::new();
1052 let mut next_table = 0usize;
1053 for text_block in text_blocks {
1054 let block_top = text_block
1055 .bbox
1056 .map(|bbox| bbox.y + bbox.height)
1057 .unwrap_or(f32::NEG_INFINITY);
1058 while next_table < tables.len() && table_top(&tables[next_table]) > block_top {
1059 blocks.push(Block::Table(tables[next_table].table.clone()));
1060 next_table += 1;
1061 }
1062 blocks.push(Block::Text(text_block));
1063 }
1064 for table in tables.into_iter().skip(next_table) {
1065 blocks.push(Block::Table(table.table));
1066 }
1067
1068 blocks
1069}
1070
1071fn image_figure_blocks(page_number: usize, images: &[ImageObject]) -> Vec<Block> {
1072 images
1073 .iter()
1074 .map(|image| {
1075 Block::Figure(FigureBlock {
1076 alt_text: Some(format!("Image {}", image.id)),
1077 caption: None,
1078 bbox: image.bbox,
1079 image_ref: Some(image.id.clone()),
1080 source_anchors: vec![anchor(
1081 page_number,
1082 image.bbox,
1083 image.object_id.clone().into_iter().collect(),
1084 )],
1085 confidence: Some(Confidence {
1086 score: 0.6,
1087 calibrated: false,
1088 }), ..Default::default()
1089 })
1090 })
1091 .collect()
1092}
1093
1094fn split_wide_text_lines(lines: &[TextLine]) -> Vec<TextLine> {
1095 let enable_tight_column_band = has_repeated_tight_column_band_evidence(lines);
1096 let mut split_lines = Vec::new();
1097 for line in lines {
1098 match split_text_line_at_wide_gap(line, enable_tight_column_band) {
1099 Some((left, right)) => {
1100 split_lines.push(left);
1101 split_lines.push(right);
1102 }
1103 None => split_lines.push(line.clone()),
1104 }
1105 }
1106 split_lines
1107}
1108
1109fn line_runs_x_sorted(runs: &[TextRun]) -> bool {
1111 runs.windows(2).all(|pair| pair[0].bbox.x <= pair[1].bbox.x)
1112}
1113
1114fn runs_sorted_by_x(line: &TextLine) -> Cow<'_, [TextRun]> {
1119 if line_runs_x_sorted(&line.runs) {
1120 Cow::Borrowed(&line.runs)
1121 } else {
1122 let mut runs = line.runs.clone();
1123 runs.sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
1124 Cow::Owned(runs)
1125 }
1126}
1127
1128fn split_text_line_at_wide_gap(
1129 line: &TextLine,
1130 enable_tight_column_band: bool,
1131) -> Option<(TextLine, TextLine)> {
1132 if line.runs.len() < 2 {
1133 return None;
1134 }
1135 let runs = runs_sorted_by_x(line);
1136 let contains_math = runs
1137 .iter()
1138 .any(|run| looks_like_pdf_math_notation(&normalize_pdf_token(&run.text)));
1139 let tight_column_split_index = enable_tight_column_band
1140 .then(|| tight_column_band_split_index_for_runs(&runs[..]))
1141 .flatten();
1142 let largest_gap_split = largest_run_gap(&runs[..]);
1143 if contains_math && tight_column_split_index.is_none() {
1144 return None;
1145 }
1146 let split_index = match (tight_column_split_index, largest_gap_split) {
1147 (Some(tight_index), Some((wide_index, gap, x_jump)))
1148 if prefers_wide_gap_before_tight_band(&runs[..], wide_index, tight_index, gap, x_jump) =>
1149 {
1150 wide_index
1151 }
1152 (Some(tight_index), _) => tight_index,
1153 (None, Some((wide_index, _, _))) => wide_index,
1154 (None, None) => return None,
1155 };
1156 let left_runs = runs[..split_index].to_vec();
1157 let right_runs = runs[split_index..].to_vec();
1158 if left_runs.is_empty() || right_runs.is_empty() {
1159 return None;
1160 }
1161 let right_value_cells = right_runs
1176 .iter()
1177 .filter(|run| is_numeric_value(&run.text))
1178 .count();
1179 let right_all_figures = right_runs.iter().all(|run| {
1180 let text = run.text.trim();
1181 text.is_empty()
1182 || is_value_cell(text)
1183 || matches!(text, "$" | "€" | "£" | "¥" | "(" | ")" | "($")
1184 });
1185 let leader_gap = right_runs.first().map_or(0.0, |run| run.bbox.x)
1186 - left_runs
1187 .last()
1188 .map_or(0.0, |run| run.bbox.x + run.bbox.width);
1189 if right_value_cells >= 3 && right_all_figures && leader_gap >= 100.0 {
1190 return None;
1191 }
1192 Some((
1193 text_line_from_runs(left_runs)?,
1194 text_line_from_runs(right_runs)?,
1195 ))
1196}
1197
1198fn has_repeated_tight_column_band_evidence(lines: &[TextLine]) -> bool {
1199 lines
1200 .iter()
1201 .filter(|line| {
1202 let runs = runs_sorted_by_x(line);
1203 tight_column_band_split_index_for_runs(&runs[..]).is_some()
1204 })
1205 .take(2)
1206 .count()
1207 >= 2
1208}
1209
1210fn tight_column_band_split_index_for_runs(runs: &[TextRun]) -> Option<usize> {
1211 let split_index = right_column_band_split_index(runs)?;
1212 let contains_math = runs
1213 .iter()
1214 .any(|run| looks_like_pdf_math_notation(&normalize_pdf_token(&run.text)));
1215 if contains_math && !allows_math_column_split(&runs[..split_index]) {
1216 return None;
1217 }
1218 Some(split_index)
1219}
1220
1221fn right_column_band_split_index(runs: &[TextRun]) -> Option<usize> {
1222 if runs.len() < 3 || runs.first()?.bbox.x > 120.0 {
1223 return None;
1224 }
1225
1226 for index in 1..runs.len() {
1227 if index < 2 {
1228 continue;
1229 }
1230 let algorithm_like_left = allows_math_column_split(&runs[..index]);
1231 let right_x = runs[index].bbox.x;
1232 let in_standard_column_band = (300.0..=340.0).contains(&right_x);
1233 let in_algorithm_column_band = algorithm_like_left && (280.0..=340.0).contains(&right_x);
1234 if !in_standard_column_band && !in_algorithm_column_band {
1235 continue;
1236 }
1237 if runs.len() - index < 2 && !algorithm_like_left {
1238 continue;
1239 }
1240
1241 let previous = &runs[index - 1].bbox;
1242 let gap = right_x - (previous.x + previous.width);
1243 if gap < -35.0 {
1244 continue;
1245 }
1246
1247 let right_text_len = runs[index..]
1248 .iter()
1249 .map(|run| run.text.trim().len())
1250 .sum::<usize>();
1251 if right_text_len < 18 {
1252 continue;
1253 }
1254
1255 return Some(index);
1256 }
1257
1258 None
1259}
1260
1261fn allows_math_column_split(left_runs: &[TextRun]) -> bool {
1262 let text = left_runs
1263 .iter()
1264 .map(|run| run.text.trim())
1265 .filter(|text| !text.is_empty())
1266 .collect::<Vec<_>>()
1267 .join(" ");
1268 let trimmed = text.trim_start();
1269 starts_with_numbered_step(trimmed)
1270 || trimmed.starts_with("Require:")
1271 || trimmed.starts_with("Ensure:")
1272 || trimmed.starts_with("Algorithm ")
1273}
1274
1275fn largest_run_gap(runs: &[TextRun]) -> Option<(usize, f32, f32)> {
1276 runs.windows(2)
1277 .enumerate()
1278 .filter_map(|(index, window)| {
1279 let left = &window[0].bbox;
1280 let right = &window[1].bbox;
1281 let gap = right.x - (left.x + left.width);
1282 let x_jump = right.x - left.x;
1283 is_likely_column_split_gap(&window[0].bbox, &window[1].bbox, gap, x_jump).then_some((
1284 index + 1,
1285 gap,
1286 x_jump,
1287 ))
1288 })
1289 .max_by(|left, right| left.1.max(left.2).total_cmp(&right.1.max(right.2)))
1290}
1291
1292fn is_likely_column_split_gap(left: &BBox, right: &BBox, gap: f32, x_jump: f32) -> bool {
1293 if gap >= 18.0 {
1294 return true;
1295 }
1296
1297 x_jump >= 110.0 && left.x < 280.0 && right.x > 280.0
1298}
1299
1300fn column_gutter_is_clear(lines: &[TextLine], midpoint: f32, min_y: f32, max_y: f32) -> bool {
1306 let band = 4.0;
1307 let mut region = 0usize;
1308 let mut crossing = 0usize;
1309 for line in lines {
1310 if line.bbox.y < min_y - line.bbox.height || line.bbox.y > max_y + line.bbox.height {
1311 continue;
1312 }
1313 region += 1;
1314 if line.bbox.x < midpoint - band && line.bbox.x + line.bbox.width > midpoint + band {
1315 crossing += 1;
1316 }
1317 }
1318 region == 0 || (crossing as f32) <= (region as f32) * 0.25
1319}
1320
1321fn text_line_from_runs(runs: Vec<TextRun>) -> Option<TextLine> {
1322 let bbox = union_boxes(runs.iter().map(|run| run.bbox))?;
1323 let baseline_y = runs.iter().map(|run| run.baseline_y).sum::<f32>() / runs.len() as f32;
1324 Some(TextLine {
1325 runs,
1326 bbox,
1327 baseline_y,
1328 })
1329}
1330
1331fn prefers_wide_gap_before_tight_band(
1332 runs: &[TextRun],
1333 wide_index: usize,
1334 tight_index: usize,
1335 gap: f32,
1336 x_jump: f32,
1337) -> bool {
1338 if wide_index == 0 || wide_index >= tight_index || tight_index > runs.len() {
1339 return false;
1340 }
1341
1342 let left = &runs[wide_index - 1].bbox;
1343 let right = &runs[wide_index].bbox;
1344 let stranded_right_glyphs = runs[wide_index..tight_index]
1345 .iter()
1346 .all(|run| run.bbox.x >= 280.0 && run.text.trim().chars().count() <= 2);
1347
1348 stranded_right_glyphs && left.x < 280.0 && right.x >= 280.0 && x_jump >= 110.0 && gap >= -160.0
1349}
1350
1351fn text_lines_in_reading_order(lines: &[TextLine]) -> Vec<&TextLine> {
1352 if let Some(layout) = detect_paired_text_columns(lines) {
1353 return order_column_layout(layout);
1354 }
1355 if let Some(mut columns) = detect_text_columns(lines) {
1356 columns.sort_by(|left, right| column_x(left).total_cmp(&column_x(right)));
1357 return columns
1358 .into_iter()
1359 .flat_map(|mut column| {
1360 column.sort_by(|left, right| {
1361 right
1362 .bbox
1363 .y
1364 .total_cmp(&left.bbox.y)
1365 .then(left.bbox.x.total_cmp(&right.bbox.x))
1366 });
1367 column
1368 })
1369 .collect();
1370 }
1371 lines.iter().collect()
1372}
1373
1374fn order_column_layout(mut layout: ColumnLayout<'_>) -> Vec<&TextLine> {
1375 let mut ordered = Vec::new();
1376 sort_lines_top_down(&mut layout.leading);
1377 ordered.extend(layout.leading);
1378 layout
1379 .columns
1380 .sort_by(|left, right| column_x(left).total_cmp(&column_x(right)));
1381 for mut column in layout.columns {
1382 sort_lines_top_down(&mut column);
1383 ordered.extend(column);
1384 }
1385 sort_lines_top_down(&mut layout.trailing);
1386 ordered.extend(layout.trailing);
1387 ordered
1388}
1389
1390fn sort_lines_top_down(lines: &mut [&TextLine]) {
1391 lines.sort_by(|left, right| {
1392 right
1393 .bbox
1394 .y
1395 .total_cmp(&left.bbox.y)
1396 .then(left.bbox.x.total_cmp(&right.bbox.x))
1397 });
1398}
1399
/// Detects a two-column layout from pairs of lines that sit side by side.
///
/// Returns a `ColumnLayout` with `leading`, two `columns` (left, right) and
/// `trailing` line groups, or `None` when fewer than four lines are given or
/// any of the evidence checks below fails.
fn detect_paired_text_columns(lines: &[TextLine]) -> Option<ColumnLayout<'_>> {
    if lines.len() < 4 {
        return None;
    }

    // Seed discovery: consider every ordered pair where `left` starts strictly
    // left of `right`, both share roughly the same y, and the horizontal
    // separation passes the column-gap test.
    let mut left_seed_indices = Vec::new();
    let mut right_seed_indices = Vec::new();
    for (left_index, left) in lines.iter().enumerate() {
        for (right_index, right) in lines.iter().enumerate() {
            if left_index == right_index || left.bbox.x >= right.bbox.x {
                continue;
            }
            if (left.bbox.y - right.bbox.y).abs() > column_pair_y_tolerance(left, right) {
                continue;
            }
            let gap = right.bbox.x - (left.bbox.x + left.bbox.width);
            let x_jump = right.bbox.x - left.bbox.x;
            if !is_likely_column_split_gap(&left.bbox, &right.bbox, gap, x_jump) {
                continue;
            }
            left_seed_indices.push(left_index);
            right_seed_indices.push(right_index);
        }
    }
    // A line can take part in several pairs; keep each index once.
    dedupe_indices(&mut left_seed_indices);
    dedupe_indices(&mut right_seed_indices);
    if left_seed_indices.len() < 2 || right_seed_indices.len() < 2 {
        return None;
    }

    // The mean x origins of the two seed groups must be at least 90 apart.
    let left_x = average_x(lines, &left_seed_indices)?;
    let right_x = average_x(lines, &right_seed_indices)?;
    if right_x - left_x < 90.0 {
        return None;
    }
    // Range of `bbox.y` covered by all seed lines.
    let column_min_y = left_seed_indices
        .iter()
        .chain(&right_seed_indices)
        .map(|index| lines[*index].bbox.y)
        .reduce(f32::min)?;
    let column_max_y = left_seed_indices
        .iter()
        .chain(&right_seed_indices)
        .map(|index| lines[*index].bbox.y)
        .reduce(f32::max)?;
    let abstract_y = abstract_heading_y(lines);
    let midpoint = (left_x + right_x) / 2.0;
    // Reject the layout when too many lines in the seed region cross the midpoint.
    if !column_gutter_is_clear(lines, midpoint, column_min_y, column_max_y) {
        return None;
    }
    let mut leading = Vec::new();
    let mut trailing = Vec::new();
    let mut left_column = Vec::new();
    let mut right_column = Vec::new();

    // Classification order matters: front matter / lines whose y exceeds the
    // seed range go to `leading`; page-number or footnote-like lines whose y is
    // well below the seed range go to `trailing`; everything else is assigned
    // to a column by its x origin relative to `midpoint`.
    for line in lines {
        if is_likely_front_matter_line(line, abstract_y)
            || line.bbox.y > column_max_y + line.bbox.height
        {
            leading.push(line);
        } else if line.bbox.y < column_min_y - line.bbox.height * 1.8
            && (is_likely_page_number_line(line) || is_likely_bottom_footnote_line(line))
        {
            trailing.push(line);
        } else if line.bbox.x < midpoint {
            left_column.push(line);
        } else {
            right_column.push(line);
        }
    }

    if left_column.len() < 2 || right_column.len() < 2 {
        return None;
    }

    Some(ColumnLayout {
        leading,
        columns: vec![left_column, right_column],
        trailing,
    })
}
1484
1485fn column_pair_y_tolerance(left: &TextLine, right: &TextLine) -> f32 {
1486 left.bbox.height.max(right.bbox.height) * 0.45
1487}
1488
1489fn abstract_heading_y(lines: &[TextLine]) -> Option<f32> {
1490 lines
1491 .iter()
1492 .find(|line| text_line_plain_text(line).eq_ignore_ascii_case("abstract"))
1493 .map(|line| line.bbox.y)
1494}
1495
1496fn is_likely_front_matter_line(line: &TextLine, abstract_y: Option<f32>) -> bool {
1497 abstract_y.is_some_and(|y| line.bbox.y > y + 36.0)
1498}
1499
1500fn is_likely_bottom_footnote_line(line: &TextLine) -> bool {
1501 average_run_size(line) <= 10.0 && text_line_plain_text(line).len() > 4
1502}
1503
1504fn average_run_size(line: &TextLine) -> f32 {
1505 if line.runs.is_empty() {
1506 return line.bbox.height;
1507 }
1508 line.runs.iter().map(|run| run.size).sum::<f32>() / line.runs.len() as f32
1509}
1510
1511fn is_likely_page_number_line(line: &TextLine) -> bool {
1512 let text = text_line_plain_text(line);
1513 !text.is_empty() && text.len() <= 4 && text.chars().all(|character| character.is_ascii_digit())
1514}
1515
1516fn text_line_plain_text(line: &TextLine) -> String {
1517 join_runs_spaced(&runs_sorted_by_x(line)).trim().to_owned()
1521}
1522
/// Sorts `indices` ascending and removes duplicates, in place.
fn dedupe_indices(indices: &mut Vec<usize>) {
    indices.sort_unstable();
    // After sorting, duplicates are adjacent: keep a value only when it
    // differs from the one seen just before it.
    let mut last_seen: Option<usize> = None;
    indices.retain(|&index| {
        let fresh = last_seen != Some(index);
        last_seen = Some(index);
        fresh
    });
}
1527
1528fn average_x(lines: &[TextLine], indices: &[usize]) -> Option<f32> {
1529 if indices.is_empty() {
1530 return None;
1531 }
1532 Some(
1533 indices
1534 .iter()
1535 .map(|index| lines[*index].bbox.x)
1536 .sum::<f32>()
1537 / indices.len() as f32,
1538 )
1539}
1540
/// Detects two text columns by looking for one dominant gap between the
/// horizontal centres of the lines.
///
/// Returns `[left, right]` line groups, or `None` when fewer than four lines
/// are given or any of the checks below fails.
fn detect_text_columns(lines: &[TextLine]) -> Option<Vec<Vec<&TextLine>>> {
    if lines.len() < 4 {
        return None;
    }

    // Pair each line index with the x coordinate of its horizontal centre and
    // order the pairs left to right.
    let mut centers = lines
        .iter()
        .enumerate()
        .map(|(index, line)| (index, line.bbox.x + line.bbox.width / 2.0))
        .collect::<Vec<_>>();
    centers.sort_by(|left, right| left.1.total_cmp(&right.1));

    // The largest gap between neighbouring centres is the candidate split;
    // `split_index` is the position of the first entry right of that gap.
    let (split_index, largest_gap) = centers
        .windows(2)
        .enumerate()
        .map(|(index, window)| (index + 1, window[1].1 - window[0].1))
        .max_by(|left, right| left.1.total_cmp(&right.1))?;
    if largest_gap < 90.0 {
        return None;
    }

    // Each side needs at least two lines.
    let (left_indices, right_indices) = centers.split_at(split_index);
    if left_indices.len() < 2 || right_indices.len() < 2 {
        return None;
    }

    let left = left_indices
        .iter()
        .map(|(index, _)| &lines[*index])
        .collect::<Vec<_>>();
    let right = right_indices
        .iter()
        .map(|(index, _)| &lines[*index])
        .collect::<Vec<_>>();

    // The two groups must share a y range at least one average line height tall.
    let overlap = y_overlap(&left, &right)?;
    let average_height = average_line_height(lines);
    if overlap < average_height {
        return None;
    }

    // Require at least 15 units of clear space between the right-most edge of
    // the left group and the left-most edge of the right group.
    let left_right_edge = left
        .iter()
        .map(|line| line.bbox.x + line.bbox.width)
        .fold(f32::MIN, f32::max);
    let right_left_edge = right.iter().map(|line| line.bbox.x).fold(f32::MAX, f32::min);
    if right_left_edge - left_right_edge < 15.0 {
        return None;
    }

    Some(vec![left, right])
}
1597
1598fn column_x(lines: &[&TextLine]) -> f32 {
1599 if lines.is_empty() {
1600 return 0.0;
1601 }
1602 lines.iter().map(|line| line.bbox.x).sum::<f32>() / lines.len() as f32
1603}
1604
1605fn y_overlap(left: &[&TextLine], right: &[&TextLine]) -> Option<f32> {
1606 let left_min = left.iter().map(|line| line.bbox.y).reduce(f32::min)?;
1607 let left_max = left
1608 .iter()
1609 .map(|line| line.bbox.y + line.bbox.height)
1610 .reduce(f32::max)?;
1611 let right_min = right.iter().map(|line| line.bbox.y).reduce(f32::min)?;
1612 let right_max = right
1613 .iter()
1614 .map(|line| line.bbox.y + line.bbox.height)
1615 .reduce(f32::max)?;
1616 Some((left_max.min(right_max) - left_min.max(right_min)).max(0.0))
1617}
1618
1619fn average_line_height(lines: &[TextLine]) -> f32 {
1620 let total = lines.iter().map(|line| line.bbox.height).sum::<f32>();
1621 total / lines.len() as f32
1622}
1623
/// Converts one `TextLine` into a single-line `TextBlock`.
///
/// The line text is assembled with `text_from_line_runs` and cleaned with
/// `clean_pdf_line_text`; `None` is returned when nothing remains. The block
/// kind comes from `classify_text_line`, given the line's dominant size and
/// `body_size`. Remaining `TextBlock` fields take their `Default` values.
fn text_block_from_line(page_number: usize, line: &TextLine, body_size: f32) -> Option<TextBlock> {
    let text = text_from_line_runs(line);
    let text = clean_pdf_line_text(&text);
    if text.is_empty() {
        return None;
    }

    Some(TextBlock {
        text: text.clone(),
        kind: classify_text_line(&text, line_dominant_size(line), body_size),
        bbox: Some(line.bbox),
        lines: vec![Line {
            text,
            bbox: Some(line.bbox),
            // One span per run, in the line's stored run order; runs whose
            // cleaned text is empty are dropped.
            spans: line
                .runs
                .iter()
                .filter_map(|run| {
                    let text = clean_pdf_span_text(&run.text);
                    (!text.is_empty()).then(|| Span {
                        text,
                        bbox: Some(run.bbox),
                        font: run.font.clone(),
                        size: Some(run.size),
                        bold: run.bold,
                        italic: run.italic,
                    })
                })
                .collect(),
        }],
        source_anchors: vec![anchor(
            page_number,
            Some(line.bbox),
            source_ids_for_line(line),
        )],
        // Fixed score, explicitly flagged as uncalibrated.
        confidence: Some(Confidence {
            score: 0.82,
            calibrated: false,
        }), ..Default::default()
    })
}
1665
1666fn adaptive_single_glyph_gap(runs: &[TextRun]) -> Option<f32> {
1682 let mut gaps: Vec<f32> = Vec::new();
1683 let mut space_w = 0.0f32;
1684 let mut prev_end: Option<f32> = None;
1685 for run in runs {
1686 if run.text.is_empty() {
1687 continue;
1688 }
1689 space_w = space_w.max(run.space_width);
1690 if let Some(end) = prev_end {
1691 let gap = run.bbox.x - end;
1692 if gap.is_finite() && gap > 0.0 {
1693 gaps.push(gap);
1694 }
1695 }
1696 prev_end = Some(run.bbox.x + run.bbox.width);
1697 }
1698 if gaps.len() < 3 || space_w <= 0.0 {
1699 return None;
1700 }
1701 gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1702 let median = gaps[gaps.len() / 2];
1703 Some((median * 1.8).clamp(space_w * 0.08, space_w * 0.4))
1709}
1710
/// Concatenates run texts, inserting a single space wherever the geometry
/// between two consecutive runs suggests a word boundary.
///
/// Runs are processed in the order given; runs with empty text are skipped.
fn join_runs_spaced(runs: &[TextRun]) -> String {
    let mut out = String::new();
    let adaptive_glyph_gap = adaptive_single_glyph_gap(runs);
    // State of the previously emitted run:
    // (right edge x, space width, baseline y, had >= 2 trimmed characters).
    let mut previous: Option<(f32, f32, f32, bool)> = None;
    for run in runs {
        if run.text.is_empty() {
            continue;
        }
        let multi_char = run.text.trim().chars().count() >= 2;
        if let Some((prev_end_x, prev_space_width, prev_baseline_y, prev_multi)) = previous {
            // Never add a space next to whitespace that is already there.
            let boundary_has_space = out.ends_with(char::is_whitespace)
                || run.text.starts_with(char::is_whitespace);
            let gap = run.bbox.x - prev_end_x;
            // Digit followed by digit: treated as one number continuing.
            let numeric_continuation = out.trim_end().ends_with(|c: char| c.is_ascii_digit())
                && run.text.trim_start().starts_with(|c: char| c.is_ascii_digit());
            // At least one side is a multi-character token (and not a number
            // continuing), so the two runs are likely separate tokens.
            let tokens_separate = (prev_multi || multi_char) && !numeric_continuation;
            // Glyph-by-glyph runs use the line-adaptive threshold when one is
            // available; everything else uses the space-width based threshold.
            let threshold = match adaptive_glyph_gap {
                Some(adaptive) if !tokens_separate => adaptive,
                _ => word_gap_threshold(prev_space_width, run.space_width, run.size, tokens_separate),
            };
            // A baseline shift of at least 18% of the run size (size floored
            // at 1.0) forces a break.
            let baseline_break =
                (prev_baseline_y - run.baseline_y).abs() >= run.size.max(1.0) * 0.18;
            // Separate tokens that overlap backwards by at least
            // max(0.6 * larger space width, 0.5) also force a break.
            let overlap_break =
                tokens_separate && gap <= -(prev_space_width.max(run.space_width) * 0.6).max(0.5);
            if !out.is_empty()
                && !boundary_has_space
                && (gap >= threshold || baseline_break || overlap_break)
            {
                out.push(' ');
            }
        }
        out.push_str(&run.text);
        previous = Some((
            run.bbox.x + run.bbox.width,
            run.space_width,
            run.baseline_y,
            multi_char,
        ));
    }
    out
}
1773
/// Minimum horizontal gap that counts as a word break between two runs.
///
/// The reference space width is the largest of both runs' space widths, a
/// quarter of `size`, and 0.1. The threshold is 10% of that width when the
/// runs are known to be separate tokens and 40% otherwise.
fn word_gap_threshold(
    left_space_width: f32,
    right_space_width: f32,
    size: f32,
    tokens_separate: bool,
) -> f32 {
    let factor = if tokens_separate { 0.1 } else { 0.4 };
    let mut space = left_space_width.max(right_space_width);
    space = space.max(size * 0.25);
    space = space.max(0.1);
    space * factor
}
1791
/// Builds the raw text of a line from its runs (ordered by x).
///
/// Lines without math/script context, or without a usable dominant baseline,
/// are joined with `join_runs_spaced`. Otherwise each non-empty trimmed run
/// becomes a token: runs raised or lowered relative to the dominant baseline
/// are appended to the previous token as `^…` / `_…` when allowed, and all
/// remaining tokens are joined with single spaces.
fn text_from_line_runs(line: &TextLine) -> String {
    let runs = runs_sorted_by_x(line);
    if !line_has_math_script_context(&runs[..]) {
        return join_runs_spaced(&runs[..]);
    }

    let Some(baseline_y) = dominant_baseline_y(&runs[..]) else {
        return join_runs_spaced(&runs[..]);
    };
    let mut pieces: Vec<String> = Vec::new();

    for run in runs.iter() {
        let token = run.text.trim();
        if token.is_empty() {
            continue;
        }

        // A script run attaches to the preceding piece; if there is no piece
        // yet, or attachment is refused, it falls through and becomes its own
        // token.
        if let Some(script) = script_kind_for_run(run, baseline_y) {
            if let Some(previous) = pieces.last_mut() {
                if can_attach_math_script(previous, token) {
                    previous.push_str(&format_math_script(script, token));
                    continue;
                }
            }
        }

        pieces.push(token.to_owned());
    }

    pieces.join(" ")
}
1823
1824fn dominant_baseline_y(runs: &[TextRun]) -> Option<f32> {
1825 let max_size = runs
1826 .iter()
1827 .map(|run| run.size)
1828 .reduce(f32::max)
1829 .filter(|size| *size > 0.0)?;
1830 let mut baselines = runs
1831 .iter()
1832 .filter(|run| run.size >= max_size * 0.8)
1833 .map(|run| run.baseline_y)
1834 .collect::<Vec<_>>();
1835 if baselines.is_empty() {
1836 baselines = runs.iter().map(|run| run.baseline_y).collect();
1837 }
1838 baselines.sort_by(|left, right| left.total_cmp(right));
1839 baselines.get(baselines.len() / 2).copied()
1840}
1841
1842fn script_kind_for_run(run: &TextRun, baseline_y: f32) -> Option<ScriptKind> {
1843 let delta = run.baseline_y - baseline_y;
1844 let threshold = (run.size * 0.25).clamp(2.0, 4.0);
1845 if delta >= threshold {
1846 Some(ScriptKind::Superscript)
1847 } else if delta <= -threshold {
1848 Some(ScriptKind::Subscript)
1849 } else {
1850 None
1851 }
1852}
1853
1854fn line_has_math_script_context(runs: &[TextRun]) -> bool {
1855 let joined = runs
1856 .iter()
1857 .map(|run| run.text.as_str())
1858 .collect::<Vec<_>>()
1859 .join(" ");
1860 joined.chars().any(|character| {
1861 matches!(
1862 character,
1863 '=' | '+'
1867 | '−'
1868 | '×'
1869 | '*'
1870 | '^'
1871 | '_'
1872 | '∈'
1873 | '≤'
1874 | '≥'
1875 | '≠'
1876 | 'λ'
1877 | 'θ'
1878 | 'ρ'
1879 | 'τ'
1880 | 'Σ'
1881 | '∑'
1882 )
1883 }) || runs.windows(2).any(|window| {
1884 let left = window[0].text.trim();
1885 let right = window[1].text.trim();
1886 let baseline_delta = (window[0].baseline_y - window[1].baseline_y).abs();
1892 let script_offset = window[0].size.max(window[1].size) * 0.2;
1893 baseline_delta >= script_offset
1894 && is_math_script_base(left)
1895 && is_math_script_text(right)
1896 })
1897}
1898
1899fn can_attach_math_script(previous: &str, token: &str) -> bool {
1900 !previous.ends_with('^')
1901 && !previous.ends_with('_')
1902 && is_math_script_text(token)
1903 && previous_has_math_script_base(previous)
1904}
1905
/// True when `token`, after stripping surrounding `(`, `[` and `{`, is a
/// single alphanumeric character or starts with a backslash.
fn is_math_script_base(token: &str) -> bool {
    let core = token.trim_matches(|character: char| matches!(character, '(' | '[' | '{'));
    if core.starts_with('\\') {
        return true;
    }
    let mut characters = core.chars();
    match (characters.next(), characters.next()) {
        (Some(only), None) => only.is_alphanumeric(),
        _ => false,
    }
}
1912
/// Checks whether the end of `previous` can carry a sub/superscript.
///
/// If the trimmed text ends with a closing bracket, it qualifies only when it
/// already contains `\`, `_` or `^`. Otherwise the last character that is not
/// `*`, `'` or `′` must be alphabetic or a backslash.
fn previous_has_math_script_base(previous: &str) -> bool {
    let tail = previous.trim_end();
    if tail.ends_with(&['}', ']', ')'][..]) {
        return tail.contains(&['\\', '_', '^'][..]);
    }

    let last_significant = tail
        .chars()
        .rev()
        .find(|&character| !matches!(character, '*' | '\'' | '′'));
    match last_significant {
        Some(character) => character.is_alphabetic() || character == '\\',
        None => false,
    }
}
1924
/// True when `token`, after stripping surrounding round and square brackets,
/// is non-empty and made only of alphanumerics or `+ - − = , . \`.
fn is_math_script_text(token: &str) -> bool {
    let core = token.trim_matches(&['(', ')', '[', ']'][..]);
    if core.is_empty() {
        return false;
    }
    core.chars().all(|character| {
        character.is_alphanumeric()
            || matches!(character, '+' | '-' | '−' | '=' | ',' | '.' | '\\')
    })
}
1933
1934fn format_math_script(kind: ScriptKind, token: &str) -> String {
1935 let marker = match kind {
1936 ScriptKind::Superscript => '^',
1937 ScriptKind::Subscript => '_',
1938 };
1939 let cleaned = token.trim();
1940 if cleaned.chars().count() == 1
1941 || cleaned
1942 .chars()
1943 .all(|character| character.is_ascii_alphanumeric())
1944 {
1945 format!("{marker}{cleaned}")
1946 } else {
1947 format!("{marker}{{{cleaned}}}")
1948 }
1949}
1950
1951fn merge_wrapped_text_blocks(blocks: Vec<TextBlock>) -> Vec<TextBlock> {
1952 let mut merged: Vec<TextBlock> = Vec::new();
1953 for block in blocks {
1954 if let Some(previous) = merged.last_mut() {
1955 if should_merge_text_blocks(previous, &block) {
1956 merge_text_block(previous, block);
1957 continue;
1958 }
1959 }
1960 merged.push(block);
1961 }
1962 merged
1963}
1964
1965fn should_merge_text_blocks(previous: &TextBlock, next: &TextBlock) -> bool {
1966 let Some(previous_bbox) = previous.bbox else {
1967 return false;
1968 };
1969 let Some(next_bbox) = next.bbox else {
1970 return false;
1971 };
1972 let baseline_gap = previous_bbox.y - next_bbox.y;
1973 if baseline_gap <= 0.0 || baseline_gap > previous_bbox.height.max(next_bbox.height) * 1.8 {
1974 return false;
1975 }
1976 let x_aligned = (previous_bbox.x - next_bbox.x).abs() <= 18.0;
1977 let hyphenated = previous.text.ends_with('-') && starts_with_lowercase(&next.text);
1978 if x_aligned && hyphenated {
1979 return true;
1980 }
1981 if starts_with_numbered_step(&previous.text) && starts_with_numbered_step(&next.text) {
1982 return false;
1983 }
1984 if previous.kind != "paragraph" || next.kind != "paragraph" {
1985 return false;
1986 }
1987 let lowercase_continuation =
1988 starts_with_lowercase(&next.text) && !ends_sentence(&previous.text);
1989 x_aligned && (hyphenated || lowercase_continuation)
1990}
1991
1992fn merge_text_block(previous: &mut TextBlock, next: TextBlock) {
1993 previous.text = join_wrapped_text(&previous.text, &next.text);
1994 previous.bbox = union_boxes(previous.bbox.into_iter().chain(next.bbox)).or(previous.bbox);
1995 previous.lines.extend(next.lines);
1996 for anchor in next.source_anchors {
1997 previous.source_anchors.push(anchor);
1998 }
1999}
2000
/// Joins two wrapped line fragments.
///
/// When `previous` ends with a hyphen, the hyphen is dropped and the
/// fragments are glued together; otherwise they are separated by a single
/// space. Leading whitespace of `next` is always removed.
fn join_wrapped_text(previous: &str, next: &str) -> String {
    let continuation = next.trim_start();
    match previous.strip_suffix('-') {
        Some(stem) => [stem, continuation].concat(),
        None => [previous.trim_end(), " ", continuation].concat(),
    }
}
2008
/// True when the first alphabetic character of `text` is lowercase; false
/// when `text` has no alphabetic character.
fn starts_with_lowercase(text: &str) -> bool {
    for character in text.chars() {
        if character.is_alphabetic() {
            return character.is_lowercase();
        }
    }
    false
}
2014
/// True when `text` (ignoring leading whitespace) begins with one or more
/// ASCII digits immediately followed by `:` or `.`.
fn starts_with_numbered_step(text: &str) -> bool {
    let trimmed = text.trim_start();
    let after_digits = trimmed.trim_start_matches(|character: char| character.is_ascii_digit());
    let has_digits = after_digits.len() < trimmed.len();
    has_digits && after_digits.starts_with(&[':', '.'][..])
}
2027
/// True when `text`, ignoring trailing whitespace, ends with `.`, `!` or `?`.
fn ends_sentence(text: &str) -> bool {
    text.trim_end().ends_with(&['.', '!', '?'][..])
}
2034
/// Cleans the raw text of one extracted line.
///
/// The text is split on whitespace, each token is normalised with
/// `normalize_pdf_token`, and tokens are re-joined with single spaces while
/// gluing fragments back together. The rules are tried in order for every
/// token; the first that applies wins. The joined result is finally passed
/// through the word-fragment phrase repair and the math-notation repair.
fn clean_pdf_line_text(text: &str) -> String {
    // The ellipsis control character is expanded up front, before splitting.
    let text = repair_windows_1252_ellipsis_before_tokenizing(text);
    let tokens = text
        .split_whitespace()
        .map(normalize_pdf_token)
        .filter(|token| !token.is_empty())
        .collect::<Vec<_>>();
    let mut cleaned: Vec<String> = Vec::new();
    let mut index = 0;
    while index < tokens.len() {
        let token = tokens[index].as_str();
        // 1. Stand-alone closing punctuation attaches to the previous token.
        if is_closing_punctuation_token(token) && !cleaned.is_empty() {
            let previous = cleaned.last_mut().expect("checked non-empty");
            previous.push_str(token);
            index += 1;
            continue;
        }
        // 2. A detached apostrophe followed by a word piece: glue both onto
        //    the previous token (always written as an ASCII apostrophe).
        if is_joining_apostrophe(token) && !cleaned.is_empty() && index + 1 < tokens.len() {
            let next = tokens[index + 1].as_str();
            if is_word_piece(next) {
                let previous = cleaned.last_mut().expect("checked non-empty");
                previous.push('\'');
                previous.push_str(next);
                index += 2;
                continue;
            }
        }
        // 3. Same for a detached hyphen (always written as an ASCII hyphen).
        if is_joining_hyphen(token) && !cleaned.is_empty() && index + 1 < tokens.len() {
            let next = tokens[index + 1].as_str();
            if is_word_piece(next) {
                let previous = cleaned.last_mut().expect("checked non-empty");
                previous.push('-');
                previous.push_str(next);
                index += 2;
                continue;
            }
        }
        // 4. Append directly to the previous token when it ends in a hyphen,
        //    or when the pair is a known split word.
        if let Some(previous) = cleaned.last_mut() {
            if should_join_after_trailing_hyphen(previous, token) {
                previous.push_str(token);
                index += 1;
                continue;
            }
            if should_join_pdf_word_piece(previous, token) {
                previous.push_str(token);
                index += 1;
                continue;
            }
        }
        // 5. A run of two or more consecutive letter fragments is merged into
        //    one token; a lone fragment falls through unchanged.
        if is_letter_fragment(token) {
            let mut merged = String::new();
            let mut end = index;
            while end < tokens.len() && is_letter_fragment(tokens[end].as_str()) {
                merged.push_str(tokens[end].as_str());
                end += 1;
            }
            if end - index >= 2 {
                cleaned.push(merged);
                index = end;
                continue;
            }
        }
        // 6. Default: keep the token as its own word.
        cleaned.push(token.to_owned());
        index += 1;
    }
    repair_pdf_math_notation(&repair_pdf_word_fragment_phrases(&cleaned.join(" ")))
}
2102
2103fn clean_pdf_span_text(text: &str) -> String {
2104 repair_pdf_math_notation(&normalize_pdf_token(text))
2105}
2106
/// Repairs a fixed list of known word fragments (for example `"th e"` to
/// `"the"`).
///
/// A fragment is only repaired when it stands on its own, i.e. it is not
/// directly preceded or followed by an alphanumeric character. Without that
/// check ordinary text is corrupted: `"with energy"` contains `"th e"` and
/// would become `"withenergy"`. Repairs are applied in list order, each to
/// the result of the previous one.
fn repair_pdf_word_fragment_phrases(text: &str) -> String {
    let mut repaired = text.to_owned();
    for (broken, fixed) in [
        ("a c onversatio n", "a conversation"),
        ("ac onversatio n", "a conversation"),
        ("an other", "another"),
        ("ce nters", "centers"),
        ("prod uction", "production"),
        ("de mands", "demands"),
        ("turn s", "turns"),
        ("coordinate s", "coordinates"),
        ("coordinat e", "coordinate"),
        ("facilitat e", "facilitate"),
        ("speake rs", "speakers"),
        ("listener s'", "listeners'"),
        ("th e", "the"),
        ("p resent", "present"),
        ("linguisti c", "linguistic"),
        ("an d", "and"),
        ("inferen ces", "inferences"),
        ("attentio n", "attention"),
        ("B eyond", "Beyond"),
        ("variabilit y", "variability"),
        ("l essons", "lessons"),
        ("re peating", "repeating"),
        ("import ant", "important"),
        ("sp ecified", "specified"),
    ] {
        repaired = replace_phrase_at_word_boundaries(&repaired, broken, fixed);
    }
    repaired
}

/// Replaces every occurrence of `broken` in `text` with `fixed`, but only
/// where the occurrence is delimited by non-alphanumeric characters (or the
/// start/end of the text) on both sides.
fn replace_phrase_at_word_boundaries(text: &str, broken: &str, fixed: &str) -> String {
    if broken.is_empty() {
        return text.to_owned();
    }

    let mut output = String::with_capacity(text.len());
    // Start of the tail of `text` that has not been copied to `output` yet.
    let mut copied_to = 0;
    let mut search_from = 0;
    while let Some(offset) = text[search_from..].find(broken) {
        let start = search_from + offset;
        let end = start + broken.len();
        let glued_before = text[..start]
            .chars()
            .next_back()
            .is_some_and(char::is_alphanumeric);
        let glued_after = text[end..]
            .chars()
            .next()
            .is_some_and(char::is_alphanumeric);

        if glued_before || glued_after {
            // Part of a longer word: skip one character and keep looking, so
            // an overlapping valid occurrence is still found.
            search_from = start + text[start..].chars().next().map_or(1, char::len_utf8);
            continue;
        }

        output.push_str(&text[copied_to..start]);
        output.push_str(fixed);
        copied_to = end;
        search_from = end;
    }
    output.push_str(&text[copied_to..]);
    output
}
2139
/// Normalises one extracted token.
///
/// Applies a fixed chain of literal replacements (mis-decoded multi-byte
/// sequences and typographic quotes), then expands Latin ligatures, maps
/// stray C1 control characters to punctuation, and finally repairs embedded
/// control glyphs.
///
/// NOTE(review): several pairs below (the middle dot and the Greek letters
/// Γ, Θ, Λ, Σ, Φ, Ω, λ) render with identical left and right sides. They are
/// presumably meant to match a mis-decoded two-character sequence like the
/// neighbouring `\u{..}` entries; confirm the literal bytes in the file
/// before touching them.
fn normalize_pdf_token(token: &str) -> String {
    let normalized = token
        .replace("â\u{80}\u{98}", "'")
        .replace("â\u{80}\u{99}", "'")
        .replace("·", "·")
        .replace("â\u{84}\u{93}", "ℓ")
        .replace("Γ", "Γ")
        .replace("Θ", "Θ")
        .replace("Λ", "Λ")
        .replace("Î\u{a0}", "Π")
        .replace("Σ", "Σ")
        .replace("Φ", "Φ")
        .replace("Ω", "Ω")
        .replace("λ", "λ")
        .replace("Ï\u{84}", "τ")
        .replace("Ã\u{97}", "×")
        .replace("â\u{86}\u{92}", "→")
        .replace("â\u{89}¥", "≥")
        .replace("â\u{89}¤", "≤")
        .replace("â\u{88}\u{88}", "∈")
        .replace("â\u{88}\u{91}", "∑")
        // Curly quotes become their ASCII counterparts.
        .replace(['‘', '’'], "'")
        .replace(['“', '”'], "\"");
    let normalized = expand_latin_ligatures(&normalized);
    let normalized = repair_windows_1252_control_punctuation(&normalized);
    repair_embedded_pdf_control_glyphs(&normalized)
}
2167
/// Expands the Latin presentation-form ligatures U+FB00..=U+FB06 into their
/// component letters (`ff`, `fi`, `fl`, `ffi`, `ffl`, `st`, `st`).
///
/// Text without any of these characters is returned as an unchanged copy.
fn expand_latin_ligatures(text: &str) -> String {
    let has_ligature =
        text.contains(|character: char| ('\u{FB00}'..='\u{FB06}').contains(&character));
    if !has_ligature {
        return text.to_owned();
    }

    let mut output = String::with_capacity(text.len());
    for character in text.chars() {
        let expansion = match character {
            '\u{FB00}' => "ff",
            '\u{FB01}' => "fi",
            '\u{FB02}' => "fl",
            '\u{FB03}' => "ffi",
            '\u{FB04}' => "ffl",
            '\u{FB05}' | '\u{FB06}' => "st",
            other => {
                output.push(other);
                continue;
            }
        };
        output.push_str(expansion);
    }
    output
}
2191
/// Replaces C1 control characters (U+0080..=U+009F) that correspond to
/// Windows-1252 punctuation and letters with a printable equivalent.
///
/// Code points without an entry in the table, and all other characters, are
/// copied unchanged.
fn repair_windows_1252_control_punctuation(text: &str) -> String {
    let mut output = String::with_capacity(text.len());
    // Scratch space for encoding pass-through characters as `&str`.
    let mut scratch = [0u8; 4];

    for character in text.chars() {
        let replacement: &str = match character {
            '\u{80}' => "EUR",
            '\u{82}' => ",",
            '\u{83}' => "f",
            '\u{84}' => "\"",
            '\u{85}' => "...",
            '\u{86}' => "†",
            '\u{87}' => "‡",
            '\u{88}' => "^",
            '\u{89}' => "‰",
            '\u{8a}' => "Š",
            '\u{8b}' => "<",
            '\u{8c}' => "OE",
            '\u{8e}' => "Ž",
            '\u{91}' | '\u{92}' => "'",
            '\u{93}' | '\u{94}' => "\"",
            '\u{95}' => "*",
            '\u{96}' => "–",
            '\u{97}' => "—",
            '\u{98}' => "~",
            '\u{99}' => "(TM)",
            '\u{9a}' => "š",
            '\u{9b}' => ">",
            '\u{9c}' => "oe",
            '\u{9e}' => "ž",
            '\u{9f}' => "Ÿ",
            other => other.encode_utf8(&mut scratch),
        };
        output.push_str(replacement);
    }

    output
}
2228
/// Expands every U+0085 character into three ASCII dots.
fn repair_windows_1252_ellipsis_before_tokenizing(text: &str) -> String {
    let mut repaired = String::with_capacity(text.len());
    for character in text.chars() {
        if character == '\u{85}' {
            repaired.push_str("...");
        } else {
            repaired.push(character);
        }
    }
    repaired
}
2232
/// Repairs the control characters U+0002 and U+0003 found inside tokens.
///
/// Directly before an alphabetic character, U+0002 becomes `fi` and U+0003
/// becomes `fl`. Elsewhere U+0002 is dropped, while U+0003 is kept as is.
fn repair_embedded_pdf_control_glyphs(token: &str) -> String {
    let mut output = String::with_capacity(token.len());
    let mut remaining = token.chars().peekable();

    while let Some(current) = remaining.next() {
        let letter_follows = remaining
            .peek()
            .is_some_and(|following| following.is_alphabetic());
        match (current, letter_follows) {
            ('\u{2}', true) => output.push_str("fi"),
            ('\u{2}', false) => {}
            ('\u{3}', true) => output.push_str("fl"),
            (other, _) => output.push(other),
        }
    }

    output
}
2250
/// True when `characters[index]` exists and is alphabetic.
fn has_following_alphabetic(characters: &[char], index: usize) -> bool {
    match characters.get(index) {
        Some(character) => character.is_alphabetic(),
        None => false,
    }
}
2256
/// True when `token` is exactly one closing punctuation mark:
/// `.` `,` `:` `;` `!` `?` `)` `]` or `}`.
fn is_closing_punctuation_token(token: &str) -> bool {
    const CLOSERS: [&str; 9] = [".", ",", ":", ";", "!", "?", ")", "]", "}"];
    CLOSERS.iter().any(|closer| *closer == token)
}
2260
/// True when `token` should be glued onto a `previous` token ending in `-`.
///
/// Requires `token` to start with an ASCII alphanumeric and `previous` to
/// contain at least one ASCII alphanumeric (so a bare hyphen does not count).
fn should_join_after_trailing_hyphen(previous: &str, token: &str) -> bool {
    if !previous.ends_with('-') {
        return false;
    }
    let token_starts_word = token.starts_with(|character: char| character.is_ascii_alphanumeric());
    token_starts_word && previous.contains(|character: char| character.is_ascii_alphanumeric())
}
2271
2272fn should_join_pdf_word_piece(previous: &str, token: &str) -> bool {
2273 if !is_alphabetic_word(previous) || !is_alphabetic_word(token) {
2274 return false;
2275 }
2276 if !previous
2277 .chars()
2278 .last()
2279 .is_some_and(|character| character.is_lowercase())
2280 || !starts_with_lowercase(token)
2281 {
2282 return false;
2283 }
2284
2285 matches!(
2286 (previous, token),
2287 ("coordina", "ting") | ("de", "scribe") | ("foc", "i") | ("pro", "posed")
2288 )
2289}
2290
/// True when `token` is non-empty and every character is alphabetic.
fn is_alphabetic_word(token: &str) -> bool {
    let mut letters = token.chars().peekable();
    letters.peek().is_some() && letters.all(char::is_alphabetic)
}
2294
/// Rewrites math glyphs in `text` into TeX-style notation when the text
/// looks mathematical, and always strips non-printing control characters.
///
/// Text that does not look like math only gets the control-character
/// stripping. Math-looking text additionally goes through the combining
/// operator repair, symbol replacement and subscript spacing repair.
fn repair_pdf_math_notation(text: &str) -> String {
    // NOTE(review): the first replacement renders with identical left and
    // right sides; presumably the left side is meant to be a mis-decoded
    // middle dot. Confirm the literal bytes in the file.
    let normalized = text.replace("·", "·").replace("â\u{84}\u{93}", "ℓ");
    if !looks_like_pdf_math_notation(&normalized) {
        return strip_pdf_control_glyphs(&normalized);
    }

    let normalized = repair_combining_math_operator_sequences(&normalized);
    let symbols = replace_math_symbols(&normalized);
    strip_pdf_control_glyphs(&repair_math_subscript_spacing(&symbols))
}
2305
/// Collapses an equals sign combined with U+0338 (combining long solidus
/// overlay) into a single `≠`.
///
/// Handles, in this order: overlay + space + `=`, overlay + `=`, and `=` +
/// overlay.
fn repair_combining_math_operator_sequences(text: &str) -> String {
    let mut repaired = text.to_owned();
    for sequence in ["\u{338} =", "\u{338}=", "=\u{338}"].iter() {
        repaired = repaired.replace(*sequence, "≠");
    }
    repaired
}
2311
2312fn looks_like_pdf_math_notation(text: &str) -> bool {
2313 text.chars().any(|character| {
2314 matches!(
2315 character,
2316 'ℓ' | 'λ'
2317 | 'θ'
2318 | 'ρ'
2319 | 'τ'
2320 | '∆'
2321 | 'Δ'
2322 | '≤'
2323 | '≥'
2324 | '∈'
2325 | '∪'
2326 | '∑'
2327 | '∅'
2328 | '·'
2329 | '−'
2330 | '±'
2331 | '⊆'
2332 | '∼'
2333 | '≠'
2334 | '→'
2335 )
2336 }) || has_math_ellipsis_context(text)
2337 || text.contains("Fq")
2338 || text.contains(" 6 =")
2339}
2340
/// True when `text` contains `...` in what looks like a mathematical setting.
///
/// With whitespace removed, the text must contain `,...,`, `),...` or
/// `...,(`; failing that, it must contain one of a fixed set of operator or
/// Greek characters anywhere.
fn has_math_ellipsis_context(text: &str) -> bool {
    if !text.contains("...") {
        return false;
    }

    let compact: String = text.split_whitespace().collect();
    let list_like = [",...,", "),...", "...,("]
        .iter()
        .any(|needle| compact.contains(*needle));
    if list_like {
        return true;
    }

    text.contains(
        &[
            '=', '+', '_', '^', '\\', '∈', '≤', '≥', '≠', 'λ', 'θ', 'ρ', 'τ',
        ][..],
    )
}
2357
/// Rewrites math glyphs and extraction artefacts into TeX-style commands.
///
/// First applies multi-character rewrites (`· · ·` to `\cdots`, `...` to
/// `\ldots`, the `6 =` artefact to `\neq`, `Fq` to `\mathbb{F}_q`), then
/// maps individual symbols (Greek letters, relations, operators) to their
/// commands. U+2212 becomes an ASCII hyphen and U+0003 becomes `\Lambda`.
///
/// The `6 =` rewrite is skipped when the `6` is the tail of a longer number
/// (directly preceded by an ASCII digit or a decimal point), so `x16 = 3`
/// is no longer turned into `x1\neq 3`. A stand-alone `6 =` that is a real
/// equation cannot be told apart from the artefact and is still rewritten.
fn replace_math_symbols(text: &str) -> String {
    let collapsed = text
        .replace("· · ·", r"\cdots")
        .replace("...", r"\ldots");
    let collapsed = replace_not_equal_artifact(&collapsed).replace("Fq", r"\mathbb{F}_q");
    let mut output = String::with_capacity(collapsed.len());

    for character in collapsed.chars() {
        match character {
            '\u{3}' => output.push_str(r"\Lambda"),
            'Γ' => output.push_str(r"\Gamma"),
            'Θ' => output.push_str(r"\Theta"),
            'ℓ' => output.push_str(r"\ell"),
            'λ' => output.push_str(r"\lambda"),
            'Λ' => output.push_str(r"\Lambda"),
            'Π' => output.push_str(r"\Pi"),
            'Σ' => output.push_str(r"\Sigma"),
            'Φ' => output.push_str(r"\Phi"),
            'Ω' => output.push_str(r"\Omega"),
            'θ' => output.push_str(r"\theta"),
            'ρ' => output.push_str(r"\rho"),
            'τ' => output.push_str(r"\tau"),
            '∆' | 'Δ' => output.push_str(r"\Delta"),
            '≤' => output.push_str(r"\leq"),
            '≥' => output.push_str(r"\geq"),
            '∈' => output.push_str(r"\in"),
            '∪' => output.push_str(r"\cup"),
            '∑' => output.push_str(r"\sum"),
            '∅' => output.push_str(r"\varnothing"),
            '−' => output.push('-'),
            '±' => output.push_str(r"\pm"),
            '⊆' => output.push_str(r"\subseteq"),
            '∼' => output.push_str(r"\sim"),
            '≠' => output.push_str(r"\neq"),
            '×' => output.push_str(r"\times"),
            '→' => output.push_str(r"\to"),
            '·' => output.push_str(r"\cdot"),
            _ => output.push(character),
        }
    }

    output
}

/// Replaces the `6 =` extraction artefact with `\neq`, leaving occurrences
/// alone when the `6` directly follows an ASCII digit or a `.` (i.e. it is
/// the last digit of a number such as `16` or `3.6`).
fn replace_not_equal_artifact(text: &str) -> String {
    const ARTIFACT: &str = "6 =";

    let mut output = String::with_capacity(text.len());
    // Start of the tail of `text` that has not been copied to `output` yet.
    let mut copied_to = 0;
    for (start, _) in text.match_indices(ARTIFACT) {
        let part_of_number = text[..start]
            .chars()
            .next_back()
            .is_some_and(|character| character.is_ascii_digit() || character == '.');
        if part_of_number {
            continue;
        }
        output.push_str(&text[copied_to..start]);
        output.push_str(r"\neq");
        copied_to = start + ARTIFACT.len();
    }
    output.push_str(&text[copied_to..]);
    output
}
2402
2403fn strip_pdf_control_glyphs(text: &str) -> String {
2404 let mut sanitized = String::with_capacity(text.len());
2405 let mut last_was_space = false;
2406
2407 for character in text.chars() {
2408 if is_nonprinting_pdf_control(character) {
2409 if !last_was_space {
2410 sanitized.push(' ');
2411 last_was_space = true;
2412 }
2413 continue;
2414 }
2415
2416 sanitized.push(character);
2417 last_was_space = character.is_whitespace();
2418 }
2419
2420 sanitized.split_whitespace().collect::<Vec<_>>().join(" ")
2421}
2422
/// Returns `true` for control characters other than newline, carriage return and tab.
fn is_nonprinting_pdf_control(character: char) -> bool {
    match character {
        '\n' | '\r' | '\t' => false,
        other => other.is_control(),
    }
}
2426
2427fn repair_math_subscript_spacing(text: &str) -> String {
2428 let tokens = text.split_whitespace().collect::<Vec<_>>();
2429 let mut repaired = Vec::with_capacity(tokens.len());
2430 let mut index = 0;
2431
2432 while index < tokens.len() {
2433 let token = tokens[index];
2434 if is_math_base_token(token) && index + 1 < tokens.len() {
2435 if tokens[index + 1].starts_with('_') {
2436 repaired.push(format!("{}{}", token, tokens[index + 1]));
2437 index += 2;
2438 continue;
2439 }
2440 if let Some((subscript, suffix)) = split_math_subscript_token(tokens[index + 1]) {
2441 repaired.push(format!(
2442 "{}{}{}",
2443 token,
2444 format_math_subscript(subscript),
2445 suffix
2446 ));
2447 index += 2;
2448 continue;
2449 }
2450 }
2451
2452 repaired.push(repair_compact_math_subscript(token));
2453 index += 1;
2454 }
2455
2456 repaired.join(" ")
2457}
2458
2459fn repair_compact_math_subscript(token: &str) -> String {
2460 if token.chars().count() > 2 && token.chars().all(|character| character.is_alphabetic()) {
2461 return token.to_owned();
2462 }
2463
2464 for base in ["m", "n", "N", "T", "V", "C", "x", "t", "i", "k", "h", "g"] {
2465 if let Some(rest) = token.strip_prefix(base) {
2466 if rest.is_empty() || rest.starts_with('_') {
2467 continue;
2468 }
2469 if let Some((subscript, suffix)) = split_math_subscript_token(rest) {
2470 return format!("{}{}{}", base, format_math_subscript(subscript), suffix);
2471 }
2472 }
2473 }
2474
2475 for base in [r"\lambda", r"\theta", r"\rho"] {
2476 if let Some(rest) = token.strip_prefix(base) {
2477 if rest.is_empty() || rest.starts_with('_') {
2478 continue;
2479 }
2480 if let Some((subscript, suffix)) = split_math_subscript_token(rest) {
2481 return format!("{}{}{}", base, format_math_subscript(subscript), suffix);
2482 }
2483 }
2484 }
2485
2486 token.to_owned()
2487}
2488
/// Returns `true` when `token` is exactly one of the symbols that may carry a subscript.
fn is_math_base_token(token: &str) -> bool {
    const BASES: &[&str] = &[
        "m", "n", "N", "T", "V", "C", "x", "t", "i", "k", "h", "g", r"\lambda", r"\theta",
        r"\rho",
    ];
    BASES.contains(&token)
}
2508
/// Splits `token` into a leading subscript and the remaining suffix.
///
/// The subscript is, in priority order: one of the commands `\ell`, `\lambda`, `\theta`,
/// `\rho`; one of the words `init`, `cl`; a run of leading ASCII digits; or a single
/// leading letter from `i j k l n r s`. Returns `None` when the token starts with none of
/// these (including when it is empty).
fn split_math_subscript_token(token: &str) -> Option<(&str, &str)> {
    const NAMED_SUBSCRIPTS: [&str; 6] = [r"\ell", r"\lambda", r"\theta", r"\rho", "init", "cl"];

    if let Some(name) = NAMED_SUBSCRIPTS.iter().find(|name| token.starts_with(**name)) {
        return Some(token.split_at(name.len()));
    }

    // ASCII digits are single bytes, so the byte count is a valid split offset.
    let digit_len = token.bytes().take_while(u8::is_ascii_digit).count();
    if digit_len > 0 {
        return Some(token.split_at(digit_len));
    }

    let first = token.chars().next()?;
    matches!(first, 'i' | 'j' | 'k' | 'l' | 'n' | 'r' | 's')
        .then(|| token.split_at(first.len_utf8()))
}
2541
/// Renders `subscript` with its leading `_` marker.
///
/// The word `init` is rendered as upright text inside braces; everything else is appended
/// to the underscore verbatim.
fn format_math_subscript(subscript: &str) -> String {
    if subscript == "init" {
        return String::from(r"_{\text{init}}");
    }

    let mut rendered = String::with_capacity(subscript.len() + 1);
    rendered.push('_');
    rendered.push_str(subscript);
    rendered
}
2548
/// Returns `true` when `token` is a single ASCII letter, optionally followed by one `-`.
fn is_letter_fragment(token: &str) -> bool {
    let mut characters = token.chars();
    match (characters.next(), characters.next(), characters.next()) {
        (Some(letter), None, _) | (Some(letter), Some('-'), None) => letter.is_ascii_alphabetic(),
        _ => false,
    }
}
2554
/// Returns `true` when `token` contains at least one alphabetic character.
fn is_word_piece(token: &str) -> bool {
    token.contains(char::is_alphabetic)
}
2558
/// Returns `true` when `token` is exactly a straight or a typographic apostrophe.
fn is_joining_apostrophe(token: &str) -> bool {
    ["'", "’"].contains(&token)
}
2562
/// Returns `true` when `token` is exactly one of the recognized hyphen characters.
fn is_joining_hyphen(token: &str) -> bool {
    ["-", "‐", "‑"].contains(&token)
}
2566
2567fn detect_table(
2568 page_number: usize,
2569 lines: &[TextLine],
2570 edges: &[GraphicEdge],
2571) -> Option<DetectedTable> {
2572 detect_ruled_grid_table(page_number, lines, edges)
2573 .or_else(|| detect_exact_run_table(page_number, lines))
2574 .or_else(|| detect_columnar_numeric_table(page_number, lines))
2575 .or_else(|| detect_implied_alignment_table(page_number, lines))
2576}
2577
2578fn detect_columnar_numeric_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
2587 let line_cells: Vec<Vec<TextRun>> = lines
2588 .iter()
2589 .map(|line| coalesce_currency_prefixes(implied_table_cells(line)))
2590 .collect();
2591
2592 let mut right_edges: Vec<f32> = Vec::new();
2597 let mut data_rows = 0usize;
2598 for cells in &line_cells {
2599 if cells_contain_prose(cells) {
2603 continue;
2604 }
2605 let values = cells.iter().filter(|cell| is_value_cell(&cell.text)).count();
2606 if values >= 2 {
2607 data_rows += 1;
2608 for cell in cells.iter().filter(|cell| is_value_cell(&cell.text)) {
2609 right_edges.push(cell.bbox.x + cell.bbox.width);
2610 }
2611 }
2612 }
2613 if data_rows < 4 {
2614 return None;
2615 }
2616
2617 let min_support = ((data_rows as f32) * 0.35).ceil().max(3.0) as usize;
2618 let all_clusters = cluster_column_right_edges_with_support(&right_edges, 8.0);
2619 let mut columns: Vec<f32> = all_clusters
2620 .iter()
2621 .filter(|(_, support)| *support >= min_support)
2622 .map(|(position, _)| *position)
2623 .collect();
2624 columns.extend(rescue_periodic_subcolumns(
2627 &all_clusters,
2628 &columns,
2629 min_support,
2630 data_rows,
2631 ));
2632 columns.sort_by(f32::total_cmp);
2633 if columns.len() < 2 {
2634 return None;
2635 }
2636 let cell_width = column_cell_width(&line_cells, columns[0]);
2641 let half_gap = columns
2642 .get(1)
2643 .map_or(cell_width * 2.5, |next| (next - columns[0]) / 2.0);
2644 let first_column_left = columns[0] - (cell_width * 2.5).min(half_gap.max(cell_width * 1.5));
2645 let table_right = columns.last().copied().unwrap_or_default();
2646
2647 let aligned: Vec<usize> = (0..lines.len())
2649 .filter(|&index| {
2650 line_cells[index]
2651 .iter()
2652 .filter(|cell| is_value_cell(&cell.text))
2653 .any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some())
2654 })
2655 .collect();
2656 let (first, last) = (*aligned.first()?, *aligned.last()?);
2657
2658 let mut row_indices: Vec<usize> = Vec::new();
2662 let mut previous_y: Option<f32> = None;
2663 for index in first..=last {
2664 let line = &lines[index];
2665 let cells = &line_cells[index];
2666 let aligned_here = cells
2667 .iter()
2668 .filter(|cell| is_value_cell(&cell.text))
2669 .any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some());
2670 let numeric_here = cells.iter().any(|cell| is_numeric_value(&cell.text));
2671 let label_only = !numeric_here && line.bbox.x <= table_right;
2672 if !aligned_here && !label_only {
2673 break;
2674 }
2675 if let Some(prev) = previous_y {
2676 if (prev - line.bbox.y).abs() > average_run_size(line).max(line.bbox.height) * 3.5 {
2677 break;
2678 }
2679 }
2680 row_indices.push(index);
2681 previous_y = Some(line.bbox.y);
2682 }
2683 let aligned_in_span = row_indices
2684 .iter()
2685 .filter(|&&index| {
2686 line_cells[index]
2687 .iter()
2688 .filter(|cell| is_value_cell(&cell.text))
2689 .any(|cell| nearest_column(cell.bbox.x + cell.bbox.width, &columns).is_some())
2690 })
2691 .count();
2692 if aligned_in_span < 4 {
2693 return None;
2694 }
2695
2696 build_columnar_table(page_number, lines, &line_cells, &columns, first_column_left, &row_indices)
2697}
2698
2699fn coalesce_currency_prefixes(cells: Vec<TextRun>) -> Vec<TextRun> {
2704 const SYMBOLS: [char; 4] = ['$', '€', '£', '¥'];
2705 let mut out: Vec<TextRun> = Vec::with_capacity(cells.len());
2706 let mut pending: Option<TextRun> = None;
2707 for mut cell in cells {
2708 let mut text = cell.text.trim().to_string();
2709 if let Some(prefix) = pending.take() {
2710 cell.bbox = union_boxes([prefix.bbox, cell.bbox]).unwrap_or(cell.bbox);
2711 text = format!("{}{}", prefix.text.trim(), text);
2712 }
2713 if text.chars().count() == 1 && text.chars().all(|c| SYMBOLS.contains(&c)) {
2715 cell.text = text;
2716 pending = Some(cell);
2717 continue;
2718 }
2719 if let Some(last) = text.chars().last() {
2722 if SYMBOLS.contains(&last) {
2723 let stripped = text[..text.len() - last.len_utf8()].trim_end();
2724 if !stripped.is_empty() {
2725 let mut carry = cell.clone();
2726 carry.text = last.to_string();
2727 text = stripped.to_string();
2728 pending = Some(carry);
2729 }
2730 }
2731 }
2732 cell.text = text;
2733 out.push(cell);
2734 }
2735 if let Some(prefix) = pending {
2736 out.push(prefix);
2737 }
2738 out
2739}
2740
/// Returns `true` when `text` looks like a numeric table value.
///
/// After trimming, the text must contain at least one ASCII digit and otherwise only
/// characters that commonly decorate amounts: currency symbols (`$`, `€`, `£`, `¥`),
/// parentheses, `,`, `.`, `%`, `-`, `+`, spaces, and em/en dashes.
///
/// All four currency symbols are accepted so that amounts stay numeric after
/// `coalesce_currency_prefixes` has glued a symbol onto them; accepting only `$` made
/// merged euro/pound/yen amounts stop counting as values.
fn is_numeric_value(text: &str) -> bool {
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return false;
    }

    let mut digits = 0usize;
    for character in trimmed.chars() {
        match character {
            '0'..='9' => digits += 1,
            '$' | '€' | '£' | '¥' | '(' | ')' | ',' | '.' | '%' | '-' | '+' | ' ' | '\u{2014}'
            | '\u{2013}' => {}
            _ => return false,
        }
    }
    digits >= 1
}
2759
2760fn is_value_cell(text: &str) -> bool {
2764 is_numeric_value(text) || matches!(text.trim(), "—" | "–")
2765}
2766
2767fn cells_contain_prose(cells: &[TextRun]) -> bool {
2771 if cells.iter().filter(|cell| is_value_cell(&cell.text)).count() >= 2 {
2778 return false;
2779 }
2780 cells.iter().any(|cell| {
2781 cell.text
2782 .split_whitespace()
2783 .filter(|word| word.chars().any(|c| c.is_alphabetic()))
2784 .count()
2785 > 12
2786 })
2787}
2788
/// Groups `values` into clusters of neighbouring positions and reports each cluster's size.
///
/// The values are sorted; a new cluster starts whenever the gap to the previous sorted
/// value exceeds `tol`. Each cluster is reported as `(representative, support)`, where the
/// representative is the element at index `len / 2` of the sorted cluster and the support
/// is the number of values in it. Clusters are returned in ascending order.
fn cluster_column_right_edges_with_support(values: &[f32], tol: f32) -> Vec<(f32, usize)> {
    let mut ordered = values.to_vec();
    ordered.sort_by(f32::total_cmp);

    let mut clusters: Vec<(f32, usize)> = Vec::new();
    let mut members: Vec<f32> = Vec::new();
    for value in ordered {
        let starts_new_cluster = members
            .last()
            .is_some_and(|previous| value - previous > tol);
        if starts_new_cluster {
            clusters.push((members[members.len() / 2], members.len()));
            members.clear();
        }
        members.push(value);
    }
    if !members.is_empty() {
        clusters.push((members[members.len() / 2], members.len()));
    }
    clusters
}
2809
/// Recovers weakly supported column clusters that repeat at the pitch of the kept columns.
///
/// `kept` holds the already accepted column positions; their median spacing is the pitch.
/// Candidate clusters have support in `[floor, min_support)` (where `floor` is 15% of
/// `data_rows`, at least 3) and lie within one pitch of the kept range. Candidates are
/// grouped by their offset modulo the pitch (tolerance 8.0, wrapping around the pitch); a
/// group is rescued only when it has at least two members falling into at least two
/// different pitch intervals.
///
/// Returns the rescued positions, or an empty vector when fewer than two columns were
/// kept, when `floor >= min_support`, or when the pitch is not positive.
fn rescue_periodic_subcolumns(
    all_clusters: &[(f32, usize)],
    kept: &[f32],
    min_support: usize,
    data_rows: usize,
) -> Vec<f32> {
    let (Some(&first), Some(&last)) = (kept.first(), kept.last()) else {
        return Vec::new();
    };
    if kept.len() < 2 {
        return Vec::new();
    }
    let floor = ((data_rows as f32) * 0.15).ceil().max(3.0) as usize;
    if floor >= min_support {
        return Vec::new();
    }

    let mut gaps: Vec<f32> = kept.windows(2).map(|pair| pair[1] - pair[0]).collect();
    gaps.sort_by(f32::total_cmp);
    let pitch = gaps[gaps.len() / 2];
    if pitch <= 0.0 {
        return Vec::new();
    }

    let weak_support = floor..min_support;
    let candidates: Vec<f32> = all_clusters
        .iter()
        .filter(|(position, support)| {
            weak_support.contains(support)
                && *position >= first - pitch
                && *position <= last + pitch
        })
        .map(|(position, _)| *position)
        .collect();

    // Offsets are measured from the first kept column.
    let residue = |position: f32| ((position - first) % pitch + pitch) % pitch;
    let group_of = |position: f32| ((position - first) / pitch).round() as i32;

    let mut rescued = Vec::new();
    let mut claimed = vec![false; candidates.len()];
    for (seed, &seed_position) in candidates.iter().enumerate() {
        if claimed[seed] {
            continue;
        }
        let target = residue(seed_position);
        let class: Vec<usize> = std::iter::once(seed)
            .chain(((seed + 1)..candidates.len()).filter(|&other| {
                if claimed[other] {
                    return false;
                }
                let delta = (target - residue(candidates[other])).abs();
                delta.min(pitch - delta) <= 8.0
            }))
            .collect();

        let groups: std::collections::HashSet<i32> = class
            .iter()
            .map(|&member| group_of(candidates[member]))
            .collect();
        if class.len() >= 2 && groups.len() >= 2 {
            for &member in &class {
                claimed[member] = true;
                rescued.push(candidates[member]);
            }
        }
    }
    rescued
}
2884
/// Returns the index of the column edge closest to `right_edge`, if one lies within 14.0.
///
/// When two columns are equally close, the one with the lower index wins.
fn nearest_column(right_edge: f32, columns: &[f32]) -> Option<usize> {
    let mut best: Option<(usize, f32)> = None;
    for (index, edge) in columns.iter().enumerate() {
        let distance = (right_edge - edge).abs();
        if distance <= 14.0 {
            // Strict comparison keeps the earliest column on ties.
            let improves = best.map_or(true, |(_, closest)| distance < closest);
            if improves {
                best = Some((index, distance));
            }
        }
    }
    best.map(|(index, _)| index)
}
2895
2896fn column_cell_width(line_cells: &[Vec<TextRun>], first_column: f32) -> f32 {
2899 let widths: Vec<f32> = line_cells
2900 .iter()
2901 .flat_map(|cells| cells.iter())
2902 .filter(|cell| is_numeric_value(&cell.text))
2903 .filter(|cell| ((cell.bbox.x + cell.bbox.width) - first_column).abs() <= 14.0)
2904 .map(|cell| cell.bbox.width)
2905 .collect();
2906 if widths.is_empty() {
2907 return 40.0;
2908 }
2909 let mut sorted = widths.clone();
2910 sorted.sort_by(f32::total_cmp);
2911 sorted[sorted.len() / 2].max(20.0)
2912}
2913
2914fn wrapped_label_above(
2920 lines: &[TextLine],
2921 line_cells: &[Vec<TextRun>],
2922 row_index: usize,
2923 first_column_left: f32,
2924 used: &[usize],
2925) -> Vec<usize> {
2926 let label_x = lines[row_index].bbox.x;
2927 let line_height = average_run_size(&lines[row_index]).max(lines[row_index].bbox.height);
2928 let mut result: Vec<usize> = Vec::new();
2929 let mut current_y = lines[row_index].bbox.y;
2930 loop {
2931 let above = (0..lines.len())
2932 .filter(|&index| {
2933 index != row_index
2934 && !used.contains(&index)
2935 && !result.contains(&index)
2936 && lines[index].bbox.y > current_y
2937 })
2938 .min_by(|&left, &right| lines[left].bbox.y.total_cmp(&lines[right].bbox.y));
2939 let Some(above) = above else { break };
2940 let line = &lines[above];
2941 let text = text_line_plain_text(line);
2942 let long_enough = text.chars().count() >= 28
2948 || line.bbox.x + line.bbox.width >= first_column_left - 12.0;
2949 let all_caps_heading = text.chars().any(char::is_alphabetic)
2952 && text.chars().filter(|c| c.is_alphabetic()).all(char::is_uppercase);
2953 if line.bbox.y - current_y > line_height * 1.8
2954 || (line.bbox.x - label_x).abs() > 16.0
2955 || !long_enough
2956 || all_caps_heading
2957 || text.trim().is_empty()
2958 || text.trim_end().ends_with(':')
2959 || line_cells[above].iter().any(|cell| is_numeric_value(&cell.text))
2960 {
2961 break;
2962 }
2963 result.push(above);
2964 current_y = line.bbox.y;
2965 }
2966 result.reverse();
2967 result
2968}
2969
/// Returns `true` when a row's value cells are all four-digit years.
///
/// The first cell (the label column) is ignored. Among the remaining cells, blank ones are
/// skipped; at least one non-blank cell must exist and each must be exactly four ASCII
/// digits forming a year between 1900 and 2100 inclusive.
///
/// An empty `row` (or one with only the label cell) yields `false` rather than panicking
/// on the slice bound.
fn is_period_header_row(row: &[String]) -> bool {
    let mut values = row
        .iter()
        .skip(1)
        .map(|cell| cell.trim())
        .filter(|cell| !cell.is_empty())
        .peekable();

    values.peek().is_some()
        && values.all(|cell| {
            cell.len() == 4
                && cell.chars().all(|c| c.is_ascii_digit())
                && cell
                    .parse::<i32>()
                    .is_ok_and(|year| (1900..=2100).contains(&year))
        })
}
2986
/// Builds a `DetectedTable` from lines already assigned to a columnar numeric layout.
///
/// * `lines` / `line_cells` — the page's lines and, per line, its cells (indexed alike).
/// * `columns` — right edges of the value columns; the table gets one extra leading
///   column (index 0) for labels.
/// * `first_column_left` — x position left of which a cell counts as label text.
/// * `row_indices` — indices into `lines` that make up the table span.
///
/// Returns `None` when `row_indices` is empty, when no data row survives, when the
/// acceptance gates at the end fail, or when no bounding box can be formed.
fn build_columnar_table(
    page_number: usize,
    lines: &[TextLine],
    line_cells: &[Vec<TextRun>],
    columns: &[f32],
    first_column_left: f32,
    row_indices: &[usize],
) -> Option<DetectedTable> {
    // One label column plus one column per value edge.
    let column_count = columns.len() + 1;
    // Distributes the cells of line `index` over the table columns, concatenating cells
    // that land in the same column.
    let assign_row = |index: usize| -> Vec<String> {
        let mut row = vec![String::new(); column_count];
        for cell in &line_cells[index] {
            let column = assign_cell_column(cell, columns, first_column_left);
            push_table_cell_text(&mut row[column], &cell.text);
        }
        row
    };

    // Header candidates outside the span: lines with a larger `bbox.y` than the first
    // span line (presumably above it on the page — confirm the coordinate convention),
    // within five line heights, reaching the value columns, not a "table " caption,
    // not a data row, not prose, and with text in at least one value column.
    let span_top_y = lines[*row_indices.first()?].bbox.y;
    let mut header_indices: Vec<usize> = (0..lines.len())
        .filter(|&index| {
            let line = &lines[index];
            !row_indices.contains(&index)
                && line.bbox.y > span_top_y
                && line.bbox.y - span_top_y
                    <= average_run_size(line).max(line.bbox.height) * 5.0
                && line.bbox.x + line.bbox.width >= first_column_left - 24.0
                && !text_line_plain_text(line).to_ascii_lowercase().starts_with("table ")
                && !line_is_data_row(line, column_count)
                && !cells_contain_prose(&line_cells[index])
                && assign_row(index)[1..].iter().any(|cell| !cell.trim().is_empty())
        })
        .collect();

    // Leading span rows without a label, or consisting of years only, are header rows
    // too; `data_start` ends up at the position of the first real data row.
    let mut data_start = 0usize;
    for (position, &index) in row_indices.iter().enumerate() {
        let row = assign_row(index);
        if row[0].trim().is_empty() || is_period_header_row(&row) {
            header_indices.push(index);
            data_start = position + 1;
        } else {
            data_start = position;
            break;
        }
    }
    // Descending `bbox.y`, so header text is concatenated in that order.
    header_indices.sort_by(|left, right| lines[*right].bbox.y.total_cmp(&lines[*left].bbox.y));

    let mut header_cells: Vec<String> = vec![String::new(); column_count];
    for &index in &header_indices {
        for (column, text) in assign_row(index).into_iter().enumerate() {
            push_table_cell_text(&mut header_cells[column], &text);
        }
    }
    let header_has_text = header_cells.iter().any(|cell| !cell.is_empty());

    let mut rows: Vec<Vec<String>> = Vec::new();
    let mut cell_records: Vec<TableCell> = Vec::new();
    if header_has_text {
        for (column, text) in header_cells.iter().enumerate() {
            cell_records.push(table_cell(0, column, text.clone(), true));
        }
    }

    // First pass over the data rows: for rows that carry a number and have a short label
    // (at most 11 characters), look for wrapped label lines above and remember them as a
    // prefix. Lines absorbed this way are recorded in `consumed`.
    let mut consumed: Vec<usize> = Vec::new();
    let mut prefixes: Vec<(usize, String)> = Vec::new();
    for &index in &row_indices[data_start..] {
        if !line_cells[index].iter().any(|cell| is_numeric_value(&cell.text)) {
            continue;
        }
        if assign_row(index)[0].trim().chars().count() > 11 {
            continue;
        }
        let mut search_used = header_indices.clone();
        search_used.extend_from_slice(&consumed);
        let chain = wrapped_label_above(lines, line_cells, index, first_column_left, &search_used);
        if !chain.is_empty() {
            let prefix = chain
                .iter()
                .map(|&line| text_line_plain_text(&lines[line]))
                .collect::<Vec<_>>()
                .join(" ");
            prefixes.push((index, prefix));
            consumed.extend(chain);
        }
    }

    // Second pass: emit the data rows, skipping consumed label lines and prose lines and
    // prepending any wrapped label prefix to the label column.
    let mut prose_skipped: Vec<usize> = Vec::new();
    for &index in &row_indices[data_start..] {
        if consumed.contains(&index) {
            continue;
        }
        if cells_contain_prose(&line_cells[index]) {
            prose_skipped.push(index);
            continue;
        }
        let mut row = assign_row(index);
        if let Some((_, prefix)) = prefixes.iter().find(|(line, _)| *line == index) {
            row[0] = if row[0].trim().is_empty() {
                prefix.clone()
            } else {
                format!("{prefix} {}", row[0])
            };
        }
        if row.iter().all(|cell| cell.is_empty()) {
            continue;
        }
        // Body rows are numbered after the header row when one exists.
        let table_row = rows.len() + usize::from(header_has_text);
        for (column, text) in row.iter().enumerate() {
            cell_records.push(table_cell(table_row, column, text.clone(), false));
        }
        rows.push(row);
    }
    if rows.is_empty() {
        return None;
    }

    // Acceptance gates. Note that `value_rows` counts rows with a non-empty label cell.
    let value_rows = rows.iter().filter(|row| !row[0].trim().is_empty()).count();
    let label_only_rows = rows
        .iter()
        .filter(|row| !row[0].trim().is_empty() && row[1..].iter().all(|cell| cell.trim().is_empty()))
        .count();
    let data_with_figures = rows
        .iter()
        .filter(|row| row[1..].iter().any(|cell| !cell.trim().is_empty()))
        .count();
    let multi_section = label_only_rows >= 2 && value_rows >= 8;
    let wide_table = columns.len() >= 5 && value_rows >= 6;
    // At least six rows with figures are required, plus either shape signal.
    if data_with_figures < 6 || !(multi_section || wide_table) {
        return None;
    }

    // Lines claimed by the table: span rows, headers and absorbed label lines, minus the
    // prose lines that were skipped.
    let mut line_index_set: Vec<usize> = row_indices.to_vec();
    line_index_set.extend(header_indices.iter().copied());
    line_index_set.extend(consumed.iter().copied());
    line_index_set.retain(|index| !prose_skipped.contains(index));
    line_index_set.sort_unstable();
    line_index_set.dedup();
    let bbox = union_boxes(line_index_set.iter().map(|&index| lines[index].bbox))?;

    Some(DetectedTable {
        table: TableBlock {
            headers: if header_has_text {
                header_cells
            } else {
                Vec::new()
            },
            rows,
            caption: None,
            bbox: Some(bbox),
            cells: cell_records,
            source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
            confidence: Some(Confidence {
                score: 0.7,
                calibrated: false,
            }), ..Default::default()
        },
        line_indices: line_index_set,
    })
}
3179
3180fn assign_cell_column(cell: &TextRun, columns: &[f32], first_column_left: f32) -> usize {
3186 if is_numeric_value(&cell.text) {
3187 if let Some(column) = nearest_column(cell.bbox.x + cell.bbox.width, columns) {
3188 return column + 1;
3189 }
3190 }
3191 column_band(cell, columns, first_column_left)
3195}
3196
3197fn column_band(cell: &TextRun, columns: &[f32], first_column_left: f32) -> usize {
3201 let center = cell.bbox.x + cell.bbox.width / 2.0;
3202 if center < first_column_left {
3203 return 0;
3204 }
3205 for index in 0..columns.len() {
3206 let upper = columns
3207 .get(index + 1)
3208 .map_or(f32::INFINITY, |next| (columns[index] + next) / 2.0);
3209 if center <= upper {
3210 return index + 1;
3211 }
3212 }
3213 columns.len()
3214}
3215
/// Appends the trimmed `text` to `target`, separated by a single space.
///
/// Blank text is ignored; no separator is added when `target` is still empty.
fn push_table_cell_text(target: &mut String, text: &str) {
    let addition = text.trim();
    if addition.is_empty() {
        return;
    }

    let needs_separator = !target.is_empty();
    if needs_separator {
        target.push(' ');
    }
    target.push_str(addition);
}
3226
3227fn table_cell(row: usize, column: usize, text: String, is_header: bool) -> TableCell {
3228 TableCell {
3229 row,
3230 column,
3231 text,
3232 bbox: None,
3233 is_header,
3234 col_span: 1,
3235 row_span: 1,
3236 }
3237}
3238
3239fn sort_runs_reading_order(runs: &mut [TextRun]) {
3243 runs.sort_by(|a, b| {
3244 let line_a = (a.baseline_y / 3.0).round();
3245 let line_b = (b.baseline_y / 3.0).round();
3246 line_b
3247 .total_cmp(&line_a)
3248 .then(a.bbox.x.total_cmp(&b.bbox.x))
3249 });
3250}
3251
3252fn row_is_prose(cells: &[String]) -> bool {
3256 let word_counts: Vec<usize> = cells.iter().map(|c| c.split_whitespace().count()).collect();
3257 if word_counts.iter().copied().max().unwrap_or(0) >= 12 {
3258 return true;
3259 }
3260 let nonempty = cells.iter().filter(|c| !c.trim().is_empty()).count();
3261 let total_words: usize = word_counts.iter().sum();
3262 let numeric = cells.iter().filter(|c| is_value_cell(c)).count();
3263 nonempty >= 5 && total_words >= 25 && (numeric as f32) < nonempty as f32 * 0.3
3264}
3265
/// Detects a table drawn with ruling lines and fills its grid with the page's text runs.
///
/// The distinct vertical and horizontal edge positions define the grid: `n` verticals
/// give `n - 1` columns, `m` horizontals give `m - 1` rows. Returns `None` when the grid
/// has fewer than two columns or rows, when neither a nearby "table" label nor the
/// multi-row evidence check holds, when fewer than three cells contain text, or when the
/// header row or the whole body is blank.
fn detect_ruled_grid_table(
    page_number: usize,
    lines: &[TextLine],
    edges: &[GraphicEdge],
) -> Option<DetectedTable> {
    let verticals = grid_axis_values(edges, EdgeOrientation::Vertical);
    let horizontals = grid_axis_values(edges, EdgeOrientation::Horizontal);
    if verticals.len() < 2 || horizontals.len() < 2 {
        return None;
    }

    let columns = verticals.len() - 1;
    let rows = horizontals.len() - 1;
    if columns < 2 || rows < 2 {
        return None;
    }
    // Require some evidence that the rulings really are a table.
    if !has_nearby_ruled_table_label(lines, &verticals, &horizontals)
        && !has_multirow_ruled_grid_evidence(columns, rows)
    {
        return None;
    }

    // Per grid cell: the runs that fall into it and the union of their bounding boxes.
    let mut grid_runs: Vec<Vec<Vec<TextRun>>> = vec![vec![Vec::new(); columns]; rows];
    let mut cell_boxes = vec![vec![None; columns]; rows];
    let mut line_indices = Vec::new();

    for (line_index, line) in lines.iter().enumerate() {
        let mut used_line = false;
        for run in &line.runs {
            // A run belongs to the cell that contains its centre point; runs outside the
            // grid are ignored.
            let center_x = run.bbox.x + run.bbox.width / 2.0;
            let center_y = run.bbox.y + run.bbox.height / 2.0;
            let Some(column) = grid_column_for(center_x, &verticals) else {
                continue;
            };
            let Some(row) = grid_row_for(center_y, &horizontals) else {
                continue;
            };
            grid_runs[row][column].push(run.clone());
            cell_boxes[row][column] = Some(
                cell_boxes[row][column]
                    .and_then(|bbox| union_boxes([bbox, run.bbox]))
                    .unwrap_or(run.bbox),
            );
            used_line = true;
        }
        // A line is claimed by the table as soon as one of its runs landed in the grid.
        if used_line {
            line_indices.push(line_index);
        }
    }

    let mut grid = vec![vec![String::new(); columns]; rows];
    let mut prose_rows = vec![false; rows];
    for row in 0..rows {
        let mut cell_texts = vec![String::new(); columns];
        for column in 0..columns {
            if grid_runs[row][column].is_empty() {
                continue;
            }
            let mut runs = grid_runs[row][column].clone();
            sort_runs_reading_order(&mut runs);
            cell_texts[column] = clean_pdf_line_text(&join_runs_spaced(&runs));
        }
        if row_is_prose(&cell_texts) {
            // Prose row: merge all runs of the row into the first cell, in reading
            // order, and leave the other cells of the row empty.
            prose_rows[row] = true;
            let mut all: Vec<TextRun> = grid_runs[row].iter().flatten().cloned().collect();
            sort_runs_reading_order(&mut all);
            grid[row][0] = clean_pdf_line_text(&join_runs_spaced(&all));
        } else {
            grid[row] = cell_texts;
        }
    }

    // Too little text for a meaningful table.
    if grid
        .iter()
        .flatten()
        .filter(|text| !text.trim().is_empty())
        .count()
        < 3
    {
        return None;
    }

    // The first grid row is the header; the remaining rows form the body.
    let headers = grid[0].clone();
    let body_rows = grid.iter().skip(1).cloned().collect::<Vec<_>>();
    if headers.iter().all(|text| text.trim().is_empty())
        || body_rows
            .iter()
            .flatten()
            .all(|text| text.trim().is_empty())
    {
        return None;
    }

    let (mut col_span, mut covered) = merged_cell_col_spans(&cell_boxes, &verticals);
    // A prose row becomes a single cell spanning every column.
    for row in 0..rows {
        if prose_rows[row] {
            covered[row][0] = false;
            col_span[row][0] = columns;
            for column in 1..columns {
                covered[row][column] = true;
            }
        }
    }

    let mut cells = Vec::new();
    for row in 0..rows {
        for column in 0..columns {
            // Cells covered by a span to their left are not emitted.
            if covered[row][column] {
                continue;
            }
            cells.push(TableCell {
                row,
                column,
                text: grid[row][column].clone(),
                bbox: cell_boxes[row][column],
                is_header: row == 0,
                col_span: col_span[row][column],
                row_span: 1,
            });
        }
    }

    // The table box spans from the first to the last ruling on each axis.
    let bbox = BBox {
        x: *verticals.first()?,
        y: *horizontals.first()?,
        width: *verticals.last()? - *verticals.first()?,
        height: *horizontals.last()? - *horizontals.first()?,
    };

    Some(DetectedTable {
        table: TableBlock {
            headers,
            rows: body_rows,
            caption: None,
            bbox: Some(bbox),
            cells,
            source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
            confidence: Some(Confidence {
                score: 0.7,
                calibrated: false,
            }), ..Default::default()
        },
        line_indices,
    })
}
3423
3424fn merged_cell_col_spans(
3439 cell_boxes: &[Vec<Option<BBox>>],
3440 verticals: &[f32],
3441) -> (Vec<Vec<usize>>, Vec<Vec<bool>>) {
3442 const SPAN_MARGIN: f32 = 2.0;
3443 let rows = cell_boxes.len();
3444 let columns = cell_boxes.first().map_or(0, Vec::len);
3445 let mut col_span = vec![vec![1usize; columns]; rows];
3446 let mut covered = vec![vec![false; columns]; rows];
3447
3448 for row in 0..rows {
3449 for column in 0..columns {
3450 if covered[row][column] {
3451 continue;
3452 }
3453 let Some(bbox) = cell_boxes[row][column] else {
3454 continue;
3455 };
3456
3457 let content_right = bbox.x + bbox.width;
3458 let mut next_column = column + 1;
3459 while next_column < columns
3460 && cell_boxes[row][next_column].is_none()
3461 && !covered[row][next_column]
3462 && verticals
3463 .get(next_column)
3464 .is_some_and(|edge| content_right > edge + SPAN_MARGIN)
3465 {
3466 covered[row][next_column] = true;
3467 next_column += 1;
3468 }
3469 col_span[row][column] = next_column - column;
3470 }
3471 }
3472
3473 (col_span, covered)
3474}
3475
3476fn has_nearby_ruled_table_label(
3477 lines: &[TextLine],
3478 verticals: &[f32],
3479 horizontals: &[f32],
3480) -> bool {
3481 let Some(left) = verticals.first().copied() else {
3482 return false;
3483 };
3484 let Some(right) = verticals.last().copied() else {
3485 return false;
3486 };
3487 let Some(top) = horizontals.last().copied() else {
3488 return false;
3489 };
3490
3491 lines.iter().any(|line| {
3492 let text = text_line_plain_text(line).to_ascii_lowercase();
3493 text.starts_with("table")
3494 && line.bbox.y >= top
3495 && line.bbox.y <= top + 96.0
3496 && line.bbox.x <= right + 24.0
3497 && line.bbox.x + line.bbox.width >= left - 24.0
3498 })
3499}
3500
/// Returns `true` when the grid has at least two columns and at least four rows.
fn has_multirow_ruled_grid_evidence(columns: usize, rows: usize) -> bool {
    const MIN_COLUMNS: usize = 2;
    const MIN_ROWS: usize = 4;
    rows >= MIN_ROWS && columns >= MIN_COLUMNS
}
3504
/// Axis selector used when collecting grid ruling positions from graphic edges.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum EdgeOrientation {
    /// Selects horizontal edges; their y position is collected.
    Horizontal,
    /// Selects vertical edges; their x position is collected.
    Vertical,
}
3510
3511fn grid_axis_values(edges: &[GraphicEdge], orientation: EdgeOrientation) -> Vec<f32> {
3512 let mut values = edges
3513 .iter()
3514 .filter_map(|edge| match orientation {
3515 EdgeOrientation::Horizontal if is_horizontal_edge(edge) => {
3516 Some((edge.y0 + edge.y1) / 2.0)
3517 }
3518 EdgeOrientation::Vertical if is_vertical_edge(edge) => Some((edge.x0 + edge.x1) / 2.0),
3519 _ => None,
3520 })
3521 .collect::<Vec<_>>();
3522 values.sort_by(f32::total_cmp);
3523 dedup_axis_values(values, 2.0)
3524}
3525
3526fn is_horizontal_edge(edge: &GraphicEdge) -> bool {
3527 (edge.y0 - edge.y1).abs() <= 1.0 && (edge.x0 - edge.x1).abs() >= 12.0
3528}
3529
3530fn is_vertical_edge(edge: &GraphicEdge) -> bool {
3531 (edge.x0 - edge.x1).abs() <= 1.0 && (edge.y0 - edge.y1).abs() >= 12.0
3532}
3533
/// Merges neighbouring values that lie within `tolerance` of each other.
///
/// Values are processed in the given order. A value within `tolerance` of the most recent
/// kept entry replaces that entry with the average of the two; any other value starts a
/// new entry.
fn dedup_axis_values(values: Vec<f32>, tolerance: f32) -> Vec<f32> {
    let mut merged: Vec<f32> = Vec::new();
    for value in values {
        match merged.last_mut() {
            Some(previous) if (value - *previous).abs() <= tolerance => {
                *previous = (*previous + value) / 2.0;
            }
            _ => merged.push(value),
        }
    }
    merged
}
3547
/// Returns the index of the first column band that contains `x`.
///
/// Band `i` lies between `verticals[i]` and `verticals[i + 1]`, widened by 1.0 on both
/// sides. Returns `None` when `x` is outside every band.
fn grid_column_for(x: f32, verticals: &[f32]) -> Option<usize> {
    verticals
        .iter()
        .zip(verticals.iter().skip(1))
        .position(|(left, right)| (left - 1.0..=right + 1.0).contains(&x))
}
3553
/// Returns the table row index for the vertical position `y`.
///
/// Bands are formed between consecutive `horizontals` (widened by 1.0 on both sides) and
/// the first matching band is used. The band index is then flipped, so the band between
/// the last two rulings is row 0. Returns `None` when `y` is outside every band.
fn grid_row_for(y: f32, horizontals: &[f32]) -> Option<usize> {
    let last_band = horizontals.len().saturating_sub(2);
    horizontals
        .windows(2)
        .position(|pair| pair[0] - 1.0 <= y && y <= pair[1] + 1.0)
        .map(|band| last_band.saturating_sub(band))
}
3560
3561
3562fn detect_exact_run_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
3563 let candidate_lines = lines
3564 .iter()
3565 .enumerate()
3566 .filter(|(_, line)| line.runs.len() >= 2)
3567 .collect::<Vec<_>>();
3568 if candidate_lines.len() < 2 {
3569 return None;
3570 }
3571
3572 let width = candidate_lines[0].1.runs.len();
3573 if !candidate_lines.iter().all(|(_, line)| {
3574 line.runs.len() == width && columns_align(&candidate_lines[0].1.runs, &line.runs)
3575 }) {
3576 return None;
3577 }
3578 if !has_table_evidence(&candidate_lines) {
3579 return None;
3580 }
3581
3582 let headers = candidate_lines[0]
3583 .1
3584 .runs
3585 .iter()
3586 .map(|run| run.text.trim().to_owned())
3587 .collect::<Vec<_>>();
3588 let rows = candidate_lines
3589 .iter()
3590 .skip(1)
3591 .map(|(_, line)| {
3592 line.runs
3593 .iter()
3594 .map(|run| run.text.trim().to_owned())
3595 .collect::<Vec<_>>()
3596 })
3597 .collect::<Vec<_>>();
3598 let bbox = union_boxes(candidate_lines.iter().map(|(_, line)| line.bbox))?;
3599 let mut cells = Vec::new();
3600
3601 for (row_index, (_, line)) in candidate_lines.iter().enumerate() {
3602 for (column_index, run) in line.runs.iter().enumerate() {
3603 cells.push(TableCell {
3604 row: row_index,
3605 column: column_index,
3606 text: run.text.clone(),
3607 bbox: Some(run.bbox),
3608 is_header: row_index == 0,
3609 col_span: 1,
3610 row_span: 1,
3611 });
3612 }
3613 }
3614
3615 Some(DetectedTable {
3616 table: TableBlock {
3617 headers,
3618 rows,
3619 caption: None,
3620 bbox: Some(bbox),
3621 cells,
3622 source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
3623 confidence: Some(Confidence {
3624 score: 0.72,
3625 calibrated: false,
3626 }), ..Default::default()
3627 },
3628 line_indices: candidate_lines
3629 .iter()
3630 .map(|(line_index, _)| *line_index)
3631 .collect(),
3632 })
3633}
3634
3635fn detect_implied_alignment_table(page_number: usize, lines: &[TextLine]) -> Option<DetectedTable> {
3636 let row_candidates = lines
3637 .iter()
3638 .enumerate()
3639 .filter_map(|(line_index, line)| {
3640 let cells = implied_table_cells(line);
3641 (cells.len() >= 3 && row_has_numeric_table_evidence(&cells))
3642 .then_some(TableRowCandidate { line_index, cells })
3643 })
3644 .collect::<Vec<_>>();
3645 let group = best_aligned_table_row_group(&row_candidates)?;
3646 if !has_nearby_table_label(lines, &group) && !has_strong_numeric_table_evidence(&group) {
3651 return None;
3652 }
3653 build_implied_alignment_table(page_number, lines, &group)
3654}
3655
3656fn has_strong_numeric_table_evidence(rows: &[TableRowCandidate]) -> bool {
3661 let columns = rows.first().map_or(0, |row| row.cells.len());
3662 if rows.len() < 4 || columns < 3 {
3663 return false;
3664 }
3665 let numeric_rows = rows
3666 .iter()
3667 .filter(|row| row_has_numeric_table_evidence(&row.cells))
3668 .count();
3669 numeric_rows * 4 >= rows.len() * 3
3670}
3671
3672fn has_nearby_table_label(lines: &[TextLine], rows: &[TableRowCandidate]) -> bool {
3673 let Some(first_row) = rows.first() else {
3674 return false;
3675 };
3676 let first_y = first_row
3677 .cells
3678 .iter()
3679 .map(|cell| cell.bbox.y)
3680 .reduce(f32::max)
3681 .unwrap_or_default();
3682 let table_left = first_row
3683 .cells
3684 .iter()
3685 .map(|cell| cell.bbox.x)
3686 .reduce(f32::min)
3687 .unwrap_or_default();
3688 let table_right = first_row
3689 .cells
3690 .iter()
3691 .map(|cell| cell.bbox.x + cell.bbox.width)
3692 .reduce(f32::max)
3693 .unwrap_or_default();
3694
3695 lines.iter().any(|line| {
3696 let text = text_line_plain_text(line).to_ascii_lowercase();
3697 text.starts_with("table")
3698 && line.bbox.y >= first_y
3699 && line.bbox.y <= first_y + 96.0
3700 && line.bbox.x <= table_right + 24.0
3701 && line.bbox.x + line.bbox.width >= table_left - 24.0
3702 })
3703}
3704
3705fn implied_table_cells(line: &TextLine) -> Vec<TextRun> {
3706 if line.runs.len() < 2 {
3707 return line.runs.clone();
3708 }
3709
3710 let mut runs = line.runs.clone();
3711 runs.sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
3712 let threshold = implied_cell_gap_threshold(line);
3713 let mut groups: Vec<Vec<TextRun>> = Vec::new();
3714 let mut current: Vec<TextRun> = Vec::new();
3715
3716 for run in runs {
3717 if let Some(previous) = current.last() {
3718 let gap = run.bbox.x - (previous.bbox.x + previous.bbox.width);
3719 let starts_currency = run.text.trim_start().starts_with('$');
3727 let previous_attaches_currency = matches!(previous.text.trim(), "$" | "(" | "($");
3731 if gap >= threshold || (starts_currency && !previous_attaches_currency) {
3732 groups.push(std::mem::take(&mut current));
3733 }
3734 }
3735 current.push(run);
3736 }
3737 if !current.is_empty() {
3738 groups.push(current);
3739 }
3740
3741 groups
3742 .into_iter()
3743 .filter_map(|runs| text_run_from_cell_runs(&runs))
3744 .collect()
3745}
3746
3747fn implied_cell_gap_threshold(line: &TextLine) -> f32 {
3748 let height = average_run_size(line).max(line.bbox.height);
3749 (height * 1.5).clamp(10.0, 18.0)
3750}
3751
3752fn text_run_from_cell_runs(runs: &[TextRun]) -> Option<TextRun> {
3753 let bbox = union_boxes(runs.iter().map(|run| run.bbox))?;
3754 let text = clean_pdf_line_text(&join_runs_spaced(runs));
3755 if text.is_empty() {
3756 return None;
3757 }
3758
3759 Some(TextRun {
3760 text,
3761 bbox,
3762 baseline_y: runs.iter().map(|run| run.baseline_y).sum::<f32>() / runs.len() as f32,
3763 font: runs.iter().find_map(|run| run.font.clone()),
3764 size: runs.iter().map(|run| run.size).sum::<f32>() / runs.len() as f32,
3765 space_width: runs.iter().map(|run| run.space_width).fold(0.0, f32::max),
3766 bold: !runs.is_empty() && runs.iter().all(|run| run.bold),
3767 italic: !runs.is_empty() && runs.iter().all(|run| run.italic),
3768 source_object_ids: source_ids_for_runs(runs),
3769 })
3770}
3771
3772fn row_has_numeric_table_evidence(cells: &[TextRun]) -> bool {
3773 cells.iter().skip(1).any(|cell| {
3774 cell.text
3775 .chars()
3776 .any(|character| character.is_ascii_digit())
3777 })
3778}
3779
3780fn best_aligned_table_row_group(rows: &[TableRowCandidate]) -> Option<Vec<TableRowCandidate>> {
3781 let mut best: Option<Vec<TableRowCandidate>> = None;
3782 let mut current: Vec<TableRowCandidate> = Vec::new();
3783
3784 for row in rows {
3785 if current.is_empty() {
3786 current.push(row.clone());
3787 continue;
3788 }
3789
3790 let compatible = current
3791 .first()
3792 .is_some_and(|first| table_rows_align(first, row))
3793 && current
3794 .last()
3795 .is_some_and(|previous| table_row_vertical_gap(previous, row) <= 28.0);
3796 if compatible {
3797 current.push(row.clone());
3798 } else {
3799 record_table_row_group(&mut best, ¤t);
3800 current.clear();
3801 current.push(row.clone());
3802 }
3803 }
3804 record_table_row_group(&mut best, ¤t);
3805 best
3806}
3807
3808fn record_table_row_group(
3809 best: &mut Option<Vec<TableRowCandidate>>,
3810 candidate: &[TableRowCandidate],
3811) {
3812 if candidate.len() < 2 {
3813 return;
3814 }
3815 let Some(width) = candidate.first().map(|row| row.cells.len()) else {
3816 return;
3817 };
3818 if width < 3 {
3819 return;
3820 }
3821 let score = candidate.len() * width;
3822 let best_score = best
3823 .as_ref()
3824 .and_then(|rows| rows.first().map(|row| rows.len() * row.cells.len()))
3825 .unwrap_or_default();
3826 if score > best_score {
3827 *best = Some(candidate.to_vec());
3828 }
3829}
3830
3831fn table_rows_align(first: &TableRowCandidate, next: &TableRowCandidate) -> bool {
3832 first.cells.len() == next.cells.len()
3833 && first
3834 .cells
3835 .iter()
3836 .zip(&next.cells)
3837 .all(|(left, right)| cells_column_aligned(left, right))
3838}
3839
3840fn cells_column_aligned(left: &TextRun, right: &TextRun) -> bool {
3844 let left_edge = (left.bbox.x - right.bbox.x).abs() <= 14.0;
3845 let right_edge =
3846 ((left.bbox.x + left.bbox.width) - (right.bbox.x + right.bbox.width)).abs() <= 14.0;
3847 left_edge || right_edge
3848}
3849
3850fn table_row_vertical_gap(previous: &TableRowCandidate, next: &TableRowCandidate) -> f32 {
3851 let previous_y = previous
3852 .cells
3853 .iter()
3854 .map(|cell| cell.bbox.y)
3855 .reduce(f32::max)
3856 .unwrap_or_default();
3857 let next_y = next
3858 .cells
3859 .iter()
3860 .map(|cell| cell.bbox.y)
3861 .reduce(f32::max)
3862 .unwrap_or_default();
3863 (previous_y - next_y).abs()
3864}
3865
3866fn build_implied_alignment_table(
3867 page_number: usize,
3868 lines: &[TextLine],
3869 rows: &[TableRowCandidate],
3870) -> Option<DetectedTable> {
3871 let columns = rows.first()?.cells.len();
3872 let bbox = union_boxes(
3873 rows.iter()
3874 .flat_map(|row| row.cells.iter().map(|cell| cell.bbox)),
3875 )?;
3876 let header = implied_table_header(lines, rows, columns);
3877 let has_explicit_header = header.has_text();
3878 let mut line_indices = rows.iter().map(|row| row.line_index).collect::<Vec<_>>();
3879 line_indices.extend(header.line_indices.iter().copied());
3880 line_indices.sort_unstable();
3881 line_indices.dedup();
3882
3883 let (headers, body_rows, header_cells) = if has_explicit_header {
3884 (
3885 header
3886 .cells
3887 .iter()
3888 .map(|cell| {
3889 cell.as_ref()
3890 .map(|cell| cell.text.clone())
3891 .unwrap_or_default()
3892 })
3893 .collect::<Vec<_>>(),
3894 rows.iter()
3895 .map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
3896 .collect::<Vec<Vec<_>>>(),
3897 header.cells,
3898 )
3899 } else {
3900 (
3901 rows.first()?
3902 .cells
3903 .iter()
3904 .map(|cell| cell.text.clone())
3905 .collect::<Vec<_>>(),
3906 rows.iter()
3907 .skip(1)
3908 .map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
3909 .collect::<Vec<Vec<_>>>(),
3910 rows.first()?.cells.iter().cloned().map(Some).collect(),
3911 )
3912 };
3913
3914 let mut cells = Vec::new();
3915 for (column, cell) in header_cells.into_iter().enumerate() {
3916 let text = headers.get(column).cloned().unwrap_or_default();
3917 cells.push(TableCell {
3918 row: 0,
3919 column,
3920 text,
3921 bbox: cell.map(|cell| cell.bbox),
3922 is_header: true,
3923 col_span: 1,
3924 row_span: 1,
3925 });
3926 }
3927 for (row_index, row) in rows.iter().enumerate() {
3928 let table_row = if has_explicit_header {
3929 row_index + 1
3930 } else {
3931 row_index
3932 };
3933 if !has_explicit_header && row_index == 0 {
3934 continue;
3935 }
3936 for (column, cell) in row.cells.iter().enumerate() {
3937 cells.push(TableCell {
3938 row: table_row,
3939 column,
3940 text: cell.text.clone(),
3941 bbox: Some(cell.bbox),
3942 is_header: false,
3943 col_span: 1,
3944 row_span: 1,
3945 });
3946 }
3947 }
3948
3949 Some(DetectedTable {
3950 table: TableBlock {
3951 headers,
3952 rows: body_rows,
3953 caption: None,
3954 bbox: Some(bbox),
3955 cells,
3956 source_anchors: vec![anchor(page_number, Some(bbox), Vec::new())],
3957 confidence: Some(Confidence {
3958 score: 0.68,
3959 calibrated: false,
3960 }), ..Default::default()
3961 },
3962 line_indices,
3963 })
3964}
3965
3966#[derive(Debug, Clone)]
3967struct ImpliedTableHeader {
3968 cells: Vec<Option<TextRun>>,
3969 line_indices: Vec<usize>,
3970}
3971
3972impl ImpliedTableHeader {
3973 fn has_text(&self) -> bool {
3974 self.cells
3975 .iter()
3976 .any(|cell| cell.as_ref().is_some_and(|cell| !cell.text.is_empty()))
3977 }
3978}
3979
3980fn implied_table_header(
3981 lines: &[TextLine],
3982 rows: &[TableRowCandidate],
3983 columns: usize,
3984) -> ImpliedTableHeader {
3985 let mut header = ImpliedTableHeader {
3986 cells: vec![None; columns],
3987 line_indices: Vec::new(),
3988 };
3989 let Some(first_row) = rows.first() else {
3990 return header;
3991 };
3992 let first_y = first_row
3993 .cells
3994 .iter()
3995 .map(|cell| cell.bbox.y)
3996 .reduce(f32::max)
3997 .unwrap_or_default();
3998 let table_left = first_row
3999 .cells
4000 .iter()
4001 .map(|cell| cell.bbox.x)
4002 .reduce(f32::min)
4003 .unwrap_or_default();
4004 let table_right = first_row
4005 .cells
4006 .iter()
4007 .map(|cell| cell.bbox.x + cell.bbox.width)
4008 .reduce(f32::max)
4009 .unwrap_or_default();
4010 let column_refs = first_row
4011 .cells
4012 .iter()
4013 .map(|cell| (cell.bbox.x, cell.bbox.x + cell.bbox.width))
4014 .collect::<Vec<_>>();
4015
4016 let mut candidates = lines
4017 .iter()
4018 .enumerate()
4019 .filter(|(line_index, line)| {
4020 !rows.iter().any(|row| row.line_index == *line_index)
4021 && line.bbox.y > first_y
4022 && line.bbox.y <= first_y + 80.0
4023 && line.bbox.x <= table_right + 12.0
4024 && line.bbox.x + line.bbox.width >= table_left - 12.0
4025 && !text_line_plain_text(line)
4026 .to_ascii_lowercase()
4027 .starts_with("table ")
4028 && !line_is_data_row(line, columns)
4032 })
4033 .collect::<Vec<_>>();
4034 candidates.sort_by(|left, right| right.1.bbox.y.total_cmp(&left.1.bbox.y));
4035
4036 for (line_index, line) in candidates {
4037 let mut used_line = false;
4038 for cell in implied_table_cells(line) {
4039 if cell.text.chars().count() > 40 {
4040 continue;
4041 }
4042 let Some(column) = nearest_table_column(&cell, &column_refs) else {
4043 continue;
4044 };
4045 append_header_cell(&mut header.cells[column], cell);
4046 used_line = true;
4047 }
4048 if used_line {
4049 header.line_indices.push(line_index);
4050 }
4051 }
4052
4053 header
4054}
4055
4056fn line_is_data_row(line: &TextLine, columns: usize) -> bool {
4060 let cells = implied_table_cells(line);
4061 cells.len() >= columns && row_has_numeric_table_evidence(&cells)
4062}
4063
4064fn nearest_table_column(cell: &TextRun, column_refs: &[(f32, f32)]) -> Option<usize> {
4068 let cell_center = cell.bbox.x + cell.bbox.width / 2.0;
4069 let (column, distance) = column_refs
4070 .iter()
4071 .enumerate()
4072 .map(|(index, (left, right))| {
4073 let column_center = (left + right) / 2.0;
4074 (index, (cell_center - column_center).abs())
4075 })
4076 .min_by(|left, right| left.1.total_cmp(&right.1))?;
4077 let (left, right) = column_refs[column];
4078 let tolerance = ((right - left) / 2.0 + 18.0).max(24.0);
4079 (distance <= tolerance).then_some(column)
4080}
4081
4082fn append_header_cell(target: &mut Option<TextRun>, fragment: TextRun) {
4083 if let Some(existing) = target {
4084 if !existing.text.is_empty() {
4085 existing.text.push(' ');
4086 }
4087 existing.text.push_str(&fragment.text);
4088 existing.bbox = union_boxes([existing.bbox, fragment.bbox]).unwrap_or(existing.bbox);
4089 for id in fragment.source_object_ids {
4090 if !existing.source_object_ids.contains(&id) {
4091 existing.source_object_ids.push(id);
4092 }
4093 }
4094 } else {
4095 *target = Some(fragment);
4096 }
4097}
4098
4099fn has_table_evidence(candidate_lines: &[(usize, &TextLine)]) -> bool {
4100 if candidate_lines.len() >= 3 {
4101 return true;
4102 }
4103 candidate_lines
4104 .iter()
4105 .skip(1)
4106 .flat_map(|(_, line)| line.runs.iter())
4107 .any(|run| run.text.chars().any(|character| character.is_ascii_digit()))
4108}
4109
4110fn columns_align(first: &[TextRun], next: &[TextRun]) -> bool {
4111 first
4112 .iter()
4113 .zip(next)
4114 .all(|(left, right)| (left.bbox.x - right.bbox.x).abs() <= 6.0)
4115}
4116
/// Maps a point through a rotation of 90, 180 or 270 degrees.
///
/// `rotation` is normalised with `rem_euclid(360)`; any value other than 90, 180 or 270
/// leaves the point unchanged.
fn rotate_point(x: f32, y: f32, rotation: i32, width: f32, height: f32) -> (f32, f32) {
    let normalized = rotation.rem_euclid(360);
    if normalized == 90 {
        (y, width - x)
    } else if normalized == 180 {
        (width - x, height - y)
    } else if normalized == 270 {
        (height - y, x)
    } else {
        (x, y)
    }
}
4128
4129fn rotate_bbox(bbox: BBox, rotation: i32, width: f32, height: f32) -> BBox {
4132 if rotation.rem_euclid(360) == 0 {
4133 return bbox;
4134 }
4135 let (x0, y0) = rotate_point(bbox.x, bbox.y, rotation, width, height);
4136 let (x1, y1) = rotate_point(bbox.x + bbox.width, bbox.y + bbox.height, rotation, width, height);
4137 BBox {
4138 x: x0.min(x1),
4139 y: y0.min(y1),
4140 width: (x1 - x0).abs(),
4141 height: (y1 - y0).abs(),
4142 }
4143}
4144
4145fn group_text_runs(mut runs: Vec<TextRun>) -> Vec<TextLine> {
4146 runs.sort_by(|left, right| {
4147 right
4148 .baseline_y
4149 .total_cmp(&left.baseline_y)
4150 .then(left.bbox.x.total_cmp(&right.bbox.x))
4151 });
4152
4153 let mut lines: Vec<TextLine> = Vec::new();
4154 for run in runs {
4155 if let Some(line) = lines
4159 .iter_mut()
4160 .find(|line| (line.baseline_y - run.baseline_y).abs() <= 3.0)
4161 {
4162 line.bbox = union_boxes([line.bbox, run.bbox]).unwrap_or(line.bbox);
4163 line.baseline_y = line.baseline_y.min(run.baseline_y);
4167 line.runs.push(run);
4168 } else {
4169 lines.push(TextLine {
4170 baseline_y: run.baseline_y,
4171 bbox: run.bbox,
4172 runs: vec![run],
4173 });
4174 }
4175 }
4176
4177 for line in &mut lines {
4180 line.runs
4181 .sort_by(|left, right| left.bbox.x.total_cmp(&right.bbox.x));
4182 }
4183
4184 lines
4185}
4186
4187fn parse_content_ops(bytes: &[u8]) -> Vec<ContentOp> {
4188 let mut parser = ContentParser::new(bytes);
4189 let mut stack = Vec::new();
4190 let mut ops = Vec::new();
4191
4192 while let Some(token) = parser.next_operand_or_operator() {
4193 match token {
4194 ContentToken::Operand(operand) => stack.push(operand),
4195 ContentToken::Operator(operator) => {
4196 ops.push(ContentOp {
4197 operands: std::mem::take(&mut stack),
4198 operator,
4199 });
4200 }
4201 }
4202 }
4203
4204 ops
4205}
4206
4207#[derive(Debug)]
4208enum ContentToken {
4209 Operand(Operand),
4210 Operator(String),
4211}
4212
/// Cursor-based tokenizer over the raw bytes of a content stream.
struct ContentParser<'a> {
    /// The bytes being tokenised.
    bytes: &'a [u8],
    /// Index of the next unread byte in `bytes`.
    pos: usize,
}
4217
4218impl<'a> ContentParser<'a> {
4219 fn new(bytes: &'a [u8]) -> Self {
4220 Self { bytes, pos: 0 }
4221 }
4222
4223 fn next_operand_or_operator(&mut self) -> Option<ContentToken> {
4224 self.skip_ws_and_comments();
4225 if self.pos >= self.bytes.len() {
4226 return None;
4227 }
4228
4229 let byte = self.bytes[self.pos];
4230 match byte {
4231 b'/' => Some(ContentToken::Operand(Operand::Name(self.read_name()))),
4232 b'(' => Some(ContentToken::Operand(Operand::Literal(self.read_literal()))),
4233 b'[' => Some(ContentToken::Operand(Operand::Array(self.read_array()))),
4234 b'<' if self.peek(1) != Some(b'<') => {
4235 Some(ContentToken::Operand(Operand::Hex(self.read_hex_string())))
4236 }
4237 b'+' | b'-' | b'.' | b'0'..=b'9' => self
4238 .read_number()
4239 .map(|number| ContentToken::Operand(Operand::Number(number))),
4240 _ => {
4241 let word = self.read_word();
4242 if word.is_empty() {
4243 self.pos += 1;
4244 Some(ContentToken::Operand(Operand::Other))
4245 } else {
4246 Some(ContentToken::Operator(word))
4247 }
4248 }
4249 }
4250 }
4251
4252 fn read_array(&mut self) -> Vec<Operand> {
4253 self.pos += 1;
4254 let mut items = Vec::new();
4255 loop {
4256 self.skip_ws_and_comments();
4257 if self.pos >= self.bytes.len() || self.bytes[self.pos] == b']' {
4258 self.pos = (self.pos + 1).min(self.bytes.len());
4259 break;
4260 }
4261
4262 match self.next_operand_or_operator() {
4263 Some(ContentToken::Operand(operand)) => items.push(operand),
4264 Some(ContentToken::Operator(_)) | None => {}
4265 }
4266 }
4267 items
4268 }
4269
4270 fn read_name(&mut self) -> String {
4271 self.pos += 1;
4272 let start = self.pos;
4273 while self.pos < self.bytes.len() && !is_delimiter_or_ws(self.bytes[self.pos]) {
4274 self.pos += 1;
4275 }
4276 lossy(&self.bytes[start..self.pos])
4277 }
4278
4279 fn read_literal(&mut self) -> Vec<u8> {
4280 self.pos += 1;
4281 let mut depth = 1;
4282 let mut output = Vec::new();
4283
4284 while self.pos < self.bytes.len() && depth > 0 {
4285 let byte = self.bytes[self.pos];
4286 self.pos += 1;
4287 match byte {
4288 b'\\' => {
4289 if self.pos < self.bytes.len() {
4290 match self.bytes[self.pos] {
4291 b'n' => {
4292 output.push(b'\n');
4293 self.pos += 1;
4294 }
4295 b'r' => {
4296 output.push(b'\r');
4297 self.pos += 1;
4298 }
4299 b't' => {
4300 output.push(b'\t');
4301 self.pos += 1;
4302 }
4303 b'b' => {
4304 output.push(0x08);
4305 self.pos += 1;
4306 }
4307 b'f' => {
4308 output.push(0x0c);
4309 self.pos += 1;
4310 }
4311 b'\n' => {
4312 self.pos += 1;
4313 }
4314 b'\r' => {
4315 self.pos += 1;
4316 if self.bytes.get(self.pos) == Some(&b'\n') {
4317 self.pos += 1;
4318 }
4319 }
4320 b'0'..=b'7' => output.push(self.read_octal_escape()),
4321 other => {
4322 output.push(other);
4323 self.pos += 1;
4324 }
4325 }
4326 }
4327 }
4328 b'(' => {
4329 depth += 1;
4330 output.push(byte);
4331 }
4332 b')' => {
4333 depth -= 1;
4334 if depth > 0 {
4335 output.push(byte);
4336 }
4337 }
4338 _ => output.push(byte),
4339 }
4340 }
4341
4342 output
4343 }
4344
4345 fn read_octal_escape(&mut self) -> u8 {
4346 let mut value = 0u16;
4347 let mut digits = 0;
4348 while self.pos < self.bytes.len()
4349 && digits < 3
4350 && matches!(self.bytes[self.pos], b'0'..=b'7')
4351 {
4352 value = (value << 3) + u16::from(self.bytes[self.pos] - b'0');
4353 self.pos += 1;
4354 digits += 1;
4355 }
4356 value.min(u16::from(u8::MAX)) as u8
4357 }
4358
4359 fn read_hex_string(&mut self) -> Vec<u8> {
4360 self.pos += 1;
4361 let start = self.pos;
4362 while self.pos < self.bytes.len() && self.bytes[self.pos] != b'>' {
4363 self.pos += 1;
4364 }
4365 let raw = self.bytes[start..self.pos].to_vec();
4366 self.pos = (self.pos + 1).min(self.bytes.len());
4367 decode_hex(&raw)
4368 }
4369
4370 fn read_number(&mut self) -> Option<f32> {
4371 let start = self.pos;
4372 while self.pos < self.bytes.len()
4373 && matches!(self.bytes[self.pos], b'+' | b'-' | b'.' | b'0'..=b'9')
4374 {
4375 self.pos += 1;
4376 }
4377 std::str::from_utf8(&self.bytes[start..self.pos])
4378 .ok()
4379 .and_then(|text| text.parse().ok())
4380 }
4381
4382 fn read_word(&mut self) -> String {
4383 let start = self.pos;
4384 while self.pos < self.bytes.len() && !is_delimiter_or_ws(self.bytes[self.pos]) {
4385 self.pos += 1;
4386 }
4387 lossy(&self.bytes[start..self.pos])
4388 }
4389
4390 fn skip_ws_and_comments(&mut self) {
4391 loop {
4392 while self.pos < self.bytes.len() && is_ws(self.bytes[self.pos]) {
4393 self.pos += 1;
4394 }
4395 if self.pos < self.bytes.len() && self.bytes[self.pos] == b'%' {
4396 while self.pos < self.bytes.len() && !matches!(self.bytes[self.pos], b'\n' | b'\r')
4397 {
4398 self.pos += 1;
4399 }
4400 } else {
4401 break;
4402 }
4403 }
4404 }
4405
4406 fn peek(&self, offset: usize) -> Option<u8> {
4407 self.bytes.get(self.pos + offset).copied()
4408 }
4409}
4410
4411fn parse_indirect_objects(bytes: &[u8]) -> Vec<PdfObject> {
4412 let mut objects = Vec::new();
4413 let mut pos = 0;
4414
4415 while pos < bytes.len() {
4416 if !is_ws_or_line_start(bytes, pos) && pos != 0 {
4417 pos += 1;
4418 continue;
4419 }
4420
4421 let Some((object_number, after_object_number)) = parse_unsigned_at(bytes, pos) else {
4422 pos += 1;
4423 continue;
4424 };
4425 let Some(after_space) = skip_required_ws(bytes, after_object_number) else {
4426 pos += 1;
4427 continue;
4428 };
4429 let Some((generation, after_generation)) = parse_unsigned_at(bytes, after_space) else {
4430 pos += 1;
4431 continue;
4432 };
4433 let Some(after_space) = skip_required_ws(bytes, after_generation) else {
4434 pos += 1;
4435 continue;
4436 };
4437 if !bytes[after_space..].starts_with(b"obj") {
4438 pos += 1;
4439 continue;
4440 }
4441
4442 let body_start = after_space + 3;
4443 if let Some(relative_end) = find_subslice(&bytes[body_start..], b"endobj") {
4444 let body_end = body_start + relative_end;
4445 objects.push(PdfObject {
4446 object_number: object_number as u32,
4447 generation: generation as u16,
4448 body: bytes[body_start..body_end].to_vec(),
4449 });
4450 pos = body_end + b"endobj".len();
4451 } else {
4452 break;
4453 }
4454 }
4455
4456 objects
4457}
4458
4459fn expand_object_streams(objects: &mut Vec<PdfObject>) {
4460 let object_streams = objects
4461 .iter()
4462 .filter(|object| {
4463 lossy(&object.body)
4464 .split_whitespace()
4465 .collect::<String>()
4466 .contains("/Type/ObjStm")
4467 })
4468 .cloned()
4469 .collect::<Vec<_>>();
4470 let existing = objects
4471 .iter()
4472 .map(|object| object.object_number)
4473 .collect::<std::collections::HashSet<_>>();
4474 let mut expanded = Vec::new();
4475
4476 for object_stream in object_streams {
4477 let object_body = lossy(&object_stream.body);
4478 let Some(count) = parse_number_after(&object_body, "/N").map(|value| value as usize) else {
4479 continue;
4480 };
4481 let Some(first) = parse_number_after(&object_body, "/First").map(|value| value as usize)
4482 else {
4483 continue;
4484 };
4485 let Ok(Some(decoded)) = decode_stream_object(&object_stream) else {
4486 continue;
4487 };
4488 if first > decoded.len() {
4489 continue;
4490 }
4491
4492 let header = lossy(&decoded[..first]);
4493 let header_numbers = header
4494 .split_whitespace()
4495 .filter_map(|part| part.parse::<usize>().ok())
4496 .collect::<Vec<_>>();
4497 let mut entries = Vec::new();
4498 for pair in header_numbers.chunks_exact(2).take(count) {
4499 entries.push((pair[0] as u32, pair[1]));
4500 }
4501
4502 for (index, (object_number, offset)) in entries.iter().enumerate() {
4503 if existing.contains(object_number) {
4504 continue;
4505 }
4506 let next_offset = entries
4507 .get(index + 1)
4508 .map(|(_, next_offset)| *next_offset)
4509 .unwrap_or(decoded.len() - first);
4510 if *offset > next_offset || first + next_offset > decoded.len() {
4511 continue;
4512 }
4513 expanded.push(PdfObject {
4514 object_number: *object_number,
4515 generation: 0,
4516 body: decoded[first + *offset..first + next_offset].to_vec(),
4517 });
4518 }
4519 }
4520
4521 objects.extend(expanded);
4522}
4523
4524fn page_seed(object: &PdfObject, object_map: &HashMap<u32, Arc<PdfObject>>) -> Option<PageSeed> {
4525 let body = lossy(&object.body);
4526 let compact = body.split_whitespace().collect::<String>();
4527 if compact.contains("/Type/Page") && !compact.contains("/Type/Pages") {
4528 Some(PageSeed {
4529 number: 0,
4530 body: body_with_inherited_page_tree_entries(&body, object_map),
4531 })
4532 } else {
4533 None
4534 }
4535}
4536
4537fn body_with_inherited_page_tree_entries(
4538 page_body: &str,
4539 object_map: &HashMap<u32, Arc<PdfObject>>,
4540) -> String {
4541 let mut body = page_body.to_owned();
4542 append_parent_page_tree_entries(page_body, object_map, &mut body, 0);
4543 body
4544}
4545
4546fn append_parent_page_tree_entries(
4547 body: &str,
4548 object_map: &HashMap<u32, Arc<PdfObject>>,
4549 output: &mut String,
4550 depth: usize,
4551) {
4552 if depth >= 16 {
4553 return;
4554 }
4555 let Some(parent_ref) = parse_direct_ref_after_key(body, "/Parent") else {
4556 return;
4557 };
4558 let Some(parent) = object_map.get(&(parent_ref as u32)) else {
4559 return;
4560 };
4561 let parent_body = lossy(&parent.body);
4562 output.push('\n');
4563 output.push_str(&parent_body);
4564 append_parent_page_tree_entries(&parent_body, object_map, output, depth + 1);
4565}
4566
4567fn decode_stream_object(object: &PdfObject) -> Result<Option<Vec<u8>>> {
4568 let Some(stream_marker) = find_subslice(&object.body, b"stream") else {
4569 return Ok(None);
4570 };
4571 let Some(end_marker) = find_subslice(&object.body, b"endstream") else {
4572 return Err(DonglerError::pdf("stream is missing endstream marker"));
4573 };
4574 if end_marker <= stream_marker {
4575 return Err(DonglerError::pdf("stream markers are malformed"));
4576 }
4577
4578 let dict = lossy(&object.body[..stream_marker]);
4579 let mut stream = object.body[stream_marker + b"stream".len()..end_marker].to_vec();
4580 trim_stream_edges(&mut stream);
4581
4582 for filter in stream_filters(&dict) {
4583 stream = decode_stream_filter(&filter, &stream)?;
4584 }
4585 Ok(Some(stream))
4586}
4587
4588fn decode_stream_filter(filter: &str, stream: &[u8]) -> Result<Vec<u8>> {
4589 match filter {
4590 "FlateDecode" | "Fl" => {
4591 let mut decoder = ZlibDecoder::new(stream);
4592 let mut decoded = Vec::new();
4593 decoder
4594 .read_to_end(&mut decoded)
4595 .map_err(|error| DonglerError::pdf(format!("FlateDecode failed: {error}")))?;
4596 Ok(decoded)
4597 }
4598 "ASCII85Decode" | "A85" => ascii85_decode(stream),
4599 other => Err(DonglerError::pdf(format!(
4600 "unsupported stream filter: {other}"
4601 ))),
4602 }
4603}
4604
4605fn stream_filters(dict: &str) -> Vec<String> {
4606 let Some(mut index) = dict.find("/Filter").map(|index| index + "/Filter".len()) else {
4607 return Vec::new();
4608 };
4609 let bytes = dict.as_bytes();
4610 skip_pdf_whitespace(bytes, &mut index);
4611 if bytes.get(index) == Some(&b'[') {
4612 index += 1;
4613 let mut filters = Vec::new();
4614 while index < bytes.len() && bytes[index] != b']' {
4615 skip_pdf_whitespace(bytes, &mut index);
4616 if bytes.get(index) == Some(&b']') {
4617 break;
4618 }
4619 if bytes.get(index) == Some(&b'/') {
4620 index += 1;
4621 let start = index;
4622 while index < bytes.len() && !is_pdf_name_delimiter(bytes[index]) {
4623 index += 1;
4624 }
4625 if start < index {
4626 filters.push(dict[start..index].to_owned());
4627 }
4628 } else {
4629 index += 1;
4630 }
4631 }
4632 filters
4633 } else if bytes.get(index) == Some(&b'/') {
4634 index += 1;
4635 let start = index;
4636 while index < bytes.len() && !is_pdf_name_delimiter(bytes[index]) {
4637 index += 1;
4638 }
4639 (start < index)
4640 .then(|| vec![dict[start..index].to_owned()])
4641 .unwrap_or_default()
4642 } else {
4643 Vec::new()
4644 }
4645}
4646
/// Advances `index` past any run of NUL, tab, line feed, form feed, carriage return or
/// space bytes starting at `index`. An index at or beyond the end is left unchanged.
fn skip_pdf_whitespace(bytes: &[u8], index: &mut usize) {
    let remaining = bytes.get(*index..).unwrap_or(&[]);
    let skipped = remaining
        .iter()
        .take_while(|byte| matches!(**byte, b'\0' | b'\t' | b'\n' | b'\x0c' | b'\r' | b' '))
        .count();
    *index += skipped;
}
4655
/// Returns `true` for bytes that end a name token: the whitespace bytes NUL, tab, line
/// feed, form feed, carriage return and space, and the delimiter characters
/// `( ) < > [ ] { } / %`.
fn is_pdf_name_delimiter(byte: u8) -> bool {
    const WHITESPACE: &[u8] = b"\0\t\n\x0c\r ";
    const DELIMITERS: &[u8] = b"()<>[]{}/%";
    WHITESPACE.contains(&byte) || DELIMITERS.contains(&byte)
}
4677
4678fn ascii85_decode(bytes: &[u8]) -> Result<Vec<u8>> {
4679 let mut output = Vec::new();
4680 let mut group = Vec::new();
4681 let mut index = 0;
4682 while index < bytes.len() {
4683 let byte = bytes[index];
4684 match byte {
4685 b'\0' | b'\t' | b'\n' | b'\x0c' | b'\r' | b' ' => {}
4686 b'<' if bytes.get(index + 1) == Some(&b'~') => {
4687 index += 1;
4688 }
4689 b'~' if bytes.get(index + 1) == Some(&b'>') => break,
4690 b'z' if group.is_empty() => output.extend_from_slice(&[0, 0, 0, 0]),
4691 b'!'..=b'u' => {
4692 group.push(byte - b'!');
4693 if group.len() == 5 {
4694 output.extend_from_slice(&ascii85_group_to_bytes(&group)?);
4695 group.clear();
4696 }
4697 }
4698 _ => {
4699 return Err(DonglerError::pdf(format!(
4700 "ASCII85Decode failed: invalid byte 0x{byte:02x}"
4701 )));
4702 }
4703 }
4704 index += 1;
4705 }
4706
4707 if !group.is_empty() {
4708 if group.len() == 1 {
4709 return Err(DonglerError::pdf(
4710 "ASCII85Decode failed: dangling single digit",
4711 ));
4712 }
4713 let output_len = group.len() - 1;
4714 while group.len() < 5 {
4715 group.push(b'u' - b'!');
4716 }
4717 output.extend_from_slice(&ascii85_group_to_bytes(&group)?[..output_len]);
4718 }
4719
4720 Ok(output)
4721}
4722
4723fn ascii85_group_to_bytes(group: &[u8]) -> Result<[u8; 4]> {
4724 let mut value = 0u64;
4725 for digit in group {
4726 value = value * 85 + u64::from(*digit);
4727 }
4728 if value > u64::from(u32::MAX) {
4729 return Err(DonglerError::pdf("ASCII85Decode failed: invalid group"));
4730 }
4731 Ok((value as u32).to_be_bytes())
4732}
4733
/// Removes every leading and trailing `\n` / `\r` byte from `stream`, in place.
///
/// Each edge is trimmed with a single operation, so the cost is linear in the stream
/// length regardless of how many line-break bytes are removed.
///
/// NOTE(review): all edge line breaks are stripped, not just one end-of-line marker, so
/// binary payloads that legitimately begin or end with 0x0A/0x0D lose those bytes —
/// confirm whether callers should rely on `/Length` instead.
fn trim_stream_edges(stream: &mut Vec<u8>) {
    let trailing = stream
        .iter()
        .rev()
        .take_while(|byte| matches!(**byte, b'\n' | b'\r'))
        .count();
    stream.truncate(stream.len() - trailing);

    let leading = stream
        .iter()
        .take_while(|byte| matches!(**byte, b'\n' | b'\r'))
        .count();
    stream.drain(..leading);
}
4742
4743fn parse_refs_after_key(text: &str, key: &str) -> Vec<usize> {
4744 let Some(start) = text.find(key) else {
4745 return Vec::new();
4746 };
4747 let rest = &text[start + key.len()..];
4748 if let Some(array_start) = rest.find('[') {
4749 let before_array = rest[..array_start].trim();
4750 if before_array.is_empty() {
4751 if let Some(array_end) = rest[array_start..].find(']') {
4752 return parse_refs(&rest[array_start..array_start + array_end]);
4753 }
4754 }
4755 }
4756 parse_refs(rest).into_iter().take(1).collect()
4757}
4758
4759fn parse_direct_ref_after_key(text: &str, key: &str) -> Option<usize> {
4760 let start = text.find(key)?;
4761 let bytes = text.as_bytes();
4762 let mut pos = start + key.len();
4763 while pos < bytes.len() && is_ws(bytes[pos]) {
4764 pos += 1;
4765 }
4766 let (object, after_object) = parse_unsigned_at(bytes, pos)?;
4767 let after_space = skip_required_ws(bytes, after_object)?;
4768 let (_generation, after_generation) = parse_unsigned_at(bytes, after_space)?;
4769 let after_space = skip_required_ws(bytes, after_generation)?;
4770 if bytes.get(after_space) == Some(&b'R') {
4771 Some(object)
4772 } else {
4773 None
4774 }
4775}
4776
4777fn parse_resource_refs(text: &str, key: &str) -> HashMap<String, u32> {
4778 let Some(start) = text.find(key) else {
4779 return HashMap::new();
4780 };
4781 let rest = &text[start + key.len()..];
4782 let Some(dict_start) = rest.find("<<") else {
4783 return HashMap::new();
4784 };
4785 let Some(dict_end) = rest[dict_start + 2..].find(">>") else {
4786 return HashMap::new();
4787 };
4788 let dict = &rest[dict_start + 2..dict_start + 2 + dict_end];
4789 parse_named_refs(dict)
4790}
4791
4792fn resolve_resource_body(page_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> Option<String> {
4793 let resource_ref = parse_direct_ref_after_key(page_body, "/Resources")?;
4794 object_map
4795 .get(&(resource_ref as u32))
4796 .map(|object| lossy(&object.body))
4797}
4798
4799fn load_font_decoders(
4800 resource_text: &str,
4801 object_map: &HashMap<u32, Arc<PdfObject>>,
4802 font_cache: &HashMap<u32, Arc<FontDecoder>>,
4803) -> HashMap<String, Arc<FontDecoder>> {
4804 resolve_named_resource_refs(resource_text, "/Font", object_map)
4805 .into_iter()
4806 .map(|(name, object_number)| {
4807 let decoder = font_cache.get(&object_number).cloned().unwrap_or_else(|| {
4808 Arc::new(
4809 object_map
4810 .get(&object_number)
4811 .map(|font| font_decoder(font.as_ref(), object_map))
4812 .unwrap_or_default(),
4813 )
4814 });
4815 (name, decoder)
4816 })
4817 .collect()
4818}
4819
4820fn resolve_named_resource_refs(
4821 resource_text: &str,
4822 key: &str,
4823 object_map: &HashMap<u32, Arc<PdfObject>>,
4824) -> HashMap<String, u32> {
4825 let direct = parse_resource_refs(resource_text, key);
4826 if !direct.is_empty() {
4827 return direct;
4828 }
4829
4830 parse_direct_ref_after_key(resource_text, key)
4831 .and_then(|object_number| object_map.get(&(object_number as u32)))
4832 .map(|object| parse_named_refs(&lossy(&object.body)))
4833 .unwrap_or_default()
4834}
4835
4836fn font_decoder(font: &PdfObject, object_map: &HashMap<u32, Arc<PdfObject>>) -> FontDecoder {
4837 let font_body = lossy(&font.body);
4838 let encoding = font_encoding_differences(&font_body, object_map);
4839 let widths = font_widths(&font_body, &encoding);
4840 let (bold, italic) = font_style(&font_body, object_map);
4841 let (ascent, descent) = font_vertical_metrics(&font_body, object_map);
4842 let Some(to_unicode_ref) = parse_refs_after_key(&font_body, "/ToUnicode")
4843 .into_iter()
4844 .next()
4845 else {
4846 return FontDecoder {
4847 cmap: HashMap::new(),
4848 encoding,
4849 widths,
4850 max_code_len: 1,
4851 bold,
4852 italic,
4853 ascent,
4854 descent,
4855 };
4856 };
4857 let Some(to_unicode) = object_map.get(&(to_unicode_ref as u32)) else {
4858 return FontDecoder {
4859 cmap: HashMap::new(),
4860 encoding,
4861 widths,
4862 max_code_len: 1,
4863 bold,
4864 italic,
4865 ascent,
4866 descent,
4867 };
4868 };
4869 let Ok(Some(cmap_stream)) = decode_stream_object(to_unicode.as_ref()) else {
4870 return FontDecoder {
4871 cmap: HashMap::new(),
4872 encoding,
4873 widths,
4874 max_code_len: 1,
4875 bold,
4876 italic,
4877 ascent,
4878 descent,
4879 };
4880 };
4881
4882 let mut decoder = parse_to_unicode_cmap(&lossy(&cmap_stream));
4883 decoder.encoding = encoding;
4884 decoder.widths = if widths.is_empty() {
4885 cid_char_widths(&decoder.cmap, &font_cid_widths(&font_body, object_map))
4886 } else {
4887 widths
4888 };
4889 decoder.bold = bold;
4890 decoder.italic = italic;
4891 decoder.ascent = ascent;
4892 decoder.descent = descent;
4893 decoder
4894}
4895
4896fn font_vertical_metrics(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> (f32, f32) {
4900 let mut ascent = 0.75;
4901 let mut descent = -0.25;
4902 if let Some(descriptor_ref) = parse_direct_ref_after_key(font_body, "/FontDescriptor") {
4903 if let Some(object) = object_map.get(&(descriptor_ref as u32)) {
4904 let body = lossy(&object.body);
4905 if let Some(value) = parse_number_after(&body, "/Ascent") {
4906 if value != 0.0 {
4907 ascent = value / 1000.0;
4908 }
4909 }
4910 if let Some(value) = parse_number_after(&body, "/Descent") {
4911 if value != 0.0 {
4912 descent = value / 1000.0;
4913 }
4914 }
4915 }
4916 }
4917 (ascent, descent)
4918}
4919
4920fn font_style(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> (bool, bool) {
4924 let mut bold = false;
4925 let mut italic = false;
4926 if let Some(name) = parse_name_after(font_body, "/BaseFont") {
4927 let bare = name.rsplit('+').next().unwrap_or(name.as_str()).to_ascii_lowercase();
4928 bold |= ["bold", "black", "heavy", "semibold", "demibold", "-bd", "demi"]
4929 .iter()
4930 .any(|needle| bare.contains(needle));
4931 italic |= ["italic", "oblique", "-it"]
4932 .iter()
4933 .any(|needle| bare.contains(needle));
4934 }
4935 if let Some(descriptor_ref) = parse_direct_ref_after_key(font_body, "/FontDescriptor") {
4936 if let Some(object) = object_map.get(&(descriptor_ref as u32)) {
4937 let body = lossy(&object.body);
4938 if let Some(flags) = parse_number_after(&body, "/Flags") {
4939 let flags = flags as i64;
4940 italic |= flags & 64 != 0;
4941 bold |= flags & 262_144 != 0;
4942 }
4943 if let Some(angle) = parse_number_after(&body, "/ItalicAngle") {
4944 italic |= angle.abs() > f32::EPSILON;
4945 }
4946 }
4947 }
4948 (bold, italic)
4949}
4950
/// Reads the name object (`/Something`) that follows the first occurrence of
/// `key`, returning it without the leading slash.
///
/// The name ends at whitespace or at one of `/ [ ] < > ( )`. Returns `None`
/// when the key is missing, the value is not a name, or the name is empty.
fn parse_name_after(text: &str, key: &str) -> Option<String> {
    let value_start = text.find(key)? + key.len();
    let name_body = text[value_start..].trim_start().strip_prefix('/')?;
    let end = name_body
        .find(|character: char| {
            character.is_whitespace()
                || matches!(character, '/' | '[' | ']' | '<' | '>' | '(' | ')')
        })
        .unwrap_or(name_body.len());
    if end == 0 {
        None
    } else {
        Some(name_body[..end].to_owned())
    }
}
4967
4968fn font_widths(font_body: &str, encoding: &HashMap<u8, String>) -> HashMap<char, f32> {
4969 let Some(first_char) = parse_number_after(font_body, "/FirstChar").map(|value| value as u8)
4970 else {
4971 return HashMap::new();
4972 };
4973 let Some(widths) = parse_number_array_after(font_body, "/Widths") else {
4974 return HashMap::new();
4975 };
4976
4977 widths
4978 .into_iter()
4979 .enumerate()
4980 .filter_map(|(index, width)| {
4981 let code = first_char.wrapping_add(index as u8);
4982 let text = encoding
4983 .get(&code)
4984 .cloned()
4985 .unwrap_or_else(|| (code as char).to_string());
4986 let mut chars = text.chars();
4987 let character = chars.next()?;
4988 chars.next().is_none().then_some((character, width))
4989 })
4990 .collect()
4991}
4992
4993fn font_cid_widths(font_body: &str, object_map: &HashMap<u32, Arc<PdfObject>>) -> HashMap<u32, f32> {
5002 let mut widths = HashMap::new();
5003 if parse_name_after(font_body, "/Subtype").as_deref() != Some("Type0") {
5004 return widths;
5005 }
5006 let Some(descendant) = parse_refs_after_key(font_body, "/DescendantFonts")
5007 .into_iter()
5008 .next()
5009 else {
5010 return widths;
5011 };
5012 let Some(cidfont) = object_map.get(&(descendant as u32)) else {
5013 return widths;
5014 };
5015 let body = lossy(&cidfont.body);
5016 let Some((open, close)) = find_w_array(&body) else {
5017 return widths;
5018 };
5019 let mut parser = ContentParser::new(&body.as_bytes()[open..=close]);
5020 let Some(ContentToken::Operand(Operand::Array(items))) = parser.next_operand_or_operator() else {
5021 return widths;
5022 };
5023
5024 let mut index = 0;
5025 while index < items.len() {
5026 match (&items[index], items.get(index + 1)) {
5027 (Operand::Number(first), Some(Operand::Array(list))) => {
5028 let base = *first as i64;
5029 for (offset, width) in list.iter().enumerate() {
5030 if let Operand::Number(width) = width {
5031 let cid = base + offset as i64;
5032 if cid >= 0 {
5033 widths.insert(cid as u32, *width);
5034 }
5035 }
5036 }
5037 index += 2;
5038 }
5039 (Operand::Number(first), Some(Operand::Number(last))) => {
5040 if let Some(Operand::Number(width)) = items.get(index + 2) {
5041 let (lo, hi) = (*first as i64, *last as i64);
5042 if lo >= 0 && hi >= lo && hi - lo < 70_000 {
5043 for cid in lo..=hi {
5044 widths.insert(cid as u32, *width);
5045 }
5046 }
5047 index += 3;
5048 } else {
5049 index += 1;
5050 }
5051 }
5052 _ => index += 1,
5053 }
5054 }
5055 widths
5056}
5057
5058fn find_w_array(body: &str) -> Option<(usize, usize)> {
5062 let bytes = body.as_bytes();
5063 let mut search = 0;
5064 while let Some(rel) = body[search..].find("/W") {
5065 let key_end = search + rel + 2;
5066 if matches!(bytes.get(key_end), Some(byte) if is_ws(*byte) || *byte == b'[') {
5067 let mut pos = key_end;
5068 while pos < bytes.len() && is_ws(bytes[pos]) {
5069 pos += 1;
5070 }
5071 if bytes.get(pos) == Some(&b'[') {
5072 if let Some(close) = matching_array_close(body, pos) {
5073 return Some((pos, close));
5074 }
5075 }
5076 }
5077 search = key_end;
5078 }
5079 None
5080}
5081
/// Converts a `CID -> width` table into a `character -> width` table by way of
/// the CMap.
///
/// Each CMap code of 1 to 4 bytes is read as a big-endian CID. Codes that map
/// to anything other than exactly one character, or whose CID has no width,
/// contribute nothing.
fn cid_char_widths(
    cmap: &HashMap<Vec<u8>, String>,
    cid_widths: &HashMap<u32, f32>,
) -> HashMap<char, f32> {
    if cid_widths.is_empty() {
        return HashMap::new();
    }
    cmap.iter()
        .filter(|(code, _)| (1..=4).contains(&code.len()))
        .filter_map(|(code, text)| {
            let mut chars = text.chars();
            let character = chars.next()?;
            if chars.next().is_some() {
                return None;
            }
            let cid = code
                .iter()
                .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));
            cid_widths.get(&cid).map(|&width| (character, width))
        })
        .collect()
}
5109
5110fn font_encoding_differences(
5111 font_body: &str,
5112 object_map: &HashMap<u32, Arc<PdfObject>>,
5113) -> HashMap<u8, String> {
5114 if let Some(encoding_ref) = parse_direct_ref_after_key(font_body, "/Encoding") {
5115 if let Some(object) = object_map.get(&(encoding_ref as u32)) {
5116 let differences = parse_encoding_differences(&lossy(&object.body));
5117 if !differences.is_empty() {
5118 return differences;
5119 }
5120 }
5121 }
5122 parse_encoding_differences(font_body)
5123}
5124
5125fn parse_encoding_differences(text: &str) -> HashMap<u8, String> {
5126 let Some(start) = text.find("/Differences") else {
5127 return HashMap::new();
5128 };
5129 let rest = &text[start + "/Differences".len()..];
5130 let Some(open) = rest.find('[') else {
5131 return HashMap::new();
5132 };
5133 let Some(close) = matching_array_close(rest, open) else {
5134 return HashMap::new();
5135 };
5136 let mut parser = ContentParser::new(rest[open..=close].as_bytes());
5137 let Some(ContentToken::Operand(Operand::Array(items))) = parser.next_operand_or_operator()
5138 else {
5139 return HashMap::new();
5140 };
5141
5142 let mut differences = HashMap::new();
5143 let mut code: Option<u16> = None;
5144 for item in items {
5145 match item {
5146 Operand::Number(value) if value >= 0.0 => {
5147 code = Some(value as u16);
5148 }
5149 Operand::Name(name) => {
5150 let Some(current_code) = code else {
5151 continue;
5152 };
5153 if current_code <= u16::from(u8::MAX) {
5154 if let Some(text) = glyph_name_to_text(&name) {
5155 differences.insert(current_code as u8, text);
5156 }
5157 }
5158 code = current_code.checked_add(1);
5159 }
5160 _ => {}
5161 }
5162 }
5163 differences
5164}
5165
/// Scans `text` from byte offset `open` and returns the offset of the `]` that
/// balances the first `[` encountered.
///
/// Returns `None` when a `]` appears before any `[`, or when the brackets
/// never balance before the end of the text.
fn matching_array_close(text: &str, open: usize) -> Option<usize> {
    let mut depth = 0usize;
    for (offset, &byte) in text.as_bytes().iter().enumerate().skip(open) {
        if byte == b'[' {
            depth += 1;
        } else if byte == b']' {
            if depth == 0 {
                return None;
            }
            depth -= 1;
            if depth == 0 {
                return Some(offset);
            }
        }
    }
    None
}
5182
/// Parses the text of a `/ToUnicode` CMap into a [`FontDecoder`] whose `cmap`
/// maps source code bytes to decoded text.
///
/// The parser is line oriented: `beginbfchar` / `endbfchar` and
/// `beginbfrange` / `endbfrange` lines switch sections, and every other line
/// is interpreted according to the section it falls in. All decoder fields
/// other than `cmap` and `max_code_len` are set to fixed placeholder values
/// here.
fn parse_to_unicode_cmap(text: &str) -> FontDecoder {
    let mut cmap = HashMap::new();
    let mut in_bfchar = false;
    let mut in_bfrange = false;
    // Accumulates a bfrange entry whose destination array spans several lines.
    let mut bfrange_array_entry = String::new();
    // Count of `[` not yet closed in the entry being accumulated.
    let mut bfrange_array_depth = 0i32;

    for line in text.lines() {
        let trimmed = line.trim();
        // Section markers. The `begin...` lines are matched by suffix (so a
        // leading count such as `2 beginbfchar` is accepted); the `end...`
        // lines must match exactly.
        match trimmed {
            value if value.ends_with("beginbfchar") => {
                in_bfchar = true;
                continue;
            }
            "endbfchar" => {
                in_bfchar = false;
                continue;
            }
            value if value.ends_with("beginbfrange") => {
                in_bfrange = true;
                continue;
            }
            "endbfrange" => {
                in_bfrange = false;
                // Drop any array entry that was still open.
                bfrange_array_entry.clear();
                bfrange_array_depth = 0;
                continue;
            }
            _ => {}
        }

        if in_bfrange {
            if bfrange_array_depth > 0 {
                // Continuation line of a multi-line array entry.
                bfrange_array_entry.push(' ');
                bfrange_array_entry.push_str(trimmed);
                bfrange_array_depth += bracket_delta(trimmed);
                if bfrange_array_depth <= 0 {
                    // Brackets balanced: the joined entry is complete.
                    add_bfrange_entry(&mut cmap, &bfrange_array_entry);
                    bfrange_array_entry.clear();
                    bfrange_array_depth = 0;
                }
                continue;
            }

            let depth = bracket_delta(trimmed);
            if depth > 0 {
                // The line opens an array it does not close: start accumulating.
                bfrange_array_entry.clear();
                bfrange_array_entry.push_str(trimmed);
                bfrange_array_depth = depth;
                continue;
            }

            // Complete single-line range entry.
            add_bfrange_entry(&mut cmap, trimmed);
            continue;
        }

        // Inside a bfchar section the first two hex strings on a line are the
        // source code and its destination; further strings are ignored.
        let hexes = hex_strings_in_line(trimmed);
        if in_bfchar && hexes.len() >= 2 {
            cmap.insert(
                hexes[0].clone(),
                cmap_text_for_mapping(&hexes[0], &hexes[1]),
            );
        }
    }

    // Longest source code seen, in bytes; 1 when the map is empty.
    let max_code_len = cmap.keys().map(Vec::len).max().unwrap_or(1);
    FontDecoder {
        cmap,
        encoding: HashMap::new(),
        widths: HashMap::new(),
        max_code_len,
        bold: false,
        italic: false,
        ascent: 0.75,
        descent: -0.25,
    }
}
5260
/// Returns the number of `[` minus the number of `]` in `text`.
fn bracket_delta(text: &str) -> i32 {
    let mut balance = 0;
    for character in text.chars() {
        match character {
            '[' => balance += 1,
            ']' => balance -= 1,
            _ => {}
        }
    }
    balance
}
5268
5269fn add_bfrange_entry(cmap: &mut HashMap<Vec<u8>, String>, line: &str) {
5270 let hexes = hex_strings_in_line(line);
5271 if hexes.len() < 3 {
5272 return;
5273 }
5274 if line.contains('[') {
5275 add_bfrange_array(cmap, &hexes);
5276 } else {
5277 add_bfrange(cmap, &hexes);
5278 }
5279}
5280
5281fn add_bfrange(cmap: &mut HashMap<Vec<u8>, String>, hexes: &[Vec<u8>]) {
5282 let Some(start) = hex_to_u32(&hexes[0]) else {
5283 return;
5284 };
5285 let Some(end) = hex_to_u32(&hexes[1]) else {
5286 return;
5287 };
5288 let Some(destination) = hex_to_u32(&hexes[2]) else {
5289 return;
5290 };
5291 let source_len = hexes[0].len();
5292
5293 for offset in 0..=(end.saturating_sub(start)).min(512) {
5294 let source = start + offset;
5295 let destination = destination + offset;
5296 cmap.insert(
5297 number_to_be_bytes(source, source_len),
5298 cmap_text_for_codes(source, destination),
5299 );
5300 }
5301}
5302
5303fn add_bfrange_array(cmap: &mut HashMap<Vec<u8>, String>, hexes: &[Vec<u8>]) {
5304 let Some(start) = hex_to_u32(&hexes[0]) else {
5305 return;
5306 };
5307 let Some(end) = hex_to_u32(&hexes[1]) else {
5308 return;
5309 };
5310 let source_len = hexes[0].len();
5311 let range_len = end.saturating_sub(start).saturating_add(1) as usize;
5312
5313 for (offset, destination) in hexes.iter().skip(2).take(range_len.min(512)).enumerate() {
5314 let source = start + offset as u32;
5315 let source_bytes = number_to_be_bytes(source, source_len);
5316 cmap.insert(
5317 source_bytes.clone(),
5318 cmap_text_for_mapping(&source_bytes, destination),
5319 );
5320 }
5321}
5322
5323fn cmap_text_for_mapping(source: &[u8], destination: &[u8]) -> String {
5324 if destination.len() > 2 {
5325 return utf16be_hex_to_string(destination);
5326 }
5327 let Some(source_code) = hex_to_u32(source) else {
5328 return utf16be_hex_to_string(destination);
5329 };
5330 let Some(destination_code) = hex_to_u32(destination) else {
5331 return utf16be_hex_to_string(destination);
5332 };
5333 cmap_text_for_codes(source_code, destination_code)
5334}
5335
5336fn cmap_text_for_codes(source: u32, destination: u32) -> String {
5337 if is_private_use_text_code(destination) {
5338 if let Some(character) = private_use_source_ascii(source) {
5339 return character.to_string();
5340 }
5341 }
5342 char::from_u32(destination)
5343 .map(|character| character.to_string())
5344 .unwrap_or_default()
5345}
5346
/// Reports whether `code` lies in the range U+E000..=U+F8FF.
fn is_private_use_text_code(code: u32) -> bool {
    matches!(code, 0xe000..=0xf8ff)
}
5350
/// Maps a source code to the printable ASCII character 28 positions above it.
///
/// Returns `None` when `source + 28` falls outside `0x20..=0x7e`, including
/// source codes so large that the addition would overflow.
fn private_use_source_ascii(source: u32) -> Option<char> {
    let ascii = source.checked_add(28)?;
    if (0x20..=0x7e).contains(&ascii) {
        char::from_u32(ascii)
    } else {
        None
    }
}
5358
5359fn hex_strings_in_line(line: &str) -> Vec<Vec<u8>> {
5360 let bytes = line.as_bytes();
5361 let mut hexes = Vec::new();
5362 let mut pos = 0;
5363
5364 while pos < bytes.len() {
5365 if bytes[pos] == b'<' && bytes.get(pos + 1) != Some(&b'<') {
5366 let start = pos + 1;
5367 if let Some(end) = bytes[start..].iter().position(|byte| *byte == b'>') {
5368 hexes.push(decode_hex(&bytes[start..start + end]));
5369 pos = start + end + 1;
5370 continue;
5371 }
5372 }
5373 pos += 1;
5374 }
5375
5376 hexes
5377}
5378
/// Decodes bytes as UTF-16BE text.
///
/// With two or more bytes, each complete pair becomes one UTF-16 unit (an odd
/// trailing byte is dropped) and invalid sequences become U+FFFD. A single
/// byte is returned as the character with that code point; empty input gives
/// an empty string.
fn utf16be_hex_to_string(bytes: &[u8]) -> String {
    match bytes {
        [] => String::new(),
        [single] => char::from(*single).to_string(),
        _ => {
            let units: Vec<u16> = bytes
                .chunks_exact(2)
                .map(|pair| (u16::from(pair[0]) << 8) | u16::from(pair[1]))
                .collect();
            String::from_utf16_lossy(&units)
        }
    }
}
5390
/// Interprets up to four bytes as a big-endian unsigned integer.
///
/// Empty input yields `Some(0)`. Inputs longer than four bytes do not fit in a
/// `u32` and yield `None` instead of silently losing their leading bytes.
fn hex_to_u32(bytes: &[u8]) -> Option<u32> {
    if bytes.len() > 4 {
        return None;
    }
    Some(
        bytes
            .iter()
            .fold(0u32, |value, &byte| (value << 8) | u32::from(byte)),
    )
}
5398
/// Encodes `value` as exactly `len` big-endian bytes.
///
/// When `len` is below four the high-order bytes are dropped; when it is above
/// four the result is left-padded with zero bytes.
fn number_to_be_bytes(value: u32, len: usize) -> Vec<u8> {
    (0..len)
        .rev()
        .map(|index| {
            // Byte positions beyond the width of `u32` are zero padding; a
            // plain `>>` with such an amount would be an overflowing shift.
            let shifted = index
                .checked_mul(8)
                .and_then(|bits| u32::try_from(bits).ok())
                .and_then(|bits| value.checked_shr(bits))
                .unwrap_or(0);
            (shifted & 0xff) as u8
        })
        .collect()
}
5405
5406fn parse_named_refs(text: &str) -> HashMap<String, u32> {
5407 let mut refs = HashMap::new();
5408 let bytes = text.as_bytes();
5409 let mut pos = 0;
5410
5411 while pos < bytes.len() {
5412 if bytes[pos] != b'/' || bytes.get(pos + 1) == Some(&b'/') {
5413 pos += 1;
5414 continue;
5415 }
5416 pos += 1;
5417 let name_start = pos;
5418 while pos < bytes.len() && !is_delimiter_or_ws(bytes[pos]) {
5419 pos += 1;
5420 }
5421 let name = lossy(&bytes[name_start..pos]);
5422 while pos < bytes.len() && is_ws(bytes[pos]) {
5423 pos += 1;
5424 }
5425 let Some((object, after_object)) = parse_unsigned_at(bytes, pos) else {
5426 continue;
5427 };
5428 let Some(after_space) = skip_required_ws(bytes, after_object) else {
5429 pos += 1;
5430 continue;
5431 };
5432 let Some((_generation, after_generation)) = parse_unsigned_at(bytes, after_space) else {
5433 pos += 1;
5434 continue;
5435 };
5436 let Some(after_space) = skip_required_ws(bytes, after_generation) else {
5437 pos += 1;
5438 continue;
5439 };
5440 if bytes.get(after_space) == Some(&b'R') {
5441 refs.insert(name, object as u32);
5442 pos = after_space + 1;
5443 }
5444 }
5445
5446 refs
5447}
5448
5449fn parse_refs(text: &str) -> Vec<usize> {
5450 let mut refs = Vec::new();
5451 let bytes = text.as_bytes();
5452 let mut pos = 0;
5453
5454 while pos < bytes.len() {
5455 let Some((object, after_object)) = parse_unsigned_at(bytes, pos) else {
5456 pos += 1;
5457 continue;
5458 };
5459 let Some(after_space) = skip_required_ws(bytes, after_object) else {
5460 pos += 1;
5461 continue;
5462 };
5463 let Some((_generation, after_generation)) = parse_unsigned_at(bytes, after_space) else {
5464 pos += 1;
5465 continue;
5466 };
5467 let Some(after_space) = skip_required_ws(bytes, after_generation) else {
5468 pos += 1;
5469 continue;
5470 };
5471 if bytes.get(after_space) == Some(&b'R') {
5472 refs.push(object);
5473 pos = after_space + 1;
5474 } else {
5475 pos += 1;
5476 }
5477 }
5478
5479 refs
5480}
5481
/// Parses the numbers inside the first `[...]` that follows `key`.
///
/// Tokens are separated by whitespace, and tokens that are not valid numbers
/// are dropped. Returns `None` when the key, the `[` or the `]` is missing.
fn parse_number_array_after(text: &str, key: &str) -> Option<Vec<f32>> {
    let (_, after_key) = text.split_once(key)?;
    let (_, after_open) = after_key.split_once('[')?;
    let (inside, _) = after_open.split_once(']')?;
    Some(
        inside
            .split_whitespace()
            .filter_map(|token| token.parse().ok())
            .collect(),
    )
}
5494
5495fn parse_number_after(text: &str, key: &str) -> Option<f32> {
5496 let start = text.find(key)?;
5497 let bytes = text.as_bytes();
5498 let mut pos = start + key.len();
5499 while pos < bytes.len() && (is_ws(bytes[pos]) || matches!(bytes[pos], b'[' | b']')) {
5500 pos += 1;
5501 }
5502 let number_start = pos;
5503 while pos < bytes.len() && matches!(bytes[pos], b'+' | b'-' | b'.' | b'0'..=b'9') {
5504 pos += 1;
5505 }
5506 if pos == number_start {
5507 return None;
5508 }
5509 text[number_start..pos].parse().ok()
5510}
5511
5512fn first_text_operand(
5513 operands: &[Operand],
5514 state: &GraphicsState,
5515 fonts: &HashMap<String, Arc<FontDecoder>>,
5516) -> Option<String> {
5517 operands
5518 .first()
5519 .and_then(|operand| operand_text(operand, state, fonts))
5520}
5521
5522fn operand_text(
5523 operand: &Operand,
5524 state: &GraphicsState,
5525 fonts: &HashMap<String, Arc<FontDecoder>>,
5526) -> Option<String> {
5527 match operand {
5528 Operand::Literal(bytes) | Operand::Hex(bytes) => Some(decode_pdf_text(
5529 bytes,
5530 state
5531 .font_name
5532 .as_ref()
5533 .and_then(|font_name| fonts.get(font_name))
5534 .map(|font| font.as_ref()),
5535 )),
5536 _ => None,
5537 }
5538}
5539
5540fn text_from_array(
5541 items: &[Operand],
5542 state: &GraphicsState,
5543 fonts: &HashMap<String, Arc<FontDecoder>>,
5544) -> String {
5545 let space_width = space_advance_width(state, fonts).max(state.font_size * 0.04);
5553 let gap_threshold = space_width * SPACE_GAP_FRACTION;
5554 let mut text = String::new();
5555 for item in items {
5556 match item {
5557 Operand::Number(value) => {
5558 let gap = -value / 1000.0 * state.font_size * state.horizontal_scaling;
5559 if gap >= gap_threshold && !text.ends_with(' ') {
5560 text.push(' ');
5561 }
5562 }
5563 _ => {
5564 if let Some(part) = operand_text(item, state, fonts) {
5565 text.push_str(&part);
5566 }
5567 }
5568 }
5569 }
5570 text
5571}
5572
/// Fraction of the space advance width that a numeric adjustment inside a text
/// array must reach before `text_from_array` inserts a space.
const SPACE_GAP_FRACTION: f32 = 0.3;
5577
5578fn decode_pdf_text(bytes: &[u8], font: Option<&FontDecoder>) -> String {
5579 if let Some(font) = font {
5580 if !font.cmap.is_empty() {
5581 return decode_with_cmap(bytes, font);
5582 }
5583 if !font.encoding.is_empty() {
5584 return bytes.iter().map(|byte| font.decode_byte(*byte)).collect();
5585 }
5586 }
5587
5588 if bytes.starts_with(&[0xfe, 0xff]) {
5589 let utf16 = bytes[2..]
5590 .chunks_exact(2)
5591 .map(|chunk| u16::from_be_bytes([chunk[0], chunk[1]]))
5592 .collect::<Vec<_>>();
5593 String::from_utf16_lossy(&utf16)
5594 } else {
5595 bytes.iter().map(|byte| *byte as char).collect()
5596 }
5597}
5598
5599fn decode_with_cmap(bytes: &[u8], font: &FontDecoder) -> String {
5600 let mut output = String::new();
5601 let mut index = 0;
5602
5603 while index < bytes.len() {
5604 let max_len = font.max_code_len.min(bytes.len() - index).max(1);
5605 let mut matched = false;
5606 for len in (1..=max_len).rev() {
5607 if let Some(text) = font.cmap.get(&bytes[index..index + len]) {
5608 output.push_str(text);
5609 index += len;
5610 matched = true;
5611 break;
5612 }
5613 }
5614 if !matched {
5615 output.push_str(&font.decode_byte(bytes[index]));
5616 index += 1;
5617 }
5618 }
5619
5620 output
5621}
5622
5623fn glyph_name_to_text(name: &str) -> Option<String> {
5624 let text = match name {
5625 "space" => " ",
5626 "exclam" => "!",
5627 "quotedbl" => "\"",
5628 "numbersign" => "#",
5629 "dollar" => "$",
5630 "percent" => "%",
5631 "ampersand" => "&",
5632 "quotesingle" | "quoteright" | "quoteleft" => "'",
5633 "parenleft" | "parenleftbig" | "parenleftBig" | "parenleftbigg" | "parenleftBigg" => "(",
5634 "parenright" | "parenrightbig" | "parenrightBig" | "parenrightbigg" | "parenrightBigg" => {
5635 ")"
5636 }
5637 "asterisk" | "asteriskmath" => "*",
5638 "plus" => "+",
5639 "comma" => ",",
5640 "hyphen" => "-",
5641 "period" => ".",
5642 "slash" => "/",
5643 "zero" => "0",
5644 "one" => "1",
5645 "two" => "2",
5646 "three" => "3",
5647 "four" => "4",
5648 "five" => "5",
5649 "six" => "6",
5650 "seven" => "7",
5651 "eight" => "8",
5652 "nine" => "9",
5653 "colon" => ":",
5654 "semicolon" => ";",
5655 "less" => "<",
5656 "equal" => "=",
5657 "greater" => ">",
5658 "question" => "?",
5659 "at" => "@",
5660 "bracketleft" => "[",
5661 "backslash" => "\\",
5662 "bracketright" => "]",
5663 "circumflex" | "hatwide" | "hatwider" | "hatwidest" => "^",
5664 "underscore" => "_",
5665 "braceleft" | "braceleftBig" | "braceleftBigg" | "bracelefttp" | "braceleftbt"
5666 | "braceleftmid" => "{",
5667 "bar" | "vextendsingle" | "braceex" => "|",
5668 "braceright" | "bracerightBig" => "}",
5669 "tilde" | "tildewide" => "~",
5670 "ff" => "ff",
5671 "fi" => "fi",
5672 "fl" => "fl",
5673 "ffi" => "ffi",
5674 "ffl" => "ffl",
5675 "Gamma" => "Γ",
5676 "Theta" => "Θ",
5677 "Lambda" => "Λ",
5678 "Pi" => "Π",
5679 "Sigma" => "Σ",
5680 "Phi" => "Φ",
5681 "Omega" => "Ω",
5682 "alpha" => "α",
5683 "beta" => "β",
5684 "gamma" => "γ",
5685 "delta" => "δ",
5686 "epsilon" => "ε",
5687 "zeta" => "ζ",
5688 "lambda" => "λ",
5689 "mu" => "μ",
5690 "pi" | "pi1" => "π",
5691 "rho" => "ρ",
5692 "sigma" => "σ",
5693 "tau" => "τ",
5694 "phi" => "φ",
5695 "chi" => "χ",
5696 "omega" => "ω",
5697 "partialdiff" => "∂",
5698 "minus" => "−",
5699 "periodcentered" => "·",
5700 "multiply" => "×",
5701 "plusminus" => "±",
5702 "circlemultiply" => "⊗",
5703 "openbullet" | "bullet" => "•",
5704 "lessequal" => "≤",
5705 "greaterequal" => "≥",
5706 "similar" => "∼",
5707 "arrowright" => "→",
5708 "mapsto" => "↦",
5709 "prime" => "′",
5710 "infinity" => "∞",
5711 "element" => "∈",
5712 "universal" => "∀",
5713 "union" | "uniontext" | "uniondisplay" => "∪",
5714 "intersection" | "intersectiontext" | "intersectiondisplay" => "∩",
5715 "reflexsubset" => "⊇",
5716 "reflexsuperset" => "⊆",
5717 "summationtext" | "summationdisplay" => "∑",
5718 "productdisplay" => "∏",
5719 "integraldisplay" => "∫",
5720 "circleplusdisplay" => "⊕",
5721 "unionsqdisplay" => "⊔",
5722 "negationslash" => "̸",
5723 _ if name.chars().count() == 1 => name,
5724 _ => return unicode_glyph_name_to_text(name),
5725 };
5726 Some(text.to_owned())
5727}
5728
/// Decodes glyph names that spell out code points in hexadecimal.
///
/// `uni` followed by one or more groups of four hex digits yields one
/// character per group. `u` followed by four to six hex digits yields a single
/// character. Returns `None` for any other shape or for values that are not
/// valid characters.
fn unicode_glyph_name_to_text(name: &str) -> Option<String> {
    if let Some(hex) = name.strip_prefix("uni") {
        if hex.len() >= 4 && hex.len() % 4 == 0 {
            // Collecting into `Option<String>` stops at the first bad group.
            return hex
                .as_bytes()
                .chunks(4)
                .map(|quad| {
                    let digits = std::str::from_utf8(quad).ok()?;
                    char::from_u32(u32::from_str_radix(digits, 16).ok()?)
                })
                .collect();
        }
    }

    let hex = name.strip_prefix('u')?;
    if !(4..=6).contains(&hex.len()) {
        return None;
    }
    let code = u32::from_str_radix(hex, 16).ok()?;
    char::from_u32(code).map(String::from)
}
5749
5750fn numbers(operands: &[Operand], count: usize) -> Option<Vec<f32>> {
5751 if operands.len() < count {
5752 return None;
5753 }
5754 let values = operands[operands.len() - count..]
5755 .iter()
5756 .map(|operand| match operand {
5757 Operand::Number(value) => Some(*value),
5758 _ => None,
5759 })
5760 .collect::<Option<Vec<_>>>()?;
5761 Some(values)
5762}
5763
5764fn block_text(block: &Block) -> String {
5765 match block {
5766 Block::Text(text) => text.text.clone(),
5767 Block::Table(table) => {
5768 let mut rows = Vec::new();
5769 if !table.headers.is_empty() {
5770 rows.push(table.headers.join(" "));
5771 }
5772 rows.extend(table.rows.iter().map(|row| row.join(" ")));
5773 rows.join("\n")
5774 }
5775 Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
5776 }
5777}
5778
/// Classifies a line as a heading level or a paragraph from the ratio of its
/// font size to the page's body size.
///
/// Ratios of at least 1.5, 1.3 and 1.12 give `heading_1`, `heading_2` and
/// `heading_3`. Empty lines, lines of 200 characters or more, and
/// non-positive sizes are always `paragraph`.
fn classify_text_line(text: &str, line_size: f32, body_size: f32) -> String {
    let length = text.chars().count();
    let role = if length == 0 || length >= 200 || body_size <= 0.0 || line_size <= 0.0 {
        "paragraph"
    } else {
        match line_size / body_size {
            ratio if ratio >= 1.5 => "heading_1",
            ratio if ratio >= 1.3 => "heading_2",
            ratio if ratio >= 1.12 => "heading_3",
            _ => "paragraph",
        }
    };
    role.to_owned()
}
5801
5802fn line_dominant_size(line: &TextLine) -> f32 {
5804 let mut best_chars = 0usize;
5805 let mut best_size = 0.0f32;
5806 for run in &line.runs {
5807 if run.size <= 0.0 {
5808 continue;
5809 }
5810 let chars = run.text.chars().count();
5811 if chars >= best_chars {
5812 best_chars = chars;
5813 best_size = run.size;
5814 }
5815 }
5816 best_size
5817}
5818
5819fn page_body_size(lines: &[TextLine]) -> f32 {
5822 let mut weights: Vec<(u32, usize)> = Vec::new();
5823 for line in lines {
5824 for run in &line.runs {
5825 if run.size <= 0.0 {
5826 continue;
5827 }
5828 let bucket = (run.size * 2.0).round() as u32;
5829 let chars = run.text.chars().count();
5830 if let Some(entry) = weights.iter_mut().find(|(value, _)| *value == bucket) {
5831 entry.1 += chars;
5832 } else {
5833 weights.push((bucket, chars));
5834 }
5835 }
5836 }
5837 weights
5838 .into_iter()
5839 .max_by_key(|(_, chars)| *chars)
5840 .map(|(bucket, _)| bucket as f32 / 2.0)
5841 .unwrap_or(0.0)
5842}
5843
/// Returns the distinct source object ids of all runs in `line`, by delegating
/// to `source_ids_for_runs`.
fn source_ids_for_line(line: &TextLine) -> Vec<String> {
    source_ids_for_runs(&line.runs)
}
5847
5848fn source_ids_for_runs(runs: &[TextRun]) -> Vec<String> {
5849 let mut ids = Vec::new();
5850 for run in runs {
5851 for id in &run.source_object_ids {
5852 if !ids.contains(id) {
5853 ids.push(id.clone());
5854 }
5855 }
5856 }
5857 ids
5858}
5859
5860fn anchor(page_number: usize, bbox: Option<BBox>, pdf_object_ids: Vec<String>) -> SourceAnchor {
5861 SourceAnchor {
5862 page_number,
5863 pdf_object_ids,
5864 bbox,
5865 extraction_method: "native_pdf".to_owned(),
5866 }
5867}
5868
5869fn warning(code: &str, severity: &str, message: &str, page_number: Option<usize>) -> Warning {
5870 Warning {
5871 code: code.to_owned(),
5872 severity: severity.to_owned(),
5873 message: message.to_owned(),
5874 source_anchor: page_number.map(|page_number| anchor(page_number, None, Vec::new())),
5875 }
5876}
5877
#[cfg(test)]
mod tests {
    use super::*;

    /// A small number sitting between two prose runs, set in a smaller size
    /// (8 vs 12) and two units higher, must be emitted as ordinary
    /// space-separated text.
    #[test]
    fn text_from_line_runs_does_not_treat_slash_prose_page_number_as_script() {
        let line = TextLine {
            runs: vec![
                test_run("Art Cutting / Bates Technical College", 72.0, 720.0, 12.0),
                // The run under test: smaller size and a raised `y`.
                test_run("24", 300.0, 722.0, 8.0),
                test_run("Core Competencies", 315.0, 720.0, 12.0),
            ],
            bbox: BBox {
                x: 72.0,
                y: 720.0,
                width: 360.0,
                height: 12.0,
            },
            baseline_y: 720.0,
        };

        assert_eq!(
            text_from_line_runs(&line),
            "Art Cutting / Bates Technical College 24 Core Competencies"
        );
    }

    /// Builds a plain (non-bold, non-italic, font-less) run at `(x, y)`.
    ///
    /// The width is estimated as `0.4 * size` per byte of text and the space
    /// width as a quarter of the size; the baseline equals `y`.
    fn test_run(text: &str, x: f32, y: f32, size: f32) -> TextRun {
        TextRun {
            text: text.to_owned(),
            bbox: BBox {
                x,
                y,
                width: text.len() as f32 * size * 0.4,
                height: size,
            },
            baseline_y: y,
            font: None,
            size,
            space_width: size * 0.25,
            bold: false,
            italic: false,
            source_object_ids: Vec::new(),
        }
    }
}
5924
5925fn union_boxes(boxes: impl IntoIterator<Item = BBox>) -> Option<BBox> {
5926 let mut iter = boxes.into_iter();
5927 let first = iter.next()?;
5928 let mut min_x = first.x;
5929 let mut min_y = first.y;
5930 let mut max_x = first.x + first.width;
5931 let mut max_y = first.y + first.height;
5932
5933 for bbox in iter {
5934 min_x = min_x.min(bbox.x);
5935 min_y = min_y.min(bbox.y);
5936 max_x = max_x.max(bbox.x + bbox.width);
5937 max_y = max_y.max(bbox.y + bbox.height);
5938 }
5939
5940 Some(BBox {
5941 x: min_x,
5942 y: min_y,
5943 width: max_x - min_x,
5944 height: max_y - min_y,
5945 })
5946}
5947
5948fn extract_info_string(objects: &[PdfObject], key: &str) -> Option<String> {
5949 let needle = format!("/{key}");
5950 objects.iter().find_map(|object| {
5951 let body = lossy(&object.body);
5952 if !(body.contains("/Producer") || body.contains("/Creator") || body.contains("/Author")) {
5953 return None;
5954 }
5955 let start = body.find(&needle)?;
5956 let rest = &object.body[start + needle.len()..];
5957 let open = rest.iter().position(|byte| *byte == b'(')?;
5958 let mut parser = ContentParser::new(&rest[open..]);
5959 match parser.next_operand_or_operator()? {
5960 ContentToken::Operand(Operand::Literal(bytes)) => Some(decode_pdf_text(&bytes, None)),
5961 _ => None,
5962 }
5963 })
5964}
5965
/// Extracts the version text from a `%PDF-x.y` header.
///
/// Only the first line of `bytes` (up to the first CR or LF) is inspected.
/// Returns whatever follows the `%PDF-` prefix on that line, or `None` if the
/// line is not valid UTF-8 or does not start with `%PDF-`.
fn pdf_version(bytes: &[u8]) -> Option<String> {
    let line_end = bytes
        .iter()
        .position(|&byte| byte == b'\n' || byte == b'\r')
        .unwrap_or(bytes.len());
    let header = std::str::from_utf8(&bytes[..line_end]).ok()?;
    let version = header.strip_prefix("%PDF-")?;
    Some(version.to_owned())
}
5971
5972fn decode_hex(bytes: &[u8]) -> Vec<u8> {
5973 let hex = bytes
5974 .iter()
5975 .copied()
5976 .filter(|byte| !is_ws(*byte))
5977 .collect::<Vec<_>>();
5978 let mut output = Vec::new();
5979 let mut index = 0;
5980 while index < hex.len() {
5981 let high = hex_value(hex[index]).unwrap_or(0);
5982 let low = hex
5983 .get(index + 1)
5984 .and_then(|byte| hex_value(*byte))
5985 .unwrap_or(0);
5986 output.push((high << 4) | low);
5987 index += 2;
5988 }
5989 output
5990}
5991
/// Returns the numeric value (0..=15) of an ASCII hexadecimal digit.
///
/// Accepts `0-9`, `a-f` and `A-F`; any other byte yields `None`.
fn hex_value(byte: u8) -> Option<u8> {
    // `to_digit(16)` only ever returns values below 16, so the narrowing cast is lossless.
    char::from(byte).to_digit(16).map(|digit| digit as u8)
}
6000
/// Parses a run of ASCII decimal digits starting at `pos`.
///
/// On success returns `(value, end)`, where `end` is the offset just past the
/// last digit. Returns `None` when there is no digit at `pos` (including when
/// `pos` is at or beyond the end of `bytes`) or when the number does not fit in
/// a `usize`.
fn parse_unsigned_at(bytes: &[u8], mut pos: usize) -> Option<(usize, usize)> {
    let tail = bytes.get(pos..)?;
    let digit_count = tail
        .iter()
        .take_while(|byte| byte.is_ascii_digit())
        .count();
    if digit_count == 0 {
        return None;
    }

    let digits = std::str::from_utf8(&tail[..digit_count]).ok()?;
    let value = digits.parse::<usize>().ok()?;
    pos += digit_count;
    Some((value, pos))
}
6015
6016fn skip_required_ws(bytes: &[u8], mut pos: usize) -> Option<usize> {
6017 if pos >= bytes.len() || !is_ws(bytes[pos]) {
6018 return None;
6019 }
6020 while pos < bytes.len() && is_ws(bytes[pos]) {
6021 pos += 1;
6022 }
6023 Some(pos)
6024}
6025
/// Reports whether `pos` is at the start of a line in `bytes`.
///
/// That is the case when `pos` is 0 or the byte immediately before it is CR or
/// LF. Despite the name, other whitespace before `pos` does not count.
///
/// An offset beyond `bytes.len()` has no preceding byte and yields `false`
/// rather than panicking.
fn is_ws_or_line_start(bytes: &[u8], pos: usize) -> bool {
    match pos.checked_sub(1) {
        // Offset 0: the very beginning of the buffer.
        None => true,
        // Checked lookup so an out-of-range offset cannot index past the end.
        Some(prev) => matches!(bytes.get(prev), Some(b'\n' | b'\r')),
    }
}
6029
6030fn is_delimiter_or_ws(byte: u8) -> bool {
6031 is_ws(byte) || matches!(byte, b'[' | b']' | b'<' | b'>' | b'/' | b'(' | b')')
6032}
6033
/// Reports whether `byte` is one of the six whitespace bytes recognised by the
/// parser: NUL, horizontal tab, line feed, form feed, carriage return, space.
fn is_ws(byte: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0c, b'\r', b' '];
    WHITESPACE.contains(&byte)
}
6037
/// Returns the offset of the first occurrence of `needle` in `haystack`.
///
/// An empty `needle` matches at offset 0 (mirroring `str::find("")`). Returns
/// `None` when `needle` does not occur, including when it is longer than
/// `haystack`.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics on a window size of zero, so answer that case directly.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
6043
6044fn contains_name(bytes: &[u8], name: &[u8]) -> bool {
6045 find_subslice(bytes, name).is_some()
6046}
6047
/// Converts `bytes` to an owned `String`, replacing every invalid UTF-8
/// sequence with U+FFFD.
fn lossy(bytes: &[u8]) -> String {
    match String::from_utf8_lossy(bytes) {
        // Input was already valid UTF-8: copy it out.
        std::borrow::Cow::Borrowed(valid) => valid.to_owned(),
        // Input needed repairs: the repaired string is already owned.
        std::borrow::Cow::Owned(repaired) => repaired,
    }
}
6051
6052#[allow(dead_code)]
6053fn sha256_hex(bytes: &[u8]) -> String {
6054 let digest = Sha256::digest(bytes);
6055 digest.iter().map(|byte| format!("{byte:02x}")).collect()
6056}