1use crate::error::{ExtractError, Result};
7use lopdf::content::{Content, Operation};
8use lopdf::{Document, Object, ObjectId};
9use std::collections::HashMap;
10use std::sync::OnceLock;
11
/// Fallback glyph advance, expressed as a fraction of the font size.
///
/// The width helpers multiply this by 1000 to obtain a glyph-space width that
/// stands in for any glyph the font supplies no width for.
const APPROX_CHAR_WIDTH: f64 = 0.5;
14
/// Records how the advance widths behind a set of character bounds were obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WidthSource {
    /// The font exposed width metrics.
    Metric,
    /// No font metrics were available, so widths were approximated.
    /// This is the default variant.
    #[default]
    Estimate,
}
24
/// A run of extracted text together with its position and font attributes.
#[derive(Debug, Clone, Default)]
pub struct TextBlock {
    /// Text content of the block.
    pub text: String,
    /// Page the block belongs to (NOTE(review): confirm whether 0- or 1-based).
    pub page: u32,
    /// Bounding box; presumably `[x0, y0, x1, y1]` like `char_bounds` — TODO confirm.
    pub bbox: [f64; 4],
    /// Font name; presumably the resource name used in the content stream — TODO confirm.
    pub font_name: String,
    /// Font size for the block.
    pub font_size: f64,
    /// Optional replacement text; presumably from an `/ActualText` entry — TODO confirm.
    pub actual_text: Option<String>,

    /// Base font name, when known.
    pub base_font: Option<String>,
    /// Whether the font is considered bold.
    pub is_bold: bool,
    /// Whether the font is considered italic.
    pub is_italic: bool,
    /// Text colour; presumably RGBA, one byte per channel — TODO confirm.
    pub color: Option<[u8; 4]>,

    /// How the widths behind `char_bounds` were obtained.
    pub width_source: WidthSource,
    /// Per-character boxes; presumably one `[x0, y0, x1, y1]` per glyph — TODO confirm.
    pub char_bounds: Vec<[f64; 4]>,
}
72
/// A single character together with the page and box it occupies.
#[derive(Debug, Clone)]
pub struct PositionedChar {
    /// The character itself.
    pub ch: char,
    /// Page the character belongs to (NOTE(review): confirm whether 0- or 1-based).
    pub page: u32,
    /// Bounding box; presumably `[x0, y0, x1, y1]` — TODO confirm.
    pub bbox: [f64; 4],
}
83
/// Snapshot of the graphics-state values kept on `TextState::gs_stack`.
///
/// NOTE(review): presumably pushed/popped by the `q`/`Q` operators — confirm
/// against the content-stream interpreter.
#[derive(Debug, Clone)]
struct GraphicsState {
    /// Six-element matrix; presumably the current transformation matrix.
    ctm: [f64; 6],
    /// Fill colour; presumably RGBA, one byte per channel — TODO confirm.
    fill_color: Option<[u8; 4]>,
}
91
/// Mutable interpreter state tracked while walking a content stream.
///
/// NOTE(review): the short field names appear to follow the PDF text-state
/// parameter names (Tm, Tlm, Tc, Tw, Th, Tl, Ts); confirm against the
/// operator handlers, which are outside this view.
#[derive(Debug, Clone)]
struct TextState {
    /// Six-element matrix; presumably the text matrix.
    tm: [f64; 6],
    /// Six-element matrix; presumably the text line matrix.
    tlm: [f64; 6],
    /// Name of the currently selected font (empty by default).
    font_name: String,
    /// Current font size (12.0 by default).
    font_size: f64,
    /// Presumably character spacing (0.0 by default).
    tc: f64,
    /// Presumably word spacing (0.0 by default).
    tw: f64,
    /// Presumably horizontal scaling in percent (100.0 by default).
    th: f64,
    /// Presumably text leading (0.0 by default).
    tl: f64,
    /// Presumably text rise (0.0 by default).
    ts: f64,
    /// Stack of saved graphics states.
    gs_stack: Vec<GraphicsState>,
    /// Six-element matrix; presumably the current transformation matrix.
    ctm: [f64; 6],
    /// Current fill colour (opaque black by default).
    fill_color: Option<[u8; 4]>,
}
124
125impl Default for TextState {
126 fn default() -> Self {
127 Self {
128 tm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
129 tlm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
130 font_name: String::new(),
131 font_size: 12.0,
132 tc: 0.0,
133 tw: 0.0,
134 th: 100.0,
135 tl: 0.0,
136 ts: 0.0,
137 gs_stack: Vec::new(),
138 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
139 fill_color: Some([0, 0, 0, 255]),
140 }
141 }
142}
143
/// Decoding and width information gathered for one font resource.
#[derive(Clone)]
struct FontInfo {
    /// `true` when the font's `/Subtype` is `Type0`; codes are then read two bytes at a time.
    is_cid: bool,
    /// Character code → Unicode text, parsed from a `/ToUnicode` CMap (may be empty).
    to_unicode: HashMap<u32, String>,
    /// Single-byte code → character, built from `/Encoding` (all `None` for CID fonts).
    encoding_map: [Option<char>; 256],
    /// Codes whose `/Differences` glyph name is `ct`.
    ct_codes: [bool; 256],
    /// `/BaseFont` name with any subset prefix stripped.
    base_font: Option<String>,
    /// Bold, as derived from the font descriptor or the font name.
    is_bold: bool,
    /// Italic, as derived from the font descriptor or the font name.
    is_italic: bool,

    /// Per-code widths for simple fonts, taken from `/FirstChar` + `/Widths`.
    simple_widths: Box<[Option<f32>; 256]>,
    /// Default width for CID fonts (`/DW`); 1000.0 when absent or for simple fonts.
    cid_default_width: f32,
    /// Per-CID widths parsed from the descendant font's `/W` array.
    cid_widths: HashMap<u32, f32>,
}
184
/// Removes a font-subset tag (`ABCDEF+`) from the front of a font name.
///
/// A subset tag is exactly six uppercase ASCII letters followed by `+`.
/// Names whose text before the first `+` does not have that shape are
/// returned unchanged, so a genuine name such as `Symbol+Extra` is not
/// truncated.
fn strip_subset_prefix(name: &str) -> &str {
    match name.split_once('+') {
        Some((tag, rest)) if tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase()) => rest,
        _ => name,
    }
}
192
/// Mask tested against a font descriptor's `/Flags` value to detect italic
/// (value 64; presumably flag bit 7 in the PDF numbering — confirm against the spec).
const FONT_FLAG_ITALIC: u32 = 1 << 6;
/// Mask tested against a font descriptor's `/Flags` value to detect forced bold
/// (value 262144; presumably flag bit 19 in the PDF numbering — confirm against the spec).
const FONT_FLAG_FORCE_BOLD: u32 = 1 << 18;
198
199fn derive_font_style(doc: &Document, font: &lopdf::Dictionary) -> (Option<String>, bool, bool) {
203 let base_font_raw = font.get(b"BaseFont").ok().and_then(|o| match o {
204 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
205 _ => None,
206 });
207 let base_font = base_font_raw
208 .as_deref()
209 .map(|s| strip_subset_prefix(s).to_string());
210
211 let (name_bold, name_italic) = base_font
212 .as_deref()
213 .map(name_style_hints)
214 .unwrap_or((false, false));
215
216 let descriptor = resolve_dict(doc, font, b"FontDescriptor");
217 let (desc_bold, desc_italic) = descriptor
218 .as_ref()
219 .map(|d| {
220 let weight_bold = d
221 .get(b"FontWeight")
222 .ok()
223 .and_then(|o| match o {
224 Object::Integer(i) => Some(*i as u32),
225 Object::Real(r) => Some(*r as u32),
226 _ => None,
227 })
228 .is_some_and(|w| w >= 700);
229 let flags = d
230 .get(b"Flags")
231 .ok()
232 .and_then(|o| match o {
233 Object::Integer(i) => Some(*i as u32),
234 Object::Real(r) => Some(*r as u32),
235 _ => None,
236 })
237 .unwrap_or(0);
238 let flag_italic = (flags & FONT_FLAG_ITALIC) != 0;
239 let flag_force_bold = (flags & FONT_FLAG_FORCE_BOLD) != 0;
240 (weight_bold || flag_force_bold, flag_italic)
241 })
242 .unwrap_or((false, false));
243
244 (
245 base_font,
246 desc_bold || name_bold,
247 desc_italic || name_italic,
248 )
249}
250
/// Maps a colour component in `[0.0, 1.0]` to `0..=255`, clamping values
/// outside that interval and rounding to the nearest integer.
fn clamp_unit_to_u8(v: f64) -> u8 {
    let unit = v.clamp(0.0, 1.0);
    let scaled = (unit * 255.0).round();
    scaled as u8
}
255
256fn cmyk_to_rgba(c: f64, m: f64, y: f64, k: f64) -> [u8; 4] {
260 let c = c.clamp(0.0, 1.0);
261 let m = m.clamp(0.0, 1.0);
262 let y = y.clamp(0.0, 1.0);
263 let k = k.clamp(0.0, 1.0);
264 let r = (1.0 - c) * (1.0 - k);
265 let g = (1.0 - m) * (1.0 - k);
266 let b = (1.0 - y) * (1.0 - k);
267 [
268 clamp_unit_to_u8(r),
269 clamp_unit_to_u8(g),
270 clamp_unit_to_u8(b),
271 255,
272 ]
273}
274
/// Guesses `(bold, italic)` from a font name by looking for well-known
/// style words, ignoring ASCII case.
fn name_style_hints(name: &str) -> (bool, bool) {
    const BOLD_WORDS: [&str; 5] = ["bold", "demi", "semibold", "heavy", "black"];
    const ITALIC_WORDS: [&str; 3] = ["italic", "oblique", "slant"];

    let folded = name.to_ascii_lowercase();
    let mentions_any = |words: &[&str]| words.iter().any(|word| folded.contains(*word));

    (mentions_any(&BOLD_WORDS), mentions_any(&ITALIC_WORDS))
}
287
288fn font_glyph_advance(fi: &FontInfo, char_code: u32) -> Option<f32> {
294 if fi.is_cid {
295 fi.cid_widths
296 .get(&char_code)
297 .copied()
298 .or(if fi.cid_default_width > 0.0 {
299 Some(fi.cid_default_width)
300 } else {
301 None
302 })
303 } else {
304 fi.simple_widths.get(char_code as usize).and_then(|w| *w)
305 }
306}
307
308fn fi_has_metrics(fi: &FontInfo) -> bool {
311 if fi.is_cid {
312 true
314 } else {
315 fi.simple_widths.iter().any(|w| w.is_some())
316 }
317}
318
319fn compute_glyph_advances(
327 bytes: &[u8],
328 fi: &FontInfo,
329 font_size: f64,
330 tc: f64,
331 tw: f64,
332 th: f64,
333) -> (Vec<f64>, WidthSource) {
334 let has_metrics = fi_has_metrics(fi);
335 let width_source = if has_metrics {
336 WidthSource::Metric
337 } else {
338 WidthSource::Estimate
339 };
340 let approx = APPROX_CHAR_WIDTH * 1000.0; let mut advances = Vec::new();
343
344 if fi.is_cid {
345 let mut i = 0;
346 while i + 1 < bytes.len() {
347 let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
348 i += 2;
349 let glyph_w = font_glyph_advance(fi, code).unwrap_or(approx as f32);
350 let adv = glyph_w as f64 / 1000.0 * font_size * (th / 100.0) + tc;
351 advances.push(adv);
352 }
353 } else {
354 for &b in bytes {
355 let glyph_w = if has_metrics {
356 font_glyph_advance(fi, b as u32).unwrap_or(approx as f32)
357 } else {
358 approx as f32
359 };
360 let extra_word = if b == 0x20 { tw } else { 0.0 };
361 let adv = glyph_w as f64 / 1000.0 * font_size * (th / 100.0) + tc + extra_word;
362 advances.push(adv);
363 }
364 }
365
366 (advances, width_source)
367}
368
/// Text-state values needed to turn glyph widths into advances; bundled so
/// the bounds helpers take a single argument.
struct GlyphCtx {
    /// Font size; also used as the height of every character box.
    font_size: f64,
    /// Extra advance added after every glyph (presumably character spacing).
    tc: f64,
    /// Extra advance added after byte `0x20` in simple fonts (presumably word spacing).
    tw: f64,
    /// Horizontal scaling in percent; widths are multiplied by `th / 100`.
    th: f64,
}
376
377fn char_bounds_from_bytes(
379 bytes: &[u8],
380 fi: &FontInfo,
381 x: f64,
382 y: f64,
383 ctx: &GlyphCtx,
384) -> (Vec<[f64; 4]>, WidthSource, f64) {
385 let (advances, ws) = compute_glyph_advances(bytes, fi, ctx.font_size, ctx.tc, ctx.tw, ctx.th);
386 let mut cx = x;
387 let mut bounds = Vec::with_capacity(advances.len());
388 for adv in &advances {
389 bounds.push([cx, y, cx + adv, y + ctx.font_size]);
390 cx += adv;
391 }
392 (bounds, ws, cx - x)
393}
394
395fn char_bounds_from_tj_array(
401 arr: &[Object],
402 fi: &FontInfo,
403 x_start: f64,
404 y: f64,
405 ctx: &GlyphCtx,
406) -> (Vec<[f64; 4]>, WidthSource, f64) {
407 let font_size = ctx.font_size;
408 let tc = ctx.tc;
409 let tw = ctx.tw;
410 let th = ctx.th;
411 let has_metrics = fi_has_metrics(fi);
412 let width_source = if has_metrics {
413 WidthSource::Metric
414 } else {
415 WidthSource::Estimate
416 };
417 let approx = APPROX_CHAR_WIDTH * 1000.0_f64;
418
419 let mut bounds = Vec::new();
420 let mut cx = x_start;
421
422 for item in arr {
423 match item {
424 Object::String(bytes, _) => {
425 if fi.is_cid {
426 let mut i = 0;
427 while i + 1 < bytes.len() {
428 let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
429 i += 2;
430 let glyph_w = font_glyph_advance(fi, code).unwrap_or(approx as f32) as f64;
431 let adv = glyph_w / 1000.0 * font_size * (th / 100.0) + tc;
432 bounds.push([cx, y, cx + adv, y + font_size]);
433 cx += adv;
434 }
435 } else {
436 for &b in bytes.iter() {
437 let glyph_w = if has_metrics {
438 font_glyph_advance(fi, b as u32).unwrap_or(approx as f32) as f64
439 } else {
440 approx
441 };
442 let extra_word = if b == 0x20 { tw } else { 0.0 };
443 let adv = glyph_w / 1000.0 * font_size * (th / 100.0) + tc + extra_word;
444 bounds.push([cx, y, cx + adv, y + font_size]);
445 cx += adv;
446 }
447 }
448 }
449 _ => {
450 if let Some(adj) = as_number(item) {
451 cx -= adj / 1000.0 * font_size * (th / 100.0);
453 }
454 }
455 }
456 }
457
458 (bounds, width_source, cx)
459}
460
461fn build_font_map(doc: &Document, page_id: ObjectId) -> HashMap<String, FontInfo> {
463 let mut map = HashMap::new();
464
465 let resources = get_page_resources(doc, page_id);
467 let font_dict = match resources.and_then(|res| match res.get(b"Font").ok()? {
468 Object::Dictionary(d) => Some(d.clone()),
469 Object::Reference(r) => match doc.get_object(*r).ok()? {
470 Object::Dictionary(d) => Some(d.clone()),
471 _ => None,
472 },
473 _ => None,
474 }) {
475 Some(d) => d,
476 None => return map,
477 };
478
479 for (name_bytes, value) in font_dict.iter() {
480 let font_name = String::from_utf8_lossy(name_bytes).to_string();
481
482 let font = match value {
484 Object::Reference(r) => match doc.get_object(*r).ok() {
485 Some(Object::Dictionary(d)) => d.clone(),
486 _ => continue,
487 },
488 Object::Dictionary(d) => d.clone(),
489 _ => continue,
490 };
491
492 let subtype = font
493 .get(b"Subtype")
494 .ok()
495 .and_then(|o| match o {
496 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
497 _ => None,
498 })
499 .unwrap_or_default();
500
501 let is_cid = subtype == "Type0";
502
503 let to_unicode = parse_to_unicode_from_font(doc, &font);
505
506 let to_unicode = if to_unicode.is_empty() && is_cid {
508 if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
509 descendants
510 .iter()
511 .find_map(|d| {
512 let desc_dict = match d {
513 Object::Reference(r) => match doc.get_object(*r).ok()? {
514 Object::Dictionary(d) => d,
515 _ => return None,
516 },
517 Object::Dictionary(d) => d,
518 _ => return None,
519 };
520 let tu = parse_to_unicode_from_font(doc, desc_dict);
521 if tu.is_empty() {
522 None
523 } else {
524 Some(tu)
525 }
526 })
527 .unwrap_or_default()
528 } else {
529 HashMap::new()
530 }
531 } else {
532 to_unicode
533 };
534
535 let (encoding_map, ct_codes) = if !is_cid {
537 build_encoding_map(doc, &font)
538 } else {
539 ([None; 256], [false; 256])
540 };
541
542 let (base_font, is_bold, is_italic) = derive_font_style(doc, &font);
543
544 let (simple_widths, cid_default_width, cid_widths) = parse_font_widths(doc, &font, is_cid);
545
546 map.insert(
547 font_name,
548 FontInfo {
549 is_cid,
550 to_unicode,
551 encoding_map,
552 ct_codes,
553 base_font,
554 is_bold,
555 is_italic,
556 simple_widths,
557 cid_default_width,
558 cid_widths,
559 },
560 );
561 }
562
563 map
564}
565
566fn parse_font_widths(
573 doc: &Document,
574 font: &lopdf::Dictionary,
575 is_cid: bool,
576) -> (Box<[Option<f32>; 256]>, f32, HashMap<u32, f32>) {
577 if is_cid {
578 let (dw, cid_map) = parse_cid_widths(doc, font);
579 return (Box::new([None; 256]), dw, cid_map);
580 }
581
582 let first_char = match font.get(b"FirstChar").ok() {
584 Some(Object::Integer(n)) => *n as usize,
585 _ => return (Box::new([None; 256]), 1000.0, HashMap::new()),
586 };
587
588 let widths_arr = match font.get(b"Widths").ok() {
589 Some(Object::Array(a)) => a.clone(),
590 Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
591 Some(Object::Array(a)) => a.clone(),
592 _ => return (Box::new([None; 256]), 1000.0, HashMap::new()),
593 },
594 _ => return (Box::new([None; 256]), 1000.0, HashMap::new()),
595 };
596
597 let mut simple_widths = Box::new([None::<f32>; 256]);
598 for (i, obj) in widths_arr.iter().enumerate() {
599 let code = first_char + i;
600 if code > 255 {
601 break;
602 }
603 let w = match obj {
604 Object::Integer(n) => *n as f32,
605 Object::Real(n) => *n,
606 _ => continue,
607 };
608 simple_widths[code] = Some(w);
609 }
610
611 (simple_widths, 1000.0, HashMap::new())
612}
613
614fn parse_cid_widths(doc: &Document, font: &lopdf::Dictionary) -> (f32, HashMap<u32, f32>) {
617 let descendants = match font.get(b"DescendantFonts").ok() {
618 Some(Object::Array(a)) => a.clone(),
619 Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
620 Some(Object::Array(a)) => a.clone(),
621 _ => return (1000.0, HashMap::new()),
622 },
623 _ => return (1000.0, HashMap::new()),
624 };
625
626 let desc_dict = match descendants.first() {
627 Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
628 Some(Object::Dictionary(d)) => d.clone(),
629 _ => return (1000.0, HashMap::new()),
630 },
631 Some(Object::Dictionary(d)) => d.clone(),
632 _ => return (1000.0, HashMap::new()),
633 };
634
635 let dw = match desc_dict.get(b"DW").ok() {
636 Some(Object::Integer(n)) => *n as f32,
637 Some(Object::Real(n)) => *n,
638 _ => 1000.0,
639 };
640
641 let w_arr = match desc_dict.get(b"W").ok() {
642 Some(Object::Array(a)) => a.clone(),
643 Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
644 Some(Object::Array(a)) => a.clone(),
645 _ => return (dw, HashMap::new()),
646 },
647 _ => return (dw, HashMap::new()),
648 };
649
650 let mut map = HashMap::new();
652 let mut idx = 0;
653 while idx < w_arr.len() {
654 let cid_start = match &w_arr[idx] {
655 Object::Integer(n) => *n as u32,
656 _ => {
657 idx += 1;
658 continue;
659 }
660 };
661 idx += 1;
662 if idx >= w_arr.len() {
663 break;
664 }
665 match &w_arr[idx] {
666 Object::Array(widths) => {
667 for (i, wobj) in widths.iter().enumerate() {
669 let w = match wobj {
670 Object::Integer(n) => *n as f32,
671 Object::Real(n) => *n,
672 _ => continue,
673 };
674 map.insert(cid_start + i as u32, w);
675 }
676 idx += 1;
677 }
678 Object::Integer(c2) => {
679 let cid_end = *c2 as u32;
681 idx += 1;
682 if idx >= w_arr.len() {
683 break;
684 }
685 let w = match &w_arr[idx] {
686 Object::Integer(n) => *n as f32,
687 Object::Real(n) => *n,
688 _ => {
689 idx += 1;
690 continue;
691 }
692 };
693 for cid in cid_start..=cid_end {
694 map.insert(cid, w);
695 }
696 idx += 1;
697 }
698 _ => {
699 idx += 1;
700 }
701 }
702 }
703
704 (dw, map)
705}
706
707fn parse_to_unicode_from_font(doc: &Document, font: &lopdf::Dictionary) -> HashMap<u32, String> {
709 let tu_obj = match font.get(b"ToUnicode").ok() {
710 Some(Object::Reference(r)) => doc.get_object(*r).ok(),
711 Some(obj) => Some(obj),
712 None => return HashMap::new(),
713 };
714
715 let stream_bytes = match tu_obj {
716 Some(Object::Stream(ref s)) => s
717 .decompressed_content()
718 .ok()
719 .unwrap_or_else(|| s.content.clone()),
720 _ => return HashMap::new(),
721 };
722
723 parse_to_unicode_cmap(&stream_bytes)
724}
725
726fn parse_to_unicode_cmap(data: &[u8]) -> HashMap<u32, String> {
730 let text = String::from_utf8_lossy(data);
731 let mut map = HashMap::new();
732
733 for section in text.split("beginbfchar") {
735 let section = match section.split("endbfchar").next() {
736 Some(s) => s,
737 None => continue,
738 };
739 let tokens = extract_hex_tokens(section);
740 for pair in tokens.chunks(2) {
741 if pair.len() == 2 {
742 let code = parse_hex_u32(&pair[0]);
743 let unicode = hex_to_unicode_string(&pair[1]);
744 map.insert(code, unicode);
745 }
746 }
747 }
748
749 for section in text.split("beginbfrange") {
751 let section = match section.split("endbfrange").next() {
752 Some(s) => s,
753 None => continue,
754 };
755
756 let mut chars = section.chars().peekable();
758 let mut tokens: Vec<String> = Vec::new();
759 let mut arrays: Vec<Vec<String>> = Vec::new();
760 let mut in_array = false;
761 let mut current_array: Vec<String> = Vec::new();
762
763 while let Some(&ch) = chars.peek() {
764 if ch == '<' {
765 chars.next();
766 let hex: String = chars
767 .by_ref()
768 .take_while(|&c| c != '>')
769 .filter(|c| !c.is_whitespace())
770 .collect();
771 if in_array {
772 current_array.push(hex);
773 } else {
774 tokens.push(hex);
775 }
776 } else if ch == '[' {
777 chars.next();
778 in_array = true;
779 current_array = Vec::new();
780 } else if ch == ']' {
781 chars.next();
782 in_array = false;
783 arrays.push(std::mem::take(&mut current_array));
784 tokens.push(String::new()); } else {
786 chars.next();
787 }
788 }
789
790 let mut array_idx = 0;
792 let mut i = 0;
793 while i + 2 < tokens.len() {
794 let lo = parse_hex_u32(&tokens[i]);
795 let hi = parse_hex_u32(&tokens[i + 1]);
796
797 if tokens[i + 2].is_empty() {
798 if array_idx < arrays.len() {
800 let arr = &arrays[array_idx];
801 for (offset, dst) in arr.iter().enumerate() {
802 let code = lo + offset as u32;
803 if code <= hi {
804 map.insert(code, hex_to_unicode_string(dst));
805 }
806 }
807 array_idx += 1;
808 }
809 } else {
810 let dst_start = parse_hex_u32(&tokens[i + 2]);
812 let dst_len = tokens[i + 2].len();
813 for code in lo..=hi {
814 let dst_val = dst_start + (code - lo);
815 let s = if dst_len <= 4 {
816 char::from_u32(dst_val)
818 .map(|c| c.to_string())
819 .unwrap_or_default()
820 } else {
821 let hex = format!("{:0>width$X}", dst_val, width = dst_len);
823 hex_to_unicode_string(&hex)
824 };
825 map.insert(code, s);
826 }
827 }
828 i += 3;
829 }
830 }
831
832 map
833}
834
/// Collects the contents of every `<...>` hex string in `text`, with
/// whitespace removed.
///
/// A new `<` restarts the token being collected; a `>` without a preceding
/// `<` is ignored, and an unterminated token is dropped.
fn extract_hex_tokens(text: &str) -> Vec<String> {
    let mut found = Vec::new();
    let mut open: Option<String> = None;

    for ch in text.chars() {
        match ch {
            '<' => open = Some(String::new()),
            '>' => {
                if let Some(token) = open.take() {
                    found.push(token);
                }
            }
            c if !c.is_whitespace() => {
                if let Some(token) = open.as_mut() {
                    token.push(c);
                }
            }
            _ => {}
        }
    }

    found
}
853
/// Parses a hexadecimal string as `u32`, returning 0 when the text is
/// empty, not hexadecimal, or too large.
fn parse_hex_u32(hex: &str) -> u32 {
    match u32::from_str_radix(hex, 16) {
        Ok(value) => value,
        Err(_) => 0,
    }
}
858
/// Decodes a hex string from a CMap into Unicode text.
///
/// Digits are read two at a time (a trailing single digit is read on its
/// own) and pairs that are not valid hex are skipped. An even number of
/// decoded bytes is interpreted as UTF-16BE, a single byte as the code
/// point of that value, and anything else yields an empty string.
///
/// The input is walked as bytes rather than sliced as `str`, so non-ASCII
/// text (for example the replacement character left by lossy decoding)
/// cannot cause a char-boundary panic.
fn hex_to_unicode_string(hex: &str) -> String {
    let bytes: Vec<u8> = hex
        .as_bytes()
        .chunks(2)
        .filter_map(|pair| std::str::from_utf8(pair).ok())
        .filter_map(|digits| u8::from_str_radix(digits, 16).ok())
        .collect();

    match bytes.len() {
        0 => String::new(),
        1 => char::from(bytes[0]).to_string(),
        n if n % 2 == 0 => {
            let units: Vec<u16> = bytes
                .chunks_exact(2)
                .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                .collect();
            String::from_utf16_lossy(&units)
        }
        _ => String::new(),
    }
}
881
882fn get_page_resources(doc: &Document, page_id: ObjectId) -> Option<lopdf::Dictionary> {
884 let page = match doc.get_object(page_id).ok()? {
885 Object::Dictionary(d) => d.clone(),
886 _ => return None,
887 };
888
889 if let Some(res) = resolve_dict(doc, &page, b"Resources") {
891 return Some(res);
892 }
893
894 let mut current = page;
896 for _ in 0..20 {
897 let parent_ref = match current.get(b"Parent").ok()? {
898 Object::Reference(r) => *r,
899 _ => break,
900 };
901 let parent = match doc.get_object(parent_ref).ok()? {
902 Object::Dictionary(d) => d.clone(),
903 _ => break,
904 };
905 if let Some(res) = resolve_dict(doc, &parent, b"Resources") {
906 return Some(res);
907 }
908 current = parent;
909 }
910
911 None
912}
913
914fn resolve_dict(doc: &Document, dict: &lopdf::Dictionary, key: &[u8]) -> Option<lopdf::Dictionary> {
916 match dict.get(key).ok()? {
917 Object::Dictionary(d) => Some(d.clone()),
918 Object::Reference(r) => match doc.get_object(*r).ok()? {
919 Object::Dictionary(d) => Some(d.clone()),
920 _ => None,
921 },
922 _ => None,
923 }
924}
925
926fn build_encoding_map(
933 doc: &Document,
934 font: &lopdf::Dictionary,
935) -> ([Option<char>; 256], [bool; 256]) {
936 let mut table = [None::<char>; 256];
937 let mut ct_codes = [false; 256];
938
939 let encoding = match font.get(b"Encoding").ok() {
940 Some(obj) => obj,
941 None => return (table, ct_codes),
942 };
943
944 match encoding {
945 Object::Name(name) => {
946 let name_str = String::from_utf8_lossy(name);
948 apply_base_encoding(&mut table, &name_str);
949 }
950 Object::Reference(r) => match doc.get_object(*r) {
951 Ok(Object::Dictionary(enc_dict)) => {
952 parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
953 }
954 Ok(Object::Name(name)) => {
955 let name_str = String::from_utf8_lossy(name);
956 apply_base_encoding(&mut table, &name_str);
957 }
958 _ => {}
959 },
960 Object::Dictionary(enc_dict) => {
961 parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
962 }
963 _ => {}
964 }
965
966 (table, ct_codes)
967}
968
969fn parse_encoding_dict(
971 doc: &Document,
972 enc_dict: &lopdf::Dictionary,
973 table: &mut [Option<char>; 256],
974 ct_codes: &mut [bool; 256],
975) {
976 if let Ok(Object::Name(base)) = enc_dict.get(b"BaseEncoding") {
978 let base_str = String::from_utf8_lossy(base);
979 apply_base_encoding(table, &base_str);
980 }
981
982 let diffs = match enc_dict.get(b"Differences").ok() {
984 Some(Object::Array(arr)) => arr.clone(),
985 Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
986 Some(Object::Array(arr)) => arr.clone(),
987 _ => return,
988 },
989 _ => return,
990 };
991
992 let mut code: Option<u32> = None;
993 for item in &diffs {
994 match item {
995 Object::Integer(n) => {
996 code = Some(*n as u32);
997 }
998 Object::Name(name) => {
999 if let Some(c) = code {
1000 if c < 256 {
1001 let glyph = String::from_utf8_lossy(name);
1002 apply_glyph_name(&glyph, c as usize, table, ct_codes);
1003 }
1004 code = Some(c + 1);
1005 }
1006 }
1007 Object::Reference(r) => {
1008 if let Ok(Object::Name(name)) = doc.get_object(*r) {
1010 if let Some(c) = code {
1011 if c < 256 {
1012 let glyph = String::from_utf8_lossy(name);
1013 apply_glyph_name(&glyph, c as usize, table, ct_codes);
1014 }
1015 code = Some(c + 1);
1016 }
1017 }
1018 }
1019 _ => {}
1020 }
1021 }
1022}
1023
1024fn apply_glyph_name(
1030 glyph: &str,
1031 code: usize,
1032 table: &mut [Option<char>; 256],
1033 ct_codes: &mut [bool; 256],
1034) {
1035 if glyph == "ct" {
1036 table[code] = None;
1039 ct_codes[code] = true;
1040 return;
1041 }
1042 if let Some(ch) = glyph_name_to_unicode(glyph) {
1043 table[code] = Some(ch);
1044 ct_codes[code] = false;
1045 }
1046}
1047
1048fn apply_base_encoding(table: &mut [Option<char>; 256], name: &str) {
1050 let source = match name {
1051 "WinAnsiEncoding" => winansi_encoding(),
1052 "MacRomanEncoding" => mac_roman_encoding(),
1053 _ => return,
1054 };
1055 for (i, &ch) in source.iter().enumerate() {
1056 if ch != '\0' {
1057 table[i] = Some(ch);
1058 }
1059 }
1060}
1061
/// Returns the lazily built WinAnsi (Windows-1252 based) decoding table.
///
/// Index = byte value, entry = decoded character, `'\0'` = undefined.
/// Printable ASCII and `0xA0..=0xFF` map to the code point of the same
/// value, tab/LF/CR are kept, and `0x80..=0x9F` uses the CP1252 assignments.
fn winansi_encoding() -> &'static [char; 256] {
    /// CP1252 assignments for the 0x80..=0x9F block (unassigned codes omitted).
    const CP1252_HIGH: [(u8, char); 27] = [
        (0x80, '\u{20AC}'),
        (0x82, '\u{201A}'),
        (0x83, '\u{0192}'),
        (0x84, '\u{201E}'),
        (0x85, '\u{2026}'),
        (0x86, '\u{2020}'),
        (0x87, '\u{2021}'),
        (0x88, '\u{02C6}'),
        (0x89, '\u{2030}'),
        (0x8A, '\u{0160}'),
        (0x8B, '\u{2039}'),
        (0x8C, '\u{0152}'),
        (0x8E, '\u{017D}'),
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x98, '\u{02DC}'),
        (0x99, '\u{2122}'),
        (0x9A, '\u{0161}'),
        (0x9B, '\u{203A}'),
        (0x9C, '\u{0153}'),
        (0x9E, '\u{017E}'),
        (0x9F, '\u{0178}'),
    ];

    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut table = ['\0'; 256];

        // Printable ASCII and the Latin-1 upper half map to themselves.
        for code in (0x20u8..=0x7E).chain(0xA0..=0xFF) {
            table[usize::from(code)] = char::from(code);
        }
        for control in [b'\t', b'\n', b'\r'] {
            table[usize::from(control)] = char::from(control);
        }
        for (code, ch) in CP1252_HIGH {
            table[usize::from(code)] = ch;
        }

        table
    })
}
1115
/// Returns the lazily built MacRoman decoding table.
///
/// Index = byte value, entry = decoded character, `'\0'` = undefined.
/// Printable ASCII maps to itself, tab/LF/CR are kept, and `0x80..=0xFF`
/// uses the Mac OS Roman assignments listed below.
fn mac_roman_encoding() -> &'static [char; 256] {
    /// Characters for bytes 0x80..=0xFF, in byte order.
    const MAC_UPPER: [char; 128] = [
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}',
        '\u{00E1}', '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}',
        '\u{00E9}', '\u{00E8}', '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}',
        '\u{00EF}', '\u{00F1}', '\u{00F3}', '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}',
        '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}', '\u{2020}', '\u{00B0}', '\u{00A2}',
        '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}', '\u{00AE}', '\u{00A9}',
        '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}', '\u{221E}',
        '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}',
        '\u{00F8}', '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}',
        '\u{2206}', '\u{00AB}', '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}',
        '\u{00D5}', '\u{0152}', '\u{0153}', '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}',
        '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}', '\u{00FF}', '\u{0178}', '\u{2044}',
        '\u{20AC}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}', '\u{2021}', '\u{00B7}',
        '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}', '\u{00CB}',
        '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}',
        '\u{02DC}', '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}',
        '\u{02DB}', '\u{02C7}',
    ];

    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut table = ['\0'; 256];

        for code in 0x20u8..=0x7E {
            table[usize::from(code)] = char::from(code);
        }
        for control in [b'\t', b'\n', b'\r'] {
            table[usize::from(control)] = char::from(control);
        }
        table[0x80..].copy_from_slice(&MAC_UPPER);

        table
    })
}
1156
1157fn glyph_name_to_unicode(name: &str) -> Option<char> {
1168 if name.starts_with("uni") && name.len() >= 7 {
1170 return u32::from_str_radix(&name[3..7], 16)
1171 .ok()
1172 .and_then(char::from_u32);
1173 }
1174 if name.starts_with('u') && name.len() >= 5 && name[1..].chars().all(|c| c.is_ascii_hexdigit())
1175 {
1176 return u32::from_str_radix(&name[1..], 16)
1177 .ok()
1178 .and_then(char::from_u32);
1179 }
1180
1181 if let Some(c) = agl_table().get(name).copied() {
1182 return Some(c);
1183 }
1184
1185 match name {
1186 "st" => Some('\u{FB06}'),
1187 "longst" => Some('\u{FB05}'),
1188 _ => None,
1189 }
1190}
1191
/// Returns the lazily built lookup table from glyph name to character.
///
/// The table covers ASCII, Latin-1 letters, common ligatures, punctuation,
/// currency and math symbols, plus a few alternative spellings.
fn agl_table() -> &'static HashMap<&'static str, char> {
    /// Glyph name / character pairs loaded into the map on first use.
    const ENTRIES: &[(&str, char)] = &[
        ("space", ' '),
        ("exclam", '!'),
        ("quotedbl", '"'),
        ("numbersign", '#'),
        ("dollar", '$'),
        ("percent", '%'),
        ("ampersand", '&'),
        ("quotesingle", '\''),
        ("parenleft", '('),
        ("parenright", ')'),
        ("asterisk", '*'),
        ("plus", '+'),
        ("comma", ','),
        ("hyphen", '-'),
        ("period", '.'),
        ("slash", '/'),
        ("zero", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
        ("colon", ':'),
        ("semicolon", ';'),
        ("less", '<'),
        ("equal", '='),
        ("greater", '>'),
        ("question", '?'),
        ("at", '@'),
        ("A", 'A'),
        ("B", 'B'),
        ("C", 'C'),
        ("D", 'D'),
        ("E", 'E'),
        ("F", 'F'),
        ("G", 'G'),
        ("H", 'H'),
        ("I", 'I'),
        ("J", 'J'),
        ("K", 'K'),
        ("L", 'L'),
        ("M", 'M'),
        ("N", 'N'),
        ("O", 'O'),
        ("P", 'P'),
        ("Q", 'Q'),
        ("R", 'R'),
        ("S", 'S'),
        ("T", 'T'),
        ("U", 'U'),
        ("V", 'V'),
        ("W", 'W'),
        ("X", 'X'),
        ("Y", 'Y'),
        ("Z", 'Z'),
        ("bracketleft", '['),
        ("backslash", '\\'),
        ("bracketright", ']'),
        ("asciicircum", '^'),
        ("underscore", '_'),
        ("grave", '`'),
        ("a", 'a'),
        ("b", 'b'),
        ("c", 'c'),
        ("d", 'd'),
        ("e", 'e'),
        ("f", 'f'),
        ("g", 'g'),
        ("h", 'h'),
        ("i", 'i'),
        ("j", 'j'),
        ("k", 'k'),
        ("l", 'l'),
        ("m", 'm'),
        ("n", 'n'),
        ("o", 'o'),
        ("p", 'p'),
        ("q", 'q'),
        ("r", 'r'),
        ("s", 's'),
        ("t", 't'),
        ("u", 'u'),
        ("v", 'v'),
        ("w", 'w'),
        ("x", 'x'),
        ("y", 'y'),
        ("z", 'z'),
        ("braceleft", '{'),
        ("bar", '|'),
        ("braceright", '}'),
        ("asciitilde", '~'),
        ("Agrave", '\u{00C0}'),
        ("Aacute", '\u{00C1}'),
        ("Acircumflex", '\u{00C2}'),
        ("Atilde", '\u{00C3}'),
        ("Adieresis", '\u{00C4}'),
        ("Aring", '\u{00C5}'),
        ("AE", '\u{00C6}'),
        ("Ccedilla", '\u{00C7}'),
        ("Egrave", '\u{00C8}'),
        ("Eacute", '\u{00C9}'),
        ("Ecircumflex", '\u{00CA}'),
        ("Edieresis", '\u{00CB}'),
        ("Igrave", '\u{00CC}'),
        ("Iacute", '\u{00CD}'),
        ("Icircumflex", '\u{00CE}'),
        ("Idieresis", '\u{00CF}'),
        ("Eth", '\u{00D0}'),
        ("Ntilde", '\u{00D1}'),
        ("Ograve", '\u{00D2}'),
        ("Oacute", '\u{00D3}'),
        ("Ocircumflex", '\u{00D4}'),
        ("Otilde", '\u{00D5}'),
        ("Odieresis", '\u{00D6}'),
        ("Ugrave", '\u{00D9}'),
        ("Uacute", '\u{00DA}'),
        ("Ucircumflex", '\u{00DB}'),
        ("Udieresis", '\u{00DC}'),
        ("Yacute", '\u{00DD}'),
        ("Thorn", '\u{00DE}'),
        ("germandbls", '\u{00DF}'),
        ("agrave", '\u{00E0}'),
        ("aacute", '\u{00E1}'),
        ("acircumflex", '\u{00E2}'),
        ("atilde", '\u{00E3}'),
        ("adieresis", '\u{00E4}'),
        ("aring", '\u{00E5}'),
        ("ae", '\u{00E6}'),
        ("ccedilla", '\u{00E7}'),
        ("egrave", '\u{00E8}'),
        ("eacute", '\u{00E9}'),
        ("ecircumflex", '\u{00EA}'),
        ("edieresis", '\u{00EB}'),
        ("igrave", '\u{00EC}'),
        ("iacute", '\u{00ED}'),
        ("icircumflex", '\u{00EE}'),
        ("idieresis", '\u{00EF}'),
        ("eth", '\u{00F0}'),
        ("ntilde", '\u{00F1}'),
        ("ograve", '\u{00F2}'),
        ("oacute", '\u{00F3}'),
        ("ocircumflex", '\u{00F4}'),
        ("otilde", '\u{00F5}'),
        ("odieresis", '\u{00F6}'),
        ("ugrave", '\u{00F9}'),
        ("uacute", '\u{00FA}'),
        ("ucircumflex", '\u{00FB}'),
        ("udieresis", '\u{00FC}'),
        ("yacute", '\u{00FD}'),
        ("thorn", '\u{00FE}'),
        ("ydieresis", '\u{00FF}'),
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("ff", '\u{FB00}'),
        ("ffi", '\u{FB03}'),
        ("ffl", '\u{FB04}'),
        ("endash", '\u{2013}'),
        ("emdash", '\u{2014}'),
        ("bullet", '\u{2022}'),
        ("ellipsis", '\u{2026}'),
        ("quoteleft", '\u{2018}'),
        ("quoteright", '\u{2019}'),
        ("quotedblleft", '\u{201C}'),
        ("quotedblright", '\u{201D}'),
        ("quotesinglebase", '\u{201A}'),
        ("quotesinglbase", '\u{201A}'),
        ("quotedblbase", '\u{201E}'),
        ("dagger", '\u{2020}'),
        ("daggerdbl", '\u{2021}'),
        ("perthousand", '\u{2030}'),
        ("guilsinglleft", '\u{2039}'),
        ("guilsinglright", '\u{203A}'),
        ("guillemotleft", '\u{00AB}'),
        ("guillemotright", '\u{00BB}'),
        ("trademark", '\u{2122}'),
        ("copyright", '\u{00A9}'),
        ("registered", '\u{00AE}'),
        ("degree", '\u{00B0}'),
        ("plusminus", '\u{00B1}'),
        ("multiply", '\u{00D7}'),
        ("divide", '\u{00F7}'),
        ("fraction", '\u{2044}'),
        ("Euro", '\u{20AC}'),
        ("sterling", '\u{00A3}'),
        ("yen", '\u{00A5}'),
        ("cent", '\u{00A2}'),
        ("currency", '\u{00A4}'),
        ("section", '\u{00A7}'),
        ("paragraph", '\u{00B6}'),
        ("brokenbar", '\u{00A6}'),
        ("ordfeminine", '\u{00AA}'),
        ("ordmasculine", '\u{00BA}'),
        ("exclamdown", '\u{00A1}'),
        ("questiondown", '\u{00BF}'),
        ("logicalnot", '\u{00AC}'),
        ("mu", '\u{00B5}'),
        ("macron", '\u{00AF}'),
        ("acute", '\u{00B4}'),
        ("cedilla", '\u{00B8}'),
        ("dieresis", '\u{00A8}'),
        ("circumflex", '\u{02C6}'),
        ("tilde", '\u{02DC}'),
        ("caron", '\u{02C7}'),
        ("ring", '\u{02DA}'),
        ("breve", '\u{02D8}'),
        ("dotaccent", '\u{02D9}'),
        ("hungarumlaut", '\u{02DD}'),
        ("ogonek", '\u{02DB}'),
        ("nbspace", '\u{00A0}'),
        ("nonbreakingspace", '\u{00A0}'),
        ("softhyphen", '\u{00AD}'),
        ("periodcentered", '\u{00B7}'),
        ("middot", '\u{00B7}'),
        ("florin", '\u{0192}'),
        ("OE", '\u{0152}'),
        ("oe", '\u{0153}'),
        ("Scaron", '\u{0160}'),
        ("scaron", '\u{0161}'),
        ("Zcaron", '\u{017D}'),
        ("zcaron", '\u{017E}'),
        ("Ydieresis", '\u{0178}'),
        ("Lslash", '\u{0141}'),
        ("lslash", '\u{0142}'),
        ("Oslash", '\u{00D8}'),
        ("oslash", '\u{00F8}'),
        ("dotlessi", '\u{0131}'),
        ("onesuperior", '\u{00B9}'),
        ("twosuperior", '\u{00B2}'),
        ("threesuperior", '\u{00B3}'),
        ("onequarter", '\u{00BC}'),
        ("onehalf", '\u{00BD}'),
        ("threequarters", '\u{00BE}'),
        ("minus", '\u{2212}'),
        ("notequal", '\u{2260}'),
        ("lessequal", '\u{2264}'),
        ("greaterequal", '\u{2265}'),
        ("infinity", '\u{221E}'),
        ("partialdiff", '\u{2202}'),
        ("summation", '\u{2211}'),
        ("product", '\u{220F}'),
        ("integral", '\u{222B}'),
        ("radical", '\u{221A}'),
        ("approxequal", '\u{2248}'),
        ("Delta", '\u{0394}'),
        ("lozenge", '\u{25CA}'),
        ("pi", '\u{03C0}'),
        ("Omega", '\u{03A9}'),
    ];

    static TABLE: OnceLock<HashMap<&'static str, char>> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut lookup = HashMap::with_capacity(ENTRIES.len());
        lookup.extend(ENTRIES.iter().copied());
        lookup
    })
}
1457
/// Placeholder character (U+E007) pushed into decoded text for byte codes that the
/// font's `ct_codes` table flags. It is only expanded to `"ct"` when the character's
/// ct-origin flag is also set, so the same code point arriving from any other source
/// is left untouched.
const CT_LIGATURE_MARKER: char = '\u{E007}';
1459
/// Text decoded from PDF string operands plus a per-character flag track.
///
/// `ct_origins` is kept parallel to `text.chars()`: entry `i` is `true` only when
/// character `i` was appended through [`DecodedPdfString::push_char`] with the
/// ct-origin flag set.
#[derive(Clone, Default)]
struct DecodedPdfString {
    text: String,
    ct_origins: Vec<bool>,
}

impl DecodedPdfString {
    /// Wraps already-decoded text; every character gets a `false` flag.
    fn from_text(text: String) -> Self {
        let char_total = text.chars().count();
        Self {
            ct_origins: vec![false; char_total],
            text,
        }
    }

    /// Appends one character together with its ct-origin flag.
    fn push_char(&mut self, ch: char, is_ct_origin: bool) {
        self.ct_origins.push(is_ct_origin);
        self.text.push(ch);
    }

    /// Appends a whole string; each of its characters is flagged `false`.
    fn push_str(&mut self, s: &str) {
        self.text.push_str(s);
        self.ct_origins
            .extend(std::iter::repeat(false).take(s.chars().count()));
    }

    /// Appends another decoded string, carrying its flags over unchanged.
    fn extend(&mut self, other: DecodedPdfString) {
        let DecodedPdfString { text, ct_origins } = other;
        self.text += &text;
        self.ct_origins.extend(ct_origins);
    }

    /// `true` when no text has been accumulated.
    fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Number of flag entries, i.e. one per character pushed so far.
    fn glyph_count(&self) -> usize {
        self.ct_origins.len()
    }

    /// Iterates `(character, ct-origin flag)` pairs in text order.
    fn iter(&self) -> impl Iterator<Item = (char, bool)> + '_ {
        self.ct_origins
            .iter()
            .zip(self.text.chars())
            .map(|(&flag, ch)| (ch, flag))
    }
}
1500
1501fn decode_pdf_string_with_font_marked(
1504 bytes: &[u8],
1505 font_info: Option<&FontInfo>,
1506) -> DecodedPdfString {
1507 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1509 let chars: Vec<u16> = bytes[2..]
1510 .chunks(2)
1511 .filter_map(|chunk| {
1512 if chunk.len() == 2 {
1513 Some(u16::from_be_bytes([chunk[0], chunk[1]]))
1514 } else {
1515 None
1516 }
1517 })
1518 .collect();
1519 return DecodedPdfString::from_text(String::from_utf16_lossy(&chars));
1520 }
1521
1522 if let Some(info) = font_info {
1523 if info.is_cid && !info.to_unicode.is_empty() {
1524 let mut result = DecodedPdfString::default();
1526 let mut i = 0;
1527 while i + 1 < bytes.len() {
1528 let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
1529 if let Some(s) = info.to_unicode.get(&code) {
1530 result.push_str(s);
1531 } else {
1532 if let Some(ch) = char::from_u32(code) {
1534 if !ch.is_control() || ch == ' ' || ch == '\t' || ch == '\n' {
1535 result.push_char(ch, false);
1536 }
1537 }
1538 }
1539 i += 2;
1540 }
1541 return result;
1542 }
1543
1544 if !info.is_cid && !info.to_unicode.is_empty() {
1545 let mut result = DecodedPdfString::default();
1547 for &b in bytes {
1548 if let Some(s) = info.to_unicode.get(&(b as u32)) {
1549 result.push_str(s);
1550 } else if info.ct_codes[b as usize] {
1551 result.push_char(CT_LIGATURE_MARKER, true);
1552 } else if let Some(ch) = info.encoding_map[b as usize] {
1553 result.push_char(ch, false);
1554 } else {
1555 let ch = b as char;
1556 if is_printable_or_space(ch) {
1557 result.push_char(ch, false);
1558 }
1559 }
1560 }
1561 return result;
1562 }
1563
1564 if !info.is_cid
1566 && (info.encoding_map.iter().any(|c| c.is_some()) || info.ct_codes.iter().any(|f| *f))
1567 {
1568 let mut result = DecodedPdfString::default();
1569 for &b in bytes {
1570 if info.ct_codes[b as usize] {
1571 result.push_char(CT_LIGATURE_MARKER, true);
1572 } else if let Some(ch) = info.encoding_map[b as usize] {
1573 result.push_char(ch, false);
1574 } else {
1575 let ch = b as char;
1576 if is_printable_or_space(ch) {
1577 result.push_char(ch, false);
1578 }
1579 }
1580 }
1581 return result;
1582 }
1583 }
1584
1585 let mut result = DecodedPdfString::default();
1587 for &b in bytes {
1588 let ch = b as char;
1589 if is_printable_or_space(ch) {
1590 result.push_char(ch, false);
1591 }
1592 }
1593 result
1594}
1595
/// Returns `true` for tab, line feed, carriage return, and every character at or
/// above U+0020; all other C0 control characters are rejected.
fn is_printable_or_space(ch: char) -> bool {
    matches!(ch, '\t' | '\n' | '\r') || u32::from(ch) >= 0x20
}
1602
/// Compile-time switch for ligature expansion. When `true`, extracted text and
/// positioned glyphs have ligature characters replaced by their constituent
/// letters; when `false`, decoded text is passed through unchanged.
const LIGATURE_DECOMP: bool = true;
1611
/// Maps a Latin presentation-form ligature (U+FB00..=U+FB06) to its letters.
///
/// Returns `None` for every other character.
fn decompose_ligature_char(c: char) -> Option<&'static str> {
    const LATIN_LIGATURES: [(char, &str); 7] = [
        ('\u{FB00}', "ff"),
        ('\u{FB01}', "fi"),
        ('\u{FB02}', "fl"),
        ('\u{FB03}', "ffi"),
        ('\u{FB04}', "ffl"),
        ('\u{FB05}', "st"),
        ('\u{FB06}', "st"),
    ];
    LATIN_LIGATURES
        .iter()
        .find(|(ligature, _)| *ligature == c)
        .map(|&(_, letters)| letters)
}
1633
1634fn decompose_ligature_char_with_origin(c: char, is_ct_origin: bool) -> Option<&'static str> {
1635 if is_ct_origin && c == CT_LIGATURE_MARKER {
1636 Some("ct")
1637 } else {
1638 decompose_ligature_char(c)
1639 }
1640}
1641
1642fn decompose_glyph_to_string(c: char) -> Option<String> {
1650 use unicode_normalization::UnicodeNormalization;
1651 if let Some(s) = decompose_ligature_char(c) {
1652 return Some(s.to_string());
1653 }
1654 if matches!(c, '\u{FB00}'..='\u{FB4F}') {
1655 let nfkd: String = c.nfkd().collect();
1656 if nfkd != c.to_string() {
1661 return Some(nfkd);
1662 }
1663 }
1664 None
1665}
1666
1667fn decompose_ligatures(s: &str) -> String {
1674 let mut out = String::with_capacity(s.len());
1675 for c in s.chars() {
1676 if let Some(replacement) = decompose_glyph_to_string(c) {
1677 out.push_str(&replacement);
1678 } else {
1679 out.push(c);
1680 }
1681 }
1682 out
1683}
1684
1685fn decompose_decoded_ligatures(s: &DecodedPdfString) -> String {
1686 let mut out = String::with_capacity(s.text.len());
1687 for (c, is_ct_origin) in s.iter() {
1688 if let Some(replacement) = decompose_ligature_char_with_origin(c, is_ct_origin) {
1689 out.push_str(replacement);
1690 } else if let Some(replacement) = decompose_glyph_to_string(c) {
1691 out.push_str(&replacement);
1692 } else {
1693 out.push(c);
1694 }
1695 }
1696 out
1697}
1698
1699fn maybe_decompose_decoded(s: &DecodedPdfString) -> String {
1703 if LIGATURE_DECOMP {
1704 if s.ct_origins.iter().any(|origin| *origin) {
1705 decompose_decoded_ligatures(s)
1706 } else {
1707 decompose_ligatures(&s.text)
1708 }
1709 } else {
1710 s.text.clone()
1711 }
1712}
1713
1714pub fn extract_page_blocks(doc: &Document, page_num: u32) -> Vec<TextBlock> {
1716 let pages = doc.get_pages();
1717 let Some(&page_id) = pages.get(&page_num) else {
1718 return Vec::new();
1719 };
1720
1721 let font_map = build_font_map(doc, page_id);
1722 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1723 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1724 if let Ok(content) = Content::decode(&content_bytes) {
1725 return extract_blocks_from_ops_inner(
1726 &content.operations,
1727 page_num,
1728 &font_map,
1729 Some((doc, &resources)),
1730 0,
1731 None,
1732 );
1733 }
1734 }
1735
1736 Vec::new()
1737}
1738
1739pub fn extract_blocks_from_page_id(
1745 doc: &Document,
1746 page_id: lopdf::ObjectId,
1747 page_num: u32,
1748) -> Vec<TextBlock> {
1749 let font_map = build_font_map(doc, page_id);
1750 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1751 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1752 if let Ok(content) = Content::decode(&content_bytes) {
1753 return extract_blocks_from_ops_inner(
1754 &content.operations,
1755 page_num,
1756 &font_map,
1757 Some((doc, &resources)),
1758 0,
1759 None,
1760 );
1761 }
1762 }
1763 Vec::new()
1764}
1765
1766pub fn extract_text(doc: &Document) -> Vec<TextBlock> {
1768 let pages = doc.get_pages();
1769 let mut blocks = Vec::new();
1770
1771 for (&page_num, &page_id) in &pages {
1772 let font_map = build_font_map(doc, page_id);
1773 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1774 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1775 if let Ok(content) = Content::decode(&content_bytes) {
1776 let page_blocks = extract_blocks_from_ops_inner(
1777 &content.operations,
1778 page_num,
1779 &font_map,
1780 Some((doc, &resources)),
1781 0,
1782 None,
1783 );
1784 blocks.extend(page_blocks);
1785 }
1786 }
1787 }
1788
1789 blocks
1790}
1791
1792pub fn extract_page_text(doc: &Document, page_num: u32) -> Result<String> {
1794 let pages = doc.get_pages();
1795 let total = pages.len() as u32;
1796
1797 if page_num == 0 || page_num > total {
1798 return Err(ExtractError::PageOutOfRange(page_num, total));
1799 }
1800
1801 let page_id = *pages
1802 .get(&page_num)
1803 .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1804
1805 let font_map = build_font_map(doc, page_id);
1806 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1807 let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1808 let content = match Content::decode(&content_bytes) {
1809 Ok(c) => c,
1810 Err(_) => return Ok(String::new()),
1811 };
1812
1813 let blocks = extract_blocks_from_ops_inner(
1814 &content.operations,
1815 page_num,
1816 &font_map,
1817 Some((doc, &resources)),
1818 0,
1819 None,
1820 );
1821 let text = blocks
1822 .iter()
1823 .map(|b| b.text.as_str())
1824 .collect::<Vec<_>>()
1825 .join("");
1826
1827 Ok(text)
1828}
1829
1830pub fn extract_positioned_chars(doc: &Document, page_num: u32) -> Result<Vec<PositionedChar>> {
1832 let pages = doc.get_pages();
1833 let total = pages.len() as u32;
1834
1835 if page_num == 0 || page_num > total {
1836 return Err(ExtractError::PageOutOfRange(page_num, total));
1837 }
1838
1839 let page_id = *pages
1840 .get(&page_num)
1841 .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1842
1843 let font_map = build_font_map(doc, page_id);
1844 let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1845 let content = match Content::decode(&content_bytes) {
1846 Ok(c) => c,
1847 Err(_) => return Ok(Vec::new()),
1848 };
1849
1850 let chars = extract_chars_from_ops(&content.operations, page_num, &font_map);
1851 Ok(chars)
1852}
1853
1854fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> std::result::Result<Vec<u8>, ()> {
1856 doc.get_page_content(page_id).map_err(|_| ())
1857}
1858
/// One open marked-content sequence on the block extractor's stack.
///
/// An entry is pushed for each `BMC`/`BDC` operator and popped on `EMC`.
#[derive(Debug, Clone, Default)]
struct MarkedContentEntry {
    /// `ActualText` read from the `BDC` property list; `None` for `BMC` or when
    /// the property list has no usable `ActualText` string.
    actual_text: Option<String>,
}
1869
1870fn resolve_bdc_properties(
1874 doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1875 operand: &Object,
1876) -> Option<lopdf::Dictionary> {
1877 match operand {
1878 Object::Dictionary(d) => Some(d.clone()),
1879 Object::Reference(r) => doc_and_resources.and_then(|(doc, _)| {
1880 doc.get_object(*r).ok().and_then(|o| match o {
1881 Object::Dictionary(d) => Some(d.clone()),
1882 _ => None,
1883 })
1884 }),
1885 Object::Name(n) => {
1886 let (doc, resources) = doc_and_resources?;
1887 let props = resolve_dict(doc, resources, b"Properties")?;
1888 match props.get(n.as_slice()).ok()? {
1889 Object::Dictionary(d) => Some(d.clone()),
1890 Object::Reference(r) => match doc.get_object(*r).ok()? {
1891 Object::Dictionary(d) => Some(d.clone()),
1892 _ => None,
1893 },
1894 _ => None,
1895 }
1896 }
1897 _ => None,
1898 }
1899}
1900
1901fn decode_pdf_text_string(bytes: &[u8]) -> String {
1905 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1906 let u16s: Vec<u16> = bytes[2..]
1907 .chunks_exact(2)
1908 .map(|c| u16::from_be_bytes([c[0], c[1]]))
1909 .collect();
1910 return String::from_utf16_lossy(&u16s);
1911 }
1912 if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
1913 return String::from_utf8_lossy(&bytes[3..]).into_owned();
1914 }
1915 bytes
1916 .iter()
1917 .filter_map(|&b| {
1918 let ch = b as char;
1919 if is_printable_or_space(ch) {
1920 Some(ch)
1921 } else {
1922 None
1923 }
1924 })
1925 .collect()
1926}
1927
1928fn extract_actual_text(props: &lopdf::Dictionary) -> Option<String> {
1930 let obj = props.get(b"ActualText").ok()?;
1931 match obj {
1932 Object::String(bytes, _) => Some(decode_pdf_text_string(bytes)),
1933 _ => None,
1934 }
1935}
1936
1937fn current_actual_text(stack: &[MarkedContentEntry], inherited: Option<&str>) -> Option<String> {
1943 stack
1944 .iter()
1945 .rev()
1946 .find_map(|e| e.actual_text.clone())
1947 .or_else(|| inherited.map(|s| s.to_string()))
1948}
1949
1950fn extract_blocks_from_ops_inner(
1956 ops: &[Operation],
1957 page: u32,
1958 font_map: &HashMap<String, FontInfo>,
1959 doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1960 depth: u32,
1961 inherited_actual_text: Option<&str>,
1962) -> Vec<TextBlock> {
1963 let mut state = TextState::default();
1964 let mut blocks = Vec::new();
1965 let mut mc_stack: Vec<MarkedContentEntry> = Vec::new();
1966
1967 for op in ops {
1968 match op.operator.as_str() {
1969 "q" => {
1970 state.gs_stack.push(GraphicsState {
1971 ctm: state.ctm,
1972 fill_color: state.fill_color,
1973 });
1974 }
1975 "Q" => {
1976 if let Some(gs) = state.gs_stack.pop() {
1977 state.ctm = gs.ctm;
1978 state.fill_color = gs.fill_color;
1979 }
1980 }
1981 "cm" => {
1982 if let Some(m) = extract_matrix(&op.operands) {
1983 state.ctm = multiply_matrix(&state.ctm, &m);
1984 }
1985 }
1986 "rg" if op.operands.len() >= 3 => {
1993 if let (Some(r), Some(g), Some(b)) = (
1994 op.operands.first().and_then(as_number),
1995 op.operands.get(1).and_then(as_number),
1996 op.operands.get(2).and_then(as_number),
1997 ) {
1998 state.fill_color = Some([
1999 clamp_unit_to_u8(r),
2000 clamp_unit_to_u8(g),
2001 clamp_unit_to_u8(b),
2002 255,
2003 ]);
2004 }
2005 }
2006 "g" => {
2007 if let Some(v) = op.operands.first().and_then(as_number) {
2008 let byte = clamp_unit_to_u8(v);
2009 state.fill_color = Some([byte, byte, byte, 255]);
2010 }
2011 }
2012 "k" if op.operands.len() >= 4 => {
2013 if let (Some(c), Some(m), Some(y), Some(kk)) = (
2014 op.operands.first().and_then(as_number),
2015 op.operands.get(1).and_then(as_number),
2016 op.operands.get(2).and_then(as_number),
2017 op.operands.get(3).and_then(as_number),
2018 ) {
2019 state.fill_color = Some(cmyk_to_rgba(c, m, y, kk));
2020 }
2021 }
2022 "RG" | "G" | "K" => {}
2023 "sc" | "scn" => {
2024 state.fill_color = match op.operands.len() {
2025 1 => op.operands.first().and_then(as_number).map(|v| {
2026 let b = clamp_unit_to_u8(v);
2027 [b, b, b, 255]
2028 }),
2029 3 => {
2030 let r = op.operands.first().and_then(as_number);
2031 let g = op.operands.get(1).and_then(as_number);
2032 let b = op.operands.get(2).and_then(as_number);
2033 match (r, g, b) {
2034 (Some(r), Some(g), Some(b)) => Some([
2035 clamp_unit_to_u8(r),
2036 clamp_unit_to_u8(g),
2037 clamp_unit_to_u8(b),
2038 255,
2039 ]),
2040 _ => None,
2041 }
2042 }
2043 4 => {
2044 let c = op.operands.first().and_then(as_number);
2045 let m = op.operands.get(1).and_then(as_number);
2046 let y = op.operands.get(2).and_then(as_number);
2047 let kk = op.operands.get(3).and_then(as_number);
2048 match (c, m, y, kk) {
2049 (Some(c), Some(m), Some(y), Some(kk)) => {
2050 Some(cmyk_to_rgba(c, m, y, kk))
2051 }
2052 _ => None,
2053 }
2054 }
2055 _ => None,
2056 };
2057 }
2058 "cs" | "CS" => {
2059 state.fill_color = None;
2060 }
2061 "BT" => {
2062 state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2063 state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2064 }
2065 "Tf" if op.operands.len() >= 2 => {
2066 if let Object::Name(ref name) = op.operands[0] {
2067 state.font_name = String::from_utf8_lossy(name).to_string();
2068 }
2069 if let Some(size) = as_number(&op.operands[1]) {
2070 state.font_size = size;
2071 }
2072 }
2073 "Tc" => {
2074 if let Some(v) = op.operands.first().and_then(as_number) {
2075 state.tc = v;
2076 }
2077 }
2078 "Tw" => {
2079 if let Some(v) = op.operands.first().and_then(as_number) {
2080 state.tw = v;
2081 }
2082 }
2083 "Tz" => {
2084 if let Some(v) = op.operands.first().and_then(as_number) {
2085 state.th = v;
2086 }
2087 }
2088 "TL" => {
2089 if let Some(v) = op.operands.first().and_then(as_number) {
2090 state.tl = v;
2091 }
2092 }
2093 "Ts" => {
2094 if let Some(v) = op.operands.first().and_then(as_number) {
2095 state.ts = v;
2096 }
2097 }
2098 "Td" if op.operands.len() >= 2 => {
2099 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2100 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2101 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2102 state.tlm = new_tlm;
2103 state.tm = new_tlm;
2104 }
2105 "TD" if op.operands.len() >= 2 => {
2106 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2107 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2108 state.tl = -ty;
2109 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2110 state.tlm = new_tlm;
2111 state.tm = new_tlm;
2112 }
2113 "Tm" => {
2114 if let Some(m) = extract_matrix(&op.operands) {
2115 state.tm = m;
2116 state.tlm = m;
2117 }
2118 }
2119 "T*" => {
2120 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2121 state.tlm = new_tlm;
2122 state.tm = new_tlm;
2123 }
2124 "Tj" => {
2125 let fi = font_map.get(&state.font_name);
2126 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2127 let raw = op.operands.iter().find_map(|o| {
2128 if let Object::String(b, _) = o {
2129 Some(b.as_slice())
2130 } else {
2131 None
2132 }
2133 });
2134
2135 let x = state.tm[4];
2137 let y = state.tm[5];
2138 let gctx = GlyphCtx {
2139 font_size: state.font_size,
2140 tc: state.tc,
2141 tw: state.tw,
2142 th: state.th,
2143 };
2144 let (char_bounds, width_source, total_adv) = match (raw, fi) {
2145 (Some(raw), Some(fi)) => {
2146 let (cb, ws, adv) = char_bounds_from_bytes(raw, fi, x, y, &gctx);
2147 (cb, ws, adv)
2148 }
2149 _ => {
2150 let cw = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2151 let n = text.glyph_count();
2152 let cb = (0..n)
2153 .map(|i| {
2154 let cx = x + i as f64 * cw;
2155 [cx, y, cx + cw, y + state.font_size]
2156 })
2157 .collect();
2158 (cb, WidthSource::Estimate, n as f64 * cw)
2159 }
2160 };
2161
2162 if !text.is_empty() {
2165 let display_text = maybe_decompose_decoded(&text);
2166 blocks.push(TextBlock {
2167 text: display_text,
2168 page,
2169 bbox: [x, y, x + total_adv, y + state.font_size],
2170 font_name: state.font_name.clone(),
2171 font_size: state.font_size,
2172 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
2173 base_font: font_map
2174 .get(&state.font_name)
2175 .and_then(|fi| fi.base_font.clone()),
2176 is_bold: font_map.get(&state.font_name).is_some_and(|fi| fi.is_bold),
2177 is_italic: font_map
2178 .get(&state.font_name)
2179 .is_some_and(|fi| fi.is_italic),
2180 color: state.fill_color,
2181 width_source,
2182 char_bounds,
2183 });
2184 }
2185
2186 state.tm[4] = x + total_adv;
2187 }
2188 }
2189 "TJ" => {
2190 if let Some(Object::Array(ref arr)) = op.operands.first() {
2191 let x_start = state.tm[4];
2192 let y = state.tm[5];
2193 let fi = font_map.get(&state.font_name);
2194
2195 let gctx = GlyphCtx {
2197 font_size: state.font_size,
2198 tc: state.tc,
2199 tw: state.tw,
2200 th: state.th,
2201 };
2202 let (char_bounds, width_source, x_end) = match fi {
2203 Some(fi) => char_bounds_from_tj_array(arr, fi, x_start, y, &gctx),
2204 None => {
2205 let cw = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2206 let mut cx = x_start;
2208 let mut cb = Vec::new();
2209 for item in arr {
2210 if let Object::String(bytes, _) = item {
2211 let n = if fi.map(|f| f.is_cid).unwrap_or(false) {
2212 bytes.len() / 2
2213 } else {
2214 bytes.len()
2215 };
2216 for _ in 0..n {
2217 cb.push([cx, y, cx + cw, y + state.font_size]);
2218 cx += cw + state.tc;
2219 }
2220 } else if let Some(adj) = as_number(item) {
2221 cx -= adj / 1000.0 * state.font_size;
2222 }
2223 }
2224 (cb, WidthSource::Estimate, cx)
2225 }
2226 };
2227
2228 let mut combined_text = DecodedPdfString::default();
2230 for item in arr {
2231 if let Object::String(bytes, _) = item {
2232 combined_text.extend(decode_pdf_string_with_font_marked(bytes, fi));
2233 }
2234 }
2235
2236 state.tm[4] = x_end;
2237
2238 if !combined_text.is_empty() {
2239 blocks.push(TextBlock {
2240 text: maybe_decompose_decoded(&combined_text),
2241 page,
2242 bbox: [x_start, y, x_end, y + state.font_size],
2243 font_name: state.font_name.clone(),
2244 font_size: state.font_size,
2245 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
2246 base_font: font_map
2247 .get(&state.font_name)
2248 .and_then(|fi| fi.base_font.clone()),
2249 is_bold: font_map.get(&state.font_name).is_some_and(|fi| fi.is_bold),
2250 is_italic: font_map
2251 .get(&state.font_name)
2252 .is_some_and(|fi| fi.is_italic),
2253 color: state.fill_color,
2254 width_source,
2255 char_bounds,
2256 });
2257 }
2258 }
2259 }
2260 "'" => {
2261 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2263 state.tlm = new_tlm;
2264 state.tm = new_tlm;
2265
2266 let fi = font_map.get(&state.font_name);
2267 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2268 let raw = op.operands.iter().find_map(|o| {
2269 if let Object::String(b, _) = o {
2270 Some(b.as_slice())
2271 } else {
2272 None
2273 }
2274 });
2275 let x = state.tm[4];
2276 let y = state.tm[5];
2277 let gctx = GlyphCtx {
2278 font_size: state.font_size,
2279 tc: state.tc,
2280 tw: state.tw,
2281 th: state.th,
2282 };
2283 let (char_bounds, width_source, total_adv) = match (raw, fi) {
2284 (Some(raw), Some(fi)) => {
2285 let (cb, ws, adv) = char_bounds_from_bytes(raw, fi, x, y, &gctx);
2286 (cb, ws, adv)
2287 }
2288 _ => {
2289 let cw = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2290 let n = text.glyph_count();
2291 let cb = (0..n)
2292 .map(|i| {
2293 let cx = x + i as f64 * cw;
2294 [cx, y, cx + cw, y + state.font_size]
2295 })
2296 .collect();
2297 (cb, WidthSource::Estimate, n as f64 * cw)
2298 }
2299 };
2300
2301 if !text.is_empty() {
2302 let display_text = maybe_decompose_decoded(&text);
2303 blocks.push(TextBlock {
2304 text: display_text,
2305 page,
2306 bbox: [x, y, x + total_adv, y + state.font_size],
2307 font_name: state.font_name.clone(),
2308 font_size: state.font_size,
2309 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
2310 base_font: font_map
2311 .get(&state.font_name)
2312 .and_then(|fi| fi.base_font.clone()),
2313 is_bold: font_map.get(&state.font_name).is_some_and(|fi| fi.is_bold),
2314 is_italic: font_map
2315 .get(&state.font_name)
2316 .is_some_and(|fi| fi.is_italic),
2317 color: state.fill_color,
2318 width_source,
2319 char_bounds,
2320 });
2321 }
2322
2323 state.tm[4] = x + total_adv;
2324 }
2325 }
2326 "\"" if op.operands.len() >= 3 => {
2327 if let Some(tw) = as_number(&op.operands[0]) {
2329 state.tw = tw;
2330 }
2331 if let Some(tc) = as_number(&op.operands[1]) {
2332 state.tc = tc;
2333 }
2334
2335 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2336 state.tlm = new_tlm;
2337 state.tm = new_tlm;
2338
2339 let fi = font_map.get(&state.font_name);
2340 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands[2..], fi)
2341 {
2342 let raw = op.operands[2..].iter().find_map(|o| {
2343 if let Object::String(b, _) = o {
2344 Some(b.as_slice())
2345 } else {
2346 None
2347 }
2348 });
2349 let x = state.tm[4];
2350 let y = state.tm[5];
2351 let gctx = GlyphCtx {
2352 font_size: state.font_size,
2353 tc: state.tc,
2354 tw: state.tw,
2355 th: state.th,
2356 };
2357 let (char_bounds, width_source, total_adv) = match (raw, fi) {
2358 (Some(raw), Some(fi)) => {
2359 let (cb, ws, adv) = char_bounds_from_bytes(raw, fi, x, y, &gctx);
2360 (cb, ws, adv)
2361 }
2362 _ => {
2363 let cw = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2364 let n = text.glyph_count();
2365 let cb = (0..n)
2366 .map(|i| {
2367 let cx = x + i as f64 * cw;
2368 [cx, y, cx + cw, y + state.font_size]
2369 })
2370 .collect();
2371 (cb, WidthSource::Estimate, n as f64 * cw)
2372 }
2373 };
2374
2375 if !text.is_empty() {
2376 let display_text = maybe_decompose_decoded(&text);
2377 blocks.push(TextBlock {
2378 text: display_text,
2379 page,
2380 bbox: [x, y, x + total_adv, y + state.font_size],
2381 font_name: state.font_name.clone(),
2382 font_size: state.font_size,
2383 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
2384 base_font: font_map
2385 .get(&state.font_name)
2386 .and_then(|fi| fi.base_font.clone()),
2387 is_bold: font_map.get(&state.font_name).is_some_and(|fi| fi.is_bold),
2388 is_italic: font_map
2389 .get(&state.font_name)
2390 .is_some_and(|fi| fi.is_italic),
2391 color: state.fill_color,
2392 width_source,
2393 char_bounds,
2394 });
2395 }
2396
2397 state.tm[4] = x + total_adv;
2398 }
2399 }
2400 "BMC" => {
2401 mc_stack.push(MarkedContentEntry::default());
2405 }
2406 "BDC" => {
2407 let actual_text = op
2411 .operands
2412 .get(1)
2413 .and_then(|o| resolve_bdc_properties(doc_and_resources, o))
2414 .as_ref()
2415 .and_then(extract_actual_text);
2416 mc_stack.push(MarkedContentEntry { actual_text });
2417 }
2418 "EMC" => {
2419 mc_stack.pop();
2422 }
2423 "Do" if depth < 5 => {
2424 if let Some((doc, resources)) = doc_and_resources {
2429 if let Some(Object::Name(ref xobj_name)) = op.operands.first() {
2430 let xobj_name_str = String::from_utf8_lossy(xobj_name);
2431 let inherited_for_child =
2432 current_actual_text(&mc_stack, inherited_actual_text);
2433 if let Some(xobj_blocks) = extract_form_xobject_text(
2434 doc,
2435 resources,
2436 &xobj_name_str,
2437 page,
2438 font_map,
2439 depth,
2440 inherited_for_child.as_deref(),
2441 ) {
2442 blocks.extend(xobj_blocks);
2443 }
2444 }
2445 }
2446 }
2447 _ => {}
2448 }
2449 }
2450
2451 blocks
2452}
2453
2454fn extract_form_xobject_text(
2461 doc: &Document,
2462 resources: &lopdf::Dictionary,
2463 name: &str,
2464 page: u32,
2465 font_map: &HashMap<String, FontInfo>,
2466 depth: u32,
2467 inherited_actual_text: Option<&str>,
2468) -> Option<Vec<TextBlock>> {
2469 let xobj_dict = match resources.get(b"XObject").ok()? {
2471 Object::Dictionary(d) => d.clone(),
2472 Object::Reference(r) => match doc.get_object(*r).ok()? {
2473 Object::Dictionary(d) => d.clone(),
2474 _ => return None,
2475 },
2476 _ => return None,
2477 };
2478
2479 let xobj_ref = match xobj_dict.get(name.as_bytes()).ok()? {
2480 Object::Reference(r) => *r,
2481 _ => return None,
2482 };
2483
2484 let stream = match doc.get_object(xobj_ref).ok()? {
2485 Object::Stream(s) => s.clone(),
2486 _ => return None,
2487 };
2488
2489 let subtype = stream
2491 .dict
2492 .get(b"Subtype")
2493 .ok()
2494 .and_then(|o| match o {
2495 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
2496 _ => None,
2497 })
2498 .unwrap_or_default();
2499 if subtype != "Form" {
2500 return None;
2501 }
2502
2503 let content_bytes = stream
2505 .decompressed_content()
2506 .ok()
2507 .unwrap_or_else(|| stream.content.clone());
2508 let content = Content::decode(&content_bytes).ok()?;
2509
2510 let mut xobj_font_map = font_map.clone();
2513 if let Some(xobj_resources) = resolve_dict(doc, &stream.dict, b"Resources") {
2514 if let Some(xobj_fonts) = resolve_dict(doc, &xobj_resources, b"Font") {
2515 for (name_bytes, value) in xobj_fonts.iter() {
2516 let fname = String::from_utf8_lossy(name_bytes).to_string();
2517 if let Some(fi) = build_font_info_from_value(doc, value) {
2518 xobj_font_map.insert(fname, fi);
2519 }
2520 }
2521 }
2522 }
2523
2524 let xobj_resources =
2526 resolve_dict(doc, &stream.dict, b"Resources").unwrap_or_else(|| resources.clone());
2527
2528 Some(extract_blocks_from_ops_inner(
2529 &content.operations,
2530 page,
2531 &xobj_font_map,
2532 Some((doc, &xobj_resources)),
2533 depth + 1,
2534 inherited_actual_text,
2535 ))
2536}
2537
2538fn build_font_info_from_value(doc: &Document, value: &Object) -> Option<FontInfo> {
2540 let font = match value {
2541 Object::Reference(r) => match doc.get_object(*r).ok()? {
2542 Object::Dictionary(d) => d.clone(),
2543 _ => return None,
2544 },
2545 Object::Dictionary(d) => d.clone(),
2546 _ => return None,
2547 };
2548
2549 let subtype = font
2550 .get(b"Subtype")
2551 .ok()
2552 .and_then(|o| match o {
2553 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
2554 _ => None,
2555 })
2556 .unwrap_or_default();
2557
2558 let is_cid = subtype == "Type0";
2559 let mut to_unicode = parse_to_unicode_from_font(doc, &font);
2560
2561 if to_unicode.is_empty() && is_cid {
2562 if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
2563 for d in descendants {
2564 let desc_dict = match d {
2565 Object::Reference(r) => {
2566 let Some(Object::Dictionary(d)) = doc.get_object(*r).ok() else {
2569 continue;
2570 };
2571 d
2572 }
2573 Object::Dictionary(d) => d,
2574 _ => continue,
2575 };
2576 let tu = parse_to_unicode_from_font(doc, desc_dict);
2577 if !tu.is_empty() {
2578 to_unicode = tu;
2579 break;
2580 }
2581 }
2582 }
2583 }
2584
2585 let (encoding_map, ct_codes) = if !is_cid {
2586 build_encoding_map(doc, &font)
2587 } else {
2588 ([None; 256], [false; 256])
2589 };
2590
2591 let (base_font, is_bold, is_italic) = derive_font_style(doc, &font);
2592 let (simple_widths, cid_default_width, cid_widths) = parse_font_widths(doc, &font, is_cid);
2593
2594 Some(FontInfo {
2595 is_cid,
2596 to_unicode,
2597 encoding_map,
2598 ct_codes,
2599 base_font,
2600 is_bold,
2601 is_italic,
2602 simple_widths,
2603 cid_default_width,
2604 cid_widths,
2605 })
2606}
2607
2608fn push_glyph_positioned(
2622 chars: &mut Vec<PositionedChar>,
2623 state: &mut TextState,
2624 page: u32,
2625 glyph: char,
2626 is_ct_origin: bool,
2627 char_w: f64,
2628) {
2629 let (gx, gy) = apply_ctm(state);
2630 let constituents: Option<String> = if LIGATURE_DECOMP {
2631 if let Some(s) = decompose_ligature_char_with_origin(glyph, is_ct_origin) {
2635 Some(s.to_string())
2636 } else {
2637 decompose_glyph_to_string(glyph)
2638 }
2639 } else {
2640 None
2641 };
2642 match constituents {
2643 None => {
2644 chars.push(PositionedChar {
2645 ch: glyph,
2646 page,
2647 bbox: [gx, gy, gx + char_w, gy + state.font_size],
2648 });
2649 }
2650 Some(s) => {
2651 let n = s.chars().count() as f64;
2652 let part_w = char_w / n;
2653 for (i, c) in s.chars().enumerate() {
2654 let x = gx + part_w * i as f64;
2655 chars.push(PositionedChar {
2656 ch: c,
2657 page,
2658 bbox: [x, gy, x + part_w, gy + state.font_size],
2659 });
2660 }
2661 }
2662 }
2663 state.tm[4] += char_w + state.tc;
2664}
2665
2666fn extract_chars_from_ops(
2668 ops: &[Operation],
2669 page: u32,
2670 font_map: &HashMap<String, FontInfo>,
2671) -> Vec<PositionedChar> {
2672 let mut state = TextState::default();
2673 let mut chars = Vec::new();
2674
2675 for op in ops {
2676 match op.operator.as_str() {
2677 "q" => {
2678 state.gs_stack.push(GraphicsState {
2679 ctm: state.ctm,
2680 fill_color: state.fill_color,
2681 });
2682 }
2683 "Q" => {
2684 if let Some(gs) = state.gs_stack.pop() {
2685 state.ctm = gs.ctm;
2686 state.fill_color = gs.fill_color;
2687 }
2688 }
2689 "cm" => {
2690 if let Some(m) = extract_matrix(&op.operands) {
2691 state.ctm = multiply_matrix(&state.ctm, &m);
2692 }
2693 }
2694 "rg" if op.operands.len() >= 3 => {
2701 if let (Some(r), Some(g), Some(b)) = (
2702 op.operands.first().and_then(as_number),
2703 op.operands.get(1).and_then(as_number),
2704 op.operands.get(2).and_then(as_number),
2705 ) {
2706 state.fill_color = Some([
2707 clamp_unit_to_u8(r),
2708 clamp_unit_to_u8(g),
2709 clamp_unit_to_u8(b),
2710 255,
2711 ]);
2712 }
2713 }
2714 "g" => {
2715 if let Some(v) = op.operands.first().and_then(as_number) {
2716 let byte = clamp_unit_to_u8(v);
2717 state.fill_color = Some([byte, byte, byte, 255]);
2718 }
2719 }
2720 "k" if op.operands.len() >= 4 => {
2721 if let (Some(c), Some(m), Some(y), Some(kk)) = (
2722 op.operands.first().and_then(as_number),
2723 op.operands.get(1).and_then(as_number),
2724 op.operands.get(2).and_then(as_number),
2725 op.operands.get(3).and_then(as_number),
2726 ) {
2727 state.fill_color = Some(cmyk_to_rgba(c, m, y, kk));
2728 }
2729 }
2730 "RG" | "G" | "K" => {}
2731 "sc" | "scn" => {
2732 state.fill_color = match op.operands.len() {
2733 1 => op.operands.first().and_then(as_number).map(|v| {
2734 let b = clamp_unit_to_u8(v);
2735 [b, b, b, 255]
2736 }),
2737 3 => {
2738 let r = op.operands.first().and_then(as_number);
2739 let g = op.operands.get(1).and_then(as_number);
2740 let b = op.operands.get(2).and_then(as_number);
2741 match (r, g, b) {
2742 (Some(r), Some(g), Some(b)) => Some([
2743 clamp_unit_to_u8(r),
2744 clamp_unit_to_u8(g),
2745 clamp_unit_to_u8(b),
2746 255,
2747 ]),
2748 _ => None,
2749 }
2750 }
2751 4 => {
2752 let c = op.operands.first().and_then(as_number);
2753 let m = op.operands.get(1).and_then(as_number);
2754 let y = op.operands.get(2).and_then(as_number);
2755 let kk = op.operands.get(3).and_then(as_number);
2756 match (c, m, y, kk) {
2757 (Some(c), Some(m), Some(y), Some(kk)) => {
2758 Some(cmyk_to_rgba(c, m, y, kk))
2759 }
2760 _ => None,
2761 }
2762 }
2763 _ => None,
2764 };
2765 }
2766 "cs" | "CS" => {
2767 state.fill_color = None;
2768 }
2769 "BT" => {
2770 state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2771 state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2772 }
2773 "Tf" if op.operands.len() >= 2 => {
2774 if let Object::Name(ref name) = op.operands[0] {
2775 state.font_name = String::from_utf8_lossy(name).to_string();
2776 }
2777 if let Some(size) = as_number(&op.operands[1]) {
2778 state.font_size = size;
2779 }
2780 }
2781 "Tc" => {
2782 if let Some(v) = op.operands.first().and_then(as_number) {
2783 state.tc = v;
2784 }
2785 }
2786 "Tw" => {
2787 if let Some(v) = op.operands.first().and_then(as_number) {
2788 state.tw = v;
2789 }
2790 }
2791 "Tz" => {
2792 if let Some(v) = op.operands.first().and_then(as_number) {
2793 state.th = v;
2794 }
2795 }
2796 "TL" => {
2797 if let Some(v) = op.operands.first().and_then(as_number) {
2798 state.tl = v;
2799 }
2800 }
2801 "Ts" => {
2802 if let Some(v) = op.operands.first().and_then(as_number) {
2803 state.ts = v;
2804 }
2805 }
2806 "Td" if op.operands.len() >= 2 => {
2807 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2808 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2809 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2810 state.tlm = new_tlm;
2811 state.tm = new_tlm;
2812 }
2813 "TD" if op.operands.len() >= 2 => {
2814 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2815 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2816 state.tl = -ty;
2817 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2818 state.tlm = new_tlm;
2819 state.tm = new_tlm;
2820 }
2821 "Tm" => {
2822 if let Some(m) = extract_matrix(&op.operands) {
2823 state.tm = m;
2824 state.tlm = m;
2825 }
2826 }
2827 "T*" => {
2828 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2829 state.tlm = new_tlm;
2830 state.tm = new_tlm;
2831 }
2832 "Tj" => {
2833 let fi = font_map.get(&state.font_name);
2834 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2835 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2836 for (ch, is_ct_origin) in text.iter() {
2837 push_glyph_positioned(
2838 &mut chars,
2839 &mut state,
2840 page,
2841 ch,
2842 is_ct_origin,
2843 char_w,
2844 );
2845 }
2846 }
2847 }
2848 "TJ" => {
2849 if let Some(Object::Array(ref arr)) = op.operands.first() {
2850 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2851 let fi = font_map.get(&state.font_name);
2852 for item in arr {
2853 match item {
2854 Object::String(bytes, _) => {
2855 let text = decode_pdf_string_with_font_marked(bytes, fi);
2856 for (ch, is_ct_origin) in text.iter() {
2857 push_glyph_positioned(
2858 &mut chars,
2859 &mut state,
2860 page,
2861 ch,
2862 is_ct_origin,
2863 char_w,
2864 );
2865 }
2866 }
2867 _ => {
2868 if let Some(adj) = as_number(item) {
2869 state.tm[4] -= adj / 1000.0 * state.font_size;
2870 }
2871 }
2872 }
2873 }
2874 }
2875 }
2876 "'" => {
2877 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2878 state.tlm = new_tlm;
2879 state.tm = new_tlm;
2880
2881 let fi = font_map.get(&state.font_name);
2882 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2883 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2884 for (ch, is_ct_origin) in text.iter() {
2885 push_glyph_positioned(
2886 &mut chars,
2887 &mut state,
2888 page,
2889 ch,
2890 is_ct_origin,
2891 char_w,
2892 );
2893 }
2894 }
2895 }
2896 "\"" if op.operands.len() >= 3 => {
2897 if let Some(tw) = as_number(&op.operands[0]) {
2898 state.tw = tw;
2899 }
2900 if let Some(tc) = as_number(&op.operands[1]) {
2901 state.tc = tc;
2902 }
2903
2904 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2905 state.tlm = new_tlm;
2906 state.tm = new_tlm;
2907
2908 let fi = font_map.get(&state.font_name);
2909 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands[2..], fi)
2910 {
2911 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2912 for (ch, is_ct_origin) in text.iter() {
2913 push_glyph_positioned(
2914 &mut chars,
2915 &mut state,
2916 page,
2917 ch,
2918 is_ct_origin,
2919 char_w,
2920 );
2921 }
2922 }
2923 }
2924 _ => {}
2925 }
2926 }
2927
2928 chars
2929}
2930
2931#[inline]
2940fn apply_ctm(state: &TextState) -> (f64, f64) {
2941 let x = state.ctm[0] * state.tm[4] + state.ctm[2] * state.tm[5] + state.ctm[4];
2942 let y = state.ctm[1] * state.tm[4] + state.ctm[3] * state.tm[5] + state.ctm[5];
2943 (x, y)
2944}
2945
2946fn extract_decoded_string_operand_with_font(
2948 operands: &[Object],
2949 font_info: Option<&FontInfo>,
2950) -> Option<DecodedPdfString> {
2951 for op in operands {
2952 if let Object::String(bytes, _) = op {
2953 return Some(decode_pdf_string_with_font_marked(bytes, font_info));
2954 }
2955 }
2956 None
2957}
2958
2959fn as_number(obj: &Object) -> Option<f64> {
2961 match obj {
2962 Object::Integer(i) => Some(*i as f64),
2963 Object::Real(f) => Some(*f as f64),
2964 _ => None,
2965 }
2966}
2967
2968fn extract_matrix(operands: &[Object]) -> Option<[f64; 6]> {
2970 if operands.len() < 6 {
2971 return None;
2972 }
2973 let a = as_number(&operands[0])?;
2974 let b = as_number(&operands[1])?;
2975 let c = as_number(&operands[2])?;
2976 let d = as_number(&operands[3])?;
2977 let e = as_number(&operands[4])?;
2978 let f = as_number(&operands[5])?;
2979 Some([a, b, c, d, e, f])
2980}
2981
/// Multiplies two affine matrices given in `[a, b, c, d, e, f]` form and
/// returns `m1 × m2`.
///
/// Each array stands for the 3×3 matrix `[[a, b, 0], [c, d, 0], [e, f, 1]]`,
/// so with row vectors the product applies `m1` first and `m2` second.
fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    let [a1, b1, c1, d1, e1, f1] = *m1;
    let [a2, b2, c2, d2, e2, f2] = *m2;
    [
        a1 * a2 + b1 * c2,
        a1 * b2 + b1 * d2,
        c1 * a2 + d1 * c2,
        c1 * b2 + d1 * d2,
        e1 * a2 + f1 * c2 + e2,
        e1 * b2 + f1 * d2 + f2,
    ]
}
2993
2994#[cfg(test)]
2995mod tests {
2996 use super::*;
2997 use lopdf::{dictionary, Document, Object, Stream};
2998
2999 fn make_doc_with_text(content: &[u8]) -> Document {
3001 let mut doc = Document::with_version("1.7");
3002
3003 let content_stream = Stream::new(dictionary! {}, content.to_vec());
3004 let content_id = doc.add_object(Object::Stream(content_stream));
3005
3006 let page_dict = dictionary! {
3007 "Type" => "Page",
3008 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3009 "Contents" => Object::Reference(content_id),
3010 };
3011 let page_id = doc.add_object(Object::Dictionary(page_dict));
3012
3013 let pages_dict = dictionary! {
3014 "Type" => "Pages",
3015 "Kids" => vec![Object::Reference(page_id)],
3016 "Count" => 1_i64,
3017 };
3018 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3019
3020 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3021 d.set("Parent", Object::Reference(pages_id));
3022 }
3023
3024 let catalog = dictionary! {
3025 "Type" => "Catalog",
3026 "Pages" => Object::Reference(pages_id),
3027 };
3028 let catalog_id = doc.add_object(Object::Dictionary(catalog));
3029 doc.trailer.set("Root", Object::Reference(catalog_id));
3030
3031 doc
3032 }
3033
/// A single `Tj` string produces exactly one block carrying the text, page 1
/// and the font size given to `Tf`.
#[test]
fn extract_simple_text() {
    let pdf = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let only = &extracted[0];
    assert_eq!(only.text, "Hello World");
    assert_eq!(only.page, 1);
    assert_eq!(only.font_size, 12.0);
}
3043
/// `extract_page_text` for page 1 returns the text shown on that page.
#[test]
fn extract_page_text_single() {
    let pdf = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
    let page_text = extract_page_text(&pdf, 1).unwrap();
    assert_eq!(page_text, "Hello");
}
3050
/// Requesting page 5 of a one-page document is reported as an error.
#[test]
fn extract_page_text_out_of_range() {
    let pdf = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
    assert!(extract_page_text(&pdf, 5).is_err());
}
3057
/// Two characters in one `Tj` come back as two positioned chars on page 1,
/// with the second one starting to the right of the first.
#[test]
fn extract_positioned_chars_basic() {
    let pdf = make_doc_with_text(b"BT /F1 12 Tf (AB) Tj ET");
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();
    assert_eq!(glyphs.len(), 2);

    let (first, second) = (&glyphs[0], &glyphs[1]);
    assert_eq!(first.ch, 'A');
    assert_eq!(second.ch, 'B');
    assert_eq!(first.page, 1);
    assert!(second.bbox[0] > first.bbox[0]);
}
3069
/// The strings of a `TJ` array are joined into one block; the numeric
/// adjustment between them contributes no text.
#[test]
fn extract_tj_array() {
    let pdf = make_doc_with_text(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET");
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);
    assert_eq!(extracted[0].text, "Hello");
}
3077
/// A content stream without text-showing operators yields no blocks.
#[test]
fn empty_page_extracts_no_text() {
    let pdf = make_doc_with_text(b"q Q");
    assert!(extract_text(&pdf).is_empty());
}
3084
/// Two `Tj` strings separated by `T*` are reported as two blocks, in order.
#[test]
fn multiline_text_extraction() {
    let pdf = make_doc_with_text(b"BT /F1 12 Tf 12 TL (Line1) Tj T* (Line2) Tj ET");
    let extracted = extract_text(&pdf);
    let lines: Vec<&str> = extracted.iter().map(|block| block.text.as_str()).collect();
    assert_eq!(lines, ["Line1", "Line2"]);
}
3093
/// Text shown inside a `BDC` span with `/ActualText` keeps its decoded text
/// and additionally exposes the replacement through `actual_text`.
#[test]
fn actual_text_basic_bdc_override() {
    let pdf = make_doc_with_text(b"BT /F1 12 Tf /Span <</ActualText (fi)>> BDC (X) Tj EMC ET");
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let span = &extracted[0];
    assert_eq!(span.text, "X");
    assert_eq!(span.actual_text.as_deref(), Some("fi"));
}
3105
/// With nested `/ActualText` spans the innermost one applies while it is
/// open, and the outer one applies again after the inner `EMC`.
#[test]
fn actual_text_nested_bdc_inner_overrides_outer() {
    let pdf = make_doc_with_text(
        b"BT /F1 12 Tf \
          /Span <</ActualText (outer)>> BDC \
          (A) Tj \
          /Span <</ActualText (inner)>> BDC \
          (B) Tj \
          EMC \
          (C) Tj \
          EMC ET",
    );
    let extracted = extract_text(&pdf);
    let seen: Vec<(&str, Option<&str>)> = extracted
        .iter()
        .map(|block| (block.text.as_str(), block.actual_text.as_deref()))
        .collect();
    assert_eq!(
        seen,
        [
            ("A", Some("outer")),
            ("B", Some("inner")),
            ("C", Some("outer")),
        ]
    );
}
3130
/// The `EMC` that closes a `BMC` section must not close the enclosing
/// `/ActualText` span: text after the inner `EMC` still carries the override,
/// and only text after the outer `EMC` has none.
#[test]
fn actual_text_bmc_does_not_leak_emc() {
    let pdf = make_doc_with_text(
        b"BT /F1 12 Tf \
          /Span <</ActualText (X)>> BDC \
          /Artifact BMC (in) Tj EMC \
          (out) Tj \
          EMC \
          (after) Tj ET",
    );
    let extracted = extract_text(&pdf);
    let overrides: Vec<Option<&str>> = extracted
        .iter()
        .map(|block| block.actual_text.as_deref())
        .collect();
    assert_eq!(overrides, [Some("X"), Some("X"), None]);
}
3149
3150 fn make_doc_with_xobj_inside_bdc(
3156 page_pre: &[u8],
3157 xobj_content: &[u8],
3158 page_post: &[u8],
3159 actual_text: &str,
3160 ) -> Document {
3161 let mut doc = Document::with_version("1.7");
3162
3163 let xobj_dict = dictionary! {
3164 "Type" => "XObject",
3165 "Subtype" => "Form",
3166 "BBox" => vec![0.into(), 0.into(), 100.into(), 100.into()],
3167 };
3168 let xobj_stream = Stream::new(xobj_dict, xobj_content.to_vec());
3169 let xobj_id = doc.add_object(Object::Stream(xobj_stream));
3170
3171 let mut page_content = Vec::<u8>::new();
3173 page_content.extend_from_slice(b"BT /F1 12 Tf /Span <</ActualText (");
3174 page_content.extend_from_slice(actual_text.as_bytes());
3175 page_content.extend_from_slice(b")>> BDC ");
3176 page_content.extend_from_slice(page_pre);
3177 page_content.extend_from_slice(b" /Fm0 Do ");
3178 page_content.extend_from_slice(page_post);
3179 page_content.extend_from_slice(b" EMC ET");
3180
3181 let content_stream = Stream::new(dictionary! {}, page_content);
3182 let content_id = doc.add_object(Object::Stream(content_stream));
3183
3184 let resources = dictionary! {
3185 "XObject" => dictionary! { "Fm0" => Object::Reference(xobj_id) },
3186 };
3187 let resources_id = doc.add_object(Object::Dictionary(resources));
3188
3189 let page_dict = dictionary! {
3190 "Type" => "Page",
3191 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3192 "Contents" => Object::Reference(content_id),
3193 "Resources" => Object::Reference(resources_id),
3194 };
3195 let page_id = doc.add_object(Object::Dictionary(page_dict));
3196
3197 let pages_dict = dictionary! {
3198 "Type" => "Pages",
3199 "Kids" => vec![Object::Reference(page_id)],
3200 "Count" => 1_i64,
3201 };
3202 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3203
3204 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3205 d.set("Parent", Object::Reference(pages_id));
3206 }
3207
3208 let catalog = dictionary! {
3209 "Type" => "Catalog",
3210 "Pages" => Object::Reference(pages_id),
3211 };
3212 let catalog_id = doc.add_object(Object::Dictionary(catalog));
3213 doc.trailer.set("Root", Object::Reference(catalog_id));
3214
3215 doc
3216 }
3217
/// Text drawn by a Form XObject that is invoked inside an `/ActualText` span
/// inherits the surrounding override, as does page text before and after the
/// invocation.
#[test]
fn actual_text_propagates_into_form_xobject_recursion() {
    let pdf = make_doc_with_xobj_inside_bdc(
        b"(pre) Tj",
        b"BT /F1 12 Tf (inside) Tj ET",
        b"(post) Tj",
        "wrapped",
    );
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 3);
    let (before, nested, after) = (&extracted[0], &extracted[1], &extracted[2]);

    assert_eq!(before.text, "pre");
    assert_eq!(before.actual_text.as_deref(), Some("wrapped"));

    assert_eq!(nested.text, "inside");
    assert_eq!(
        nested.actual_text.as_deref(),
        Some("wrapped"),
        "Form XObject text lost surrounding /ActualText (issue #1358)"
    );

    assert_eq!(after.text, "post");
    assert_eq!(after.actual_text.as_deref(), Some("wrapped"));
}
3247
/// An `/ActualText` span opened inside the Form XObject overrides the one
/// inherited from the page; once it closes, the inherited value applies again.
#[test]
fn actual_text_inner_xobj_bdc_overrides_inherited() {
    let pdf = make_doc_with_xobj_inside_bdc(
        b"",
        b"BT /F1 12 Tf \
          /Span <</ActualText (inner)>> BDC (B) Tj EMC \
          (after) Tj ET",
        b"",
        "outer",
    );
    let extracted = extract_text(&pdf);
    let seen: Vec<(&str, Option<&str>)> = extracted
        .iter()
        .map(|block| (block.text.as_str(), block.actual_text.as_deref()))
        .collect();
    assert_eq!(seen, [("B", Some("inner")), ("after", Some("outer"))]);
}
3271
/// An `/ActualText` hex string starting with the `FEFF` byte-order mark is
/// decoded as UTF-16BE (`0066 0069` → "fi").
#[test]
fn actual_text_utf16be_bom_decodes() {
    let pdf = make_doc_with_text(
        b"BT /F1 12 Tf /Span <</ActualText <FEFF00660069> >> BDC (X) Tj EMC ET",
    );
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);
    assert_eq!(extracted[0].actual_text.as_deref(), Some("fi"));
}
3284
/// U+FB00 decomposes to "ff", both alone and embedded in a word.
#[test]
fn ligature_ff_decomposes() {
    assert_eq!(decompose_ligature_char('\u{FB00}'), Some("ff"));
    for (input, expected) in [("\u{FB00}", "ff"), ("o\u{FB00}ice", "office")] {
        assert_eq!(decompose_ligatures(input), expected);
    }
}
3293
/// U+FB01 decomposes to "fi", both alone and embedded in a word.
#[test]
fn ligature_fi_decomposes() {
    assert_eq!(decompose_ligature_char('\u{FB01}'), Some("fi"));
    for (input, expected) in [("\u{FB01}", "fi"), ("of\u{FB01}ce", "office")] {
        assert_eq!(decompose_ligatures(input), expected);
    }
}
3300
/// U+FB02 decomposes to "fl".
#[test]
fn ligature_fl_decomposes() {
    let ligature = '\u{FB02}';
    assert_eq!(decompose_ligature_char(ligature), Some("fl"));

    let word = "\u{FB02}ame";
    assert_eq!(decompose_ligatures(word), "flame");
}
3306
/// U+FB03 decomposes to "ffi".
#[test]
fn ligature_ffi_decomposes() {
    let ligature = '\u{FB03}';
    assert_eq!(decompose_ligature_char(ligature), Some("ffi"));

    let word = "o\u{FB03}ce";
    assert_eq!(decompose_ligatures(word), "office");
}
3312
/// U+FB04 decomposes to "ffl".
#[test]
fn ligature_ffl_decomposes() {
    let ligature = '\u{FB04}';
    assert_eq!(decompose_ligature_char(ligature), Some("ffl"));

    let word = "ba\u{FB04}e";
    assert_eq!(decompose_ligatures(word), "baffle");
}
3318
/// Both U+FB06 and U+FB05 decompose to "st", and the glyph name "st" resolves
/// to U+FB06.
#[test]
fn ligature_st_decomposes() {
    for ligature in ['\u{FB06}', '\u{FB05}'] {
        assert_eq!(decompose_ligature_char(ligature), Some("st"));
    }
    assert_eq!(decompose_ligatures("fa\u{FB06}"), "fast");
    assert_eq!(glyph_name_to_unicode("st"), Some('\u{FB06}'));
}
3331
/// The glyph name "ct" has no single-codepoint mapping, yet a code mapped to
/// it through `Differences` still extracts as the string "ct" (here after an
/// `a`, giving "act").
#[test]
fn ligature_ct_via_glyph_name_emits_string() {
    assert_eq!(glyph_name_to_unicode("ct"), None);

    // 0x61 is `a`; code 0x01 is remapped to the `ct` glyph.
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <6101> Tj ET", "ct");
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);
    assert_eq!(extracted[0].text, "act");
}
3347
/// A ToUnicode mapping to the private-use codepoint U+E007 is passed through
/// unchanged and is not rewritten to "ct".
#[test]
fn tounicode_pua_e007_is_preserved() {
    let pdf = make_doc_with_tounicode_cmap(b"BT /F1 12 Tf <01> Tj ET", 0x01, 0xE007);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let text = &extracted[0].text;
    assert_eq!(*text, "\u{E007}");
    assert_ne!(*text, "ct");
}
3360
/// The two characters emitted for a `ct` glyph share one glyph advance: in
/// `a`, `ct`, `b` the `c` starts one estimated width in and the `b` two.
#[test]
fn ligature_ct_positioned_chars_advance_as_single_glyph() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <610162> Tj ET", "ct");
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();

    let spelled: String = glyphs.iter().map(|g| g.ch).collect();
    assert_eq!(spelled, "actb");
    assert_eq!(glyphs.len(), 4);

    let advance = 12.0 * APPROX_CHAR_WIDTH;
    let c_left = glyphs[1].bbox[0];
    let b_left = glyphs[3].bbox[0];
    assert!((c_left - advance).abs() < 1e-6);
    assert!((b_left - (2.0 * advance)).abs() < 1e-6);
}
3377
3378 fn make_doc_with_ligature_font(content: &[u8], glyph_name: &str) -> Document {
3383 let mut doc = Document::with_version("1.7");
3384
3385 let content_stream = Stream::new(dictionary! {}, content.to_vec());
3386 let content_id = doc.add_object(Object::Stream(content_stream));
3387
3388 let encoding = dictionary! {
3389 "Type" => "Encoding",
3390 "BaseEncoding" => "WinAnsiEncoding",
3391 "Differences" => vec![
3392 1_i64.into(),
3393 Object::Name(glyph_name.as_bytes().to_vec()),
3394 ],
3395 };
3396 let encoding_id = doc.add_object(Object::Dictionary(encoding));
3397
3398 let font = dictionary! {
3399 "Type" => "Font",
3400 "Subtype" => "Type1",
3401 "BaseFont" => "Helvetica",
3402 "Encoding" => Object::Reference(encoding_id),
3403 };
3404 let font_id = doc.add_object(Object::Dictionary(font));
3405
3406 let resources = dictionary! {
3407 "Font" => dictionary! { "F1" => Object::Reference(font_id) },
3408 };
3409 let resources_id = doc.add_object(Object::Dictionary(resources));
3410
3411 let page_dict = dictionary! {
3412 "Type" => "Page",
3413 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3414 "Contents" => Object::Reference(content_id),
3415 "Resources" => Object::Reference(resources_id),
3416 };
3417 let page_id = doc.add_object(Object::Dictionary(page_dict));
3418
3419 let pages_dict = dictionary! {
3420 "Type" => "Pages",
3421 "Kids" => vec![Object::Reference(page_id)],
3422 "Count" => 1_i64,
3423 };
3424 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3425
3426 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3427 d.set("Parent", Object::Reference(pages_id));
3428 }
3429
3430 let catalog = dictionary! {
3431 "Type" => "Catalog",
3432 "Pages" => Object::Reference(pages_id),
3433 };
3434 let catalog_id = doc.add_object(Object::Dictionary(catalog));
3435 doc.trailer.set("Root", Object::Reference(catalog_id));
3436
3437 doc
3438 }
3439
/// `o`, the `ffi` glyph (code 0x01), `c`, `e` extract as the plain word
/// "office".
#[test]
fn ligature_office_golden_ffi() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <6F016365> Tj ET", "ffi");
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);
    assert_eq!(extracted[0].text, "office");
}
3453
3454 fn make_doc_with_tounicode_cmap(content: &[u8], src: u8, dst: u32) -> Document {
3459 let mut doc = Document::with_version("1.7");
3460
3461 let content_stream = Stream::new(dictionary! {}, content.to_vec());
3462 let content_id = doc.add_object(Object::Stream(content_stream));
3463
3464 let cmap_text = format!(
3466 "/CIDInit /ProcSet findresource begin\n\
3467 12 dict begin\n\
3468 begincmap\n\
3469 /CMapType 2 def\n\
3470 1 beginbfchar\n\
3471 <{:02X}> <{:04X}>\n\
3472 endbfchar\n\
3473 endcmap CMapName currentdict /CMap defineresource pop end end",
3474 src, dst
3475 );
3476 let cmap_stream = Stream::new(dictionary! {}, cmap_text.into_bytes());
3477 let cmap_id = doc.add_object(Object::Stream(cmap_stream));
3478
3479 let font = dictionary! {
3480 "Type" => "Font",
3481 "Subtype" => "Type1",
3482 "BaseFont" => "Helvetica",
3483 "Encoding" => "WinAnsiEncoding",
3484 "ToUnicode" => Object::Reference(cmap_id),
3485 };
3486 let font_id = doc.add_object(Object::Dictionary(font));
3487
3488 let resources = dictionary! {
3489 "Font" => dictionary! { "F1" => Object::Reference(font_id) },
3490 };
3491 let resources_id = doc.add_object(Object::Dictionary(resources));
3492
3493 let page_dict = dictionary! {
3494 "Type" => "Page",
3495 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3496 "Contents" => Object::Reference(content_id),
3497 "Resources" => Object::Reference(resources_id),
3498 };
3499 let page_id = doc.add_object(Object::Dictionary(page_dict));
3500
3501 let pages_dict = dictionary! {
3502 "Type" => "Pages",
3503 "Kids" => vec![Object::Reference(page_id)],
3504 "Count" => 1_i64,
3505 };
3506 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3507
3508 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3509 d.set("Parent", Object::Reference(pages_id));
3510 }
3511
3512 let catalog = dictionary! {
3513 "Type" => "Catalog",
3514 "Pages" => Object::Reference(pages_id),
3515 };
3516 let catalog_id = doc.add_object(Object::Dictionary(catalog));
3517 doc.trailer.set("Root", Object::Reference(catalog_id));
3518
3519 doc
3520 }
3521
/// One `ffi` glyph yields three positioned chars whose boxes run left to
/// right without overlapping and together span exactly one estimated glyph
/// width.
#[test]
fn ligature_positioned_chars_split_bbox() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();
    assert_eq!(glyphs.len(), 3, "ffi → 3 chars");

    let (f1, f2, i) = (&glyphs[0], &glyphs[1], &glyphs[2]);
    assert_eq!(f1.ch, 'f');
    assert_eq!(f2.ch, 'f');
    assert_eq!(i.ch, 'i');

    // Each part ends no later than the next one starts.
    assert!(f1.bbox[2] <= f2.bbox[0] + 1e-9);
    assert!(f2.bbox[2] <= i.bbox[0] + 1e-9);

    let span = i.bbox[2] - f1.bbox[0];
    let one_glyph = 12.0 * APPROX_CHAR_WIDTH;
    assert!((span - one_glyph).abs() < 1e-6);
}
3542
/// For the glyph `uniFB13`, `extract_positioned_chars` must emit exactly the
/// characters `extract_text` reports, and they must together span one
/// estimated glyph width.
#[test]
fn positioned_chars_match_extracted_text_for_armenian_ligature() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB13");
    let extracted = extract_text(&pdf);
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();
    assert_eq!(extracted.len(), 1);

    let block_text = &extracted[0].text;
    assert_eq!(
        block_text.chars().count(),
        glyphs.len(),
        "PositionedChar count drifted from extract_text() char count \
         (issue #1359 problem 1): blocks[0].text = {:?}, chars = {:?}",
        block_text,
        glyphs.iter().map(|g| g.ch).collect::<String>(),
    );

    let spelled: String = glyphs.iter().map(|g| g.ch).collect();
    assert_eq!(*block_text, spelled);

    let span = glyphs.last().unwrap().bbox[2] - glyphs.first().unwrap().bbox[0];
    let one_glyph = 12.0 * APPROX_CHAR_WIDTH;
    assert!((span - one_glyph).abs() < 1e-6);
}
3577
/// The glyph `uniFB21` extracts as the single codepoint U+05D0, which is also
/// what `decompose_glyph_to_string` returns for U+FB21.
#[test]
fn hebrew_presentation_form_extracts_single_codepoint_nfkd_mapping() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);
    assert_eq!(extracted[0].text, "\u{05D0}");

    let decomposed = decompose_glyph_to_string('\u{FB21}');
    assert_eq!(decomposed.as_deref(), Some("\u{05D0}"));
}
3593
/// For the glyph `uniFB21`, the positioned-char output spells the same text
/// (U+05D0) and has the same character count as `extract_text`.
#[test]
fn positioned_chars_match_extracted_text_for_hebrew_presentation_form() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
    let extracted = extract_text(&pdf);
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();
    let spelled: String = glyphs.iter().map(|g| g.ch).collect();

    assert_eq!(extracted.len(), 1);
    let block_text = &extracted[0].text;
    assert_eq!(*block_text, "\u{05D0}");
    assert_eq!(spelled, *block_text);
    assert_eq!(block_text.chars().count(), glyphs.len());
}
3609
/// A `Tj` showing one `ffi` glyph reports three characters of text but a
/// bounding box only one estimated glyph wide.
#[test]
fn tj_block_width_matches_rendered_glyph_count() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.text, "ffi");

    let width = block.bbox[2] - block.bbox[0];
    let expected = 12.0 * APPROX_CHAR_WIDTH;
    assert!(
        (width - expected).abs() < 1e-6,
        "Tj bbox width {} != one glyph's char_w {} (issue #1359 problem 2)",
        width,
        expected,
    );
}
3631
/// Showing the same single `ffi` glyph through `Tj` and through a one-element
/// `TJ` array must give bounding boxes of equal width.
#[test]
fn tj_and_tj_array_bbox_widths_agree_for_ligature() {
    let via_tj = extract_text(&make_doc_with_ligature_font(
        b"BT /F1 12 Tf <01> Tj ET",
        "ffi",
    ));
    let via_array = extract_text(&make_doc_with_ligature_font(
        b"BT /F1 12 Tf [<01>] TJ ET",
        "ffi",
    ));
    assert_eq!(via_tj.len(), 1);
    assert_eq!(via_array.len(), 1);

    let tj_w = via_tj[0].bbox[2] - via_tj[0].bbox[0];
    let tj_arr_w = via_array[0].bbox[2] - via_array[0].bbox[0];
    assert!(
        (tj_w - tj_arr_w).abs() < 1e-6,
        "Tj bbox width {} disagrees with TJ bbox width {} for the same \
         single-glyph ligature input (issue #1359 problem 2)",
        tj_w,
        tj_arr_w,
    );
}
3655
3656 fn make_doc_with_font(content: &[u8], base_font: &str, desc_flags: Option<u32>) -> Document {
3661 let mut doc = Document::with_version("1.7");
3662
3663 let mut font_dict = dictionary! {
3664 "Type" => "Font",
3665 "Subtype" => "Type1",
3666 "BaseFont" => Object::Name(base_font.as_bytes().to_vec()),
3667 };
3668 if let Some(flags) = desc_flags {
3669 let descriptor = dictionary! {
3670 "Type" => "FontDescriptor",
3671 "FontName" => Object::Name(base_font.as_bytes().to_vec()),
3672 "Flags" => Object::Integer(flags as i64),
3673 };
3674 let desc_id = doc.add_object(Object::Dictionary(descriptor));
3675 font_dict.set("FontDescriptor", Object::Reference(desc_id));
3676 }
3677 let font_id = doc.add_object(Object::Dictionary(font_dict));
3678
3679 let resources = dictionary! {
3680 "Font" => dictionary! { "F1" => Object::Reference(font_id) },
3681 };
3682 let resources_id = doc.add_object(Object::Dictionary(resources));
3683
3684 let content_stream = Stream::new(dictionary! {}, content.to_vec());
3685 let content_id = doc.add_object(Object::Stream(content_stream));
3686
3687 let page_dict = dictionary! {
3688 "Type" => "Page",
3689 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3690 "Contents" => Object::Reference(content_id),
3691 "Resources" => Object::Reference(resources_id),
3692 };
3693 let page_id = doc.add_object(Object::Dictionary(page_dict));
3694
3695 let pages_dict = dictionary! {
3696 "Type" => "Pages",
3697 "Kids" => vec![Object::Reference(page_id)],
3698 "Count" => 1_i64,
3699 };
3700 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3701
3702 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3703 d.set("Parent", Object::Reference(pages_id));
3704 }
3705
3706 let catalog = dictionary! {
3707 "Type" => "Catalog",
3708 "Pages" => Object::Reference(pages_id),
3709 };
3710 let catalog_id = doc.add_object(Object::Dictionary(catalog));
3711 doc.trailer.set("Root", Object::Reference(catalog_id));
3712
3713 doc
3714 }
3715
/// A `Tj` block reports the font's `BaseFont`; plain "Helvetica" is neither
/// bold nor italic.
#[test]
fn g1_tj_carries_basefont_metadata() {
    let pdf = make_doc_with_font(b"BT /F1 12 Tf (Hello) Tj ET", "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.text, "Hello");
    assert_eq!(block.base_font.as_deref(), Some("Helvetica"));
    assert!(!block.is_bold);
    assert!(!block.is_italic);
}
3726
/// A `TJ` block reports the font's `BaseFont` just like a `Tj` block.
#[test]
fn g1_tj_array_carries_basefont_metadata() {
    let pdf = make_doc_with_font(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET", "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.text, "Hello");
    assert_eq!(block.base_font.as_deref(), Some("Helvetica"));
}
3735
/// "Helvetica-Bold" is reported as bold, not italic, with the name unchanged.
#[test]
fn g1_bold_inferred_from_basefont_name() {
    let pdf = make_doc_with_font(b"BT /F1 12 Tf (Bold) Tj ET", "Helvetica-Bold", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert!(block.is_bold);
    assert!(!block.is_italic);
    assert_eq!(block.base_font.as_deref(), Some("Helvetica-Bold"));
}
3745
/// "Times-Italic" is reported as italic and not bold.
#[test]
fn g1_italic_inferred_from_basefont_name() {
    let pdf = make_doc_with_font(b"BT /F1 12 Tf (Slanted) Tj ET", "Times-Italic", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert!(block.is_italic);
    assert!(!block.is_bold);
}
3754
/// A subset tag such as `ABCDEF+` is removed from the reported base font, and
/// the remaining name still drives bold detection.
#[test]
fn g1_subset_prefix_stripped_from_basefont() {
    let content = b"BT /F1 12 Tf (Subset) Tj ET";
    let pdf = make_doc_with_font(content, "ABCDEF+Helvetica-Bold", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.base_font.as_deref(), Some("Helvetica-Bold"));
    assert!(block.is_bold);
}
3767
/// A font whose name gives no hint ("Custom-Roman") is still reported as
/// italic when its descriptor `Flags` value is `0x40`.
#[test]
fn g1_italic_inferred_from_descriptor_flag() {
    let pdf = make_doc_with_font(b"BT /F1 12 Tf (X) Tj ET", "Custom-Roman", Some(0x40));
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert!(block.is_italic);
    assert_eq!(block.base_font.as_deref(), Some("Custom-Roman"));
}
3778
/// `1 0 0 rg` before the text makes the block's colour opaque red.
#[test]
fn g1_rg_color_propagates_to_block() {
    let pdf = make_doc_with_font(b"1 0 0 rg BT /F1 12 Tf (Red) Tj ET", "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let opaque_red = [255, 0, 0, 255];
    assert_eq!(extracted[0].color, Some(opaque_red));
}
3787
/// `0.5 g` yields an opaque colour with equal channels, each within one step
/// of 128.
#[test]
fn g1_gray_color_via_g_operator() {
    let pdf = make_doc_with_font(b"0.5 g BT /F1 12 Tf (Gray) Tj ET", "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let [r, g, b, a] = extracted[0].color.expect("expected gray color");
    assert_eq!(a, 255);
    assert_eq!(r, g);
    assert_eq!(g, b);
    assert!((i32::from(r) - 128).abs() <= 1);
}
3801
/// With no colour operator in the stream, the block colour is opaque black.
#[test]
fn g1_default_color_is_black_for_compliant_initial_state() {
    let pdf = make_doc_with_font(b"BT /F1 12 Tf (Default) Tj ET", "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let opaque_black = [0, 0, 0, 255];
    assert_eq!(extracted[0].color, Some(opaque_black));
}
3810
/// `1 0 0 0 k` (full cyan) is converted to opaque RGB `(0, 255, 255)`.
#[test]
fn g1_cmyk_color_via_k_operator() {
    let pdf = make_doc_with_font(b"1 0 0 0 k BT /F1 12 Tf (Cyan) Tj ET", "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let rgba = extracted[0].color.expect("k should set color");
    assert_eq!(rgba, [0, 255, 255, 255]);
}
3820
/// The same text shown via `Tj` and via `TJ` carries identical base font,
/// bold/italic flags and colour.
#[test]
fn g1_tj_and_tj_array_emit_same_metadata() {
    let via_tj = extract_text(&make_doc_with_font(
        b"1 0 0 rg BT /F1 12 Tf (Hi) Tj ET",
        "Helvetica-Bold",
        None,
    ));
    let via_array = extract_text(&make_doc_with_font(
        b"1 0 0 rg BT /F1 12 Tf [(Hi)] TJ ET",
        "Helvetica-Bold",
        None,
    ));
    assert_eq!(via_tj.len(), 1);
    assert_eq!(via_array.len(), 1);

    let (lhs, rhs) = (&via_tj[0], &via_array[0]);
    assert_eq!(lhs.base_font, rhs.base_font);
    assert_eq!(lhs.is_bold, rhs.is_bold);
    assert_eq!(lhs.is_italic, rhs.is_italic);
    assert_eq!(lhs.color, rhs.color);
}
3840
/// A fill color set inside a `q` … `Q` pair must not leak out: text drawn after
/// the `Q` is expected to be black again.
#[test]
fn g1_color_restored_across_q_capital_q() {
    // Red is selected between `q` and `Q`; the text object comes after `Q`.
    let stream = b"q 1 0 0 rg Q BT /F1 12 Tf (Outside) Tj ET";
    let pdf = make_doc_with_font(stream, "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.color, Some([0, 0, 0, 255]));
}
3854
/// For a document built by `make_doc_with_text`, the block must report no base
/// font and no bold/italic flags, while still carrying the default color and
/// the resource name given to `Tf`.
#[test]
fn g1_no_font_resource_means_no_base_font() {
    let pdf = make_doc_with_text(b"BT /F1 12 Tf (NoFont) Tj ET");
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.base_font, None);
    assert!(!block.is_bold);
    assert!(!block.is_italic);
    assert_eq!(block.color, Some([0, 0, 0, 255]));
    // The name operand of `Tf` is still recorded.
    assert_eq!(block.font_name, "F1");
}
3870
/// Selecting a named color space with `cs` and painting with `scn` must leave
/// the block's color as `None`.
#[test]
fn g1_unknown_color_space_marks_color_none() {
    // `/Cs1` is not defined in the page resources and `scn` receives five
    // components.
    let stream = b"/Cs1 cs 0.5 0.5 0.5 0.5 0.5 scn BT /F1 12 Tf (X) Tj ET";
    let pdf = make_doc_with_font(stream, "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.color, None);
}
3883
3884 fn make_doc_with_widths(content: &[u8], first_char: i64, widths: Vec<i64>) -> Document {
3888 let mut doc = Document::with_version("1.7");
3889
3890 let widths_obj = Object::Array(widths.iter().map(|w| Object::Integer(*w)).collect());
3891
3892 let font_dict = dictionary! {
3893 "Type" => "Font",
3894 "Subtype" => "Type1",
3895 "BaseFont" => Object::Name(b"TestFont".to_vec()),
3896 "FirstChar" => Object::Integer(first_char),
3897 "LastChar" => Object::Integer(first_char + widths.len() as i64 - 1),
3898 "Widths" => widths_obj,
3899 };
3900 let font_id = doc.add_object(Object::Dictionary(font_dict));
3901
3902 let resources = dictionary! {
3903 "Font" => dictionary! { "F1" => Object::Reference(font_id) },
3904 };
3905 let resources_id = doc.add_object(Object::Dictionary(resources));
3906
3907 let content_stream = Stream::new(dictionary! {}, content.to_vec());
3908 let content_id = doc.add_object(Object::Stream(content_stream));
3909
3910 let page_dict = dictionary! {
3911 "Type" => "Page",
3912 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3913 "Contents" => Object::Reference(content_id),
3914 "Resources" => Object::Reference(resources_id),
3915 };
3916 let page_id = doc.add_object(Object::Dictionary(page_dict));
3917
3918 let pages_dict = dictionary! {
3919 "Type" => "Pages",
3920 "Kids" => vec![Object::Reference(page_id)],
3921 "Count" => 1_i64,
3922 };
3923 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3924
3925 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3926 d.set("Parent", Object::Reference(pages_id));
3927 }
3928
3929 let catalog = dictionary! {
3930 "Type" => "Catalog",
3931 "Pages" => Object::Reference(pages_id),
3932 };
3933 let catalog_id = doc.add_object(Object::Dictionary(catalog));
3934 doc.trailer.set("Root", Object::Reference(catalog_id));
3935
3936 doc
3937 }
3938
/// A block from a document built with `make_doc_with_font` must be tagged
/// `WidthSource::Estimate`.
#[test]
fn g2_no_widths_gives_estimate_source() {
    let pdf = make_doc_with_font(b"BT /F1 10 Tf (A) Tj ET", "Helvetica", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.width_source, WidthSource::Estimate);
}
3947
/// With a `/Widths` entry for the shown glyph, the block must be tagged
/// `WidthSource::Metric` and the glyph's advance must follow the metric.
#[test]
fn g2_widths_array_gives_metric_source() {
    // Code 65 ('A') is given width 722; at 10 pt that is 722 / 1000 * 10 = 7.22.
    let pdf = make_doc_with_widths(b"BT /F1 10 Tf (A) Tj ET", 65, vec![722]);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.width_source, WidthSource::Metric);
    assert_eq!(block.char_bounds.len(), 1);

    let [left, _bottom, right, _top] = block.char_bounds[0];
    let advance = right - left;
    assert!((advance - 7.22).abs() < 0.01, "advance={advance:.4}");
}
3963
/// When every glyph has the same width metric, every per-character box must
/// have the same advance.
#[test]
fn g2_monospace_uniform_char_bounds() {
    // Five glyphs ('A'..='E'), each 600 units wide.
    let pdf = make_doc_with_widths(b"BT /F1 12 Tf (ABCDE) Tj ET", 65, vec![600; 5]);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.width_source, WidthSource::Metric);
    assert_eq!(block.char_bounds.len(), 5);

    // 600 glyph units at 12 pt.
    let expected_adv = 600.0 / 1000.0 * 12.0;
    for (i, bounds) in block.char_bounds.iter().enumerate() {
        let [left, _, right, _] = *bounds;
        let adv = right - left;
        assert!(
            (adv - expected_adv).abs() < 0.01,
            "char {i}: advance={adv:.4} expected={expected_adv:.4}"
        );
    }
}
3984
/// With differing glyph widths and no extra spacing, each character box must
/// start where the previous one ends.
#[test]
fn g2_proportional_char_bounds_are_contiguous() {
    let pdf = make_doc_with_widths(b"BT /F1 10 Tf (ABC) Tj ET", 65, vec![722, 667, 667]);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let bounds = &extracted[0].char_bounds;
    assert_eq!(bounds.len(), 3);

    // Walk adjacent pairs: the right edge of one must meet the left edge of the next.
    for (i, pair) in bounds.windows(2).enumerate() {
        let x1_prev = pair[0][2];
        let x0_next = pair[1][0];
        assert!(
            (x1_prev - x0_next).abs() < 0.001,
            "gap between char {i} and {}: {:.4}",
            i + 1,
            x0_next - x1_prev
        );
    }
}
4006
/// A numeric adjustment inside a `TJ` array must displace the following glyph's
/// box without changing the preceding glyph's own advance.
#[test]
fn g2_tj_kerning_shifts_subsequent_bounds() {
    // `-200` between (A) and (B) at 10 pt: -(-200) / 1000 * 10 = 2.0 units of
    // extra space before B.
    let pdf = make_doc_with_widths(b"BT /F1 10 Tf [(A) -200 (B)] TJ ET", 65, vec![722, 667]);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let bounds = &extracted[0].char_bounds;
    assert_eq!(bounds.len(), 2);
    let [a_left, _, a_right, _] = bounds[0];
    let [b_left, _, _, _] = bounds[1];

    // A keeps its metric advance (722 / 1000 * 10).
    assert!(
        (a_right - a_left - 7.22).abs() < 0.01,
        "A advance={:.4}",
        a_right - a_left
    );
    assert!(b_left > a_right, "B should start after A ends");

    let gap = b_left - a_right;
    assert!(
        (gap - 2.0).abs() < 0.01,
        "kerning gap={gap:.4} expected=2.0"
    );
}
4036
/// A subset-tagged `BaseFont` (`ABCDEF+TestFont`) must not stop the font's
/// `/Widths` array from being used for glyph metrics.
#[test]
fn g2_subset_font_prefix_does_not_affect_widths() {
    let mut doc = make_doc_with_widths(b"BT /F1 10 Tf (A) Tj ET", 65, vec![722]);

    // `make_doc_with_widths` hard-codes BaseFont "TestFont", so without this
    // step no subset prefix would be present and the test would not exercise
    // the scenario in its name. Retag the font dictionary in place.
    let mut retagged = 0;
    for object in doc.objects.values_mut() {
        if let Object::Dictionary(dict) = object {
            if dict.has(b"BaseFont") {
                dict.set("BaseFont", Object::Name(b"ABCDEF+TestFont".to_vec()));
                retagged += 1;
            }
        }
    }
    assert_eq!(retagged, 1, "fixture should contain exactly one font dictionary");

    let blocks = extract_text(&doc);
    assert_eq!(blocks.len(), 1);
    assert_eq!(blocks[0].width_source, WidthSource::Metric);
    assert_eq!(blocks[0].char_bounds.len(), 1);

    // The advance must still come from the metric: 722 / 1000 * 10 = 7.22.
    let [x0, _, x1, _] = blocks[0].char_bounds[0];
    let advance = x1 - x0;
    assert!((advance - 7.22).abs() < 0.01, "advance={advance:.4}");
}
4048
/// `char_bounds` must hold one box per shown glyph.
///
/// NOTE(review): the fixture shows the single byte `A` and passes `None` as the
/// third argument to `make_doc_with_font`, so glyph count and decoded text
/// length are presumably both 1 here — confirm whether a fixture that decodes
/// one glyph to several characters is needed to exercise the case in the name.
#[test]
fn g2_char_bounds_count_equals_glyph_count_not_unicode_len() {
    let pdf = make_doc_with_font(b"BT /F1 12 Tf (A) Tj ET", "TestFont", None);
    let extracted = extract_text(&pdf);
    assert_eq!(extracted.len(), 1);

    let block = &extracted[0];
    assert_eq!(block.char_bounds.len(), 1);
}
4060}