1use crate::error::{ExtractError, Result};
7use lopdf::content::{Content, Operation};
8use lopdf::{Document, Object, ObjectId};
9use std::collections::HashMap;
10use std::sync::OnceLock;
11
/// Approximate width of a single character.
///
/// NOTE(review): the code that consumes this constant is not visible in this
/// part of the file; presumably it is a multiplier applied to the font size
/// when a real glyph width is unavailable — confirm at the use site.
const APPROX_CHAR_WIDTH: f64 = 0.5;
14
/// A piece of text extracted from a page, with its placement and font.
#[derive(Clone, Debug)]
pub struct TextBlock {
    /// The extracted text.
    pub text: String,
    /// Page number the block belongs to, as passed in by the extraction
    /// entry points.
    pub page: u32,
    /// Bounding box as four coordinates.
    // NOTE(review): the coordinate order is set by code outside this view —
    // confirm before relying on it.
    pub bbox: [f64; 4],
    /// Name of the font resource the text was drawn with.
    pub font_name: String,
    /// Font size in effect for the text.
    pub font_size: f64,
    /// `/ActualText` replacement text from enclosing marked content, if any.
    pub actual_text: Option<String>,
}
34
/// A single extracted character together with its position on a page.
#[derive(Clone, Debug)]
pub struct PositionedChar {
    /// The character itself.
    pub ch: char,
    /// Page number the character belongs to.
    pub page: u32,
    /// Bounding box as four coordinates.
    // NOTE(review): the coordinate order is set by code outside this view —
    // confirm before relying on it.
    pub bbox: [f64; 4],
}
45
/// Graphics-state snapshot stored on the `q` / `Q` save stack.
#[derive(Clone, Debug)]
struct GraphicsState {
    /// Current transformation matrix as six coefficients.
    ctm: [f64; 6],
}

/// Mutable interpreter state kept while walking a content stream.
///
/// NOTE(review): the field names appear to follow the PDF text-state
/// parameters (Tm, Tlm, Tc, Tw, Th, TL, Ts); the operators that update them
/// are outside this view — confirm there.
#[derive(Clone, Debug)]
struct TextState {
    /// Text matrix.
    tm: [f64; 6],
    /// Text line matrix.
    tlm: [f64; 6],
    /// Name of the currently selected font resource.
    font_name: String,
    /// Currently selected font size.
    font_size: f64,
    /// Character spacing.
    tc: f64,
    /// Word spacing.
    tw: f64,
    /// Horizontal scaling (defaults to 100).
    th: f64,
    /// Leading.
    tl: f64,
    /// Text rise.
    ts: f64,
    /// Saved graphics states, pushed by `q` and popped by `Q`.
    gs_stack: Vec<GraphicsState>,
    /// Current transformation matrix.
    ctm: [f64; 6],
}

impl Default for TextState {
    /// Starts with identity matrices, a 12-unit font size, 100 horizontal
    /// scaling, and every other numeric parameter at zero.
    fn default() -> Self {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        TextState {
            tm: IDENTITY,
            tlm: IDENTITY,
            ctm: IDENTITY,
            font_name: String::new(),
            font_size: 12.0,
            th: 100.0,
            tc: 0.0,
            tw: 0.0,
            tl: 0.0,
            ts: 0.0,
            gs_stack: Vec::new(),
        }
    }
}
96
/// Decoding information gathered for one font resource of a page.
#[derive(Clone)]
struct FontInfo {
    /// `true` when the font's `/Subtype` is `Type0` (codes are read two bytes
    /// at a time).
    is_cid: bool,
    /// Character code → Unicode text, parsed from the font's `/ToUnicode` CMap.
    to_unicode: HashMap<u32, String>,
    /// Per-byte mapping built from the font's `/Encoding` entry.
    encoding_map: [Option<char>; 256],
    /// Marks byte codes whose glyph name in `/Differences` is `ct`.
    ct_codes: [bool; 256],
}
117
118fn build_font_map(doc: &Document, page_id: ObjectId) -> HashMap<String, FontInfo> {
120 let mut map = HashMap::new();
121
122 let resources = get_page_resources(doc, page_id);
124 let font_dict = match resources.and_then(|res| match res.get(b"Font").ok()? {
125 Object::Dictionary(d) => Some(d.clone()),
126 Object::Reference(r) => match doc.get_object(*r).ok()? {
127 Object::Dictionary(d) => Some(d.clone()),
128 _ => None,
129 },
130 _ => None,
131 }) {
132 Some(d) => d,
133 None => return map,
134 };
135
136 for (name_bytes, value) in font_dict.iter() {
137 let font_name = String::from_utf8_lossy(name_bytes).to_string();
138
139 let font = match value {
141 Object::Reference(r) => match doc.get_object(*r).ok() {
142 Some(Object::Dictionary(d)) => d.clone(),
143 _ => continue,
144 },
145 Object::Dictionary(d) => d.clone(),
146 _ => continue,
147 };
148
149 let subtype = font
150 .get(b"Subtype")
151 .ok()
152 .and_then(|o| match o {
153 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
154 _ => None,
155 })
156 .unwrap_or_default();
157
158 let is_cid = subtype == "Type0";
159
160 let to_unicode = parse_to_unicode_from_font(doc, &font);
162
163 let to_unicode = if to_unicode.is_empty() && is_cid {
165 if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
166 descendants
167 .iter()
168 .find_map(|d| {
169 let desc_dict = match d {
170 Object::Reference(r) => match doc.get_object(*r).ok()? {
171 Object::Dictionary(d) => d,
172 _ => return None,
173 },
174 Object::Dictionary(d) => d,
175 _ => return None,
176 };
177 let tu = parse_to_unicode_from_font(doc, desc_dict);
178 if tu.is_empty() {
179 None
180 } else {
181 Some(tu)
182 }
183 })
184 .unwrap_or_default()
185 } else {
186 HashMap::new()
187 }
188 } else {
189 to_unicode
190 };
191
192 let (encoding_map, ct_codes) = if !is_cid {
194 build_encoding_map(doc, &font)
195 } else {
196 ([None; 256], [false; 256])
197 };
198
199 map.insert(
200 font_name,
201 FontInfo {
202 is_cid,
203 to_unicode,
204 encoding_map,
205 ct_codes,
206 },
207 );
208 }
209
210 map
211}
212
213fn parse_to_unicode_from_font(doc: &Document, font: &lopdf::Dictionary) -> HashMap<u32, String> {
215 let tu_obj = match font.get(b"ToUnicode").ok() {
216 Some(Object::Reference(r)) => doc.get_object(*r).ok(),
217 Some(obj) => Some(obj),
218 None => return HashMap::new(),
219 };
220
221 let stream_bytes = match tu_obj {
222 Some(Object::Stream(ref s)) => s
223 .decompressed_content()
224 .ok()
225 .unwrap_or_else(|| s.content.clone()),
226 _ => return HashMap::new(),
227 };
228
229 parse_to_unicode_cmap(&stream_bytes)
230}
231
/// Largest distance between the first and last code that a single `bfrange`
/// entry is expanded to (i.e. at most 65536 codes). Malformed input such as
/// `<00000000> <FFFFFFFF>` would otherwise insert billions of entries.
const CMAP_MAX_RANGE_SPAN: u32 = 0xFFFF;

/// Third operand of a `bfrange` entry, or one of its two code operands.
enum CmapRangeOperand {
    /// A single hex string such as `<0041>`, stored without whitespace.
    Hex(String),
    /// A bracketed list of hex strings such as `[<0066> <00660069>]`.
    Array(Vec<String>),
}

/// Parses the `bfchar` and `bfrange` sections of a ToUnicode CMap into a
/// code → text map.
///
/// Only text between `beginbfchar`/`endbfchar` and
/// `beginbfrange`/`endbfrange` is interpreted. Everything before the first
/// section keyword (for example the `begincodespacerange <0000> <FFFF>`
/// preamble) is skipped, so it can no longer be mistaken for a mapping.
/// `bfrange` sections are applied after `bfchar` sections and therefore win
/// when both define the same code.
fn parse_to_unicode_cmap(data: &[u8]) -> HashMap<u32, String> {
    let text = String::from_utf8_lossy(data);
    let mut map = HashMap::new();

    // `split` yields the text *before* the first keyword as its first item;
    // `skip(1)` drops that preamble.
    for section in text.split("beginbfchar").skip(1) {
        let body = section.split("endbfchar").next().unwrap_or(section);
        let tokens = extract_hex_tokens(body);
        for pair in tokens.chunks_exact(2) {
            map.insert(parse_hex_u32(&pair[0]), hex_to_unicode_string(&pair[1]));
        }
    }

    for section in text.split("beginbfrange").skip(1) {
        let body = section.split("endbfrange").next().unwrap_or(section);
        let operands = cmap_tokenize_bfrange(body);
        for entry in operands.chunks_exact(3) {
            // Both range bounds must be plain hex strings.
            let (lo, hi) = match (&entry[0], &entry[1]) {
                (CmapRangeOperand::Hex(lo), CmapRangeOperand::Hex(hi)) => {
                    (parse_hex_u32(lo), parse_hex_u32(hi))
                }
                _ => continue,
            };
            cmap_insert_bfrange(&mut map, lo, hi, &entry[2]);
        }
    }

    map
}

/// Splits the body of a `bfrange` section into hex strings and bracketed
/// arrays of hex strings. Whitespace inside `<...>` is dropped; an array that
/// is never closed is discarded.
fn cmap_tokenize_bfrange(section: &str) -> Vec<CmapRangeOperand> {
    let mut operands = Vec::new();
    let mut open_array: Option<Vec<String>> = None;
    let mut chars = section.chars();

    while let Some(ch) = chars.next() {
        match ch {
            '<' => {
                // `take_while` also consumes the closing '>'.
                let hex: String = chars
                    .by_ref()
                    .take_while(|&c| c != '>')
                    .filter(|c| !c.is_whitespace())
                    .collect();
                match open_array.as_mut() {
                    Some(items) => items.push(hex),
                    None => operands.push(CmapRangeOperand::Hex(hex)),
                }
            }
            '[' => open_array = Some(Vec::new()),
            ']' => operands.push(CmapRangeOperand::Array(
                open_array.take().unwrap_or_default(),
            )),
            _ => {}
        }
    }

    operands
}

/// Expands one `bfrange` entry (`lo hi dst`) into `map`.
///
/// * Array destination: the i-th element maps code `lo + i`, up to `hi`.
/// * Hex destination of at most four digits: treated as one code point that
///   is incremented per code.
/// * Longer hex destination: treated as a big-endian byte string whose value
///   is incremented per code, then decoded as UTF-16BE.
/// * Empty hex destination: ignored.
fn cmap_insert_bfrange(
    map: &mut HashMap<u32, String>,
    lo: u32,
    hi: u32,
    dst: &CmapRangeOperand,
) {
    let hi = hi.min(lo.saturating_add(CMAP_MAX_RANGE_SPAN));

    match dst {
        CmapRangeOperand::Array(items) => {
            for (code, item) in (lo..=hi).zip(items) {
                map.insert(code, hex_to_unicode_string(item));
            }
        }
        CmapRangeOperand::Hex(dst_hex) if dst_hex.is_empty() => {}
        CmapRangeOperand::Hex(dst_hex) if dst_hex.len() <= 4 => {
            let start = parse_hex_u32(dst_hex);
            for code in lo..=hi {
                // Values that are not Unicode scalar values map to "".
                let mapped = start
                    .checked_add(code - lo)
                    .and_then(char::from_u32)
                    .map(String::from)
                    .unwrap_or_default();
                map.insert(code, mapped);
            }
        }
        CmapRangeOperand::Hex(dst_hex) => {
            let base = cmap_hex_to_bytes(dst_hex);
            for code in lo..=hi {
                let mut bytes = base.clone();
                cmap_add_big_endian(&mut bytes, code - lo);
                map.insert(code, cmap_bytes_to_unicode(&bytes));
            }
        }
    }
}

/// Adds `offset` to the big-endian integer stored in `bytes`, in place.
/// A carry out of the most significant byte is dropped.
fn cmap_add_big_endian(bytes: &mut [u8], offset: u32) {
    let mut carry = u64::from(offset);
    for byte in bytes.iter_mut().rev() {
        if carry == 0 {
            break;
        }
        let sum = u64::from(*byte) + carry;
        *byte = (sum & 0xFF) as u8;
        carry = sum >> 8;
    }
}

/// Returns the contents of every `<...>` token in `text`, with whitespace
/// inside the brackets removed. A `<` inside an open token restarts it; a
/// token that is never closed is dropped.
fn extract_hex_tokens(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut current: Option<String> = None;

    for ch in text.chars() {
        match ch {
            '<' => current = Some(String::new()),
            '>' => {
                if let Some(token) = current.take() {
                    tokens.push(token);
                }
            }
            c if !c.is_whitespace() => {
                if let Some(token) = current.as_mut() {
                    token.push(c);
                }
            }
            _ => {}
        }
    }

    tokens
}

/// Parses a hexadecimal string as `u32`; anything unparsable (including
/// values that do not fit) yields 0.
fn parse_hex_u32(hex: &str) -> u32 {
    u32::from_str_radix(hex, 16).unwrap_or(0)
}

/// Converts a hex string to bytes, two digits at a time. A trailing single
/// digit forms its own byte; pairs that are not valid hex are skipped.
///
/// Works on raw bytes rather than `str` slices so that non-ASCII input can
/// never cause a char-boundary panic.
fn cmap_hex_to_bytes(hex: &str) -> Vec<u8> {
    hex.as_bytes()
        .chunks(2)
        .filter_map(|pair| {
            let digits = std::str::from_utf8(pair).ok()?;
            u8::from_str_radix(digits, 16).ok()
        })
        .collect()
}

/// Decodes bytes as UTF-16BE when their count is even, as a single Latin-1
/// character when there is exactly one byte, and as "" otherwise.
fn cmap_bytes_to_unicode(bytes: &[u8]) -> String {
    match bytes.len() {
        0 => String::new(),
        1 => char::from(bytes[0]).to_string(),
        len if len % 2 == 0 => {
            let units: Vec<u16> = bytes
                .chunks_exact(2)
                .map(|unit| u16::from_be_bytes([unit[0], unit[1]]))
                .collect();
            String::from_utf16_lossy(&units)
        }
        _ => String::new(),
    }
}

/// Decodes the hex digits of a CMap destination string into text (UTF-16BE
/// for an even number of bytes, Latin-1 for a single byte, "" otherwise).
fn hex_to_unicode_string(hex: &str) -> String {
    cmap_bytes_to_unicode(&cmap_hex_to_bytes(hex))
}
387
388fn get_page_resources(doc: &Document, page_id: ObjectId) -> Option<lopdf::Dictionary> {
390 let page = match doc.get_object(page_id).ok()? {
391 Object::Dictionary(d) => d.clone(),
392 _ => return None,
393 };
394
395 if let Some(res) = resolve_dict(doc, &page, b"Resources") {
397 return Some(res);
398 }
399
400 let mut current = page;
402 for _ in 0..20 {
403 let parent_ref = match current.get(b"Parent").ok()? {
404 Object::Reference(r) => *r,
405 _ => break,
406 };
407 let parent = match doc.get_object(parent_ref).ok()? {
408 Object::Dictionary(d) => d.clone(),
409 _ => break,
410 };
411 if let Some(res) = resolve_dict(doc, &parent, b"Resources") {
412 return Some(res);
413 }
414 current = parent;
415 }
416
417 None
418}
419
420fn resolve_dict(doc: &Document, dict: &lopdf::Dictionary, key: &[u8]) -> Option<lopdf::Dictionary> {
422 match dict.get(key).ok()? {
423 Object::Dictionary(d) => Some(d.clone()),
424 Object::Reference(r) => match doc.get_object(*r).ok()? {
425 Object::Dictionary(d) => Some(d.clone()),
426 _ => None,
427 },
428 _ => None,
429 }
430}
431
432fn build_encoding_map(
439 doc: &Document,
440 font: &lopdf::Dictionary,
441) -> ([Option<char>; 256], [bool; 256]) {
442 let mut table = [None::<char>; 256];
443 let mut ct_codes = [false; 256];
444
445 let encoding = match font.get(b"Encoding").ok() {
446 Some(obj) => obj,
447 None => return (table, ct_codes),
448 };
449
450 match encoding {
451 Object::Name(name) => {
452 let name_str = String::from_utf8_lossy(name);
454 apply_base_encoding(&mut table, &name_str);
455 }
456 Object::Reference(r) => match doc.get_object(*r) {
457 Ok(Object::Dictionary(enc_dict)) => {
458 parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
459 }
460 Ok(Object::Name(name)) => {
461 let name_str = String::from_utf8_lossy(name);
462 apply_base_encoding(&mut table, &name_str);
463 }
464 _ => {}
465 },
466 Object::Dictionary(enc_dict) => {
467 parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
468 }
469 _ => {}
470 }
471
472 (table, ct_codes)
473}
474
475fn parse_encoding_dict(
477 doc: &Document,
478 enc_dict: &lopdf::Dictionary,
479 table: &mut [Option<char>; 256],
480 ct_codes: &mut [bool; 256],
481) {
482 if let Ok(Object::Name(base)) = enc_dict.get(b"BaseEncoding") {
484 let base_str = String::from_utf8_lossy(base);
485 apply_base_encoding(table, &base_str);
486 }
487
488 let diffs = match enc_dict.get(b"Differences").ok() {
490 Some(Object::Array(arr)) => arr.clone(),
491 Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
492 Some(Object::Array(arr)) => arr.clone(),
493 _ => return,
494 },
495 _ => return,
496 };
497
498 let mut code: Option<u32> = None;
499 for item in &diffs {
500 match item {
501 Object::Integer(n) => {
502 code = Some(*n as u32);
503 }
504 Object::Name(name) => {
505 if let Some(c) = code {
506 if c < 256 {
507 let glyph = String::from_utf8_lossy(name);
508 apply_glyph_name(&glyph, c as usize, table, ct_codes);
509 }
510 code = Some(c + 1);
511 }
512 }
513 Object::Reference(r) => {
514 if let Ok(Object::Name(name)) = doc.get_object(*r) {
516 if let Some(c) = code {
517 if c < 256 {
518 let glyph = String::from_utf8_lossy(name);
519 apply_glyph_name(&glyph, c as usize, table, ct_codes);
520 }
521 code = Some(c + 1);
522 }
523 }
524 }
525 _ => {}
526 }
527 }
528}
529
530fn apply_glyph_name(
536 glyph: &str,
537 code: usize,
538 table: &mut [Option<char>; 256],
539 ct_codes: &mut [bool; 256],
540) {
541 if glyph == "ct" {
542 table[code] = None;
545 ct_codes[code] = true;
546 return;
547 }
548 if let Some(ch) = glyph_name_to_unicode(glyph) {
549 table[code] = Some(ch);
550 ct_codes[code] = false;
551 }
552}
553
554fn apply_base_encoding(table: &mut [Option<char>; 256], name: &str) {
556 let source = match name {
557 "WinAnsiEncoding" => winansi_encoding(),
558 "MacRomanEncoding" => mac_roman_encoding(),
559 _ => return,
560 };
561 for (i, &ch) in source.iter().enumerate() {
562 if ch != '\0' {
563 table[i] = Some(ch);
564 }
565 }
566}
567
/// Returns the lazily built WinAnsi (Windows-1252 style) table.
///
/// Codes without an assignment hold `'\0'`. Printable ASCII, tab, line feed,
/// carriage return and 0xA0..=0xFF map to the code point of the same value;
/// 0x80..=0x9F uses the Windows-1252 assignments where one exists.
fn winansi_encoding() -> &'static [char; 256] {
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        std::array::from_fn(|idx| {
            // `idx` is always below 256, so the cast is lossless.
            let code = idx as u8;
            match code {
                0x09 | 0x0A | 0x0D | 0x20..=0x7E | 0xA0..=0xFF => char::from(code),
                0x80 => '\u{20AC}',
                0x82 => '\u{201A}',
                0x83 => '\u{0192}',
                0x84 => '\u{201E}',
                0x85 => '\u{2026}',
                0x86 => '\u{2020}',
                0x87 => '\u{2021}',
                0x88 => '\u{02C6}',
                0x89 => '\u{2030}',
                0x8A => '\u{0160}',
                0x8B => '\u{2039}',
                0x8C => '\u{0152}',
                0x8E => '\u{017D}',
                0x91 => '\u{2018}',
                0x92 => '\u{2019}',
                0x93 => '\u{201C}',
                0x94 => '\u{201D}',
                0x95 => '\u{2022}',
                0x96 => '\u{2013}',
                0x97 => '\u{2014}',
                0x98 => '\u{02DC}',
                0x99 => '\u{2122}',
                0x9A => '\u{0161}',
                0x9B => '\u{203A}',
                0x9C => '\u{0153}',
                0x9E => '\u{017E}',
                0x9F => '\u{0178}',
                _ => '\0',
            }
        })
    })
}
621
/// Returns the lazily built Mac Roman table.
///
/// Codes without an assignment hold `'\0'`. Printable ASCII, tab, line feed
/// and carriage return map to themselves; 0x80..=0xFF comes from the
/// 128-entry table below.
fn mac_roman_encoding() -> &'static [char; 256] {
    /// Characters for codes 0x80..=0xFF, in code order.
    const UPPER_HALF: [char; 128] = [
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}',
        '\u{00E1}', '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}',
        '\u{00E9}', '\u{00E8}', '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}',
        '\u{00EF}', '\u{00F1}', '\u{00F3}', '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}',
        '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}', '\u{2020}', '\u{00B0}', '\u{00A2}',
        '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}', '\u{00AE}', '\u{00A9}',
        '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}', '\u{221E}',
        '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}',
        '\u{00F8}', '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}',
        '\u{2206}', '\u{00AB}', '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}',
        '\u{00D5}', '\u{0152}', '\u{0153}', '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}',
        '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}', '\u{00FF}', '\u{0178}', '\u{2044}',
        '\u{20AC}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}', '\u{2021}', '\u{00B7}',
        '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}', '\u{00CB}',
        '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}',
        '\u{02DC}', '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}',
        '\u{02DB}', '\u{02C7}',
    ];

    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        std::array::from_fn(|idx| match idx {
            // `idx` is below 0x7F here, so the cast is lossless.
            0x09 | 0x0A | 0x0D | 0x20..=0x7E => char::from(idx as u8),
            0x80..=0xFF => UPPER_HALF[idx - 0x80],
            _ => '\0',
        })
    })
}
662
/// Maps a PostScript glyph name to a Unicode character.
///
/// Tried in order:
/// 1. `uniXXXX` — the four characters after `uni` are parsed as hex; anything
///    after them (for example a `.alt` suffix) is ignored.
/// 2. `uXXXX…` — `u` followed by at least four hex digits and nothing else.
/// 3. The built-in glyph-name table.
/// 4. The extra ligature names `st` and `longst`.
///
/// Substrings are taken with `str::get`, so names that contain multi-byte
/// characters (for example U+FFFD left behind by lossy UTF-8 decoding) are
/// rejected instead of panicking on a char boundary. A `uni`/`u` name whose
/// digits do not form a valid character falls through to the table lookup.
fn glyph_name_to_unicode(name: &str) -> Option<char> {
    if let Some(rest) = name.strip_prefix("uni") {
        let parsed = rest
            .get(..4)
            .and_then(|hex| u32::from_str_radix(hex, 16).ok())
            .and_then(char::from_u32);
        if parsed.is_some() {
            return parsed;
        }
    }

    if let Some(digits) = name.strip_prefix('u') {
        if digits.len() >= 4 && digits.chars().all(|c| c.is_ascii_hexdigit()) {
            let parsed = u32::from_str_radix(digits, 16)
                .ok()
                .and_then(char::from_u32);
            if parsed.is_some() {
                return parsed;
            }
        }
    }

    if let Some(&ch) = agl_table().get(name) {
        return Some(ch);
    }

    match name {
        "st" => Some('\u{FB06}'),
        "longst" => Some('\u{FB05}'),
        _ => None,
    }
}

/// Returns the lazily built glyph-name → character lookup table.
fn agl_table() -> &'static HashMap<&'static str, char> {
    /// Glyph names and the characters they stand for.
    const ENTRIES: &[(&str, char)] = &[
        // ASCII punctuation and digits
        ("space", ' '), ("exclam", '!'), ("quotedbl", '"'), ("numbersign", '#'),
        ("dollar", '$'), ("percent", '%'), ("ampersand", '&'), ("quotesingle", '\''),
        ("parenleft", '('), ("parenright", ')'), ("asterisk", '*'), ("plus", '+'),
        ("comma", ','), ("hyphen", '-'), ("period", '.'), ("slash", '/'),
        ("zero", '0'), ("one", '1'), ("two", '2'), ("three", '3'), ("four", '4'),
        ("five", '5'), ("six", '6'), ("seven", '7'), ("eight", '8'), ("nine", '9'),
        ("colon", ':'), ("semicolon", ';'), ("less", '<'), ("equal", '='),
        ("greater", '>'), ("question", '?'), ("at", '@'),
        // ASCII upper-case letters
        ("A", 'A'), ("B", 'B'), ("C", 'C'), ("D", 'D'), ("E", 'E'), ("F", 'F'),
        ("G", 'G'), ("H", 'H'), ("I", 'I'), ("J", 'J'), ("K", 'K'), ("L", 'L'),
        ("M", 'M'), ("N", 'N'), ("O", 'O'), ("P", 'P'), ("Q", 'Q'), ("R", 'R'),
        ("S", 'S'), ("T", 'T'), ("U", 'U'), ("V", 'V'), ("W", 'W'), ("X", 'X'),
        ("Y", 'Y'), ("Z", 'Z'),
        ("bracketleft", '['), ("backslash", '\\'), ("bracketright", ']'),
        ("asciicircum", '^'), ("underscore", '_'), ("grave", '`'),
        // ASCII lower-case letters
        ("a", 'a'), ("b", 'b'), ("c", 'c'), ("d", 'd'), ("e", 'e'), ("f", 'f'),
        ("g", 'g'), ("h", 'h'), ("i", 'i'), ("j", 'j'), ("k", 'k'), ("l", 'l'),
        ("m", 'm'), ("n", 'n'), ("o", 'o'), ("p", 'p'), ("q", 'q'), ("r", 'r'),
        ("s", 's'), ("t", 't'), ("u", 'u'), ("v", 'v'), ("w", 'w'), ("x", 'x'),
        ("y", 'y'), ("z", 'z'),
        ("braceleft", '{'), ("bar", '|'), ("braceright", '}'), ("asciitilde", '~'),
        // Latin-1 upper-case letters
        ("Agrave", '\u{00C0}'), ("Aacute", '\u{00C1}'), ("Acircumflex", '\u{00C2}'),
        ("Atilde", '\u{00C3}'), ("Adieresis", '\u{00C4}'), ("Aring", '\u{00C5}'),
        ("AE", '\u{00C6}'), ("Ccedilla", '\u{00C7}'), ("Egrave", '\u{00C8}'),
        ("Eacute", '\u{00C9}'), ("Ecircumflex", '\u{00CA}'), ("Edieresis", '\u{00CB}'),
        ("Igrave", '\u{00CC}'), ("Iacute", '\u{00CD}'), ("Icircumflex", '\u{00CE}'),
        ("Idieresis", '\u{00CF}'), ("Eth", '\u{00D0}'), ("Ntilde", '\u{00D1}'),
        ("Ograve", '\u{00D2}'), ("Oacute", '\u{00D3}'), ("Ocircumflex", '\u{00D4}'),
        ("Otilde", '\u{00D5}'), ("Odieresis", '\u{00D6}'), ("Ugrave", '\u{00D9}'),
        ("Uacute", '\u{00DA}'), ("Ucircumflex", '\u{00DB}'), ("Udieresis", '\u{00DC}'),
        ("Yacute", '\u{00DD}'), ("Thorn", '\u{00DE}'), ("germandbls", '\u{00DF}'),
        // Latin-1 lower-case letters
        ("agrave", '\u{00E0}'), ("aacute", '\u{00E1}'), ("acircumflex", '\u{00E2}'),
        ("atilde", '\u{00E3}'), ("adieresis", '\u{00E4}'), ("aring", '\u{00E5}'),
        ("ae", '\u{00E6}'), ("ccedilla", '\u{00E7}'), ("egrave", '\u{00E8}'),
        ("eacute", '\u{00E9}'), ("ecircumflex", '\u{00EA}'), ("edieresis", '\u{00EB}'),
        ("igrave", '\u{00EC}'), ("iacute", '\u{00ED}'), ("icircumflex", '\u{00EE}'),
        ("idieresis", '\u{00EF}'), ("eth", '\u{00F0}'), ("ntilde", '\u{00F1}'),
        ("ograve", '\u{00F2}'), ("oacute", '\u{00F3}'), ("ocircumflex", '\u{00F4}'),
        ("otilde", '\u{00F5}'), ("odieresis", '\u{00F6}'), ("ugrave", '\u{00F9}'),
        ("uacute", '\u{00FA}'), ("ucircumflex", '\u{00FB}'), ("udieresis", '\u{00FC}'),
        ("yacute", '\u{00FD}'), ("thorn", '\u{00FE}'), ("ydieresis", '\u{00FF}'),
        // Ligatures
        ("fi", '\u{FB01}'), ("fl", '\u{FB02}'), ("ff", '\u{FB00}'),
        ("ffi", '\u{FB03}'), ("ffl", '\u{FB04}'),
        // Typographic punctuation and symbols
        ("endash", '\u{2013}'), ("emdash", '\u{2014}'), ("bullet", '\u{2022}'),
        ("ellipsis", '\u{2026}'), ("quoteleft", '\u{2018}'), ("quoteright", '\u{2019}'),
        ("quotedblleft", '\u{201C}'), ("quotedblright", '\u{201D}'),
        ("quotesinglebase", '\u{201A}'), ("quotesinglbase", '\u{201A}'),
        ("quotedblbase", '\u{201E}'), ("dagger", '\u{2020}'), ("daggerdbl", '\u{2021}'),
        ("perthousand", '\u{2030}'), ("guilsinglleft", '\u{2039}'),
        ("guilsinglright", '\u{203A}'), ("guillemotleft", '\u{00AB}'),
        ("guillemotright", '\u{00BB}'), ("trademark", '\u{2122}'),
        ("copyright", '\u{00A9}'), ("registered", '\u{00AE}'), ("degree", '\u{00B0}'),
        ("plusminus", '\u{00B1}'), ("multiply", '\u{00D7}'), ("divide", '\u{00F7}'),
        ("fraction", '\u{2044}'), ("Euro", '\u{20AC}'), ("sterling", '\u{00A3}'),
        ("yen", '\u{00A5}'), ("cent", '\u{00A2}'), ("currency", '\u{00A4}'),
        ("section", '\u{00A7}'), ("paragraph", '\u{00B6}'), ("brokenbar", '\u{00A6}'),
        ("ordfeminine", '\u{00AA}'), ("ordmasculine", '\u{00BA}'),
        ("exclamdown", '\u{00A1}'), ("questiondown", '\u{00BF}'),
        ("logicalnot", '\u{00AC}'), ("mu", '\u{00B5}'),
        // Spacing accents
        ("macron", '\u{00AF}'), ("acute", '\u{00B4}'), ("cedilla", '\u{00B8}'),
        ("dieresis", '\u{00A8}'), ("circumflex", '\u{02C6}'), ("tilde", '\u{02DC}'),
        ("caron", '\u{02C7}'), ("ring", '\u{02DA}'), ("breve", '\u{02D8}'),
        ("dotaccent", '\u{02D9}'), ("hungarumlaut", '\u{02DD}'), ("ogonek", '\u{02DB}'),
        // Spaces, hyphens and dots
        ("nbspace", '\u{00A0}'), ("nonbreakingspace", '\u{00A0}'),
        ("softhyphen", '\u{00AD}'), ("periodcentered", '\u{00B7}'),
        ("middot", '\u{00B7}'),
        // Extended Latin letters
        ("florin", '\u{0192}'), ("OE", '\u{0152}'), ("oe", '\u{0153}'),
        ("Scaron", '\u{0160}'), ("scaron", '\u{0161}'), ("Zcaron", '\u{017D}'),
        ("zcaron", '\u{017E}'), ("Ydieresis", '\u{0178}'), ("Lslash", '\u{0141}'),
        ("lslash", '\u{0142}'), ("Oslash", '\u{00D8}'), ("oslash", '\u{00F8}'),
        ("dotlessi", '\u{0131}'),
        // Superscripts and fractions
        ("onesuperior", '\u{00B9}'), ("twosuperior", '\u{00B2}'),
        ("threesuperior", '\u{00B3}'), ("onequarter", '\u{00BC}'),
        ("onehalf", '\u{00BD}'), ("threequarters", '\u{00BE}'),
        // Mathematical symbols and Greek letters
        ("minus", '\u{2212}'), ("notequal", '\u{2260}'), ("lessequal", '\u{2264}'),
        ("greaterequal", '\u{2265}'), ("infinity", '\u{221E}'),
        ("partialdiff", '\u{2202}'), ("summation", '\u{2211}'), ("product", '\u{220F}'),
        ("integral", '\u{222B}'), ("radical", '\u{221A}'), ("approxequal", '\u{2248}'),
        ("Delta", '\u{0394}'), ("lozenge", '\u{25CA}'), ("pi", '\u{03C0}'),
        ("Omega", '\u{03A9}'),
    ];

    static TABLE: OnceLock<HashMap<&'static str, char>> = OnceLock::new();
    TABLE.get_or_init(|| ENTRIES.iter().copied().collect())
}
963
/// Private-use placeholder character emitted for byte codes whose glyph is
/// named `ct`; later expanded to the text "ct" when the glyph's origin flag
/// is set.
const CT_LIGATURE_MARKER: char = '\u{E007}';
965
/// Decoded text of a PDF string plus, for every character, a flag telling
/// whether it is a placeholder produced by a `ct` glyph.
#[derive(Clone, Default)]
struct DecodedPdfString {
    /// The decoded characters.
    text: String,
    /// One flag per character of `text`, in the same order.
    ct_origins: Vec<bool>,
}

impl DecodedPdfString {
    /// Wraps already-decoded text; no character is flagged.
    fn from_text(text: String) -> Self {
        let glyphs = text.chars().count();
        Self {
            ct_origins: vec![false; glyphs],
            text,
        }
    }

    /// Appends one character together with its flag.
    fn push_char(&mut self, ch: char, is_ct_origin: bool) {
        self.ct_origins.push(is_ct_origin);
        self.text.push(ch);
    }

    /// Appends every character of `s`, unflagged.
    fn push_str(&mut self, s: &str) {
        let added = s.chars().count();
        self.text.push_str(s);
        self.ct_origins.extend(std::iter::repeat(false).take(added));
    }

    /// Appends another decoded string, keeping its flags.
    fn extend(&mut self, other: DecodedPdfString) {
        let DecodedPdfString { text, ct_origins } = other;
        self.text.push_str(&text);
        self.ct_origins.extend(ct_origins);
    }

    /// `true` when no text has been decoded.
    fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Number of recorded flags (one per pushed character).
    fn glyph_count(&self) -> usize {
        self.ct_origins.len()
    }

    /// Iterates over `(character, flag)` pairs.
    fn iter(&self) -> impl Iterator<Item = (char, bool)> + '_ {
        let flags = self.ct_origins.iter().copied();
        self.text.chars().zip(flags)
    }
}
1006
1007fn decode_pdf_string_with_font_marked(
1010 bytes: &[u8],
1011 font_info: Option<&FontInfo>,
1012) -> DecodedPdfString {
1013 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1015 let chars: Vec<u16> = bytes[2..]
1016 .chunks(2)
1017 .filter_map(|chunk| {
1018 if chunk.len() == 2 {
1019 Some(u16::from_be_bytes([chunk[0], chunk[1]]))
1020 } else {
1021 None
1022 }
1023 })
1024 .collect();
1025 return DecodedPdfString::from_text(String::from_utf16_lossy(&chars));
1026 }
1027
1028 if let Some(info) = font_info {
1029 if info.is_cid && !info.to_unicode.is_empty() {
1030 let mut result = DecodedPdfString::default();
1032 let mut i = 0;
1033 while i + 1 < bytes.len() {
1034 let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
1035 if let Some(s) = info.to_unicode.get(&code) {
1036 result.push_str(s);
1037 } else {
1038 if let Some(ch) = char::from_u32(code) {
1040 if !ch.is_control() || ch == ' ' || ch == '\t' || ch == '\n' {
1041 result.push_char(ch, false);
1042 }
1043 }
1044 }
1045 i += 2;
1046 }
1047 return result;
1048 }
1049
1050 if !info.is_cid && !info.to_unicode.is_empty() {
1051 let mut result = DecodedPdfString::default();
1053 for &b in bytes {
1054 if let Some(s) = info.to_unicode.get(&(b as u32)) {
1055 result.push_str(s);
1056 } else if info.ct_codes[b as usize] {
1057 result.push_char(CT_LIGATURE_MARKER, true);
1058 } else if let Some(ch) = info.encoding_map[b as usize] {
1059 result.push_char(ch, false);
1060 } else {
1061 let ch = b as char;
1062 if is_printable_or_space(ch) {
1063 result.push_char(ch, false);
1064 }
1065 }
1066 }
1067 return result;
1068 }
1069
1070 if !info.is_cid
1072 && (info.encoding_map.iter().any(|c| c.is_some()) || info.ct_codes.iter().any(|f| *f))
1073 {
1074 let mut result = DecodedPdfString::default();
1075 for &b in bytes {
1076 if info.ct_codes[b as usize] {
1077 result.push_char(CT_LIGATURE_MARKER, true);
1078 } else if let Some(ch) = info.encoding_map[b as usize] {
1079 result.push_char(ch, false);
1080 } else {
1081 let ch = b as char;
1082 if is_printable_or_space(ch) {
1083 result.push_char(ch, false);
1084 }
1085 }
1086 }
1087 return result;
1088 }
1089 }
1090
1091 let mut result = DecodedPdfString::default();
1093 for &b in bytes {
1094 let ch = b as char;
1095 if is_printable_or_space(ch) {
1096 result.push_char(ch, false);
1097 }
1098 }
1099 result
1100}
1101
/// Returns `true` for tab, line feed, carriage return and every character at
/// or above U+0020; `false` for all other characters below U+0020.
fn is_printable_or_space(ch: char) -> bool {
    matches!(ch, '\t' | '\n' | '\r') || ch >= ' '
}
1108
/// Compile-time switch: when `true`, ligature characters in decoded text are
/// expanded into their component letters; when `false`, text is returned
/// unchanged.
const LIGATURE_DECOMP: bool = true;
1117
/// Returns the plain-letter expansion of a Latin ligature character
/// (U+FB00..=U+FB06), or `None` for any other character. Both U+FB05 and
/// U+FB06 expand to "st".
fn decompose_ligature_char(c: char) -> Option<&'static str> {
    const EXPANSIONS: [(char, &str); 7] = [
        ('\u{FB00}', "ff"),
        ('\u{FB01}', "fi"),
        ('\u{FB02}', "fl"),
        ('\u{FB03}', "ffi"),
        ('\u{FB04}', "ffl"),
        ('\u{FB05}', "st"),
        ('\u{FB06}', "st"),
    ];

    EXPANSIONS
        .iter()
        .find(|&&(ligature, _)| ligature == c)
        .map(|&(_, expansion)| expansion)
}
1139
1140fn decompose_ligature_char_with_origin(c: char, is_ct_origin: bool) -> Option<&'static str> {
1141 if is_ct_origin && c == CT_LIGATURE_MARKER {
1142 Some("ct")
1143 } else {
1144 decompose_ligature_char(c)
1145 }
1146}
1147
1148fn decompose_glyph_to_string(c: char) -> Option<String> {
1156 use unicode_normalization::UnicodeNormalization;
1157 if let Some(s) = decompose_ligature_char(c) {
1158 return Some(s.to_string());
1159 }
1160 if matches!(c, '\u{FB00}'..='\u{FB4F}') {
1161 let nfkd: String = c.nfkd().collect();
1162 if nfkd != c.to_string() {
1167 return Some(nfkd);
1168 }
1169 }
1170 None
1171}
1172
1173fn decompose_ligatures(s: &str) -> String {
1180 let mut out = String::with_capacity(s.len());
1181 for c in s.chars() {
1182 if let Some(replacement) = decompose_glyph_to_string(c) {
1183 out.push_str(&replacement);
1184 } else {
1185 out.push(c);
1186 }
1187 }
1188 out
1189}
1190
1191fn decompose_decoded_ligatures(s: &DecodedPdfString) -> String {
1192 let mut out = String::with_capacity(s.text.len());
1193 for (c, is_ct_origin) in s.iter() {
1194 if let Some(replacement) = decompose_ligature_char_with_origin(c, is_ct_origin) {
1195 out.push_str(replacement);
1196 } else if let Some(replacement) = decompose_glyph_to_string(c) {
1197 out.push_str(&replacement);
1198 } else {
1199 out.push(c);
1200 }
1201 }
1202 out
1203}
1204
1205fn maybe_decompose_decoded(s: &DecodedPdfString) -> String {
1209 if LIGATURE_DECOMP {
1210 if s.ct_origins.iter().any(|origin| *origin) {
1211 decompose_decoded_ligatures(s)
1212 } else {
1213 decompose_ligatures(&s.text)
1214 }
1215 } else {
1216 s.text.clone()
1217 }
1218}
1219
1220pub fn extract_page_blocks(doc: &Document, page_num: u32) -> Vec<TextBlock> {
1222 let pages = doc.get_pages();
1223 let Some(&page_id) = pages.get(&page_num) else {
1224 return Vec::new();
1225 };
1226
1227 let font_map = build_font_map(doc, page_id);
1228 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1229 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1230 if let Ok(content) = Content::decode(&content_bytes) {
1231 return extract_blocks_from_ops_inner(
1232 &content.operations,
1233 page_num,
1234 &font_map,
1235 Some((doc, &resources)),
1236 0,
1237 None,
1238 );
1239 }
1240 }
1241
1242 Vec::new()
1243}
1244
1245pub fn extract_blocks_from_page_id(
1251 doc: &Document,
1252 page_id: lopdf::ObjectId,
1253 page_num: u32,
1254) -> Vec<TextBlock> {
1255 let font_map = build_font_map(doc, page_id);
1256 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1257 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1258 if let Ok(content) = Content::decode(&content_bytes) {
1259 return extract_blocks_from_ops_inner(
1260 &content.operations,
1261 page_num,
1262 &font_map,
1263 Some((doc, &resources)),
1264 0,
1265 None,
1266 );
1267 }
1268 }
1269 Vec::new()
1270}
1271
1272pub fn extract_text(doc: &Document) -> Vec<TextBlock> {
1274 let pages = doc.get_pages();
1275 let mut blocks = Vec::new();
1276
1277 for (&page_num, &page_id) in &pages {
1278 let font_map = build_font_map(doc, page_id);
1279 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1280 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1281 if let Ok(content) = Content::decode(&content_bytes) {
1282 let page_blocks = extract_blocks_from_ops_inner(
1283 &content.operations,
1284 page_num,
1285 &font_map,
1286 Some((doc, &resources)),
1287 0,
1288 None,
1289 );
1290 blocks.extend(page_blocks);
1291 }
1292 }
1293 }
1294
1295 blocks
1296}
1297
1298pub fn extract_page_text(doc: &Document, page_num: u32) -> Result<String> {
1300 let pages = doc.get_pages();
1301 let total = pages.len() as u32;
1302
1303 if page_num == 0 || page_num > total {
1304 return Err(ExtractError::PageOutOfRange(page_num, total));
1305 }
1306
1307 let page_id = *pages
1308 .get(&page_num)
1309 .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1310
1311 let font_map = build_font_map(doc, page_id);
1312 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1313 let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1314 let content = match Content::decode(&content_bytes) {
1315 Ok(c) => c,
1316 Err(_) => return Ok(String::new()),
1317 };
1318
1319 let blocks = extract_blocks_from_ops_inner(
1320 &content.operations,
1321 page_num,
1322 &font_map,
1323 Some((doc, &resources)),
1324 0,
1325 None,
1326 );
1327 let text = blocks
1328 .iter()
1329 .map(|b| b.text.as_str())
1330 .collect::<Vec<_>>()
1331 .join("");
1332
1333 Ok(text)
1334}
1335
1336pub fn extract_positioned_chars(doc: &Document, page_num: u32) -> Result<Vec<PositionedChar>> {
1338 let pages = doc.get_pages();
1339 let total = pages.len() as u32;
1340
1341 if page_num == 0 || page_num > total {
1342 return Err(ExtractError::PageOutOfRange(page_num, total));
1343 }
1344
1345 let page_id = *pages
1346 .get(&page_num)
1347 .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1348
1349 let font_map = build_font_map(doc, page_id);
1350 let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1351 let content = match Content::decode(&content_bytes) {
1352 Ok(c) => c,
1353 Err(_) => return Ok(Vec::new()),
1354 };
1355
1356 let chars = extract_chars_from_ops(&content.operations, page_num, &font_map);
1357 Ok(chars)
1358}
1359
1360fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> std::result::Result<Vec<u8>, ()> {
1362 doc.get_page_content(page_id).map_err(|_| ())
1363}
1364
/// One entry of the marked-content stack kept while walking a content stream.
#[derive(Clone, Debug, Default)]
struct MarkedContentEntry {
    /// `/ActualText` attached to this marked-content sequence, if any.
    actual_text: Option<String>,
}
1375
1376fn resolve_bdc_properties(
1380 doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1381 operand: &Object,
1382) -> Option<lopdf::Dictionary> {
1383 match operand {
1384 Object::Dictionary(d) => Some(d.clone()),
1385 Object::Reference(r) => doc_and_resources.and_then(|(doc, _)| {
1386 doc.get_object(*r).ok().and_then(|o| match o {
1387 Object::Dictionary(d) => Some(d.clone()),
1388 _ => None,
1389 })
1390 }),
1391 Object::Name(n) => {
1392 let (doc, resources) = doc_and_resources?;
1393 let props = resolve_dict(doc, resources, b"Properties")?;
1394 match props.get(n.as_slice()).ok()? {
1395 Object::Dictionary(d) => Some(d.clone()),
1396 Object::Reference(r) => match doc.get_object(*r).ok()? {
1397 Object::Dictionary(d) => Some(d.clone()),
1398 _ => None,
1399 },
1400 _ => None,
1401 }
1402 }
1403 _ => None,
1404 }
1405}
1406
1407fn decode_pdf_text_string(bytes: &[u8]) -> String {
1411 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1412 let u16s: Vec<u16> = bytes[2..]
1413 .chunks_exact(2)
1414 .map(|c| u16::from_be_bytes([c[0], c[1]]))
1415 .collect();
1416 return String::from_utf16_lossy(&u16s);
1417 }
1418 if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
1419 return String::from_utf8_lossy(&bytes[3..]).into_owned();
1420 }
1421 bytes
1422 .iter()
1423 .filter_map(|&b| {
1424 let ch = b as char;
1425 if is_printable_or_space(ch) {
1426 Some(ch)
1427 } else {
1428 None
1429 }
1430 })
1431 .collect()
1432}
1433
1434fn extract_actual_text(props: &lopdf::Dictionary) -> Option<String> {
1436 let obj = props.get(b"ActualText").ok()?;
1437 match obj {
1438 Object::String(bytes, _) => Some(decode_pdf_text_string(bytes)),
1439 _ => None,
1440 }
1441}
1442
1443fn current_actual_text(stack: &[MarkedContentEntry], inherited: Option<&str>) -> Option<String> {
1449 stack
1450 .iter()
1451 .rev()
1452 .find_map(|e| e.actual_text.clone())
1453 .or_else(|| inherited.map(|s| s.to_string()))
1454}
1455
1456fn extract_blocks_from_ops_inner(
1462 ops: &[Operation],
1463 page: u32,
1464 font_map: &HashMap<String, FontInfo>,
1465 doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1466 depth: u32,
1467 inherited_actual_text: Option<&str>,
1468) -> Vec<TextBlock> {
1469 let mut state = TextState::default();
1470 let mut blocks = Vec::new();
1471 let mut mc_stack: Vec<MarkedContentEntry> = Vec::new();
1472
1473 for op in ops {
1474 match op.operator.as_str() {
1475 "q" => {
1476 state.gs_stack.push(GraphicsState { ctm: state.ctm });
1477 }
1478 "Q" => {
1479 if let Some(gs) = state.gs_stack.pop() {
1480 state.ctm = gs.ctm;
1481 }
1482 }
1483 "cm" => {
1484 if let Some(m) = extract_matrix(&op.operands) {
1485 state.ctm = multiply_matrix(&state.ctm, &m);
1486 }
1487 }
1488 "BT" => {
1489 state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1490 state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1491 }
1492 "Tf" => {
1493 if op.operands.len() >= 2 {
1494 if let Object::Name(ref name) = op.operands[0] {
1495 state.font_name = String::from_utf8_lossy(name).to_string();
1496 }
1497 if let Some(size) = as_number(&op.operands[1]) {
1498 state.font_size = size;
1499 }
1500 }
1501 }
1502 "Tc" => {
1503 if let Some(v) = op.operands.first().and_then(as_number) {
1504 state.tc = v;
1505 }
1506 }
1507 "Tw" => {
1508 if let Some(v) = op.operands.first().and_then(as_number) {
1509 state.tw = v;
1510 }
1511 }
1512 "Tz" => {
1513 if let Some(v) = op.operands.first().and_then(as_number) {
1514 state.th = v;
1515 }
1516 }
1517 "TL" => {
1518 if let Some(v) = op.operands.first().and_then(as_number) {
1519 state.tl = v;
1520 }
1521 }
1522 "Ts" => {
1523 if let Some(v) = op.operands.first().and_then(as_number) {
1524 state.ts = v;
1525 }
1526 }
1527 "Td" => {
1528 if op.operands.len() >= 2 {
1529 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
1530 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
1531 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
1532 state.tlm = new_tlm;
1533 state.tm = new_tlm;
1534 }
1535 }
1536 "TD" => {
1537 if op.operands.len() >= 2 {
1538 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
1539 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
1540 state.tl = -ty;
1541 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
1542 state.tlm = new_tlm;
1543 state.tm = new_tlm;
1544 }
1545 }
1546 "Tm" => {
1547 if let Some(m) = extract_matrix(&op.operands) {
1548 state.tm = m;
1549 state.tlm = m;
1550 }
1551 }
1552 "T*" => {
1553 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1554 state.tlm = new_tlm;
1555 state.tm = new_tlm;
1556 }
1557 "Tj" => {
1558 let fi = font_map.get(&state.font_name);
1559 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
1560 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1561
1562 if !text.is_empty() {
1566 let x = state.tm[4];
1567 let y = state.tm[5];
1568 let text_width = text.glyph_count() as f64 * char_w;
1572 let display_text = maybe_decompose_decoded(&text);
1573 blocks.push(TextBlock {
1574 text: display_text,
1575 page,
1576 bbox: [x, y, x + text_width, y + state.font_size],
1577 font_name: state.font_name.clone(),
1578 font_size: state.font_size,
1579 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1580 });
1581 }
1582
1583 for _ in text.iter() {
1585 state.tm[4] += char_w + state.tc;
1586 }
1587 }
1588 }
1589 "TJ" => {
1590 if let Some(Object::Array(ref arr)) = op.operands.first() {
1591 let x_start = state.tm[4];
1592 let y = state.tm[5];
1593 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1594 let mut combined_text = DecodedPdfString::default();
1595 let fi = font_map.get(&state.font_name);
1596
1597 for item in arr {
1598 match item {
1599 Object::String(bytes, _) => {
1600 let text = decode_pdf_string_with_font_marked(bytes, fi);
1601 for _ in text.iter() {
1602 state.tm[4] += char_w + state.tc;
1603 }
1604 combined_text.extend(text);
1605 }
1606 _ => {
1607 if let Some(adj) = as_number(item) {
1608 state.tm[4] -= adj / 1000.0 * state.font_size;
1610 }
1611 }
1612 }
1613 }
1614
1615 if !combined_text.is_empty() {
1616 let x_end = state.tm[4];
1617 blocks.push(TextBlock {
1618 text: maybe_decompose_decoded(&combined_text),
1619 page,
1620 bbox: [x_start, y, x_end, y + state.font_size],
1621 font_name: state.font_name.clone(),
1622 font_size: state.font_size,
1623 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1624 });
1625 }
1626 }
1627 }
1628 "'" => {
1629 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1631 state.tlm = new_tlm;
1632 state.tm = new_tlm;
1633
1634 let fi = font_map.get(&state.font_name);
1635 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
1636 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1637
1638 if !text.is_empty() {
1639 let x = state.tm[4];
1640 let y = state.tm[5];
1641 let text_width = text.glyph_count() as f64 * char_w;
1644 let display_text = maybe_decompose_decoded(&text);
1645 blocks.push(TextBlock {
1646 text: display_text,
1647 page,
1648 bbox: [x, y, x + text_width, y + state.font_size],
1649 font_name: state.font_name.clone(),
1650 font_size: state.font_size,
1651 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1652 });
1653 }
1654
1655 for _ in text.iter() {
1656 state.tm[4] += char_w + state.tc;
1657 }
1658 }
1659 }
1660 "\"" => {
1661 if op.operands.len() >= 3 {
1663 if let Some(tw) = as_number(&op.operands[0]) {
1664 state.tw = tw;
1665 }
1666 if let Some(tc) = as_number(&op.operands[1]) {
1667 state.tc = tc;
1668 }
1669
1670 let new_tlm =
1671 multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1672 state.tlm = new_tlm;
1673 state.tm = new_tlm;
1674
1675 let fi = font_map.get(&state.font_name);
1676 if let Some(text) =
1677 extract_decoded_string_operand_with_font(&op.operands[2..], fi)
1678 {
1679 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1680
1681 if !text.is_empty() {
1682 let x = state.tm[4];
1683 let y = state.tm[5];
1684 let text_width = text.glyph_count() as f64 * char_w;
1687 let display_text = maybe_decompose_decoded(&text);
1688 blocks.push(TextBlock {
1689 text: display_text,
1690 page,
1691 bbox: [x, y, x + text_width, y + state.font_size],
1692 font_name: state.font_name.clone(),
1693 font_size: state.font_size,
1694 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1695 });
1696 }
1697
1698 for _ in text.iter() {
1699 state.tm[4] += char_w + state.tc;
1700 }
1701 }
1702 }
1703 }
1704 "BMC" => {
1705 mc_stack.push(MarkedContentEntry::default());
1709 }
1710 "BDC" => {
1711 let actual_text = op
1715 .operands
1716 .get(1)
1717 .and_then(|o| resolve_bdc_properties(doc_and_resources, o))
1718 .as_ref()
1719 .and_then(extract_actual_text);
1720 mc_stack.push(MarkedContentEntry { actual_text });
1721 }
1722 "EMC" => {
1723 mc_stack.pop();
1726 }
1727 "Do" => {
1728 if depth < 5 {
1733 if let Some((doc, resources)) = doc_and_resources {
1734 if let Some(Object::Name(ref xobj_name)) = op.operands.first() {
1735 let xobj_name_str = String::from_utf8_lossy(xobj_name);
1736 let inherited_for_child =
1737 current_actual_text(&mc_stack, inherited_actual_text);
1738 if let Some(xobj_blocks) = extract_form_xobject_text(
1739 doc,
1740 resources,
1741 &xobj_name_str,
1742 page,
1743 font_map,
1744 depth,
1745 inherited_for_child.as_deref(),
1746 ) {
1747 blocks.extend(xobj_blocks);
1748 }
1749 }
1750 }
1751 }
1752 }
1753 _ => {}
1754 }
1755 }
1756
1757 blocks
1758}
1759
1760fn extract_form_xobject_text(
1767 doc: &Document,
1768 resources: &lopdf::Dictionary,
1769 name: &str,
1770 page: u32,
1771 font_map: &HashMap<String, FontInfo>,
1772 depth: u32,
1773 inherited_actual_text: Option<&str>,
1774) -> Option<Vec<TextBlock>> {
1775 let xobj_dict = match resources.get(b"XObject").ok()? {
1777 Object::Dictionary(d) => d.clone(),
1778 Object::Reference(r) => match doc.get_object(*r).ok()? {
1779 Object::Dictionary(d) => d.clone(),
1780 _ => return None,
1781 },
1782 _ => return None,
1783 };
1784
1785 let xobj_ref = match xobj_dict.get(name.as_bytes()).ok()? {
1786 Object::Reference(r) => *r,
1787 _ => return None,
1788 };
1789
1790 let stream = match doc.get_object(xobj_ref).ok()? {
1791 Object::Stream(s) => s.clone(),
1792 _ => return None,
1793 };
1794
1795 let subtype = stream
1797 .dict
1798 .get(b"Subtype")
1799 .ok()
1800 .and_then(|o| match o {
1801 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
1802 _ => None,
1803 })
1804 .unwrap_or_default();
1805 if subtype != "Form" {
1806 return None;
1807 }
1808
1809 let content_bytes = stream
1811 .decompressed_content()
1812 .ok()
1813 .unwrap_or_else(|| stream.content.clone());
1814 let content = Content::decode(&content_bytes).ok()?;
1815
1816 let mut xobj_font_map = font_map.clone();
1819 if let Some(xobj_resources) = resolve_dict(doc, &stream.dict, b"Resources") {
1820 if let Some(xobj_fonts) = resolve_dict(doc, &xobj_resources, b"Font") {
1821 for (name_bytes, value) in xobj_fonts.iter() {
1822 let fname = String::from_utf8_lossy(name_bytes).to_string();
1823 if let Some(fi) = build_font_info_from_value(doc, value) {
1824 xobj_font_map.insert(fname, fi);
1825 }
1826 }
1827 }
1828 }
1829
1830 let xobj_resources =
1832 resolve_dict(doc, &stream.dict, b"Resources").unwrap_or_else(|| resources.clone());
1833
1834 Some(extract_blocks_from_ops_inner(
1835 &content.operations,
1836 page,
1837 &xobj_font_map,
1838 Some((doc, &xobj_resources)),
1839 depth + 1,
1840 inherited_actual_text,
1841 ))
1842}
1843
1844fn build_font_info_from_value(doc: &Document, value: &Object) -> Option<FontInfo> {
1846 let font = match value {
1847 Object::Reference(r) => match doc.get_object(*r).ok()? {
1848 Object::Dictionary(d) => d.clone(),
1849 _ => return None,
1850 },
1851 Object::Dictionary(d) => d.clone(),
1852 _ => return None,
1853 };
1854
1855 let subtype = font
1856 .get(b"Subtype")
1857 .ok()
1858 .and_then(|o| match o {
1859 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
1860 _ => None,
1861 })
1862 .unwrap_or_default();
1863
1864 let is_cid = subtype == "Type0";
1865 let mut to_unicode = parse_to_unicode_from_font(doc, &font);
1866
1867 if to_unicode.is_empty() && is_cid {
1868 if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
1869 for d in descendants {
1870 let desc_dict = match d {
1871 Object::Reference(r) => {
1872 let Some(Object::Dictionary(d)) = doc.get_object(*r).ok() else {
1875 continue;
1876 };
1877 d
1878 }
1879 Object::Dictionary(d) => d,
1880 _ => continue,
1881 };
1882 let tu = parse_to_unicode_from_font(doc, desc_dict);
1883 if !tu.is_empty() {
1884 to_unicode = tu;
1885 break;
1886 }
1887 }
1888 }
1889 }
1890
1891 let (encoding_map, ct_codes) = if !is_cid {
1892 build_encoding_map(doc, &font)
1893 } else {
1894 ([None; 256], [false; 256])
1895 };
1896
1897 Some(FontInfo {
1898 is_cid,
1899 to_unicode,
1900 encoding_map,
1901 ct_codes,
1902 })
1903}
1904
1905fn push_glyph_positioned(
1919 chars: &mut Vec<PositionedChar>,
1920 state: &mut TextState,
1921 page: u32,
1922 glyph: char,
1923 is_ct_origin: bool,
1924 char_w: f64,
1925) {
1926 let (gx, gy) = apply_ctm(state);
1927 let constituents: Option<String> = if LIGATURE_DECOMP {
1928 if let Some(s) = decompose_ligature_char_with_origin(glyph, is_ct_origin) {
1932 Some(s.to_string())
1933 } else {
1934 decompose_glyph_to_string(glyph)
1935 }
1936 } else {
1937 None
1938 };
1939 match constituents {
1940 None => {
1941 chars.push(PositionedChar {
1942 ch: glyph,
1943 page,
1944 bbox: [gx, gy, gx + char_w, gy + state.font_size],
1945 });
1946 }
1947 Some(s) => {
1948 let n = s.chars().count() as f64;
1949 let part_w = char_w / n;
1950 for (i, c) in s.chars().enumerate() {
1951 let x = gx + part_w * i as f64;
1952 chars.push(PositionedChar {
1953 ch: c,
1954 page,
1955 bbox: [x, gy, x + part_w, gy + state.font_size],
1956 });
1957 }
1958 }
1959 }
1960 state.tm[4] += char_w + state.tc;
1961}
1962
1963fn extract_chars_from_ops(
1965 ops: &[Operation],
1966 page: u32,
1967 font_map: &HashMap<String, FontInfo>,
1968) -> Vec<PositionedChar> {
1969 let mut state = TextState::default();
1970 let mut chars = Vec::new();
1971
1972 for op in ops {
1973 match op.operator.as_str() {
1974 "q" => {
1975 state.gs_stack.push(GraphicsState { ctm: state.ctm });
1976 }
1977 "Q" => {
1978 if let Some(gs) = state.gs_stack.pop() {
1979 state.ctm = gs.ctm;
1980 }
1981 }
1982 "cm" => {
1983 if let Some(m) = extract_matrix(&op.operands) {
1984 state.ctm = multiply_matrix(&state.ctm, &m);
1985 }
1986 }
1987 "BT" => {
1988 state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1989 state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1990 }
1991 "Tf" => {
1992 if op.operands.len() >= 2 {
1993 if let Object::Name(ref name) = op.operands[0] {
1994 state.font_name = String::from_utf8_lossy(name).to_string();
1995 }
1996 if let Some(size) = as_number(&op.operands[1]) {
1997 state.font_size = size;
1998 }
1999 }
2000 }
2001 "Tc" => {
2002 if let Some(v) = op.operands.first().and_then(as_number) {
2003 state.tc = v;
2004 }
2005 }
2006 "Tw" => {
2007 if let Some(v) = op.operands.first().and_then(as_number) {
2008 state.tw = v;
2009 }
2010 }
2011 "Tz" => {
2012 if let Some(v) = op.operands.first().and_then(as_number) {
2013 state.th = v;
2014 }
2015 }
2016 "TL" => {
2017 if let Some(v) = op.operands.first().and_then(as_number) {
2018 state.tl = v;
2019 }
2020 }
2021 "Ts" => {
2022 if let Some(v) = op.operands.first().and_then(as_number) {
2023 state.ts = v;
2024 }
2025 }
2026 "Td" => {
2027 if op.operands.len() >= 2 {
2028 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2029 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2030 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2031 state.tlm = new_tlm;
2032 state.tm = new_tlm;
2033 }
2034 }
2035 "TD" => {
2036 if op.operands.len() >= 2 {
2037 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2038 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2039 state.tl = -ty;
2040 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2041 state.tlm = new_tlm;
2042 state.tm = new_tlm;
2043 }
2044 }
2045 "Tm" => {
2046 if let Some(m) = extract_matrix(&op.operands) {
2047 state.tm = m;
2048 state.tlm = m;
2049 }
2050 }
2051 "T*" => {
2052 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2053 state.tlm = new_tlm;
2054 state.tm = new_tlm;
2055 }
2056 "Tj" => {
2057 let fi = font_map.get(&state.font_name);
2058 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2059 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2060 for (ch, is_ct_origin) in text.iter() {
2061 push_glyph_positioned(
2062 &mut chars,
2063 &mut state,
2064 page,
2065 ch,
2066 is_ct_origin,
2067 char_w,
2068 );
2069 }
2070 }
2071 }
2072 "TJ" => {
2073 if let Some(Object::Array(ref arr)) = op.operands.first() {
2074 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2075 let fi = font_map.get(&state.font_name);
2076 for item in arr {
2077 match item {
2078 Object::String(bytes, _) => {
2079 let text = decode_pdf_string_with_font_marked(bytes, fi);
2080 for (ch, is_ct_origin) in text.iter() {
2081 push_glyph_positioned(
2082 &mut chars,
2083 &mut state,
2084 page,
2085 ch,
2086 is_ct_origin,
2087 char_w,
2088 );
2089 }
2090 }
2091 _ => {
2092 if let Some(adj) = as_number(item) {
2093 state.tm[4] -= adj / 1000.0 * state.font_size;
2094 }
2095 }
2096 }
2097 }
2098 }
2099 }
2100 "'" => {
2101 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2102 state.tlm = new_tlm;
2103 state.tm = new_tlm;
2104
2105 let fi = font_map.get(&state.font_name);
2106 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2107 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2108 for (ch, is_ct_origin) in text.iter() {
2109 push_glyph_positioned(
2110 &mut chars,
2111 &mut state,
2112 page,
2113 ch,
2114 is_ct_origin,
2115 char_w,
2116 );
2117 }
2118 }
2119 }
2120 "\"" => {
2121 if op.operands.len() >= 3 {
2122 if let Some(tw) = as_number(&op.operands[0]) {
2123 state.tw = tw;
2124 }
2125 if let Some(tc) = as_number(&op.operands[1]) {
2126 state.tc = tc;
2127 }
2128
2129 let new_tlm =
2130 multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2131 state.tlm = new_tlm;
2132 state.tm = new_tlm;
2133
2134 let fi = font_map.get(&state.font_name);
2135 if let Some(text) =
2136 extract_decoded_string_operand_with_font(&op.operands[2..], fi)
2137 {
2138 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2139 for (ch, is_ct_origin) in text.iter() {
2140 push_glyph_positioned(
2141 &mut chars,
2142 &mut state,
2143 page,
2144 ch,
2145 is_ct_origin,
2146 char_w,
2147 );
2148 }
2149 }
2150 }
2151 }
2152 _ => {}
2153 }
2154 }
2155
2156 chars
2157}
2158
2159#[inline]
2168fn apply_ctm(state: &TextState) -> (f64, f64) {
2169 let x = state.ctm[0] * state.tm[4] + state.ctm[2] * state.tm[5] + state.ctm[4];
2170 let y = state.ctm[1] * state.tm[4] + state.ctm[3] * state.tm[5] + state.ctm[5];
2171 (x, y)
2172}
2173
2174fn extract_decoded_string_operand_with_font(
2176 operands: &[Object],
2177 font_info: Option<&FontInfo>,
2178) -> Option<DecodedPdfString> {
2179 for op in operands {
2180 if let Object::String(bytes, _) = op {
2181 return Some(decode_pdf_string_with_font_marked(bytes, font_info));
2182 }
2183 }
2184 None
2185}
2186
2187fn as_number(obj: &Object) -> Option<f64> {
2189 match obj {
2190 Object::Integer(i) => Some(*i as f64),
2191 Object::Real(f) => Some(*f as f64),
2192 _ => None,
2193 }
2194}
2195
2196fn extract_matrix(operands: &[Object]) -> Option<[f64; 6]> {
2198 if operands.len() < 6 {
2199 return None;
2200 }
2201 let a = as_number(&operands[0])?;
2202 let b = as_number(&operands[1])?;
2203 let c = as_number(&operands[2])?;
2204 let d = as_number(&operands[3])?;
2205 let e = as_number(&operands[4])?;
2206 let f = as_number(&operands[5])?;
2207 Some([a, b, c, d, e, f])
2208}
2209
/// Multiplies two affine matrices given in six-element form `[a, b, c, d, e, f]`.
///
/// Each array stands for the 3x3 matrix `[[a, b, 0], [c, d, 0], [e, f, 1]]`; the result is
/// `m1 x m2` in that representation, i.e. `m1` is applied first when transforming row vectors.
fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    let [a1, b1, c1, d1, e1, f1] = *m1;
    let [a2, b2, c2, d2, e2, f2] = *m2;
    [
        a1 * a2 + b1 * c2,
        a1 * b2 + b1 * d2,
        c1 * a2 + d1 * c2,
        c1 * b2 + d1 * d2,
        e1 * a2 + f1 * c2 + e2,
        e1 * b2 + f1 * d2 + f2,
    ]
}
2221
2222#[cfg(test)]
2223mod tests {
2224 use super::*;
2225 use lopdf::{dictionary, Document, Object, Stream};
2226
2227 fn make_doc_with_text(content: &[u8]) -> Document {
2229 let mut doc = Document::with_version("1.7");
2230
2231 let content_stream = Stream::new(dictionary! {}, content.to_vec());
2232 let content_id = doc.add_object(Object::Stream(content_stream));
2233
2234 let page_dict = dictionary! {
2235 "Type" => "Page",
2236 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2237 "Contents" => Object::Reference(content_id),
2238 };
2239 let page_id = doc.add_object(Object::Dictionary(page_dict));
2240
2241 let pages_dict = dictionary! {
2242 "Type" => "Pages",
2243 "Kids" => vec![Object::Reference(page_id)],
2244 "Count" => 1_i64,
2245 };
2246 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2247
2248 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2249 d.set("Parent", Object::Reference(pages_id));
2250 }
2251
2252 let catalog = dictionary! {
2253 "Type" => "Catalog",
2254 "Pages" => Object::Reference(pages_id),
2255 };
2256 let catalog_id = doc.add_object(Object::Dictionary(catalog));
2257 doc.trailer.set("Root", Object::Reference(catalog_id));
2258
2259 doc
2260 }
2261
2262 #[test]
2263 fn extract_simple_text() {
2264 let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
2265 let blocks = extract_text(&doc);
2266 assert_eq!(blocks.len(), 1);
2267 assert_eq!(blocks[0].text, "Hello World");
2268 assert_eq!(blocks[0].page, 1);
2269 assert_eq!(blocks[0].font_size, 12.0);
2270 }
2271
2272 #[test]
2273 fn extract_page_text_single() {
2274 let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
2275 let text = extract_page_text(&doc, 1).unwrap();
2276 assert_eq!(text, "Hello");
2277 }
2278
2279 #[test]
2280 fn extract_page_text_out_of_range() {
2281 let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
2282 let result = extract_page_text(&doc, 5);
2283 assert!(result.is_err());
2284 }
2285
2286 #[test]
2287 fn extract_positioned_chars_basic() {
2288 let doc = make_doc_with_text(b"BT /F1 12 Tf (AB) Tj ET");
2289 let chars = extract_positioned_chars(&doc, 1).unwrap();
2290 assert_eq!(chars.len(), 2);
2291 assert_eq!(chars[0].ch, 'A');
2292 assert_eq!(chars[1].ch, 'B');
2293 assert_eq!(chars[0].page, 1);
2294 assert!(chars[1].bbox[0] > chars[0].bbox[0]);
2296 }
2297
2298 #[test]
2299 fn extract_tj_array() {
2300 let doc = make_doc_with_text(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET");
2301 let blocks = extract_text(&doc);
2302 assert_eq!(blocks.len(), 1);
2303 assert_eq!(blocks[0].text, "Hello");
2304 }
2305
2306 #[test]
2307 fn empty_page_extracts_no_text() {
2308 let doc = make_doc_with_text(b"q Q");
2309 let blocks = extract_text(&doc);
2310 assert!(blocks.is_empty());
2311 }
2312
2313 #[test]
2314 fn multiline_text_extraction() {
2315 let doc = make_doc_with_text(b"BT /F1 12 Tf 12 TL (Line1) Tj T* (Line2) Tj ET");
2316 let blocks = extract_text(&doc);
2317 assert_eq!(blocks.len(), 2);
2318 assert_eq!(blocks[0].text, "Line1");
2319 assert_eq!(blocks[1].text, "Line2");
2320 }
2321
2322 #[test]
2326 fn actual_text_basic_bdc_override() {
2327 let doc = make_doc_with_text(b"BT /F1 12 Tf /Span <</ActualText (fi)>> BDC (X) Tj EMC ET");
2328 let blocks = extract_text(&doc);
2329 assert_eq!(blocks.len(), 1);
2330 assert_eq!(blocks[0].text, "X");
2331 assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
2332 }
2333
2334 #[test]
2338 fn actual_text_nested_bdc_inner_overrides_outer() {
2339 let doc = make_doc_with_text(
2340 b"BT /F1 12 Tf \
2341 /Span <</ActualText (outer)>> BDC \
2342 (A) Tj \
2343 /Span <</ActualText (inner)>> BDC \
2344 (B) Tj \
2345 EMC \
2346 (C) Tj \
2347 EMC ET",
2348 );
2349 let blocks = extract_text(&doc);
2350 assert_eq!(blocks.len(), 3);
2351 assert_eq!(blocks[0].text, "A");
2352 assert_eq!(blocks[0].actual_text.as_deref(), Some("outer"));
2353 assert_eq!(blocks[1].text, "B");
2354 assert_eq!(blocks[1].actual_text.as_deref(), Some("inner"));
2355 assert_eq!(blocks[2].text, "C");
2356 assert_eq!(blocks[2].actual_text.as_deref(), Some("outer"));
2357 }
2358
2359 #[test]
2362 fn actual_text_bmc_does_not_leak_emc() {
2363 let doc = make_doc_with_text(
2364 b"BT /F1 12 Tf \
2365 /Span <</ActualText (X)>> BDC \
2366 /Artifact BMC (in) Tj EMC \
2367 (out) Tj \
2368 EMC \
2369 (after) Tj ET",
2370 );
2371 let blocks = extract_text(&doc);
2372 assert_eq!(blocks.len(), 3);
2373 assert_eq!(blocks[0].actual_text.as_deref(), Some("X"));
2374 assert_eq!(blocks[1].actual_text.as_deref(), Some("X"));
2375 assert_eq!(blocks[2].actual_text, None);
2376 }
2377
2378 fn make_doc_with_xobj_inside_bdc(
2384 page_pre: &[u8],
2385 xobj_content: &[u8],
2386 page_post: &[u8],
2387 actual_text: &str,
2388 ) -> Document {
2389 let mut doc = Document::with_version("1.7");
2390
2391 let xobj_dict = dictionary! {
2392 "Type" => "XObject",
2393 "Subtype" => "Form",
2394 "BBox" => vec![0.into(), 0.into(), 100.into(), 100.into()],
2395 };
2396 let xobj_stream = Stream::new(xobj_dict, xobj_content.to_vec());
2397 let xobj_id = doc.add_object(Object::Stream(xobj_stream));
2398
2399 let mut page_content = Vec::<u8>::new();
2401 page_content.extend_from_slice(b"BT /F1 12 Tf /Span <</ActualText (");
2402 page_content.extend_from_slice(actual_text.as_bytes());
2403 page_content.extend_from_slice(b")>> BDC ");
2404 page_content.extend_from_slice(page_pre);
2405 page_content.extend_from_slice(b" /Fm0 Do ");
2406 page_content.extend_from_slice(page_post);
2407 page_content.extend_from_slice(b" EMC ET");
2408
2409 let content_stream = Stream::new(dictionary! {}, page_content);
2410 let content_id = doc.add_object(Object::Stream(content_stream));
2411
2412 let resources = dictionary! {
2413 "XObject" => dictionary! { "Fm0" => Object::Reference(xobj_id) },
2414 };
2415 let resources_id = doc.add_object(Object::Dictionary(resources));
2416
2417 let page_dict = dictionary! {
2418 "Type" => "Page",
2419 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2420 "Contents" => Object::Reference(content_id),
2421 "Resources" => Object::Reference(resources_id),
2422 };
2423 let page_id = doc.add_object(Object::Dictionary(page_dict));
2424
2425 let pages_dict = dictionary! {
2426 "Type" => "Pages",
2427 "Kids" => vec![Object::Reference(page_id)],
2428 "Count" => 1_i64,
2429 };
2430 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2431
2432 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2433 d.set("Parent", Object::Reference(pages_id));
2434 }
2435
2436 let catalog = dictionary! {
2437 "Type" => "Catalog",
2438 "Pages" => Object::Reference(pages_id),
2439 };
2440 let catalog_id = doc.add_object(Object::Dictionary(catalog));
2441 doc.trailer.set("Root", Object::Reference(catalog_id));
2442
2443 doc
2444 }
2445
2446 #[test]
2453 fn actual_text_propagates_into_form_xobject_recursion() {
2454 let doc = make_doc_with_xobj_inside_bdc(
2455 b"(pre) Tj",
2456 b"BT /F1 12 Tf (inside) Tj ET",
2457 b"(post) Tj",
2458 "wrapped",
2459 );
2460 let blocks = extract_text(&doc);
2461 assert_eq!(blocks.len(), 3);
2462 assert_eq!(blocks[0].text, "pre");
2463 assert_eq!(blocks[0].actual_text.as_deref(), Some("wrapped"));
2464 assert_eq!(blocks[1].text, "inside");
2467 assert_eq!(
2468 blocks[1].actual_text.as_deref(),
2469 Some("wrapped"),
2470 "Form XObject text lost surrounding /ActualText (issue #1358)"
2471 );
2472 assert_eq!(blocks[2].text, "post");
2473 assert_eq!(blocks[2].actual_text.as_deref(), Some("wrapped"));
2474 }
2475
2476 #[test]
2483 fn actual_text_inner_xobj_bdc_overrides_inherited() {
2484 let doc = make_doc_with_xobj_inside_bdc(
2485 b"",
2486 b"BT /F1 12 Tf \
2487 /Span <</ActualText (inner)>> BDC (B) Tj EMC \
2488 (after) Tj ET",
2489 b"",
2490 "outer",
2491 );
2492 let blocks = extract_text(&doc);
2493 assert_eq!(blocks.len(), 2);
2494 assert_eq!(blocks[0].text, "B");
2495 assert_eq!(blocks[0].actual_text.as_deref(), Some("inner"));
2496 assert_eq!(blocks[1].text, "after");
2497 assert_eq!(blocks[1].actual_text.as_deref(), Some("outer"));
2498 }
2499
2500 #[test]
2503 fn actual_text_utf16be_bom_decodes() {
2504 let doc = make_doc_with_text(
2506 b"BT /F1 12 Tf /Span <</ActualText <FEFF00660069> >> BDC (X) Tj EMC ET",
2507 );
2508 let blocks = extract_text(&doc);
2509 assert_eq!(blocks.len(), 1);
2510 assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
2511 }
2512
2513 #[test]
2516 fn ligature_ff_decomposes() {
2517 assert_eq!(decompose_ligature_char('\u{FB00}'), Some("ff"));
2518 assert_eq!(decompose_ligatures("\u{FB00}"), "ff");
2519 assert_eq!(decompose_ligatures("o\u{FB00}ice"), "office");
2520 }
2521
2522 #[test]
2523 fn ligature_fi_decomposes() {
2524 assert_eq!(decompose_ligature_char('\u{FB01}'), Some("fi"));
2525 assert_eq!(decompose_ligatures("\u{FB01}"), "fi");
2526 assert_eq!(decompose_ligatures("of\u{FB01}ce"), "office");
2527 }
2528
2529 #[test]
2530 fn ligature_fl_decomposes() {
2531 assert_eq!(decompose_ligature_char('\u{FB02}'), Some("fl"));
2532 assert_eq!(decompose_ligatures("\u{FB02}ame"), "flame");
2533 }
2534
2535 #[test]
2536 fn ligature_ffi_decomposes() {
2537 assert_eq!(decompose_ligature_char('\u{FB03}'), Some("ffi"));
2538 assert_eq!(decompose_ligatures("o\u{FB03}ce"), "office");
2539 }
2540
2541 #[test]
2542 fn ligature_ffl_decomposes() {
2543 assert_eq!(decompose_ligature_char('\u{FB04}'), Some("ffl"));
2544 assert_eq!(decompose_ligatures("ba\u{FB04}e"), "baffle");
2545 }
2546
2547 #[test]
2552 fn ligature_st_decomposes() {
2553 assert_eq!(decompose_ligature_char('\u{FB06}'), Some("st"));
2554 assert_eq!(decompose_ligature_char('\u{FB05}'), Some("st"));
2555 assert_eq!(decompose_ligatures("fa\u{FB06}"), "fast");
2556 assert_eq!(glyph_name_to_unicode("st"), Some('\u{FB06}'));
2558 }
2559
2560 #[test]
2567 fn ligature_ct_via_glyph_name_emits_string() {
2568 assert_eq!(glyph_name_to_unicode("ct"), None);
2569 let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6101> Tj ET", "ct");
2571 let blocks = extract_text(&doc);
2572 assert_eq!(blocks.len(), 1);
2573 assert_eq!(blocks[0].text, "act");
2574 }
2575
2576 #[test]
2581 fn tounicode_pua_e007_is_preserved() {
2582 let doc = make_doc_with_tounicode_cmap(b"BT /F1 12 Tf <01> Tj ET", 0x01, 0xE007);
2583 let blocks = extract_text(&doc);
2584 assert_eq!(blocks.len(), 1);
2585 assert_eq!(blocks[0].text, "\u{E007}");
2586 assert_ne!(blocks[0].text, "ct");
2587 }
2588
2589 #[test]
2593 fn ligature_ct_positioned_chars_advance_as_single_glyph() {
2594 let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <610162> Tj ET", "ct");
2596 let chars = extract_positioned_chars(&doc, 1).unwrap();
2597 let extracted: String = chars.iter().map(|c| c.ch).collect();
2598 assert_eq!(extracted, "actb");
2599 assert_eq!(chars.len(), 4);
2600
2601 let char_w = 12.0 * APPROX_CHAR_WIDTH;
2602 assert!((chars[1].bbox[0] - char_w).abs() < 1e-6);
2603 assert!((chars[3].bbox[0] - (2.0 * char_w)).abs() < 1e-6);
2604 }
2605
2606 fn make_doc_with_ligature_font(content: &[u8], glyph_name: &str) -> Document {
2611 let mut doc = Document::with_version("1.7");
2612
2613 let content_stream = Stream::new(dictionary! {}, content.to_vec());
2614 let content_id = doc.add_object(Object::Stream(content_stream));
2615
2616 let encoding = dictionary! {
2617 "Type" => "Encoding",
2618 "BaseEncoding" => "WinAnsiEncoding",
2619 "Differences" => vec![
2620 1_i64.into(),
2621 Object::Name(glyph_name.as_bytes().to_vec()),
2622 ],
2623 };
2624 let encoding_id = doc.add_object(Object::Dictionary(encoding));
2625
2626 let font = dictionary! {
2627 "Type" => "Font",
2628 "Subtype" => "Type1",
2629 "BaseFont" => "Helvetica",
2630 "Encoding" => Object::Reference(encoding_id),
2631 };
2632 let font_id = doc.add_object(Object::Dictionary(font));
2633
2634 let resources = dictionary! {
2635 "Font" => dictionary! { "F1" => Object::Reference(font_id) },
2636 };
2637 let resources_id = doc.add_object(Object::Dictionary(resources));
2638
2639 let page_dict = dictionary! {
2640 "Type" => "Page",
2641 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2642 "Contents" => Object::Reference(content_id),
2643 "Resources" => Object::Reference(resources_id),
2644 };
2645 let page_id = doc.add_object(Object::Dictionary(page_dict));
2646
2647 let pages_dict = dictionary! {
2648 "Type" => "Pages",
2649 "Kids" => vec![Object::Reference(page_id)],
2650 "Count" => 1_i64,
2651 };
2652 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2653
2654 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2655 d.set("Parent", Object::Reference(pages_id));
2656 }
2657
2658 let catalog = dictionary! {
2659 "Type" => "Catalog",
2660 "Pages" => Object::Reference(pages_id),
2661 };
2662 let catalog_id = doc.add_object(Object::Dictionary(catalog));
2663 doc.trailer.set("Root", Object::Reference(catalog_id));
2664
2665 doc
2666 }
2667
2668 #[test]
2674 fn ligature_office_golden_ffi() {
2675 let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6F016365> Tj ET", "ffi");
2677 let blocks = extract_text(&doc);
2678 assert_eq!(blocks.len(), 1);
2679 assert_eq!(blocks[0].text, "office");
2680 }
2681
2682 fn make_doc_with_tounicode_cmap(content: &[u8], src: u8, dst: u32) -> Document {
2687 let mut doc = Document::with_version("1.7");
2688
2689 let content_stream = Stream::new(dictionary! {}, content.to_vec());
2690 let content_id = doc.add_object(Object::Stream(content_stream));
2691
2692 let cmap_text = format!(
2694 "/CIDInit /ProcSet findresource begin\n\
2695 12 dict begin\n\
2696 begincmap\n\
2697 /CMapType 2 def\n\
2698 1 beginbfchar\n\
2699 <{:02X}> <{:04X}>\n\
2700 endbfchar\n\
2701 endcmap CMapName currentdict /CMap defineresource pop end end",
2702 src, dst
2703 );
2704 let cmap_stream = Stream::new(dictionary! {}, cmap_text.into_bytes());
2705 let cmap_id = doc.add_object(Object::Stream(cmap_stream));
2706
2707 let font = dictionary! {
2708 "Type" => "Font",
2709 "Subtype" => "Type1",
2710 "BaseFont" => "Helvetica",
2711 "Encoding" => "WinAnsiEncoding",
2712 "ToUnicode" => Object::Reference(cmap_id),
2713 };
2714 let font_id = doc.add_object(Object::Dictionary(font));
2715
2716 let resources = dictionary! {
2717 "Font" => dictionary! { "F1" => Object::Reference(font_id) },
2718 };
2719 let resources_id = doc.add_object(Object::Dictionary(resources));
2720
2721 let page_dict = dictionary! {
2722 "Type" => "Page",
2723 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2724 "Contents" => Object::Reference(content_id),
2725 "Resources" => Object::Reference(resources_id),
2726 };
2727 let page_id = doc.add_object(Object::Dictionary(page_dict));
2728
2729 let pages_dict = dictionary! {
2730 "Type" => "Pages",
2731 "Kids" => vec![Object::Reference(page_id)],
2732 "Count" => 1_i64,
2733 };
2734 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2735
2736 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2737 d.set("Parent", Object::Reference(pages_id));
2738 }
2739
2740 let catalog = dictionary! {
2741 "Type" => "Catalog",
2742 "Pages" => Object::Reference(pages_id),
2743 };
2744 let catalog_id = doc.add_object(Object::Dictionary(catalog));
2745 doc.trailer.set("Root", Object::Reference(catalog_id));
2746
2747 doc
2748 }
2749
2750 #[test]
2755 fn ligature_positioned_chars_split_bbox() {
2756 let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
2757 let chars = extract_positioned_chars(&doc, 1).unwrap();
2758 assert_eq!(chars.len(), 3, "ffi → 3 chars");
2759 assert_eq!(chars[0].ch, 'f');
2760 assert_eq!(chars[1].ch, 'f');
2761 assert_eq!(chars[2].ch, 'i');
2762 assert!(chars[0].bbox[2] <= chars[1].bbox[0] + 1e-9);
2764 assert!(chars[1].bbox[2] <= chars[2].bbox[0] + 1e-9);
2765 let total = chars[2].bbox[2] - chars[0].bbox[0];
2767 let expected = 12.0 * APPROX_CHAR_WIDTH;
2768 assert!((total - expected).abs() < 1e-6);
2769 }
2770
2771 #[test]
2781 fn positioned_chars_match_extracted_text_for_armenian_ligature() {
2782 let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB13");
2785 let blocks = extract_text(&doc);
2786 let chars = extract_positioned_chars(&doc, 1).unwrap();
2787 assert_eq!(blocks.len(), 1);
2788 assert_eq!(
2790 blocks[0].text.chars().count(),
2791 chars.len(),
2792 "PositionedChar count drifted from extract_text() char count \
2793 (issue #1359 problem 1): blocks[0].text = {:?}, chars = {:?}",
2794 blocks[0].text,
2795 chars.iter().map(|c| c.ch).collect::<String>(),
2796 );
2797 let chars_str: String = chars.iter().map(|c| c.ch).collect();
2799 assert_eq!(blocks[0].text, chars_str);
2800 let total = chars.last().unwrap().bbox[2] - chars.first().unwrap().bbox[0];
2802 let expected = 12.0 * APPROX_CHAR_WIDTH;
2803 assert!((total - expected).abs() < 1e-6);
2804 }
2805
2806 #[test]
2810 fn hebrew_presentation_form_extracts_single_codepoint_nfkd_mapping() {
2811 let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
2813 let blocks = extract_text(&doc);
2814 assert_eq!(blocks.len(), 1);
2815 assert_eq!(blocks[0].text, "\u{05D0}");
2816 assert_eq!(
2817 decompose_glyph_to_string('\u{FB21}').as_deref(),
2818 Some("\u{05D0}")
2819 );
2820 }
2821
2822 #[test]
2826 fn positioned_chars_match_extracted_text_for_hebrew_presentation_form() {
2827 let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
2828 let blocks = extract_text(&doc);
2829 let chars = extract_positioned_chars(&doc, 1).unwrap();
2830 let chars_str: String = chars.iter().map(|c| c.ch).collect();
2831
2832 assert_eq!(blocks.len(), 1);
2833 assert_eq!(blocks[0].text, "\u{05D0}");
2834 assert_eq!(chars_str, blocks[0].text);
2835 assert_eq!(blocks[0].text.chars().count(), chars.len());
2836 }
2837
2838 #[test]
2845 fn tj_block_width_matches_rendered_glyph_count() {
2846 let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
2847 let blocks = extract_text(&doc);
2848 assert_eq!(blocks.len(), 1);
2849 assert_eq!(blocks[0].text, "ffi");
2850 let width = blocks[0].bbox[2] - blocks[0].bbox[0];
2851 let expected = 12.0 * APPROX_CHAR_WIDTH;
2852 assert!(
2853 (width - expected).abs() < 1e-6,
2854 "Tj bbox width {} != one glyph's char_w {} (issue #1359 problem 2)",
2855 width,
2856 expected,
2857 );
2858 }
2859
2860 #[test]
2866 fn tj_and_tj_array_bbox_widths_agree_for_ligature() {
2867 let tj_doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
2868 let tj_blocks = extract_text(&tj_doc);
2869 let tj_doc_arr = make_doc_with_ligature_font(b"BT /F1 12 Tf [<01>] TJ ET", "ffi");
2870 let tj_arr_blocks = extract_text(&tj_doc_arr);
2871 assert_eq!(tj_blocks.len(), 1);
2872 assert_eq!(tj_arr_blocks.len(), 1);
2873 let tj_w = tj_blocks[0].bbox[2] - tj_blocks[0].bbox[0];
2874 let tj_arr_w = tj_arr_blocks[0].bbox[2] - tj_arr_blocks[0].bbox[0];
2875 assert!(
2876 (tj_w - tj_arr_w).abs() < 1e-6,
2877 "Tj bbox width {} disagrees with TJ bbox width {} for the same \
2878 single-glyph ligature input (issue #1359 problem 2)",
2879 tj_w,
2880 tj_arr_w,
2881 );
2882 }
2883}