1use crate::error::{ExtractError, Result};
7use lopdf::content::{Content, Operation};
8use lopdf::{Document, Object, ObjectId};
9use std::collections::HashMap;
10use std::sync::OnceLock;
11
/// Approximate character-width factor.
///
/// NOTE(review): presumably the fraction of the font size used as a glyph advance when real
/// font metrics are unavailable — the use site is outside this view; confirm there.
const APPROX_CHAR_WIDTH: f64 = 0.5;
14
/// A piece of extracted text together with its page, bounding box and font attributes.
#[derive(Debug, Clone)]
pub struct TextBlock {
    /// Text content of the block.
    pub text: String,
    /// Page number the block belongs to.
    pub page: u32,
    /// Bounding box as four coordinates.
    /// NOTE(review): presumably `[x0, y0, x1, y1]` in page space — confirm against the producer.
    pub bbox: [f64; 4],
    /// Name of the font the text was set in.
    pub font_name: String,
    /// Font size of the text.
    pub font_size: f64,
    /// Optional replacement text.
    /// NOTE(review): presumably the `/ActualText` of an enclosing marked-content sequence — confirm.
    pub actual_text: Option<String>,
}
34
/// A single extracted character with the page it is on and its bounding box.
#[derive(Debug, Clone)]
pub struct PositionedChar {
    /// The character itself.
    pub ch: char,
    /// Page number the character belongs to.
    pub page: u32,
    /// Bounding box as four coordinates.
    /// NOTE(review): presumably `[x0, y0, x1, y1]` in page space — confirm against the producer.
    pub bbox: [f64; 4],
}
45
/// Snapshot of graphics state kept on `TextState::gs_stack`.
#[derive(Debug, Clone)]
struct GraphicsState {
    /// Six-element transformation matrix.
    /// NOTE(review): presumably the PDF current transformation matrix `[a b c d e f]` — confirm.
    ctm: [f64; 6],
}
51
/// Mutable state tracked while walking a content stream.
///
/// NOTE(review): the short field names look like the PDF text-state parameters
/// (Tm, Tlm, Tc, Tw, Tz, TL, Ts); the operator handling is outside this view — confirm there.
#[derive(Debug, Clone)]
struct TextState {
    /// Six-element matrix (presumably the text matrix).
    tm: [f64; 6],
    /// Six-element matrix (presumably the text line matrix).
    tlm: [f64; 6],
    /// Name of the currently selected font.
    font_name: String,
    /// Current font size; defaults to 12.0.
    font_size: f64,
    /// Presumably character spacing; defaults to 0.0.
    tc: f64,
    /// Presumably word spacing; defaults to 0.0.
    tw: f64,
    /// Presumably horizontal scaling in percent; defaults to 100.0.
    th: f64,
    /// Presumably text leading; defaults to 0.0.
    tl: f64,
    /// Presumably text rise; defaults to 0.0.
    ts: f64,
    /// Stack of saved graphics states.
    gs_stack: Vec<GraphicsState>,
    /// Six-element matrix (presumably the current transformation matrix).
    ctm: [f64; 6],
}
78
79impl Default for TextState {
80 fn default() -> Self {
81 Self {
82 tm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
83 tlm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
84 font_name: String::new(),
85 font_size: 12.0,
86 tc: 0.0,
87 tw: 0.0,
88 th: 100.0,
89 tl: 0.0,
90 ts: 0.0,
91 gs_stack: Vec::new(),
92 ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
93 }
94 }
95}
96
/// Decoding information gathered for one font resource of a page.
#[derive(Clone)]
struct FontInfo {
    /// `true` when the font's `/Subtype` is `Type0` (composite font, two-byte codes).
    is_cid: bool,
    /// Character code → Unicode string, parsed from the font's `/ToUnicode` CMap.
    to_unicode: HashMap<u32, String>,
    /// Per-byte-code character derived from `/Encoding` (base encoding plus `/Differences`).
    encoding_map: [Option<char>; 256],
    /// Byte codes whose `/Differences` glyph name is `ct`; these decode to the ligature marker.
    ct_codes: [bool; 256],
}
117
118fn build_font_map(doc: &Document, page_id: ObjectId) -> HashMap<String, FontInfo> {
120 let mut map = HashMap::new();
121
122 let resources = get_page_resources(doc, page_id);
124 let font_dict = match resources.and_then(|res| match res.get(b"Font").ok()? {
125 Object::Dictionary(d) => Some(d.clone()),
126 Object::Reference(r) => match doc.get_object(*r).ok()? {
127 Object::Dictionary(d) => Some(d.clone()),
128 _ => None,
129 },
130 _ => None,
131 }) {
132 Some(d) => d,
133 None => return map,
134 };
135
136 for (name_bytes, value) in font_dict.iter() {
137 let font_name = String::from_utf8_lossy(name_bytes).to_string();
138
139 let font = match value {
141 Object::Reference(r) => match doc.get_object(*r).ok() {
142 Some(Object::Dictionary(d)) => d.clone(),
143 _ => continue,
144 },
145 Object::Dictionary(d) => d.clone(),
146 _ => continue,
147 };
148
149 let subtype = font
150 .get(b"Subtype")
151 .ok()
152 .and_then(|o| match o {
153 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
154 _ => None,
155 })
156 .unwrap_or_default();
157
158 let is_cid = subtype == "Type0";
159
160 let to_unicode = parse_to_unicode_from_font(doc, &font);
162
163 let to_unicode = if to_unicode.is_empty() && is_cid {
165 if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
166 descendants
167 .iter()
168 .find_map(|d| {
169 let desc_dict = match d {
170 Object::Reference(r) => match doc.get_object(*r).ok()? {
171 Object::Dictionary(d) => d,
172 _ => return None,
173 },
174 Object::Dictionary(d) => d,
175 _ => return None,
176 };
177 let tu = parse_to_unicode_from_font(doc, desc_dict);
178 if tu.is_empty() {
179 None
180 } else {
181 Some(tu)
182 }
183 })
184 .unwrap_or_default()
185 } else {
186 HashMap::new()
187 }
188 } else {
189 to_unicode
190 };
191
192 let (encoding_map, ct_codes) = if !is_cid {
194 build_encoding_map(doc, &font)
195 } else {
196 ([None; 256], [false; 256])
197 };
198
199 map.insert(
200 font_name,
201 FontInfo {
202 is_cid,
203 to_unicode,
204 encoding_map,
205 ct_codes,
206 },
207 );
208 }
209
210 map
211}
212
213fn parse_to_unicode_from_font(doc: &Document, font: &lopdf::Dictionary) -> HashMap<u32, String> {
215 let tu_obj = match font.get(b"ToUnicode").ok() {
216 Some(Object::Reference(r)) => doc.get_object(*r).ok(),
217 Some(obj) => Some(obj),
218 None => return HashMap::new(),
219 };
220
221 let stream_bytes = match tu_obj {
222 Some(Object::Stream(ref s)) => s
223 .decompressed_content()
224 .ok()
225 .unwrap_or_else(|| s.content.clone()),
226 _ => return HashMap::new(),
227 };
228
229 parse_to_unicode_cmap(&stream_bytes)
230}
231
232fn parse_to_unicode_cmap(data: &[u8]) -> HashMap<u32, String> {
236 let text = String::from_utf8_lossy(data);
237 let mut map = HashMap::new();
238
239 for section in text.split("beginbfchar") {
241 let section = match section.split("endbfchar").next() {
242 Some(s) => s,
243 None => continue,
244 };
245 let tokens = extract_hex_tokens(section);
246 for pair in tokens.chunks(2) {
247 if pair.len() == 2 {
248 let code = parse_hex_u32(&pair[0]);
249 let unicode = hex_to_unicode_string(&pair[1]);
250 map.insert(code, unicode);
251 }
252 }
253 }
254
255 for section in text.split("beginbfrange") {
257 let section = match section.split("endbfrange").next() {
258 Some(s) => s,
259 None => continue,
260 };
261
262 let mut chars = section.chars().peekable();
264 let mut tokens: Vec<String> = Vec::new();
265 let mut arrays: Vec<Vec<String>> = Vec::new();
266 let mut in_array = false;
267 let mut current_array: Vec<String> = Vec::new();
268
269 while let Some(&ch) = chars.peek() {
270 if ch == '<' {
271 chars.next();
272 let hex: String = chars
273 .by_ref()
274 .take_while(|&c| c != '>')
275 .filter(|c| !c.is_whitespace())
276 .collect();
277 if in_array {
278 current_array.push(hex);
279 } else {
280 tokens.push(hex);
281 }
282 } else if ch == '[' {
283 chars.next();
284 in_array = true;
285 current_array = Vec::new();
286 } else if ch == ']' {
287 chars.next();
288 in_array = false;
289 arrays.push(std::mem::take(&mut current_array));
290 tokens.push(String::new()); } else {
292 chars.next();
293 }
294 }
295
296 let mut array_idx = 0;
298 let mut i = 0;
299 while i + 2 < tokens.len() {
300 let lo = parse_hex_u32(&tokens[i]);
301 let hi = parse_hex_u32(&tokens[i + 1]);
302
303 if tokens[i + 2].is_empty() {
304 if array_idx < arrays.len() {
306 let arr = &arrays[array_idx];
307 for (offset, dst) in arr.iter().enumerate() {
308 let code = lo + offset as u32;
309 if code <= hi {
310 map.insert(code, hex_to_unicode_string(dst));
311 }
312 }
313 array_idx += 1;
314 }
315 } else {
316 let dst_start = parse_hex_u32(&tokens[i + 2]);
318 let dst_len = tokens[i + 2].len();
319 for code in lo..=hi {
320 let dst_val = dst_start + (code - lo);
321 let s = if dst_len <= 4 {
322 char::from_u32(dst_val)
324 .map(|c| c.to_string())
325 .unwrap_or_default()
326 } else {
327 let hex = format!("{:0>width$X}", dst_val, width = dst_len);
329 hex_to_unicode_string(&hex)
330 };
331 map.insert(code, s);
332 }
333 }
334 i += 3;
335 }
336 }
337
338 map
339}
340
/// Returns the contents of every `<...>` group in `text`, with whitespace removed.
///
/// A new `<` restarts the group being collected; a `>` with no open group is ignored.
fn extract_hex_tokens(text: &str) -> Vec<String> {
    let mut found = Vec::new();
    // `Some` while inside a `<...>` group, holding the characters seen so far.
    let mut open: Option<String> = None;

    for ch in text.chars() {
        match ch {
            '<' => open = Some(String::new()),
            '>' => {
                if let Some(token) = open.take() {
                    found.push(token);
                }
            }
            other if !other.is_whitespace() => {
                if let Some(token) = open.as_mut() {
                    token.push(other);
                }
            }
            _ => {}
        }
    }

    found
}
359
/// Parses `hex` as a base-16 `u32`, yielding `0` for anything that does not parse
/// (empty input, non-hex characters, values that do not fit in 32 bits).
fn parse_hex_u32(hex: &str) -> u32 {
    u32::from_str_radix(hex, 16).unwrap_or_default()
}
364
/// Decodes a hex string into text.
///
/// The input is split into two-character groups, each parsed as one byte (groups that do
/// not parse are dropped). An even, non-zero number of bytes is decoded as UTF-16BE with
/// lossy replacement; a single byte becomes the code point of the same value; anything
/// else yields an empty string.
fn hex_to_unicode_string(hex: &str) -> String {
    let bytes: Vec<u8> = hex
        .as_bytes()
        .chunks(2)
        .filter_map(|group| std::str::from_utf8(group).ok())
        .filter_map(|digits| u8::from_str_radix(digits, 16).ok())
        .collect();

    match bytes.len() {
        1 => char::from(bytes[0]).to_string(),
        n if n >= 2 && n.is_multiple_of(2) => {
            let units: Vec<u16> = bytes
                .chunks(2)
                .map(|unit| u16::from_be_bytes([unit[0], unit[1]]))
                .collect();
            String::from_utf16_lossy(&units)
        }
        _ => String::new(),
    }
}
400
/// Heuristically reports whether `text` looks like mis-decoded output.
///
/// Only the first 500 characters are inspected. Samples with fewer than 30
/// non-whitespace characters are never flagged. Otherwise the text is flagged when ASCII
/// letters make up less than 25% of the sampled characters (whitespace included in the
/// denominator).
pub fn suspected_garbled(text: &str) -> bool {
    let mut sampled = 0usize;
    let mut non_whitespace = 0usize;
    let mut ascii_letters = 0usize;

    for c in text.chars().take(500) {
        sampled += 1;
        if !c.is_whitespace() {
            non_whitespace += 1;
        }
        if c.is_ascii_alphabetic() {
            ascii_letters += 1;
        }
    }

    // Too little material to judge.
    if non_whitespace < 30 {
        return false;
    }

    (ascii_letters as f64 / sampled as f64) < 0.25
}
450
// Unit tests pinning the behaviour of `suspected_garbled` on representative inputs.
#[cfg(test)]
mod __suspected_garbled_unit_tests {
    use super::suspected_garbled;

    // No characters at all: below the minimum sample size.
    #[test]
    fn empty_string_is_not_garbled() {
        assert!(!suspected_garbled(""));
    }

    // Whitespace does not count towards the minimum sample size.
    #[test]
    fn whitespace_only_is_not_garbled() {
        assert!(!suspected_garbled(" \n\t "));
    }

    // Fewer than 30 non-whitespace characters are never flagged.
    #[test]
    fn short_non_empty_is_not_garbled_under_threshold() {
        assert!(!suspected_garbled("Hi there!"));
    }

    // Ordinary English prose is dominated by ASCII letters.
    #[test]
    fn real_english_is_not_garbled() {
        let text =
            "Department of Homeland Security U.S. Citizenship and Immigration Services Form I-129. \
             Filing fees apply per applicant.";
        assert!(!suspected_garbled(text));
    }

    // Shifted text still consists mostly of ASCII letters, so the letter-ratio heuristic
    // cannot catch it; the test records that limitation.
    #[test]
    fn caesar_shifted_is_documented_limitation() {
        let text =
            "3(5)250$1&(%21'6HHLQVWUXFWLRQVRQUHYHUVH'$7(%21'(;(&87('0XVWEHVDPHRUODWHUWKDQGDWHRIFRQWUDFW20%1R3XEOLF";
        assert!(!suspected_garbled(text));
    }

    // Punctuation-heavy output with almost no letters is flagged.
    #[test]
    fn custom_encoding_garbled_is_detected() {
        let text =
            " !\" ! !# $%&\u{2019}\"((\")*+,$&\"+-(&$&.$%&-&/\"/&0, &$*$%&/$%&/$\u{2019}$& * 1 -2 $23% $23\"";
        assert!(suspected_garbled(text));
    }

    // Digits mixed with enough words stay above the letter-ratio threshold.
    #[test]
    fn mostly_numbers_text_is_not_falsely_flagged() {
        let text =
            "Invoice 12345 Total 67890.00 USD Date 2026-05-13 Vendor name and contact details here.";
        assert!(!suspected_garbled(text));
    }
}
511
512fn get_page_resources(doc: &Document, page_id: ObjectId) -> Option<lopdf::Dictionary> {
514 let page = match doc.get_object(page_id).ok()? {
515 Object::Dictionary(d) => d.clone(),
516 _ => return None,
517 };
518
519 if let Some(res) = resolve_dict(doc, &page, b"Resources") {
521 return Some(res);
522 }
523
524 let mut current = page;
526 for _ in 0..20 {
527 let parent_ref = match current.get(b"Parent").ok()? {
528 Object::Reference(r) => *r,
529 _ => break,
530 };
531 let parent = match doc.get_object(parent_ref).ok()? {
532 Object::Dictionary(d) => d.clone(),
533 _ => break,
534 };
535 if let Some(res) = resolve_dict(doc, &parent, b"Resources") {
536 return Some(res);
537 }
538 current = parent;
539 }
540
541 None
542}
543
544fn resolve_dict(doc: &Document, dict: &lopdf::Dictionary, key: &[u8]) -> Option<lopdf::Dictionary> {
546 match dict.get(key).ok()? {
547 Object::Dictionary(d) => Some(d.clone()),
548 Object::Reference(r) => match doc.get_object(*r).ok()? {
549 Object::Dictionary(d) => Some(d.clone()),
550 _ => None,
551 },
552 _ => None,
553 }
554}
555
556fn build_encoding_map(
563 doc: &Document,
564 font: &lopdf::Dictionary,
565) -> ([Option<char>; 256], [bool; 256]) {
566 let mut table = [None::<char>; 256];
567 let mut ct_codes = [false; 256];
568
569 let encoding = match font.get(b"Encoding").ok() {
570 Some(obj) => obj,
571 None => return (table, ct_codes),
572 };
573
574 match encoding {
575 Object::Name(name) => {
576 let name_str = String::from_utf8_lossy(name);
578 apply_base_encoding(&mut table, &name_str);
579 }
580 Object::Reference(r) => match doc.get_object(*r) {
581 Ok(Object::Dictionary(enc_dict)) => {
582 parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
583 }
584 Ok(Object::Name(name)) => {
585 let name_str = String::from_utf8_lossy(name);
586 apply_base_encoding(&mut table, &name_str);
587 }
588 _ => {}
589 },
590 Object::Dictionary(enc_dict) => {
591 parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
592 }
593 _ => {}
594 }
595
596 (table, ct_codes)
597}
598
599fn parse_encoding_dict(
601 doc: &Document,
602 enc_dict: &lopdf::Dictionary,
603 table: &mut [Option<char>; 256],
604 ct_codes: &mut [bool; 256],
605) {
606 if let Ok(Object::Name(base)) = enc_dict.get(b"BaseEncoding") {
608 let base_str = String::from_utf8_lossy(base);
609 apply_base_encoding(table, &base_str);
610 }
611
612 let diffs = match enc_dict.get(b"Differences").ok() {
614 Some(Object::Array(arr)) => arr.clone(),
615 Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
616 Some(Object::Array(arr)) => arr.clone(),
617 _ => return,
618 },
619 _ => return,
620 };
621
622 let mut code: Option<u32> = None;
623 for item in &diffs {
624 match item {
625 Object::Integer(n) => {
626 code = Some(*n as u32);
627 }
628 Object::Name(name) => {
629 if let Some(c) = code {
630 if c < 256 {
631 let glyph = String::from_utf8_lossy(name);
632 apply_glyph_name(&glyph, c as usize, table, ct_codes);
633 }
634 code = Some(c + 1);
635 }
636 }
637 Object::Reference(r) => {
638 if let Ok(Object::Name(name)) = doc.get_object(*r) {
640 if let Some(c) = code {
641 if c < 256 {
642 let glyph = String::from_utf8_lossy(name);
643 apply_glyph_name(&glyph, c as usize, table, ct_codes);
644 }
645 code = Some(c + 1);
646 }
647 }
648 }
649 _ => {}
650 }
651 }
652}
653
654fn apply_glyph_name(
660 glyph: &str,
661 code: usize,
662 table: &mut [Option<char>; 256],
663 ct_codes: &mut [bool; 256],
664) {
665 if glyph == "ct" {
666 table[code] = None;
669 ct_codes[code] = true;
670 return;
671 }
672 if let Some(ch) = glyph_name_to_unicode(glyph) {
673 table[code] = Some(ch);
674 ct_codes[code] = false;
675 }
676}
677
678fn apply_base_encoding(table: &mut [Option<char>; 256], name: &str) {
680 let source = match name {
681 "WinAnsiEncoding" => winansi_encoding(),
682 "MacRomanEncoding" => mac_roman_encoding(),
683 _ => return,
684 };
685 for (i, &ch) in source.iter().enumerate() {
686 if ch != '\0' {
687 table[i] = Some(ch);
688 }
689 }
690}
691
/// Returns the lazily built WinAnsi code → character table.
///
/// Defined codes: tab, line feed, carriage return, `0x20..=0x7E` (ASCII), the Windows-1252
/// specials in `0x80..=0x9F`, and `0xA0..=0xFF` (same value as the code point). Every other
/// slot holds `'\0'`.
fn winansi_encoding() -> &'static [char; 256] {
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        // Windows-1252 assignments that differ from Latin-1 in the 0x80..=0x9F block.
        const CP1252_SPECIALS: [(u8, char); 27] = [
            (0x80, '\u{20AC}'),
            (0x82, '\u{201A}'),
            (0x83, '\u{0192}'),
            (0x84, '\u{201E}'),
            (0x85, '\u{2026}'),
            (0x86, '\u{2020}'),
            (0x87, '\u{2021}'),
            (0x88, '\u{02C6}'),
            (0x89, '\u{2030}'),
            (0x8A, '\u{0160}'),
            (0x8B, '\u{2039}'),
            (0x8C, '\u{0152}'),
            (0x8E, '\u{017D}'),
            (0x91, '\u{2018}'),
            (0x92, '\u{2019}'),
            (0x93, '\u{201C}'),
            (0x94, '\u{201D}'),
            (0x95, '\u{2022}'),
            (0x96, '\u{2013}'),
            (0x97, '\u{2014}'),
            (0x98, '\u{02DC}'),
            (0x99, '\u{2122}'),
            (0x9A, '\u{0161}'),
            (0x9B, '\u{203A}'),
            (0x9C, '\u{0153}'),
            (0x9E, '\u{017E}'),
            (0x9F, '\u{0178}'),
        ];

        let mut table = ['\0'; 256];

        // Codes whose character has the same numeric value as the code.
        let whitespace = [0x09u8, 0x0A, 0x0D];
        for code in whitespace
            .into_iter()
            .chain(0x20u8..=0x7E)
            .chain(0xA0u8..=0xFF)
        {
            table[usize::from(code)] = char::from(code);
        }

        for (code, ch) in CP1252_SPECIALS {
            table[usize::from(code)] = ch;
        }

        table
    })
}
745
/// Returns the lazily built MacRoman code → character table.
///
/// Defined codes: tab, line feed, carriage return, `0x20..=0x7E` (ASCII) and the full
/// upper half `0x80..=0xFF`. Every other slot holds `'\0'`.
fn mac_roman_encoding() -> &'static [char; 256] {
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        // Characters for codes 0x80..=0xFF, in code order.
        const UPPER_HALF: [char; 128] = [
            '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}',
            '\u{00E1}', '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}',
            '\u{00E9}', '\u{00E8}', '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}',
            '\u{00EF}', '\u{00F1}', '\u{00F3}', '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}',
            '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}', '\u{2020}', '\u{00B0}', '\u{00A2}',
            '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}', '\u{00AE}', '\u{00A9}',
            '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}', '\u{221E}',
            '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
            '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}',
            '\u{00F8}', '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}',
            '\u{2206}', '\u{00AB}', '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}',
            '\u{00D5}', '\u{0152}', '\u{0153}', '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}',
            '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}', '\u{00FF}', '\u{0178}', '\u{2044}',
            '\u{20AC}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}', '\u{2021}', '\u{00B7}',
            '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}', '\u{00CB}',
            '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
            '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}',
            '\u{02DC}', '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}',
            '\u{02DB}', '\u{02C7}',
        ];

        let mut table = ['\0'; 256];

        // Lower half: whitespace controls plus printable ASCII map to themselves.
        let whitespace = [0x09u8, 0x0A, 0x0D];
        for code in whitespace.into_iter().chain(0x20u8..=0x7E) {
            table[usize::from(code)] = char::from(code);
        }

        table[0x80..].copy_from_slice(&UPPER_HALF);
        table
    })
}
786
787fn glyph_name_to_unicode(name: &str) -> Option<char> {
798 if name.starts_with("uni") && name.len() >= 7 {
800 return u32::from_str_radix(&name[3..7], 16)
801 .ok()
802 .and_then(char::from_u32);
803 }
804 if name.starts_with('u') && name.len() >= 5 && name[1..].chars().all(|c| c.is_ascii_hexdigit())
805 {
806 return u32::from_str_radix(&name[1..], 16)
807 .ok()
808 .and_then(char::from_u32);
809 }
810
811 if let Some(c) = agl_table().get(name).copied() {
812 return Some(c);
813 }
814
815 match name {
816 "st" => Some('\u{FB06}'),
817 "longst" => Some('\u{FB05}'),
818 _ => None,
819 }
820}
821
/// Returns the lazily built glyph-name → character lookup used by `glyph_name_to_unicode`.
///
/// The table covers ASCII, Latin-1 letters, common ligatures, typographic punctuation,
/// currency and other symbols, spacing accents, a few Latin Extended letters, fractions and
/// superscripts, and a handful of mathematical and Greek glyph names.
fn agl_table() -> &'static HashMap<&'static str, char> {
    const ENTRIES: &[(&str, char)] = &[
        // ASCII punctuation and digits
        ("space", ' '),
        ("exclam", '!'),
        ("quotedbl", '"'),
        ("numbersign", '#'),
        ("dollar", '$'),
        ("percent", '%'),
        ("ampersand", '&'),
        ("quotesingle", '\''),
        ("parenleft", '('),
        ("parenright", ')'),
        ("asterisk", '*'),
        ("plus", '+'),
        ("comma", ','),
        ("hyphen", '-'),
        ("period", '.'),
        ("slash", '/'),
        ("zero", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
        ("colon", ':'),
        ("semicolon", ';'),
        ("less", '<'),
        ("equal", '='),
        ("greater", '>'),
        ("question", '?'),
        ("at", '@'),
        // ASCII upper-case letters
        ("A", 'A'),
        ("B", 'B'),
        ("C", 'C'),
        ("D", 'D'),
        ("E", 'E'),
        ("F", 'F'),
        ("G", 'G'),
        ("H", 'H'),
        ("I", 'I'),
        ("J", 'J'),
        ("K", 'K'),
        ("L", 'L'),
        ("M", 'M'),
        ("N", 'N'),
        ("O", 'O'),
        ("P", 'P'),
        ("Q", 'Q'),
        ("R", 'R'),
        ("S", 'S'),
        ("T", 'T'),
        ("U", 'U'),
        ("V", 'V'),
        ("W", 'W'),
        ("X", 'X'),
        ("Y", 'Y'),
        ("Z", 'Z'),
        ("bracketleft", '['),
        ("backslash", '\\'),
        ("bracketright", ']'),
        ("asciicircum", '^'),
        ("underscore", '_'),
        ("grave", '`'),
        // ASCII lower-case letters
        ("a", 'a'),
        ("b", 'b'),
        ("c", 'c'),
        ("d", 'd'),
        ("e", 'e'),
        ("f", 'f'),
        ("g", 'g'),
        ("h", 'h'),
        ("i", 'i'),
        ("j", 'j'),
        ("k", 'k'),
        ("l", 'l'),
        ("m", 'm'),
        ("n", 'n'),
        ("o", 'o'),
        ("p", 'p'),
        ("q", 'q'),
        ("r", 'r'),
        ("s", 's'),
        ("t", 't'),
        ("u", 'u'),
        ("v", 'v'),
        ("w", 'w'),
        ("x", 'x'),
        ("y", 'y'),
        ("z", 'z'),
        ("braceleft", '{'),
        ("bar", '|'),
        ("braceright", '}'),
        ("asciitilde", '~'),
        // Latin-1 upper-case letters
        ("Agrave", '\u{00C0}'),
        ("Aacute", '\u{00C1}'),
        ("Acircumflex", '\u{00C2}'),
        ("Atilde", '\u{00C3}'),
        ("Adieresis", '\u{00C4}'),
        ("Aring", '\u{00C5}'),
        ("AE", '\u{00C6}'),
        ("Ccedilla", '\u{00C7}'),
        ("Egrave", '\u{00C8}'),
        ("Eacute", '\u{00C9}'),
        ("Ecircumflex", '\u{00CA}'),
        ("Edieresis", '\u{00CB}'),
        ("Igrave", '\u{00CC}'),
        ("Iacute", '\u{00CD}'),
        ("Icircumflex", '\u{00CE}'),
        ("Idieresis", '\u{00CF}'),
        ("Eth", '\u{00D0}'),
        ("Ntilde", '\u{00D1}'),
        ("Ograve", '\u{00D2}'),
        ("Oacute", '\u{00D3}'),
        ("Ocircumflex", '\u{00D4}'),
        ("Otilde", '\u{00D5}'),
        ("Odieresis", '\u{00D6}'),
        ("Ugrave", '\u{00D9}'),
        ("Uacute", '\u{00DA}'),
        ("Ucircumflex", '\u{00DB}'),
        ("Udieresis", '\u{00DC}'),
        ("Yacute", '\u{00DD}'),
        ("Thorn", '\u{00DE}'),
        ("germandbls", '\u{00DF}'),
        // Latin-1 lower-case letters
        ("agrave", '\u{00E0}'),
        ("aacute", '\u{00E1}'),
        ("acircumflex", '\u{00E2}'),
        ("atilde", '\u{00E3}'),
        ("adieresis", '\u{00E4}'),
        ("aring", '\u{00E5}'),
        ("ae", '\u{00E6}'),
        ("ccedilla", '\u{00E7}'),
        ("egrave", '\u{00E8}'),
        ("eacute", '\u{00E9}'),
        ("ecircumflex", '\u{00EA}'),
        ("edieresis", '\u{00EB}'),
        ("igrave", '\u{00EC}'),
        ("iacute", '\u{00ED}'),
        ("icircumflex", '\u{00EE}'),
        ("idieresis", '\u{00EF}'),
        ("eth", '\u{00F0}'),
        ("ntilde", '\u{00F1}'),
        ("ograve", '\u{00F2}'),
        ("oacute", '\u{00F3}'),
        ("ocircumflex", '\u{00F4}'),
        ("otilde", '\u{00F5}'),
        ("odieresis", '\u{00F6}'),
        ("ugrave", '\u{00F9}'),
        ("uacute", '\u{00FA}'),
        ("ucircumflex", '\u{00FB}'),
        ("udieresis", '\u{00FC}'),
        ("yacute", '\u{00FD}'),
        ("thorn", '\u{00FE}'),
        ("ydieresis", '\u{00FF}'),
        // Ligatures
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("ff", '\u{FB00}'),
        ("ffi", '\u{FB03}'),
        ("ffl", '\u{FB04}'),
        // Typographic punctuation and symbols
        ("endash", '\u{2013}'),
        ("emdash", '\u{2014}'),
        ("bullet", '\u{2022}'),
        ("ellipsis", '\u{2026}'),
        ("quoteleft", '\u{2018}'),
        ("quoteright", '\u{2019}'),
        ("quotedblleft", '\u{201C}'),
        ("quotedblright", '\u{201D}'),
        ("quotesinglebase", '\u{201A}'),
        ("quotesinglbase", '\u{201A}'),
        ("quotedblbase", '\u{201E}'),
        ("dagger", '\u{2020}'),
        ("daggerdbl", '\u{2021}'),
        ("perthousand", '\u{2030}'),
        ("guilsinglleft", '\u{2039}'),
        ("guilsinglright", '\u{203A}'),
        ("guillemotleft", '\u{00AB}'),
        ("guillemotright", '\u{00BB}'),
        ("trademark", '\u{2122}'),
        ("copyright", '\u{00A9}'),
        ("registered", '\u{00AE}'),
        ("degree", '\u{00B0}'),
        ("plusminus", '\u{00B1}'),
        ("multiply", '\u{00D7}'),
        ("divide", '\u{00F7}'),
        ("fraction", '\u{2044}'),
        ("Euro", '\u{20AC}'),
        ("sterling", '\u{00A3}'),
        ("yen", '\u{00A5}'),
        ("cent", '\u{00A2}'),
        ("currency", '\u{00A4}'),
        ("section", '\u{00A7}'),
        ("paragraph", '\u{00B6}'),
        ("brokenbar", '\u{00A6}'),
        ("ordfeminine", '\u{00AA}'),
        ("ordmasculine", '\u{00BA}'),
        ("exclamdown", '\u{00A1}'),
        ("questiondown", '\u{00BF}'),
        ("logicalnot", '\u{00AC}'),
        ("mu", '\u{00B5}'),
        // Spacing accents
        ("macron", '\u{00AF}'),
        ("acute", '\u{00B4}'),
        ("cedilla", '\u{00B8}'),
        ("dieresis", '\u{00A8}'),
        ("circumflex", '\u{02C6}'),
        ("tilde", '\u{02DC}'),
        ("caron", '\u{02C7}'),
        ("ring", '\u{02DA}'),
        ("breve", '\u{02D8}'),
        ("dotaccent", '\u{02D9}'),
        ("hungarumlaut", '\u{02DD}'),
        ("ogonek", '\u{02DB}'),
        // Special spaces, hyphens and dots
        ("nbspace", '\u{00A0}'),
        ("nonbreakingspace", '\u{00A0}'),
        ("softhyphen", '\u{00AD}'),
        ("periodcentered", '\u{00B7}'),
        ("middot", '\u{00B7}'),
        // Latin Extended letters
        ("florin", '\u{0192}'),
        ("OE", '\u{0152}'),
        ("oe", '\u{0153}'),
        ("Scaron", '\u{0160}'),
        ("scaron", '\u{0161}'),
        ("Zcaron", '\u{017D}'),
        ("zcaron", '\u{017E}'),
        ("Ydieresis", '\u{0178}'),
        ("Lslash", '\u{0141}'),
        ("lslash", '\u{0142}'),
        ("Oslash", '\u{00D8}'),
        ("oslash", '\u{00F8}'),
        ("dotlessi", '\u{0131}'),
        // Superscripts and fractions
        ("onesuperior", '\u{00B9}'),
        ("twosuperior", '\u{00B2}'),
        ("threesuperior", '\u{00B3}'),
        ("onequarter", '\u{00BC}'),
        ("onehalf", '\u{00BD}'),
        ("threequarters", '\u{00BE}'),
        // Mathematical and Greek
        ("minus", '\u{2212}'),
        ("notequal", '\u{2260}'),
        ("lessequal", '\u{2264}'),
        ("greaterequal", '\u{2265}'),
        ("infinity", '\u{221E}'),
        ("partialdiff", '\u{2202}'),
        ("summation", '\u{2211}'),
        ("product", '\u{220F}'),
        ("integral", '\u{222B}'),
        ("radical", '\u{221A}'),
        ("approxequal", '\u{2248}'),
        ("Delta", '\u{0394}'),
        ("lozenge", '\u{25CA}'),
        ("pi", '\u{03C0}'),
        ("Omega", '\u{03A9}'),
    ];

    static TABLE: OnceLock<HashMap<&'static str, char>> = OnceLock::new();
    TABLE.get_or_init(|| ENTRIES.iter().copied().collect())
}
1087
/// Placeholder character emitted for byte codes flagged as the `ct` ligature; it is turned
/// into the text `ct` when ligatures are decomposed. U+E007 lies in the Private Use Area.
const CT_LIGATURE_MARKER: char = '\u{E007}';
1089
/// Decoded text plus, for every character, whether it came from a `ct` ligature code.
///
/// All mutating methods keep `ct_origins` exactly one entry per `char` of `text`.
#[derive(Clone, Default)]
struct DecodedPdfString {
    /// The decoded characters.
    text: String,
    /// One flag per character of `text`; `true` marks a `ct` ligature origin.
    ct_origins: Vec<bool>,
}

impl DecodedPdfString {
    /// Wraps already-decoded text; no character is marked as a `ct` origin.
    fn from_text(text: String) -> Self {
        let char_count = text.chars().count();
        Self {
            ct_origins: vec![false; char_count],
            text,
        }
    }

    /// Appends one character together with its origin flag.
    fn push_char(&mut self, ch: char, is_ct_origin: bool) {
        self.ct_origins.push(is_ct_origin);
        self.text.push(ch);
    }

    /// Appends every character of `s`, none of them marked as a `ct` origin.
    fn push_str(&mut self, s: &str) {
        self.ct_origins.extend(s.chars().map(|_| false));
        self.text.push_str(s);
    }

    /// Appends the text and the origin flags of `other`.
    fn extend(&mut self, other: DecodedPdfString) {
        let DecodedPdfString { text, ct_origins } = other;
        self.text.push_str(&text);
        self.ct_origins.extend(ct_origins);
    }

    /// `true` when no text has been decoded.
    fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Number of origin flags recorded (one per decoded character).
    fn glyph_count(&self) -> usize {
        self.ct_origins.len()
    }

    /// Iterates over `(character, is_ct_origin)` pairs.
    fn iter(&self) -> impl Iterator<Item = (char, bool)> + '_ {
        let flags = self.ct_origins.iter().copied();
        self.text.chars().zip(flags)
    }
}
1130
1131fn decode_pdf_string_with_font_marked(
1134 bytes: &[u8],
1135 font_info: Option<&FontInfo>,
1136) -> DecodedPdfString {
1137 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1139 let chars: Vec<u16> = bytes[2..]
1140 .chunks(2)
1141 .filter_map(|chunk| {
1142 if chunk.len() == 2 {
1143 Some(u16::from_be_bytes([chunk[0], chunk[1]]))
1144 } else {
1145 None
1146 }
1147 })
1148 .collect();
1149 return DecodedPdfString::from_text(String::from_utf16_lossy(&chars));
1150 }
1151
1152 if let Some(info) = font_info {
1153 if info.is_cid && !info.to_unicode.is_empty() {
1154 let mut result = DecodedPdfString::default();
1156 let mut i = 0;
1157 while i + 1 < bytes.len() {
1158 let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
1159 if let Some(s) = info.to_unicode.get(&code) {
1160 result.push_str(s);
1161 } else {
1162 if let Some(ch) = char::from_u32(code) {
1164 if !ch.is_control() || ch == ' ' || ch == '\t' || ch == '\n' {
1165 result.push_char(ch, false);
1166 }
1167 }
1168 }
1169 i += 2;
1170 }
1171 return result;
1172 }
1173
1174 if !info.is_cid && !info.to_unicode.is_empty() {
1175 let mut result = DecodedPdfString::default();
1177 for &b in bytes {
1178 if let Some(s) = info.to_unicode.get(&(b as u32)) {
1179 result.push_str(s);
1180 } else if info.ct_codes[b as usize] {
1181 result.push_char(CT_LIGATURE_MARKER, true);
1182 } else if let Some(ch) = info.encoding_map[b as usize] {
1183 result.push_char(ch, false);
1184 } else {
1185 let ch = b as char;
1186 if is_printable_or_space(ch) {
1187 result.push_char(ch, false);
1188 }
1189 }
1190 }
1191 return result;
1192 }
1193
1194 if !info.is_cid
1196 && (info.encoding_map.iter().any(|c| c.is_some()) || info.ct_codes.iter().any(|f| *f))
1197 {
1198 let mut result = DecodedPdfString::default();
1199 for &b in bytes {
1200 if info.ct_codes[b as usize] {
1201 result.push_char(CT_LIGATURE_MARKER, true);
1202 } else if let Some(ch) = info.encoding_map[b as usize] {
1203 result.push_char(ch, false);
1204 } else {
1205 let ch = b as char;
1206 if is_printable_or_space(ch) {
1207 result.push_char(ch, false);
1208 }
1209 }
1210 }
1211 return result;
1212 }
1213 }
1214
1215 let mut result = DecodedPdfString::default();
1217 for &b in bytes {
1218 let ch = b as char;
1219 if is_printable_or_space(ch) {
1220 result.push_char(ch, false);
1221 }
1222 }
1223 result
1224}
1225
/// Accepts tab, line feed, carriage return and every character at or above U+0020;
/// rejects the remaining control characters below U+0020.
fn is_printable_or_space(ch: char) -> bool {
    matches!(ch, '\t' | '\n' | '\r') || ch >= ' '
}
1232
/// Compile-time switch: when `true`, `maybe_decompose_decoded` expands ligature characters
/// into their component letters; when `false`, decoded text is returned unchanged.
const LIGATURE_DECOMP: bool = true;
1241
/// Returns the plain-letter spelling of a Latin presentation-form ligature
/// (U+FB00..=U+FB06), or `None` for any other character.
fn decompose_ligature_char(c: char) -> Option<&'static str> {
    const SPELLINGS: [(char, &str); 7] = [
        ('\u{FB00}', "ff"),
        ('\u{FB01}', "fi"),
        ('\u{FB02}', "fl"),
        ('\u{FB03}', "ffi"),
        ('\u{FB04}', "ffl"),
        // Both the long-s-t and the s-t ligature are spelled "st".
        ('\u{FB05}', "st"),
        ('\u{FB06}', "st"),
    ];

    SPELLINGS
        .iter()
        .find(|(ligature, _)| *ligature == c)
        .map(|&(_, spelling)| spelling)
}
1263
1264fn decompose_ligature_char_with_origin(c: char, is_ct_origin: bool) -> Option<&'static str> {
1265 if is_ct_origin && c == CT_LIGATURE_MARKER {
1266 Some("ct")
1267 } else {
1268 decompose_ligature_char(c)
1269 }
1270}
1271
1272fn decompose_glyph_to_string(c: char) -> Option<String> {
1280 use unicode_normalization::UnicodeNormalization;
1281 if let Some(s) = decompose_ligature_char(c) {
1282 return Some(s.to_string());
1283 }
1284 if matches!(c, '\u{FB00}'..='\u{FB4F}') {
1285 let nfkd: String = c.nfkd().collect();
1286 if nfkd != c.to_string() {
1291 return Some(nfkd);
1292 }
1293 }
1294 None
1295}
1296
1297fn decompose_ligatures(s: &str) -> String {
1304 let mut out = String::with_capacity(s.len());
1305 for c in s.chars() {
1306 if let Some(replacement) = decompose_glyph_to_string(c) {
1307 out.push_str(&replacement);
1308 } else {
1309 out.push(c);
1310 }
1311 }
1312 out
1313}
1314
1315fn decompose_decoded_ligatures(s: &DecodedPdfString) -> String {
1316 let mut out = String::with_capacity(s.text.len());
1317 for (c, is_ct_origin) in s.iter() {
1318 if let Some(replacement) = decompose_ligature_char_with_origin(c, is_ct_origin) {
1319 out.push_str(replacement);
1320 } else if let Some(replacement) = decompose_glyph_to_string(c) {
1321 out.push_str(&replacement);
1322 } else {
1323 out.push(c);
1324 }
1325 }
1326 out
1327}
1328
1329fn maybe_decompose_decoded(s: &DecodedPdfString) -> String {
1333 if LIGATURE_DECOMP {
1334 if s.ct_origins.iter().any(|origin| *origin) {
1335 decompose_decoded_ligatures(s)
1336 } else {
1337 decompose_ligatures(&s.text)
1338 }
1339 } else {
1340 s.text.clone()
1341 }
1342}
1343
1344pub fn extract_page_blocks(doc: &Document, page_num: u32) -> Vec<TextBlock> {
1346 let pages = doc.get_pages();
1347 let Some(&page_id) = pages.get(&page_num) else {
1348 return Vec::new();
1349 };
1350
1351 let font_map = build_font_map(doc, page_id);
1352 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1353 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1354 if let Ok(content) = Content::decode(&content_bytes) {
1355 return extract_blocks_from_ops_inner(
1356 &content.operations,
1357 page_num,
1358 &font_map,
1359 Some((doc, &resources)),
1360 0,
1361 None,
1362 );
1363 }
1364 }
1365
1366 Vec::new()
1367}
1368
1369pub fn extract_blocks_from_page_id(
1375 doc: &Document,
1376 page_id: lopdf::ObjectId,
1377 page_num: u32,
1378) -> Vec<TextBlock> {
1379 let font_map = build_font_map(doc, page_id);
1380 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1381 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1382 if let Ok(content) = Content::decode(&content_bytes) {
1383 return extract_blocks_from_ops_inner(
1384 &content.operations,
1385 page_num,
1386 &font_map,
1387 Some((doc, &resources)),
1388 0,
1389 None,
1390 );
1391 }
1392 }
1393 Vec::new()
1394}
1395
1396pub fn extract_text(doc: &Document) -> Vec<TextBlock> {
1398 let pages = doc.get_pages();
1399 let mut blocks = Vec::new();
1400
1401 for (&page_num, &page_id) in &pages {
1402 let font_map = build_font_map(doc, page_id);
1403 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1404 if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1405 if let Ok(content) = Content::decode(&content_bytes) {
1406 let page_blocks = extract_blocks_from_ops_inner(
1407 &content.operations,
1408 page_num,
1409 &font_map,
1410 Some((doc, &resources)),
1411 0,
1412 None,
1413 );
1414 blocks.extend(page_blocks);
1415 }
1416 }
1417 }
1418
1419 blocks
1420}
1421
1422pub fn extract_page_text(doc: &Document, page_num: u32) -> Result<String> {
1424 let pages = doc.get_pages();
1425 let total = pages.len() as u32;
1426
1427 if page_num == 0 || page_num > total {
1428 return Err(ExtractError::PageOutOfRange(page_num, total));
1429 }
1430
1431 let page_id = *pages
1432 .get(&page_num)
1433 .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1434
1435 let font_map = build_font_map(doc, page_id);
1436 let resources = get_page_resources(doc, page_id).unwrap_or_default();
1437 let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1438 let content = match Content::decode(&content_bytes) {
1439 Ok(c) => c,
1440 Err(_) => return Ok(String::new()),
1441 };
1442
1443 let blocks = extract_blocks_from_ops_inner(
1444 &content.operations,
1445 page_num,
1446 &font_map,
1447 Some((doc, &resources)),
1448 0,
1449 None,
1450 );
1451 let text = blocks
1452 .iter()
1453 .map(|b| b.text.as_str())
1454 .collect::<Vec<_>>()
1455 .join("");
1456
1457 Ok(text)
1458}
1459
1460pub fn extract_positioned_chars(doc: &Document, page_num: u32) -> Result<Vec<PositionedChar>> {
1462 let pages = doc.get_pages();
1463 let total = pages.len() as u32;
1464
1465 if page_num == 0 || page_num > total {
1466 return Err(ExtractError::PageOutOfRange(page_num, total));
1467 }
1468
1469 let page_id = *pages
1470 .get(&page_num)
1471 .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1472
1473 let font_map = build_font_map(doc, page_id);
1474 let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1475 let content = match Content::decode(&content_bytes) {
1476 Ok(c) => c,
1477 Err(_) => return Ok(Vec::new()),
1478 };
1479
1480 let chars = extract_chars_from_ops(&content.operations, page_num, &font_map);
1481 Ok(chars)
1482}
1483
1484fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> std::result::Result<Vec<u8>, ()> {
1486 doc.get_page_content(page_id).map_err(|_| ())
1487}
1488
/// One level of the marked-content stack kept while walking a content stream.
///
/// An entry is pushed for every `BMC`/`BDC` operator and popped on `EMC`.
#[derive(Debug, Clone, Default)]
struct MarkedContentEntry {
    /// `/ActualText` from the `BDC` property list; `None` for `BMC` sections
    /// or when the property list carries no usable `/ActualText` string.
    actual_text: Option<String>,
}
1499
1500fn resolve_bdc_properties(
1504 doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1505 operand: &Object,
1506) -> Option<lopdf::Dictionary> {
1507 match operand {
1508 Object::Dictionary(d) => Some(d.clone()),
1509 Object::Reference(r) => doc_and_resources.and_then(|(doc, _)| {
1510 doc.get_object(*r).ok().and_then(|o| match o {
1511 Object::Dictionary(d) => Some(d.clone()),
1512 _ => None,
1513 })
1514 }),
1515 Object::Name(n) => {
1516 let (doc, resources) = doc_and_resources?;
1517 let props = resolve_dict(doc, resources, b"Properties")?;
1518 match props.get(n.as_slice()).ok()? {
1519 Object::Dictionary(d) => Some(d.clone()),
1520 Object::Reference(r) => match doc.get_object(*r).ok()? {
1521 Object::Dictionary(d) => Some(d.clone()),
1522 _ => None,
1523 },
1524 _ => None,
1525 }
1526 }
1527 _ => None,
1528 }
1529}
1530
1531fn decode_pdf_text_string(bytes: &[u8]) -> String {
1535 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1536 let u16s: Vec<u16> = bytes[2..]
1537 .chunks_exact(2)
1538 .map(|c| u16::from_be_bytes([c[0], c[1]]))
1539 .collect();
1540 return String::from_utf16_lossy(&u16s);
1541 }
1542 if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
1543 return String::from_utf8_lossy(&bytes[3..]).into_owned();
1544 }
1545 bytes
1546 .iter()
1547 .filter_map(|&b| {
1548 let ch = b as char;
1549 if is_printable_or_space(ch) {
1550 Some(ch)
1551 } else {
1552 None
1553 }
1554 })
1555 .collect()
1556}
1557
1558fn extract_actual_text(props: &lopdf::Dictionary) -> Option<String> {
1560 let obj = props.get(b"ActualText").ok()?;
1561 match obj {
1562 Object::String(bytes, _) => Some(decode_pdf_text_string(bytes)),
1563 _ => None,
1564 }
1565}
1566
1567fn current_actual_text(stack: &[MarkedContentEntry], inherited: Option<&str>) -> Option<String> {
1573 stack
1574 .iter()
1575 .rev()
1576 .find_map(|e| e.actual_text.clone())
1577 .or_else(|| inherited.map(|s| s.to_string()))
1578}
1579
1580fn extract_blocks_from_ops_inner(
1586 ops: &[Operation],
1587 page: u32,
1588 font_map: &HashMap<String, FontInfo>,
1589 doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1590 depth: u32,
1591 inherited_actual_text: Option<&str>,
1592) -> Vec<TextBlock> {
1593 let mut state = TextState::default();
1594 let mut blocks = Vec::new();
1595 let mut mc_stack: Vec<MarkedContentEntry> = Vec::new();
1596
1597 for op in ops {
1598 match op.operator.as_str() {
1599 "q" => {
1600 state.gs_stack.push(GraphicsState { ctm: state.ctm });
1601 }
1602 "Q" => {
1603 if let Some(gs) = state.gs_stack.pop() {
1604 state.ctm = gs.ctm;
1605 }
1606 }
1607 "cm" => {
1608 if let Some(m) = extract_matrix(&op.operands) {
1609 state.ctm = multiply_matrix(&state.ctm, &m);
1610 }
1611 }
1612 "BT" => {
1613 state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1614 state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1615 }
1616 "Tf" if op.operands.len() >= 2 => {
1617 if let Object::Name(ref name) = op.operands[0] {
1618 state.font_name = String::from_utf8_lossy(name).to_string();
1619 }
1620 if let Some(size) = as_number(&op.operands[1]) {
1621 state.font_size = size;
1622 }
1623 }
1624 "Tc" => {
1625 if let Some(v) = op.operands.first().and_then(as_number) {
1626 state.tc = v;
1627 }
1628 }
1629 "Tw" => {
1630 if let Some(v) = op.operands.first().and_then(as_number) {
1631 state.tw = v;
1632 }
1633 }
1634 "Tz" => {
1635 if let Some(v) = op.operands.first().and_then(as_number) {
1636 state.th = v;
1637 }
1638 }
1639 "TL" => {
1640 if let Some(v) = op.operands.first().and_then(as_number) {
1641 state.tl = v;
1642 }
1643 }
1644 "Ts" => {
1645 if let Some(v) = op.operands.first().and_then(as_number) {
1646 state.ts = v;
1647 }
1648 }
1649 "Td" if op.operands.len() >= 2 => {
1650 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
1651 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
1652 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
1653 state.tlm = new_tlm;
1654 state.tm = new_tlm;
1655 }
1656 "TD" if op.operands.len() >= 2 => {
1657 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
1658 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
1659 state.tl = -ty;
1660 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
1661 state.tlm = new_tlm;
1662 state.tm = new_tlm;
1663 }
1664 "Tm" => {
1665 if let Some(m) = extract_matrix(&op.operands) {
1666 state.tm = m;
1667 state.tlm = m;
1668 }
1669 }
1670 "T*" => {
1671 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1672 state.tlm = new_tlm;
1673 state.tm = new_tlm;
1674 }
1675 "Tj" => {
1676 let fi = font_map.get(&state.font_name);
1677 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
1678 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1679
1680 if !text.is_empty() {
1684 let x = state.tm[4];
1685 let y = state.tm[5];
1686 let text_width = text.glyph_count() as f64 * char_w;
1690 let display_text = maybe_decompose_decoded(&text);
1691 blocks.push(TextBlock {
1692 text: display_text,
1693 page,
1694 bbox: [x, y, x + text_width, y + state.font_size],
1695 font_name: state.font_name.clone(),
1696 font_size: state.font_size,
1697 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1698 });
1699 }
1700
1701 for _ in text.iter() {
1703 state.tm[4] += char_w + state.tc;
1704 }
1705 }
1706 }
1707 "TJ" => {
1708 if let Some(Object::Array(ref arr)) = op.operands.first() {
1709 let x_start = state.tm[4];
1710 let y = state.tm[5];
1711 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1712 let mut combined_text = DecodedPdfString::default();
1713 let fi = font_map.get(&state.font_name);
1714
1715 for item in arr {
1716 match item {
1717 Object::String(bytes, _) => {
1718 let text = decode_pdf_string_with_font_marked(bytes, fi);
1719 for _ in text.iter() {
1720 state.tm[4] += char_w + state.tc;
1721 }
1722 combined_text.extend(text);
1723 }
1724 _ => {
1725 if let Some(adj) = as_number(item) {
1726 state.tm[4] -= adj / 1000.0 * state.font_size;
1728 }
1729 }
1730 }
1731 }
1732
1733 if !combined_text.is_empty() {
1734 let x_end = state.tm[4];
1735 blocks.push(TextBlock {
1736 text: maybe_decompose_decoded(&combined_text),
1737 page,
1738 bbox: [x_start, y, x_end, y + state.font_size],
1739 font_name: state.font_name.clone(),
1740 font_size: state.font_size,
1741 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1742 });
1743 }
1744 }
1745 }
1746 "'" => {
1747 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1749 state.tlm = new_tlm;
1750 state.tm = new_tlm;
1751
1752 let fi = font_map.get(&state.font_name);
1753 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
1754 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1755
1756 if !text.is_empty() {
1757 let x = state.tm[4];
1758 let y = state.tm[5];
1759 let text_width = text.glyph_count() as f64 * char_w;
1762 let display_text = maybe_decompose_decoded(&text);
1763 blocks.push(TextBlock {
1764 text: display_text,
1765 page,
1766 bbox: [x, y, x + text_width, y + state.font_size],
1767 font_name: state.font_name.clone(),
1768 font_size: state.font_size,
1769 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1770 });
1771 }
1772
1773 for _ in text.iter() {
1774 state.tm[4] += char_w + state.tc;
1775 }
1776 }
1777 }
1778 "\"" if op.operands.len() >= 3 => {
1779 if let Some(tw) = as_number(&op.operands[0]) {
1781 state.tw = tw;
1782 }
1783 if let Some(tc) = as_number(&op.operands[1]) {
1784 state.tc = tc;
1785 }
1786
1787 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1788 state.tlm = new_tlm;
1789 state.tm = new_tlm;
1790
1791 let fi = font_map.get(&state.font_name);
1792 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands[2..], fi)
1793 {
1794 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1795
1796 if !text.is_empty() {
1797 let x = state.tm[4];
1798 let y = state.tm[5];
1799 let text_width = text.glyph_count() as f64 * char_w;
1802 let display_text = maybe_decompose_decoded(&text);
1803 blocks.push(TextBlock {
1804 text: display_text,
1805 page,
1806 bbox: [x, y, x + text_width, y + state.font_size],
1807 font_name: state.font_name.clone(),
1808 font_size: state.font_size,
1809 actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1810 });
1811 }
1812
1813 for _ in text.iter() {
1814 state.tm[4] += char_w + state.tc;
1815 }
1816 }
1817 }
1818 "BMC" => {
1819 mc_stack.push(MarkedContentEntry::default());
1823 }
1824 "BDC" => {
1825 let actual_text = op
1829 .operands
1830 .get(1)
1831 .and_then(|o| resolve_bdc_properties(doc_and_resources, o))
1832 .as_ref()
1833 .and_then(extract_actual_text);
1834 mc_stack.push(MarkedContentEntry { actual_text });
1835 }
1836 "EMC" => {
1837 mc_stack.pop();
1840 }
1841 "Do" if depth < 5 => {
1842 if let Some((doc, resources)) = doc_and_resources {
1847 if let Some(Object::Name(ref xobj_name)) = op.operands.first() {
1848 let xobj_name_str = String::from_utf8_lossy(xobj_name);
1849 let inherited_for_child =
1850 current_actual_text(&mc_stack, inherited_actual_text);
1851 if let Some(xobj_blocks) = extract_form_xobject_text(
1852 doc,
1853 resources,
1854 &xobj_name_str,
1855 page,
1856 font_map,
1857 depth,
1858 inherited_for_child.as_deref(),
1859 ) {
1860 blocks.extend(xobj_blocks);
1861 }
1862 }
1863 }
1864 }
1865 _ => {}
1866 }
1867 }
1868
1869 blocks
1870}
1871
1872fn extract_form_xobject_text(
1879 doc: &Document,
1880 resources: &lopdf::Dictionary,
1881 name: &str,
1882 page: u32,
1883 font_map: &HashMap<String, FontInfo>,
1884 depth: u32,
1885 inherited_actual_text: Option<&str>,
1886) -> Option<Vec<TextBlock>> {
1887 let xobj_dict = match resources.get(b"XObject").ok()? {
1889 Object::Dictionary(d) => d.clone(),
1890 Object::Reference(r) => match doc.get_object(*r).ok()? {
1891 Object::Dictionary(d) => d.clone(),
1892 _ => return None,
1893 },
1894 _ => return None,
1895 };
1896
1897 let xobj_ref = match xobj_dict.get(name.as_bytes()).ok()? {
1898 Object::Reference(r) => *r,
1899 _ => return None,
1900 };
1901
1902 let stream = match doc.get_object(xobj_ref).ok()? {
1903 Object::Stream(s) => s.clone(),
1904 _ => return None,
1905 };
1906
1907 let subtype = stream
1909 .dict
1910 .get(b"Subtype")
1911 .ok()
1912 .and_then(|o| match o {
1913 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
1914 _ => None,
1915 })
1916 .unwrap_or_default();
1917 if subtype != "Form" {
1918 return None;
1919 }
1920
1921 let content_bytes = stream
1923 .decompressed_content()
1924 .ok()
1925 .unwrap_or_else(|| stream.content.clone());
1926 let content = Content::decode(&content_bytes).ok()?;
1927
1928 let mut xobj_font_map = font_map.clone();
1931 if let Some(xobj_resources) = resolve_dict(doc, &stream.dict, b"Resources") {
1932 if let Some(xobj_fonts) = resolve_dict(doc, &xobj_resources, b"Font") {
1933 for (name_bytes, value) in xobj_fonts.iter() {
1934 let fname = String::from_utf8_lossy(name_bytes).to_string();
1935 if let Some(fi) = build_font_info_from_value(doc, value) {
1936 xobj_font_map.insert(fname, fi);
1937 }
1938 }
1939 }
1940 }
1941
1942 let xobj_resources =
1944 resolve_dict(doc, &stream.dict, b"Resources").unwrap_or_else(|| resources.clone());
1945
1946 Some(extract_blocks_from_ops_inner(
1947 &content.operations,
1948 page,
1949 &xobj_font_map,
1950 Some((doc, &xobj_resources)),
1951 depth + 1,
1952 inherited_actual_text,
1953 ))
1954}
1955
1956fn build_font_info_from_value(doc: &Document, value: &Object) -> Option<FontInfo> {
1958 let font = match value {
1959 Object::Reference(r) => match doc.get_object(*r).ok()? {
1960 Object::Dictionary(d) => d.clone(),
1961 _ => return None,
1962 },
1963 Object::Dictionary(d) => d.clone(),
1964 _ => return None,
1965 };
1966
1967 let subtype = font
1968 .get(b"Subtype")
1969 .ok()
1970 .and_then(|o| match o {
1971 Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
1972 _ => None,
1973 })
1974 .unwrap_or_default();
1975
1976 let is_cid = subtype == "Type0";
1977 let mut to_unicode = parse_to_unicode_from_font(doc, &font);
1978
1979 if to_unicode.is_empty() && is_cid {
1980 if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
1981 for d in descendants {
1982 let desc_dict = match d {
1983 Object::Reference(r) => {
1984 let Some(Object::Dictionary(d)) = doc.get_object(*r).ok() else {
1987 continue;
1988 };
1989 d
1990 }
1991 Object::Dictionary(d) => d,
1992 _ => continue,
1993 };
1994 let tu = parse_to_unicode_from_font(doc, desc_dict);
1995 if !tu.is_empty() {
1996 to_unicode = tu;
1997 break;
1998 }
1999 }
2000 }
2001 }
2002
2003 let (encoding_map, ct_codes) = if !is_cid {
2004 build_encoding_map(doc, &font)
2005 } else {
2006 ([None; 256], [false; 256])
2007 };
2008
2009 Some(FontInfo {
2010 is_cid,
2011 to_unicode,
2012 encoding_map,
2013 ct_codes,
2014 })
2015}
2016
2017fn push_glyph_positioned(
2031 chars: &mut Vec<PositionedChar>,
2032 state: &mut TextState,
2033 page: u32,
2034 glyph: char,
2035 is_ct_origin: bool,
2036 char_w: f64,
2037) {
2038 let (gx, gy) = apply_ctm(state);
2039 let constituents: Option<String> = if LIGATURE_DECOMP {
2040 if let Some(s) = decompose_ligature_char_with_origin(glyph, is_ct_origin) {
2044 Some(s.to_string())
2045 } else {
2046 decompose_glyph_to_string(glyph)
2047 }
2048 } else {
2049 None
2050 };
2051 match constituents {
2052 None => {
2053 chars.push(PositionedChar {
2054 ch: glyph,
2055 page,
2056 bbox: [gx, gy, gx + char_w, gy + state.font_size],
2057 });
2058 }
2059 Some(s) => {
2060 let n = s.chars().count() as f64;
2061 let part_w = char_w / n;
2062 for (i, c) in s.chars().enumerate() {
2063 let x = gx + part_w * i as f64;
2064 chars.push(PositionedChar {
2065 ch: c,
2066 page,
2067 bbox: [x, gy, x + part_w, gy + state.font_size],
2068 });
2069 }
2070 }
2071 }
2072 state.tm[4] += char_w + state.tc;
2073}
2074
2075fn extract_chars_from_ops(
2077 ops: &[Operation],
2078 page: u32,
2079 font_map: &HashMap<String, FontInfo>,
2080) -> Vec<PositionedChar> {
2081 let mut state = TextState::default();
2082 let mut chars = Vec::new();
2083
2084 for op in ops {
2085 match op.operator.as_str() {
2086 "q" => {
2087 state.gs_stack.push(GraphicsState { ctm: state.ctm });
2088 }
2089 "Q" => {
2090 if let Some(gs) = state.gs_stack.pop() {
2091 state.ctm = gs.ctm;
2092 }
2093 }
2094 "cm" => {
2095 if let Some(m) = extract_matrix(&op.operands) {
2096 state.ctm = multiply_matrix(&state.ctm, &m);
2097 }
2098 }
2099 "BT" => {
2100 state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2101 state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2102 }
2103 "Tf" if op.operands.len() >= 2 => {
2104 if let Object::Name(ref name) = op.operands[0] {
2105 state.font_name = String::from_utf8_lossy(name).to_string();
2106 }
2107 if let Some(size) = as_number(&op.operands[1]) {
2108 state.font_size = size;
2109 }
2110 }
2111 "Tc" => {
2112 if let Some(v) = op.operands.first().and_then(as_number) {
2113 state.tc = v;
2114 }
2115 }
2116 "Tw" => {
2117 if let Some(v) = op.operands.first().and_then(as_number) {
2118 state.tw = v;
2119 }
2120 }
2121 "Tz" => {
2122 if let Some(v) = op.operands.first().and_then(as_number) {
2123 state.th = v;
2124 }
2125 }
2126 "TL" => {
2127 if let Some(v) = op.operands.first().and_then(as_number) {
2128 state.tl = v;
2129 }
2130 }
2131 "Ts" => {
2132 if let Some(v) = op.operands.first().and_then(as_number) {
2133 state.ts = v;
2134 }
2135 }
2136 "Td" if op.operands.len() >= 2 => {
2137 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2138 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2139 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2140 state.tlm = new_tlm;
2141 state.tm = new_tlm;
2142 }
2143 "TD" if op.operands.len() >= 2 => {
2144 let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2145 let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2146 state.tl = -ty;
2147 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2148 state.tlm = new_tlm;
2149 state.tm = new_tlm;
2150 }
2151 "Tm" => {
2152 if let Some(m) = extract_matrix(&op.operands) {
2153 state.tm = m;
2154 state.tlm = m;
2155 }
2156 }
2157 "T*" => {
2158 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2159 state.tlm = new_tlm;
2160 state.tm = new_tlm;
2161 }
2162 "Tj" => {
2163 let fi = font_map.get(&state.font_name);
2164 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2165 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2166 for (ch, is_ct_origin) in text.iter() {
2167 push_glyph_positioned(
2168 &mut chars,
2169 &mut state,
2170 page,
2171 ch,
2172 is_ct_origin,
2173 char_w,
2174 );
2175 }
2176 }
2177 }
2178 "TJ" => {
2179 if let Some(Object::Array(ref arr)) = op.operands.first() {
2180 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2181 let fi = font_map.get(&state.font_name);
2182 for item in arr {
2183 match item {
2184 Object::String(bytes, _) => {
2185 let text = decode_pdf_string_with_font_marked(bytes, fi);
2186 for (ch, is_ct_origin) in text.iter() {
2187 push_glyph_positioned(
2188 &mut chars,
2189 &mut state,
2190 page,
2191 ch,
2192 is_ct_origin,
2193 char_w,
2194 );
2195 }
2196 }
2197 _ => {
2198 if let Some(adj) = as_number(item) {
2199 state.tm[4] -= adj / 1000.0 * state.font_size;
2200 }
2201 }
2202 }
2203 }
2204 }
2205 }
2206 "'" => {
2207 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2208 state.tlm = new_tlm;
2209 state.tm = new_tlm;
2210
2211 let fi = font_map.get(&state.font_name);
2212 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2213 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2214 for (ch, is_ct_origin) in text.iter() {
2215 push_glyph_positioned(
2216 &mut chars,
2217 &mut state,
2218 page,
2219 ch,
2220 is_ct_origin,
2221 char_w,
2222 );
2223 }
2224 }
2225 }
2226 "\"" if op.operands.len() >= 3 => {
2227 if let Some(tw) = as_number(&op.operands[0]) {
2228 state.tw = tw;
2229 }
2230 if let Some(tc) = as_number(&op.operands[1]) {
2231 state.tc = tc;
2232 }
2233
2234 let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2235 state.tlm = new_tlm;
2236 state.tm = new_tlm;
2237
2238 let fi = font_map.get(&state.font_name);
2239 if let Some(text) = extract_decoded_string_operand_with_font(&op.operands[2..], fi)
2240 {
2241 let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2242 for (ch, is_ct_origin) in text.iter() {
2243 push_glyph_positioned(
2244 &mut chars,
2245 &mut state,
2246 page,
2247 ch,
2248 is_ct_origin,
2249 char_w,
2250 );
2251 }
2252 }
2253 }
2254 _ => {}
2255 }
2256 }
2257
2258 chars
2259}
2260
2261#[inline]
2270fn apply_ctm(state: &TextState) -> (f64, f64) {
2271 let x = state.ctm[0] * state.tm[4] + state.ctm[2] * state.tm[5] + state.ctm[4];
2272 let y = state.ctm[1] * state.tm[4] + state.ctm[3] * state.tm[5] + state.ctm[5];
2273 (x, y)
2274}
2275
2276fn extract_decoded_string_operand_with_font(
2278 operands: &[Object],
2279 font_info: Option<&FontInfo>,
2280) -> Option<DecodedPdfString> {
2281 for op in operands {
2282 if let Object::String(bytes, _) = op {
2283 return Some(decode_pdf_string_with_font_marked(bytes, font_info));
2284 }
2285 }
2286 None
2287}
2288
2289fn as_number(obj: &Object) -> Option<f64> {
2291 match obj {
2292 Object::Integer(i) => Some(*i as f64),
2293 Object::Real(f) => Some(*f as f64),
2294 _ => None,
2295 }
2296}
2297
2298fn extract_matrix(operands: &[Object]) -> Option<[f64; 6]> {
2300 if operands.len() < 6 {
2301 return None;
2302 }
2303 let a = as_number(&operands[0])?;
2304 let b = as_number(&operands[1])?;
2305 let c = as_number(&operands[2])?;
2306 let d = as_number(&operands[3])?;
2307 let e = as_number(&operands[4])?;
2308 let f = as_number(&operands[5])?;
2309 Some([a, b, c, d, e, f])
2310}
2311
/// Returns the product `m1 × m2` of two affine matrices given in PDF's
/// `[a, b, c, d, e, f]` layout.
///
/// With row vectors (`[x y 1] × M`), the result applies `m1` first and `m2`
/// second.
fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    let [a1, b1, c1, d1, e1, f1] = *m1;
    let [a2, b2, c2, d2, e2, f2] = *m2;
    [
        a1 * a2 + b1 * c2,
        a1 * b2 + b1 * d2,
        c1 * a2 + d1 * c2,
        c1 * b2 + d1 * d2,
        e1 * a2 + f1 * c2 + e2,
        e1 * b2 + f1 * d2 + f2,
    ]
}
2323
2324#[cfg(test)]
2325mod tests {
2326 use super::*;
2327 use lopdf::{dictionary, Document, Object, Stream};
2328
2329 #[test]
2336 fn hex_to_unicode_string_does_not_panic_on_replacement_char_input() {
2337 let _ = hex_to_unicode_string("\u{FFFD}F");
2338 let _ = hex_to_unicode_string("0041\u{FFFD}");
2339 let _ = hex_to_unicode_string("\u{FFFD}\u{FFFD}");
2340 let _ = hex_to_unicode_string("\u{FFFD}");
2341 assert_eq!(hex_to_unicode_string("0041"), "A");
2343 }
2344
2345 fn make_doc_with_text(content: &[u8]) -> Document {
2347 let mut doc = Document::with_version("1.7");
2348
2349 let content_stream = Stream::new(dictionary! {}, content.to_vec());
2350 let content_id = doc.add_object(Object::Stream(content_stream));
2351
2352 let page_dict = dictionary! {
2353 "Type" => "Page",
2354 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2355 "Contents" => Object::Reference(content_id),
2356 };
2357 let page_id = doc.add_object(Object::Dictionary(page_dict));
2358
2359 let pages_dict = dictionary! {
2360 "Type" => "Pages",
2361 "Kids" => vec![Object::Reference(page_id)],
2362 "Count" => 1_i64,
2363 };
2364 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2365
2366 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2367 d.set("Parent", Object::Reference(pages_id));
2368 }
2369
2370 let catalog = dictionary! {
2371 "Type" => "Catalog",
2372 "Pages" => Object::Reference(pages_id),
2373 };
2374 let catalog_id = doc.add_object(Object::Dictionary(catalog));
2375 doc.trailer.set("Root", Object::Reference(catalog_id));
2376
2377 doc
2378 }
2379
2380 #[test]
2381 fn extract_simple_text() {
2382 let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
2383 let blocks = extract_text(&doc);
2384 assert_eq!(blocks.len(), 1);
2385 assert_eq!(blocks[0].text, "Hello World");
2386 assert_eq!(blocks[0].page, 1);
2387 assert_eq!(blocks[0].font_size, 12.0);
2388 }
2389
2390 #[test]
2391 fn extract_page_text_single() {
2392 let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
2393 let text = extract_page_text(&doc, 1).unwrap();
2394 assert_eq!(text, "Hello");
2395 }
2396
2397 #[test]
2398 fn extract_page_text_out_of_range() {
2399 let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
2400 let result = extract_page_text(&doc, 5);
2401 assert!(result.is_err());
2402 }
2403
2404 #[test]
2405 fn extract_positioned_chars_basic() {
2406 let doc = make_doc_with_text(b"BT /F1 12 Tf (AB) Tj ET");
2407 let chars = extract_positioned_chars(&doc, 1).unwrap();
2408 assert_eq!(chars.len(), 2);
2409 assert_eq!(chars[0].ch, 'A');
2410 assert_eq!(chars[1].ch, 'B');
2411 assert_eq!(chars[0].page, 1);
2412 assert!(chars[1].bbox[0] > chars[0].bbox[0]);
2414 }
2415
2416 #[test]
2417 fn extract_tj_array() {
2418 let doc = make_doc_with_text(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET");
2419 let blocks = extract_text(&doc);
2420 assert_eq!(blocks.len(), 1);
2421 assert_eq!(blocks[0].text, "Hello");
2422 }
2423
2424 #[test]
2442 fn word_split_baseline_pdf_extract_layer_combines_tj_pieces() {
2443 let doc1 = make_doc_with_text(
2445 b"BT /F1 12 Tf [(Range) -100 (Rs)] TJ ET");
2446 let b1 = extract_text(&doc1);
2447 assert_eq!(b1.len(), 1, "small tracking: 1 block expected");
2448 assert_eq!(b1[0].text, "RangeRs",
2449 "pdf-extract combines TJ pieces irrespective of tracking");
2450
2451 let doc2 = make_doc_with_text(
2456 b"BT /F1 12 Tf [(Range) -2000 (Rs)] TJ ET");
2457 let b2 = extract_text(&doc2);
2458 assert_eq!(b2.len(), 1,
2459 "large tracking: still 1 block at pdf-extract layer");
2460 assert_eq!(b2[0].text, "RangeRs",
2461 "pdf-extract layer is unaffected by tracking magnitude");
2462
2463 let doc3 = make_doc_with_text(
2466 b"BT /F1 12 Tf (Range) Tj (Rs) Tj ET");
2467 let b3 = extract_text(&doc3);
2468 assert_eq!(b3.len(), 2, "two Tj's = two blocks");
2469 assert_eq!(b3[0].text, "Range");
2470 assert_eq!(b3[1].text, "Rs");
2471 }
2472
2473 #[test]
2474 fn empty_page_extracts_no_text() {
2475 let doc = make_doc_with_text(b"q Q");
2476 let blocks = extract_text(&doc);
2477 assert!(blocks.is_empty());
2478 }
2479
2480 #[test]
2481 fn multiline_text_extraction() {
2482 let doc = make_doc_with_text(b"BT /F1 12 Tf 12 TL (Line1) Tj T* (Line2) Tj ET");
2483 let blocks = extract_text(&doc);
2484 assert_eq!(blocks.len(), 2);
2485 assert_eq!(blocks[0].text, "Line1");
2486 assert_eq!(blocks[1].text, "Line2");
2487 }
2488
2489 #[test]
2493 fn actual_text_basic_bdc_override() {
2494 let doc = make_doc_with_text(b"BT /F1 12 Tf /Span <</ActualText (fi)>> BDC (X) Tj EMC ET");
2495 let blocks = extract_text(&doc);
2496 assert_eq!(blocks.len(), 1);
2497 assert_eq!(blocks[0].text, "X");
2498 assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
2499 }
2500
2501 #[test]
2505 fn actual_text_nested_bdc_inner_overrides_outer() {
2506 let doc = make_doc_with_text(
2507 b"BT /F1 12 Tf \
2508 /Span <</ActualText (outer)>> BDC \
2509 (A) Tj \
2510 /Span <</ActualText (inner)>> BDC \
2511 (B) Tj \
2512 EMC \
2513 (C) Tj \
2514 EMC ET",
2515 );
2516 let blocks = extract_text(&doc);
2517 assert_eq!(blocks.len(), 3);
2518 assert_eq!(blocks[0].text, "A");
2519 assert_eq!(blocks[0].actual_text.as_deref(), Some("outer"));
2520 assert_eq!(blocks[1].text, "B");
2521 assert_eq!(blocks[1].actual_text.as_deref(), Some("inner"));
2522 assert_eq!(blocks[2].text, "C");
2523 assert_eq!(blocks[2].actual_text.as_deref(), Some("outer"));
2524 }
2525
2526 #[test]
2529 fn actual_text_bmc_does_not_leak_emc() {
2530 let doc = make_doc_with_text(
2531 b"BT /F1 12 Tf \
2532 /Span <</ActualText (X)>> BDC \
2533 /Artifact BMC (in) Tj EMC \
2534 (out) Tj \
2535 EMC \
2536 (after) Tj ET",
2537 );
2538 let blocks = extract_text(&doc);
2539 assert_eq!(blocks.len(), 3);
2540 assert_eq!(blocks[0].actual_text.as_deref(), Some("X"));
2541 assert_eq!(blocks[1].actual_text.as_deref(), Some("X"));
2542 assert_eq!(blocks[2].actual_text, None);
2543 }
2544
2545 fn make_doc_with_xobj_inside_bdc(
2551 page_pre: &[u8],
2552 xobj_content: &[u8],
2553 page_post: &[u8],
2554 actual_text: &str,
2555 ) -> Document {
2556 let mut doc = Document::with_version("1.7");
2557
2558 let xobj_dict = dictionary! {
2559 "Type" => "XObject",
2560 "Subtype" => "Form",
2561 "BBox" => vec![0.into(), 0.into(), 100.into(), 100.into()],
2562 };
2563 let xobj_stream = Stream::new(xobj_dict, xobj_content.to_vec());
2564 let xobj_id = doc.add_object(Object::Stream(xobj_stream));
2565
2566 let mut page_content = Vec::<u8>::new();
2568 page_content.extend_from_slice(b"BT /F1 12 Tf /Span <</ActualText (");
2569 page_content.extend_from_slice(actual_text.as_bytes());
2570 page_content.extend_from_slice(b")>> BDC ");
2571 page_content.extend_from_slice(page_pre);
2572 page_content.extend_from_slice(b" /Fm0 Do ");
2573 page_content.extend_from_slice(page_post);
2574 page_content.extend_from_slice(b" EMC ET");
2575
2576 let content_stream = Stream::new(dictionary! {}, page_content);
2577 let content_id = doc.add_object(Object::Stream(content_stream));
2578
2579 let resources = dictionary! {
2580 "XObject" => dictionary! { "Fm0" => Object::Reference(xobj_id) },
2581 };
2582 let resources_id = doc.add_object(Object::Dictionary(resources));
2583
2584 let page_dict = dictionary! {
2585 "Type" => "Page",
2586 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2587 "Contents" => Object::Reference(content_id),
2588 "Resources" => Object::Reference(resources_id),
2589 };
2590 let page_id = doc.add_object(Object::Dictionary(page_dict));
2591
2592 let pages_dict = dictionary! {
2593 "Type" => "Pages",
2594 "Kids" => vec![Object::Reference(page_id)],
2595 "Count" => 1_i64,
2596 };
2597 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2598
2599 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2600 d.set("Parent", Object::Reference(pages_id));
2601 }
2602
2603 let catalog = dictionary! {
2604 "Type" => "Catalog",
2605 "Pages" => Object::Reference(pages_id),
2606 };
2607 let catalog_id = doc.add_object(Object::Dictionary(catalog));
2608 doc.trailer.set("Root", Object::Reference(catalog_id));
2609
2610 doc
2611 }
2612
2613 #[test]
2620 fn actual_text_propagates_into_form_xobject_recursion() {
2621 let doc = make_doc_with_xobj_inside_bdc(
2622 b"(pre) Tj",
2623 b"BT /F1 12 Tf (inside) Tj ET",
2624 b"(post) Tj",
2625 "wrapped",
2626 );
2627 let blocks = extract_text(&doc);
2628 assert_eq!(blocks.len(), 3);
2629 assert_eq!(blocks[0].text, "pre");
2630 assert_eq!(blocks[0].actual_text.as_deref(), Some("wrapped"));
2631 assert_eq!(blocks[1].text, "inside");
2634 assert_eq!(
2635 blocks[1].actual_text.as_deref(),
2636 Some("wrapped"),
2637 "Form XObject text lost surrounding /ActualText (issue #1358)"
2638 );
2639 assert_eq!(blocks[2].text, "post");
2640 assert_eq!(blocks[2].actual_text.as_deref(), Some("wrapped"));
2641 }
2642
2643 #[test]
2650 fn actual_text_inner_xobj_bdc_overrides_inherited() {
2651 let doc = make_doc_with_xobj_inside_bdc(
2652 b"",
2653 b"BT /F1 12 Tf \
2654 /Span <</ActualText (inner)>> BDC (B) Tj EMC \
2655 (after) Tj ET",
2656 b"",
2657 "outer",
2658 );
2659 let blocks = extract_text(&doc);
2660 assert_eq!(blocks.len(), 2);
2661 assert_eq!(blocks[0].text, "B");
2662 assert_eq!(blocks[0].actual_text.as_deref(), Some("inner"));
2663 assert_eq!(blocks[1].text, "after");
2664 assert_eq!(blocks[1].actual_text.as_deref(), Some("outer"));
2665 }
2666
2667 #[test]
2670 fn actual_text_utf16be_bom_decodes() {
2671 let doc = make_doc_with_text(
2673 b"BT /F1 12 Tf /Span <</ActualText <FEFF00660069> >> BDC (X) Tj EMC ET",
2674 );
2675 let blocks = extract_text(&doc);
2676 assert_eq!(blocks.len(), 1);
2677 assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
2678 }
2679
2680 #[test]
2683 fn ligature_ff_decomposes() {
2684 assert_eq!(decompose_ligature_char('\u{FB00}'), Some("ff"));
2685 assert_eq!(decompose_ligatures("\u{FB00}"), "ff");
2686 assert_eq!(decompose_ligatures("o\u{FB00}ice"), "office");
2687 }
2688
#[test]
/// U+FB01 decomposes to "fi", both as a single character and inside a word.
fn ligature_fi_decomposes() {
    assert_eq!(decompose_ligature_char('\u{FB01}'), Some("fi"));

    // (input, expected output) for the string-level decomposition.
    let cases = [("\u{FB01}", "fi"), ("of\u{FB01}ce", "office")];
    for (input, want) in cases.iter() {
        assert_eq!(decompose_ligatures(*input), *want);
    }
}
2695
#[test]
/// U+FB02 decomposes to "fl", both as a single character and at the start of
/// a word.
fn ligature_fl_decomposes() {
    let single = decompose_ligature_char('\u{FB02}');
    assert_eq!(single, Some("fl"));

    let word = decompose_ligatures("\u{FB02}ame");
    assert_eq!(word, "flame");
}
2701
#[test]
/// U+FB03 decomposes to "ffi", both as a single character and inside a word.
fn ligature_ffi_decomposes() {
    let single = decompose_ligature_char('\u{FB03}');
    assert_eq!(single, Some("ffi"));

    let word = decompose_ligatures("o\u{FB03}ce");
    assert_eq!(word, "office");
}
2707
#[test]
/// U+FB04 decomposes to "ffl", both as a single character and inside a word.
fn ligature_ffl_decomposes() {
    let single = decompose_ligature_char('\u{FB04}');
    assert_eq!(single, Some("ffl"));

    let word = decompose_ligatures("ba\u{FB04}e");
    assert_eq!(word, "baffle");
}
2713
#[test]
/// Both U+FB06 and U+FB05 decompose to "st", and the glyph name "st" maps
/// back to U+FB06.
fn ligature_st_decomposes() {
    for ligature in ['\u{FB06}', '\u{FB05}'].iter() {
        assert_eq!(decompose_ligature_char(*ligature), Some("st"));
    }

    let word = decompose_ligatures("fa\u{FB06}");
    assert_eq!(word, "fast");

    let from_name = glyph_name_to_unicode("st");
    assert_eq!(from_name, Some('\u{FB06}'));
}
2726
#[test]
/// The glyph name "ct" has no single-code-point mapping
/// (`glyph_name_to_unicode` returns `None`), yet a font whose code 1 is
/// remapped to `/ct` must still extract bytes `61 01` as the text "act".
fn ligature_ct_via_glyph_name_emits_string() {
    assert!(glyph_name_to_unicode("ct").is_none());

    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <6101> Tj ET", "ct");
    let runs = extract_text(&pdf);
    assert_eq!(runs.len(), 1);

    let text = &runs[0].text;
    assert_eq!(*text, "act");
}
2742
#[test]
/// A ToUnicode entry that maps a code to the private-use code point U+E007
/// must come through unchanged; in particular it must not be rewritten to
/// the string "ct".
fn tounicode_pua_e007_is_preserved() {
    let pdf = make_doc_with_tounicode_cmap(b"BT /F1 12 Tf <01> Tj ET", 0x01, 0xE007);
    let runs = extract_text(&pdf);
    assert_eq!(runs.len(), 1);

    let text = &runs[0].text;
    assert_eq!(*text, "\u{E007}");
    // Explicit guard against the ligature substitution, kept from the original.
    assert_ne!(*text, "ct");
}
2755
#[test]
/// The "ct" ligature glyph expands to the characters 'c' and 't' but advances
/// by a single glyph width: in 'a' + ct + 'b', `bbox[0]` of 'c' is one
/// approximate glyph width and `bbox[0]` of 'b' is two.
fn ligature_ct_positioned_chars_advance_as_single_glyph() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <610162> Tj ET", "ct");
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();

    let letters = glyphs.iter().map(|g| g.ch).collect::<String>();
    assert_eq!(letters, "actb");
    assert_eq!(glyphs.len(), 4);

    // Font size 12 times the approximate per-glyph width factor.
    let advance = 12.0 * APPROX_CHAR_WIDTH;
    let close = |got: f64, want: f64| (got - want).abs() < 1e-6;
    assert!(close(glyphs[1].bbox[0], advance));
    assert!(close(glyphs[3].bbox[0], 2.0 * advance));
}
2772
2773 fn make_doc_with_ligature_font(content: &[u8], glyph_name: &str) -> Document {
2778 let mut doc = Document::with_version("1.7");
2779
2780 let content_stream = Stream::new(dictionary! {}, content.to_vec());
2781 let content_id = doc.add_object(Object::Stream(content_stream));
2782
2783 let encoding = dictionary! {
2784 "Type" => "Encoding",
2785 "BaseEncoding" => "WinAnsiEncoding",
2786 "Differences" => vec![
2787 1_i64.into(),
2788 Object::Name(glyph_name.as_bytes().to_vec()),
2789 ],
2790 };
2791 let encoding_id = doc.add_object(Object::Dictionary(encoding));
2792
2793 let font = dictionary! {
2794 "Type" => "Font",
2795 "Subtype" => "Type1",
2796 "BaseFont" => "Helvetica",
2797 "Encoding" => Object::Reference(encoding_id),
2798 };
2799 let font_id = doc.add_object(Object::Dictionary(font));
2800
2801 let resources = dictionary! {
2802 "Font" => dictionary! { "F1" => Object::Reference(font_id) },
2803 };
2804 let resources_id = doc.add_object(Object::Dictionary(resources));
2805
2806 let page_dict = dictionary! {
2807 "Type" => "Page",
2808 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2809 "Contents" => Object::Reference(content_id),
2810 "Resources" => Object::Reference(resources_id),
2811 };
2812 let page_id = doc.add_object(Object::Dictionary(page_dict));
2813
2814 let pages_dict = dictionary! {
2815 "Type" => "Pages",
2816 "Kids" => vec![Object::Reference(page_id)],
2817 "Count" => 1_i64,
2818 };
2819 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2820
2821 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2822 d.set("Parent", Object::Reference(pages_id));
2823 }
2824
2825 let catalog = dictionary! {
2826 "Type" => "Catalog",
2827 "Pages" => Object::Reference(pages_id),
2828 };
2829 let catalog_id = doc.add_object(Object::Dictionary(catalog));
2830 doc.trailer.set("Root", Object::Reference(catalog_id));
2831
2832 doc
2833 }
2834
#[test]
/// End-to-end check: bytes `6F 01 63 65`, with code 1 remapped to the `/ffi`
/// glyph, must extract as the plain word "office".
fn ligature_office_golden_ffi() {
    let stream = b"BT /F1 12 Tf <6F016365> Tj ET";
    let pdf = make_doc_with_ligature_font(stream, "ffi");
    let runs = extract_text(&pdf);

    assert_eq!(runs.len(), 1);
    let text = &runs[0].text;
    assert_eq!(*text, "office");
}
2848
2849 fn make_doc_with_tounicode_cmap(content: &[u8], src: u8, dst: u32) -> Document {
2854 let mut doc = Document::with_version("1.7");
2855
2856 let content_stream = Stream::new(dictionary! {}, content.to_vec());
2857 let content_id = doc.add_object(Object::Stream(content_stream));
2858
2859 let cmap_text = format!(
2861 "/CIDInit /ProcSet findresource begin\n\
2862 12 dict begin\n\
2863 begincmap\n\
2864 /CMapType 2 def\n\
2865 1 beginbfchar\n\
2866 <{:02X}> <{:04X}>\n\
2867 endbfchar\n\
2868 endcmap CMapName currentdict /CMap defineresource pop end end",
2869 src, dst
2870 );
2871 let cmap_stream = Stream::new(dictionary! {}, cmap_text.into_bytes());
2872 let cmap_id = doc.add_object(Object::Stream(cmap_stream));
2873
2874 let font = dictionary! {
2875 "Type" => "Font",
2876 "Subtype" => "Type1",
2877 "BaseFont" => "Helvetica",
2878 "Encoding" => "WinAnsiEncoding",
2879 "ToUnicode" => Object::Reference(cmap_id),
2880 };
2881 let font_id = doc.add_object(Object::Dictionary(font));
2882
2883 let resources = dictionary! {
2884 "Font" => dictionary! { "F1" => Object::Reference(font_id) },
2885 };
2886 let resources_id = doc.add_object(Object::Dictionary(resources));
2887
2888 let page_dict = dictionary! {
2889 "Type" => "Page",
2890 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2891 "Contents" => Object::Reference(content_id),
2892 "Resources" => Object::Reference(resources_id),
2893 };
2894 let page_id = doc.add_object(Object::Dictionary(page_dict));
2895
2896 let pages_dict = dictionary! {
2897 "Type" => "Pages",
2898 "Kids" => vec![Object::Reference(page_id)],
2899 "Count" => 1_i64,
2900 };
2901 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2902
2903 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2904 d.set("Parent", Object::Reference(pages_id));
2905 }
2906
2907 let catalog = dictionary! {
2908 "Type" => "Catalog",
2909 "Pages" => Object::Reference(pages_id),
2910 };
2911 let catalog_id = doc.add_object(Object::Dictionary(catalog));
2912 doc.trailer.set("Root", Object::Reference(catalog_id));
2913
2914 doc
2915 }
2916
#[test]
/// A single "ffi" ligature glyph yields three positioned characters
/// ('f', 'f', 'i') whose boxes do not overlap (within 1e-9) and which
/// together span one approximate glyph width at size 12 (within 1e-6).
fn ligature_positioned_chars_split_bbox() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();
    assert_eq!(glyphs.len(), 3, "ffi → 3 chars");

    let letters: Vec<char> = glyphs.iter().map(|g| g.ch).collect();
    assert_eq!(letters, ['f', 'f', 'i']);

    // Each box must end no later than the next one starts.
    for pair in glyphs.windows(2) {
        assert!(pair[0].bbox[2] <= pair[1].bbox[0] + 1e-9);
    }

    let span = glyphs[2].bbox[2] - glyphs[0].bbox[0];
    let one_glyph = 12.0 * APPROX_CHAR_WIDTH;
    assert!((span - one_glyph).abs() < 1e-6);
}
2937
#[test]
/// Issue #1359 (problem 1): for a glyph named `uniFB13`,
/// `extract_positioned_chars` must produce the same characters, in the same
/// number, as `extract_text`, and together they must span one approximate
/// glyph width at size 12.
fn positioned_chars_match_extracted_text_for_armenian_ligature() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB13");
    let runs = extract_text(&pdf);
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();
    assert_eq!(runs.len(), 1);

    let run_text = &runs[0].text;
    let glyph_text: String = glyphs.iter().map(|g| g.ch).collect();
    assert_eq!(
        run_text.chars().count(),
        glyphs.len(),
        "PositionedChar count drifted from extract_text() char count \
         (issue #1359 problem 1): blocks[0].text = {:?}, chars = {:?}",
        run_text,
        glyph_text,
    );
    assert_eq!(*run_text, glyph_text);

    // Right edge of the last character minus left edge of the first.
    let right = glyphs.last().unwrap().bbox[2];
    let left = glyphs.first().unwrap().bbox[0];
    let one_glyph = 12.0 * APPROX_CHAR_WIDTH;
    assert!(((right - left) - one_glyph).abs() < 1e-6);
}
2972
#[test]
/// A glyph named `uniFB21` must extract as the single code point U+05D0,
/// and `decompose_glyph_to_string` must give the same mapping for U+FB21.
fn hebrew_presentation_form_extracts_single_codepoint_nfkd_mapping() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
    let runs = extract_text(&pdf);
    assert_eq!(runs.len(), 1);

    let text = &runs[0].text;
    assert_eq!(*text, "\u{05D0}");

    let decomposed = decompose_glyph_to_string('\u{FB21}');
    assert_eq!(decomposed.as_deref(), Some("\u{05D0}"));
}
2988
#[test]
/// For a glyph named `uniFB21`, the positioned characters must spell the
/// same text as `extract_text` (U+05D0) and have the same character count.
fn positioned_chars_match_extracted_text_for_hebrew_presentation_form() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
    let runs = extract_text(&pdf);
    let glyphs = extract_positioned_chars(&pdf, 1).unwrap();
    let glyph_text = glyphs.iter().map(|g| g.ch).collect::<String>();

    assert_eq!(runs.len(), 1);
    let run_text = &runs[0].text;
    assert_eq!(*run_text, "\u{05D0}");
    assert_eq!(glyph_text, *run_text);
    assert_eq!(run_text.chars().count(), glyphs.len());
}
3004
#[test]
/// Issue #1359 (problem 2): a `Tj` showing one "ffi" ligature glyph extracts
/// as the three-character text "ffi", but the block's bbox width must stay
/// one approximate glyph width at size 12.
fn tj_block_width_matches_rendered_glyph_count() {
    let pdf = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
    let runs = extract_text(&pdf);
    assert_eq!(runs.len(), 1);

    let run = &runs[0];
    assert_eq!(run.text, "ffi");

    let width = run.bbox[2] - run.bbox[0];
    let expected = 12.0 * APPROX_CHAR_WIDTH;
    let within_tolerance = (width - expected).abs() < 1e-6;
    assert!(
        within_tolerance,
        "Tj bbox width {} != one glyph's char_w {} (issue #1359 problem 2)",
        width,
        expected,
    );
}
3026
#[test]
/// Issue #1359 (problem 2): showing the same single "ffi" ligature glyph via
/// `Tj` and via a one-element `TJ` array must give text blocks of equal bbox
/// width (within 1e-6).
fn tj_and_tj_array_bbox_widths_agree_for_ligature() {
    // Extracts the single text block drawn by `stream` and returns its width.
    fn sole_block_width(stream: &[u8]) -> f64 {
        let pdf = make_doc_with_ligature_font(stream, "ffi");
        let runs = extract_text(&pdf);
        assert_eq!(runs.len(), 1);
        runs[0].bbox[2] - runs[0].bbox[0]
    }

    let tj_w = sole_block_width(b"BT /F1 12 Tf <01> Tj ET");
    let tj_arr_w = sole_block_width(b"BT /F1 12 Tf [<01>] TJ ET");
    assert!(
        (tj_w - tj_arr_w).abs() < 1e-6,
        "Tj bbox width {} disagrees with TJ bbox width {} for the same \
         single-glyph ligature input (issue #1359 problem 2)",
        tj_w,
        tj_arr_w,
    );
}
3050}