1use std::collections::HashMap;
17
18use lopdf::{Dictionary, Document, Object};
19
20use crate::pdfium_backend::Glyph;
21
/// 2-D affine transform stored as the six numbers `[a b c d e f]`.
///
/// A point `(x, y)` is mapped to `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform.
    const ID: Mat = Mat {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };

    /// Composes two transforms so that the result applies `self` first and
    /// `m` second.
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Transforms the point `(x, y)` and returns the mapped coordinates.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let mapped_x = self.a * x + self.c * y + self.e;
        let mapped_y = self.b * x + self.d * y + self.f;
        (mapped_x, mapped_y)
    }
}
62
63struct Font {
65 two_byte: bool,
67 to_unicode: HashMap<u32, String>,
69 widths: HashMap<u32, f64>,
71 default_width: f64,
72 simple_encoding: Option<HashMap<u8, char>>,
74 fallback_names: HashMap<u8, String>,
78 program_encoding: HashMap<u8, char>,
82 ascent: f64,
83 descent: f64,
84 hash: u64,
85}
86
87impl Font {
88 fn decode_code(&self, code: u32) -> (Option<String>, f64) {
89 let w = self
90 .widths
91 .get(&code)
92 .copied()
93 .unwrap_or(self.default_width);
94 if let Some(s) = self.to_unicode.get(&code) {
95 return (Some(decompose_ligatures(s)), w);
96 }
97 if !self.two_byte {
98 if let Some(name) = self.fallback_names.get(&(code as u8)) {
101 return (Some(format!("/{name}")), w);
102 }
103 if let Some(enc) = &self.simple_encoding {
104 if let Some(&ch) = enc.get(&(code as u8)) {
105 return (Some(decompose_ligatures(&ch.to_string())), w);
106 }
107 }
108 if let Some(&ch) = self.program_encoding.get(&(code as u8)) {
116 return (Some(decompose_ligatures(&ch.to_string())), w);
117 }
118 }
119 (None, w)
120 }
121}
122
/// Replaces the Latin ligature code points U+FB00..=U+FB06 with the plain
/// letters they stand for; every other character is copied unchanged.
fn decompose_ligatures(s: &str) -> String {
    /// Letters spelled out by a ligature code point, if `c` is one.
    fn expansion(c: char) -> Option<&'static str> {
        Some(match c {
            '\u{FB00}' => "ff",
            '\u{FB01}' => "fi",
            '\u{FB02}' => "fl",
            '\u{FB03}' => "ffi",
            '\u{FB04}' => "ffl",
            '\u{FB05}' => "ft",
            '\u{FB06}' => "st",
            _ => return None,
        })
    }

    // Fast path: most strings contain no ligature at all.
    if s.chars().all(|c| expansion(c).is_none()) {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match expansion(c) {
            Some(letters) => out.push_str(letters),
            None => out.push(c),
        }
    }
    out
}
146
/// Hashes a font resource name with the standard library's `DefaultHasher`.
fn hash_name(name: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    Hash::hash(name, &mut hasher);
    hasher.finish()
}
153
154fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
156 match obj {
157 Object::Dictionary(d) => Some(d),
158 Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
159 _ => None,
160 }
161}
162
163fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
164 match obj {
165 Object::Reference(id) => doc.get_object(*id).ok(),
166 other => Some(other),
167 }
168}
169
170fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
172 let subtype: &[u8] = fdict
173 .get(b"Subtype")
174 .ok()
175 .and_then(|o| o.as_name().ok())
176 .unwrap_or(&[]);
177 let two_byte = subtype == b"Type0".as_slice();
178
179 let to_unicode = fdict
180 .get(b"ToUnicode")
181 .ok()
182 .and_then(|o| deref(doc, o))
183 .and_then(|o| o.as_stream().ok())
184 .and_then(|s| s.decompressed_content().ok())
185 .map(|data| parse_tounicode(&data))
186 .unwrap_or_default();
187
188 let (widths, default_width) = if two_byte {
189 cid_widths(doc, fdict)
190 } else {
191 simple_widths(doc, fdict)
192 };
193
194 let simple_encoding = if two_byte {
195 None
196 } else {
197 Some(simple_encoding_table(doc, fdict))
198 };
199 let fallback_names = if two_byte {
200 HashMap::new()
201 } else {
202 differences_gid_names(doc, fdict)
203 };
204 let program_encoding = if two_byte {
205 HashMap::new()
206 } else {
207 type1_program_encoding(doc, fdict)
208 };
209
210 let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
211
212 Font {
213 two_byte,
214 to_unicode,
215 widths,
216 default_width,
217 simple_encoding,
218 fallback_names,
219 program_encoding,
220 ascent,
221 descent,
222 hash: hash_name(name),
223 }
224}
225
226fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
233 let mut map = HashMap::new();
234 let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
235 else {
236 return map;
237 };
238 let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
239 else {
240 return map;
241 };
242 let mut code = 0u8;
243 for el in diffs {
244 match el {
245 Object::Integer(i) => code = *i as u8,
246 Object::Name(name) => {
247 if glyph_name_to_char(name).is_none() && is_gid_name(name) {
248 map.insert(code, String::from_utf8_lossy(name).into_owned());
249 }
250 code = code.wrapping_add(1);
251 }
252 _ => {}
253 }
254 }
255 map
256}
257
258fn type1_program_encoding(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
266 let mut map = HashMap::new();
267 let Some(desc) = fdict
268 .get(b"FontDescriptor")
269 .ok()
270 .and_then(|o| deref(doc, o))
271 .and_then(|o| o.as_dict().ok())
272 else {
273 return map;
274 };
275 let Some(data) = desc
276 .get(b"FontFile")
277 .ok()
278 .and_then(|o| deref(doc, o))
279 .and_then(|o| o.as_stream().ok())
280 .and_then(|s| s.decompressed_content().ok())
281 else {
282 return map;
283 };
284 let head_end = data
286 .windows(5)
287 .position(|w| w == b"eexec")
288 .unwrap_or(data.len());
289 let head = String::from_utf8_lossy(&data[..head_end]);
290 let toks: Vec<&str> = head.split_whitespace().collect();
292 for w in toks.windows(4) {
293 if w[0] == "dup" && w[3] == "put" {
294 if let (Ok(code), Some(name)) = (w[1].parse::<u32>(), w[2].strip_prefix('/')) {
295 if code <= 255 {
296 if let Some(ch) = glyph_name_to_char(name.as_bytes()) {
297 map.insert(code as u8, ch);
298 }
299 }
300 }
301 }
302 }
303 map
304}
305
/// Tells whether a glyph name looks like a bare glyph index rather than a
/// meaningful name.
///
/// Names starting with `afii` or `uni` never qualify. Otherwise a name
/// qualifies when it is one of `g`, `G`, `cid`, `CID`, `glyph`, `index`
/// followed only by digits, or one to three ASCII letters followed by at
/// least three digits and nothing else.
fn is_gid_name(name: &[u8]) -> bool {
    const INDEX_PREFIXES: [&str; 6] = ["g", "G", "cid", "CID", "glyph", "index"];

    let s = match std::str::from_utf8(name) {
        Ok(s) => s,
        Err(_) => return false,
    };
    if s.starts_with("afii") || s.starts_with("uni") {
        return false;
    }

    let has_index_prefix = INDEX_PREFIXES.iter().any(|&prefix| {
        s.strip_prefix(prefix).map_or(false, |rest| {
            !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit())
        })
    });
    if has_index_prefix {
        return true;
    }

    let letters = s.bytes().take_while(u8::is_ascii_alphabetic).count();
    let tail = &s.as_bytes()[letters..];
    matches!(letters, 1..=3) && tail.len() >= 3 && tail.iter().all(u8::is_ascii_digit)
}
334
335fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
336 let descr_owner = if two_byte {
338 fdict
339 .get(b"DescendantFonts")
340 .ok()
341 .and_then(|o| deref(doc, o))
342 .and_then(|o| match o {
343 Object::Array(a) => a.first(),
344 _ => None,
345 })
346 .and_then(|o| as_dict(doc, o))
347 } else {
348 Some(fdict)
349 };
350 let fd = descr_owner
351 .and_then(|d| d.get(b"FontDescriptor").ok())
352 .and_then(|o| as_dict(doc, o));
353 let asc = fd
354 .and_then(|d| d.get(b"Ascent").ok())
355 .and_then(|o| {
356 o.as_float()
357 .ok()
358 .or_else(|| o.as_i64().ok().map(|i| i as f32))
359 })
360 .unwrap_or(750.0) as f64;
361 let desc = fd
362 .and_then(|d| d.get(b"Descent").ok())
363 .and_then(|o| {
364 o.as_float()
365 .ok()
366 .or_else(|| o.as_i64().ok().map(|i| i as f32))
367 })
368 .unwrap_or(-250.0) as f64;
369 if asc - desc <= 1.0 {
376 return (750.0, -250.0);
377 }
378 (asc, desc)
379}
380
381fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
383 let mut map = HashMap::new();
384 let first = fdict
385 .get(b"FirstChar")
386 .ok()
387 .and_then(|o| o.as_i64().ok())
388 .unwrap_or(0) as u32;
389 if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
390 for (i, w) in arr.iter().enumerate() {
391 if let Some(w) = num(w) {
392 map.insert(first + i as u32, w);
393 }
394 }
395 }
396 let dw = fdict
397 .get(b"FontDescriptor")
398 .ok()
399 .and_then(|o| as_dict(doc, o))
400 .and_then(|d| d.get(b"MissingWidth").ok())
401 .and_then(num)
402 .unwrap_or(0.0);
403 (map, dw)
404}
405
406fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
408 let mut map = HashMap::new();
409 let Some(desc) = fdict
410 .get(b"DescendantFonts")
411 .ok()
412 .and_then(|o| deref(doc, o))
413 .and_then(|o| match o {
414 Object::Array(a) => a.first(),
415 _ => None,
416 })
417 .and_then(|o| as_dict(doc, o))
418 else {
419 return (map, 1000.0);
420 };
421 let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
422 if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
423 let mut i = 0;
424 while i < w.len() {
425 let c = w.get(i).and_then(num);
426 match (c, w.get(i + 1)) {
427 (Some(c), Some(Object::Array(list))) => {
429 for (k, wv) in list.iter().enumerate() {
430 if let Some(wv) = num(wv) {
431 map.insert(c as u32 + k as u32, wv);
432 }
433 }
434 i += 2;
435 }
436 (Some(c1), Some(o2)) => {
438 if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
439 for cid in c1 as u32..=c2 as u32 {
440 map.insert(cid, wv);
441 }
442 }
443 i += 3;
444 }
445 _ => break,
446 }
447 }
448 }
449 (map, dw)
450}
451
452fn num(o: &Object) -> Option<f64> {
453 match o {
454 Object::Integer(i) => Some(*i as f64),
455 Object::Real(r) => Some(*r as f64),
456 _ => None,
457 }
458}
459
/// Parses the `bfchar` / `bfrange` sections of a `ToUnicode` CMap into a
/// map from character code to text.
///
/// Source codes are folded big-endian into a `u32`. Destinations are decoded
/// as UTF-16BE. Three `bfrange` shapes are handled:
///
/// * `<lo> <hi> [<d0> <d1> ...]` — one destination per code, starting at `lo`;
/// * `<lo> <hi> <dst>` with a single 16-bit destination — code `lo + k`
///   maps to the scalar `dst + k`;
/// * `<lo> <hi> <dst>` with a longer destination (surrogate pair or several
///   characters) — the last 16-bit unit is incremented by `k` and the whole
///   sequence is decoded as UTF-16, the same way `bfchar` decodes its
///   destinations.
///
/// Malformed tokens are skipped. A single range produces at most
/// `MAX_RANGE_LEN` entries so a hostile `<00000000> <FFFFFFFF>` range cannot
/// exhaust memory.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    /// Cap on entries generated by one `bfrange` line.
    const MAX_RANGE_LEN: u32 = 0x1_0000;

    let text = String::from_utf8_lossy(data);
    let mut map = HashMap::new();

    // `<hex>` token → big-endian 16-bit units (a trailing odd byte becomes
    // its own unit). Returns `None` for anything not wrapped in `<...>`.
    let hex = |s: &str| -> Option<Vec<u16>> {
        let h = s.trim().strip_prefix('<')?.strip_suffix('>')?;
        let bytes: Vec<u8> = (0..h.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(h.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| match *c {
                    [hi, lo] => u16::from_be_bytes([hi, lo]),
                    _ => u16::from(c[0]),
                })
                .collect(),
        )
    };
    let u16s_to_string = |u: &[u16]| String::from_utf16_lossy(u);
    let code_of = |u: &[u16]| u.iter().fold(0u32, |acc, &x| (acc << 16) | u32::from(x));

    // Tokenise: `<...>` groups, the brackets `[` and `]`, and bare words.
    // Every token boundary falls on an ASCII byte or on either end of the
    // text, so `text.get` always succeeds; it is used only to stay panic-free.
    let tokens: Vec<&str> = {
        let bytes = text.as_bytes();
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let c = bytes[i];
            if c.is_ascii_whitespace() {
                i += 1;
                continue;
            }
            let start = i;
            if c == b'<' {
                while i < bytes.len() && bytes[i] != b'>' {
                    i += 1;
                }
                // Include the closing `>` when there is one.
                i = (i + 1).min(bytes.len());
            } else if c == b'[' || c == b']' {
                i += 1;
            } else {
                while i < bytes.len()
                    && !bytes[i].is_ascii_whitespace()
                    && bytes[i] != b'<'
                    && bytes[i] != b'['
                    && bytes[i] != b']'
                {
                    i += 1;
                }
            }
            if let Some(tok) = text.get(start..i) {
                toks.push(tok);
            }
        }
        toks
    };

    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (hex(tokens[i]), hex(tokens[i + 1])) {
                        map.insert(code_of(&src), u16s_to_string(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (hex(tokens[i]), hex(tokens[i + 1])) else {
                        i += 1;
                        continue;
                    };
                    let lo = code_of(&lo);
                    let hi = code_of(&hi);
                    if tokens[i + 2] == "[" {
                        // One explicit destination per consecutive code.
                        let mut j = i + 3;
                        let mut code = lo;
                        while j < tokens.len() && tokens[j] != "]" {
                            if let Some(dst) = hex(tokens[j]) {
                                map.insert(code, u16s_to_string(&dst));
                            }
                            code = code.wrapping_add(1);
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = hex(tokens[i + 2]) {
                        let count = if hi >= lo {
                            (hi - lo).min(MAX_RANGE_LEN - 1) + 1
                        } else {
                            0
                        };
                        for k in 0..count {
                            // `lo + k <= hi`, so this cannot overflow.
                            let code = lo + k;
                            let mapped = match dst.split_last() {
                                Some((&last, head)) if !head.is_empty() => {
                                    // Multi-unit destination: only the last
                                    // unit advances; stop once it would no
                                    // longer fit in 16 bits.
                                    let Ok(unit) = u16::try_from(u32::from(last) + k) else {
                                        break;
                                    };
                                    let mut units = head.to_vec();
                                    units.push(unit);
                                    Some(u16s_to_string(&units))
                                }
                                // Zero or one unit: plain scalar arithmetic
                                // (both operands are below 0x1_0000).
                                _ => char::from_u32(code_of(&dst) + k).map(|ch| ch.to_string()),
                            };
                            if let Some(s) = mapped {
                                map.insert(code, s);
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
579
580fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
582 if font.two_byte {
583 bytes
584 .chunks(2)
585 .map(|c| {
586 if c.len() == 2 {
587 ((c[0] as u32) << 8) | c[1] as u32
588 } else {
589 c[0] as u32
590 }
591 })
592 .collect()
593 } else {
594 bytes.iter().map(|&b| b as u32).collect()
595 }
596}
597
598fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
600 let mb = doc
601 .get_object(page_id)
602 .ok()
603 .and_then(|o| o.as_dict().ok())
604 .and_then(|d| {
605 d.get(b"MediaBox").ok().cloned()
607 })
608 .or_else(|| {
609 doc.get_dictionary(page_id)
610 .ok()
611 .and_then(|d| d.get(b"MediaBox").ok().cloned())
612 });
613 if let Some(Object::Array(a)) = mb {
614 let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
615 if v.len() == 4 {
616 return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
617 }
618 }
619 (612.0, 792.0)
620}
621
622pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
625 let Ok(doc) = Document::load_mem(bytes) else {
626 return Vec::new();
627 };
628 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
629 pages.sort_by_key(|(n, _)| *n);
630 let Some((_, pid)) = pages.get(index) else {
631 return Vec::new();
632 };
633 page_glyphs(&doc, *pid)
634 .into_iter()
635 .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
636 .collect()
637}
638
639pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
643 let Ok(doc) = Document::load_mem(bytes) else {
644 return Vec::new();
645 };
646 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
647 pages.sort_by_key(|(n, _)| *n);
648 pages
649 .into_iter()
650 .map(|(_, pid)| {
651 let (w, h) = page_size(&doc, pid);
652 let glyphs = page_glyphs(&doc, pid);
653 let cells = crate::dp_lines::line_cells(&glyphs, h, true);
654 (w, h, cells)
655 })
656 .collect()
657}
658
659pub fn pdf_words(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
664 let Ok(doc) = Document::load_mem(bytes) else {
665 return Vec::new();
666 };
667 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
668 pages.sort_by_key(|(n, _)| *n);
669 pages
670 .into_iter()
671 .map(|(_, pid)| {
672 let (w, h) = page_size(&doc, pid);
673 let glyphs = page_glyphs(&doc, pid);
674 let cells = crate::dp_lines::word_cells(&glyphs, h, true);
675 (w, h, cells)
676 })
677 .collect()
678}
679
/// The three cell groupings produced for one page by `pdf_all_cells`.
#[derive(Default)]
pub struct PageParserCells {
    /// Cells built by `dp_lines::line_cells`.
    pub prose: Vec<crate::pdfium_backend::TextCell>,
    /// Cells built by `dp_lines::word_cells`.
    pub words: Vec<crate::pdfium_backend::TextCell>,
    /// Cells built by `pdfium_backend::code_cells_from_glyphs`.
    pub code: Vec<crate::pdfium_backend::TextCell>,
}
689
690pub fn pdf_all_cells(bytes: &[u8]) -> Vec<PageParserCells> {
695 let Ok(doc) = Document::load_mem(bytes) else {
696 return Vec::new();
697 };
698 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
699 pages.sort_by_key(|(n, _)| *n);
700 pages
701 .into_iter()
702 .map(|(_, pid)| {
703 let (_w, h) = page_size(&doc, pid);
704 let glyphs = page_glyphs(&doc, pid);
705 PageParserCells {
706 prose: crate::dp_lines::line_cells(&glyphs, h, true),
707 words: crate::dp_lines::word_cells(&glyphs, h, true),
708 code: crate::pdfium_backend::code_cells_from_glyphs(&glyphs, h),
709 }
710 })
711 .collect()
712}
713
/// Text-state parameters handed to `run_content` as its starting values,
/// and passed on to nested form XObjects with the values current at the
/// `Do` operator.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (operand of `Tc`).
    tc: f64,
    /// Word spacing (operand of `Tw`).
    tw: f64,
    /// Horizontal scaling as a fraction (operand of `Tz` divided by 100).
    th: f64,
    /// Leading (operand of `TL`; `TD` sets it to the negated y offset).
    tl: f64,
    /// Text rise (operand of `Ts`).
    trise: f64,
    /// Font size (second operand of `Tf`).
    fsize: f64,
}

impl TextState {
    /// Starting state used for a page: everything zero except horizontal
    /// scaling, which is 1.
    const INIT: TextState = TextState {
        tc: 0.0,
        tw: 0.0,
        th: 1.0,
        tl: 0.0,
        trise: 0.0,
        fsize: 0.0,
    };
}
737
738fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
741 let (inline, ids) = doc.get_page_resources(page_id).ok()?;
742 if let Some(d) = inline {
743 return Some(d);
744 }
745 ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
746}
747
748fn fonts_from_res(doc: &Document, res: &Dictionary) -> HashMap<Vec<u8>, Font> {
750 let mut map = HashMap::new();
751 let font_dict = res
752 .get(b"Font")
753 .ok()
754 .and_then(|o| deref(doc, o))
755 .and_then(|o| o.as_dict().ok());
756 if let Some(fd) = font_dict {
757 for (name, value) in fd.iter() {
758 if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
759 map.insert(name.clone(), parse_font(doc, name, fdict));
760 }
761 }
762 }
763 map
764}
765
766pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
768 let mut out = Vec::new();
769 let Ok(content_bytes) = doc.get_page_content(page_id) else {
770 return out;
771 };
772 let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
773 return out;
774 };
775 if let Some(res) = page_res(doc, page_id) {
776 run_content(doc, res, &content, Mat::ID, TextState::INIT, 0, &mut out);
777 }
778 out
779}
780
781fn run_content(
786 doc: &Document,
787 res: &Dictionary,
788 content: &lopdf::content::Content,
789 base_ctm: Mat,
790 init: TextState,
791 depth: u32,
792 out: &mut Vec<Glyph>,
793) {
794 let fonts = fonts_from_res(doc, res);
795 let xobjects = res
796 .get(b"XObject")
797 .ok()
798 .and_then(|o| deref(doc, o))
799 .and_then(|o| o.as_dict().ok());
800
801 #[allow(clippy::type_complexity)]
806 let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Font>)> = Vec::new();
807 let mut ctm = base_ctm;
808 let mut tm = Mat::ID;
809 let mut tlm = Mat::ID;
810 let mut font: Option<&Font> = None;
811 let mut fsize = init.fsize;
812 let mut tc = init.tc; let mut tw = init.tw; let mut th = init.th; let mut tl = init.tl; let mut trise = init.trise;
817
818 let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
819
820 for op in &content.operations {
821 let operands = &op.operands;
822 match op.operator.as_str() {
823 "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
824 "Q" => {
825 if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
826 ctm = c;
827 tc = a;
828 tw = b;
829 th = h;
830 tl = l;
831 trise = r;
832 fsize = fs;
833 font = f;
834 }
835 }
836 "cm" => {
837 let m = Mat {
838 a: op_f(operands, 0),
839 b: op_f(operands, 1),
840 c: op_f(operands, 2),
841 d: op_f(operands, 3),
842 e: op_f(operands, 4),
843 f: op_f(operands, 5),
844 };
845 ctm = m.then(ctm);
846 }
847 "BT" => {
848 tm = Mat::ID;
849 tlm = Mat::ID;
850 }
851 "ET" => {}
852 "Tf" => {
853 if let Some(Object::Name(n)) = operands.first() {
854 font = fonts.get(n.as_slice());
855 }
856 fsize = op_f(operands, 1);
857 }
858 "Td" => {
859 tlm = Mat {
860 a: 1.0,
861 b: 0.0,
862 c: 0.0,
863 d: 1.0,
864 e: op_f(operands, 0),
865 f: op_f(operands, 1),
866 }
867 .then(tlm);
868 tm = tlm;
869 }
870 "TD" => {
871 tl = -op_f(operands, 1);
872 tlm = Mat {
873 a: 1.0,
874 b: 0.0,
875 c: 0.0,
876 d: 1.0,
877 e: op_f(operands, 0),
878 f: op_f(operands, 1),
879 }
880 .then(tlm);
881 tm = tlm;
882 }
883 "Tm" => {
884 tlm = Mat {
885 a: op_f(operands, 0),
886 b: op_f(operands, 1),
887 c: op_f(operands, 2),
888 d: op_f(operands, 3),
889 e: op_f(operands, 4),
890 f: op_f(operands, 5),
891 };
892 tm = tlm;
893 }
894 "T*" => {
895 tlm = Mat {
896 a: 1.0,
897 b: 0.0,
898 c: 0.0,
899 d: 1.0,
900 e: 0.0,
901 f: -tl,
902 }
903 .then(tlm);
904 tm = tlm;
905 }
906 "Tc" => tc = op_f(operands, 0),
907 "Tw" => tw = op_f(operands, 0),
908 "Tz" => th = op_f(operands, 0) / 100.0,
909 "TL" => tl = op_f(operands, 0),
910 "Ts" => trise = op_f(operands, 0),
911 "Tj" | "'" | "\"" => {
912 if op.operator == "'" || op.operator == "\"" {
913 tlm = Mat {
915 a: 1.0,
916 b: 0.0,
917 c: 0.0,
918 d: 1.0,
919 e: 0.0,
920 f: -tl,
921 }
922 .then(tlm);
923 tm = tlm;
924 }
925 if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
926 show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
927 }
928 }
929 "TJ" => {
930 if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
931 for el in arr {
932 match el {
933 Object::String(s, _) => {
934 show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
935 }
936 other => {
937 if let Some(adj) = num(other) {
938 let tx = -adj / 1000.0 * fsize * th;
940 tm = Mat {
941 a: 1.0,
942 b: 0.0,
943 c: 0.0,
944 d: 1.0,
945 e: tx,
946 f: 0.0,
947 }
948 .then(tm);
949 }
950 }
951 }
952 }
953 }
954 }
955 "Do" => {
956 if depth >= 8 {
959 continue;
960 }
961 let Some(Object::Name(n)) = operands.first() else {
962 continue;
963 };
964 let stream = xobjects
965 .and_then(|d| d.get(n.as_slice()).ok())
966 .and_then(|o| deref(doc, o))
967 .and_then(|o| o.as_stream().ok());
968 let Some(stream) = stream else { continue };
969 let is_form = stream
970 .dict
971 .get(b"Subtype")
972 .ok()
973 .and_then(|o| o.as_name().ok())
974 == Some(b"Form".as_slice());
975 if !is_form {
976 continue;
977 }
978 let Ok(data) = stream.decompressed_content() else {
979 continue;
980 };
981 let Ok(form_content) = lopdf::content::Content::decode(&data) else {
982 continue;
983 };
984 let form_mat = match stream.dict.get(b"Matrix").ok() {
986 Some(Object::Array(a)) if a.len() == 6 => {
987 let v: Vec<f64> = a.iter().filter_map(num).collect();
988 if v.len() == 6 {
989 Mat {
990 a: v[0],
991 b: v[1],
992 c: v[2],
993 d: v[3],
994 e: v[4],
995 f: v[5],
996 }
997 } else {
998 Mat::ID
999 }
1000 }
1001 _ => Mat::ID,
1002 };
1003 let form_res = stream
1005 .dict
1006 .get(b"Resources")
1007 .ok()
1008 .and_then(|o| deref(doc, o))
1009 .and_then(|o| o.as_dict().ok())
1010 .unwrap_or(res);
1011 let state = TextState {
1012 tc,
1013 tw,
1014 th,
1015 tl,
1016 trise,
1017 fsize,
1018 };
1019 run_content(
1020 doc,
1021 form_res,
1022 &form_content,
1023 form_mat.then(ctm),
1024 state,
1025 depth + 1,
1026 out,
1027 );
1028 }
1029 _ => {}
1030 }
1031 }
1032}
1033
1034#[allow(clippy::too_many_arguments)]
1035fn show_text(
1036 font: &Font,
1037 bytes: &[u8],
1038 fsize: f64,
1039 tc: f64,
1040 tw: f64,
1041 th: f64,
1042 trise: f64,
1043 tm: &mut Mat,
1044 ctm: Mat,
1045 out: &mut Vec<Glyph>,
1046) {
1047 for code in codes(font, bytes) {
1048 let (text, w) = font.decode_code(code);
1049 let w0 = w / 1000.0; let scale = Mat {
1052 a: fsize * th,
1053 b: 0.0,
1054 c: 0.0,
1055 d: fsize,
1056 e: 0.0,
1057 f: trise,
1058 };
1059 let trm = scale.then(*tm).then(ctm);
1060 let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
1062 let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
1063 let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
1064 let (left, right) = (x0.min(x1), x0.max(x1));
1065 let (bot, top) = (y0.min(y2), y0.max(y2));
1066 if let Some(s) = text {
1067 for ch in s.chars() {
1069 if ch != '\u{0}' {
1070 out.push(Glyph {
1071 ch,
1072 l: left as f32,
1073 b: bot as f32,
1074 r: right as f32,
1075 t: top as f32,
1076 ll: left as f32,
1077 lb: bot as f32,
1078 lr: right as f32,
1079 lt: top as f32,
1080 font: font.hash,
1081 });
1082 }
1083 }
1084 }
1085 let is_space = !font.two_byte && code == 32;
1087 let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
1088 *tm = Mat {
1089 a: 1.0,
1090 b: 0.0,
1091 c: 0.0,
1092 d: 1.0,
1093 e: tx,
1094 f: 0.0,
1095 }
1096 .then(*tm);
1097 }
1098}
1099
1100fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
1104 let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
1105 let base_name = match enc {
1106 Some(Object::Name(n)) => n.clone(),
1107 Some(Object::Dictionary(d)) => d
1108 .get(b"BaseEncoding")
1109 .ok()
1110 .and_then(|o| o.as_name().ok())
1111 .map(|n| n.to_vec())
1112 .unwrap_or_default(),
1113 _ => Vec::new(),
1114 };
1115 let mut m = if base_name == b"MacRomanEncoding" {
1116 macroman_table()
1117 } else {
1118 winansi_table()
1119 };
1120 if let Some(Object::Dictionary(d)) = enc {
1122 if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
1123 let mut code = 0u8;
1124 for el in diffs {
1125 match el {
1126 Object::Integer(i) => code = *i as u8,
1127 Object::Name(name) => {
1128 if let Some(ch) = glyph_name_to_char(name) {
1129 m.insert(code, ch);
1130 }
1131 code = code.wrapping_add(1);
1132 }
1133 _ => {}
1134 }
1135 }
1136 }
1137 }
1138 m
1139}
1140
/// Resolves a glyph name to the character it represents.
///
/// Tried in order:
/// 1. `uniXXXX` — the four hex digits after `uni` are the code point;
/// 2. a single ASCII letter names itself;
/// 3. the built-in name table below;
/// 4. for `base.suffix` names, the part before the first `.` is resolved
///    recursively.
///
/// Returns `None` for non-UTF-8 or unknown names. A name that merely starts
/// with `uni` without four hex digits after it (for example `union`) falls
/// through to the later steps instead of being rejected.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let s = std::str::from_utf8(name).ok()?;

    // No `?` here: a short or non-hex tail must not abort the whole lookup.
    let uni_code_point = s
        .strip_prefix("uni")
        .and_then(|tail| tail.get(0..4))
        .and_then(|hex| u32::from_str_radix(hex, 16).ok());
    if let Some(cp) = uni_code_point {
        return char::from_u32(cp);
    }

    if s.len() == 1 {
        let b = s.as_bytes()[0];
        if b.is_ascii_alphabetic() {
            return Some(b as char);
        }
    }

    let resolved = match s {
        "space" => ' ',
        "exclam" => '!',
        "quotedbl" => '"',
        "numbersign" => '#',
        "dollar" => '$',
        "percent" => '%',
        "ampersand" => '&',
        "quotesingle" => '\'',
        "parenleft" => '(',
        "parenright" => ')',
        "asterisk" => '*',
        "plus" => '+',
        "comma" => ',',
        "hyphen" => '-',
        "period" => '.',
        "slash" => '/',
        "zero" => '0',
        "one" => '1',
        "two" => '2',
        "three" => '3',
        "four" => '4',
        "five" => '5',
        "six" => '6',
        "seven" => '7',
        "eight" => '8',
        "nine" => '9',
        "colon" => ':',
        "semicolon" => ';',
        "less" => '<',
        "equal" => '=',
        "greater" => '>',
        "question" => '?',
        "at" => '@',
        "bracketleft" => '[',
        "backslash" => '\\',
        "bracketright" => ']',
        "asciicircum" => '^',
        "underscore" => '_',
        "grave" => '`',
        "braceleft" => '{',
        "bar" => '|',
        "braceright" => '}',
        "asciitilde" => '~',
        "bullet" => '\u{2022}',
        "periodcentered" => '\u{00B7}',
        "endash" => '\u{2013}',
        "emdash" => '\u{2014}',
        "quoteright" => '\u{2019}',
        "quoteleft" => '\u{2018}',
        "quotedblleft" => '\u{201C}',
        "quotedblright" => '\u{201D}',
        "quotedblbase" => '\u{201E}',
        "quotesinglbase" => '\u{201A}',
        "ff" => '\u{FB00}',
        "fi" => '\u{FB01}',
        "fl" => '\u{FB02}',
        "ffi" => '\u{FB03}',
        "ffl" => '\u{FB04}',
        "ft" => '\u{FB05}',
        "st" => '\u{FB06}',
        "degree" => '\u{00B0}',
        "trademark" => '\u{2122}',
        "registered" => '\u{00AE}',
        "copyright" => '\u{00A9}',
        "ellipsis" => '\u{2026}',
        "minus" => '\u{2212}',
        "fraction" => '\u{2044}',
        "nbspace" => '\u{00A0}',
        "alpha" => '\u{03B1}',
        "beta" => '\u{03B2}',
        "gamma" => '\u{03B3}',
        "delta" => '\u{03B4}',
        "epsilon" | "epsilon1" => '\u{03B5}',
        "zeta" => '\u{03B6}',
        "eta" => '\u{03B7}',
        "theta" | "theta1" => '\u{03B8}',
        "iota" => '\u{03B9}',
        "kappa" => '\u{03BA}',
        "lambda" => '\u{03BB}',
        "mu" => '\u{03BC}',
        "nu" => '\u{03BD}',
        "xi" => '\u{03BE}',
        "omicron" => '\u{03BF}',
        "pi" | "pi1" => '\u{03C0}',
        "rho" | "rho1" => '\u{03C1}',
        "sigma" => '\u{03C3}',
        "sigma1" => '\u{03C2}',
        "tau" => '\u{03C4}',
        "upsilon" => '\u{03C5}',
        "phi" | "phi1" => '\u{03C6}',
        "chi" => '\u{03C7}',
        "psi" => '\u{03C8}',
        "omega" | "omega1" => '\u{03C9}',
        "Gamma" => '\u{0393}',
        "Delta" => '\u{0394}',
        "Theta" => '\u{0398}',
        "Lambda" => '\u{039B}',
        "Xi" => '\u{039E}',
        "Pi" => '\u{03A0}',
        "Sigma" => '\u{03A3}',
        "Upsilon" => '\u{03A5}',
        "Phi" => '\u{03A6}',
        "Psi" => '\u{03A8}',
        "Omega" => '\u{03A9}',
        "lessequal" => '\u{2264}',
        "greaterequal" => '\u{2265}',
        "notequal" => '\u{2260}',
        "approxequal" => '\u{2248}',
        "equivalence" => '\u{2261}',
        "element" => '\u{2208}',
        "plusminus" => '\u{00B1}',
        "multiply" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "infinity" => '\u{221E}',
        "partialdiff" => '\u{2202}',
        "gradient" => '\u{2207}',
        "summation" => '\u{2211}',
        "product" => '\u{220F}',
        "integral" => '\u{222B}',
        "radical" => '\u{221A}',
        "proportional" => '\u{221D}',
        "arrowright" => '\u{2192}',
        "arrowleft" => '\u{2190}',
        "arrowup" => '\u{2191}',
        "arrowdown" => '\u{2193}',
        "arrowboth" => '\u{2194}',
        "arrowdblright" => '\u{21D2}',
        "logicaland" => '\u{2227}',
        "logicalor" => '\u{2228}',
        "intersection" => '\u{2229}',
        "union" => '\u{222A}',
        "similar" => '\u{223C}',
        "congruent" => '\u{2245}',
        "dotmath" => '\u{22C5}',
        "asteriskmath" => '\u{2217}',
        _ => {
            // `base.suffix` (e.g. `one.alt`): resolve the base name.
            return match s.split_once('.') {
                Some((base, _)) if !base.is_empty() => glyph_name_to_char(base.as_bytes()),
                _ => None,
            };
        }
    };
    Some(resolved)
}
1315
/// Byte → char table used for WinAnsi-encoded simple fonts.
///
/// Covers printable ASCII (0x20..=0x7E), a handful of typographic
/// punctuation codes in 0x85..=0xA0, and 0xA1..=0xFF mapped to the code
/// point of the same value.
fn winansi_table() -> HashMap<u8, char> {
    const PUNCTUATION: [(u8, char); 9] = [
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x85, '\u{2026}'),
        (0xA0, '\u{00A0}'),
    ];

    let mut table: HashMap<u8, char> = (0x20u8..=0x7e).map(|b| (b, char::from(b))).collect();
    table.extend(PUNCTUATION.iter().copied());
    for b in 0xA1u8..=0xFF {
        // Never overrides an entry that is already present.
        table.entry(b).or_insert(char::from(b));
    }
    table
}
1342
/// Byte → char table used for MacRoman-encoded simple fonts.
///
/// Covers printable ASCII (0x20..=0x7E) plus a small set of high codes for
/// typographic punctuation and the `fi` / `fl` ligatures.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH_CODES: [(u8, char); 11] = [
        (0xA5, '\u{2022}'),
        (0xD0, '\u{2013}'),
        (0xD1, '\u{2014}'),
        (0xD2, '\u{201C}'),
        (0xD3, '\u{201D}'),
        (0xD4, '\u{2018}'),
        (0xD5, '\u{2019}'),
        (0xCA, '\u{00A0}'),
        (0xC9, '\u{2026}'),
        (0xDE, '\u{FB01}'),
        (0xDF, '\u{FB02}'),
    ];

    let mut table: HashMap<u8, char> = (0x20u8..=0x7e).map(|b| (b, char::from(b))).collect();
    table.extend(HIGH_CODES.iter().copied());
    table
}