1use std::collections::HashMap;
17use std::rc::Rc;
18
19use lopdf::{Dictionary, Document, Object};
20
21use crate::pdfium_backend::Glyph;
22
/// Per-document caches that are threaded through page and form-XObject
/// interpretation so shared objects are only processed once.
#[derive(Default)]
struct DocCaches {
    /// Parsed fonts, keyed by the font's indirect object id paired with the
    /// resource name under which it was referenced.
    fonts: HashMap<(lopdf::ObjectId, Vec<u8>), Rc<Font>>,
    /// Decoded content streams of form XObjects, keyed by indirect object id.
    forms: HashMap<lopdf::ObjectId, Rc<lopdf::content::Content>>,
}
36
/// 2-D affine transform stored as the six PDF matrix components
/// `[a b c d e f]`, acting on row vectors `(x, y, 1)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform.
    const ID: Self = Self {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };

    /// Composes two transforms: the result applies `self` first and `m`
    /// second (i.e. the matrix product `self × m`).
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Maps the point `(x, y)` through this transform.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let mapped_x = self.a * x + self.c * y + self.e;
        let mapped_y = self.b * x + self.d * y + self.f;
        (mapped_x, mapped_y)
    }
}
77
/// Everything the text interpreter needs to decode and measure glyphs of one
/// font resource.
struct Font {
    /// True when the font's `/Subtype` is `Type0`; character codes are then
    /// read two bytes at a time.
    two_byte: bool,
    /// Code → text mapping parsed from the font's `/ToUnicode` CMap.
    to_unicode: HashMap<u32, String>,
    /// Per-code advance widths (divided by 1000 when glyphs are placed).
    widths: HashMap<u32, f64>,
    /// Width used for codes that have no entry in `widths`.
    default_width: f64,
    /// Byte → character table for simple fonts; `None` for two-byte fonts.
    simple_encoding: Option<HashMap<u8, char>>,
    /// `/Differences` glyph names that look like glyph-id placeholders and
    /// have no known character; emitted as `/name` text.
    fallback_names: HashMap<u8, String>,
    /// Byte → character table recovered from the clear-text header of an
    /// embedded `/FontFile` program.
    program_encoding: HashMap<u8, char>,
    /// Ascent from the font descriptor (divided by 1000 when used).
    ascent: f64,
    /// Descent from the font descriptor (divided by 1000 when used).
    descent: f64,
    /// Hash of the resource name the font was parsed under; copied into
    /// every emitted glyph.
    hash: u64,
}
101
102impl Font {
103 fn decode_code(&self, code: u32) -> (Option<String>, f64) {
104 let w = self
105 .widths
106 .get(&code)
107 .copied()
108 .unwrap_or(self.default_width);
109 if let Some(s) = self.to_unicode.get(&code) {
110 return (Some(decompose_ligatures(s)), w);
111 }
112 if !self.two_byte {
113 if let Some(name) = self.fallback_names.get(&(code as u8)) {
116 return (Some(format!("/{name}")), w);
117 }
118 if let Some(enc) = &self.simple_encoding {
119 if let Some(&ch) = enc.get(&(code as u8)) {
120 return (Some(decompose_ligatures(&ch.to_string())), w);
121 }
122 }
123 if let Some(&ch) = self.program_encoding.get(&(code as u8)) {
131 return (Some(decompose_ligatures(&ch.to_string())), w);
132 }
133 }
134 (None, w)
135 }
136}
137
/// Replaces the Latin ligature code points U+FB00..=U+FB06 with their
/// component letters; every other character is copied through unchanged.
fn decompose_ligatures(s: &str) -> String {
    let mut expanded = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\u{FB00}' => expanded.push_str("ff"),
            '\u{FB01}' => expanded.push_str("fi"),
            '\u{FB02}' => expanded.push_str("fl"),
            '\u{FB03}' => expanded.push_str("ffi"),
            '\u{FB04}' => expanded.push_str("ffl"),
            '\u{FB05}' => expanded.push_str("ft"),
            '\u{FB06}' => expanded.push_str("st"),
            plain => expanded.push(plain),
        }
    }
    expanded
}
161
/// Hashes a byte string with the standard library's default hasher.
fn hash_name(name: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    Hash::hash(name, &mut hasher);
    hasher.finish()
}
168
169fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
171 match obj {
172 Object::Dictionary(d) => Some(d),
173 Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
174 _ => None,
175 }
176}
177
178fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
179 match obj {
180 Object::Reference(id) => doc.get_object(*id).ok(),
181 other => Some(other),
182 }
183}
184
185fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
187 let subtype: &[u8] = fdict
188 .get(b"Subtype")
189 .ok()
190 .and_then(|o| o.as_name().ok())
191 .unwrap_or(&[]);
192 let two_byte = subtype == b"Type0".as_slice();
193
194 let to_unicode = fdict
195 .get(b"ToUnicode")
196 .ok()
197 .and_then(|o| deref(doc, o))
198 .and_then(|o| o.as_stream().ok())
199 .and_then(|s| s.decompressed_content().ok())
200 .map(|data| parse_tounicode(&data))
201 .unwrap_or_default();
202
203 let (widths, default_width) = if two_byte {
204 cid_widths(doc, fdict)
205 } else {
206 simple_widths(doc, fdict)
207 };
208
209 let simple_encoding = if two_byte {
210 None
211 } else {
212 Some(simple_encoding_table(doc, fdict))
213 };
214 let fallback_names = if two_byte {
215 HashMap::new()
216 } else {
217 differences_gid_names(doc, fdict)
218 };
219 let program_encoding = if two_byte {
220 HashMap::new()
221 } else {
222 type1_program_encoding(doc, fdict)
223 };
224
225 let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
226
227 Font {
228 two_byte,
229 to_unicode,
230 widths,
231 default_width,
232 simple_encoding,
233 fallback_names,
234 program_encoding,
235 ascent,
236 descent,
237 hash: hash_name(name),
238 }
239}
240
241fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
248 let mut map = HashMap::new();
249 let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
250 else {
251 return map;
252 };
253 let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
254 else {
255 return map;
256 };
257 let mut code = 0u8;
258 for el in diffs {
259 match el {
260 Object::Integer(i) => code = *i as u8,
261 Object::Name(name) => {
262 if glyph_name_to_char(name).is_none() && is_gid_name(name) {
263 map.insert(code, String::from_utf8_lossy(name).into_owned());
264 }
265 code = code.wrapping_add(1);
266 }
267 _ => {}
268 }
269 }
270 map
271}
272
273fn type1_program_encoding(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
281 let mut map = HashMap::new();
282 let Some(desc) = fdict
283 .get(b"FontDescriptor")
284 .ok()
285 .and_then(|o| deref(doc, o))
286 .and_then(|o| o.as_dict().ok())
287 else {
288 return map;
289 };
290 let Some(data) = desc
291 .get(b"FontFile")
292 .ok()
293 .and_then(|o| deref(doc, o))
294 .and_then(|o| o.as_stream().ok())
295 .and_then(|s| s.decompressed_content().ok())
296 else {
297 return map;
298 };
299 let head_end = data
301 .windows(5)
302 .position(|w| w == b"eexec")
303 .unwrap_or(data.len());
304 let head = String::from_utf8_lossy(&data[..head_end]);
305 let toks: Vec<&str> = head.split_whitespace().collect();
307 for w in toks.windows(4) {
308 if w[0] == "dup" && w[3] == "put" {
309 if let (Ok(code), Some(name)) = (w[1].parse::<u32>(), w[2].strip_prefix('/')) {
310 if code <= 255 {
311 if let Some(ch) = glyph_name_to_char(name.as_bytes()) {
312 map.insert(code as u8, ch);
313 }
314 }
315 }
316 }
317 }
318 map
319}
320
/// Heuristically decides whether a glyph name is a glyph-id placeholder
/// rather than a meaningful name.
///
/// Names starting with `afii` or `uni` never qualify. Otherwise a name
/// qualifies when it is one of the prefixes `g`, `G`, `cid`, `CID`, `glyph`,
/// `index` followed only by digits, or when it is one to three ASCII letters
/// followed by at least three characters that are all digits.
fn is_gid_name(name: &[u8]) -> bool {
    fn all_digits(text: &str) -> bool {
        !text.is_empty() && text.bytes().all(|b| b.is_ascii_digit())
    }

    let s = match std::str::from_utf8(name) {
        Ok(s) => s,
        Err(_) => return false,
    };
    if ["afii", "uni"].iter().any(|p| s.starts_with(*p)) {
        return false;
    }

    let has_known_prefix = ["g", "G", "cid", "CID", "glyph", "index"]
        .iter()
        .any(|p| s.strip_prefix(*p).map_or(false, |rest| all_digits(rest)));
    if has_known_prefix {
        return true;
    }

    let split = s
        .bytes()
        .position(|b| !b.is_ascii_alphabetic())
        .unwrap_or(s.len());
    let (letters, tail) = s.as_bytes().split_at(split);
    (1..=3).contains(&letters.len())
        && tail.len() >= 3
        && tail.iter().all(|b| b.is_ascii_digit())
}
349
350fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
351 let descr_owner = if two_byte {
353 fdict
354 .get(b"DescendantFonts")
355 .ok()
356 .and_then(|o| deref(doc, o))
357 .and_then(|o| match o {
358 Object::Array(a) => a.first(),
359 _ => None,
360 })
361 .and_then(|o| as_dict(doc, o))
362 } else {
363 Some(fdict)
364 };
365 let fd = descr_owner
366 .and_then(|d| d.get(b"FontDescriptor").ok())
367 .and_then(|o| as_dict(doc, o));
368 let asc = fd
369 .and_then(|d| d.get(b"Ascent").ok())
370 .and_then(|o| {
371 o.as_float()
372 .ok()
373 .or_else(|| o.as_i64().ok().map(|i| i as f32))
374 })
375 .unwrap_or(750.0) as f64;
376 let desc = fd
377 .and_then(|d| d.get(b"Descent").ok())
378 .and_then(|o| {
379 o.as_float()
380 .ok()
381 .or_else(|| o.as_i64().ok().map(|i| i as f32))
382 })
383 .unwrap_or(-250.0) as f64;
384 if asc - desc <= 1.0 {
391 return (750.0, -250.0);
392 }
393 (asc, desc)
394}
395
396fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
398 let mut map = HashMap::new();
399 let first = fdict
400 .get(b"FirstChar")
401 .ok()
402 .and_then(|o| o.as_i64().ok())
403 .unwrap_or(0) as u32;
404 if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
405 for (i, w) in arr.iter().enumerate() {
406 if let Some(w) = num(w) {
407 map.insert(first + i as u32, w);
408 }
409 }
410 }
411 let dw = fdict
412 .get(b"FontDescriptor")
413 .ok()
414 .and_then(|o| as_dict(doc, o))
415 .and_then(|d| d.get(b"MissingWidth").ok())
416 .and_then(num)
417 .unwrap_or(0.0);
418 (map, dw)
419}
420
421fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
423 let mut map = HashMap::new();
424 let Some(desc) = fdict
425 .get(b"DescendantFonts")
426 .ok()
427 .and_then(|o| deref(doc, o))
428 .and_then(|o| match o {
429 Object::Array(a) => a.first(),
430 _ => None,
431 })
432 .and_then(|o| as_dict(doc, o))
433 else {
434 return (map, 1000.0);
435 };
436 let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
437 if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
438 let mut i = 0;
439 while i < w.len() {
440 let c = w.get(i).and_then(num);
441 match (c, w.get(i + 1)) {
442 (Some(c), Some(Object::Array(list))) => {
444 for (k, wv) in list.iter().enumerate() {
445 if let Some(wv) = num(wv) {
446 map.insert(c as u32 + k as u32, wv);
447 }
448 }
449 i += 2;
450 }
451 (Some(c1), Some(o2)) => {
453 if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
454 for cid in c1 as u32..=c2 as u32 {
455 map.insert(cid, wv);
456 }
457 }
458 i += 3;
459 }
460 _ => break,
461 }
462 }
463 }
464 (map, dw)
465}
466
467fn num(o: &Object) -> Option<f64> {
468 match o {
469 Object::Integer(i) => Some(*i as f64),
470 Object::Real(r) => Some(*r as f64),
471 _ => None,
472 }
473}
474
/// Parses the `bfchar` / `bfrange` sections of a ToUnicode CMap into a
/// code → text map.
///
/// * `beginbfchar … endbfchar`: pairs `<src> <dst>`; `dst` is decoded as
///   UTF-16BE (lossily).
/// * `beginbfrange … endbfrange`: triples `<lo> <hi> <dst>` or
///   `<lo> <hi> [<dst0> <dst1> …]`.
///   - Array form: consecutive codes starting at `lo` receive the listed
///     strings.
///   - Single-destination form: code `lo + k` receives `dst` with `k` added
///     to it. A one-unit destination is incremented as a code point; a longer
///     destination (surrogate pair or multi-character string) has `k` added
///     to its last UTF-16 unit. Results that are not valid text are skipped,
///     as is an empty destination.
///
/// Robustness against malformed input: code arithmetic never overflows, and a
/// single range expands to at most `MAX_RANGE_LEN` codes.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    /// Cap on codes expanded from one `bfrange`; 65536 covers every code a
    /// one- or two-byte font can use.
    const MAX_RANGE_LEN: u32 = 0x1_0000;

    /// Splits the CMap text into tokens: `<…>` hex strings, `[` / `]`, and
    /// whitespace-delimited words.
    fn cmap_tokens(bytes: &[u8]) -> Vec<String> {
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let c = bytes[i];
            if c.is_ascii_whitespace() {
                i += 1;
            } else if c == b'<' {
                let start = i;
                while i < bytes.len() && bytes[i] != b'>' {
                    i += 1;
                }
                // Include the closing '>' when there is one.
                i += 1;
                let end = i.min(bytes.len());
                toks.push(String::from_utf8_lossy(&bytes[start..end]).into_owned());
            } else if c == b'[' || c == b']' {
                toks.push((c as char).to_string());
                i += 1;
            } else {
                let start = i;
                while i < bytes.len()
                    && !bytes[i].is_ascii_whitespace()
                    && bytes[i] != b'<'
                    && bytes[i] != b'['
                    && bytes[i] != b']'
                {
                    i += 1;
                }
                toks.push(String::from_utf8_lossy(&bytes[start..i]).into_owned());
            }
        }
        toks
    }

    /// Decodes a `<hex>` token into big-endian 16-bit units (a trailing odd
    /// byte becomes its own unit). Returns `None` for non-hex-string tokens.
    fn cmap_hex(tok: &str) -> Option<Vec<u16>> {
        let inner = tok.trim().strip_prefix('<')?.strip_suffix('>')?;
        let bytes: Vec<u8> = (0..inner.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(inner.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| {
                    if c.len() == 2 {
                        u16::from_be_bytes([c[0], c[1]])
                    } else {
                        u16::from(c[0])
                    }
                })
                .collect(),
        )
    }

    /// Folds the units of a source code into one integer key.
    fn cmap_code(units: &[u16]) -> u32 {
        units
            .iter()
            .fold(0u32, |acc, &x| (acc << 16) | u32::from(x))
    }

    /// Text for the `offset`-th code of a single-destination range.
    fn cmap_range_target(dst: &[u16], offset: u32) -> Option<String> {
        match dst {
            [] => None,
            [unit] => {
                let cp = u32::from(*unit).checked_add(offset)?;
                char::from_u32(cp).map(|ch| ch.to_string())
            }
            [head @ .., last] => {
                let bumped = u32::from(*last).checked_add(offset)?;
                if bumped > u32::from(u16::MAX) {
                    return None;
                }
                let mut units = head.to_vec();
                units.push(bumped as u16);
                String::from_utf16(&units).ok()
            }
        }
    }

    let text = String::from_utf8_lossy(data);
    let owned_tokens = cmap_tokens(text.as_bytes());
    let tokens: Vec<&str> = owned_tokens.iter().map(String::as_str).collect();

    let mut map = HashMap::new();
    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (cmap_hex(tokens[i]), cmap_hex(tokens[i + 1]))
                    {
                        map.insert(cmap_code(&src), String::from_utf16_lossy(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (cmap_hex(tokens[i]), cmap_hex(tokens[i + 1]))
                    else {
                        // Not a range triple; resynchronise one token later.
                        i += 1;
                        continue;
                    };
                    let lo = cmap_code(&lo);
                    let hi = cmap_code(&hi);
                    if tokens[i + 2] == "[" {
                        let mut j = i + 3;
                        // `None` once the code would exceed u32::MAX.
                        let mut code = Some(lo);
                        while j < tokens.len() && tokens[j] != "]" {
                            if let (Some(c), Some(dst)) = (code, cmap_hex(tokens[j])) {
                                map.insert(c, String::from_utf16_lossy(&dst));
                            }
                            code = code.and_then(|c| c.checked_add(1));
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = cmap_hex(tokens[i + 2]) {
                        let hi = hi.min(lo.saturating_add(MAX_RANGE_LEN - 1));
                        for (k, code) in (lo..=hi).enumerate() {
                            if let Some(s) = cmap_range_target(&dst, k as u32) {
                                map.insert(code, s);
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
594
595fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
597 if font.two_byte {
598 bytes
599 .chunks(2)
600 .map(|c| {
601 if c.len() == 2 {
602 ((c[0] as u32) << 8) | c[1] as u32
603 } else {
604 c[0] as u32
605 }
606 })
607 .collect()
608 } else {
609 bytes.iter().map(|&b| b as u32).collect()
610 }
611}
612
613fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
615 let mb = doc
616 .get_object(page_id)
617 .ok()
618 .and_then(|o| o.as_dict().ok())
619 .and_then(|d| {
620 d.get(b"MediaBox").ok().cloned()
622 })
623 .or_else(|| {
624 doc.get_dictionary(page_id)
625 .ok()
626 .and_then(|d| d.get(b"MediaBox").ok().cloned())
627 });
628 if let Some(Object::Array(a)) = mb {
629 let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
630 if v.len() == 4 {
631 return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
632 }
633 }
634 (612.0, 792.0)
635}
636
637pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
640 let Ok(doc) = Document::load_mem(bytes) else {
641 return Vec::new();
642 };
643 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
644 pages.sort_by_key(|(n, _)| *n);
645 let Some((_, pid)) = pages.get(index) else {
646 return Vec::new();
647 };
648 page_glyphs(&doc, *pid)
649 .into_iter()
650 .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
651 .collect()
652}
653
654pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
658 let Ok(doc) = Document::load_mem(bytes) else {
659 return Vec::new();
660 };
661 let mut caches = DocCaches::default();
662 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
663 pages.sort_by_key(|(n, _)| *n);
664 pages
665 .into_iter()
666 .map(|(_, pid)| {
667 let (w, h) = page_size(&doc, pid);
668 let glyphs = page_glyphs_cached(&doc, pid, &mut caches);
669 let cells = crate::dp_lines::line_cells(&glyphs, h, true);
670 (w, h, cells)
671 })
672 .collect()
673}
674
675pub fn pdf_words(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
680 let Ok(doc) = Document::load_mem(bytes) else {
681 return Vec::new();
682 };
683 let mut caches = DocCaches::default();
684 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
685 pages.sort_by_key(|(n, _)| *n);
686 pages
687 .into_iter()
688 .map(|(_, pid)| {
689 let (w, h) = page_size(&doc, pid);
690 let glyphs = page_glyphs_cached(&doc, pid, &mut caches);
691 let cells = crate::dp_lines::word_cells(&glyphs, h, true);
692 (w, h, cells)
693 })
694 .collect()
695}
696
/// The three cell sets produced for one page by `pdf_all_cells`.
#[derive(Default)]
pub struct PageParserCells {
    /// Line cells (first element returned by `dp_lines::line_and_word_cells`).
    pub prose: Vec<crate::pdfium_backend::TextCell>,
    /// Word cells (second element returned by `dp_lines::line_and_word_cells`).
    pub words: Vec<crate::pdfium_backend::TextCell>,
    /// Cells returned by `pdfium_backend::code_cells_from_glyphs`.
    pub code: Vec<crate::pdfium_backend::TextCell>,
}
706
707pub fn pdf_all_cells(bytes: &[u8]) -> Vec<PageParserCells> {
712 let Ok(doc) = Document::load_mem(bytes) else {
713 return Vec::new();
714 };
715 let mut caches = DocCaches::default();
716 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
717 pages.sort_by_key(|(n, _)| *n);
718 pages
719 .into_iter()
720 .map(|(_, pid)| {
721 let (_w, h) = page_size(&doc, pid);
722 let glyphs = page_glyphs_cached(&doc, pid, &mut caches);
723 let (prose, words) = crate::dp_lines::line_and_word_cells(&glyphs, h, true);
724 PageParserCells {
725 prose,
726 words,
727 code: crate::pdfium_backend::code_cells_from_glyphs(&glyphs, h),
728 }
729 })
730 .collect()
731}
732
/// Text-state parameters tracked while interpreting a content stream; passed
/// by value into nested form XObjects.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (set by `Tc` and by the second operand of `"`).
    tc: f64,
    /// Word spacing (set by `Tw` and by the first operand of `"`).
    tw: f64,
    /// Horizontal scaling as a factor (the `Tz` operand divided by 100).
    th: f64,
    /// Leading (set by `TL`, and by `TD` to the negated y operand).
    tl: f64,
    /// Text rise (set by `Ts`).
    trise: f64,
    /// Font size (second operand of `Tf`).
    fsize: f64,
}

impl TextState {
    /// State used at the start of a page: everything zero except a
    /// horizontal scaling factor of 1.
    const INIT: TextState = TextState {
        tc: 0.0,
        tw: 0.0,
        th: 1.0,
        tl: 0.0,
        trise: 0.0,
        fsize: 0.0,
    };
}
756
757fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
760 let (inline, ids) = doc.get_page_resources(page_id).ok()?;
761 if let Some(d) = inline {
762 return Some(d);
763 }
764 ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
765}
766
767fn fonts_from_res(
771 doc: &Document,
772 res: &Dictionary,
773 caches: &mut DocCaches,
774) -> HashMap<Vec<u8>, Rc<Font>> {
775 let mut map = HashMap::new();
776 let font_dict = res
777 .get(b"Font")
778 .ok()
779 .and_then(|o| deref(doc, o))
780 .and_then(|o| o.as_dict().ok());
781 if let Some(fd) = font_dict {
782 for (name, value) in fd.iter() {
783 let font = match value {
784 Object::Reference(id) => {
785 let key = (*id, name.clone());
786 if let Some(f) = caches.fonts.get(&key) {
787 Rc::clone(f)
788 } else if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
789 let f = Rc::new(parse_font(doc, name, fdict));
790 caches.fonts.insert(key, Rc::clone(&f));
791 f
792 } else {
793 continue;
794 }
795 }
796 _ => {
797 if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
798 Rc::new(parse_font(doc, name, fdict))
799 } else {
800 continue;
801 }
802 }
803 };
804 map.insert(name.clone(), font);
805 }
806 }
807 map
808}
809
810pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
812 page_glyphs_cached(doc, page_id, &mut DocCaches::default())
813}
814
815fn page_glyphs_cached(
818 doc: &Document,
819 page_id: lopdf::ObjectId,
820 caches: &mut DocCaches,
821) -> Vec<Glyph> {
822 let mut out = Vec::new();
823 let Ok(content_bytes) = doc.get_page_content(page_id) else {
824 return out;
825 };
826 let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
827 return out;
828 };
829 if let Some(res) = page_res(doc, page_id) {
830 run_content(
831 doc,
832 res,
833 &content,
834 Mat::ID,
835 TextState::INIT,
836 0,
837 caches,
838 &mut out,
839 );
840 }
841 out
842}
843
844#[allow(clippy::too_many_arguments)]
849fn run_content(
850 doc: &Document,
851 res: &Dictionary,
852 content: &lopdf::content::Content,
853 base_ctm: Mat,
854 init: TextState,
855 depth: u32,
856 caches: &mut DocCaches,
857 out: &mut Vec<Glyph>,
858) {
859 let fonts = fonts_from_res(doc, res, caches);
860 let xobjects = res
861 .get(b"XObject")
862 .ok()
863 .and_then(|o| deref(doc, o))
864 .and_then(|o| o.as_dict().ok());
865
866 #[allow(clippy::type_complexity)]
871 let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Rc<Font>>)> = Vec::new();
872 let mut ctm = base_ctm;
873 let mut tm = Mat::ID;
874 let mut tlm = Mat::ID;
875 let mut font: Option<&Rc<Font>> = None;
876 let mut fsize = init.fsize;
877 let mut tc = init.tc; let mut tw = init.tw; let mut th = init.th; let mut tl = init.tl; let mut trise = init.trise;
882
883 let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
884
885 for op in &content.operations {
886 let operands = &op.operands;
887 match op.operator.as_str() {
888 "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
889 "Q" => {
890 if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
891 ctm = c;
892 tc = a;
893 tw = b;
894 th = h;
895 tl = l;
896 trise = r;
897 fsize = fs;
898 font = f;
899 }
900 }
901 "cm" => {
902 let m = Mat {
903 a: op_f(operands, 0),
904 b: op_f(operands, 1),
905 c: op_f(operands, 2),
906 d: op_f(operands, 3),
907 e: op_f(operands, 4),
908 f: op_f(operands, 5),
909 };
910 ctm = m.then(ctm);
911 }
912 "BT" => {
913 tm = Mat::ID;
914 tlm = Mat::ID;
915 }
916 "ET" => {}
917 "Tf" => {
918 if let Some(Object::Name(n)) = operands.first() {
919 font = fonts.get(n.as_slice());
920 }
921 fsize = op_f(operands, 1);
922 }
923 "Td" => {
924 tlm = Mat {
925 a: 1.0,
926 b: 0.0,
927 c: 0.0,
928 d: 1.0,
929 e: op_f(operands, 0),
930 f: op_f(operands, 1),
931 }
932 .then(tlm);
933 tm = tlm;
934 }
935 "TD" => {
936 tl = -op_f(operands, 1);
937 tlm = Mat {
938 a: 1.0,
939 b: 0.0,
940 c: 0.0,
941 d: 1.0,
942 e: op_f(operands, 0),
943 f: op_f(operands, 1),
944 }
945 .then(tlm);
946 tm = tlm;
947 }
948 "Tm" => {
949 tlm = Mat {
950 a: op_f(operands, 0),
951 b: op_f(operands, 1),
952 c: op_f(operands, 2),
953 d: op_f(operands, 3),
954 e: op_f(operands, 4),
955 f: op_f(operands, 5),
956 };
957 tm = tlm;
958 }
959 "T*" => {
960 tlm = Mat {
961 a: 1.0,
962 b: 0.0,
963 c: 0.0,
964 d: 1.0,
965 e: 0.0,
966 f: -tl,
967 }
968 .then(tlm);
969 tm = tlm;
970 }
971 "Tc" => tc = op_f(operands, 0),
972 "Tw" => tw = op_f(operands, 0),
973 "Tz" => th = op_f(operands, 0) / 100.0,
974 "TL" => tl = op_f(operands, 0),
975 "Ts" => trise = op_f(operands, 0),
976 "Tj" | "'" | "\"" => {
977 if op.operator == "'" || op.operator == "\"" {
978 tlm = Mat {
980 a: 1.0,
981 b: 0.0,
982 c: 0.0,
983 d: 1.0,
984 e: 0.0,
985 f: -tl,
986 }
987 .then(tlm);
988 tm = tlm;
989 }
990 if op.operator == "\"" {
991 tw = op_f(operands, 0);
994 tc = op_f(operands, 1);
995 }
996 if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
997 show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
998 }
999 }
1000 "TJ" => {
1001 if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
1002 for el in arr {
1003 match el {
1004 Object::String(s, _) => {
1005 show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
1006 }
1007 other => {
1008 if let Some(adj) = num(other) {
1009 let tx = -adj / 1000.0 * fsize * th;
1011 tm = Mat {
1012 a: 1.0,
1013 b: 0.0,
1014 c: 0.0,
1015 d: 1.0,
1016 e: tx,
1017 f: 0.0,
1018 }
1019 .then(tm);
1020 }
1021 }
1022 }
1023 }
1024 }
1025 }
1026 "Do" => {
1027 if depth >= 8 {
1030 continue;
1031 }
1032 let Some(Object::Name(n)) = operands.first() else {
1033 continue;
1034 };
1035 let obj = xobjects.and_then(|d| d.get(n.as_slice()).ok());
1036 let form_id = match obj {
1037 Some(Object::Reference(id)) => Some(*id),
1038 _ => None,
1039 };
1040 let stream = obj
1041 .and_then(|o| deref(doc, o))
1042 .and_then(|o| o.as_stream().ok());
1043 let Some(stream) = stream else { continue };
1044 let is_form = stream
1045 .dict
1046 .get(b"Subtype")
1047 .ok()
1048 .and_then(|o| o.as_name().ok())
1049 == Some(b"Form".as_slice());
1050 if !is_form {
1051 continue;
1052 }
1053 let cached = form_id.and_then(|id| caches.forms.get(&id).cloned());
1056 let form_content = match cached {
1057 Some(c) => c,
1058 None => {
1059 let Ok(data) = stream.decompressed_content() else {
1060 continue;
1061 };
1062 let Ok(c) = lopdf::content::Content::decode(&data) else {
1063 continue;
1064 };
1065 let c = Rc::new(c);
1066 if let Some(id) = form_id {
1067 caches.forms.insert(id, Rc::clone(&c));
1068 }
1069 c
1070 }
1071 };
1072 let form_mat = match stream.dict.get(b"Matrix").ok() {
1074 Some(Object::Array(a)) if a.len() == 6 => {
1075 let v: Vec<f64> = a.iter().filter_map(num).collect();
1076 if v.len() == 6 {
1077 Mat {
1078 a: v[0],
1079 b: v[1],
1080 c: v[2],
1081 d: v[3],
1082 e: v[4],
1083 f: v[5],
1084 }
1085 } else {
1086 Mat::ID
1087 }
1088 }
1089 _ => Mat::ID,
1090 };
1091 let form_res = stream
1093 .dict
1094 .get(b"Resources")
1095 .ok()
1096 .and_then(|o| deref(doc, o))
1097 .and_then(|o| o.as_dict().ok())
1098 .unwrap_or(res);
1099 let state = TextState {
1100 tc,
1101 tw,
1102 th,
1103 tl,
1104 trise,
1105 fsize,
1106 };
1107 run_content(
1108 doc,
1109 form_res,
1110 &form_content,
1111 form_mat.then(ctm),
1112 state,
1113 depth + 1,
1114 caches,
1115 out,
1116 );
1117 }
1118 _ => {}
1119 }
1120 }
1121}
1122
1123#[allow(clippy::too_many_arguments)]
1124fn show_text(
1125 font: &Font,
1126 bytes: &[u8],
1127 fsize: f64,
1128 tc: f64,
1129 tw: f64,
1130 th: f64,
1131 trise: f64,
1132 tm: &mut Mat,
1133 ctm: Mat,
1134 out: &mut Vec<Glyph>,
1135) {
1136 for code in codes(font, bytes) {
1137 let (text, w) = font.decode_code(code);
1138 let w0 = w / 1000.0; let scale = Mat {
1141 a: fsize * th,
1142 b: 0.0,
1143 c: 0.0,
1144 d: fsize,
1145 e: 0.0,
1146 f: trise,
1147 };
1148 let trm = scale.then(*tm).then(ctm);
1149 let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
1151 let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
1152 let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
1153 let (left, right) = (x0.min(x1), x0.max(x1));
1154 let (bot, top) = (y0.min(y2), y0.max(y2));
1155 if let Some(s) = text {
1156 for ch in s.chars() {
1158 if ch != '\u{0}' {
1159 out.push(Glyph {
1160 ch,
1161 l: left as f32,
1162 b: bot as f32,
1163 r: right as f32,
1164 t: top as f32,
1165 ll: left as f32,
1166 lb: bot as f32,
1167 lr: right as f32,
1168 lt: top as f32,
1169 font: font.hash,
1170 });
1171 }
1172 }
1173 }
1174 let is_space = !font.two_byte && code == 32;
1176 let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
1177 *tm = Mat {
1178 a: 1.0,
1179 b: 0.0,
1180 c: 0.0,
1181 d: 1.0,
1182 e: tx,
1183 f: 0.0,
1184 }
1185 .then(*tm);
1186 }
1187}
1188
1189fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
1193 let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
1194 let base_name = match enc {
1195 Some(Object::Name(n)) => n.clone(),
1196 Some(Object::Dictionary(d)) => d
1197 .get(b"BaseEncoding")
1198 .ok()
1199 .and_then(|o| o.as_name().ok())
1200 .map(|n| n.to_vec())
1201 .unwrap_or_default(),
1202 _ => Vec::new(),
1203 };
1204 let mut m = if base_name == b"MacRomanEncoding" {
1205 macroman_table()
1206 } else {
1207 winansi_table()
1208 };
1209 if let Some(Object::Dictionary(d)) = enc {
1211 if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
1212 let mut code = 0u8;
1213 for el in diffs {
1214 match el {
1215 Object::Integer(i) => code = *i as u8,
1216 Object::Name(name) => {
1217 if let Some(ch) = glyph_name_to_char(name) {
1218 m.insert(code, ch);
1219 }
1220 code = code.wrapping_add(1);
1221 }
1222 _ => {}
1223 }
1224 }
1225 }
1226 }
1227 m
1228}
1229
/// Maps a PostScript glyph name to the character it represents.
///
/// Resolution order:
/// 1. `uniXXXX…` – the first four characters after `uni`, when they are all
///    hex digits, give the code point (`None` if it is not a valid scalar).
/// 2. A single ASCII letter maps to itself.
/// 3. A built-in table of punctuation, digits, ligatures, Greek letters and
///    mathematical symbols.
/// 4. A name with a `.suffix` is retried with the suffix removed.
///
/// A name that merely starts with `uni` but is not followed by four hex
/// digits (for example `union`) falls through to the later steps instead of
/// being rejected outright.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let s = std::str::from_utf8(name).ok()?;
    if let Some(hex) = s.strip_prefix("uni").and_then(|rest| rest.get(0..4)) {
        if hex.bytes().all(|b| b.is_ascii_hexdigit()) {
            if let Ok(cp) = u32::from_str_radix(hex, 16) {
                return char::from_u32(cp);
            }
        }
    }
    if s.len() == 1 {
        let b = s.as_bytes()[0];
        if b.is_ascii_alphabetic() {
            return Some(b as char);
        }
    }
    let resolved = match s {
        "space" => ' ',
        "exclam" => '!',
        "quotedbl" => '"',
        "numbersign" => '#',
        "dollar" => '$',
        "percent" => '%',
        "ampersand" => '&',
        "quotesingle" => '\'',
        "parenleft" => '(',
        "parenright" => ')',
        "asterisk" => '*',
        "plus" => '+',
        "comma" => ',',
        "hyphen" => '-',
        "period" => '.',
        "slash" => '/',
        "zero" => '0',
        "one" => '1',
        "two" => '2',
        "three" => '3',
        "four" => '4',
        "five" => '5',
        "six" => '6',
        "seven" => '7',
        "eight" => '8',
        "nine" => '9',
        "colon" => ':',
        "semicolon" => ';',
        "less" => '<',
        "equal" => '=',
        "greater" => '>',
        "question" => '?',
        "at" => '@',
        "bracketleft" => '[',
        "backslash" => '\\',
        "bracketright" => ']',
        "asciicircum" => '^',
        "underscore" => '_',
        "grave" => '`',
        "braceleft" => '{',
        "bar" => '|',
        "braceright" => '}',
        "asciitilde" => '~',
        "bullet" => '\u{2022}',
        "periodcentered" => '\u{00B7}',
        "endash" => '\u{2013}',
        "emdash" => '\u{2014}',
        "quoteright" => '\u{2019}',
        "quoteleft" => '\u{2018}',
        "quotedblleft" => '\u{201C}',
        "quotedblright" => '\u{201D}',
        "quotedblbase" => '\u{201E}',
        "quotesinglbase" => '\u{201A}',
        "ff" => '\u{FB00}',
        "fi" => '\u{FB01}',
        "fl" => '\u{FB02}',
        "ffi" => '\u{FB03}',
        "ffl" => '\u{FB04}',
        "ft" => '\u{FB05}',
        "st" => '\u{FB06}',
        "degree" => '\u{00B0}',
        "trademark" => '\u{2122}',
        "registered" => '\u{00AE}',
        "copyright" => '\u{00A9}',
        "ellipsis" => '\u{2026}',
        "minus" => '\u{2212}',
        "fraction" => '\u{2044}',
        "nbspace" => '\u{00A0}',
        "alpha" => '\u{03B1}',
        "beta" => '\u{03B2}',
        "gamma" => '\u{03B3}',
        "delta" => '\u{03B4}',
        "epsilon" | "epsilon1" => '\u{03B5}',
        "zeta" => '\u{03B6}',
        "eta" => '\u{03B7}',
        "theta" | "theta1" => '\u{03B8}',
        "iota" => '\u{03B9}',
        "kappa" => '\u{03BA}',
        "lambda" => '\u{03BB}',
        "mu" => '\u{03BC}',
        "nu" => '\u{03BD}',
        "xi" => '\u{03BE}',
        "omicron" => '\u{03BF}',
        "pi" | "pi1" => '\u{03C0}',
        "rho" | "rho1" => '\u{03C1}',
        "sigma" => '\u{03C3}',
        "sigma1" => '\u{03C2}',
        "tau" => '\u{03C4}',
        "upsilon" => '\u{03C5}',
        "phi" | "phi1" => '\u{03C6}',
        "chi" => '\u{03C7}',
        "psi" => '\u{03C8}',
        "omega" | "omega1" => '\u{03C9}',
        "Gamma" => '\u{0393}',
        "Delta" => '\u{0394}',
        "Theta" => '\u{0398}',
        "Lambda" => '\u{039B}',
        "Xi" => '\u{039E}',
        "Pi" => '\u{03A0}',
        "Sigma" => '\u{03A3}',
        "Upsilon" => '\u{03A5}',
        "Phi" => '\u{03A6}',
        "Psi" => '\u{03A8}',
        "Omega" => '\u{03A9}',
        "lessequal" => '\u{2264}',
        "greaterequal" => '\u{2265}',
        "notequal" => '\u{2260}',
        "approxequal" => '\u{2248}',
        "equivalence" => '\u{2261}',
        "element" => '\u{2208}',
        "plusminus" => '\u{00B1}',
        "multiply" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "infinity" => '\u{221E}',
        "partialdiff" => '\u{2202}',
        "gradient" => '\u{2207}',
        "summation" => '\u{2211}',
        "product" => '\u{220F}',
        "integral" => '\u{222B}',
        "radical" => '\u{221A}',
        "proportional" => '\u{221D}',
        "arrowright" => '\u{2192}',
        "arrowleft" => '\u{2190}',
        "arrowup" => '\u{2191}',
        "arrowdown" => '\u{2193}',
        "arrowboth" => '\u{2194}',
        "arrowdblright" => '\u{21D2}',
        "logicaland" => '\u{2227}',
        "logicalor" => '\u{2228}',
        "intersection" => '\u{2229}',
        "union" => '\u{222A}',
        "similar" => '\u{223C}',
        "congruent" => '\u{2245}',
        "dotmath" => '\u{22C5}',
        "asteriskmath" => '\u{2217}',
        _ => {
            // Variant names such as `one.oldstyle`: retry with the base name.
            if let Some((base, _)) = s.split_once('.') {
                if !base.is_empty() {
                    return glyph_name_to_char(base.as_bytes());
                }
            }
            return None;
        }
    };
    Some(resolved)
}
1404
/// Byte → character table used as the default simple-font encoding:
/// printable ASCII (0x20..=0x7E), a handful of typographic punctuation marks
/// in the 0x85..=0xA0 area, and 0xA1..=0xFF mapped to the code point of the
/// same value.
fn winansi_table() -> HashMap<u8, char> {
    const PUNCTUATION: [(u8, char); 9] = [
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x85, '\u{2026}'),
        (0xA0, '\u{00A0}'),
    ];

    let ascii = (0x20u8..=0x7e).map(|b| (b, char::from(b)));
    let upper = (0xA1u8..=0xFF).map(|b| (b, char::from(b)));
    ascii
        .chain(PUNCTUATION.iter().copied())
        .chain(upper)
        .collect()
}
1431
/// Byte → character table used for `MacRomanEncoding`: printable ASCII
/// (0x20..=0x7E) plus a small set of high-byte punctuation and ligature
/// entries. Other high bytes are not mapped.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH: [(u8, char); 11] = [
        (0xA5, '\u{2022}'),
        (0xD0, '\u{2013}'),
        (0xD1, '\u{2014}'),
        (0xD2, '\u{201C}'),
        (0xD3, '\u{201D}'),
        (0xD4, '\u{2018}'),
        (0xD5, '\u{2019}'),
        (0xCA, '\u{00A0}'),
        (0xC9, '\u{2026}'),
        (0xDE, '\u{FB01}'),
        (0xDF, '\u{FB02}'),
    ];

    (0x20u8..=0x7e)
        .map(|b| (b, char::from(b)))
        .chain(HIGH.iter().copied())
        .collect()
}