1use std::collections::HashMap;
17
18use lopdf::{Dictionary, Document, Object};
19
20use crate::pdfium_backend::Glyph;
21
/// A 2-D affine transform held as the six coefficients `a b c d e f`.
///
/// A point `(x, y)` is mapped to `(a·x + c·y + e, b·x + d·y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform.
    const ID: Mat = Mat { a: 1.0, b: 0.0, c: 0.0, d: 1.0, e: 0.0, f: 0.0 };

    /// Composes two transforms: the result applies `self` first and `m` second.
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Transforms the point `(x, y)` and returns the mapped coordinates.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let mapped_x = self.a * x + self.c * y + self.e;
        let mapped_y = self.b * x + self.d * y + self.f;
        (mapped_x, mapped_y)
    }
}
62
63struct Font {
65 two_byte: bool,
67 to_unicode: HashMap<u32, String>,
69 widths: HashMap<u32, f64>,
71 default_width: f64,
72 simple_encoding: Option<HashMap<u8, char>>,
74 fallback_names: HashMap<u8, String>,
78 ascent: f64,
79 descent: f64,
80 hash: u64,
81}
82
83impl Font {
84 fn decode_code(&self, code: u32) -> (Option<String>, f64) {
85 let w = self
86 .widths
87 .get(&code)
88 .copied()
89 .unwrap_or(self.default_width);
90 if let Some(s) = self.to_unicode.get(&code) {
91 return (Some(decompose_ligatures(s)), w);
92 }
93 if !self.two_byte {
94 if let Some(name) = self.fallback_names.get(&(code as u8)) {
97 return (Some(format!("/{name}")), w);
98 }
99 if let Some(enc) = &self.simple_encoding {
100 if let Some(&ch) = enc.get(&(code as u8)) {
101 return (Some(decompose_ligatures(&ch.to_string())), w);
102 }
103 }
104 }
105 (None, w)
106 }
107}
108
/// Expands the Latin presentation-form ligatures U+FB00..=U+FB06 into their
/// component ASCII letters; every other character is passed through as is.
fn decompose_ligatures(s: &str) -> String {
    let is_ligature = |c: char| matches!(c, '\u{FB00}'..='\u{FB06}');
    // Fast path: nothing to expand.
    if !s.chars().any(is_ligature) {
        return s.to_owned();
    }

    let mut expanded = String::new();
    for c in s.chars() {
        match c {
            '\u{FB00}' => expanded.push_str("ff"),
            '\u{FB01}' => expanded.push_str("fi"),
            '\u{FB02}' => expanded.push_str("fl"),
            '\u{FB03}' => expanded.push_str("ffi"),
            '\u{FB04}' => expanded.push_str("ffl"),
            '\u{FB05}' => expanded.push_str("ft"),
            '\u{FB06}' => expanded.push_str("st"),
            other => expanded.push(other),
        }
    }
    expanded
}
132
/// Hashes a name (as a byte slice) with the standard library's default hasher.
fn hash_name(name: &[u8]) -> u64 {
    let mut hasher = std::collections::hash_map::DefaultHasher::default();
    std::hash::Hash::hash(name, &mut hasher);
    std::hash::Hasher::finish(&hasher)
}
139
140fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
142 match obj {
143 Object::Dictionary(d) => Some(d),
144 Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
145 _ => None,
146 }
147}
148
149fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
150 match obj {
151 Object::Reference(id) => doc.get_object(*id).ok(),
152 other => Some(other),
153 }
154}
155
156fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
158 let subtype: &[u8] = fdict
159 .get(b"Subtype")
160 .ok()
161 .and_then(|o| o.as_name().ok())
162 .unwrap_or(&[]);
163 let two_byte = subtype == b"Type0".as_slice();
164
165 let to_unicode = fdict
166 .get(b"ToUnicode")
167 .ok()
168 .and_then(|o| deref(doc, o))
169 .and_then(|o| o.as_stream().ok())
170 .and_then(|s| s.decompressed_content().ok())
171 .map(|data| parse_tounicode(&data))
172 .unwrap_or_default();
173
174 let (widths, default_width) = if two_byte {
175 cid_widths(doc, fdict)
176 } else {
177 simple_widths(doc, fdict)
178 };
179
180 let simple_encoding = if two_byte {
181 None
182 } else {
183 Some(simple_encoding_table(doc, fdict))
184 };
185 let fallback_names = if two_byte {
186 HashMap::new()
187 } else {
188 differences_gid_names(doc, fdict)
189 };
190
191 let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
192
193 Font {
194 two_byte,
195 to_unicode,
196 widths,
197 default_width,
198 simple_encoding,
199 fallback_names,
200 ascent,
201 descent,
202 hash: hash_name(name),
203 }
204}
205
206fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
213 let mut map = HashMap::new();
214 let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
215 else {
216 return map;
217 };
218 let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
219 else {
220 return map;
221 };
222 let mut code = 0u8;
223 for el in diffs {
224 match el {
225 Object::Integer(i) => code = *i as u8,
226 Object::Name(name) => {
227 if glyph_name_to_char(name).is_none() && is_gid_name(name) {
228 map.insert(code, String::from_utf8_lossy(name).into_owned());
229 }
230 code = code.wrapping_add(1);
231 }
232 _ => {}
233 }
234 }
235 map
236}
237
/// Heuristically decides whether a glyph name is an index-style placeholder
/// (for example `g12`, `cid7`, `glyph3`, `C0100`) rather than a real name.
///
/// Names starting with `afii` or `uni` are never treated as placeholders.
/// Otherwise a name qualifies when it is one of the known prefixes followed
/// only by digits, or one to three ASCII letters followed by at least three
/// digits.
fn is_gid_name(name: &[u8]) -> bool {
    const INDEX_PREFIXES: [&str; 6] = ["g", "G", "cid", "CID", "glyph", "index"];

    let Ok(s) = std::str::from_utf8(name) else {
        return false;
    };
    if s.starts_with("afii") || s.starts_with("uni") {
        return false;
    }

    let all_digits = |t: &str| !t.is_empty() && t.bytes().all(|b| b.is_ascii_digit());
    let has_index_prefix = INDEX_PREFIXES
        .iter()
        .any(|prefix| s.strip_prefix(*prefix).map_or(false, |rest| all_digits(rest)));
    if has_index_prefix {
        return true;
    }

    // Leading letters are ASCII, so their count is a valid split point.
    let letters = s.bytes().take_while(u8::is_ascii_alphabetic).count();
    let (head, tail) = s.split_at(letters);
    matches!(head.len(), 1..=3) && tail.len() >= 3 && tail.bytes().all(|b| b.is_ascii_digit())
}
266
267fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
268 let descr_owner = if two_byte {
270 fdict
271 .get(b"DescendantFonts")
272 .ok()
273 .and_then(|o| deref(doc, o))
274 .and_then(|o| match o {
275 Object::Array(a) => a.first(),
276 _ => None,
277 })
278 .and_then(|o| as_dict(doc, o))
279 } else {
280 Some(fdict)
281 };
282 let fd = descr_owner
283 .and_then(|d| d.get(b"FontDescriptor").ok())
284 .and_then(|o| as_dict(doc, o));
285 let asc = fd
286 .and_then(|d| d.get(b"Ascent").ok())
287 .and_then(|o| {
288 o.as_float()
289 .ok()
290 .or_else(|| o.as_i64().ok().map(|i| i as f32))
291 })
292 .unwrap_or(750.0) as f64;
293 let desc = fd
294 .and_then(|d| d.get(b"Descent").ok())
295 .and_then(|o| {
296 o.as_float()
297 .ok()
298 .or_else(|| o.as_i64().ok().map(|i| i as f32))
299 })
300 .unwrap_or(-250.0) as f64;
301 (asc, desc)
302}
303
304fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
306 let mut map = HashMap::new();
307 let first = fdict
308 .get(b"FirstChar")
309 .ok()
310 .and_then(|o| o.as_i64().ok())
311 .unwrap_or(0) as u32;
312 if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
313 for (i, w) in arr.iter().enumerate() {
314 if let Some(w) = num(w) {
315 map.insert(first + i as u32, w);
316 }
317 }
318 }
319 let dw = fdict
320 .get(b"FontDescriptor")
321 .ok()
322 .and_then(|o| as_dict(doc, o))
323 .and_then(|d| d.get(b"MissingWidth").ok())
324 .and_then(num)
325 .unwrap_or(0.0);
326 (map, dw)
327}
328
329fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
331 let mut map = HashMap::new();
332 let Some(desc) = fdict
333 .get(b"DescendantFonts")
334 .ok()
335 .and_then(|o| deref(doc, o))
336 .and_then(|o| match o {
337 Object::Array(a) => a.first(),
338 _ => None,
339 })
340 .and_then(|o| as_dict(doc, o))
341 else {
342 return (map, 1000.0);
343 };
344 let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
345 if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
346 let mut i = 0;
347 while i < w.len() {
348 let c = w.get(i).and_then(num);
349 match (c, w.get(i + 1)) {
350 (Some(c), Some(Object::Array(list))) => {
352 for (k, wv) in list.iter().enumerate() {
353 if let Some(wv) = num(wv) {
354 map.insert(c as u32 + k as u32, wv);
355 }
356 }
357 i += 2;
358 }
359 (Some(c1), Some(o2)) => {
361 if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
362 for cid in c1 as u32..=c2 as u32 {
363 map.insert(cid, wv);
364 }
365 }
366 i += 3;
367 }
368 _ => break,
369 }
370 }
371 }
372 (map, dw)
373}
374
375fn num(o: &Object) -> Option<f64> {
376 match o {
377 Object::Integer(i) => Some(*i as f64),
378 Object::Real(r) => Some(*r as f64),
379 _ => None,
380 }
381}
382
/// Parses a `ToUnicode` CMap stream into a map from character code to text.
///
/// Only `beginbfchar … endbfchar` and `beginbfrange … endbfrange` sections are
/// interpreted; every other token is skipped. Hex strings are read as
/// big-endian 16-bit units: source codes are folded into a `u32`, destinations
/// are decoded as UTF-16.
///
/// For a `<lo> <hi> <dst>` range the destination for `lo + k` is `dst` with
/// its last 16-bit unit increased by `k`. This keeps surrogate pairs and
/// multi-character destinations (for example `<00660069>` for "fi") intact.
/// Destinations that are not valid UTF-16 are skipped.
///
/// The stream is untrusted: a single range expands to at most
/// `MAX_RANGE_LEN` codes and all code arithmetic is checked, so a hostile
/// range such as `<00000000> <FFFFFFFF>` cannot stall the parser or overflow.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    /// Upper bound on the number of codes expanded from one `bfrange` entry.
    const MAX_RANGE_LEN: u32 = 0x1_0000;

    /// Splits the CMap text into tokens: `<…>` hex strings, `[`, `]`, and
    /// runs of other non-whitespace bytes.
    fn tokenize(text: &str) -> Vec<&str> {
        let bytes = text.as_bytes();
        let is_delim = |b: u8| b.is_ascii_whitespace() || matches!(b, b'<' | b'[' | b']');
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let start = i;
            match bytes[i] {
                b if b.is_ascii_whitespace() => {
                    i += 1;
                    continue;
                }
                b'<' => {
                    while i < bytes.len() && bytes[i] != b'>' {
                        i += 1;
                    }
                    // Include the closing '>' when there is one.
                    i = (i + 1).min(bytes.len());
                }
                b'[' | b']' => i += 1,
                _ => {
                    while i < bytes.len() && !is_delim(bytes[i]) {
                        i += 1;
                    }
                }
            }
            // Tokens start and end next to ASCII bytes (or the text ends), so
            // both offsets are character boundaries.
            toks.push(&text[start..i]);
        }
        toks
    }

    /// Decodes a `<hex>` token into big-endian 16-bit units; a trailing odd
    /// byte becomes a unit of its own. Non-hex tokens yield `None`.
    fn hex_units(tok: &str) -> Option<Vec<u16>> {
        let digits = tok.trim().strip_prefix('<')?.strip_suffix('>')?;
        let bytes: Vec<u8> = (0..digits.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(digits.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| {
                    if c.len() == 2 {
                        u16::from_be_bytes([c[0], c[1]])
                    } else {
                        u16::from(c[0])
                    }
                })
                .collect(),
        )
    }

    /// Folds the units of a source code into a single integer.
    fn code_of(units: &[u16]) -> u32 {
        units.iter().fold(0u32, |acc, &x| (acc << 16) | u32::from(x))
    }

    let text = String::from_utf8_lossy(data);
    let tokens = tokenize(&text);
    let mut map = HashMap::new();

    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) =
                        (hex_units(tokens[i]), hex_units(tokens[i + 1]))
                    {
                        map.insert(code_of(&src), String::from_utf16_lossy(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (hex_units(tokens[i]), hex_units(tokens[i + 1]))
                    else {
                        i += 1;
                        continue;
                    };
                    let lo = code_of(&lo);
                    let hi = code_of(&hi);
                    if tokens[i + 2] == "[" {
                        // `<lo> <hi> [<d0> <d1> …]`: one destination per code.
                        let mut j = i + 3;
                        let mut code = Some(lo);
                        while j < tokens.len() && tokens[j] != "]" {
                            if let (Some(c), Some(dst)) = (code, hex_units(tokens[j])) {
                                map.insert(c, String::from_utf16_lossy(&dst));
                            }
                            code = code.and_then(|c| c.checked_add(1));
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = hex_units(tokens[i + 2]) {
                        if let (Some((&last, prefix)), true) = (dst.split_last(), lo <= hi) {
                            let max_offset = (hi - lo).min(MAX_RANGE_LEN - 1);
                            for k in 0..=max_offset {
                                // Stop once the last unit can no longer be
                                // incremented without overflowing.
                                let Some(unit) =
                                    u16::try_from(k).ok().and_then(|k| last.checked_add(k))
                                else {
                                    break;
                                };
                                let mut units = prefix.to_vec();
                                units.push(unit);
                                if let Ok(s) = String::from_utf16(&units) {
                                    // `k <= hi - lo`, so `lo + k` cannot overflow.
                                    map.insert(lo + k, s);
                                }
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
502
503fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
505 if font.two_byte {
506 bytes
507 .chunks(2)
508 .map(|c| {
509 if c.len() == 2 {
510 ((c[0] as u32) << 8) | c[1] as u32
511 } else {
512 c[0] as u32
513 }
514 })
515 .collect()
516 } else {
517 bytes.iter().map(|&b| b as u32).collect()
518 }
519}
520
521fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
523 let mb = doc
524 .get_object(page_id)
525 .ok()
526 .and_then(|o| o.as_dict().ok())
527 .and_then(|d| {
528 d.get(b"MediaBox").ok().cloned()
530 })
531 .or_else(|| {
532 doc.get_dictionary(page_id)
533 .ok()
534 .and_then(|d| d.get(b"MediaBox").ok().cloned())
535 });
536 if let Some(Object::Array(a)) = mb {
537 let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
538 if v.len() == 4 {
539 return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
540 }
541 }
542 (612.0, 792.0)
543}
544
545pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
548 let Ok(doc) = Document::load_mem(bytes) else {
549 return Vec::new();
550 };
551 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
552 pages.sort_by_key(|(n, _)| *n);
553 let Some((_, pid)) = pages.get(index) else {
554 return Vec::new();
555 };
556 page_glyphs(&doc, *pid)
557 .into_iter()
558 .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
559 .collect()
560}
561
562pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
566 let Ok(doc) = Document::load_mem(bytes) else {
567 return Vec::new();
568 };
569 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
570 pages.sort_by_key(|(n, _)| *n);
571 pages
572 .into_iter()
573 .map(|(_, pid)| {
574 let (w, h) = page_size(&doc, pid);
575 let glyphs = page_glyphs(&doc, pid);
576 let cells = crate::dp_lines::line_cells(&glyphs, h, true);
577 (w, h, cells)
578 })
579 .collect()
580}
581
/// The text-state parameters tracked while interpreting a content stream.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (set by `Tc`).
    tc: f64,
    /// Word spacing (set by `Tw`).
    tw: f64,
    /// Horizontal scaling as a factor (the `Tz` operand divided by 100).
    th: f64,
    /// Text leading (set by `TL`, and by `TD` as the negated y offset).
    tl: f64,
    /// Text rise (set by `Ts`).
    trise: f64,
    /// Font size (second operand of `Tf`).
    fsize: f64,
}

impl TextState {
    /// Starting state: everything zero except a horizontal scale of 1.
    const INIT: TextState = TextState {
        fsize: 0.0,
        trise: 0.0,
        tl: 0.0,
        th: 1.0,
        tw: 0.0,
        tc: 0.0,
    };
}
605
606fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
609 let (inline, ids) = doc.get_page_resources(page_id).ok()?;
610 if let Some(d) = inline {
611 return Some(d);
612 }
613 ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
614}
615
616fn fonts_from_res(doc: &Document, res: &Dictionary) -> HashMap<Vec<u8>, Font> {
618 let mut map = HashMap::new();
619 let font_dict = res
620 .get(b"Font")
621 .ok()
622 .and_then(|o| deref(doc, o))
623 .and_then(|o| o.as_dict().ok());
624 if let Some(fd) = font_dict {
625 for (name, value) in fd.iter() {
626 if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
627 map.insert(name.clone(), parse_font(doc, name, fdict));
628 }
629 }
630 }
631 map
632}
633
634pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
636 let mut out = Vec::new();
637 let Ok(content_bytes) = doc.get_page_content(page_id) else {
638 return out;
639 };
640 let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
641 return out;
642 };
643 if let Some(res) = page_res(doc, page_id) {
644 run_content(doc, res, &content, Mat::ID, TextState::INIT, 0, &mut out);
645 }
646 out
647}
648
649fn run_content(
654 doc: &Document,
655 res: &Dictionary,
656 content: &lopdf::content::Content,
657 base_ctm: Mat,
658 init: TextState,
659 depth: u32,
660 out: &mut Vec<Glyph>,
661) {
662 let fonts = fonts_from_res(doc, res);
663 let xobjects = res
664 .get(b"XObject")
665 .ok()
666 .and_then(|o| deref(doc, o))
667 .and_then(|o| o.as_dict().ok());
668
669 #[allow(clippy::type_complexity)]
674 let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Font>)> = Vec::new();
675 let mut ctm = base_ctm;
676 let mut tm = Mat::ID;
677 let mut tlm = Mat::ID;
678 let mut font: Option<&Font> = None;
679 let mut fsize = init.fsize;
680 let mut tc = init.tc; let mut tw = init.tw; let mut th = init.th; let mut tl = init.tl; let mut trise = init.trise;
685
686 let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
687
688 for op in &content.operations {
689 let operands = &op.operands;
690 match op.operator.as_str() {
691 "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
692 "Q" => {
693 if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
694 ctm = c;
695 tc = a;
696 tw = b;
697 th = h;
698 tl = l;
699 trise = r;
700 fsize = fs;
701 font = f;
702 }
703 }
704 "cm" => {
705 let m = Mat {
706 a: op_f(operands, 0),
707 b: op_f(operands, 1),
708 c: op_f(operands, 2),
709 d: op_f(operands, 3),
710 e: op_f(operands, 4),
711 f: op_f(operands, 5),
712 };
713 ctm = m.then(ctm);
714 }
715 "BT" => {
716 tm = Mat::ID;
717 tlm = Mat::ID;
718 }
719 "ET" => {}
720 "Tf" => {
721 if let Some(Object::Name(n)) = operands.first() {
722 font = fonts.get(n.as_slice());
723 }
724 fsize = op_f(operands, 1);
725 }
726 "Td" => {
727 tlm = Mat {
728 a: 1.0,
729 b: 0.0,
730 c: 0.0,
731 d: 1.0,
732 e: op_f(operands, 0),
733 f: op_f(operands, 1),
734 }
735 .then(tlm);
736 tm = tlm;
737 }
738 "TD" => {
739 tl = -op_f(operands, 1);
740 tlm = Mat {
741 a: 1.0,
742 b: 0.0,
743 c: 0.0,
744 d: 1.0,
745 e: op_f(operands, 0),
746 f: op_f(operands, 1),
747 }
748 .then(tlm);
749 tm = tlm;
750 }
751 "Tm" => {
752 tlm = Mat {
753 a: op_f(operands, 0),
754 b: op_f(operands, 1),
755 c: op_f(operands, 2),
756 d: op_f(operands, 3),
757 e: op_f(operands, 4),
758 f: op_f(operands, 5),
759 };
760 tm = tlm;
761 }
762 "T*" => {
763 tlm = Mat {
764 a: 1.0,
765 b: 0.0,
766 c: 0.0,
767 d: 1.0,
768 e: 0.0,
769 f: -tl,
770 }
771 .then(tlm);
772 tm = tlm;
773 }
774 "Tc" => tc = op_f(operands, 0),
775 "Tw" => tw = op_f(operands, 0),
776 "Tz" => th = op_f(operands, 0) / 100.0,
777 "TL" => tl = op_f(operands, 0),
778 "Ts" => trise = op_f(operands, 0),
779 "Tj" | "'" | "\"" => {
780 if op.operator == "'" || op.operator == "\"" {
781 tlm = Mat {
783 a: 1.0,
784 b: 0.0,
785 c: 0.0,
786 d: 1.0,
787 e: 0.0,
788 f: -tl,
789 }
790 .then(tlm);
791 tm = tlm;
792 }
793 if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
794 show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
795 }
796 }
797 "TJ" => {
798 if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
799 for el in arr {
800 match el {
801 Object::String(s, _) => {
802 show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
803 }
804 other => {
805 if let Some(adj) = num(other) {
806 let tx = -adj / 1000.0 * fsize * th;
808 tm = Mat {
809 a: 1.0,
810 b: 0.0,
811 c: 0.0,
812 d: 1.0,
813 e: tx,
814 f: 0.0,
815 }
816 .then(tm);
817 }
818 }
819 }
820 }
821 }
822 }
823 "Do" => {
824 if depth >= 8 {
827 continue;
828 }
829 let Some(Object::Name(n)) = operands.first() else {
830 continue;
831 };
832 let stream = xobjects
833 .and_then(|d| d.get(n.as_slice()).ok())
834 .and_then(|o| deref(doc, o))
835 .and_then(|o| o.as_stream().ok());
836 let Some(stream) = stream else { continue };
837 let is_form = stream
838 .dict
839 .get(b"Subtype")
840 .ok()
841 .and_then(|o| o.as_name().ok())
842 == Some(b"Form".as_slice());
843 if !is_form {
844 continue;
845 }
846 let Ok(data) = stream.decompressed_content() else {
847 continue;
848 };
849 let Ok(form_content) = lopdf::content::Content::decode(&data) else {
850 continue;
851 };
852 let form_mat = match stream.dict.get(b"Matrix").ok() {
854 Some(Object::Array(a)) if a.len() == 6 => {
855 let v: Vec<f64> = a.iter().filter_map(num).collect();
856 if v.len() == 6 {
857 Mat {
858 a: v[0],
859 b: v[1],
860 c: v[2],
861 d: v[3],
862 e: v[4],
863 f: v[5],
864 }
865 } else {
866 Mat::ID
867 }
868 }
869 _ => Mat::ID,
870 };
871 let form_res = stream
873 .dict
874 .get(b"Resources")
875 .ok()
876 .and_then(|o| deref(doc, o))
877 .and_then(|o| o.as_dict().ok())
878 .unwrap_or(res);
879 let state = TextState {
880 tc,
881 tw,
882 th,
883 tl,
884 trise,
885 fsize,
886 };
887 run_content(
888 doc,
889 form_res,
890 &form_content,
891 form_mat.then(ctm),
892 state,
893 depth + 1,
894 out,
895 );
896 }
897 _ => {}
898 }
899 }
900}
901
902#[allow(clippy::too_many_arguments)]
903fn show_text(
904 font: &Font,
905 bytes: &[u8],
906 fsize: f64,
907 tc: f64,
908 tw: f64,
909 th: f64,
910 trise: f64,
911 tm: &mut Mat,
912 ctm: Mat,
913 out: &mut Vec<Glyph>,
914) {
915 for code in codes(font, bytes) {
916 let (text, w) = font.decode_code(code);
917 let w0 = w / 1000.0; let scale = Mat {
920 a: fsize * th,
921 b: 0.0,
922 c: 0.0,
923 d: fsize,
924 e: 0.0,
925 f: trise,
926 };
927 let trm = scale.then(*tm).then(ctm);
928 let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
930 let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
931 let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
932 let (left, right) = (x0.min(x1), x0.max(x1));
933 let (bot, top) = (y0.min(y2), y0.max(y2));
934 if let Some(s) = text {
935 for ch in s.chars() {
937 if ch != '\u{0}' {
938 out.push(Glyph {
939 ch,
940 l: left as f32,
941 b: bot as f32,
942 r: right as f32,
943 t: top as f32,
944 ll: left as f32,
945 lb: bot as f32,
946 lr: right as f32,
947 lt: top as f32,
948 font: font.hash,
949 });
950 }
951 }
952 }
953 let is_space = !font.two_byte && code == 32;
955 let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
956 *tm = Mat {
957 a: 1.0,
958 b: 0.0,
959 c: 0.0,
960 d: 1.0,
961 e: tx,
962 f: 0.0,
963 }
964 .then(*tm);
965 }
966}
967
968fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
972 let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
973 let base_name = match enc {
974 Some(Object::Name(n)) => n.clone(),
975 Some(Object::Dictionary(d)) => d
976 .get(b"BaseEncoding")
977 .ok()
978 .and_then(|o| o.as_name().ok())
979 .map(|n| n.to_vec())
980 .unwrap_or_default(),
981 _ => Vec::new(),
982 };
983 let mut m = if base_name == b"MacRomanEncoding" {
984 macroman_table()
985 } else {
986 winansi_table()
987 };
988 if let Some(Object::Dictionary(d)) = enc {
990 if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
991 let mut code = 0u8;
992 for el in diffs {
993 match el {
994 Object::Integer(i) => code = *i as u8,
995 Object::Name(name) => {
996 if let Some(ch) = glyph_name_to_char(name) {
997 m.insert(code, ch);
998 }
999 code = code.wrapping_add(1);
1000 }
1001 _ => {}
1002 }
1003 }
1004 }
1005 }
1006 m
1007}
1008
/// Maps a glyph name to the character it represents, if known.
///
/// Resolution order:
/// 1. `uniXXXX` – the four hex digits after `uni` give the code point.
/// 2. a single ASCII letter stands for itself.
/// 3. the built-in table of common glyph names.
/// 4. a name with a `.suffix` is retried using the part before the first dot.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    const NAMED: &[(&str, char)] = &[
        ("space", ' '),
        ("exclam", '!'),
        ("quotedbl", '"'),
        ("numbersign", '#'),
        ("dollar", '$'),
        ("percent", '%'),
        ("ampersand", '&'),
        ("quotesingle", '\''),
        ("parenleft", '('),
        ("parenright", ')'),
        ("asterisk", '*'),
        ("plus", '+'),
        ("comma", ','),
        ("hyphen", '-'),
        ("period", '.'),
        ("slash", '/'),
        ("zero", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
        ("colon", ':'),
        ("semicolon", ';'),
        ("less", '<'),
        ("equal", '='),
        ("greater", '>'),
        ("question", '?'),
        ("at", '@'),
        ("bracketleft", '['),
        ("backslash", '\\'),
        ("bracketright", ']'),
        ("asciicircum", '^'),
        ("underscore", '_'),
        ("grave", '`'),
        ("braceleft", '{'),
        ("bar", '|'),
        ("braceright", '}'),
        ("asciitilde", '~'),
        ("bullet", '\u{2022}'),
        ("periodcentered", '\u{00B7}'),
        ("endash", '\u{2013}'),
        ("emdash", '\u{2014}'),
        ("quoteright", '\u{2019}'),
        ("quoteleft", '\u{2018}'),
        ("quotedblleft", '\u{201C}'),
        ("quotedblright", '\u{201D}'),
        ("quotedblbase", '\u{201E}'),
        ("quotesinglbase", '\u{201A}'),
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("degree", '\u{00B0}'),
        ("trademark", '\u{2122}'),
        ("registered", '\u{00AE}'),
        ("copyright", '\u{00A9}'),
        ("ellipsis", '\u{2026}'),
        ("minus", '\u{2212}'),
        ("fraction", '\u{2044}'),
        ("nbspace", '\u{00A0}'),
    ];

    let s = std::str::from_utf8(name).ok()?;

    if let Some(rest) = s.strip_prefix("uni") {
        // A `uni` name with fewer than four following characters resolves to
        // nothing at all.
        let digits = rest.get(..4)?;
        if let Ok(cp) = u32::from_str_radix(digits, 16) {
            return char::from_u32(cp);
        }
    }

    if let [letter] = s.as_bytes() {
        if letter.is_ascii_alphabetic() {
            return Some(char::from(*letter));
        }
    }

    if let Some(&(_, ch)) = NAMED.iter().find(|(known, _)| *known == s) {
        return Some(ch);
    }

    match s.split_once('.') {
        Some((base, _)) if !base.is_empty() => glyph_name_to_char(base.as_bytes()),
        _ => None,
    }
}
1103
/// Builds the byte -> character table for `WinAnsiEncoding`.
///
/// * `0x20..=0x7E` map to ASCII.
/// * `0x80..=0x9F` follow Windows code page 1252 (euro sign, curly quotes,
///   dashes, trademark, `œ`, …). The codes 0x81, 0x8D, 0x8F, 0x90 and 0x9D
///   have no code page 1252 assignment and stay unmapped, as does 0x7F.
/// * `0xA0..=0xFF` map to the Unicode code point of the same value.
fn winansi_table() -> HashMap<u8, char> {
    const CP1252_HIGH: &[(u8, char)] = &[
        (0x80, '\u{20AC}'),
        (0x82, '\u{201A}'),
        (0x83, '\u{0192}'),
        (0x84, '\u{201E}'),
        (0x85, '\u{2026}'),
        (0x86, '\u{2020}'),
        (0x87, '\u{2021}'),
        (0x88, '\u{02C6}'),
        (0x89, '\u{2030}'),
        (0x8A, '\u{0160}'),
        (0x8B, '\u{2039}'),
        (0x8C, '\u{0152}'),
        (0x8E, '\u{017D}'),
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x98, '\u{02DC}'),
        (0x99, '\u{2122}'),
        (0x9A, '\u{0161}'),
        (0x9B, '\u{203A}'),
        (0x9C, '\u{0153}'),
        (0x9E, '\u{017E}'),
        (0x9F, '\u{0178}'),
    ];

    let mut m = HashMap::new();
    for b in 0x20u8..=0x7e {
        m.insert(b, char::from(b));
    }
    for &(b, c) in CP1252_HIGH {
        m.insert(b, c);
    }
    for b in 0xA0u8..=0xFF {
        m.insert(b, char::from(b));
    }
    m
}
1130
/// Builds the byte -> character table used for `MacRomanEncoding`: ASCII for
/// `0x20..=0x7E` plus a small set of high-range punctuation and ligatures.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH: [(u8, char); 11] = [
        (0xA5, '\u{2022}'),
        (0xD0, '\u{2013}'),
        (0xD1, '\u{2014}'),
        (0xD2, '\u{201C}'),
        (0xD3, '\u{201D}'),
        (0xD4, '\u{2018}'),
        (0xD5, '\u{2019}'),
        (0xCA, '\u{00A0}'),
        (0xC9, '\u{2026}'),
        (0xDE, '\u{FB01}'),
        (0xDF, '\u{FB02}'),
    ];

    (0x20u8..=0x7e)
        .map(|b| (b, b as char))
        .chain(HIGH.iter().copied())
        .collect()
}
1155}