1use std::collections::HashMap;
17
18use lopdf::{Dictionary, Document, Object};
19
20use crate::pdfium_backend::Glyph;
21
/// 2-D affine transform stored as the six coefficients `a b c d e f`.
///
/// Points are row vectors: `(x, y)` maps to
/// `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform.
    const ID: Mat = Mat {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };

    /// Composes two transforms; the result applies `self` first and `m` second.
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Transforms the point `(x, y)` and returns the mapped coordinates.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let Mat { a, b, c, d, e, f } = self;
        let mapped_x = a * x + c * y + e;
        let mapped_y = b * x + d * y + f;
        (mapped_x, mapped_y)
    }
}
62
63struct Font {
65 two_byte: bool,
67 to_unicode: HashMap<u32, String>,
69 widths: HashMap<u32, f64>,
71 default_width: f64,
72 simple_encoding: Option<HashMap<u8, char>>,
74 fallback_names: HashMap<u8, String>,
78 program_encoding: HashMap<u8, char>,
82 ascent: f64,
83 descent: f64,
84 hash: u64,
85}
86
87impl Font {
88 fn decode_code(&self, code: u32) -> (Option<String>, f64) {
89 let w = self
90 .widths
91 .get(&code)
92 .copied()
93 .unwrap_or(self.default_width);
94 if let Some(s) = self.to_unicode.get(&code) {
95 return (Some(decompose_ligatures(s)), w);
96 }
97 if !self.two_byte {
98 if let Some(name) = self.fallback_names.get(&(code as u8)) {
101 return (Some(format!("/{name}")), w);
102 }
103 if let Some(enc) = &self.simple_encoding {
104 if let Some(&ch) = enc.get(&(code as u8)) {
105 return (Some(decompose_ligatures(&ch.to_string())), w);
106 }
107 }
108 if let Some(&ch) = self.program_encoding.get(&(code as u8)) {
116 return (Some(decompose_ligatures(&ch.to_string())), w);
117 }
118 }
119 (None, w)
120 }
121}
122
/// Expands the Latin presentation-form ligatures U+FB00..=U+FB06 into their
/// plain-letter spellings (`ﬁ` becomes `fi`, and so on); every other character
/// is copied through unchanged.
fn decompose_ligatures(s: &str) -> String {
    let mut expanded = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '\u{FB00}' => expanded.push_str("ff"),
            '\u{FB01}' => expanded.push_str("fi"),
            '\u{FB02}' => expanded.push_str("fl"),
            '\u{FB03}' => expanded.push_str("ffi"),
            '\u{FB04}' => expanded.push_str("ffl"),
            '\u{FB05}' => expanded.push_str("ft"),
            '\u{FB06}' => expanded.push_str("st"),
            other => expanded.push(other),
        }
    }
    expanded
}
146
/// Hashes a font resource name with the standard library's default hasher.
fn hash_name(name: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    name.hash(&mut hasher);
    hasher.finish()
}
153
154fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
156 match obj {
157 Object::Dictionary(d) => Some(d),
158 Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
159 _ => None,
160 }
161}
162
163fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
164 match obj {
165 Object::Reference(id) => doc.get_object(*id).ok(),
166 other => Some(other),
167 }
168}
169
170fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
172 let subtype: &[u8] = fdict
173 .get(b"Subtype")
174 .ok()
175 .and_then(|o| o.as_name().ok())
176 .unwrap_or(&[]);
177 let two_byte = subtype == b"Type0".as_slice();
178
179 let to_unicode = fdict
180 .get(b"ToUnicode")
181 .ok()
182 .and_then(|o| deref(doc, o))
183 .and_then(|o| o.as_stream().ok())
184 .and_then(|s| s.decompressed_content().ok())
185 .map(|data| parse_tounicode(&data))
186 .unwrap_or_default();
187
188 let (widths, default_width) = if two_byte {
189 cid_widths(doc, fdict)
190 } else {
191 simple_widths(doc, fdict)
192 };
193
194 let simple_encoding = if two_byte {
195 None
196 } else {
197 Some(simple_encoding_table(doc, fdict))
198 };
199 let fallback_names = if two_byte {
200 HashMap::new()
201 } else {
202 differences_gid_names(doc, fdict)
203 };
204 let program_encoding = if two_byte {
205 HashMap::new()
206 } else {
207 type1_program_encoding(doc, fdict)
208 };
209
210 let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
211
212 Font {
213 two_byte,
214 to_unicode,
215 widths,
216 default_width,
217 simple_encoding,
218 fallback_names,
219 program_encoding,
220 ascent,
221 descent,
222 hash: hash_name(name),
223 }
224}
225
226fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
233 let mut map = HashMap::new();
234 let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
235 else {
236 return map;
237 };
238 let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
239 else {
240 return map;
241 };
242 let mut code = 0u8;
243 for el in diffs {
244 match el {
245 Object::Integer(i) => code = *i as u8,
246 Object::Name(name) => {
247 if glyph_name_to_char(name).is_none() && is_gid_name(name) {
248 map.insert(code, String::from_utf8_lossy(name).into_owned());
249 }
250 code = code.wrapping_add(1);
251 }
252 _ => {}
253 }
254 }
255 map
256}
257
258fn type1_program_encoding(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
266 let mut map = HashMap::new();
267 let Some(desc) = fdict
268 .get(b"FontDescriptor")
269 .ok()
270 .and_then(|o| deref(doc, o))
271 .and_then(|o| o.as_dict().ok())
272 else {
273 return map;
274 };
275 let Some(data) = desc
276 .get(b"FontFile")
277 .ok()
278 .and_then(|o| deref(doc, o))
279 .and_then(|o| o.as_stream().ok())
280 .and_then(|s| s.decompressed_content().ok())
281 else {
282 return map;
283 };
284 let head_end = data
286 .windows(5)
287 .position(|w| w == b"eexec")
288 .unwrap_or(data.len());
289 let head = String::from_utf8_lossy(&data[..head_end]);
290 let toks: Vec<&str> = head.split_whitespace().collect();
292 for w in toks.windows(4) {
293 if w[0] == "dup" && w[3] == "put" {
294 if let (Ok(code), Some(name)) = (w[1].parse::<u32>(), w[2].strip_prefix('/')) {
295 if code <= 255 {
296 if let Some(ch) = glyph_name_to_char(name.as_bytes()) {
297 map.insert(code as u8, ch);
298 }
299 }
300 }
301 }
302 }
303 map
304}
305
/// Heuristic: does `name` look like a synthetic glyph-index name rather than
/// a meaningful glyph name?
///
/// Names starting with `afii` or `uni` are never accepted. Otherwise a name
/// is accepted when it is one of the prefixes `g`, `G`, `cid`, `CID`, `glyph`
/// or `index` followed only by digits, or when it is one to three ASCII
/// letters followed by at least three ASCII digits. Non-UTF-8 input is
/// rejected.
fn is_gid_name(name: &[u8]) -> bool {
    const NEVER: [&str; 2] = ["afii", "uni"];
    const INDEX_PREFIXES: [&str; 6] = ["g", "G", "cid", "CID", "glyph", "index"];

    let s = match std::str::from_utf8(name) {
        Ok(s) => s,
        Err(_) => return false,
    };
    if NEVER.iter().any(|p| s.starts_with(*p)) {
        return false;
    }

    let prefixed_index = INDEX_PREFIXES
        .iter()
        .filter_map(|p| s.strip_prefix(*p))
        .any(|rest| !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit()));
    if prefixed_index {
        return true;
    }

    // Generic shape: a short alphabetic tag followed by a run of digits.
    let raw = s.as_bytes();
    let tag_len = raw
        .iter()
        .position(|b| !b.is_ascii_alphabetic())
        .unwrap_or(raw.len());
    let (tag, tail) = raw.split_at(tag_len);
    (1..=3).contains(&tag.len()) && tail.len() >= 3 && tail.iter().all(|b| b.is_ascii_digit())
}
334
335fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
336 let descr_owner = if two_byte {
338 fdict
339 .get(b"DescendantFonts")
340 .ok()
341 .and_then(|o| deref(doc, o))
342 .and_then(|o| match o {
343 Object::Array(a) => a.first(),
344 _ => None,
345 })
346 .and_then(|o| as_dict(doc, o))
347 } else {
348 Some(fdict)
349 };
350 let fd = descr_owner
351 .and_then(|d| d.get(b"FontDescriptor").ok())
352 .and_then(|o| as_dict(doc, o));
353 let asc = fd
354 .and_then(|d| d.get(b"Ascent").ok())
355 .and_then(|o| {
356 o.as_float()
357 .ok()
358 .or_else(|| o.as_i64().ok().map(|i| i as f32))
359 })
360 .unwrap_or(750.0) as f64;
361 let desc = fd
362 .and_then(|d| d.get(b"Descent").ok())
363 .and_then(|o| {
364 o.as_float()
365 .ok()
366 .or_else(|| o.as_i64().ok().map(|i| i as f32))
367 })
368 .unwrap_or(-250.0) as f64;
369 if asc - desc <= 1.0 {
376 return (750.0, -250.0);
377 }
378 (asc, desc)
379}
380
381fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
383 let mut map = HashMap::new();
384 let first = fdict
385 .get(b"FirstChar")
386 .ok()
387 .and_then(|o| o.as_i64().ok())
388 .unwrap_or(0) as u32;
389 if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
390 for (i, w) in arr.iter().enumerate() {
391 if let Some(w) = num(w) {
392 map.insert(first + i as u32, w);
393 }
394 }
395 }
396 let dw = fdict
397 .get(b"FontDescriptor")
398 .ok()
399 .and_then(|o| as_dict(doc, o))
400 .and_then(|d| d.get(b"MissingWidth").ok())
401 .and_then(num)
402 .unwrap_or(0.0);
403 (map, dw)
404}
405
406fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
408 let mut map = HashMap::new();
409 let Some(desc) = fdict
410 .get(b"DescendantFonts")
411 .ok()
412 .and_then(|o| deref(doc, o))
413 .and_then(|o| match o {
414 Object::Array(a) => a.first(),
415 _ => None,
416 })
417 .and_then(|o| as_dict(doc, o))
418 else {
419 return (map, 1000.0);
420 };
421 let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
422 if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
423 let mut i = 0;
424 while i < w.len() {
425 let c = w.get(i).and_then(num);
426 match (c, w.get(i + 1)) {
427 (Some(c), Some(Object::Array(list))) => {
429 for (k, wv) in list.iter().enumerate() {
430 if let Some(wv) = num(wv) {
431 map.insert(c as u32 + k as u32, wv);
432 }
433 }
434 i += 2;
435 }
436 (Some(c1), Some(o2)) => {
438 if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
439 for cid in c1 as u32..=c2 as u32 {
440 map.insert(cid, wv);
441 }
442 }
443 i += 3;
444 }
445 _ => break,
446 }
447 }
448 }
449 (map, dw)
450}
451
452fn num(o: &Object) -> Option<f64> {
453 match o {
454 Object::Integer(i) => Some(*i as f64),
455 Object::Real(r) => Some(*r as f64),
456 _ => None,
457 }
458}
459
/// Parses a `ToUnicode` CMap stream into a map from character code to text.
///
/// Only the `beginbfchar … endbfchar` and `beginbfrange … endbfrange`
/// sections are interpreted; everything else is skipped. Source codes are the
/// big-endian value of the hex string, destinations are decoded as UTF-16BE
/// (lossily).
///
/// For a `bfrange` of the form `<lo> <hi> <dst>`:
/// * a destination of at most one UTF-16 unit is incremented as a scalar
///   value, skipping values that are not valid characters;
/// * a longer destination (a ligature string or a surrogate pair) has its
///   last UTF-16 unit incremented for each successive code, instead of being
///   dropped.
///
/// A single range contributes at most `MAX_RANGE_LEN` entries so that a
/// malformed range such as `<00000000> <FFFFFFFF>` cannot stall the parser.
/// The codes this file decodes are at most two bytes wide, so no reachable
/// entry is lost by the cap.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    /// Maximum number of codes expanded from one `<lo> <hi> <dst>` range.
    const MAX_RANGE_LEN: u32 = 0x1_0000;

    /// Decodes a `<hex>` token into 16-bit units (a trailing odd byte becomes
    /// its own unit). Returns `None` for tokens not wrapped in `<` `>`.
    fn hex_units(tok: &str) -> Option<Vec<u16>> {
        let digits = tok.trim().strip_prefix('<')?.strip_suffix('>')?;
        let bytes: Vec<u8> = (0..digits.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(digits.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| c.iter().fold(0u16, |acc, &b| (acc << 8) | u16::from(b)))
                .collect(),
        )
    }

    /// Folds the units of a source code into one big-endian integer.
    fn code_of(units: &[u16]) -> u32 {
        units.iter().fold(0u32, |acc, &x| (acc << 16) | u32::from(x))
    }

    /// Splits the stream into `<...>` hex strings, `[` / `]`, and bare words.
    fn tokenize(bytes: &[u8]) -> Vec<String> {
        let is_delim = |b: u8| b.is_ascii_whitespace() || matches!(b, b'<' | b'[' | b']');
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let start = i;
            match bytes[i] {
                b if b.is_ascii_whitespace() => {
                    i += 1;
                    continue;
                }
                b'<' => {
                    while i < bytes.len() && bytes[i] != b'>' {
                        i += 1;
                    }
                    // Include the closing '>' when there is one.
                    i = (i + 1).min(bytes.len());
                }
                b'[' | b']' => i += 1,
                _ => {
                    while i < bytes.len() && !is_delim(bytes[i]) {
                        i += 1;
                    }
                }
            }
            toks.push(String::from_utf8_lossy(&bytes[start..i]).into_owned());
        }
        toks
    }

    /// Expands one `<lo> <hi> <dst>` range into `map`.
    fn insert_range(map: &mut HashMap<u32, String>, lo: u32, hi: u32, dst: &[u16]) {
        if hi < lo {
            return;
        }
        let last_offset = (hi - lo).min(MAX_RANGE_LEN - 1);
        for k in 0..=last_offset {
            let text = match dst {
                [head @ .., last] if !head.is_empty() => {
                    // Multi-unit destination: bump the final UTF-16 unit.
                    if k > u32::from(u16::MAX) {
                        break;
                    }
                    let Some(bumped) = last.checked_add(k as u16) else {
                        break;
                    };
                    let mut units = head.to_vec();
                    units.push(bumped);
                    String::from_utf16_lossy(&units)
                }
                _ => {
                    // Zero or one unit: plain scalar arithmetic.
                    let base = dst.first().map_or(0, |&u| u32::from(u));
                    match base.checked_add(k).and_then(char::from_u32) {
                        Some(ch) => ch.to_string(),
                        None => continue,
                    }
                }
            };
            // `lo + k <= hi`, so this cannot overflow.
            map.insert(lo + k, text);
        }
    }

    let text = String::from_utf8_lossy(data);
    let owned_tokens = tokenize(text.as_bytes());
    let tokens: Vec<&str> = owned_tokens.iter().map(|s| s.as_str()).collect();

    let mut map = HashMap::new();
    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (hex_units(tokens[i]), hex_units(tokens[i + 1]))
                    {
                        map.insert(code_of(&src), String::from_utf16_lossy(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (hex_units(tokens[i]), hex_units(tokens[i + 1]))
                    else {
                        // Resynchronise one token at a time on garbage.
                        i += 1;
                        continue;
                    };
                    let lo = code_of(&lo);
                    let hi = code_of(&hi);
                    if tokens[i + 2] == "[" {
                        // `<lo> <hi> [<d0> <d1> ...]`: one destination per code.
                        let mut j = i + 3;
                        let mut code = lo;
                        while j < tokens.len() && tokens[j] != "]" {
                            if let Some(dst) = hex_units(tokens[j]) {
                                map.insert(code, String::from_utf16_lossy(&dst));
                            }
                            code = code.wrapping_add(1);
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = hex_units(tokens[i + 2]) {
                        insert_range(&mut map, lo, hi, &dst);
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
579
580fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
582 if font.two_byte {
583 bytes
584 .chunks(2)
585 .map(|c| {
586 if c.len() == 2 {
587 ((c[0] as u32) << 8) | c[1] as u32
588 } else {
589 c[0] as u32
590 }
591 })
592 .collect()
593 } else {
594 bytes.iter().map(|&b| b as u32).collect()
595 }
596}
597
598fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
600 let mb = doc
601 .get_object(page_id)
602 .ok()
603 .and_then(|o| o.as_dict().ok())
604 .and_then(|d| {
605 d.get(b"MediaBox").ok().cloned()
607 })
608 .or_else(|| {
609 doc.get_dictionary(page_id)
610 .ok()
611 .and_then(|d| d.get(b"MediaBox").ok().cloned())
612 });
613 if let Some(Object::Array(a)) = mb {
614 let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
615 if v.len() == 4 {
616 return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
617 }
618 }
619 (612.0, 792.0)
620}
621
622pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
625 let Ok(doc) = Document::load_mem(bytes) else {
626 return Vec::new();
627 };
628 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
629 pages.sort_by_key(|(n, _)| *n);
630 let Some((_, pid)) = pages.get(index) else {
631 return Vec::new();
632 };
633 page_glyphs(&doc, *pid)
634 .into_iter()
635 .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
636 .collect()
637}
638
639pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
643 let Ok(doc) = Document::load_mem(bytes) else {
644 return Vec::new();
645 };
646 let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
647 pages.sort_by_key(|(n, _)| *n);
648 pages
649 .into_iter()
650 .map(|(_, pid)| {
651 let (w, h) = page_size(&doc, pid);
652 let glyphs = page_glyphs(&doc, pid);
653 let cells = crate::dp_lines::line_cells(&glyphs, h, true);
654 (w, h, cells)
655 })
656 .collect()
657}
658
/// The text-state parameters tracked while interpreting a content stream.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (`Tc`).
    tc: f64,
    /// Word spacing (`Tw`).
    tw: f64,
    /// Horizontal scaling as a factor (`Tz` operand divided by 100).
    th: f64,
    /// Leading (`TL`).
    tl: f64,
    /// Text rise (`Ts`).
    trise: f64,
    /// Font size (second operand of `Tf`).
    fsize: f64,
}

impl TextState {
    /// Starting values: everything zero except a horizontal scale of 1.
    const INIT: Self = Self {
        tc: 0.0,
        tw: 0.0,
        th: 1.0,
        tl: 0.0,
        trise: 0.0,
        fsize: 0.0,
    };
}
682
683fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
686 let (inline, ids) = doc.get_page_resources(page_id).ok()?;
687 if let Some(d) = inline {
688 return Some(d);
689 }
690 ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
691}
692
693fn fonts_from_res(doc: &Document, res: &Dictionary) -> HashMap<Vec<u8>, Font> {
695 let mut map = HashMap::new();
696 let font_dict = res
697 .get(b"Font")
698 .ok()
699 .and_then(|o| deref(doc, o))
700 .and_then(|o| o.as_dict().ok());
701 if let Some(fd) = font_dict {
702 for (name, value) in fd.iter() {
703 if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
704 map.insert(name.clone(), parse_font(doc, name, fdict));
705 }
706 }
707 }
708 map
709}
710
711pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
713 let mut out = Vec::new();
714 let Ok(content_bytes) = doc.get_page_content(page_id) else {
715 return out;
716 };
717 let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
718 return out;
719 };
720 if let Some(res) = page_res(doc, page_id) {
721 run_content(doc, res, &content, Mat::ID, TextState::INIT, 0, &mut out);
722 }
723 out
724}
725
726fn run_content(
731 doc: &Document,
732 res: &Dictionary,
733 content: &lopdf::content::Content,
734 base_ctm: Mat,
735 init: TextState,
736 depth: u32,
737 out: &mut Vec<Glyph>,
738) {
739 let fonts = fonts_from_res(doc, res);
740 let xobjects = res
741 .get(b"XObject")
742 .ok()
743 .and_then(|o| deref(doc, o))
744 .and_then(|o| o.as_dict().ok());
745
746 #[allow(clippy::type_complexity)]
751 let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Font>)> = Vec::new();
752 let mut ctm = base_ctm;
753 let mut tm = Mat::ID;
754 let mut tlm = Mat::ID;
755 let mut font: Option<&Font> = None;
756 let mut fsize = init.fsize;
757 let mut tc = init.tc; let mut tw = init.tw; let mut th = init.th; let mut tl = init.tl; let mut trise = init.trise;
762
763 let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
764
765 for op in &content.operations {
766 let operands = &op.operands;
767 match op.operator.as_str() {
768 "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
769 "Q" => {
770 if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
771 ctm = c;
772 tc = a;
773 tw = b;
774 th = h;
775 tl = l;
776 trise = r;
777 fsize = fs;
778 font = f;
779 }
780 }
781 "cm" => {
782 let m = Mat {
783 a: op_f(operands, 0),
784 b: op_f(operands, 1),
785 c: op_f(operands, 2),
786 d: op_f(operands, 3),
787 e: op_f(operands, 4),
788 f: op_f(operands, 5),
789 };
790 ctm = m.then(ctm);
791 }
792 "BT" => {
793 tm = Mat::ID;
794 tlm = Mat::ID;
795 }
796 "ET" => {}
797 "Tf" => {
798 if let Some(Object::Name(n)) = operands.first() {
799 font = fonts.get(n.as_slice());
800 }
801 fsize = op_f(operands, 1);
802 }
803 "Td" => {
804 tlm = Mat {
805 a: 1.0,
806 b: 0.0,
807 c: 0.0,
808 d: 1.0,
809 e: op_f(operands, 0),
810 f: op_f(operands, 1),
811 }
812 .then(tlm);
813 tm = tlm;
814 }
815 "TD" => {
816 tl = -op_f(operands, 1);
817 tlm = Mat {
818 a: 1.0,
819 b: 0.0,
820 c: 0.0,
821 d: 1.0,
822 e: op_f(operands, 0),
823 f: op_f(operands, 1),
824 }
825 .then(tlm);
826 tm = tlm;
827 }
828 "Tm" => {
829 tlm = Mat {
830 a: op_f(operands, 0),
831 b: op_f(operands, 1),
832 c: op_f(operands, 2),
833 d: op_f(operands, 3),
834 e: op_f(operands, 4),
835 f: op_f(operands, 5),
836 };
837 tm = tlm;
838 }
839 "T*" => {
840 tlm = Mat {
841 a: 1.0,
842 b: 0.0,
843 c: 0.0,
844 d: 1.0,
845 e: 0.0,
846 f: -tl,
847 }
848 .then(tlm);
849 tm = tlm;
850 }
851 "Tc" => tc = op_f(operands, 0),
852 "Tw" => tw = op_f(operands, 0),
853 "Tz" => th = op_f(operands, 0) / 100.0,
854 "TL" => tl = op_f(operands, 0),
855 "Ts" => trise = op_f(operands, 0),
856 "Tj" | "'" | "\"" => {
857 if op.operator == "'" || op.operator == "\"" {
858 tlm = Mat {
860 a: 1.0,
861 b: 0.0,
862 c: 0.0,
863 d: 1.0,
864 e: 0.0,
865 f: -tl,
866 }
867 .then(tlm);
868 tm = tlm;
869 }
870 if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
871 show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
872 }
873 }
874 "TJ" => {
875 if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
876 for el in arr {
877 match el {
878 Object::String(s, _) => {
879 show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
880 }
881 other => {
882 if let Some(adj) = num(other) {
883 let tx = -adj / 1000.0 * fsize * th;
885 tm = Mat {
886 a: 1.0,
887 b: 0.0,
888 c: 0.0,
889 d: 1.0,
890 e: tx,
891 f: 0.0,
892 }
893 .then(tm);
894 }
895 }
896 }
897 }
898 }
899 }
900 "Do" => {
901 if depth >= 8 {
904 continue;
905 }
906 let Some(Object::Name(n)) = operands.first() else {
907 continue;
908 };
909 let stream = xobjects
910 .and_then(|d| d.get(n.as_slice()).ok())
911 .and_then(|o| deref(doc, o))
912 .and_then(|o| o.as_stream().ok());
913 let Some(stream) = stream else { continue };
914 let is_form = stream
915 .dict
916 .get(b"Subtype")
917 .ok()
918 .and_then(|o| o.as_name().ok())
919 == Some(b"Form".as_slice());
920 if !is_form {
921 continue;
922 }
923 let Ok(data) = stream.decompressed_content() else {
924 continue;
925 };
926 let Ok(form_content) = lopdf::content::Content::decode(&data) else {
927 continue;
928 };
929 let form_mat = match stream.dict.get(b"Matrix").ok() {
931 Some(Object::Array(a)) if a.len() == 6 => {
932 let v: Vec<f64> = a.iter().filter_map(num).collect();
933 if v.len() == 6 {
934 Mat {
935 a: v[0],
936 b: v[1],
937 c: v[2],
938 d: v[3],
939 e: v[4],
940 f: v[5],
941 }
942 } else {
943 Mat::ID
944 }
945 }
946 _ => Mat::ID,
947 };
948 let form_res = stream
950 .dict
951 .get(b"Resources")
952 .ok()
953 .and_then(|o| deref(doc, o))
954 .and_then(|o| o.as_dict().ok())
955 .unwrap_or(res);
956 let state = TextState {
957 tc,
958 tw,
959 th,
960 tl,
961 trise,
962 fsize,
963 };
964 run_content(
965 doc,
966 form_res,
967 &form_content,
968 form_mat.then(ctm),
969 state,
970 depth + 1,
971 out,
972 );
973 }
974 _ => {}
975 }
976 }
977}
978
979#[allow(clippy::too_many_arguments)]
980fn show_text(
981 font: &Font,
982 bytes: &[u8],
983 fsize: f64,
984 tc: f64,
985 tw: f64,
986 th: f64,
987 trise: f64,
988 tm: &mut Mat,
989 ctm: Mat,
990 out: &mut Vec<Glyph>,
991) {
992 for code in codes(font, bytes) {
993 let (text, w) = font.decode_code(code);
994 let w0 = w / 1000.0; let scale = Mat {
997 a: fsize * th,
998 b: 0.0,
999 c: 0.0,
1000 d: fsize,
1001 e: 0.0,
1002 f: trise,
1003 };
1004 let trm = scale.then(*tm).then(ctm);
1005 let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
1007 let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
1008 let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
1009 let (left, right) = (x0.min(x1), x0.max(x1));
1010 let (bot, top) = (y0.min(y2), y0.max(y2));
1011 if let Some(s) = text {
1012 for ch in s.chars() {
1014 if ch != '\u{0}' {
1015 out.push(Glyph {
1016 ch,
1017 l: left as f32,
1018 b: bot as f32,
1019 r: right as f32,
1020 t: top as f32,
1021 ll: left as f32,
1022 lb: bot as f32,
1023 lr: right as f32,
1024 lt: top as f32,
1025 font: font.hash,
1026 });
1027 }
1028 }
1029 }
1030 let is_space = !font.two_byte && code == 32;
1032 let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
1033 *tm = Mat {
1034 a: 1.0,
1035 b: 0.0,
1036 c: 0.0,
1037 d: 1.0,
1038 e: tx,
1039 f: 0.0,
1040 }
1041 .then(*tm);
1042 }
1043}
1044
1045fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
1049 let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
1050 let base_name = match enc {
1051 Some(Object::Name(n)) => n.clone(),
1052 Some(Object::Dictionary(d)) => d
1053 .get(b"BaseEncoding")
1054 .ok()
1055 .and_then(|o| o.as_name().ok())
1056 .map(|n| n.to_vec())
1057 .unwrap_or_default(),
1058 _ => Vec::new(),
1059 };
1060 let mut m = if base_name == b"MacRomanEncoding" {
1061 macroman_table()
1062 } else {
1063 winansi_table()
1064 };
1065 if let Some(Object::Dictionary(d)) = enc {
1067 if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
1068 let mut code = 0u8;
1069 for el in diffs {
1070 match el {
1071 Object::Integer(i) => code = *i as u8,
1072 Object::Name(name) => {
1073 if let Some(ch) = glyph_name_to_char(name) {
1074 m.insert(code, ch);
1075 }
1076 code = code.wrapping_add(1);
1077 }
1078 _ => {}
1079 }
1080 }
1081 }
1082 }
1083 m
1084}
1085
/// Resolves a glyph name to the character it stands for.
///
/// Resolution order:
/// 1. `uniXXXX` — the four hex digits after `uni` give the code point. A
///    name that merely starts with `uni` (for example `union`) falls through
///    to the later steps instead of being rejected.
/// 2. A single ASCII letter names itself.
/// 3. A fixed table of common punctuation, ligature, Greek and math names.
/// 4. For `base.suffix` names (such as `a.sc`), the part before the first
///    `.` is resolved recursively.
///
/// Returns `None` for non-UTF-8 input and for names that no step recognises.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let s = std::str::from_utf8(name).ok()?;
    if let Some(hex) = s.strip_prefix("uni") {
        // Only a full group of four hex digits counts; a sign accepted by
        // `from_str_radix` (e.g. "+041") is not a valid code point spelling.
        let code_point = hex
            .get(0..4)
            .filter(|digits| digits.bytes().all(|b| b.is_ascii_hexdigit()))
            .and_then(|digits| u32::from_str_radix(digits, 16).ok());
        if let Some(cp) = code_point {
            return char::from_u32(cp);
        }
    }
    if s.len() == 1 {
        let b = s.as_bytes()[0];
        if b.is_ascii_alphabetic() {
            return Some(b as char);
        }
    }
    let resolved = match s {
        "space" => ' ',
        "exclam" => '!',
        "quotedbl" => '"',
        "numbersign" => '#',
        "dollar" => '$',
        "percent" => '%',
        "ampersand" => '&',
        "quotesingle" => '\'',
        "parenleft" => '(',
        "parenright" => ')',
        "asterisk" => '*',
        "plus" => '+',
        "comma" => ',',
        "hyphen" => '-',
        "period" => '.',
        "slash" => '/',
        "zero" => '0',
        "one" => '1',
        "two" => '2',
        "three" => '3',
        "four" => '4',
        "five" => '5',
        "six" => '6',
        "seven" => '7',
        "eight" => '8',
        "nine" => '9',
        "colon" => ':',
        "semicolon" => ';',
        "less" => '<',
        "equal" => '=',
        "greater" => '>',
        "question" => '?',
        "at" => '@',
        "bracketleft" => '[',
        "backslash" => '\\',
        "bracketright" => ']',
        "asciicircum" => '^',
        "underscore" => '_',
        "grave" => '`',
        "braceleft" => '{',
        "bar" => '|',
        "braceright" => '}',
        "asciitilde" => '~',
        "bullet" => '\u{2022}',
        "periodcentered" => '\u{00B7}',
        "endash" => '\u{2013}',
        "emdash" => '\u{2014}',
        "quoteright" => '\u{2019}',
        "quoteleft" => '\u{2018}',
        "quotedblleft" => '\u{201C}',
        "quotedblright" => '\u{201D}',
        "quotedblbase" => '\u{201E}',
        "quotesinglbase" => '\u{201A}',
        "ff" => '\u{FB00}',
        "fi" => '\u{FB01}',
        "fl" => '\u{FB02}',
        "ffi" => '\u{FB03}',
        "ffl" => '\u{FB04}',
        "ft" => '\u{FB05}',
        "st" => '\u{FB06}',
        "degree" => '\u{00B0}',
        "trademark" => '\u{2122}',
        "registered" => '\u{00AE}',
        "copyright" => '\u{00A9}',
        "ellipsis" => '\u{2026}',
        "minus" => '\u{2212}',
        "fraction" => '\u{2044}',
        "nbspace" => '\u{00A0}',
        "alpha" => '\u{03B1}',
        "beta" => '\u{03B2}',
        "gamma" => '\u{03B3}',
        "delta" => '\u{03B4}',
        "epsilon" | "epsilon1" => '\u{03B5}',
        "zeta" => '\u{03B6}',
        "eta" => '\u{03B7}',
        "theta" | "theta1" => '\u{03B8}',
        "iota" => '\u{03B9}',
        "kappa" => '\u{03BA}',
        "lambda" => '\u{03BB}',
        "mu" => '\u{03BC}',
        "nu" => '\u{03BD}',
        "xi" => '\u{03BE}',
        "omicron" => '\u{03BF}',
        "pi" | "pi1" => '\u{03C0}',
        "rho" | "rho1" => '\u{03C1}',
        "sigma" => '\u{03C3}',
        "sigma1" => '\u{03C2}',
        "tau" => '\u{03C4}',
        "upsilon" => '\u{03C5}',
        "phi" | "phi1" => '\u{03C6}',
        "chi" => '\u{03C7}',
        "psi" => '\u{03C8}',
        "omega" | "omega1" => '\u{03C9}',
        "Gamma" => '\u{0393}',
        "Delta" => '\u{0394}',
        "Theta" => '\u{0398}',
        "Lambda" => '\u{039B}',
        "Xi" => '\u{039E}',
        "Pi" => '\u{03A0}',
        "Sigma" => '\u{03A3}',
        "Upsilon" => '\u{03A5}',
        "Phi" => '\u{03A6}',
        "Psi" => '\u{03A8}',
        "Omega" => '\u{03A9}',
        "lessequal" => '\u{2264}',
        "greaterequal" => '\u{2265}',
        "notequal" => '\u{2260}',
        "approxequal" => '\u{2248}',
        "equivalence" => '\u{2261}',
        "element" => '\u{2208}',
        "plusminus" => '\u{00B1}',
        "multiply" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "infinity" => '\u{221E}',
        "partialdiff" => '\u{2202}',
        "gradient" => '\u{2207}',
        "summation" => '\u{2211}',
        "product" => '\u{220F}',
        "integral" => '\u{222B}',
        "radical" => '\u{221A}',
        "proportional" => '\u{221D}',
        "arrowright" => '\u{2192}',
        "arrowleft" => '\u{2190}',
        "arrowup" => '\u{2191}',
        "arrowdown" => '\u{2193}',
        "arrowboth" => '\u{2194}',
        "arrowdblright" => '\u{21D2}',
        "logicaland" => '\u{2227}',
        "logicalor" => '\u{2228}',
        "intersection" => '\u{2229}',
        "union" => '\u{222A}',
        "similar" => '\u{223C}',
        "congruent" => '\u{2245}',
        "dotmath" => '\u{22C5}',
        "asteriskmath" => '\u{2217}',
        _ => {
            // Strip a variant suffix ("a.sc" -> "a") and try again; a name
            // that starts with '.' (such as ".notdef") has no base to try.
            return match s.split_once('.') {
                Some((base, _)) if !base.is_empty() => glyph_name_to_char(base.as_bytes()),
                _ => None,
            };
        }
    };
    Some(resolved)
}
1260
/// Byte -> character table used for WinAnsi-encoded fonts.
///
/// Covers printable ASCII (0x20..=0x7E), a handful of typographic characters
/// in the 0x85..=0x97 range plus the no-break space at 0xA0, and maps
/// 0xA1..=0xFF to the code point of the same value.
fn winansi_table() -> HashMap<u8, char> {
    const TYPOGRAPHIC: [(u8, char); 9] = [
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x85, '\u{2026}'),
        (0xA0, '\u{00A0}'),
    ];

    let ascii = (0x20u8..=0x7e).map(|b| (b, char::from(b)));
    // None of the typographic codes fall in 0xA1..=0xFF, so no entry of the
    // final range can override one of them.
    let latin1 = (0xA1u8..=0xFF).map(|b| (b, char::from(b)));

    ascii
        .chain(TYPOGRAPHIC.iter().copied())
        .chain(latin1)
        .collect()
}
1287
/// Byte -> character table used for MacRoman-encoded fonts.
///
/// Covers printable ASCII (0x20..=0x7E) plus a small set of high-range
/// punctuation and ligature codes; all other bytes are left unmapped.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH: [(u8, char); 11] = [
        (0xA5, '\u{2022}'),
        (0xD0, '\u{2013}'),
        (0xD1, '\u{2014}'),
        (0xD2, '\u{201C}'),
        (0xD3, '\u{201D}'),
        (0xD4, '\u{2018}'),
        (0xD5, '\u{2019}'),
        (0xCA, '\u{00A0}'),
        (0xC9, '\u{2026}'),
        (0xDE, '\u{FB01}'),
        (0xDF, '\u{FB02}'),
    ];

    let mut table: HashMap<u8, char> = (0x20u8..=0x7e).map(|b| (b, char::from(b))).collect();
    table.extend(HIGH.iter().copied());
    table
}