1use std::collections::HashMap;
4use std::sync::LazyLock;
5
6static AGL_DATA: &str = include_str!("data/glyphlist.txt");
8
9static AGL_MAP: LazyLock<HashMap<&'static str, String>> = LazyLock::new(|| {
11 let mut map = HashMap::with_capacity(4300);
12 for line in AGL_DATA.lines() {
13 let line = line.trim();
14 if line.is_empty() || line.starts_with('#') {
15 continue;
16 }
17 if let Some((name, hex_part)) = line.split_once(';') {
18 let mut s = String::new();
20 for hex_str in hex_part.split_whitespace() {
21 if let Ok(cp) = u32::from_str_radix(hex_str, 16) {
22 if let Some(c) = char::from_u32(cp) {
23 s.push(c);
24 }
25 }
26 }
27 if !s.is_empty() {
28 map.insert(name, s);
29 }
30 }
31 }
32 map
33});
34
/// Supplementary glyph-name table consulted after the main glyph list.
///
/// Maps additional glyph names (math operators, arrows, accents and similar
/// symbols) to the Unicode text they should decode to.
static TEX_GLYPH_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        ("asteriskmath", "\u{2217}"),
        ("diamondmath", "\u{22C4}"),
        ("minusplus", "\u{2213}"),
        ("circleminus", "\u{2296}"),
        ("circledivide", "\u{2298}"),
        ("circledot", "\u{2299}"),
        ("circlecopyrt", "\u{00A9}"),
        ("equivasymptotic", "\u{224D}"),
        ("precedesequal", "\u{227C}"),
        ("followsequal", "\u{227D}"),
        ("similarequal", "\u{2243}"),
        ("lessmuch", "\u{226A}"),
        ("greatermuch", "\u{226B}"),
        ("follows", "\u{227B}"),
        ("arrownortheast", "\u{2197}"),
        ("arrowsoutheast", "\u{2198}"),
        ("arrownorthwest", "\u{2196}"),
        ("arrowsouthwest", "\u{2199}"),
        ("negationslash", "\u{0338}"),
        ("owner", "\u{220B}"),
        ("triangleinv", "\u{25BD}"),
        ("latticetop", "\u{22A4}"),
        ("tie", "\u{2040}"),
        ("dotlessj", "\u{0237}"),
        ("vector", "\u{20D7}"),
        ("bardbl", "\u{2016}"),
        ("mapsto", "\u{21A6}"),
        ("lscript", "\u{2113}"),
        ("weierstrass", "\u{2118}"),
        ("visiblespace", "\u{2423}"),
    ])
});
74
75#[derive(Debug, Clone)]
77pub struct PdfFont {
78 pub name: String,
80 pub base_font: String,
82 pub subtype: String,
84 pub widths: HashMap<u32, f64>,
86 pub default_width: f64,
88 pub to_unicode: HashMap<u32, String>,
90 pub encoding: String,
92 pub is_standard: bool,
94 pub flags: u32,
96 pub italic_angle: f64,
98 pub weight: f64,
100 pub bytes_per_code: u8,
102 pub ascent: f64,
104 pub descent: f64,
106 pub font_bbox: [f64; 4],
108}
109
110impl PdfFont {
111 pub fn default_font(name: &str) -> Self {
113 Self {
114 name: name.to_string(),
115 base_font: "Unknown".to_string(),
116 subtype: "Type1".to_string(),
117 widths: HashMap::new(),
118 default_width: 600.0, to_unicode: HashMap::new(),
120 encoding: "WinAnsiEncoding".to_string(),
121 is_standard: false,
122 flags: 0,
123 italic_angle: 0.0,
124 weight: 400.0,
125 bytes_per_code: 1,
126 ascent: 800.0,
127 descent: -200.0,
128 font_bbox: [0.0, -200.0, 1000.0, 800.0],
129 }
130 }
131
132 pub fn glyph_width(&self, char_code: u32) -> f64 {
134 *self.widths.get(&char_code).unwrap_or(&self.default_width)
135 }
136
137 pub fn decode_char(&self, char_code: u32) -> String {
139 if let Some(unicode) = self.to_unicode.get(&char_code) {
140 return unicode.clone();
141 }
142 match self.encoding.as_str() {
144 "MacRomanEncoding" => decode_macroman(char_code),
145 _ => decode_winansi(char_code),
146 }
147 }
148
149 pub fn is_bold(&self) -> bool {
151 let name_lower = self.base_font.to_lowercase();
152 name_lower.contains("bold")
153 || name_lower.contains("black")
154 || name_lower.contains("heavy")
155 || (self.flags & 0x40000) != 0
156 || self.weight >= 700.0
157 }
158
159 pub fn is_italic(&self) -> bool {
161 let name_lower = self.base_font.to_lowercase();
162 name_lower.contains("italic")
163 || name_lower.contains("oblique")
164 || (self.flags & 0x40) != 0
165 || self.italic_angle.abs() > 0.5
166 }
167}
168
169#[derive(Debug, Default, Clone)]
171pub struct FontCache {
172 fonts: HashMap<String, PdfFont>,
173}
174
175impl FontCache {
176 pub fn get_or_default(&mut self, name: &str) -> &PdfFont {
178 if !self.fonts.contains_key(name) {
179 self.fonts
180 .insert(name.to_string(), PdfFont::default_font(name));
181 }
182 &self.fonts[name]
183 }
184
185 pub fn insert(&mut self, name: String, font: PdfFont) {
187 self.fonts.insert(name, font);
188 }
189
190 pub fn get(&self, name: &str) -> Option<&PdfFont> {
192 self.fonts.get(name)
193 }
194
195 pub fn iter(&self) -> impl Iterator<Item = (&String, &PdfFont)> {
197 self.fonts.iter()
198 }
199}
200
201pub fn resolve_page_fonts(doc: &lopdf::Document, page_id: lopdf::ObjectId) -> FontCache {
203 let mut cache = FontCache::default();
204
205 let page_dict = match doc.get_object(page_id).and_then(|o| o.as_dict()) {
207 Ok(d) => d,
208 Err(_) => return cache,
209 };
210
211 let resources = match page_dict.get(b"Resources") {
213 Ok(r) => resolve_object(doc, r),
214 Err(_) => return cache,
215 };
216
217 let resources_dict = match resources.as_dict() {
218 Ok(d) => d,
219 Err(_) => return cache,
220 };
221
222 let font_dict = match resources_dict.get(b"Font") {
224 Ok(f) => resolve_object(doc, f),
225 Err(_) => return cache,
226 };
227
228 let font_dict = match font_dict.as_dict() {
229 Ok(d) => d,
230 Err(_) => return cache,
231 };
232
233 for (name_bytes, font_ref) in font_dict.iter() {
235 let name = String::from_utf8_lossy(name_bytes).to_string();
236 let font_obj = resolve_object(doc, font_ref);
237
238 if let Ok(fd) = font_obj.as_dict() {
239 let pdf_font = resolve_font_dict(doc, &name, fd);
240 cache.insert(name, pdf_font);
241 }
242 }
243
244 cache
245}
246
247pub(crate) fn resolve_font_dict(
249 doc: &lopdf::Document,
250 name: &str,
251 dict: &lopdf::Dictionary,
252) -> PdfFont {
253 let base_font = dict
254 .get(b"BaseFont")
255 .ok()
256 .and_then(|o| {
257 if let lopdf::Object::Name(n) = o {
258 Some(String::from_utf8_lossy(n).to_string())
259 } else {
260 None
261 }
262 })
263 .unwrap_or_else(|| "Unknown".to_string());
264
265 let subtype = dict
266 .get(b"Subtype")
267 .ok()
268 .and_then(|o| {
269 if let lopdf::Object::Name(n) = o {
270 Some(String::from_utf8_lossy(n).to_string())
271 } else {
272 None
273 }
274 })
275 .unwrap_or_else(|| "Type1".to_string());
276
277 let encoding = dict
278 .get(b"Encoding")
279 .ok()
280 .and_then(|o| {
281 let resolved = resolve_object(doc, o);
282 match resolved {
283 lopdf::Object::Name(n) => Some(String::from_utf8_lossy(&n).to_string()),
284 lopdf::Object::Dictionary(ref d) => {
285 d.get(b"BaseEncoding").ok().and_then(|be| {
287 if let lopdf::Object::Name(n) = be {
288 Some(String::from_utf8_lossy(n).to_string())
289 } else {
290 None
291 }
292 })
293 }
294 _ => None,
295 }
296 })
297 .unwrap_or_else(|| "WinAnsiEncoding".to_string());
298
299 let is_standard = is_standard_font(&base_font);
300 let mut default_width = if is_standard {
301 standard_font_default_width(&base_font)
302 } else {
303 1000.0
304 };
305
306 let is_type0 = subtype == "Type0";
308 let bytes_per_code: u8 = if is_type0 { 2 } else { 1 };
309
310 let mut widths = resolve_widths(doc, dict);
312
313 let mut to_unicode = resolve_tounicode(doc, dict);
315
316 let mut flags = 0u32;
318 let mut italic_angle = 0.0f64;
319 let mut weight = 400.0f64;
320 let mut ascent = 800.0f64;
321 let mut descent = -200.0f64;
322 let mut font_bbox = [0.0f64, -200.0, 1000.0, 800.0];
323
324 if is_type0 {
325 if let Ok(desc_ref) = dict.get(b"DescendantFonts") {
326 let desc_obj = resolve_object(doc, desc_ref);
327 if let Ok(desc_arr) = desc_obj.as_array() {
328 if let Some(first) = desc_arr.first() {
329 let desc_font_obj = resolve_object(doc, first);
330 if let Ok(desc_dict) = desc_font_obj.as_dict() {
331 if let Ok(dw) = desc_dict.get(b"DW") {
333 if let Some(dw_val) = obj_to_f64(resolve_object(doc, dw)) {
334 default_width = dw_val;
335 }
336 }
337 resolve_cid_widths(doc, desc_dict, &mut widths);
339 let (f, ia, w, a, d, fb) = resolve_font_descriptor(doc, desc_dict);
341 flags = f;
342 italic_angle = ia;
343 weight = w;
344 ascent = a;
345 descent = d;
346 font_bbox = fb;
347 }
348 }
349 }
350 }
351 } else {
352 resolve_encoding_differences(doc, dict, &mut to_unicode);
354 if subtype == "Type1" {
359 resolve_type1_font_program_encoding(doc, dict, &mut to_unicode);
360 }
361 let (f, ia, w, a, d, fb) = resolve_font_descriptor(doc, dict);
363 flags = f;
364 italic_angle = ia;
365 weight = w;
366 ascent = a;
367 descent = d;
368 font_bbox = fb;
369 }
370
371 let name_lower = base_font.to_lowercase();
375 let is_name_bold =
376 name_lower.contains("bold") || name_lower.contains("black") || name_lower.contains("heavy");
377 if is_name_bold && weight < 700.0 {
378 weight = 700.0;
379 }
380
381 PdfFont {
382 name: name.to_string(),
383 base_font,
384 subtype,
385 widths,
386 default_width,
387 to_unicode,
388 encoding,
389 is_standard,
390 flags,
391 italic_angle,
392 weight,
393 bytes_per_code,
394 ascent,
395 descent,
396 font_bbox,
397 }
398}
399
400fn resolve_widths(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> HashMap<u32, f64> {
402 let mut widths = HashMap::new();
403
404 let first_char = dict
405 .get(b"FirstChar")
406 .ok()
407 .and_then(|o| obj_to_i64(resolve_object(doc, o)))
408 .unwrap_or(0) as u32;
409
410 if let Ok(widths_ref) = dict.get(b"Widths") {
411 let widths_obj = resolve_object(doc, widths_ref);
412 if let Ok(arr) = widths_obj.as_array() {
413 for (i, w) in arr.iter().enumerate() {
414 if let Some(width) = obj_to_f64(resolve_object(doc, w)) {
415 widths.insert(first_char + i as u32, width);
416 }
417 }
418 }
419 }
420
421 widths
422}
423
424fn resolve_cid_widths(
431 doc: &lopdf::Document,
432 dict: &lopdf::Dictionary,
433 widths: &mut HashMap<u32, f64>,
434) {
435 let w_obj = match dict.get(b"W") {
436 Ok(o) => resolve_object(doc, o),
437 Err(_) => return,
438 };
439 let w_arr = match w_obj.as_array() {
440 Ok(a) => a,
441 Err(_) => return,
442 };
443
444 let mut i = 0;
445 while i < w_arr.len() {
446 let first = resolve_object(doc, &w_arr[i]);
447 if let Some(cid_start) = obj_to_i64(first) {
448 let cid_start = cid_start as u32;
449 i += 1;
450 if i >= w_arr.len() {
451 break;
452 }
453 let next = resolve_object(doc, &w_arr[i]);
454 if let Ok(arr) = next.as_array() {
455 for (j, w) in arr.iter().enumerate() {
457 if let Some(width) = obj_to_f64(resolve_object(doc, w)) {
458 widths.insert(cid_start + j as u32, width);
459 }
460 }
461 i += 1;
462 } else if let Some(cid_end) = obj_to_i64(next) {
463 let cid_end = cid_end as u32;
465 i += 1;
466 if i >= w_arr.len() {
467 break;
468 }
469 let w_val = resolve_object(doc, &w_arr[i]);
470 if let Some(width) = obj_to_f64(w_val) {
471 for cid in cid_start..=cid_end {
472 widths.insert(cid, width);
473 }
474 }
475 i += 1;
476 } else {
477 i += 1;
478 }
479 } else {
480 i += 1;
481 }
482 }
483}
484
485fn resolve_tounicode(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> HashMap<u32, String> {
487 let mut mapping = HashMap::new();
488
489 if let Ok(tounicode_ref) = dict.get(b"ToUnicode") {
490 let tounicode_ref = match tounicode_ref {
491 lopdf::Object::Reference(r) => *r,
492 _ => return mapping,
493 };
494
495 if let Ok(stream) = doc.get_object(tounicode_ref) {
496 if let Ok(stream) = stream.as_stream() {
497 if let Ok(data) = stream.decompressed_content() {
498 parse_cmap(&data, &mut mapping);
499 }
500 }
501 }
502 }
503
504 mapping
505}
506
507fn resolve_encoding_differences(
512 doc: &lopdf::Document,
513 dict: &lopdf::Dictionary,
514 to_unicode: &mut HashMap<u32, String>,
515) {
516 let enc_obj = match dict.get(b"Encoding") {
517 Ok(o) => resolve_object(doc, o),
518 Err(_) => return,
519 };
520
521 let enc_dict = match enc_obj.as_dict() {
522 Ok(d) => d,
523 Err(_) => return, };
525
526 let diffs_obj = match enc_dict.get(b"Differences") {
527 Ok(o) => resolve_object(doc, o),
528 Err(_) => return,
529 };
530
531 let diffs = match diffs_obj.as_array() {
532 Ok(a) => a,
533 Err(_) => return,
534 };
535
536 let mut current_code: u32 = 0;
537 for item in diffs {
538 let resolved = resolve_object(doc, item);
539 match resolved {
540 lopdf::Object::Integer(i) => {
541 current_code = i as u32;
542 }
543 lopdf::Object::Name(ref name_bytes) => {
544 let glyph_name = String::from_utf8_lossy(name_bytes).to_string();
545 if let std::collections::hash_map::Entry::Vacant(e) = to_unicode.entry(current_code)
547 {
548 if let Some(unicode) = glyph_name_to_unicode(&glyph_name) {
549 e.insert(unicode);
550 }
551 }
552 current_code += 1;
553 }
554 _ => {}
555 }
556 }
557}
558
559fn resolve_type1_font_program_encoding(
565 doc: &lopdf::Document,
566 dict: &lopdf::Dictionary,
567 to_unicode: &mut HashMap<u32, String>,
568) {
569 let fd_obj = match dict.get(b"FontDescriptor") {
571 Ok(o) => resolve_object(doc, o),
572 Err(_) => return,
573 };
574 let fd = match fd_obj.as_dict() {
575 Ok(d) => d,
576 Err(_) => return,
577 };
578
579 let stream_data = None
581 .or_else(|| get_font_file_data(doc, fd, b"FontFile"))
582 .or_else(|| get_font_file_data(doc, fd, b"FontFile3"));
583
584 let data = match stream_data {
585 Some(d) => d,
586 None => return,
587 };
588
589 let text = String::from_utf8_lossy(&data);
592
593 if text.contains("/Encoding StandardEncoding def") {
595 return; }
597
598 for line in text.lines() {
599 let trimmed = line.trim();
600 if !trimmed.starts_with("dup ") || !trimmed.ends_with(" put") {
602 continue;
603 }
604 let inner = &trimmed[4..trimmed.len() - 4].trim();
605 let parts: Vec<&str> = inner.splitn(2, ' ').collect();
607 if parts.len() != 2 {
608 continue;
609 }
610 let code: u32 = match parts[0].trim().parse() {
611 Ok(c) => c,
612 Err(_) => continue,
613 };
614 let glyph = parts[1].trim().trim_start_matches('/');
615 if glyph.is_empty() || glyph == ".notdef" {
616 continue;
617 }
618 if let std::collections::hash_map::Entry::Vacant(e) = to_unicode.entry(code) {
620 if let Some(unicode) = glyph_name_to_unicode(glyph) {
621 e.insert(unicode);
622 }
623 }
624 }
625}
626
627fn get_font_file_data(
629 doc: &lopdf::Document,
630 fd: &lopdf::Dictionary,
631 key: &[u8],
632) -> Option<Vec<u8>> {
633 let ff_ref = fd.get(key).ok()?;
634 let ff_id = match ff_ref {
635 lopdf::Object::Reference(r) => *r,
636 _ => return None,
637 };
638 let ff_obj = doc.get_object(ff_id).ok()?;
639 let stream = ff_obj.as_stream().ok()?;
640 stream.decompressed_content().ok()
641}
642
643fn glyph_name_to_unicode(name: &str) -> Option<String> {
649 match name {
651 "fi" => return Some("fi".to_string()),
652 "fl" => return Some("fl".to_string()),
653 "ff" => return Some("ff".to_string()),
654 "ffi" => return Some("ffi".to_string()),
655 "ffl" => return Some("ffl".to_string()),
656 "IJ" => return Some("IJ".to_string()),
657 "ij" => return Some("ij".to_string()),
658 _ => {}
659 }
660
661 if let Some(s) = resolve_glyph_component(name) {
663 return Some(s);
664 }
665
666 let base = if let Some(dot_pos) = name.find('.') {
669 &name[..dot_pos]
670 } else {
671 name
672 };
673
674 if base != name {
675 if let Some(s) = resolve_glyph_component(base) {
676 return Some(s);
677 }
678 }
679
680 if base.contains('_') {
683 let mut result = String::new();
684 for component in base.split('_') {
685 if let Some(s) = resolve_glyph_component(component) {
686 result.push_str(&s);
687 } else {
688 return None; }
690 }
691 if !result.is_empty() {
692 return Some(result);
693 }
694 }
695
696 None
697}
698
699fn resolve_glyph_component(name: &str) -> Option<String> {
701 if name.is_empty() {
702 return None;
703 }
704
705 if let Some(s) = AGL_MAP.get(name) {
707 return Some(s.clone());
708 }
709
710 if let Some(s) = TEX_GLYPH_MAP.get(name) {
712 return Some((*s).to_string());
713 }
714
715 if name.len() == 1 {
717 return Some(name.to_string());
718 }
719
720 if let Some(hex) = name.strip_prefix("uni") {
722 if hex.len() == 4 {
723 if let Ok(cp) = u32::from_str_radix(hex, 16) {
724 if let Some(c) = char::from_u32(cp) {
725 return Some(c.to_string());
726 }
727 }
728 }
729 if hex.len() > 4 && hex.len() % 4 == 0 {
731 let mut s = String::new();
732 for chunk in hex.as_bytes().chunks(4) {
733 if let Ok(h) = std::str::from_utf8(chunk) {
734 if let Ok(cp) = u32::from_str_radix(h, 16) {
735 if let Some(c) = char::from_u32(cp) {
736 s.push(c);
737 }
738 }
739 }
740 }
741 if !s.is_empty() {
742 return Some(s);
743 }
744 }
745 }
746
747 if let Some(hex) = name.strip_prefix('u') {
749 if (4..=6).contains(&hex.len()) && hex.chars().all(|c| c.is_ascii_hexdigit()) {
750 if let Ok(cp) = u32::from_str_radix(hex, 16) {
751 if let Some(c) = char::from_u32(cp) {
752 return Some(c.to_string());
753 }
754 }
755 }
756 }
757
758 None
759}
760
761fn parse_cmap(data: &[u8], mapping: &mut HashMap<u32, String>) {
763 let text = String::from_utf8_lossy(data);
764
765 let mut in_bfchar = false;
767 for line in text.lines() {
768 let trimmed = line.trim();
769 if trimmed.contains("beginbfchar") {
770 in_bfchar = true;
771 continue;
772 }
773 if trimmed.contains("endbfchar") {
774 in_bfchar = false;
775 continue;
776 }
777 if in_bfchar {
778 let parts: Vec<&str> = trimmed.split('>').collect();
780 if parts.len() >= 2 {
781 if let (Some(src), Some(dst)) =
782 (parse_hex_value(parts[0]), parse_hex_unicode(parts[1]))
783 {
784 mapping.insert(src, dst);
785 }
786 }
787 }
788 }
789
790 let mut in_bfrange = false;
795 for line in text.lines() {
796 let trimmed = line.trim();
797 if trimmed.contains("beginbfrange") {
798 in_bfrange = true;
799 continue;
800 }
801 if trimmed.contains("endbfrange") {
802 in_bfrange = false;
803 continue;
804 }
805 if in_bfrange {
806 if let Some(bracket_start) = trimmed.find('[') {
808 let before_bracket = &trimmed[..bracket_start];
810 let parts: Vec<&str> = before_bracket.split('>').collect();
811 if parts.len() >= 2 {
812 if let (Some(start), Some(end)) =
813 (parse_hex_value(parts[0]), parse_hex_value(parts[1]))
814 {
815 let bracket_end = trimmed.rfind(']').unwrap_or(trimmed.len());
817 let inside = &trimmed[bracket_start + 1..bracket_end];
818 let values: Vec<String> = inside
819 .split('>')
820 .filter_map(|s| {
821 let s = s.trim().trim_start_matches('<');
822 if s.is_empty() {
823 None
824 } else {
825 parse_hex_unicode_str(s)
826 }
827 })
828 .collect();
829 for (i, val) in values.iter().enumerate() {
830 let code = start + i as u32;
831 if code > end {
832 break;
833 }
834 mapping.insert(code, val.clone());
835 }
836 }
837 }
838 } else {
839 let parts: Vec<&str> = trimmed.split('>').collect();
841 if parts.len() >= 3 {
842 if let (Some(start), Some(end), Some(dst_start)) = (
843 parse_hex_value(parts[0]),
844 parse_hex_value(parts[1]),
845 parse_hex_value(parts[2]),
846 ) {
847 for code in start..=end {
848 let unicode_point = dst_start + (code - start);
849 if let Some(c) = char::from_u32(unicode_point) {
850 mapping.insert(code, c.to_string());
851 }
852 }
853 }
854 }
855 }
856 }
857 }
858}
859
/// Parses a CMap hex token such as `<0041` (closing `>` already removed) into
/// its numeric value.
///
/// Surrounding whitespace and any leading `<` characters are ignored.
/// Returns `None` for an empty token or one that is not valid hexadecimal
/// fitting in `u32`.
fn parse_hex_value(s: &str) -> Option<u32> {
    let digits = s.trim().trim_start_matches('<').trim();
    match digits {
        "" => None,
        hex => u32::from_str_radix(hex, 16).ok(),
    }
}
868
869fn parse_hex_unicode(s: &str) -> Option<String> {
871 let cleaned = s
872 .trim()
873 .trim_start_matches('<')
874 .trim_end_matches('>')
875 .trim();
876 parse_hex_unicode_str(cleaned)
877}
878
/// Decodes a run of hex digits as UTF-16BE text.
///
/// The input is consumed in groups of four hex digits (a shorter final group
/// is accepted), each giving one UTF-16 code unit. Surrogate pairs are
/// combined into the supplementary-plane character they encode, which is how
/// ToUnicode CMaps represent code points above U+FFFF. Groups that are not
/// valid hexadecimal and unpaired surrogates are skipped.
///
/// Returns `None` if the input is empty or no character could be decoded.
fn parse_hex_unicode_str(cleaned: &str) -> Option<String> {
    if cleaned.is_empty() {
        return None;
    }

    let units = cleaned.as_bytes().chunks(4).filter_map(|group| {
        let hex = std::str::from_utf8(group).ok()?;
        u16::from_str_radix(hex, 16).ok()
    });

    let decoded: String = char::decode_utf16(units)
        .filter_map(Result::ok)
        .collect();

    if decoded.is_empty() {
        None
    } else {
        Some(decoded)
    }
}
907
908fn resolve_font_descriptor(
910 doc: &lopdf::Document,
911 dict: &lopdf::Dictionary,
912) -> (u32, f64, f64, f64, f64, [f64; 4]) {
913 let mut flags = 0u32;
914 let mut italic_angle = 0.0f64;
915 let mut weight = 400.0f64;
916 let mut ascent = 800.0f64;
917 let mut descent = -200.0f64;
918 let mut font_bbox = [0.0f64, -200.0, 1000.0, 800.0];
919
920 if let Ok(fd_ref) = dict.get(b"FontDescriptor") {
921 let fd_obj = resolve_object(doc, fd_ref);
922 if let Ok(fd) = fd_obj.as_dict() {
923 flags = fd
924 .get(b"Flags")
925 .ok()
926 .and_then(|o| obj_to_i64(resolve_object(doc, o)))
927 .unwrap_or(0) as u32;
928
929 italic_angle = fd
930 .get(b"ItalicAngle")
931 .ok()
932 .and_then(|o| obj_to_f64(resolve_object(doc, o)))
933 .unwrap_or(0.0);
934
935 let stem_v = fd
937 .get(b"StemV")
938 .ok()
939 .and_then(|o| obj_to_f64(resolve_object(doc, o)))
940 .unwrap_or(0.0);
941
942 weight = if stem_v >= 140.0 {
943 700.0 } else if stem_v >= 100.0 {
945 500.0 } else {
947 400.0 };
949
950 if let Ok(bbox_ref) = fd.get(b"FontBBox") {
952 let bbox_obj = resolve_object(doc, bbox_ref);
953 if let Ok(bbox_arr) = bbox_obj.as_array() {
954 if bbox_arr.len() >= 4 {
955 let vals: Vec<f64> = bbox_arr
956 .iter()
957 .filter_map(|o| obj_to_f64(resolve_object(doc, o)))
958 .collect();
959 if vals.len() >= 4 {
960 font_bbox = [vals[0], vals[1], vals[2], vals[3]];
961 }
962 }
963 }
964 }
965
966 if let Ok(a_ref) = fd.get(b"Ascent") {
968 if let Some(a) = obj_to_f64(resolve_object(doc, a_ref)) {
969 ascent = a;
970 }
971 } else {
972 ascent = font_bbox[3]; }
974
975 if let Ok(d_ref) = fd.get(b"Descent") {
977 if let Some(d) = obj_to_f64(resolve_object(doc, d_ref)) {
978 descent = d;
979 }
980 } else {
981 descent = font_bbox[1]; }
983 }
984 }
985
986 (flags, italic_angle, weight, ascent, descent, font_bbox)
987}
988
989fn resolve_object<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> lopdf::Object {
991 match obj {
992 lopdf::Object::Reference(id) => doc.get_object(*id).cloned().unwrap_or(lopdf::Object::Null),
993 other => other.clone(),
994 }
995}
996
997fn obj_to_f64(obj: lopdf::Object) -> Option<f64> {
999 match obj {
1000 lopdf::Object::Integer(i) => Some(i as f64),
1001 lopdf::Object::Real(f) => Some(f),
1002 _ => None,
1003 }
1004}
1005
1006fn obj_to_i64(obj: lopdf::Object) -> Option<i64> {
1008 match obj {
1009 lopdf::Object::Integer(i) => Some(i),
1010 lopdf::Object::Real(f) => Some(f as i64),
1011 _ => None,
1012 }
1013}
1014
/// Reports whether `name` is exactly one of the fourteen standard PDF font
/// names (case-sensitive, no subset prefix or alias handling).
fn is_standard_font(name: &str) -> bool {
    const STANDARD_FONTS: [&str; 14] = [
        "Courier",
        "Courier-Bold",
        "Courier-Oblique",
        "Courier-BoldOblique",
        "Helvetica",
        "Helvetica-Bold",
        "Helvetica-Oblique",
        "Helvetica-BoldOblique",
        "Times-Roman",
        "Times-Bold",
        "Times-Italic",
        "Times-BoldItalic",
        "Symbol",
        "ZapfDingbats",
    ];

    STANDARD_FONTS.contains(&name)
}
1035
/// Fallback glyph width for a standard font: 600 for the Courier family
/// (any name starting with `Courier`), 500 for everything else.
fn standard_font_default_width(name: &str) -> f64 {
    match name.strip_prefix("Courier") {
        Some(_) => 600.0,
        None => 500.0,
    }
}
1045
/// Decodes a character code according to the PDF `MacRomanEncoding`.
///
/// Codes below 0x80 are ASCII. Codes 0x80..=0xFF are looked up in the table
/// below. Codes above 0xFF are interpreted as a Unicode code point directly,
/// yielding an empty string when the value is not a valid scalar.
///
/// Code 0xDB decodes to the currency sign (U+00A4): that is the character the
/// PDF encoding assigns to it, and the Euro sign is not part of
/// `MacRomanEncoding`.
fn decode_macroman(code: u32) -> String {
    /// Unicode characters for codes 0x80..=0xFF, indexed by `code - 0x80`.
    const HIGH_HALF: [char; 128] = [
        // 0x80
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}',
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}',
        // 0x90
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}',
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}',
        // 0xA0
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}',
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}',
        // 0xB0
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}',
        // 0xC0
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}',
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}',
        // 0xD0 (0xDB is the currency sign, see above)
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}',
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{00A4}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
        // 0xE0
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}',
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        // 0xF0
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}',
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}',
    ];

    let decoded = match code {
        0x80..=0xFF => Some(HIGH_HALF[(code - 0x80) as usize]),
        _ => char::from_u32(code),
    };

    decoded.map(String::from).unwrap_or_default()
}
1193
/// Decodes a character code according to `WinAnsiEncoding` (Windows-1252).
///
/// Codes in 0x80..=0x9F that the encoding assigns are remapped to their
/// Unicode equivalents. Every other code, including the unassigned ones in
/// that range, is interpreted as a Unicode code point directly. A value that
/// is not a valid Unicode scalar yields an empty string.
fn decode_winansi(code: u32) -> String {
    let scalar = match code {
        0x80 => 0x20AC,
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8E => 0x017D,
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9E => 0x017E,
        0x9F => 0x0178,
        // ASCII, Latin-1 (0xA0..=0xFF) and everything else map to themselves.
        unchanged => unchanged,
    };

    char::from_u32(scalar).map(String::from).unwrap_or_default()
}
1245
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(glyph name, expected text)` pair resolves as stated.
    fn assert_glyphs(cases: &[(&str, &str)]) {
        for (glyph, expected) in cases {
            assert_eq!(
                glyph_name_to_unicode(glyph),
                Some(expected.to_string()),
                "glyph name {glyph:?}"
            );
        }
    }

    /// Asserts that every `(code, expected text)` pair decodes as stated
    /// under WinAnsi.
    fn assert_winansi(cases: &[(u32, &str)]) {
        for (code, expected) in cases {
            assert_eq!(decode_winansi(*code), *expected, "code {code:#X}");
        }
    }

    /// Parses `cmap` and asserts that every `(code, expected text)` pair is
    /// present in the resulting mapping.
    fn assert_cmap(cmap: &[u8], cases: &[(u32, &str)]) {
        let mut mapping = HashMap::new();
        parse_cmap(cmap, &mut mapping);
        for (code, expected) in cases {
            assert_eq!(
                mapping.get(code),
                Some(&expected.to_string()),
                "code {code:#X}"
            );
        }
    }

    #[test]
    fn test_default_font() {
        let font = PdfFont::default_font("F1");
        assert_eq!(font.name, "F1");
        assert!((font.default_width - 600.0).abs() < 1e-10);
    }

    #[test]
    fn test_bold_detection() {
        let mut font = PdfFont::default_font("F1");

        font.base_font = "Helvetica-Bold".to_string();
        assert!(font.is_bold());

        font.base_font = "Helvetica".to_string();
        assert!(!font.is_bold());
    }

    #[test]
    fn test_decode_winansi() {
        assert_winansi(&[(65, "A"), (0x93, "\u{201C}")]);
    }

    #[test]
    fn test_parse_hex_value() {
        assert_eq!(parse_hex_value("<0041"), Some(0x41));
        assert_eq!(parse_hex_value("<00FF"), Some(0xFF));
    }

    #[test]
    fn test_standard_font() {
        assert!(is_standard_font("Helvetica"));
        assert!(is_standard_font("Courier-Bold"));
        assert!(!is_standard_font("ArialMT"));
    }

    #[test]
    fn test_glyph_name_to_unicode_ligatures() {
        // Ligature names decode to their plain letter sequence.
        assert_glyphs(&[
            ("fi", "fi"),
            ("fl", "fl"),
            ("ff", "ff"),
            ("ffi", "ffi"),
            ("ffl", "ffl"),
        ]);
    }

    #[test]
    fn test_glyph_name_to_unicode_common() {
        assert_glyphs(&[
            ("percent", "%"),
            ("ampersand", "&"),
            ("parenleft", "("),
            ("endash", "\u{2013}"),
            ("A", "A"),
            ("uni0041", "A"),
        ]);
    }

    #[test]
    fn test_glyph_name_to_unicode_unknown() {
        assert_eq!(glyph_name_to_unicode("nonexistent_glyph_xyz"), None);
    }

    #[test]
    fn test_glyph_name_to_unicode_agl_extended() {
        assert_glyphs(&[
            ("Dcroat", "\u{0110}"),
            ("dcroat", "\u{0111}"),
            ("Emacron", "\u{0112}"),
            ("afii10017", "\u{0410}"),
            ("afii57636", "\u{20AA}"),
            // One glyph name mapping to a sequence of two code points.
            ("dalethatafpatah", "\u{05D3}\u{05B2}"),
        ]);
    }

    #[test]
    fn test_glyph_name_to_unicode_uni_formats() {
        assert_glyphs(&[
            ("uni0041", "A"),
            ("uni00E9", "é"),
            ("uni00410042", "AB"),
            ("u1F600", "\u{1F600}"),
        ]);
    }

    #[test]
    fn test_parse_cmap_bfrange_array() {
        assert_cmap(
            b"beginbfrange\n<0001> <0003> [<0041> <0042> <0043>]\nendbfrange\n",
            &[(1, "A"), (2, "B"), (3, "C")],
        );
    }

    #[test]
    fn test_parse_cmap_bfrange_incremented() {
        assert_cmap(
            b"beginbfrange\n<0041> <0043> <0061>\nendbfrange\n",
            &[(0x41, "a"), (0x42, "b"), (0x43, "c")],
        );
    }

    #[test]
    fn test_decode_winansi_extended() {
        assert_winansi(&[
            (0x82, "\u{201A}"),
            (0x83, "\u{0192}"),
            (0x8A, "\u{0160}"),
            (0x8C, "\u{0152}"),
            (0x99, "\u{2122}"),
            (0x9C, "\u{0153}"),
            (0xA0, "\u{00A0}"),
            (0xE9, "\u{00E9}"),
        ]);
    }

    #[test]
    fn test_tex_glyph_names() {
        assert_glyphs(&[
            ("asteriskmath", "\u{2217}"),
            ("diamondmath", "\u{22C4}"),
            ("minusplus", "\u{2213}"),
            ("circleminus", "\u{2296}"),
            ("circledot", "\u{2299}"),
            ("follows", "\u{227B}"),
            ("lessmuch", "\u{226A}"),
            ("greatermuch", "\u{226B}"),
            ("latticetop", "\u{22A4}"),
            ("mapsto", "\u{21A6}"),
            ("dagger", "\u{2020}"),
            ("daggerdbl", "\u{2021}"),
            ("braceleft", "\u{007B}"),
            ("braceright", "\u{007D}"),
        ]);
    }
}