use std::collections::HashMap;
use std::sync::LazyLock;
/// Raw Adobe Glyph List (AGL) table, bundled at compile time and parsed
/// lazily into `AGL_MAP` on first use.
static AGL_DATA: &str = include_str!("data/glyphlist.txt");
/// Adobe Glyph List: glyph name -> Unicode string, built lazily from the
/// bundled `glyphlist.txt`. Each data line is `name;HEX [HEX ...]`; comment
/// lines start with `#`. Entries whose code points are all invalid are skipped.
static AGL_MAP: LazyLock<HashMap<&'static str, String>> = LazyLock::new(|| {
    let mut glyphs = HashMap::with_capacity(4300);
    for entry in AGL_DATA.lines().map(str::trim) {
        if entry.is_empty() || entry.starts_with('#') {
            continue;
        }
        let Some((glyph_name, codepoints)) = entry.split_once(';') else {
            continue;
        };
        // Decode the space-separated hex code points, dropping invalid ones.
        let decoded: String = codepoints
            .split_whitespace()
            .filter_map(|h| u32::from_str_radix(h, 16).ok())
            .filter_map(char::from_u32)
            .collect();
        if !decoded.is_empty() {
            glyphs.insert(glyph_name, decoded);
        }
    }
    glyphs
});
/// TeX-specific glyph names (not covered by the AGL) mapped to their Unicode
/// equivalents; built lazily from a fixed table.
static TEX_GLYPH_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    const ENTRIES: [(&str, &str); 30] = [
        ("asteriskmath", "\u{2217}"),
        ("diamondmath", "\u{22C4}"),
        ("minusplus", "\u{2213}"),
        ("circleminus", "\u{2296}"),
        ("circledivide", "\u{2298}"),
        ("circledot", "\u{2299}"),
        ("circlecopyrt", "\u{00A9}"),
        ("equivasymptotic", "\u{224D}"),
        ("precedesequal", "\u{227C}"),
        ("followsequal", "\u{227D}"),
        ("similarequal", "\u{2243}"),
        ("lessmuch", "\u{226A}"),
        ("greatermuch", "\u{226B}"),
        ("follows", "\u{227B}"),
        ("arrownortheast", "\u{2197}"),
        ("arrowsoutheast", "\u{2198}"),
        ("arrownorthwest", "\u{2196}"),
        ("arrowsouthwest", "\u{2199}"),
        ("negationslash", "\u{0338}"),
        ("owner", "\u{220B}"),
        ("triangleinv", "\u{25BD}"),
        ("latticetop", "\u{22A4}"),
        ("tie", "\u{2040}"),
        ("dotlessj", "\u{0237}"),
        ("vector", "\u{20D7}"),
        ("bardbl", "\u{2016}"),
        ("mapsto", "\u{21A6}"),
        ("lscript", "\u{2113}"),
        ("weierstrass", "\u{2118}"),
        ("visiblespace", "\u{2423}"),
    ];
    ENTRIES.into_iter().collect()
});
/// Metrics and text-decoding information for one font referenced by a page.
#[derive(Debug, Clone)]
pub struct PdfFont {
    /// Resource name used in content streams (e.g. "F1").
    pub name: String,
    /// PostScript name from /BaseFont ("Unknown" if absent).
    pub base_font: String,
    /// Font subtype from /Subtype (e.g. "Type1", "Type0").
    pub subtype: String,
    /// Per-character-code glyph widths as read from the PDF width arrays.
    pub widths: HashMap<u32, f64>,
    /// Width used for codes missing from `widths`.
    pub default_width: f64,
    /// Explicit char-code -> Unicode mapping (from ToUnicode / Differences).
    pub to_unicode: HashMap<u32, String>,
    /// Base encoding name used when `to_unicode` has no entry.
    pub encoding: String,
    /// Whether `base_font` is one of the built-in standard fonts.
    pub is_standard: bool,
    /// /Flags bitfield from the font descriptor.
    pub flags: u32,
    /// Italic angle from the font descriptor.
    pub italic_angle: f64,
    /// Estimated weight (400 regular, 700 bold).
    pub weight: f64,
    /// Code unit size: 2 for Type0 (CID) fonts, otherwise 1.
    pub bytes_per_code: u8,
    /// Ascent from the font descriptor (or a fallback).
    pub ascent: f64,
    /// Descent (typically negative) from the font descriptor (or a fallback).
    pub descent: f64,
    /// Font bounding box `[llx, lly, urx, ury]`.
    pub font_bbox: [f64; 4],
}
impl PdfFont {
    /// Build a conservative fallback font for `name`, used when a page's font
    /// dictionary cannot be resolved: WinAnsi-encoded Type1, 600-unit widths.
    pub fn default_font(name: &str) -> Self {
        Self {
            name: name.to_owned(),
            base_font: String::from("Unknown"),
            subtype: String::from("Type1"),
            widths: HashMap::new(),
            default_width: 600.0,
            to_unicode: HashMap::new(),
            encoding: String::from("WinAnsiEncoding"),
            is_standard: false,
            flags: 0,
            italic_angle: 0.0,
            weight: 400.0,
            bytes_per_code: 1,
            ascent: 800.0,
            descent: -200.0,
            font_bbox: [0.0, -200.0, 1000.0, 800.0],
        }
    }

    /// Width for `char_code`, falling back to `default_width` when unknown.
    pub fn glyph_width(&self, char_code: u32) -> f64 {
        self.widths
            .get(&char_code)
            .copied()
            .unwrap_or(self.default_width)
    }

    /// Decode one character code to a Unicode string: the explicit
    /// `to_unicode` mapping wins, then the declared base encoding.
    pub fn decode_char(&self, char_code: u32) -> String {
        match self.to_unicode.get(&char_code) {
            Some(mapped) => mapped.clone(),
            None if self.encoding.as_str() == "MacRomanEncoding" => decode_macroman(char_code),
            None => decode_winansi(char_code),
        }
    }

    /// Heuristic bold detection: name keywords, the ForceBold descriptor
    /// flag, or an estimated weight of 700+.
    pub fn is_bold(&self) -> bool {
        const FORCE_BOLD_FLAG: u32 = 0x40000;
        let lowered = self.base_font.to_lowercase();
        ["bold", "black", "heavy"]
            .iter()
            .any(|kw| lowered.contains(*kw))
            || self.flags & FORCE_BOLD_FLAG != 0
            || self.weight >= 700.0
    }

    /// Heuristic italic detection: name keywords, the Italic descriptor
    /// flag, or a non-negligible italic angle.
    pub fn is_italic(&self) -> bool {
        const ITALIC_FLAG: u32 = 0x40;
        let lowered = self.base_font.to_lowercase();
        lowered.contains("italic")
            || lowered.contains("oblique")
            || self.flags & ITALIC_FLAG != 0
            || self.italic_angle.abs() > 0.5
    }
}
/// Collection of fonts resolved for a single page, keyed by resource name.
#[derive(Debug, Default, Clone)]
pub struct FontCache {
    // Maps resource name (e.g. "F1") to its resolved font.
    fonts: HashMap<String, PdfFont>,
}
impl FontCache {
    /// Return the font registered under `name`, creating and caching a
    /// fallback font for it on first access.
    pub fn get_or_default(&mut self, name: &str) -> &PdfFont {
        self.fonts
            .entry(name.to_string())
            .or_insert_with(|| PdfFont::default_font(name))
    }

    /// Register `font` under `name`, replacing any previous entry.
    pub fn insert(&mut self, name: String, font: PdfFont) {
        self.fonts.insert(name, font);
    }

    /// Look up a font by resource name, if present.
    pub fn get(&self, name: &str) -> Option<&PdfFont> {
        self.fonts.get(name)
    }

    /// Iterate over all `(resource name, font)` pairs.
    pub fn iter(&self) -> impl Iterator<Item = (&String, &PdfFont)> {
        self.fonts.iter()
    }
}
/// Resolve every font in a page's /Resources /Font dictionary into a
/// `FontCache`. Any missing or malformed link in the chain yields the
/// (possibly empty) cache built so far.
pub fn resolve_page_fonts(doc: &lopdf::Document, page_id: lopdf::ObjectId) -> FontCache {
    let mut cache = FontCache::default();
    let Ok(page_dict) = doc.get_object(page_id).and_then(|o| o.as_dict()) else {
        return cache;
    };
    let Ok(resources_ref) = page_dict.get(b"Resources") else {
        return cache;
    };
    let resources = resolve_object(doc, resources_ref);
    let Ok(resources_dict) = resources.as_dict() else {
        return cache;
    };
    let Ok(fonts_ref) = resources_dict.get(b"Font") else {
        return cache;
    };
    let fonts = resolve_object(doc, fonts_ref);
    let Ok(fonts_dict) = fonts.as_dict() else {
        return cache;
    };
    for (name_bytes, font_ref) in fonts_dict.iter() {
        let name = String::from_utf8_lossy(name_bytes).to_string();
        let font_obj = resolve_object(doc, font_ref);
        if let Ok(fd) = font_obj.as_dict() {
            let font = resolve_font_dict(doc, &name, fd);
            cache.insert(name, font);
        }
    }
    cache
}
/// Build a `PdfFont` from a single font dictionary.
///
/// Resolves name/subtype/encoding, widths, the ToUnicode mapping, and
/// descriptor metrics. Type0 (CID) fonts pull widths and the descriptor from
/// the first descendant font; simple fonts additionally apply /Differences
/// and (for Type1) the embedded font program's encoding.
pub(crate) fn resolve_font_dict(
    doc: &lopdf::Document,
    name: &str,
    dict: &lopdf::Dictionary,
) -> PdfFont {
    // /BaseFont: PostScript name; "Unknown" when absent or not a Name.
    let base_font = dict
        .get(b"BaseFont")
        .ok()
        .and_then(|o| {
            if let lopdf::Object::Name(n) = o {
                Some(String::from_utf8_lossy(n).to_string())
            } else {
                None
            }
        })
        .unwrap_or_else(|| "Unknown".to_string());
    // /Subtype: defaults to "Type1" when absent.
    let subtype = dict
        .get(b"Subtype")
        .ok()
        .and_then(|o| {
            if let lopdf::Object::Name(n) = o {
                Some(String::from_utf8_lossy(n).to_string())
            } else {
                None
            }
        })
        .unwrap_or_else(|| "Type1".to_string());
    // /Encoding may be a bare name or a dictionary carrying /BaseEncoding;
    // either way fall back to WinAnsiEncoding.
    let encoding = dict
        .get(b"Encoding")
        .ok()
        .and_then(|o| {
            let resolved = resolve_object(doc, o);
            match resolved {
                lopdf::Object::Name(n) => Some(String::from_utf8_lossy(&n).to_string()),
                lopdf::Object::Dictionary(ref d) => {
                    d.get(b"BaseEncoding").ok().and_then(|be| {
                        if let lopdf::Object::Name(n) = be {
                            Some(String::from_utf8_lossy(n).to_string())
                        } else {
                            None
                        }
                    })
                }
                _ => None,
            }
        })
        .unwrap_or_else(|| "WinAnsiEncoding".to_string());
    let is_standard = is_standard_font(&base_font);
    // Standard fonts get a family-appropriate default width; others 1000.
    // May be overridden below by a CID font's /DW.
    let mut default_width = if is_standard {
        standard_font_default_width(&base_font)
    } else {
        1000.0
    };
    let is_type0 = subtype == "Type0";
    // Type0 fonts use 2-byte codes; simple fonts use single bytes.
    let bytes_per_code: u8 = if is_type0 { 2 } else { 1 };
    let mut widths = resolve_widths(doc, dict);
    let mut to_unicode = resolve_tounicode(doc, dict);
    // Descriptor metrics start at neutral defaults and are filled in below.
    let mut flags = 0u32;
    let mut italic_angle = 0.0f64;
    let mut weight = 400.0f64;
    let mut ascent = 800.0f64;
    let mut descent = -200.0f64;
    let mut font_bbox = [0.0f64, -200.0, 1000.0, 800.0];
    if is_type0 {
        // CID font: widths (/W, /DW) and the descriptor live on the first
        // entry of /DescendantFonts.
        if let Ok(desc_ref) = dict.get(b"DescendantFonts") {
            let desc_obj = resolve_object(doc, desc_ref);
            if let Ok(desc_arr) = desc_obj.as_array() {
                if let Some(first) = desc_arr.first() {
                    let desc_font_obj = resolve_object(doc, first);
                    if let Ok(desc_dict) = desc_font_obj.as_dict() {
                        if let Ok(dw) = desc_dict.get(b"DW") {
                            if let Some(dw_val) = obj_to_f64(resolve_object(doc, dw)) {
                                default_width = dw_val;
                            }
                        }
                        resolve_cid_widths(doc, desc_dict, &mut widths);
                        let (f, ia, w, a, d, fb) = resolve_font_descriptor(doc, desc_dict);
                        flags = f;
                        italic_angle = ia;
                        weight = w;
                        ascent = a;
                        descent = d;
                        font_bbox = fb;
                    }
                }
            }
        }
    } else {
        // Simple font: merge /Differences (and, for Type1, the embedded font
        // program's encoding) into the Unicode mapping.
        resolve_encoding_differences(doc, dict, &mut to_unicode);
        if subtype == "Type1" {
            resolve_type1_font_program_encoding(doc, dict, &mut to_unicode);
        }
        let (f, ia, w, a, d, fb) = resolve_font_descriptor(doc, dict);
        flags = f;
        italic_angle = ia;
        weight = w;
        ascent = a;
        descent = d;
        font_bbox = fb;
    }
    // A bold-sounding name overrides a lighter descriptor-derived weight.
    let name_lower = base_font.to_lowercase();
    let is_name_bold =
        name_lower.contains("bold") || name_lower.contains("black") || name_lower.contains("heavy");
    if is_name_bold && weight < 700.0 {
        weight = 700.0;
    }
    PdfFont {
        name: name.to_string(),
        base_font,
        subtype,
        widths,
        default_width,
        to_unicode,
        encoding,
        is_standard,
        flags,
        italic_angle,
        weight,
        bytes_per_code,
        ascent,
        descent,
        font_bbox,
    }
}
/// Read a simple font's /Widths array into a code -> width map, with the
/// array's first entry corresponding to /FirstChar (0 when absent).
fn resolve_widths(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> HashMap<u32, f64> {
    let mut table = HashMap::new();
    let first_code = dict
        .get(b"FirstChar")
        .ok()
        .and_then(|o| obj_to_i64(resolve_object(doc, o)))
        .unwrap_or(0) as u32;
    let Ok(widths_ref) = dict.get(b"Widths") else {
        return table;
    };
    let widths_obj = resolve_object(doc, widths_ref);
    if let Ok(entries) = widths_obj.as_array() {
        for (offset, entry) in entries.iter().enumerate() {
            if let Some(width) = obj_to_f64(resolve_object(doc, entry)) {
                table.insert(first_code + offset as u32, width);
            }
        }
    }
    table
}
/// Populate `widths` from a CID font's /W array.
///
/// The /W array mixes two forms:
/// - `c [w1 w2 ...]`    — consecutive widths starting at CID `c`
/// - `c_first c_last w` — one width for an inclusive CID range
fn resolve_cid_widths(
    doc: &lopdf::Document,
    dict: &lopdf::Dictionary,
    widths: &mut HashMap<u32, f64>,
) {
    let w_obj = match dict.get(b"W") {
        Ok(o) => resolve_object(doc, o),
        Err(_) => return,
    };
    let w_arr = match w_obj.as_array() {
        Ok(a) => a,
        Err(_) => return,
    };
    let mut i = 0;
    while i < w_arr.len() {
        let first = resolve_object(doc, &w_arr[i]);
        if let Some(cid_start) = obj_to_i64(first) {
            let cid_start = cid_start as u32;
            i += 1;
            if i >= w_arr.len() {
                break;
            }
            // The element after the start CID decides which form this is:
            // an array means per-CID widths, a number means a range.
            let next = resolve_object(doc, &w_arr[i]);
            if let Ok(arr) = next.as_array() {
                // Form `c [w1 w2 ...]`: widths for consecutive CIDs.
                for (j, w) in arr.iter().enumerate() {
                    if let Some(width) = obj_to_f64(resolve_object(doc, w)) {
                        widths.insert(cid_start + j as u32, width);
                    }
                }
                i += 1;
            } else if let Some(cid_end) = obj_to_i64(next) {
                // Form `c_first c_last w`: one width for the whole range.
                let cid_end = cid_end as u32;
                i += 1;
                if i >= w_arr.len() {
                    break;
                }
                let w_val = resolve_object(doc, &w_arr[i]);
                if let Some(width) = obj_to_f64(w_val) {
                    for cid in cid_start..=cid_end {
                        widths.insert(cid, width);
                    }
                }
                i += 1;
            } else {
                // Unrecognized second element; skip it and resync.
                i += 1;
            }
        } else {
            // Non-numeric where a start CID was expected; skip.
            i += 1;
        }
    }
}
/// Load and parse the font's /ToUnicode CMap stream, if any, into a
/// char-code -> Unicode map. Returns an empty map when the entry is missing
/// or is not an indirect stream reference.
fn resolve_tounicode(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> HashMap<u32, String> {
    let mut mapping = HashMap::new();
    // /ToUnicode is expected to be an indirect reference to a stream.
    let stream_id = match dict.get(b"ToUnicode") {
        Ok(lopdf::Object::Reference(r)) => *r,
        _ => return mapping,
    };
    let data = doc
        .get_object(stream_id)
        .ok()
        .and_then(|o| o.as_stream().ok())
        .and_then(|s| s.decompressed_content().ok());
    if let Some(data) = data {
        parse_cmap(&data, &mut mapping);
    }
    mapping
}
/// Merge an encoding dictionary's /Differences array into `to_unicode`.
///
/// The array alternates a starting code (integer) with glyph names that fill
/// consecutive codes from there. Existing entries (e.g. from ToUnicode) are
/// never overwritten.
fn resolve_encoding_differences(
    doc: &lopdf::Document,
    dict: &lopdf::Dictionary,
    to_unicode: &mut HashMap<u32, String>,
) {
    let Ok(enc_raw) = dict.get(b"Encoding") else {
        return;
    };
    let enc_obj = resolve_object(doc, enc_raw);
    let Ok(enc_dict) = enc_obj.as_dict() else {
        return;
    };
    let Ok(diff_raw) = enc_dict.get(b"Differences") else {
        return;
    };
    let diffs_obj = resolve_object(doc, diff_raw);
    let Ok(entries) = diffs_obj.as_array() else {
        return;
    };
    let mut code: u32 = 0;
    for entry in entries {
        match resolve_object(doc, entry) {
            // An integer resets the current code position.
            lopdf::Object::Integer(start) => code = start as u32,
            // A name maps the current code and advances it.
            lopdf::Object::Name(ref bytes) => {
                let glyph = String::from_utf8_lossy(bytes).to_string();
                if !to_unicode.contains_key(&code) {
                    if let Some(unicode) = glyph_name_to_unicode(&glyph) {
                        to_unicode.insert(code, unicode);
                    }
                }
                code += 1;
            }
            _ => {}
        }
    }
}
/// Extract the built-in encoding from an embedded Type 1 font program and
/// merge it into `to_unicode` (never overwriting existing entries).
///
/// The clear-text portion of a Type 1 program declares its custom encoding
/// with lines of the form `dup <code> /<glyphname> put`; a font using
/// `/Encoding StandardEncoding def` carries no custom entries and is skipped.
fn resolve_type1_font_program_encoding(
    doc: &lopdf::Document,
    dict: &lopdf::Dictionary,
    to_unicode: &mut HashMap<u32, String>,
) {
    let fd_obj = match dict.get(b"FontDescriptor") {
        Ok(o) => resolve_object(doc, o),
        Err(_) => return,
    };
    let fd = match fd_obj.as_dict() {
        Ok(d) => d,
        Err(_) => return,
    };
    // Type 1 programs live in /FontFile; CFF-compact ones in /FontFile3.
    let data = match get_font_file_data(doc, fd, b"FontFile")
        .or_else(|| get_font_file_data(doc, fd, b"FontFile3"))
    {
        Some(d) => d,
        None => return,
    };
    let text = String::from_utf8_lossy(&data);
    if text.contains("/Encoding StandardEncoding def") {
        return;
    }
    for line in text.lines() {
        let trimmed = line.trim();
        // `strip_prefix`/`strip_suffix` instead of manual slicing: the old
        // `&trimmed[4..len - 4]` could panic on a short line like "dup put"
        // where the prefix and suffix overlap.
        let Some(inner) = trimmed
            .strip_prefix("dup ")
            .and_then(|rest| rest.strip_suffix(" put"))
        else {
            continue;
        };
        let Some((code_str, glyph_str)) = inner.trim().split_once(' ') else {
            continue;
        };
        let Ok(code) = code_str.trim().parse::<u32>() else {
            continue;
        };
        let glyph = glyph_str.trim().trim_start_matches('/');
        if glyph.is_empty() || glyph == ".notdef" {
            continue;
        }
        if let std::collections::hash_map::Entry::Vacant(e) = to_unicode.entry(code) {
            if let Some(unicode) = glyph_name_to_unicode(glyph) {
                e.insert(unicode);
            }
        }
    }
}
/// Fetch and decompress the font-program stream stored under `key`
/// (e.g. b"FontFile") in a font descriptor. Returns `None` unless the entry
/// is an indirect reference to a readable stream.
fn get_font_file_data(
    doc: &lopdf::Document,
    fd: &lopdf::Dictionary,
    key: &[u8],
) -> Option<Vec<u8>> {
    match fd.get(key).ok()? {
        lopdf::Object::Reference(id) => doc
            .get_object(*id)
            .ok()?
            .as_stream()
            .ok()?
            .decompressed_content()
            .ok(),
        _ => None,
    }
}
fn glyph_name_to_unicode(name: &str) -> Option<String> {
match name {
"fi" => return Some("fi".to_string()),
"fl" => return Some("fl".to_string()),
"ff" => return Some("ff".to_string()),
"ffi" => return Some("ffi".to_string()),
"ffl" => return Some("ffl".to_string()),
"IJ" => return Some("IJ".to_string()),
"ij" => return Some("ij".to_string()),
_ => {}
}
if let Some(s) = resolve_glyph_component(name) {
return Some(s);
}
let base = if let Some(dot_pos) = name.find('.') {
&name[..dot_pos]
} else {
name
};
if base != name {
if let Some(s) = resolve_glyph_component(base) {
return Some(s);
}
}
if base.contains('_') {
let mut result = String::new();
for component in base.split('_') {
if let Some(s) = resolve_glyph_component(component) {
result.push_str(&s);
} else {
return None; }
}
if !result.is_empty() {
return Some(result);
}
}
None
}
/// Resolve a single glyph-name component to its Unicode string.
///
/// Tries, in order: the Adobe Glyph List, the TeX glyph table, literal
/// single-character names, `uniXXXX` (one or more 4-hex-digit groups), and
/// `uXXXX`-`uXXXXXX` (4-6 hex digits).
fn resolve_glyph_component(name: &str) -> Option<String> {
    if name.is_empty() {
        return None;
    }
    if let Some(mapped) = AGL_MAP.get(name) {
        return Some(mapped.clone());
    }
    if let Some(mapped) = TEX_GLYPH_MAP.get(name) {
        return Some((*mapped).to_string());
    }
    // A one-byte name stands for itself (e.g. "A").
    if name.len() == 1 {
        return Some(name.to_string());
    }
    if let Some(hex) = name.strip_prefix("uni") {
        match hex.len() {
            // Single BMP code point.
            4 => {
                let decoded = u32::from_str_radix(hex, 16)
                    .ok()
                    .and_then(char::from_u32)
                    .map(|c| c.to_string());
                if decoded.is_some() {
                    return decoded;
                }
            }
            // Several 4-digit groups concatenated.
            n if n > 4 && n % 4 == 0 => {
                let decoded: String = hex
                    .as_bytes()
                    .chunks(4)
                    .filter_map(|chunk| std::str::from_utf8(chunk).ok())
                    .filter_map(|h| u32::from_str_radix(h, 16).ok())
                    .filter_map(char::from_u32)
                    .collect();
                if !decoded.is_empty() {
                    return Some(decoded);
                }
            }
            _ => {}
        }
    }
    if let Some(hex) = name.strip_prefix('u') {
        if (4..=6).contains(&hex.len()) && hex.chars().all(|c| c.is_ascii_hexdigit()) {
            if let Some(c) = u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
                return Some(c.to_string());
            }
        }
    }
    None
}
/// Parse a ToUnicode CMap stream into a char-code -> Unicode-string map.
///
/// Handles line-oriented `bfchar` sections (`<src> <dst>`) and `bfrange`
/// sections in both forms: `<start> <end> [<dst1> <dst2> ...]` and
/// `<start> <end> <dst_start>` (incremented destination).
fn parse_cmap(data: &[u8], mapping: &mut HashMap<u32, String>) {
    let text = String::from_utf8_lossy(data);
    // First pass: bfchar sections.
    let mut in_bfchar = false;
    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.contains("beginbfchar") {
            in_bfchar = true;
            continue;
        }
        if trimmed.contains("endbfchar") {
            in_bfchar = false;
            continue;
        }
        if in_bfchar {
            // Splitting on '>' yields the source hex first, destination next.
            let parts: Vec<&str> = trimmed.split('>').collect();
            if parts.len() >= 2 {
                if let (Some(src), Some(dst)) =
                    (parse_hex_value(parts[0]), parse_hex_unicode(parts[1]))
                {
                    mapping.insert(src, dst);
                }
            }
        }
    }
    // Second pass: bfrange sections.
    let mut in_bfrange = false;
    for line in text.lines() {
        let trimmed = line.trim();
        if trimmed.contains("beginbfrange") {
            in_bfrange = true;
            continue;
        }
        if trimmed.contains("endbfrange") {
            in_bfrange = false;
            continue;
        }
        if in_bfrange {
            if let Some(bracket_start) = trimmed.find('[') {
                // Array form: `<start> <end> [<dst1> <dst2> ...]`.
                let before_bracket = &trimmed[..bracket_start];
                let parts: Vec<&str> = before_bracket.split('>').collect();
                if parts.len() >= 2 {
                    if let (Some(start), Some(end)) =
                        (parse_hex_value(parts[0]), parse_hex_value(parts[1]))
                    {
                        let bracket_end = trimmed.rfind(']').unwrap_or(trimmed.len());
                        let inside = &trimmed[bracket_start + 1..bracket_end];
                        let values: Vec<String> = inside
                            .split('>')
                            .filter_map(|s| {
                                let s = s.trim().trim_start_matches('<');
                                if s.is_empty() {
                                    None
                                } else {
                                    parse_hex_unicode_str(s)
                                }
                            })
                            .collect();
                        // Assign each destination to consecutive codes,
                        // never past the declared end of the range.
                        for (i, val) in values.iter().enumerate() {
                            let code = start + i as u32;
                            if code > end {
                                break;
                            }
                            mapping.insert(code, val.clone());
                        }
                    }
                }
            } else {
                // Incremented form: `<start> <end> <dst_start>`.
                let parts: Vec<&str> = trimmed.split('>').collect();
                if parts.len() >= 3 {
                    if let (Some(start), Some(end), Some(dst_start)) = (
                        parse_hex_value(parts[0]),
                        parse_hex_value(parts[1]),
                        parse_hex_value(parts[2]),
                    ) {
                        for code in start..=end {
                            let unicode_point = dst_start + (code - start);
                            if let Some(c) = char::from_u32(unicode_point) {
                                mapping.insert(code, c.to_string());
                            }
                        }
                    }
                }
            }
        }
    }
}
/// Parse a CMap hex token like `<0041` into its numeric value; `None` on
/// empty or non-hex input (`from_str_radix` already rejects "").
fn parse_hex_value(s: &str) -> Option<u32> {
    let digits = s.trim().trim_start_matches('<').trim();
    u32::from_str_radix(digits, 16).ok()
}
/// Strip CMap angle-bracket delimiters from a destination token (e.g.
/// `<0041>`) and decode its hex payload into a Unicode string.
fn parse_hex_unicode(s: &str) -> Option<String> {
    let inner = s
        .trim()
        .trim_start_matches('<')
        .trim_end_matches('>')
        .trim();
    parse_hex_unicode_str(inner)
}
/// Decode a run of 4-hex-digit groups as UTF-16BE code units into a string.
///
/// ToUnicode CMap destination values are UTF-16BE, so characters outside the
/// BMP arrive as surrogate pairs (e.g. `D835DD04` for U+1D504). The previous
/// implementation fed each unit to `char::from_u32`, which rejects surrogate
/// halves and silently dropped every non-BMP character; `char::decode_utf16`
/// combines pairs correctly. Unpaired surrogates and unparsable groups are
/// skipped, matching the old lenient behavior. Returns `None` when nothing
/// decodes.
fn parse_hex_unicode_str(cleaned: &str) -> Option<String> {
    if cleaned.is_empty() {
        return None;
    }
    // Each 4-hex-digit chunk is one UTF-16 code unit (a short final chunk
    // still parses as a small value, as before).
    let units: Vec<u16> = cleaned
        .as_bytes()
        .chunks(4)
        .filter_map(|chunk| std::str::from_utf8(chunk).ok())
        .filter_map(|hex| u16::from_str_radix(hex, 16).ok())
        .collect();
    let result: String = char::decode_utf16(units.iter().copied())
        .filter_map(|r| r.ok())
        .collect();
    if result.is_empty() {
        None
    } else {
        Some(result)
    }
}
/// Read styling metrics from a dictionary's /FontDescriptor.
///
/// Returns `(flags, italic_angle, weight, ascent, descent, font_bbox)`.
/// Every value keeps a neutral default when the descriptor, or an individual
/// key, is absent.
fn resolve_font_descriptor(
    doc: &lopdf::Document,
    dict: &lopdf::Dictionary,
) -> (u32, f64, f64, f64, f64, [f64; 4]) {
    let mut flags = 0u32;
    let mut italic_angle = 0.0f64;
    let mut weight = 400.0f64;
    let mut ascent = 800.0f64;
    let mut descent = -200.0f64;
    let mut font_bbox = [0.0f64, -200.0, 1000.0, 800.0];
    if let Ok(fd_ref) = dict.get(b"FontDescriptor") {
        let fd_obj = resolve_object(doc, fd_ref);
        if let Ok(fd) = fd_obj.as_dict() {
            flags = fd
                .get(b"Flags")
                .ok()
                .and_then(|o| obj_to_i64(resolve_object(doc, o)))
                .unwrap_or(0) as u32;
            italic_angle = fd
                .get(b"ItalicAngle")
                .ok()
                .and_then(|o| obj_to_f64(resolve_object(doc, o)))
                .unwrap_or(0.0);
            // Weight is estimated from /StemV (vertical stem thickness):
            // thicker stems imply a bolder face. Thresholds are heuristic.
            let stem_v = fd
                .get(b"StemV")
                .ok()
                .and_then(|o| obj_to_f64(resolve_object(doc, o)))
                .unwrap_or(0.0);
            weight = if stem_v >= 140.0 {
                700.0
            } else if stem_v >= 100.0 {
                500.0
            } else {
                400.0
            };
            // Read /FontBBox first so it can back-fill ascent/descent below.
            if let Ok(bbox_ref) = fd.get(b"FontBBox") {
                let bbox_obj = resolve_object(doc, bbox_ref);
                if let Ok(bbox_arr) = bbox_obj.as_array() {
                    if bbox_arr.len() >= 4 {
                        let vals: Vec<f64> = bbox_arr
                            .iter()
                            .filter_map(|o| obj_to_f64(resolve_object(doc, o)))
                            .collect();
                        if vals.len() >= 4 {
                            font_bbox = [vals[0], vals[1], vals[2], vals[3]];
                        }
                    }
                }
            }
            // Missing /Ascent or /Descent fall back to the bounding box's
            // vertical extremes rather than the hard-coded defaults.
            if let Ok(a_ref) = fd.get(b"Ascent") {
                if let Some(a) = obj_to_f64(resolve_object(doc, a_ref)) {
                    ascent = a;
                }
            } else {
                ascent = font_bbox[3];
            }
            if let Ok(d_ref) = fd.get(b"Descent") {
                if let Some(d) = obj_to_f64(resolve_object(doc, d_ref)) {
                    descent = d;
                }
            } else {
                descent = font_bbox[1];
            }
        }
    }
    (flags, italic_angle, weight, ascent, descent, font_bbox)
}
/// Follow a single level of indirection: a `Reference` is looked up in the
/// document (falling back to `Null` when the target is missing); any other
/// object is returned as a clone.
///
/// The explicit `<'a>` lifetimes were removed: the function returns an owned
/// `Object`, so they tied the two inputs together for no reason (clippy
/// `needless_lifetimes`).
fn resolve_object(doc: &lopdf::Document, obj: &lopdf::Object) -> lopdf::Object {
    match obj {
        lopdf::Object::Reference(id) => doc.get_object(*id).cloned().unwrap_or(lopdf::Object::Null),
        other => other.clone(),
    }
}
/// Numeric coercion: integers and reals become `f64`; anything else is `None`.
fn obj_to_f64(obj: lopdf::Object) -> Option<f64> {
    match obj {
        lopdf::Object::Real(r) => Some(r),
        lopdf::Object::Integer(i) => Some(i as f64),
        _ => None,
    }
}
/// Numeric coercion: integers pass through; reals are truncated toward zero;
/// anything else is `None`.
fn obj_to_i64(obj: lopdf::Object) -> Option<i64> {
    match obj {
        lopdf::Object::Real(r) => Some(r as i64),
        lopdf::Object::Integer(i) => Some(i),
        _ => None,
    }
}
/// Whether `name` is one of the built-in "standard 14" base fonts that every
/// PDF viewer must provide (exact, case-sensitive match).
fn is_standard_font(name: &str) -> bool {
    const STANDARD_14: [&str; 14] = [
        "Courier",
        "Courier-Bold",
        "Courier-Oblique",
        "Courier-BoldOblique",
        "Helvetica",
        "Helvetica-Bold",
        "Helvetica-Oblique",
        "Helvetica-BoldOblique",
        "Times-Roman",
        "Times-Bold",
        "Times-Italic",
        "Times-BoldItalic",
        "Symbol",
        "ZapfDingbats",
    ];
    STANDARD_14.contains(&name)
}
/// Fallback glyph width for a standard base font: the Courier family is
/// monospaced at 600 units; the proportional families get 500.
fn standard_font_default_width(name: &str) -> f64 {
    match name.starts_with("Courier") {
        true => 600.0,
        false => 500.0,
    }
}
/// Map a single MacRomanEncoding code point to its Unicode string.
///
/// ASCII (0-127) passes through unchanged; 0x80-0xFF use the Mac OS Roman
/// table below; any other value falls back to code-point identity, with
/// invalid code points yielding an empty string.
fn decode_macroman(code: u32) -> String {
    if code < 128 {
        if let Some(c) = char::from_u32(code) {
            return c.to_string();
        }
    }
    // Mac OS Roman high range (0x80-0xFF) to Unicode.
    let mapped = match code {
        0x80 => '\u{00C4}', 0x81 => '\u{00C5}', 0x82 => '\u{00C7}', 0x83 => '\u{00C9}', 0x84 => '\u{00D1}', 0x85 => '\u{00D6}', 0x86 => '\u{00DC}', 0x87 => '\u{00E1}', 0x88 => '\u{00E0}', 0x89 => '\u{00E2}', 0x8A => '\u{00E4}', 0x8B => '\u{00E3}', 0x8C => '\u{00E5}', 0x8D => '\u{00E7}', 0x8E => '\u{00E9}', 0x8F => '\u{00E8}', 0x90 => '\u{00EA}', 0x91 => '\u{00EB}', 0x92 => '\u{00ED}', 0x93 => '\u{00EC}', 0x94 => '\u{00EE}', 0x95 => '\u{00EF}', 0x96 => '\u{00F1}', 0x97 => '\u{00F3}', 0x98 => '\u{00F2}', 0x99 => '\u{00F4}', 0x9A => '\u{00F6}', 0x9B => '\u{00F5}', 0x9C => '\u{00FA}', 0x9D => '\u{00F9}', 0x9E => '\u{00FB}', 0x9F => '\u{00FC}', 0xA0 => '\u{2020}', 0xA1 => '\u{00B0}', 0xA2 => '\u{00A2}', 0xA3 => '\u{00A3}', 0xA4 => '\u{00A7}', 0xA5 => '\u{2022}', 0xA6 => '\u{00B6}', 0xA7 => '\u{00DF}', 0xA8 => '\u{00AE}', 0xA9 => '\u{00A9}', 0xAA => '\u{2122}', 0xAB => '\u{00B4}', 0xAC => '\u{00A8}', 0xAD => '\u{2260}', 0xAE => '\u{00C6}', 0xAF => '\u{00D8}', 0xB0 => '\u{221E}', 0xB1 => '\u{00B1}', 0xB2 => '\u{2264}', 0xB3 => '\u{2265}', 0xB4 => '\u{00A5}', 0xB5 => '\u{00B5}', 0xB6 => '\u{2202}', 0xB7 => '\u{2211}', 0xB8 => '\u{220F}', 0xB9 => '\u{03C0}', 0xBA => '\u{222B}', 0xBB => '\u{00AA}', 0xBC => '\u{00BA}', 0xBD => '\u{03A9}', 0xBE => '\u{00E6}', 0xBF => '\u{00F8}', 0xC0 => '\u{00BF}', 0xC1 => '\u{00A1}', 0xC2 => '\u{00AC}', 0xC3 => '\u{221A}', 0xC4 => '\u{0192}', 0xC5 => '\u{2248}', 0xC6 => '\u{2206}', 0xC7 => '\u{00AB}', 0xC8 => '\u{00BB}', 0xC9 => '\u{2026}', 0xCA => '\u{00A0}', 0xCB => '\u{00C0}', 0xCC => '\u{00C3}', 0xCD => '\u{00D5}', 0xCE => '\u{0152}', 0xCF => '\u{0153}', 0xD0 => '\u{2013}', 0xD1 => '\u{2014}', 0xD2 => '\u{201C}', 0xD3 => '\u{201D}', 0xD4 => '\u{2018}', 0xD5 => '\u{2019}', 0xD6 => '\u{00F7}', 0xD7 => '\u{25CA}', 0xD8 => '\u{00FF}', 0xD9 => '\u{0178}', 0xDA => '\u{2044}', 0xDB => '\u{20AC}', 0xDC => '\u{2039}', 0xDD => '\u{203A}', 0xDE => '\u{FB01}', 0xDF => '\u{FB02}', 0xE0 => '\u{2021}', 0xE1 => '\u{00B7}', 0xE2 => '\u{201A}', 0xE3 => '\u{201E}', 
        0xE4 => '\u{2030}', 0xE5 => '\u{00C2}', 0xE6 => '\u{00CA}', 0xE7 => '\u{00C1}', 0xE8 => '\u{00CB}', 0xE9 => '\u{00C8}', 0xEA => '\u{00CD}', 0xEB => '\u{00CE}', 0xEC => '\u{00CF}', 0xED => '\u{00CC}', 0xEE => '\u{00D3}', 0xEF => '\u{00D4}', 0xF0 => '\u{F8FF}', 0xF1 => '\u{00D2}', 0xF2 => '\u{00DA}', 0xF3 => '\u{00DB}', 0xF4 => '\u{00D9}', 0xF5 => '\u{0131}', 0xF6 => '\u{02C6}', 0xF7 => '\u{02DC}', 0xF8 => '\u{00AF}', 0xF9 => '\u{02D8}', 0xFA => '\u{02D9}', 0xFB => '\u{02DA}', 0xFC => '\u{00B8}', 0xFD => '\u{02DD}', 0xFE => '\u{02DB}', 0xFF => '\u{02C7}', _ => {
            // Codes above 0xFF: identity mapping, empty string if invalid.
            return char::from_u32(code)
                .map(|c| c.to_string())
                .unwrap_or_default();
        }
    };
    mapped.to_string()
}
/// Map a single WinAnsiEncoding (CP1252) code point to its Unicode string.
///
/// Only the 0x80-0x9F window carries CP1252-specific replacements; ASCII,
/// the 0xA0-0xFF Latin-1 range, and the unassigned CP1252 slots all map to
/// the identical Unicode code point (empty string when invalid).
fn decode_winansi(code: u32) -> String {
    let replacement = match code {
        0x80 => '\u{20AC}',
        0x82 => '\u{201A}',
        0x83 => '\u{0192}',
        0x84 => '\u{201E}',
        0x85 => '\u{2026}',
        0x86 => '\u{2020}',
        0x87 => '\u{2021}',
        0x88 => '\u{02C6}',
        0x89 => '\u{2030}',
        0x8A => '\u{0160}',
        0x8B => '\u{2039}',
        0x8C => '\u{0152}',
        0x8E => '\u{017D}',
        0x91 => '\u{2018}',
        0x92 => '\u{2019}',
        0x93 => '\u{201C}',
        0x94 => '\u{201D}',
        0x95 => '\u{2022}',
        0x96 => '\u{2013}',
        0x97 => '\u{2014}',
        0x98 => '\u{02DC}',
        0x99 => '\u{2122}',
        0x9A => '\u{0161}',
        0x9B => '\u{203A}',
        0x9C => '\u{0153}',
        0x9E => '\u{017E}',
        0x9F => '\u{0178}',
        // Everything else is identical to the Unicode code point.
        other => {
            return char::from_u32(other)
                .map(|c| c.to_string())
                .unwrap_or_default();
        }
    };
    replacement.to_string()
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_default_font() {
        let font = PdfFont::default_font("F1");
        assert_eq!(font.name, "F1");
        assert!((font.default_width - 600.0).abs() < 1e-10);
    }

    #[test]
    fn test_bold_detection() {
        let mut font = PdfFont::default_font("F1");
        font.base_font = "Helvetica-Bold".to_string();
        assert!(font.is_bold());
        font.base_font = "Helvetica".to_string();
        assert!(!font.is_bold());
    }

    #[test]
    fn test_decode_winansi() {
        assert_eq!(decode_winansi(65), "A");
        assert_eq!(decode_winansi(0x93), "\u{201C}");
    }

    #[test]
    fn test_parse_hex_value() {
        assert_eq!(parse_hex_value("<0041"), Some(0x41));
        assert_eq!(parse_hex_value("<00FF"), Some(0xFF));
    }

    #[test]
    fn test_standard_font() {
        assert!(is_standard_font("Helvetica"));
        assert!(is_standard_font("Courier-Bold"));
        assert!(!is_standard_font("ArialMT"));
    }

    #[test]
    fn test_glyph_name_to_unicode_ligatures() {
        let cases = [
            ("fi", "fi"),
            ("fl", "fl"),
            ("ff", "ff"),
            ("ffi", "ffi"),
            ("ffl", "ffl"),
        ];
        for (glyph, expansion) in cases {
            assert_eq!(glyph_name_to_unicode(glyph), Some(expansion.to_string()));
        }
    }

    #[test]
    fn test_glyph_name_to_unicode_common() {
        assert_eq!(glyph_name_to_unicode("percent"), Some("%".to_string()));
        assert_eq!(glyph_name_to_unicode("ampersand"), Some("&".to_string()));
        assert_eq!(glyph_name_to_unicode("parenleft"), Some("(".to_string()));
        assert_eq!(glyph_name_to_unicode("endash"), Some("\u{2013}".to_string()));
        assert_eq!(glyph_name_to_unicode("A"), Some("A".to_string()));
        assert_eq!(glyph_name_to_unicode("uni0041"), Some("A".to_string()));
    }

    #[test]
    fn test_glyph_name_to_unicode_unknown() {
        assert_eq!(glyph_name_to_unicode("nonexistent_glyph_xyz"), None);
    }

    #[test]
    fn test_glyph_name_to_unicode_agl_extended() {
        let cases = [
            ("Dcroat", "\u{0110}"),
            ("dcroat", "\u{0111}"),
            ("Emacron", "\u{0112}"),
            ("afii10017", "\u{0410}"),
            ("afii57636", "\u{20AA}"),
            ("dalethatafpatah", "\u{05D3}\u{05B2}"),
        ];
        for (glyph, expected) in cases {
            assert_eq!(glyph_name_to_unicode(glyph), Some(expected.to_string()));
        }
    }

    #[test]
    fn test_glyph_name_to_unicode_uni_formats() {
        assert_eq!(glyph_name_to_unicode("uni0041"), Some("A".to_string()));
        assert_eq!(glyph_name_to_unicode("uni00E9"), Some("é".to_string()));
        assert_eq!(glyph_name_to_unicode("uni00410042"), Some("AB".to_string()));
        assert_eq!(glyph_name_to_unicode("u1F600"), Some("\u{1F600}".to_string()));
    }

    #[test]
    fn test_parse_cmap_bfrange_array() {
        let cmap = b"beginbfrange\n<0001> <0003> [<0041> <0042> <0043>]\nendbfrange\n";
        let mut mapping = HashMap::new();
        parse_cmap(cmap, &mut mapping);
        assert_eq!(mapping.get(&1), Some(&"A".to_string()));
        assert_eq!(mapping.get(&2), Some(&"B".to_string()));
        assert_eq!(mapping.get(&3), Some(&"C".to_string()));
    }

    #[test]
    fn test_parse_cmap_bfrange_incremented() {
        let cmap = b"beginbfrange\n<0041> <0043> <0061>\nendbfrange\n";
        let mut mapping = HashMap::new();
        parse_cmap(cmap, &mut mapping);
        assert_eq!(mapping.get(&0x41), Some(&"a".to_string()));
        assert_eq!(mapping.get(&0x42), Some(&"b".to_string()));
        assert_eq!(mapping.get(&0x43), Some(&"c".to_string()));
    }

    #[test]
    fn test_decode_winansi_extended() {
        let cases = [
            (0x82, "\u{201A}"),
            (0x83, "\u{0192}"),
            (0x8A, "\u{0160}"),
            (0x8C, "\u{0152}"),
            (0x99, "\u{2122}"),
            (0x9C, "\u{0153}"),
            (0xA0, "\u{00A0}"),
            (0xE9, "\u{00E9}"),
        ];
        for (code, expected) in cases {
            assert_eq!(decode_winansi(code), expected);
        }
    }

    #[test]
    fn test_tex_glyph_names() {
        let cases = [
            ("asteriskmath", "\u{2217}"),
            ("diamondmath", "\u{22C4}"),
            ("minusplus", "\u{2213}"),
            ("circleminus", "\u{2296}"),
            ("circledot", "\u{2299}"),
            ("follows", "\u{227B}"),
            ("lessmuch", "\u{226A}"),
            ("greatermuch", "\u{226B}"),
            ("latticetop", "\u{22A4}"),
            ("mapsto", "\u{21A6}"),
            ("dagger", "\u{2020}"),
            ("daggerdbl", "\u{2021}"),
            ("braceleft", "\u{007B}"),
            ("braceright", "\u{007D}"),
        ];
        for (glyph, expected) in cases {
            assert_eq!(glyph_name_to_unicode(glyph), Some(expected.to_string()));
        }
    }
}