use zpdf_core::{ObjectId, PdfObject, Result};
use zpdf_font::{CidWidths, FontCache, LoadedFont, PdfFontType};
use zpdf_parser::PdfFile;
use crate::page::PdfPage;
/// Loads every font listed in the page's font resources into a new [`FontCache`].
///
/// A font that fails to load is logged at debug level and replaced by a
/// placeholder registered under the same resource name, so each resource
/// name always ends up with a cache entry.
pub fn load_page_fonts(file: &PdfFile, page: &PdfPage) -> FontCache {
    let mut cache = FontCache::new();
    for (name, &font_ref) in &page.resources.fonts {
        let loaded = load_single_font(file, font_ref).unwrap_or_else(|e| {
            tracing::debug!("font {name} ({font_ref}): fallback - {e}");
            LoadedFont::new_placeholder(name.clone())
        });
        cache.insert(name.clone(), loaded);
    }
    cache
}
/// Loads the font dictionary referenced by `font_ref`.
///
/// The loader is chosen from the dictionary's `Subtype`; an unrecognised or
/// missing subtype yields a placeholder named after `BaseFont` (or
/// `"Unknown"`). Text mappings are attached afterwards and the substitute
/// CID-to-GID table is built.
///
/// # Errors
///
/// Fails when the reference cannot be resolved, the object is not a
/// dictionary, or the subtype-specific loader fails.
pub fn load_single_font(file: &PdfFile, font_ref: ObjectId) -> Result<LoadedFont> {
    let resolved = file.resolve(font_ref)?;
    let font_dict = resolved.as_dict()?;

    let base_font = font_dict
        .get_name("BaseFont")
        .unwrap_or("Unknown")
        .to_string();
    let subtype = font_dict.get_name("Subtype").unwrap_or("");

    let loaded = match subtype {
        "Type0" => load_type0_font(file, font_dict, base_font),
        "TrueType" => load_truetype_font(file, font_dict, base_font),
        "Type3" => load_type3_font(file, font_dict, base_font),
        "Type1" | "MMType1" => load_type1_font(file, font_dict, base_font),
        _ => Ok(LoadedFont::new_placeholder(base_font)),
    };
    let mut font = loaded?;

    attach_text_mappings(file, font_dict, subtype, &mut font);
    font.build_substitute_cid_to_gid();
    Ok(font)
}
/// Derives system-font substitution hints from the font's `FontDescriptor`.
///
/// Returns default hints when the descriptor is missing or unreadable.
/// Style bits come from the descriptor `Flags`; a `StemV` of 160 or more
/// additionally marks the font as bold.
fn substitute_hints(
    file: &PdfFile,
    dict: &zpdf_core::PdfDict,
) -> zpdf_font::system::SubstituteHints {
    // Bit masks tested against the descriptor `Flags` value.
    const FIXED_PITCH: i64 = 1;
    const SERIF: i64 = 2;
    const ITALIC: i64 = 64;
    const FORCE_BOLD: i64 = 1 << 18;
    // Stem thickness at or above which the font is treated as bold.
    const BOLD_STEM_V: f64 = 160.0;

    let mut hints = zpdf_font::system::SubstituteHints::default();

    let Ok(fd_ref) = dict.get_ref("FontDescriptor") else {
        return hints;
    };
    let Ok(fd_obj) = file.resolve(fd_ref) else {
        return hints;
    };
    let Ok(descriptor) = fd_obj.as_dict() else {
        return hints;
    };

    if let Ok(flags) = descriptor.get_i64("Flags") {
        hints.fixed_pitch = (flags & FIXED_PITCH) != 0;
        hints.serif = (flags & SERIF) != 0;
        hints.italic = (flags & ITALIC) != 0;
        hints.bold = (flags & FORCE_BOLD) != 0;
    }
    if let Ok(stem_v) = descriptor.get_f64("StemV") {
        if stem_v >= BOLD_STEM_V {
            hints.bold = true;
        }
    }
    hints
}
/// Tries to back a simple font that has no embedded program with a system font.
///
/// When the PDF supplied no widths, the positive entries of the standard-font
/// metrics for `base_font` (if any exist) are used instead. Returns `None`
/// when no system font matches or the substitute cannot be constructed.
fn try_system_substitute_simple(
    file: &PdfFile,
    dict: &zpdf_core::PdfDict,
    base_font: &str,
    font_type: PdfFontType,
    mut cid_widths: CidWidths,
) -> Option<LoadedFont> {
    let hints = substitute_hints(file, dict);
    let matched = zpdf_font::system::find_system_font(base_font, hints, None)?;

    if cid_widths.is_empty() {
        if let Some(metrics) = zpdf_font::standard_fonts::lookup(base_font) {
            let positive = metrics
                .widths
                .iter()
                .enumerate()
                .filter(|&(_, &w)| w > 0);
            for (code, &w) in positive {
                cid_widths.set(code as u16, w as f64);
            }
        }
    }

    LoadedFont::new_substitute(
        font_type,
        base_font.to_string(),
        matched.data,
        matched.face_index,
        cid_widths,
    )
}
/// Attaches the `ToUnicode` map and, for simple fonts, the character encoding.
///
/// A `ToUnicode` stream is only stored when it parses to a non-empty map.
/// `Type0` fonts stop there. For every other subtype the symbolic flag is
/// read from the descriptor, an encoding is selected, and unencoded orphan
/// glyphs are mapped. When the dictionary has no `Encoding` entry, the
/// built-in Symbol/ZapfDingbats encoding is tried before the generic parser.
fn attach_text_mappings(
    file: &PdfFile,
    dict: &zpdf_core::PdfDict,
    subtype: &str,
    font: &mut LoadedFont,
) {
    let to_unicode = dict
        .get_ref("ToUnicode")
        .ok()
        .and_then(|tu_ref| file.resolve_stream_data(tu_ref).ok())
        .map(|data| zpdf_font::cmap::ToUnicodeMap::parse(&data))
        .filter(|map| !map.is_empty());
    // Leave any existing map untouched when nothing usable was found.
    if to_unicode.is_some() {
        font.to_unicode = to_unicode;
    }

    if subtype == "Type0" {
        return;
    }

    let symbolic = font_descriptor_symbolic(file, dict);
    font.symbolic = symbolic;

    let encoding = match dict.get("Encoding") {
        Some(_) => parse_encoding(file, dict, subtype, symbolic),
        None => builtin_symbol_encoding(&font.base_font)
            .or_else(|| parse_encoding(file, dict, subtype, symbolic)),
    };
    // As above: only overwrite when an encoding was actually produced.
    if encoding.is_some() {
        font.encoding = encoding;
    }

    font.map_unencoded_orphans();
}
/// Returns the built-in encoding for Symbol / ZapfDingbats style font names.
///
/// Only the part of `base_font` after the last `+` is inspected. A name
/// containing `Dingbats` selects `ZapfDingbats`, otherwise one containing
/// `Symbol` selects `Symbol`; any other name yields `None`.
fn builtin_symbol_encoding(base_font: &str) -> Option<zpdf_font::encoding::Encoding> {
    use zpdf_font::encoding::{base_encoding_by_name, Encoding};

    // (substring to look for, canonical encoding name), in priority order.
    const FAMILIES: [(&str, &str); 2] = [("Dingbats", "ZapfDingbats"), ("Symbol", "Symbol")];

    let name = base_font
        .rsplit_once('+')
        .map_or(base_font, |(_, tail)| tail);

    let canonical = FAMILIES
        .iter()
        .find(|(needle, _)| name.contains(*needle))
        .map(|&(_, canonical)| canonical)?;

    base_encoding_by_name(canonical).map(Encoding::from_base)
}
/// Reports whether the font descriptor marks the font as symbolic.
///
/// True only when `Flags` has bit value 4 set and bit value 32 clear.
/// A missing or unreadable descriptor or `Flags` entry yields `false`.
fn font_descriptor_symbolic(file: &PdfFile, dict: &zpdf_core::PdfDict) -> bool {
    const SYMBOLIC: i64 = 4;
    const NONSYMBOLIC: i64 = 32;

    let Ok(fd_ref) = dict.get_ref("FontDescriptor") else {
        return false;
    };
    let Ok(fd_obj) = file.resolve(fd_ref) else {
        return false;
    };
    let Ok(descriptor) = fd_obj.as_dict() else {
        return false;
    };
    let Ok(flags) = descriptor.get_i64("Flags") else {
        return false;
    };

    (flags & SYMBOLIC) != 0 && (flags & NONSYMBOLIC) == 0
}
/// Builds the encoding of a simple font from its `Encoding` entry.
///
/// * A name selects the matching base encoding (`None` if the name is unknown).
/// * A dictionary starts from `BaseEncoding`, or the subtype default when
///   that is absent or unknown, and then applies `Differences`.
/// * Anything else (including a missing entry) yields `None` for symbolic
///   fonts and the subtype default encoding otherwise.
///
/// An indirect `Encoding` is resolved first; a failed resolution is treated
/// like a missing entry.
fn parse_encoding(
    file: &PdfFile,
    dict: &zpdf_core::PdfDict,
    subtype: &str,
    symbolic: bool,
) -> Option<zpdf_font::encoding::Encoding> {
    use zpdf_font::encoding::{base_encoding_by_name, Encoding};

    let enc_obj = match dict.get("Encoding") {
        Some(PdfObject::Ref(r)) => file.resolve(*r).ok(),
        direct => direct.cloned(),
    };

    match enc_obj {
        Some(PdfObject::Name(n)) => {
            return base_encoding_by_name(n.as_str()).map(Encoding::from_base);
        }
        Some(PdfObject::Dict(enc_dict)) => {
            let named_base = match enc_dict.get_name("BaseEncoding") {
                Ok(base_name) => base_encoding_by_name(base_name),
                Err(_) => None,
            };
            let base = match named_base {
                Some(table) => table,
                None => default_simple_base(subtype),
            };
            let mut encoding = Encoding::from_base(base);
            apply_differences(&enc_dict, &mut encoding);
            return Some(encoding);
        }
        _ => {}
    }

    if symbolic {
        None
    } else {
        Some(Encoding::from_base(default_simple_base(subtype)))
    }
}
/// Picks the fallback base encoding for a simple font: WinAnsi for
/// `TrueType`, Standard for every other subtype.
fn default_simple_base(subtype: &str) -> &'static zpdf_font::encoding::EncodingTable {
    if subtype == "TrueType" {
        &zpdf_font::encoding::WIN_ANSI_ENCODING
    } else {
        &zpdf_font::encoding::STANDARD_ENCODING
    }
}
/// Applies the `Differences` array of an encoding dictionary to `encoding`.
///
/// The array alternates between integers, which set the next character code,
/// and names, which are assigned to consecutive codes from there. Names
/// falling on a code above 255 are skipped, and negative codes are clamped
/// to 0. Other element types are ignored. A missing or malformed
/// `Differences` entry leaves `encoding` unchanged.
fn apply_differences(enc_dict: &zpdf_core::PdfDict, encoding: &mut zpdf_font::encoding::Encoding) {
    let Ok(diffs) = enc_dict.get_array("Differences") else {
        return;
    };

    let mut code = 0u32;
    for obj in diffs {
        match obj {
            PdfObject::Integer(n) => {
                // Saturate instead of truncating: a plain `as u32` cast would
                // wrap values >= 2^32 back onto valid character codes.
                code = u32::try_from((*n).max(0)).unwrap_or(u32::MAX);
            }
            PdfObject::Name(name) => {
                if let Ok(byte) = u8::try_from(code) {
                    encoding.apply_difference(byte, name.as_str());
                }
                // Saturating so a hostile start code cannot overflow the counter.
                code = code.saturating_add(1);
            }
            _ => {}
        }
    }
}
/// Resolves the `Encoding` of a Type0 font into a CID CMap.
///
/// * A name (direct or behind a reference) selects a predefined CMap. An
///   unsupported name logs a warning and falls back to an identity CMap
///   whose writing mode is vertical when the name ends in `-V`.
/// * A referenced stream is decoded and parsed as an embedded CMap; if it
///   cannot be decoded a warning is logged and Identity-H is used. A stream
///   dictionary `WMode` of 1 forces vertical writing mode.
/// * Anything else yields Identity-H.
fn parse_type0_encoding(file: &PdfFile, dict: &zpdf_core::PdfDict) -> zpdf_font::cmap::CidCMap {
    use zpdf_font::cmap::CidCMap;

    /// Looks up a predefined CMap, falling back to identity with a warning.
    fn named_cmap(name: &str) -> CidCMap {
        if let Some(cmap) = CidCMap::predefined(name) {
            return cmap;
        }
        let wmode = name.ends_with("-V") as u8;
        tracing::warn!(
            "unsupported predefined CMap {name}; using Identity-{}",
            if wmode == 1 { "V" } else { "H" }
        );
        CidCMap::identity(wmode)
    }

    let enc_ref = match dict.get("Encoding") {
        Some(PdfObject::Name(n)) => return named_cmap(n.as_str()),
        Some(PdfObject::Ref(r)) => *r,
        _ => return CidCMap::identity(0),
    };

    let stream = match file.resolve(enc_ref) {
        Ok(PdfObject::Name(n)) => return named_cmap(n.as_str()),
        Ok(PdfObject::Stream(s)) => s,
        _ => return CidCMap::identity(0),
    };

    // Prefer the file-level decode; fall back to decoding the raw stream.
    let decoded = file
        .resolve_stream_data(enc_ref)
        .or_else(|_| zpdf_parser::filters::decode_stream(&stream.data, &stream.dict));

    let mut cmap = match decoded {
        Ok(bytes) => CidCMap::parse(&bytes),
        Err(e) => {
            tracing::warn!("undecodable embedded CMap: {e}; using Identity-H");
            CidCMap::identity(0)
        }
    };
    if matches!(stream.dict.get_i64("WMode"), Ok(1)) {
        cmap.wmode = 1;
    }
    cmap
}
/// Reads the `DW2` pair from a CIDFont dictionary.
///
/// Uses the first two numeric elements of the array; when the entry is
/// missing or holds fewer than two numbers the result is `(880.0, -1000.0)`.
fn parse_dw2(file: &PdfFile, desc_dict: &zpdf_core::PdfDict) -> (f64, f64) {
    const DEFAULT_DW2: (f64, f64) = (880.0, -1000.0);

    let Some(arr) = resolve_array(file, desc_dict, "DW2") else {
        return DEFAULT_DW2;
    };

    let mut numbers = arr.iter().filter_map(|o| o.as_f64().ok());
    match (numbers.next(), numbers.next()) {
        (Some(first), Some(second)) => (first, second),
        _ => DEFAULT_DW2,
    }
}
fn load_type0_font(
file: &PdfFile,
dict: &zpdf_core::PdfDict,
base_font: String,
) -> Result<LoadedFont> {
let descendants = resolve_array(file, dict, "DescendantFonts")
.ok_or_else(|| zpdf_core::Error::MissingKey("DescendantFonts".into()))?;
let desc_ref = descendants
.first()
.ok_or_else(|| zpdf_core::Error::MissingKey("DescendantFonts[0]".into()))?
.as_ref()?;
let desc_obj = file.resolve(desc_ref)?;
let desc_dict = desc_obj.as_dict()?;
let mut cid_widths = parse_cid_widths(file, desc_dict);
parse_cid_w2(file, desc_dict, &mut cid_widths);
let cmap = parse_type0_encoding(file, dict);
let dw2 = parse_dw2(file, desc_dict);
let font_data = extract_font_file(file, desc_dict);
let mut font = match font_data {
Some(data) => {
let mut font =
LoadedFont::new_with_data(PdfFontType::Type0CidType2, base_font, data, cid_widths);
if let Some(map) = parse_cid_to_gid_stream(file, desc_dict) {
let subtype = desc_dict.get_name("Subtype").unwrap_or("");
if subtype == "CIDFontType2" || font.cid_to_gid.is_none() {
font.cid_to_gid = Some(map);
}
}
font
}
None => {
let ordering = resolve_dict(file, desc_dict, "CIDSystemInfo").and_then(|csi| match csi
.get("Ordering")
{
Some(PdfObject::String(s)) => Some(s.to_string_lossy()),
Some(PdfObject::Name(n)) => Some(n.as_str().to_string()),
_ => None,
});
let hints = substitute_hints(file, desc_dict);
let substituted =
zpdf_font::system::find_system_font(&base_font, hints, ordering.as_deref())
.and_then(|m| {
LoadedFont::new_substitute(
PdfFontType::Type0CidType2,
base_font.clone(),
m.data,
m.face_index,
cid_widths,
)
});
substituted.unwrap_or_else(|| LoadedFont::new_placeholder(base_font))
}
};
font.cid_cmap = Some(cmap);
font.dw2 = dw2;
font.validate_cid_cmap();
Ok(font)
}
/// Parses a `CIDToGIDMap` stream into a CID → GID table.
///
/// The stream is read as consecutive big-endian 16-bit glyph ids indexed by
/// CID; at most 65536 entries are considered and entries mapping to glyph 0
/// are omitted. Returns `None` when the entry is not a reference, the stream
/// cannot be decoded (logged at debug level), or no non-zero mapping exists.
fn parse_cid_to_gid_stream(
    file: &PdfFile,
    desc_dict: &zpdf_core::PdfDict,
) -> Option<std::collections::HashMap<u16, u16>> {
    let Some(PdfObject::Ref(stream_ref)) = desc_dict.get("CIDToGIDMap") else {
        return None;
    };
    let stream_ref = *stream_ref;

    let data = file
        .resolve_stream_data(stream_ref)
        .map_err(|e| {
            tracing::debug!("CIDToGIDMap {stream_ref}: not a decodable stream - {e}");
        })
        .ok()?;

    let map: std::collections::HashMap<u16, u16> = data
        .chunks_exact(2)
        .take(u16::MAX as usize + 1)
        .enumerate()
        .filter_map(|(cid, pair)| {
            let gid = u16::from_be_bytes([pair[0], pair[1]]);
            // `cid` is below 65536 thanks to the `take` above.
            (gid != 0).then_some((cid as u16, gid))
        })
        .collect();

    (!map.is_empty()).then_some(map)
}
/// Loads a simple TrueType font.
///
/// Uses the embedded font program when the descriptor provides one;
/// otherwise tries, in order, a system substitute, a standard font, and
/// finally a placeholder. This function itself never returns an error.
fn load_truetype_font(
    file: &PdfFile,
    dict: &zpdf_core::PdfDict,
    base_font: String,
) -> Result<LoadedFont> {
    let cid_widths = parse_simple_widths(file, dict);

    if let Some(data) = extract_font_file_from_descriptor(file, dict) {
        return Ok(LoadedFont::new_with_data(
            PdfFontType::TrueType,
            base_font,
            data,
            cid_widths,
        ));
    }

    let substitute =
        try_system_substitute_simple(file, dict, &base_font, PdfFontType::TrueType, cid_widths);
    let font = match substitute {
        Some(font) => font,
        None => match LoadedFont::new_standard(base_font.clone()) {
            Some(standard) => standard,
            None => LoadedFont::new_placeholder(base_font),
        },
    };
    Ok(font)
}
/// Loads a Type3 font: font matrix, glyph-name encoding, glyph procedures
/// and widths.
///
/// * `FontMatrix` defaults to `[0.001, 0, 0, -0.001, 0, 0]`; numeric elements
///   of the array (first six only) override the matching slot.
/// * The encoding is built from `Encoding/Differences` as a list of glyph
///   names indexed by character code, with empty strings for unassigned codes.
/// * `CharProcs` entries that are references to decodable streams are stored
///   by glyph name; all others are skipped.
/// * Non-numeric `Widths` elements become `0.0`.
///
/// Only character codes 0..=255 are accepted in `Differences`, matching the
/// limit `apply_differences` enforces for simple fonts. An integer outside
/// that range makes the names that follow it be skipped until the next valid
/// integer. The file is untrusted input, and without this bound a negative or
/// huge code would make the encoding table grow without limit.
///
/// This function itself never returns an error.
fn load_type3_font(
    file: &PdfFile,
    dict: &zpdf_core::PdfDict,
    base_font: String,
) -> Result<LoadedFont> {
    use std::sync::Arc;

    // Highest character code accepted in `Differences`.
    const MAX_CODE: usize = 255;

    let font_matrix = {
        let mut m = [0.001, 0.0, 0.0, -0.001, 0.0, 0.0];
        if let Some(arr) = resolve_array(file, dict, "FontMatrix") {
            for (i, obj) in arr.iter().enumerate().take(6) {
                if let Ok(v) = obj.as_f64() {
                    m[i] = v;
                }
            }
        }
        m
    };

    let mut encoding: Vec<String> = Vec::new();
    if let Some(enc_dict) = resolve_dict(file, dict, "Encoding") {
        if let Some(diffs) = resolve_array(file, &enc_dict, "Differences") {
            // Next code to assign; `None` while the running code is out of range.
            let mut current_code = Some(0usize);
            for obj in &diffs {
                match obj {
                    PdfObject::Integer(n) => {
                        current_code = usize::try_from(*n)
                            .ok()
                            .filter(|&code| code <= MAX_CODE);
                        if let Some(code) = current_code {
                            // Pad (never shrink) so the table covers every code below `code`.
                            if encoding.len() < code {
                                encoding.resize(code, String::new());
                            }
                        }
                    }
                    PdfObject::Name(n) => {
                        if let Some(code) = current_code {
                            if encoding.len() <= code {
                                encoding.resize(code + 1, String::new());
                            }
                            encoding[code] = n.0.clone();
                            current_code = Some(code + 1).filter(|&next| next <= MAX_CODE);
                        }
                    }
                    _ => {}
                }
            }
        }
    }

    let mut char_procs = std::collections::HashMap::new();
    if let Some(cp_dict) = resolve_dict(file, dict, "CharProcs") {
        for (name, obj) in &cp_dict.0 {
            if let PdfObject::Ref(r) = obj {
                if let Ok(data) = file.resolve_stream_data(*r) {
                    char_procs.insert(name.0.clone(), Arc::from(data));
                }
            }
        }
    }

    // NOTE(review): an out-of-range `FirstChar` wraps in this cast; confirm
    // how consumers of `first_char` expect malformed values to be handled.
    let first_char = dict.get_i64("FirstChar").unwrap_or(0) as u16;
    let widths: Vec<f64> = resolve_array(file, dict, "Widths")
        .unwrap_or_default()
        .iter()
        .map(|o| o.as_f64().unwrap_or(0.0))
        .collect();

    let font = LoadedFont {
        font_type: zpdf_font::PdfFontType::Type3 {
            font_matrix,
            char_procs,
            encoding,
            widths,
            first_char,
        },
        base_font,
        font_data: None,
        face_index: 0,
        is_substitute: false,
        cid_widths: CidWidths::new(1000.0),
        units_per_em: 1000.0,
        ascent: 880.0,
        descent: -120.0,
        cid_to_gid: None,
        builtin_encoding_gids: None,
        orphan_gids: Vec::new(),
        encoding: None,
        to_unicode: None,
        symbolic: false,
        type1: None,
        cid_cmap: None,
        dw2: (880.0, -1000.0),
    };
    Ok(font)
}
/// Loads a simple Type1 (or MMType1) font.
///
/// Uses the embedded font program when the descriptor provides one;
/// otherwise tries, in order, a system substitute, a standard font, and
/// finally a placeholder. This function itself never returns an error.
fn load_type1_font(
    file: &PdfFile,
    dict: &zpdf_core::PdfDict,
    base_font: String,
) -> Result<LoadedFont> {
    let cid_widths = parse_simple_widths(file, dict);

    let Some(data) = extract_font_file_from_descriptor(file, dict) else {
        let substitute =
            try_system_substitute_simple(file, dict, &base_font, PdfFontType::Type1, cid_widths);
        let fallback = match substitute {
            Some(font) => font,
            None => match LoadedFont::new_standard(base_font.clone()) {
                Some(standard) => standard,
                None => LoadedFont::new_placeholder(base_font),
            },
        };
        return Ok(fallback);
    };

    Ok(LoadedFont::new_with_data(
        PdfFontType::Type1,
        base_font,
        data,
        cid_widths,
    ))
}
/// Returns the embedded font program referenced by the CIDFont's descriptor.
///
/// `FontFile2`, `FontFile3` and `FontFile` are tried in that order; the first
/// one that decodes to non-empty data wins. `None` means no descriptor or no
/// usable font stream.
fn extract_font_file(file: &PdfFile, cid_dict: &zpdf_core::PdfDict) -> Option<Vec<u8>> {
    let fd_ref = cid_dict.get_ref("FontDescriptor").ok()?;
    let fd_obj = file.resolve(fd_ref).ok()?;
    let fd_dict = fd_obj.as_dict().ok()?;

    ["FontFile2", "FontFile3", "FontFile"]
        .iter()
        .find_map(|key| {
            let ff_ref = fd_dict.get_ref(key).ok()?;
            let data = file.resolve_stream_data(ff_ref).ok()?;
            (!data.is_empty()).then_some(data)
        })
}
/// Returns the embedded font program referenced by a simple font's descriptor.
///
/// `FontFile2`, `FontFile3` and `FontFile` are tried in that order; the first
/// one that decodes to non-empty data wins. `None` means no descriptor or no
/// usable font stream.
fn extract_font_file_from_descriptor(
    file: &PdfFile,
    font_dict: &zpdf_core::PdfDict,
) -> Option<Vec<u8>> {
    const FONT_FILE_KEYS: [&str; 3] = ["FontFile2", "FontFile3", "FontFile"];

    let fd_ref = font_dict.get_ref("FontDescriptor").ok()?;
    let fd_obj = file.resolve(fd_ref).ok()?;
    let fd_dict = fd_obj.as_dict().ok()?;

    // The chain is lazy, so each key is fully evaluated before the next one.
    FONT_FILE_KEYS
        .iter()
        .filter_map(|key| fd_dict.get_ref(key).ok())
        .filter_map(|ff_ref| file.resolve_stream_data(ff_ref).ok())
        .find(|data| !data.is_empty())
}
/// Fetches `key` from `dict` as an owned array, following one indirect
/// reference if necessary. Returns `None` when the key is absent, the
/// reference cannot be resolved, or the value is not an array.
fn resolve_array(file: &PdfFile, dict: &zpdf_core::PdfDict, key: &str) -> Option<Vec<PdfObject>> {
    match dict.get(key)? {
        PdfObject::Array(items) => Some(items.clone()),
        PdfObject::Ref(id) => {
            let target = file.resolve(*id).ok()?;
            let items = target.as_array().ok()?;
            Some(items.to_vec())
        }
        _ => None,
    }
}
/// Fetches `key` from `dict` as an owned dictionary, following one indirect
/// reference if necessary. Returns `None` when the key is absent, the
/// reference cannot be resolved, or the value is not a dictionary.
fn resolve_dict(
    file: &PdfFile,
    dict: &zpdf_core::PdfDict,
    key: &str,
) -> Option<zpdf_core::PdfDict> {
    match dict.get(key)? {
        PdfObject::Dict(inner) => Some(inner.clone()),
        PdfObject::Ref(id) => {
            let target = file.resolve(*id).ok()?;
            let inner = target.as_dict().ok()?;
            Some(inner.clone())
        }
        _ => None,
    }
}
/// Parses the `DW` / `W` entries of a CIDFont dictionary into a width table.
///
/// `DW` (default 1000) becomes the table's default width. The `W` array is
/// read as a sequence of entries in one of two forms:
///
/// * `first [w0 w1 ...]` assigns `w0` to CID `first`, `w1` to `first + 1`, ...
/// * `first last w` assigns `w` to every CID in `first..=last`.
///
/// Parsing stops at the first entry that does not start with an integer.
/// CIDs outside `0..=65535` are ignored rather than being truncated to
/// 16 bits, so a malformed entry can never overwrite the width of an
/// unrelated, valid CID.
fn parse_cid_widths(file: &PdfFile, dict: &zpdf_core::PdfDict) -> CidWidths {
    const MAX_CID: i64 = u16::MAX as i64;

    let dw = dict.get_f64("DW").unwrap_or(1000.0);
    let mut widths = CidWidths::new(dw);
    let Some(w_array) = resolve_array(file, dict, "W") else {
        return widths;
    };

    let mut i = 0;
    while i < w_array.len() {
        let Ok(first) = w_array[i].as_i64() else {
            break;
        };
        i += 1;
        if i >= w_array.len() {
            break;
        }
        match &w_array[i] {
            PdfObject::Array(arr) => {
                for (offset, obj) in arr.iter().enumerate() {
                    // Compute the CID in i64 so neither the start nor the
                    // offset can wrap around.
                    let cid = i64::try_from(offset)
                        .ok()
                        .and_then(|offset| first.checked_add(offset));
                    let Some(cid) = cid.filter(|&cid| cid <= MAX_CID) else {
                        break;
                    };
                    // A negative CID is skipped; later offsets may be valid.
                    let Ok(cid) = u16::try_from(cid) else {
                        continue;
                    };
                    if let Ok(w) = obj.as_f64() {
                        widths.set(cid, w);
                    }
                }
                i += 1;
            }
            PdfObject::Integer(_) | PdfObject::Real(_) => {
                // A non-integer end is treated as a single-CID range.
                let last = w_array[i].as_i64().unwrap_or(first);
                i += 1;
                if i < w_array.len() {
                    let w = w_array[i].as_f64().unwrap_or(dw);
                    // Clip the range to the representable CIDs; a range lying
                    // entirely outside fails one of the conversions.
                    let clipped = (
                        u16::try_from(first.max(0)),
                        u16::try_from(last.min(MAX_CID)),
                    );
                    if let (Ok(lo), Ok(hi)) = clipped {
                        for cid in lo..=hi {
                            widths.set(cid, w);
                        }
                    }
                    i += 1;
                }
            }
            _ => {
                i += 1;
            }
        }
    }
    widths
}
/// Applies the CIDFont's `W2` array, if present, to `widths`.
fn parse_cid_w2(file: &PdfFile, dict: &zpdf_core::PdfDict, widths: &mut CidWidths) {
    let Some(w2_entries) = resolve_array(file, dict, "W2") else {
        return;
    };
    apply_w2_array(&w2_entries, widths);
}
/// Applies a `W2` array to `widths`.
///
/// Each entry has one of two forms, both carrying the three numbers passed
/// to `CidWidths::set_v` as `(w1y, vx, vy)`:
///
/// * `first [w1y vx vy  w1y vx vy ...]` assigns one triple per consecutive
///   CID starting at `first`; a trailing partial triple is ignored.
/// * `first last w1y vx vy` assigns the triple to every CID in `first..=last`.
///
/// Parsing stops at the first entry that does not start with an integer, at
/// a non-numeric metric, or at a truncated range entry. CIDs outside
/// `0..=65535` are ignored rather than being truncated to 16 bits, so
/// malformed input cannot overwrite metrics of unrelated CIDs.
fn apply_w2_array(w2_array: &[PdfObject], widths: &mut CidWidths) {
    const MAX_CID: i64 = u16::MAX as i64;

    let mut i = 0;
    while i < w2_array.len() {
        let Ok(first) = w2_array[i].as_i64() else {
            break;
        };
        i += 1;
        if i >= w2_array.len() {
            break;
        }
        match &w2_array[i] {
            PdfObject::Array(arr) => {
                for (index, triple) in arr.chunks_exact(3).enumerate() {
                    let (Ok(w1y), Ok(vx), Ok(vy)) =
                        (triple[0].as_f64(), triple[1].as_f64(), triple[2].as_f64())
                    else {
                        break;
                    };
                    // Compute the CID in i64 so neither the start nor the
                    // index can wrap around.
                    let cid = i64::try_from(index)
                        .ok()
                        .and_then(|index| first.checked_add(index));
                    let Some(cid) = cid.filter(|&cid| cid <= MAX_CID) else {
                        break;
                    };
                    // A negative CID is skipped; later triples may be valid.
                    let Ok(cid) = u16::try_from(cid) else {
                        continue;
                    };
                    widths.set_v(cid, w1y, vx, vy);
                }
                i += 1;
            }
            PdfObject::Integer(_) | PdfObject::Real(_) => {
                // A non-integer end is treated as a single-CID range.
                let last = w2_array[i].as_i64().unwrap_or(first);
                // The three metrics follow the range end; stop if truncated.
                let Some(metrics) = w2_array.get(i + 1..i + 4) else {
                    break;
                };
                let (Ok(w1y), Ok(vx), Ok(vy)) = (
                    metrics[0].as_f64(),
                    metrics[1].as_f64(),
                    metrics[2].as_f64(),
                ) else {
                    break;
                };
                // Clip the range to the representable CIDs; a range lying
                // entirely outside fails one of the conversions.
                let clipped = (
                    u16::try_from(first.max(0)),
                    u16::try_from(last.min(MAX_CID)),
                );
                if let (Ok(lo), Ok(hi)) = clipped {
                    for cid in lo..=hi {
                        widths.set_v(cid, w1y, vx, vy);
                    }
                }
                i += 4;
            }
            _ => {
                i += 1;
            }
        }
    }
}
/// Parses `FirstChar` / `Widths` of a simple font into a width table.
///
/// The table's default width is 1000. Element `j` of `Widths` is assigned to
/// character code `FirstChar + j`; non-numeric elements are skipped. The
/// code is computed in `i64` and only stored when it lies in `0..=65535`,
/// so a negative `FirstChar` or an oversized array cannot wrap around onto
/// other codes.
fn parse_simple_widths(file: &PdfFile, dict: &zpdf_core::PdfDict) -> CidWidths {
    const MAX_CODE: i64 = u16::MAX as i64;

    let first_char = dict.get_i64("FirstChar").unwrap_or(0);
    let mut widths = CidWidths::new(1000.0);
    let Some(arr) = resolve_array(file, dict, "Widths") else {
        return widths;
    };

    for (offset, obj) in arr.iter().enumerate() {
        let code = i64::try_from(offset)
            .ok()
            .and_then(|offset| first_char.checked_add(offset));
        // Every later element would be out of range as well.
        let Some(code) = code.filter(|&code| code <= MAX_CODE) else {
            break;
        };
        // A negative code is skipped; later elements may reach valid codes.
        let Ok(code) = u16::try_from(code) else {
            continue;
        };
        if let Ok(w) = obj.as_f64() {
            widths.set(code, w);
        }
    }
    widths
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for an integer PDF object.
    fn int(v: i64) -> PdfObject {
        PdfObject::Integer(v)
    }

    /// Shorthand for a real-number PDF object.
    fn real(v: f64) -> PdfObject {
        PdfObject::Real(v)
    }

    /// Applies `entries` as a `W2` array to a fresh width table and returns it.
    fn widths_after(entries: &[PdfObject]) -> CidWidths {
        let mut table = CidWidths::new(1000.0);
        apply_w2_array(entries, &mut table);
        table
    }

    /// The list form assigns one metrics triple per consecutive CID.
    #[test]
    fn w2_list_form_assigns_consecutive_cids() {
        let triples = PdfObject::Array(vec![
            real(-1000.0),
            real(500.0),
            real(880.0),
            int(-900),
            int(450),
            int(820),
        ]);
        let table = widths_after(&[int(120), triples]);

        assert_eq!(table.get_v(120), Some((-1000.0, 500.0, 880.0)));
        assert_eq!(table.get_v(121), Some((-900.0, 450.0, 820.0)));
        assert_eq!(table.get_v(122), None);
    }

    /// The range form assigns the same triple to every CID in `first..=last`.
    #[test]
    fn w2_range_form_assigns_inclusive_range() {
        let table = widths_after(&[int(10), int(12), int(-1000), int(500), int(880)]);

        for cid in 10..=12 {
            assert_eq!(table.get_v(cid), Some((-1000.0, 500.0, 880.0)));
        }
        assert_eq!(table.get_v(9), None);
        assert_eq!(table.get_v(13), None);
    }

    /// A range entry missing some of its metrics must be dropped without panicking.
    #[test]
    fn w2_truncated_entry_is_ignored_not_panic() {
        let table = widths_after(&[int(10), int(12), int(-1000)]);

        assert_eq!(table.get_v(10), None);
    }
}