use crate::cmap::ToUnicodeCMap;
use crate::error::Result;
use crate::object::{Dictionary, Object};
use crate::Document;
use std::collections::BTreeMap;
/// Strategy used to turn the raw bytes of a PDF text string into Unicode text.
///
/// See [`Encoding::bytes_to_string`] for how each variant is applied.
#[derive(Debug, Clone)]
pub enum Encoding {
/// Single-byte lookup table indexed by the byte value. Each entry is a
/// Unicode code point; an entry of `0` means "no mapping", in which case the
/// decoder emits the byte itself as a character in the U+0000..=U+00FF range.
OneByte(Box<[u32; 256]>),
/// No table: every byte is emitted as the character with the same numeric
/// value (U+0000..=U+00FF).
Identity,
/// Decoding is delegated to a parsed `ToUnicode` CMap.
Unicode(ToUnicodeCMap),
}
impl Encoding {
    /// Decodes `bytes` into a `String` according to this encoding.
    ///
    /// * `Unicode` hands the whole slice to the CMap's `decode`.
    /// * `Identity` widens every byte to the `char` with the same value.
    /// * `OneByte` looks each byte up in the table. A zero entry falls back to
    ///   the byte's own value; a non-zero entry that is not a valid Unicode
    ///   scalar value is skipped without producing any output.
    ///
    /// # Errors
    ///
    /// Only the `Unicode` variant can fail; whatever `ToUnicodeCMap::decode`
    /// returns is passed through unchanged.
    pub fn bytes_to_string(&self, bytes: &[u8]) -> Result<String> {
        match self {
            Self::Unicode(cmap) => cmap.decode(bytes),
            Self::Identity => Ok(bytes.iter().copied().map(char::from).collect()),
            Self::OneByte(table) => {
                let mut text = String::with_capacity(bytes.len());
                text.extend(bytes.iter().filter_map(|&byte| {
                    match table[usize::from(byte)] {
                        // Unmapped slot: pass the byte through as-is.
                        0 => Some(char::from(byte)),
                        // Mapped slot: drop it if it is not a valid scalar value.
                        code_point => char::from_u32(code_point),
                    }
                }));
                Ok(text)
            }
        }
    }
}
/// Resolves a text [`Encoding`] for every font in `fonts`.
///
/// The returned map has exactly the same keys as `fonts` (the font resource
/// names); each value is what `resolve_font_encoding` chose for that font's
/// dictionary.
pub fn resolve_page_encodings(
    doc: &Document,
    fonts: &BTreeMap<Vec<u8>, Dictionary>,
) -> BTreeMap<Vec<u8>, Encoding> {
    fonts
        .iter()
        .map(|(name, font_dict)| (name.clone(), resolve_font_encoding(doc, font_dict)))
        .collect()
}
/// Picks the [`Encoding`] to use for a single font dictionary.
///
/// Sources are tried in this order, and the first one that yields a result wins:
///
/// 1. a `ToUnicode` entry that `try_to_unicode` can turn into a CMap;
/// 2. an `Encoding` entry that is a name;
/// 3. an `Encoding` entry that is a reference to a dictionary carrying a
///    `BaseEncoding` name;
/// 4. an `Encoding` entry that is an inline dictionary carrying a
///    `BaseEncoding` name;
/// 5. otherwise the WinAnsi table.
fn resolve_font_encoding(doc: &Document, font: &Dictionary) -> Encoding {
    if let Some(cmap) = font
        .get_optional(b"ToUnicode")
        .and_then(|obj| try_to_unicode(doc, obj))
    {
        return Encoding::Unicode(cmap);
    }

    if let Some(enc_obj) = font.get_optional(b"Encoding") {
        // Direct name, e.g. /WinAnsiEncoding.
        if let Ok(name) = enc_obj.as_name() {
            return predefined_by_name(name);
        }

        // Indirect encoding dictionary.
        if let Some(dict) = enc_obj
            .as_reference()
            .ok()
            .and_then(|id| doc.get_dictionary(id).ok())
        {
            if let Some(Ok(base)) = dict.get_optional(b"BaseEncoding").map(|o| o.as_name()) {
                return predefined_by_name(base);
            }
        }

        // Inline encoding dictionary.
        if let Ok(dict) = enc_obj.as_dict() {
            if let Some(Ok(base)) = dict.get_optional(b"BaseEncoding").map(|o| o.as_name()) {
                return predefined_by_name(base);
            }
        }
    }

    // Nothing usable was found: default to WinAnsi.
    Encoding::OneByte(Box::new(win_ansi_encoding()))
}
fn try_to_unicode(doc: &Document, to_uni: &Object) -> Option<ToUnicodeCMap> {
let stream_obj = match to_uni {
Object::Reference(id) => doc.get_object(*id).ok()?,
Object::Stream(_) => to_uni.clone(),
_ => return None,
};
let stream = stream_obj.as_stream().ok()?;
Some(crate::cmap::ToUnicodeCMap::parse(&stream.content).unwrap_or_default())
}
/// Maps a predefined encoding name to an [`Encoding`].
///
/// The five table-backed names (`StandardEncoding`, `WinAnsiEncoding`,
/// `MacRomanEncoding`, `MacExpertEncoding`, `PDFDocEncoding`) produce a
/// `OneByte` table. Every other name, including `Identity-H` and
/// `Identity-V`, produces [`Encoding::Identity`].
fn predefined_by_name(name: &[u8]) -> Encoding {
    let table = match name {
        b"StandardEncoding" => standard_encoding(),
        b"WinAnsiEncoding" => win_ansi_encoding(),
        b"MacRomanEncoding" => mac_roman_encoding(),
        b"MacExpertEncoding" => mac_expert_encoding(),
        b"PDFDocEncoding" => pdf_doc_encoding(),
        // Identity-H, Identity-V and any unrecognised name.
        _ => return Encoding::Identity,
    };
    Encoding::OneByte(Box::new(table))
}
/// Builds the byte → Unicode table for Adobe `StandardEncoding`.
///
/// Starts from the ASCII table and applies the codes where StandardEncoding
/// differs from it. Codes left at `0` are unassigned; the decoder emits those
/// bytes unchanged.
fn standard_encoding() -> [u32; 256] {
    // (character code, Unicode code point)
    const OVERRIDES: &[(usize, u32)] = &[
        // In StandardEncoding the ASCII quote positions hold the curly
        // quotes (quoteright / quoteleft); the straight apostrophe and the
        // grave accent live at 0xA9 and 0xC1 instead.
        (0x27, 0x2019), (0x60, 0x2018),
        (0xA1, 0x00A1), (0xA2, 0x00A2), (0xA3, 0x00A3), (0xA4, 0x2044),
        (0xA5, 0x00A5), (0xA6, 0x0192), (0xA7, 0x00A7), (0xA8, 0x00A4),
        (0xA9, 0x0027), (0xAA, 0x201C), (0xAB, 0x00AB), (0xAC, 0x2039),
        (0xAD, 0x203A), (0xAE, 0xFB01), (0xAF, 0xFB02),
        (0xB1, 0x2013), (0xB2, 0x2020), (0xB3, 0x2021), (0xB4, 0x00B7),
        (0xB6, 0x00B6), (0xB7, 0x2022), (0xB8, 0x201A), (0xB9, 0x201E),
        (0xBA, 0x201D), (0xBB, 0x00BB), (0xBC, 0x2026), (0xBD, 0x2030),
        (0xBF, 0x00BF),
        (0xC1, 0x0060), (0xC2, 0x00B4), (0xC3, 0x02C6), (0xC4, 0x02DC),
        (0xC5, 0x00AF), (0xC6, 0x02D8), (0xC7, 0x02D9), (0xC8, 0x00A8),
        (0xCA, 0x02DA), (0xCB, 0x00B8), (0xCD, 0x02DD), (0xCE, 0x02DB),
        (0xCF, 0x02C7),
        // emdash
        (0xD0, 0x2014),
        (0xE1, 0x00C6), (0xE3, 0x00AA), (0xE8, 0x0141), (0xE9, 0x00D8),
        (0xEA, 0x0152), (0xEB, 0x00BA),
        (0xF1, 0x00E6), (0xF5, 0x0131), (0xF8, 0x0142), (0xF9, 0x00F8),
        (0xFA, 0x0153), (0xFB, 0x00DF),
    ];

    let mut table = base_ascii_table();
    for &(code, code_point) in OVERRIDES {
        table[code] = code_point;
    }
    table
}
/// Builds the byte → Unicode table used for `WinAnsiEncoding`.
///
/// 0x00..=0x7F come from the ASCII table, 0x80..=0x9F from the explicit table
/// below, and 0xA0..=0xFF map to the code point with the same value. A `0`
/// entry means the code has no mapping.
fn win_ansi_encoding() -> [u32; 256] {
    // Code points for 0x80..=0x9F.
    const HIGH_CONTROL_RANGE: [u32; 32] = [
        0x20AC, 0, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0, 0x017D, 0,
        0, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0, 0x017E, 0x0178,
    ];

    let mut table = base_ascii_table();
    table[0x80..0xA0].copy_from_slice(&HIGH_CONTROL_RANGE);
    for (code, slot) in table.iter_mut().enumerate().skip(0xA0) {
        *slot = code as u32;
    }
    table
}
/// Builds the byte → Unicode table for PDF `MacRomanEncoding`.
///
/// 0x00..=0x7F map to themselves. 0x80..=0xFF take their values from the
/// MacRoman table below; codes that MacRomanEncoding leaves unassigned
/// (written as `0` in the table) keep a same-value passthrough mapping.
fn mac_roman_encoding() -> [u32; 256] {
    // Code points for 0x80..=0xFF, eight per row; `0` = unassigned.
    const UPPER: [u32; 128] = [
        0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, // 0x80
        0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, // 0x88
        0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, // 0x90
        0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, // 0x98
        0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, // 0xA0
        0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0,      0x00C6, 0x00D8, // 0xA8
        0,      0x00B1, 0,      0,      0x00A5, 0x00B5, 0,      0,      // 0xB0
        0,      0,      0,      0x00AA, 0x00BA, 0,      0x00E6, 0x00F8, // 0xB8
        0x00BF, 0x00A1, 0x00AC, 0,      0x0192, 0,      0,      0x00AB, // 0xC0
        0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, // 0xC8
        0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0,      // 0xD0
        0x00FF, 0x0178, 0x2044, 0x00A4, 0x2039, 0x203A, 0xFB01, 0xFB02, // 0xD8
        0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, // 0xE0
        0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, // 0xE8
        0,      0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, // 0xF0
        0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7, // 0xF8
    ];

    // Start from a same-value mapping for every code, then lay the assigned
    // MacRoman code points over the upper half.
    let mut table = [0u32; 256];
    for (code, slot) in table.iter_mut().enumerate() {
        *slot = code as u32;
    }
    for (slot, &code_point) in table[0x80..].iter_mut().zip(UPPER.iter()) {
        if code_point != 0 {
            *slot = code_point;
        }
    }
    table
}
/// Builds the table returned for `MacExpertEncoding`.
///
/// The result maps every byte 0x00..=0xFF to the code point with the same
/// value.
///
/// NOTE(review): this is a passthrough, not the expert glyph set the PDF
/// specification lists for MacExpertEncoding. Replace it with the real table
/// if expert-set fonts need accurate text extraction.
fn mac_expert_encoding() -> [u32; 256] {
    let mut table = base_ascii_table();
    for (code, slot) in table.iter_mut().enumerate().skip(0x80) {
        *slot = code as u32;
    }
    table
}
/// Builds the byte → Unicode table for `PDFDocEncoding`.
///
/// PDFDocEncoding maps most codes to the code point with the same value, but
/// it is not the same as WinAnsi: 0x18..=0x1F hold spacing accents and
/// 0x80..=0xA0 hold punctuation, ligatures and the Euro sign at different
/// positions. A `0` entry means the code is unassigned.
fn pdf_doc_encoding() -> [u32; 256] {
    // Code points for 0x18..=0x1F: breve, caron, circumflex, dotaccent,
    // hungarumlaut, ogonek, ring, tilde.
    const ACCENTS: [u32; 8] = [
        0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC,
    ];
    // Code points for 0x80..=0xA0; 0x9F is unassigned.
    const SPECIALS: [u32; 33] = [
        0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, // 0x80
        0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018, // 0x88
        0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160, // 0x90
        0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E, 0,      // 0x98
        0x20AC, // 0xA0
    ];

    // Same-value mapping everywhere, then overlay the two special ranges.
    let mut table = [0u32; 256];
    for (code, slot) in table.iter_mut().enumerate() {
        *slot = code as u32;
    }
    table[0x18..0x20].copy_from_slice(&ACCENTS);
    table[0x80..0xA1].copy_from_slice(&SPECIALS);
    table
}
/// Returns a 256-entry table whose first 128 slots (0x00..=0x7F) hold their
/// own index and whose remaining slots are `0`.
///
/// The per-encoding builders use it as a starting point and then fill in the
/// upper half.
fn base_ascii_table() -> [u32; 256] {
    let mut table = [0u32; 256];
    for (code, slot) in table.iter_mut().enumerate().take(0x80) {
        *slot = code as u32;
    }
    table
}