use std::collections::HashMap;
use lopdf::{Dictionary, Document, Object};
use crate::pdfium_backend::Glyph;
/// 2-D affine transform held as six coefficients.
///
/// A point `(x, y)` is mapped to `(a*x + c*y + e, b*x + d*y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform.
    const ID: Mat = Mat { a: 1.0, b: 0.0, c: 0.0, d: 1.0, e: 0.0, f: 0.0 };

    /// Composes two transforms; the result is equivalent to applying `self`
    /// first and `m` second.
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Transforms the point `(x, y)` and returns the mapped coordinates.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let Mat { a, b, c, d, e, f } = self;
        let mapped_x = a * x + c * y + e;
        let mapped_y = b * x + d * y + f;
        (mapped_x, mapped_y)
    }
}
/// Decoding data gathered from one font dictionary (built by `parse_font`).
struct Font {
    /// `true` when the font's `Subtype` is `Type0`.
    two_byte: bool,
    /// Character code → text, parsed from the font's `ToUnicode` stream.
    to_unicode: HashMap<u32, String>,
    /// Character code → advance width.
    widths: HashMap<u32, f64>,
    /// Width used for codes that are absent from `widths`.
    default_width: f64,
    /// Byte → character table for single-byte fonts; `None` for `Type0` fonts.
    simple_encoding: Option<HashMap<u8, char>>,
    /// Byte → raw glyph name for `Differences` entries that look like glyph ids.
    fallback_names: HashMap<u8, String>,
    /// Byte → character table read from the embedded `FontFile` program.
    program_encoding: HashMap<u8, char>,
    /// Ascent value (divided by 1000 where it is used in `show_text`).
    ascent: f64,
    /// Descent value (divided by 1000 where it is used in `show_text`).
    descent: f64,
    /// Hash of the font's resource name.
    hash: u64,
}

impl Font {
    /// Looks up the text and width for one character code.
    ///
    /// Sources are tried in this order: `to_unicode`, then (single-byte fonts
    /// only) `fallback_names`, `simple_encoding`, `program_encoding`. A
    /// `fallback_names` hit is returned as the glyph name prefixed with `/`.
    /// The text is `None` when no source knows the code; the width is always
    /// returned.
    fn decode_code(&self, code: u32) -> (Option<String>, f64) {
        let width = self.widths.get(&code).copied().unwrap_or(self.default_width);

        if let Some(mapped) = self.to_unicode.get(&code) {
            return (Some(decompose_ligatures(mapped)), width);
        }
        if self.two_byte {
            return (None, width);
        }

        // Single-byte tables are keyed by the low byte of the code.
        let byte = code as u8;
        if let Some(name) = self.fallback_names.get(&byte) {
            return (Some(format!("/{name}")), width);
        }
        let ch = self
            .simple_encoding
            .as_ref()
            .and_then(|table| table.get(&byte))
            .or_else(|| self.program_encoding.get(&byte));
        (ch.map(|c| decompose_ligatures(&c.to_string())), width)
    }
}
/// Replaces the Latin ligature code points U+FB00..=U+FB06 with their
/// component letters; every other character is copied unchanged.
fn decompose_ligatures(s: &str) -> String {
    let mut expanded = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '\u{FB00}' => expanded.push_str("ff"),
            '\u{FB01}' => expanded.push_str("fi"),
            '\u{FB02}' => expanded.push_str("fl"),
            '\u{FB03}' => expanded.push_str("ffi"),
            '\u{FB04}' => expanded.push_str("ffl"),
            '\u{FB05}' => expanded.push_str("ft"),
            '\u{FB06}' => expanded.push_str("st"),
            other => expanded.push(other),
        }
    }
    expanded
}
/// Hashes a byte string with the standard library's `DefaultHasher`.
///
/// NOTE(review): the std docs do not promise a stable algorithm for
/// `DefaultHasher` across Rust releases, so treat the value as an in-process
/// identifier only.
fn hash_name(name: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    Hash::hash(name, &mut hasher);
    hasher.finish()
}
/// Returns the dictionary behind `obj`: either `obj` itself when it is an
/// inline dictionary, or the object a reference points to when that object
/// can be viewed as a dictionary. Anything else yields `None`.
fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
    if let Object::Dictionary(dict) = obj {
        return Some(dict);
    }
    let Object::Reference(id) = obj else {
        return None;
    };
    doc.get_object(*id).ok()?.as_dict().ok()
}
/// Resolves `obj` when it is an indirect reference (one `get_object` call);
/// any other object is returned as-is. `None` means the reference could not
/// be looked up in `doc`.
fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
    if let Object::Reference(id) = obj {
        doc.get_object(*id).ok()
    } else {
        Some(obj)
    }
}
/// Builds a [`Font`] from the font dictionary `fdict`.
///
/// `name` is the resource name the font is registered under; only its hash is
/// stored. Fonts whose `Subtype` is `Type0` are treated as two-byte fonts and
/// get CID widths and no single-byte encoding tables; every other font gets
/// simple widths plus the three single-byte lookup tables.
fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
    let two_byte = fdict
        .get(b"Subtype")
        .ok()
        .and_then(|o| o.as_name().ok())
        .map_or(false, |subtype| subtype == b"Type0".as_slice());

    let to_unicode_data = fdict
        .get(b"ToUnicode")
        .ok()
        .and_then(|o| deref(doc, o))
        .and_then(|o| o.as_stream().ok())
        .and_then(|s| s.decompressed_content().ok());
    let to_unicode = match to_unicode_data {
        Some(data) => parse_tounicode(&data),
        None => HashMap::new(),
    };

    let (widths, default_width) = if two_byte {
        cid_widths(doc, fdict)
    } else {
        simple_widths(doc, fdict)
    };

    // The single-byte tables are only meaningful for non-Type0 fonts.
    let (simple_encoding, fallback_names, program_encoding) = if two_byte {
        (None, HashMap::new(), HashMap::new())
    } else {
        (
            Some(simple_encoding_table(doc, fdict)),
            differences_gid_names(doc, fdict),
            type1_program_encoding(doc, fdict),
        )
    };

    let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);

    Font {
        two_byte,
        to_unicode,
        widths,
        default_width,
        simple_encoding,
        fallback_names,
        program_encoding,
        ascent,
        descent,
        hash: hash_name(name),
    }
}
/// Collects the `Encoding/Differences` entries whose glyph name cannot be
/// resolved by `glyph_name_to_char` but passes `is_gid_name`, keyed by the
/// byte code the entry is assigned to.
///
/// Returns an empty map when `Encoding` is not a dictionary or has no
/// `Differences` array.
fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
    let differences = fdict
        .get(b"Encoding")
        .ok()
        .and_then(|o| deref(doc, o))
        .and_then(|o| match o {
            Object::Dictionary(enc) => enc.get(b"Differences").ok(),
            _ => None,
        })
        .and_then(|o| deref(doc, o));
    let Some(Object::Array(entries)) = differences else {
        return HashMap::new();
    };

    let mut names = HashMap::new();
    // An integer entry sets the current code; each name takes the current
    // code and advances it by one (wrapping at 255).
    let mut code = 0u8;
    for entry in entries {
        if let Object::Integer(start) = entry {
            code = *start as u8;
        } else if let Object::Name(glyph) = entry {
            if glyph_name_to_char(glyph).is_none() && is_gid_name(glyph) {
                names.insert(code, String::from_utf8_lossy(glyph).into_owned());
            }
            code = code.wrapping_add(1);
        }
    }
    names
}
/// Extracts a byte → character table from the text that precedes `eexec` in
/// the font's `FontDescriptor/FontFile` stream.
///
/// Every whitespace-separated token run of the form `dup <code> /<name> put`
/// with `code <= 255` and a name known to `glyph_name_to_char` contributes an
/// entry. Returns an empty map when the descriptor or stream is unavailable.
fn type1_program_encoding(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
    let program = fdict
        .get(b"FontDescriptor")
        .ok()
        .and_then(|o| deref(doc, o))
        .and_then(|o| o.as_dict().ok())
        .and_then(|desc| desc.get(b"FontFile").ok())
        .and_then(|o| deref(doc, o))
        .and_then(|o| o.as_stream().ok())
        .and_then(|s| s.decompressed_content().ok());
    let Some(program) = program else {
        return HashMap::new();
    };

    // Only the part before the `eexec` marker is scanned (the whole stream
    // when the marker is absent).
    let clear_len = program
        .windows(5)
        .position(|w| w == b"eexec")
        .unwrap_or(program.len());
    let header = String::from_utf8_lossy(&program[..clear_len]);
    let tokens: Vec<&str> = header.split_whitespace().collect();

    let mut table = HashMap::new();
    for quad in tokens.windows(4) {
        if quad[0] != "dup" || quad[3] != "put" {
            continue;
        }
        let Ok(code) = quad[1].parse::<u32>() else {
            continue;
        };
        let Some(glyph) = quad[2].strip_prefix('/') else {
            continue;
        };
        if code > 255 {
            continue;
        }
        if let Some(ch) = glyph_name_to_char(glyph.as_bytes()) {
            table.insert(code as u8, ch);
        }
    }
    table
}
/// Heuristically decides whether a glyph name looks like a generated glyph
/// id rather than a meaningful name.
///
/// Names starting with `afii` or `uni` are never ids. A name is an id when it
/// is one of the prefixes `g`, `G`, `cid`, `CID`, `glyph`, `index` followed
/// only by digits, or when it is 1–3 ASCII letters followed by at least three
/// characters that are all ASCII digits. Non-UTF-8 input is not an id.
fn is_gid_name(name: &[u8]) -> bool {
    const ID_PREFIXES: [&str; 6] = ["g", "G", "cid", "CID", "glyph", "index"];

    let s = match std::str::from_utf8(name) {
        Ok(s) => s,
        Err(_) => return false,
    };
    if s.starts_with("afii") || s.starts_with("uni") {
        return false;
    }

    let only_digits = |t: &str| t.bytes().all(|b| b.is_ascii_digit());

    let prefixed = ID_PREFIXES.iter().any(|prefix| {
        s.strip_prefix(*prefix)
            .map_or(false, |rest| !rest.is_empty() && only_digits(rest))
    });
    if prefixed {
        return true;
    }

    // Split after the leading ASCII letters; the cut is always on a char
    // boundary because only single-byte characters were counted.
    let letters = s.bytes().take_while(|b| b.is_ascii_alphabetic()).count();
    let tail = &s[letters..];
    (1..=3).contains(&letters) && tail.len() >= 3 && only_digits(tail)
}
/// Reads `Ascent` and `Descent` from the font's `FontDescriptor`.
///
/// For two-byte fonts the descriptor is taken from the first entry of
/// `DescendantFonts`; otherwise from `fdict` itself. Missing values default
/// to `750` / `-250`, and that default pair is also returned when
/// `ascent - descent` is not greater than `1`.
fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
    /// Reads one numeric descriptor entry, accepting a real or an integer.
    fn metric(fd: Option<&Dictionary>, key: &[u8], fallback: f32) -> f64 {
        let value = fd.and_then(|d| d.get(key).ok()).and_then(|o| match o.as_float() {
            Ok(v) => Some(v),
            Err(_) => o.as_i64().ok().map(|i| i as f32),
        });
        value.unwrap_or(fallback) as f64
    }

    let owner = if two_byte {
        match fdict.get(b"DescendantFonts").ok().and_then(|o| deref(doc, o)) {
            Some(Object::Array(kids)) => kids.first().and_then(|o| as_dict(doc, o)),
            _ => None,
        }
    } else {
        Some(fdict)
    };
    let descriptor = owner
        .and_then(|d| d.get(b"FontDescriptor").ok())
        .and_then(|o| as_dict(doc, o));

    let ascent = metric(descriptor, b"Ascent", 750.0);
    let descent = metric(descriptor, b"Descent", -250.0);

    if ascent - descent > 1.0 {
        (ascent, descent)
    } else {
        (750.0, -250.0)
    }
}
/// Reads the width table of a single-byte font.
///
/// Returns `(widths, default_width)`: entry `i` of the `Widths` array is
/// stored under code `FirstChar + i` (non-numeric entries are skipped), and
/// the default is `FontDescriptor/MissingWidth`, or `0` when that is absent.
fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
    let mut map = HashMap::new();
    let first = fdict
        .get(b"FirstChar")
        .ok()
        .and_then(|o| o.as_i64().ok())
        .unwrap_or(0) as u32;
    if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
        for (i, w) in arr.iter().enumerate() {
            // `FirstChar` comes from the file; a hostile value must not make
            // the code computation overflow (which panics in debug builds).
            let Some(code) = first.checked_add(i as u32) else {
                break;
            };
            if let Some(w) = num(w) {
                map.insert(code, w);
            }
        }
    }
    let dw = fdict
        .get(b"FontDescriptor")
        .ok()
        .and_then(|o| as_dict(doc, o))
        .and_then(|d| d.get(b"MissingWidth").ok())
        .and_then(num)
        .unwrap_or(0.0);
    (map, dw)
}
/// Reads the width table of a `Type0` font from its first descendant font.
///
/// Returns `(widths, default_width)`. The default is the descendant's `DW`
/// (or `1000`). The `W` array is read in its two forms:
/// `c [w1 w2 …]` assigns consecutive widths starting at CID `c`, and
/// `c_first c_last w` assigns `w` to every CID in the inclusive range.
///
/// CIDs above `0xFFFF` are ignored: `codes` never produces a larger code for
/// a two-byte font, and bounding the range keeps a hostile `W` array from
/// inserting billions of entries or overflowing the CID arithmetic.
fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
    const MAX_CID: u32 = 0xFFFF;

    let mut map = HashMap::new();
    let Some(desc) = fdict
        .get(b"DescendantFonts")
        .ok()
        .and_then(|o| deref(doc, o))
        .and_then(|o| match o {
            Object::Array(a) => a.first(),
            _ => None,
        })
        .and_then(|o| as_dict(doc, o))
    else {
        return (map, 1000.0);
    };
    let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
    if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
        let mut i = 0;
        while i < w.len() {
            let c = w.get(i).and_then(num);
            match (c, w.get(i + 1)) {
                (Some(c), Some(Object::Array(list))) => {
                    // `as u32` on f64 saturates, so `start` is always valid.
                    let start = c as u32;
                    for (k, wv) in list.iter().enumerate() {
                        let cid = match start.checked_add(k as u32) {
                            Some(cid) if cid <= MAX_CID => cid,
                            _ => break,
                        };
                        if let Some(wv) = num(wv) {
                            map.insert(cid, wv);
                        }
                    }
                    i += 2;
                }
                (Some(c1), Some(o2)) => {
                    if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
                        let last = (c2 as u32).min(MAX_CID);
                        for cid in c1 as u32..=last {
                            map.insert(cid, wv);
                        }
                    }
                    i += 3;
                }
                // A non-numeric start or a dangling final element ends parsing.
                _ => break,
            }
        }
    }
    (map, dw)
}
/// Converts an integer or real PDF object to `f64`; other objects give `None`.
fn num(o: &Object) -> Option<f64> {
    let value = match *o {
        Object::Integer(i) => i as f64,
        Object::Real(r) => r as f64,
        _ => return None,
    };
    Some(value)
}
/// Parses the decoded bytes of a `ToUnicode` CMap into a code → text map.
///
/// Only `beginbfchar … endbfchar` and `beginbfrange … endbfrange` sections
/// are interpreted; all other tokens are skipped.
///
/// * `bfchar`: `<src> <dst>` maps one code to the UTF-16BE string `dst`.
/// * `bfrange` with an array: `<lo> <hi> [<d0> <d1> …]` maps `lo`, `lo+1`, …
///   to the listed strings.
/// * `bfrange` with a single destination: `<lo> <hi> <dst>` maps `lo + k` to
///   `dst` with its last character advanced by `k`. Destinations made of
///   several UTF-16 units (ligature strings such as `<00660069>`, or
///   surrogate pairs) keep their leading characters and only the last
///   character is incremented.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    let text = String::from_utf8_lossy(data);
    let mut map = HashMap::new();

    // Decodes a `<hex>` token into big-endian 16-bit units; a trailing single
    // byte becomes a unit of its own. Digit pairs that fail to parse are
    // skipped.
    let hex = |s: &str| -> Option<Vec<u16>> {
        let s = s.trim();
        if !s.starts_with('<') || !s.ends_with('>') {
            return None;
        }
        let h = &s[1..s.len() - 1];
        let bytes: Vec<u8> = (0..h.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(h.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| {
                    if c.len() == 2 {
                        u16::from_be_bytes([c[0], c[1]])
                    } else {
                        c[0] as u16
                    }
                })
                .collect(),
        )
    };
    let u16s_to_string = |u: &[u16]| String::from_utf16_lossy(u);
    // Folds the units of a source code into one integer key.
    let code_of = |u: &[u16]| u.iter().fold(0u32, |acc, &x| (acc << 16) | x as u32);

    // Tokenizer: `<…>` runs, `[` and `]`, and whitespace-delimited words.
    let tokens: Vec<String> = {
        let bytes = text.as_bytes();
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let c = bytes[i];
            if c.is_ascii_whitespace() {
                i += 1;
            } else if c == b'<' {
                let start = i;
                while i < bytes.len() && bytes[i] != b'>' {
                    i += 1;
                }
                // Include the closing `>` when there is one.
                i += 1;
                toks.push(String::from_utf8_lossy(&bytes[start..i.min(bytes.len())]).into_owned());
            } else if c == b'[' || c == b']' {
                toks.push((c as char).to_string());
                i += 1;
            } else {
                let start = i;
                while i < bytes.len()
                    && !bytes[i].is_ascii_whitespace()
                    && bytes[i] != b'<'
                    && bytes[i] != b'['
                    && bytes[i] != b']'
                {
                    i += 1;
                }
                toks.push(String::from_utf8_lossy(&bytes[start..i]).into_owned());
            }
        }
        toks
    };
    let tokens: Vec<&str> = tokens.iter().map(|s| s.as_str()).collect();

    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (hex(tokens[i]), hex(tokens[i + 1])) {
                        map.insert(code_of(&src), u16s_to_string(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (hex(tokens[i]), hex(tokens[i + 1])) else {
                        i += 1;
                        continue;
                    };
                    let lo = code_of(&lo);
                    let hi = code_of(&hi);
                    if tokens[i + 2] == "[" {
                        let mut j = i + 3;
                        let mut code = lo;
                        while j < tokens.len() && tokens[j] != "]" {
                            if let Some(dst) = hex(tokens[j]) {
                                map.insert(code, u16s_to_string(&dst));
                            }
                            // Wrapping keeps a malformed range from panicking.
                            code = code.wrapping_add(1);
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = hex(tokens[i + 2]) {
                        // Split the destination into a fixed prefix and the
                        // code point that is advanced across the range.
                        let (prefix, base) = if dst.len() <= 1 {
                            (String::new(), code_of(&dst))
                        } else {
                            let mut decoded = u16s_to_string(&dst);
                            let last = decoded.pop().map_or(0, |c| c as u32);
                            (decoded, last)
                        };
                        for (k, code) in (lo..=hi).enumerate() {
                            // Past U+10FFFF nothing further in the range can
                            // be a valid character, so stop instead of
                            // iterating (or overflowing) on hostile input.
                            let cp = match base.checked_add(k as u32) {
                                Some(cp) if cp <= char::MAX as u32 => cp,
                                _ => break,
                            };
                            if let Some(ch) = char::from_u32(cp) {
                                let mut mapped = String::with_capacity(prefix.len() + ch.len_utf8());
                                mapped.push_str(&prefix);
                                mapped.push(ch);
                                map.insert(code, mapped);
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
/// Splits a shown string into character codes: one code per byte for
/// single-byte fonts, one big-endian code per byte pair for two-byte fonts
/// (a trailing odd byte becomes a code on its own).
fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
    if !font.two_byte {
        return bytes.iter().map(|&b| u32::from(b)).collect();
    }
    bytes
        .chunks(2)
        .map(|pair| pair.iter().fold(0u32, |acc, &b| (acc << 8) | u32::from(b)))
        .collect()
}
fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
let mb = doc
.get_object(page_id)
.ok()
.and_then(|o| o.as_dict().ok())
.and_then(|d| {
d.get(b"MediaBox").ok().cloned()
})
.or_else(|| {
doc.get_dictionary(page_id)
.ok()
.and_then(|d| d.get(b"MediaBox").ok().cloned())
});
if let Some(Object::Array(a)) = mb {
let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
if v.len() == 4 {
return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
}
}
(612.0, 792.0)
}
/// Loads a PDF from memory and returns, for the page at position `index`
/// (pages ordered by page number), each glyph as `(ch, ll, lr, lb, lt)`.
///
/// Returns an empty vector when the document cannot be loaded or `index` is
/// out of range.
pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
    let doc = match Document::load_mem(bytes) {
        Ok(doc) => doc,
        Err(_) => return Vec::new(),
    };
    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
    pages.sort_by_key(|&(number, _)| number);

    match pages.get(index) {
        Some(&(_, pid)) => page_glyphs(&doc, pid)
            .into_iter()
            .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
            .collect(),
        None => Vec::new(),
    }
}
/// Loads a PDF from memory and returns one `(width, height, cells)` tuple per
/// page, in page-number order, where `cells` comes from
/// `crate::dp_lines::line_cells(&glyphs, height, true)`.
///
/// Returns an empty vector when the document cannot be loaded.
pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
    let doc = match Document::load_mem(bytes) {
        Ok(doc) => doc,
        Err(_) => return Vec::new(),
    };
    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
    pages.sort_by_key(|&(number, _)| number);

    let mut per_page = Vec::with_capacity(pages.len());
    for (_, pid) in pages {
        let (width, height) = page_size(&doc, pid);
        let glyphs = page_glyphs(&doc, pid);
        let cells = crate::dp_lines::line_cells(&glyphs, height, true);
        per_page.push((width, height, cells));
    }
    per_page
}
/// Loads a PDF from memory and returns one `(width, height, cells)` tuple per
/// page, in page-number order, where `cells` comes from
/// `crate::dp_lines::word_cells(&glyphs, height, true)`.
///
/// Returns an empty vector when the document cannot be loaded.
pub fn pdf_words(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
    let doc = match Document::load_mem(bytes) {
        Ok(doc) => doc,
        Err(_) => return Vec::new(),
    };
    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
    pages.sort_by_key(|&(number, _)| number);

    let mut per_page = Vec::with_capacity(pages.len());
    for (_, pid) in pages {
        let (width, height) = page_size(&doc, pid);
        let glyphs = page_glyphs(&doc, pid);
        let cells = crate::dp_lines::word_cells(&glyphs, height, true);
        per_page.push((width, height, cells));
    }
    per_page
}
/// The three cell lists produced for one page by `pdf_all_cells`.
#[derive(Default)]
pub struct PageParserCells {
/// Cells returned by `crate::dp_lines::line_cells`.
pub prose: Vec<crate::pdfium_backend::TextCell>,
/// Cells returned by `crate::dp_lines::word_cells`.
pub words: Vec<crate::pdfium_backend::TextCell>,
/// Cells returned by `crate::pdfium_backend::code_cells_from_glyphs`.
pub code: Vec<crate::pdfium_backend::TextCell>,
}
/// Loads a PDF from memory and builds a [`PageParserCells`] for every page,
/// in page-number order. The page's glyphs are extracted once and fed to the
/// line, word and code cell builders together with the page height.
///
/// Returns an empty vector when the document cannot be loaded.
pub fn pdf_all_cells(bytes: &[u8]) -> Vec<PageParserCells> {
    let doc = match Document::load_mem(bytes) {
        Ok(doc) => doc,
        Err(_) => return Vec::new(),
    };
    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
    pages.sort_by_key(|&(number, _)| number);

    let mut per_page = Vec::with_capacity(pages.len());
    for (_, pid) in pages {
        let (_, height) = page_size(&doc, pid);
        let glyphs = page_glyphs(&doc, pid);
        let prose = crate::dp_lines::line_cells(&glyphs, height, true);
        let words = crate::dp_lines::word_cells(&glyphs, height, true);
        let code = crate::pdfium_backend::code_cells_from_glyphs(&glyphs, height);
        per_page.push(PageParserCells { prose, words, code });
    }
    per_page
}
/// Text-state parameters handed to `run_content` when a content stream
/// starts (and forwarded to nested form XObjects).
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (set by the `Tc` operator).
    tc: f64,
    /// Word spacing (set by the `Tw` operator).
    tw: f64,
    /// Horizontal scaling as a factor (the `Tz` operand divided by 100).
    th: f64,
    /// Text leading (set by `TL`, or by `TD` as the negated y offset).
    tl: f64,
    /// Text rise (set by the `Ts` operator).
    trise: f64,
    /// Font size (second operand of `Tf`).
    fsize: f64,
}

impl TextState {
    /// Starting state: horizontal scaling `1.0`, every other parameter `0.0`.
    const INIT: Self = Self {
        tc: 0.0,
        tw: 0.0,
        th: 1.0,
        tl: 0.0,
        trise: 0.0,
        fsize: 0.0,
    };
}
/// Picks the resource dictionary to use for a page: the inline dictionary
/// reported by `get_page_resources` when there is one, otherwise the first of
/// the reported object ids that resolves to a dictionary.
fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
    let (inline, ids) = doc.get_page_resources(page_id).ok()?;
    inline.or_else(|| {
        ids.into_iter()
            .find_map(|id| doc.get_dictionary(id).ok())
    })
}
/// Parses every entry of the resource dictionary's `Font` sub-dictionary,
/// keyed by resource name. Entries that do not resolve to a dictionary are
/// skipped; a missing `Font` entry yields an empty map.
fn fonts_from_res(doc: &Document, res: &Dictionary) -> HashMap<Vec<u8>, Font> {
    let entries = res
        .get(b"Font")
        .ok()
        .and_then(|o| deref(doc, o))
        .and_then(|o| o.as_dict().ok());
    let Some(entries) = entries else {
        return HashMap::new();
    };

    entries
        .iter()
        .filter_map(|(name, value)| {
            let fdict = deref(doc, value)?.as_dict().ok()?;
            Some((name.clone(), parse_font(doc, name, fdict)))
        })
        .collect()
}
/// Extracts the glyphs shown by a page's content stream.
///
/// Returns an empty vector when the content cannot be read or decoded, or
/// when no resource dictionary is found for the page.
pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
    let mut glyphs = Vec::new();

    let decoded = doc
        .get_page_content(page_id)
        .ok()
        .and_then(|raw| lopdf::content::Content::decode(&raw).ok());
    let Some(content) = decoded else {
        return glyphs;
    };
    let Some(res) = page_res(doc, page_id) else {
        return glyphs;
    };

    run_content(doc, res, &content, Mat::ID, TextState::INIT, 0, &mut glyphs);
    glyphs
}
/// Interprets one content stream and appends the glyphs it shows to `out`.
///
/// * `res` – resource dictionary used to resolve `Tf` font names and `Do`
///   XObject names.
/// * `content` – the decoded operations to run.
/// * `base_ctm` – transform in effect when the stream starts.
/// * `init` – text-state parameters the stream starts with.
/// * `depth` – form XObject nesting level; `Do` is ignored once it reaches 8.
///
/// Handled operators: `q`/`Q`, `cm`, `BT`, `Tf`, `Td`, `TD`, `Tm`, `T*`,
/// `Tc`, `Tw`, `Tz`, `TL`, `Ts`, `Tj`, `'`, `"`, `TJ` and `Do` (form
/// XObjects only). Everything else is skipped. Missing or non-numeric
/// operands are read as `0`.
fn run_content(
    doc: &Document,
    res: &Dictionary,
    content: &lopdf::content::Content,
    base_ctm: Mat,
    init: TextState,
    depth: u32,
    out: &mut Vec<Glyph>,
) {
    let fonts = fonts_from_res(doc, res);
    let xobjects = res
        .get(b"XObject")
        .ok()
        .and_then(|o| deref(doc, o))
        .and_then(|o| o.as_dict().ok());
    // Saved by `q`, restored by `Q`: (ctm, tc, tw, th, tl, trise, fsize, font).
    #[allow(clippy::type_complexity)]
    let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Font>)> = Vec::new();
    let mut ctm = base_ctm;
    let mut tm = Mat::ID;
    let mut tlm = Mat::ID;
    let mut font: Option<&Font> = None;
    let mut fsize = init.fsize;
    let mut tc = init.tc;
    let mut tw = init.tw;
    let mut th = init.th;
    let mut tl = init.tl;
    let mut trise = init.trise;

    // Numeric operand `i`, or 0 when it is missing or not a number.
    let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
    // Matrix built from the first six operands.
    let op_mat = |operands: &[Object]| Mat {
        a: op_f(operands, 0),
        b: op_f(operands, 1),
        c: op_f(operands, 2),
        d: op_f(operands, 3),
        e: op_f(operands, 4),
        f: op_f(operands, 5),
    };
    // Pure translation by (tx, ty).
    let translate = |tx: f64, ty: f64| Mat {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: tx,
        f: ty,
    };

    for op in &content.operations {
        let operands = &op.operands;
        match op.operator.as_str() {
            "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
            "Q" => {
                // An unbalanced `Q` leaves the current state untouched.
                if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
                    ctm = c;
                    tc = a;
                    tw = b;
                    th = h;
                    tl = l;
                    trise = r;
                    fsize = fs;
                    font = f;
                }
            }
            "cm" => {
                ctm = op_mat(operands).then(ctm);
            }
            "BT" => {
                tm = Mat::ID;
                tlm = Mat::ID;
            }
            "ET" => {}
            "Tf" => {
                if let Some(Object::Name(n)) = operands.first() {
                    // An unknown font name clears the current font.
                    font = fonts.get(n.as_slice());
                }
                fsize = op_f(operands, 1);
            }
            "Td" => {
                tlm = translate(op_f(operands, 0), op_f(operands, 1)).then(tlm);
                tm = tlm;
            }
            "TD" => {
                // Same move as `Td`, and the leading becomes the negated y offset.
                tl = -op_f(operands, 1);
                tlm = translate(op_f(operands, 0), op_f(operands, 1)).then(tlm);
                tm = tlm;
            }
            "Tm" => {
                tlm = op_mat(operands);
                tm = tlm;
            }
            "T*" => {
                tlm = translate(0.0, -tl).then(tlm);
                tm = tlm;
            }
            "Tc" => tc = op_f(operands, 0),
            "Tw" => tw = op_f(operands, 0),
            "Tz" => th = op_f(operands, 0) / 100.0,
            "TL" => tl = op_f(operands, 0),
            "Ts" => trise = op_f(operands, 0),
            "Tj" | "'" | "\"" => {
                if op.operator == "\"" && operands.len() >= 3 {
                    // `aw ac string "` also stores aw and ac as the word and
                    // character spacing before the string is shown.
                    tw = op_f(operands, 0);
                    tc = op_f(operands, 1);
                }
                if op.operator == "'" || op.operator == "\"" {
                    // Both quote operators move to the next line first.
                    tlm = translate(0.0, -tl).then(tlm);
                    tm = tlm;
                }
                // The string is the last operand for all three operators.
                if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
                    show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
                }
            }
            "TJ" => {
                if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
                    for el in arr {
                        match el {
                            Object::String(s, _) => {
                                show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
                            }
                            other => {
                                // Numbers shift the text position, in
                                // thousandths of the font size.
                                if let Some(adj) = num(other) {
                                    let tx = -adj / 1000.0 * fsize * th;
                                    tm = translate(tx, 0.0).then(tm);
                                }
                            }
                        }
                    }
                }
            }
            "Do" => {
                // Recursion guard for nested (or self-referencing) forms.
                if depth >= 8 {
                    continue;
                }
                let Some(Object::Name(n)) = operands.first() else {
                    continue;
                };
                let stream = xobjects
                    .and_then(|d| d.get(n.as_slice()).ok())
                    .and_then(|o| deref(doc, o))
                    .and_then(|o| o.as_stream().ok());
                let Some(stream) = stream else { continue };
                let is_form = stream
                    .dict
                    .get(b"Subtype")
                    .ok()
                    .and_then(|o| o.as_name().ok())
                    == Some(b"Form".as_slice());
                if !is_form {
                    continue;
                }
                let Ok(data) = stream.decompressed_content() else {
                    continue;
                };
                let Ok(form_content) = lopdf::content::Content::decode(&data) else {
                    continue;
                };
                // The form's own `Matrix`; identity unless it is six numbers.
                let form_mat = match stream.dict.get(b"Matrix").ok() {
                    Some(Object::Array(a)) if a.len() == 6 => {
                        let v: Vec<f64> = a.iter().filter_map(num).collect();
                        if v.len() == 6 {
                            Mat {
                                a: v[0],
                                b: v[1],
                                c: v[2],
                                d: v[3],
                                e: v[4],
                                f: v[5],
                            }
                        } else {
                            Mat::ID
                        }
                    }
                    _ => Mat::ID,
                };
                // A form without its own `Resources` uses the caller's.
                let form_res = stream
                    .dict
                    .get(b"Resources")
                    .ok()
                    .and_then(|o| deref(doc, o))
                    .and_then(|o| o.as_dict().ok())
                    .unwrap_or(res);
                let state = TextState {
                    tc,
                    tw,
                    th,
                    tl,
                    trise,
                    fsize,
                };
                run_content(
                    doc,
                    form_res,
                    &form_content,
                    form_mat.then(ctm),
                    state,
                    depth + 1,
                    out,
                );
            }
            _ => {}
        }
    }
}
/// Emits one [`Glyph`] per decoded character of a shown string and advances
/// the text matrix `tm` past every character code.
///
/// The glyph box spans from the font's descent to its ascent vertically and
/// over the code's advance width horizontally, transformed by the text
/// scaling, `tm` and `ctm`. Codes without a decoding, and NUL characters,
/// produce no glyph but still advance `tm`. Word spacing `tw` is added only
/// for code 32 of a single-byte font.
#[allow(clippy::too_many_arguments)]
fn show_text(
    font: &Font,
    bytes: &[u8],
    fsize: f64,
    tc: f64,
    tw: f64,
    th: f64,
    trise: f64,
    tm: &mut Mat,
    ctm: Mat,
    out: &mut Vec<Glyph>,
) {
    // These do not change while the string is being shown.
    let scale = Mat {
        a: fsize * th,
        b: 0.0,
        c: 0.0,
        d: fsize,
        e: 0.0,
        f: trise,
    };
    let low = font.descent / 1000.0;
    let high = font.ascent / 1000.0;

    for code in codes(font, bytes) {
        let (text, width) = font.decode_code(code);
        let unit_width = width / 1000.0;

        let trm = scale.then(*tm).then(ctm);
        let (x0, y0) = trm.apply(0.0, low);
        let (x1, _) = trm.apply(unit_width, low);
        let (_, y2) = trm.apply(0.0, high);
        let l = x0.min(x1) as f32;
        let r = x0.max(x1) as f32;
        let b = y0.min(y2) as f32;
        let t = y0.max(y2) as f32;

        // Every character of a multi-character decoding shares the same box.
        for ch in text.iter().flat_map(|s| s.chars()).filter(|&c| c != '\u{0}') {
            out.push(Glyph {
                ch,
                l,
                b,
                r,
                t,
                ll: l,
                lb: b,
                lr: r,
                lt: t,
                font: font.hash,
            });
        }

        let word_gap = if !font.two_byte && code == 32 { tw } else { 0.0 };
        let tx = (unit_width * fsize + tc + word_gap) * th;
        *tm = Mat {
            a: 1.0,
            b: 0.0,
            c: 0.0,
            d: 1.0,
            e: tx,
            f: 0.0,
        }
        .then(*tm);
    }
}
/// Builds the byte → character table for a single-byte font.
///
/// The base table is `macroman_table()` when the encoding name (either
/// `Encoding` itself or `Encoding/BaseEncoding`) is `MacRomanEncoding`, and
/// `winansi_table()` in every other case. `Encoding/Differences` entries
/// whose glyph name resolves via `glyph_name_to_char` then override the base.
fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
    let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
    let enc_dict = match enc {
        Some(Object::Dictionary(d)) => Some(d),
        _ => None,
    };

    let base: &[u8] = match enc {
        Some(Object::Name(n)) => n.as_slice(),
        _ => enc_dict
            .and_then(|d| d.get(b"BaseEncoding").ok())
            .and_then(|o| o.as_name().ok())
            .unwrap_or(&[]),
    };
    let mut table = if base == b"MacRomanEncoding".as_slice() {
        macroman_table()
    } else {
        winansi_table()
    };

    let differences = enc_dict
        .and_then(|d| d.get(b"Differences").ok())
        .and_then(|o| deref(doc, o));
    if let Some(Object::Array(entries)) = differences {
        // An integer entry sets the current code; each name takes the current
        // code and advances it by one (wrapping at 255).
        let mut code = 0u8;
        for entry in entries {
            if let Object::Integer(start) = entry {
                code = *start as u8;
            } else if let Object::Name(glyph) = entry {
                if let Some(ch) = glyph_name_to_char(glyph) {
                    table.insert(code, ch);
                }
                code = code.wrapping_add(1);
            }
        }
    }
    table
}
/// Maps a glyph name to the character it stands for.
///
/// Resolution order:
/// 1. `uniXXXX…` – the first four characters after `uni` parsed as a hex
///    code point. Names that start with `uni` but are not followed by four
///    hex digits (for example `union`) fall through to the later steps
///    instead of being rejected.
/// 2. A single ASCII letter maps to itself.
/// 3. A built-in table of punctuation, digits, ligatures, Greek letters and
///    mathematical symbols.
/// 4. For `base.suffix` names, the part before the first `.` is looked up
///    recursively.
///
/// Returns `None` for non-UTF-8 names and names none of the steps recognise.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let s = std::str::from_utf8(name).ok()?;
    if let Some(hex) = s.strip_prefix("uni") {
        // No `?` here: a short or non-hex tail must not abort the lookup.
        if let Some(cp) = hex.get(0..4).and_then(|h| u32::from_str_radix(h, 16).ok()) {
            return char::from_u32(cp);
        }
    }
    if s.len() == 1 {
        let b = s.as_bytes()[0];
        if b.is_ascii_alphabetic() {
            return Some(b as char);
        }
    }
    let resolved = match s {
        "space" => ' ',
        "exclam" => '!',
        "quotedbl" => '"',
        "numbersign" => '#',
        "dollar" => '$',
        "percent" => '%',
        "ampersand" => '&',
        "quotesingle" => '\'',
        "parenleft" => '(',
        "parenright" => ')',
        "asterisk" => '*',
        "plus" => '+',
        "comma" => ',',
        "hyphen" => '-',
        "period" => '.',
        "slash" => '/',
        "zero" => '0',
        "one" => '1',
        "two" => '2',
        "three" => '3',
        "four" => '4',
        "five" => '5',
        "six" => '6',
        "seven" => '7',
        "eight" => '8',
        "nine" => '9',
        "colon" => ':',
        "semicolon" => ';',
        "less" => '<',
        "equal" => '=',
        "greater" => '>',
        "question" => '?',
        "at" => '@',
        "bracketleft" => '[',
        "backslash" => '\\',
        "bracketright" => ']',
        "asciicircum" => '^',
        "underscore" => '_',
        "grave" => '`',
        "braceleft" => '{',
        "bar" => '|',
        "braceright" => '}',
        "asciitilde" => '~',
        "bullet" => '\u{2022}',
        "periodcentered" => '\u{00B7}',
        "endash" => '\u{2013}',
        "emdash" => '\u{2014}',
        "quoteright" => '\u{2019}',
        "quoteleft" => '\u{2018}',
        "quotedblleft" => '\u{201C}',
        "quotedblright" => '\u{201D}',
        "quotedblbase" => '\u{201E}',
        "quotesinglbase" => '\u{201A}',
        "ff" => '\u{FB00}',
        "fi" => '\u{FB01}',
        "fl" => '\u{FB02}',
        "ffi" => '\u{FB03}',
        "ffl" => '\u{FB04}',
        "ft" => '\u{FB05}',
        "st" => '\u{FB06}',
        "degree" => '\u{00B0}',
        "trademark" => '\u{2122}',
        "registered" => '\u{00AE}',
        "copyright" => '\u{00A9}',
        "ellipsis" => '\u{2026}',
        "minus" => '\u{2212}',
        "fraction" => '\u{2044}',
        "nbspace" => '\u{00A0}',
        "alpha" => '\u{03B1}',
        "beta" => '\u{03B2}',
        "gamma" => '\u{03B3}',
        "delta" => '\u{03B4}',
        "epsilon" | "epsilon1" => '\u{03B5}',
        "zeta" => '\u{03B6}',
        "eta" => '\u{03B7}',
        "theta" | "theta1" => '\u{03B8}',
        "iota" => '\u{03B9}',
        "kappa" => '\u{03BA}',
        "lambda" => '\u{03BB}',
        "mu" => '\u{03BC}',
        "nu" => '\u{03BD}',
        "xi" => '\u{03BE}',
        "omicron" => '\u{03BF}',
        "pi" | "pi1" => '\u{03C0}',
        "rho" | "rho1" => '\u{03C1}',
        "sigma" => '\u{03C3}',
        "sigma1" => '\u{03C2}',
        "tau" => '\u{03C4}',
        "upsilon" => '\u{03C5}',
        "phi" | "phi1" => '\u{03C6}',
        "chi" => '\u{03C7}',
        "psi" => '\u{03C8}',
        "omega" | "omega1" => '\u{03C9}',
        "Gamma" => '\u{0393}',
        "Delta" => '\u{0394}',
        "Theta" => '\u{0398}',
        "Lambda" => '\u{039B}',
        "Xi" => '\u{039E}',
        "Pi" => '\u{03A0}',
        "Sigma" => '\u{03A3}',
        "Upsilon" => '\u{03A5}',
        "Phi" => '\u{03A6}',
        "Psi" => '\u{03A8}',
        "Omega" => '\u{03A9}',
        "lessequal" => '\u{2264}',
        "greaterequal" => '\u{2265}',
        "notequal" => '\u{2260}',
        "approxequal" => '\u{2248}',
        "equivalence" => '\u{2261}',
        "element" => '\u{2208}',
        "plusminus" => '\u{00B1}',
        "multiply" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "infinity" => '\u{221E}',
        "partialdiff" => '\u{2202}',
        "gradient" => '\u{2207}',
        "summation" => '\u{2211}',
        "product" => '\u{220F}',
        "integral" => '\u{222B}',
        "radical" => '\u{221A}',
        "proportional" => '\u{221D}',
        "arrowright" => '\u{2192}',
        "arrowleft" => '\u{2190}',
        "arrowup" => '\u{2191}',
        "arrowdown" => '\u{2193}',
        "arrowboth" => '\u{2194}',
        "arrowdblright" => '\u{21D2}',
        "logicaland" => '\u{2227}',
        "logicalor" => '\u{2228}',
        "intersection" => '\u{2229}',
        "union" => '\u{222A}',
        "similar" => '\u{223C}',
        "congruent" => '\u{2245}',
        "dotmath" => '\u{22C5}',
        "asteriskmath" => '\u{2217}',
        _ => {
            // `name.suffix` variants resolve through their base name.
            if let Some((base, _)) = s.split_once('.') {
                if !base.is_empty() {
                    return glyph_name_to_char(base.as_bytes());
                }
            }
            return None;
        }
    };
    Some(resolved)
}
/// Byte → character table used as the default single-byte encoding.
///
/// Covers 0x20..=0x7E and 0xA1..=0xFF as the identically numbered code
/// points, plus a handful of punctuation codes in 0x85..=0xA0. Bytes outside
/// those sets have no entry.
fn winansi_table() -> HashMap<u8, char> {
    const PUNCTUATION: [(u8, char); 9] = [
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x85, '\u{2026}'),
        (0xA0, '\u{00A0}'),
    ];

    let ascii = (0x20u8..=0x7e).map(|b| (b, b as char));
    let upper = (0xA1u8..=0xFF).map(|b| (b, b as char));
    // The three key sets are disjoint, so insertion order does not matter.
    ascii.chain(PUNCTUATION).chain(upper).collect()
}
/// Byte → character table used when the encoding is `MacRomanEncoding`.
///
/// Covers 0x20..=0x7E as the identically numbered code points plus a small
/// set of high-byte punctuation and the `fi` / `fl` ligatures. Other bytes
/// have no entry.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH: [(u8, char); 11] = [
        (0xA5, '\u{2022}'),
        (0xD0, '\u{2013}'),
        (0xD1, '\u{2014}'),
        (0xD2, '\u{201C}'),
        (0xD3, '\u{201D}'),
        (0xD4, '\u{2018}'),
        (0xD5, '\u{2019}'),
        (0xCA, '\u{00A0}'),
        (0xC9, '\u{2026}'),
        (0xDE, '\u{FB01}'),
        (0xDF, '\u{FB02}'),
    ];

    (0x20u8..=0x7e)
        .map(|b| (b, b as char))
        .chain(HIGH)
        .collect()
}