use std::collections::{BTreeMap, HashMap};
use lopdf::{Dictionary, Object, ObjectId};
use crate::error::Result;
/// A run of text decoded from one string operand of a text-showing operator.
#[derive(Debug, Clone, PartialEq)]
pub struct TextFragment {
/// Decoded Unicode text of the run.
pub text: String,
/// Horizontal text position tracked by the content-stream parser when the run was shown.
pub x: f32,
/// Vertical text position tracked by the content-stream parser when the run was shown.
pub y: f32,
/// Sum of the glyph advance widths, each scaled by `font_size / 1000`.
pub width: f32,
/// Font size that was in effect when the run was shown.
pub font_size: f32,
}
/// Per-font data needed to turn string operands into text and advance widths.
struct FontInfo {
/// Maps a character code read from a string operand to its Unicode character.
to_unicode: BTreeMap<u16, char>,
/// Width returned by `advance_width` when no entry of `w_runs` covers a code.
dw: u32,
/// Explicit width runs, searched in order by `advance_width`.
w_runs: Vec<WidthRun>,
/// Bytes consumed per character code: 2 for Type0 fonts, 1 for simple fonts.
bytes_per_char: u8,
}
/// A contiguous range of codes with one explicit width per code.
struct WidthRun {
/// Code that `widths[0]` applies to.
start_gid: u16,
/// Widths for `start_gid`, `start_gid + 1`, ... in order.
widths: Vec<u32>,
}
impl FontInfo {
    /// Returns the advance width recorded for `gid`.
    ///
    /// The runs are scanned in order and the first one that covers `gid` wins;
    /// if none does, the font's default width `dw` is returned.
    fn advance_width(&self, gid: u16) -> u32 {
        self.w_runs
            .iter()
            .find_map(|run| {
                // `checked_sub` rejects codes that lie before the run start.
                let offset = usize::from(gid.checked_sub(run.start_gid)?);
                run.widths.get(offset).copied()
            })
            .unwrap_or(self.dw)
    }
}
/// Extracts every text fragment found in the content streams of one page.
///
/// The page's fonts are collected once and shared by all of its content
/// streams; fragments are returned in the order they are encountered.
pub(crate) fn extract_text_runs_from_page(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> Result<Vec<TextFragment>> {
    let fonts = collect_fonts(doc, page_id);
    let mut fragments = Vec::new();
    for content in page_content_streams(doc, page_id) {
        parse_content_stream(&content, &fonts, &mut fragments);
    }
    Ok(fragments)
}
fn page_content_streams(doc: &lopdf::Document, page_id: ObjectId) -> Vec<Vec<u8>> {
let Ok(page_obj) = doc.get_object(page_id) else {
return vec![];
};
let Ok(page_dict) = page_obj.as_dict() else {
return vec![];
};
let Ok(contents_obj) = page_dict.get(b"Contents") else {
return vec![];
};
let ids: Vec<ObjectId> = match contents_obj {
Object::Reference(id) => vec![*id],
Object::Array(arr) => arr
.iter()
.filter_map(|o| {
if let Object::Reference(id) = o { Some(*id) } else { None }
})
.collect(),
_ => return vec![],
};
let mut result = Vec::new();
for id in ids {
let Ok(stream_obj) = doc.get_object(id) else { continue };
let Ok(stream) = stream_obj.as_stream() else { continue };
let has_filter = stream.dict.get(b"Filter").is_ok();
if has_filter {
let mut owned = stream.clone();
if owned.decompress().is_ok() {
result.push(owned.content);
}
} else {
result.push(stream.content.clone());
}
}
result
}
/// Returns the dictionary behind `obj`, following one level of indirection.
///
/// Yields `None` when `obj` is neither a dictionary nor a reference that
/// resolves to one.
fn resolve_dict<'a>(doc: &'a lopdf::Document, obj: &'a Object) -> Option<&'a Dictionary> {
    if let Object::Dictionary(dict) = obj {
        return Some(dict);
    }
    let Object::Reference(id) = obj else {
        return None;
    };
    doc.get_object(*id).ok()?.as_dict().ok()
}
/// Collects the fonts declared in the page's resources, keyed by resource name.
///
/// Falls back to an empty map when the page, its resources or its font
/// dictionary cannot be resolved.
fn collect_fonts(doc: &lopdf::Document, page_id: ObjectId) -> HashMap<Vec<u8>, FontInfo> {
    match collect_fonts_inner(doc, page_id) {
        Some(fonts) => fonts,
        None => HashMap::new(),
    }
}
/// Builds the font map for a page, or `None` if the page has no resolvable
/// `/Resources` → `/Font` dictionary.
///
/// Only fonts stored as indirect references are considered. `Type0` fonts are
/// handled as composite fonts; `Type1`, `MMType1` and `TrueType` as simple
/// fonts. Entries of any other subtype, or that fail to load, are skipped.
fn collect_fonts_inner(
    doc: &lopdf::Document,
    page_id: ObjectId,
) -> Option<HashMap<Vec<u8>, FontInfo>> {
    let page_dict = doc.get_object(page_id).ok()?.as_dict().ok()?;
    let resources_dict = resolve_dict(doc, page_dict.get(b"Resources").ok()?)?;
    let font_dict = resolve_dict(doc, resources_dict.get(b"Font").ok()?)?;

    let mut fonts = HashMap::new();
    for (name, entry) in font_dict.iter() {
        let Object::Reference(font_id) = entry else { continue };
        let Some(fd) = doc.get_object(*font_id).ok().and_then(|o| o.as_dict().ok()) else {
            continue;
        };
        let subtype: Option<&[u8]> = match fd.get(b"Subtype") {
            Ok(Object::Name(n)) => Some(n.as_slice()),
            _ => None,
        };
        let font_info = match subtype {
            Some(b"Type0") => {
                let Some(info) = collect_type0_font(fd, doc) else { continue };
                info
            }
            Some(b"Type1") | Some(b"MMType1") | Some(b"TrueType") => {
                collect_simple_font(fd, doc)
            }
            _ => continue,
        };
        fonts.insert(name.clone(), font_info);
    }
    Some(fonts)
}
fn collect_type0_font(fd: &Dictionary, doc: &lopdf::Document) -> Option<FontInfo> {
let to_unicode = try_parse_to_unicode(fd, doc).unwrap_or_default();
let desc_obj = fd.get(b"DescendantFonts").ok()?;
let Object::Array(desc_arr) = desc_obj else { return None };
let Some(Object::Reference(cid_id)) = desc_arr.first() else { return None };
let Ok(cid_obj) = doc.get_object(*cid_id) else { return None };
let Ok(cid_dict) = cid_obj.as_dict() else { return None };
let dw = cid_dict
.get(b"DW")
.ok()
.and_then(|o| o.as_i64().ok())
.map(|n| n as u32)
.unwrap_or(1000);
let w_runs = cid_dict
.get(b"W")
.ok()
.and_then(|o| if let Object::Array(a) = o { Some(a.as_slice()) } else { None })
.map(parse_w_array)
.unwrap_or_default();
Some(FontInfo { to_unicode, dw, w_runs, bytes_per_char: 2 })
}
/// Builds the [`FontInfo`] for a single-byte (simple) font.
///
/// A usable `/ToUnicode` CMap takes precedence; otherwise the code map is
/// derived from the font's `/Encoding`.
fn collect_simple_font(fd: &Dictionary, doc: &lopdf::Document) -> FontInfo {
    let to_unicode =
        try_parse_to_unicode(fd, doc).unwrap_or_else(|| build_encoding_map(fd, doc));
    let (w_runs, dw) = collect_simple_font_widths(fd, doc);
    FontInfo { to_unicode, dw, w_runs, bytes_per_char: 1 }
}
/// Loads and parses the font's `/ToUnicode` CMap stream.
///
/// Returns `None` when the entry is missing, is not an indirect reference to a
/// stream, cannot be decompressed, or yields no mappings at all.
fn try_parse_to_unicode(
    fd: &Dictionary,
    doc: &lopdf::Document,
) -> Option<BTreeMap<u16, char>> {
    let Object::Reference(to_uni_id) = fd.get(b"ToUnicode").ok()? else {
        return None;
    };
    let stream = doc.get_object(*to_uni_id).ok()?.as_stream().ok()?;

    let cmap_bytes = if stream.dict.get(b"Filter").is_err() {
        stream.content.clone()
    } else {
        // Work on a copy so the document's stream is left compressed.
        let mut inflated = stream.clone();
        inflated.decompress().ok()?;
        inflated.content
    };

    let map = parse_to_unicode_cmap(&cmap_bytes);
    (!map.is_empty()).then_some(map)
}
/// Reads `/FirstChar` and `/Widths` of a simple font.
///
/// Returns the explicit width run (empty when either entry is missing or the
/// array is empty) together with the fallback width taken from the font
/// descriptor.
///
/// Every element of `/Widths` keeps its position: integers are clamped to the
/// `u32` range, reals are rounded, and anything else falls back to the default
/// width. Dropping such elements would shift all later widths onto the wrong
/// character codes. `/Widths` may be a direct array or a reference to one.
fn collect_simple_font_widths(
    fd: &Dictionary,
    doc: &lopdf::Document,
) -> (Vec<WidthRun>, u32) {
    let dw = missing_width_from_descriptor(fd, doc);

    let Some(first_char) = fd.get(b"FirstChar").ok().and_then(|o| o.as_i64().ok()) else {
        return (Vec::new(), dw);
    };

    // Follow one level of indirection for the array itself.
    let widths_obj = match fd.get(b"Widths") {
        Ok(Object::Reference(id)) => doc.get_object(*id).ok(),
        Ok(direct) => Some(direct),
        Err(_) => None,
    };
    let Some(Object::Array(widths_arr)) = widths_obj else {
        return (Vec::new(), dw);
    };

    let widths: Vec<u32> = widths_arr
        .iter()
        .map(|entry| match entry {
            Object::Integer(n) => (*n).max(0).min(i64::from(u32::MAX)) as u32,
            // Float-to-int `as` saturates, so negative and NaN values become 0.
            Object::Real(r) => (*r as f64).round() as u32,
            _ => dw,
        })
        .collect();
    if widths.is_empty() {
        return (Vec::new(), dw);
    }

    (vec![WidthRun { start_gid: first_char as u16, widths }], dw)
}
/// Returns `/MissingWidth` from the font's `/FontDescriptor`, or 1000 when the
/// descriptor or the entry is absent or not an integer.
fn missing_width_from_descriptor(fd: &Dictionary, doc: &lopdf::Document) -> u32 {
    const FALLBACK: u32 = 1000;

    let Ok(desc_obj) = fd.get(b"FontDescriptor") else {
        return FALLBACK;
    };
    let Some(desc) = resolve_dict(doc, desc_obj) else {
        return FALLBACK;
    };
    match desc.get(b"MissingWidth").ok().and_then(|o| o.as_i64().ok()) {
        Some(n) => n as u32,
        None => FALLBACK,
    }
}
/// Derives a code → character map from a simple font's `/Encoding` entry.
///
/// * no `/Encoding`, or one that cannot be resolved: standard encoding;
/// * a name: the corresponding predefined encoding;
/// * a dictionary: its `/BaseEncoding` (standard encoding if absent) with the
///   `/Differences` array applied on top.
fn build_encoding_map(fd: &Dictionary, doc: &lopdf::Document) -> BTreeMap<u16, char> {
    let Ok(enc_obj) = fd.get(b"Encoding") else {
        return encoding_table_to_btree(&STANDARD_ENCODING);
    };
    if let Object::Name(name) = enc_obj {
        return encoding_name_to_btree(name);
    }
    let Some(enc_dict) = resolve_dict(doc, enc_obj) else {
        return encoding_table_to_btree(&STANDARD_ENCODING);
    };

    let base = match enc_dict.get(b"BaseEncoding") {
        Ok(Object::Name(n)) => encoding_name_to_btree(n),
        _ => encoding_table_to_btree(&STANDARD_ENCODING),
    };
    apply_differences(enc_dict, base)
}
/// Maps a predefined encoding name to its code table.
///
/// Any name other than `WinAnsiEncoding` or `MacRomanEncoding` (including
/// `StandardEncoding` itself) resolves to the standard encoding.
fn encoding_name_to_btree(name: &[u8]) -> BTreeMap<u16, char> {
    let table: &[Option<char>; 256] = match name {
        b"WinAnsiEncoding" => &WIN_ANSI_ENCODING,
        b"MacRomanEncoding" => &MAC_ROMAN_ENCODING,
        _ => &STANDARD_ENCODING,
    };
    encoding_table_to_btree(table)
}
/// Converts a 256-slot encoding table into a sparse map containing only the
/// populated codes.
fn encoding_table_to_btree(table: &[Option<char>; 256]) -> BTreeMap<u16, char> {
    let mut map = BTreeMap::new();
    for (code, slot) in table.iter().enumerate() {
        if let Some(ch) = *slot {
            map.insert(code as u16, ch);
        }
    }
    map
}
/// Applies an encoding dictionary's `/Differences` array on top of `map`.
///
/// An integer entry sets the next code to assign; each following name is
/// assigned to that code, which then advances by one (saturating). Names with
/// no known character still consume a code. Without a `/Differences` array
/// the map is returned unchanged.
fn apply_differences(
    enc_dict: &Dictionary,
    mut map: BTreeMap<u16, char>,
) -> BTreeMap<u16, char> {
    if let Ok(Object::Array(diffs)) = enc_dict.get(b"Differences") {
        let mut next_code: u16 = 0;
        for entry in diffs {
            if let Object::Integer(n) = entry {
                next_code = *n as u16;
            } else if let Object::Name(glyph_name) = entry {
                if let Some(ch) = glyph_name_to_char(glyph_name) {
                    map.insert(next_code, ch);
                }
                next_code = next_code.saturating_add(1);
            }
        }
    }
    map
}
/// Code → character table used for `WinAnsiEncoding`, indexed by byte value.
/// `None` marks codes with no character assigned.
#[rustfmt::skip]
const WIN_ANSI_ENCODING: [Option<char>; 256] = [
// 0x00-0x1F: unassigned.
None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None,
None, None, None, None, None, None, None, None,
// 0x20-0x7E: printable ASCII.
Some(' '), Some('!'), Some('"'), Some('#'),
Some('$'), Some('%'), Some('&'), Some('\''),
Some('('), Some(')'), Some('*'), Some('+'),
Some(','), Some('-'), Some('.'), Some('/'),
Some('0'), Some('1'), Some('2'), Some('3'),
Some('4'), Some('5'), Some('6'), Some('7'),
Some('8'), Some('9'), Some(':'), Some(';'),
Some('<'), Some('='), Some('>'), Some('?'),
Some('@'), Some('A'), Some('B'), Some('C'),
Some('D'), Some('E'), Some('F'), Some('G'),
Some('H'), Some('I'), Some('J'), Some('K'),
Some('L'), Some('M'), Some('N'), Some('O'),
Some('P'), Some('Q'), Some('R'), Some('S'),
Some('T'), Some('U'), Some('V'), Some('W'),
Some('X'), Some('Y'), Some('Z'), Some('['),
Some('\\'), Some(']'), Some('^'), Some('_'),
Some('`'), Some('a'), Some('b'), Some('c'),
Some('d'), Some('e'), Some('f'), Some('g'),
Some('h'), Some('i'), Some('j'), Some('k'),
Some('l'), Some('m'), Some('n'), Some('o'),
Some('p'), Some('q'), Some('r'), Some('s'),
Some('t'), Some('u'), Some('v'), Some('w'),
Some('x'), Some('y'), Some('z'), Some('{'),
// This row holds 0x7C-0x83 (eight entries): 0x7F is unassigned, 0x80 is the euro sign.
Some('|'), Some('}'), Some('~'), None, Some('€'), None, Some('‚'), Some('ƒ'),
Some('„'), Some('…'), Some('†'), Some('‡'),
Some('ˆ'), Some('‰'), Some('Š'), Some('‹'),
Some('Œ'), None, Some('Ž'), None,
None, Some('\u{2018}'), Some('\u{2019}'), Some('\u{201C}'),
Some('\u{201D}'), Some('•'), Some('–'), Some('—'),
Some('˜'), Some('™'), Some('š'), Some('›'),
Some('œ'), None, Some('ž'), Some('Ÿ'),
// 0xA0-0xFF.
Some('\u{00A0}'), Some('¡'), Some('¢'), Some('£'),
Some('¤'), Some('¥'), Some('¦'), Some('§'),
Some('¨'), Some('©'), Some('ª'), Some('«'),
// This row holds 0xAC-0xB3 (eight entries).
Some('¬'), Some('-'), Some('®'), Some('¯'), Some('°'), Some('±'), Some('²'), Some('³'),
Some('´'), Some('µ'), Some('¶'), Some('·'),
Some('¸'), Some('¹'), Some('º'), Some('»'),
Some('¼'), Some('½'), Some('¾'), Some('¿'),
Some('À'), Some('Á'), Some('Â'), Some('Ã'),
Some('Ä'), Some('Å'), Some('Æ'), Some('Ç'),
Some('È'), Some('É'), Some('Ê'), Some('Ë'),
Some('Ì'), Some('Í'), Some('Î'), Some('Ï'),
Some('Ð'), Some('Ñ'), Some('Ò'), Some('Ó'),
Some('Ô'), Some('Õ'), Some('Ö'), Some('×'),
Some('Ø'), Some('Ù'), Some('Ú'), Some('Û'),
Some('Ü'), Some('Ý'), Some('Þ'), Some('ß'),
Some('à'), Some('á'), Some('â'), Some('ã'),
Some('ä'), Some('å'), Some('æ'), Some('ç'),
Some('è'), Some('é'), Some('ê'), Some('ë'),
Some('ì'), Some('í'), Some('î'), Some('ï'),
Some('ð'), Some('ñ'), Some('ò'), Some('ó'),
Some('ô'), Some('õ'), Some('ö'), Some('÷'),
Some('ø'), Some('ù'), Some('ú'), Some('û'),
Some('ü'), Some('ý'), Some('þ'), Some('ÿ'),
];
/// Code → character table used for `MacRomanEncoding`, indexed by byte value.
/// `None` marks codes with no character assigned.
///
/// Codes 0xD2 and 0xD3 are the curly double quotes U+201C / U+201D. They are
/// written as escapes so they cannot be confused with the ASCII quote at 0x22.
#[rustfmt::skip]
const MAC_ROMAN_ENCODING: [Option<char>; 256] = [
    // 0x00-0x1F: unassigned.
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    // 0x20-0x7F: printable ASCII, 0x7F unassigned.
    Some(' '), Some('!'), Some('"'), Some('#'),
    Some('$'), Some('%'), Some('&'), Some('\''),
    Some('('), Some(')'), Some('*'), Some('+'),
    Some(','), Some('-'), Some('.'), Some('/'),
    Some('0'), Some('1'), Some('2'), Some('3'),
    Some('4'), Some('5'), Some('6'), Some('7'),
    Some('8'), Some('9'), Some(':'), Some(';'),
    Some('<'), Some('='), Some('>'), Some('?'),
    Some('@'), Some('A'), Some('B'), Some('C'),
    Some('D'), Some('E'), Some('F'), Some('G'),
    Some('H'), Some('I'), Some('J'), Some('K'),
    Some('L'), Some('M'), Some('N'), Some('O'),
    Some('P'), Some('Q'), Some('R'), Some('S'),
    Some('T'), Some('U'), Some('V'), Some('W'),
    Some('X'), Some('Y'), Some('Z'), Some('['),
    Some('\\'), Some(']'), Some('^'), Some('_'),
    Some('`'), Some('a'), Some('b'), Some('c'),
    Some('d'), Some('e'), Some('f'), Some('g'),
    Some('h'), Some('i'), Some('j'), Some('k'),
    Some('l'), Some('m'), Some('n'), Some('o'),
    Some('p'), Some('q'), Some('r'), Some('s'),
    Some('t'), Some('u'), Some('v'), Some('w'),
    Some('x'), Some('y'), Some('z'), Some('{'),
    Some('|'), Some('}'), Some('~'), None,
    // 0x80-0xBB.
    Some('Ä'), Some('Å'), Some('Ç'), Some('É'),
    Some('Ñ'), Some('Ö'), Some('Ü'), Some('á'),
    Some('à'), Some('â'), Some('ä'), Some('ã'),
    Some('å'), Some('ç'), Some('é'), Some('è'),
    Some('ê'), Some('ë'), Some('í'), Some('ì'),
    Some('î'), Some('ï'), Some('ñ'), Some('ó'),
    Some('ò'), Some('ô'), Some('ö'), Some('õ'),
    Some('ú'), Some('ù'), Some('û'), Some('ü'),
    Some('†'), Some('°'), Some('¢'), Some('£'),
    Some('§'), Some('•'), Some('¶'), Some('ß'),
    Some('®'), Some('©'), Some('™'), Some('´'),
    Some('¨'), Some('≠'), Some('Æ'), Some('Ø'),
    Some('∞'), Some('±'), Some('≤'), Some('≥'),
    Some('¥'), Some('µ'), Some('∂'), Some('∑'),
    Some('∏'), Some('π'), Some('∫'), Some('ª'),
    // 0xBC-0xC3 (eight entries).
    Some('º'), Some('\u{2126}'), Some('æ'), Some('ø'), Some('¿'), Some('¡'), Some('¬'), Some('√'),
    // 0xC4-0xC7.
    Some('ƒ'), Some('≈'), Some('∆'), Some('«'),
    // 0xC8-0xCF (eight entries).
    Some('»'), Some('…'), Some('\u{00A0}'), Some('À'), Some('Ã'), Some('Õ'), Some('Œ'), Some('œ'),
    // 0xD0-0xD3: en dash, em dash, left and right curly double quotes.
    Some('–'), Some('—'), Some('\u{201C}'), Some('\u{201D}'),
    // 0xD4-0xE3 (sixteen entries).
    Some('\u{2018}'), Some('\u{2019}'), Some('÷'), Some('\u{25CA}'), Some('ÿ'), Some('Ÿ'), Some('⁄'), Some('¤'), Some('‹'), Some('›'), Some('\u{FB01}'), Some('\u{FB02}'), Some('‡'), Some('·'), Some('‚'), Some('„'),
    // 0xE4-0xEF.
    Some('‰'), Some('Â'), Some('Ê'), Some('Á'),
    Some('Ë'), Some('È'), Some('Í'), Some('Î'),
    Some('Ï'), Some('Ì'), Some('Ó'), Some('Ô'),
    // 0xF0-0xF7 (eight entries).
    Some('\u{F8FF}'), Some('Ò'), Some('Ú'), Some('Û'), Some('Ù'), Some('ı'), Some('ˆ'), Some('˜'),
    // 0xF8-0xFF.
    Some('¯'), Some('˘'), Some('˙'), Some('˚'),
    Some('¸'), Some('˝'), Some('˛'), Some('ˇ'),
];
/// Code → character table used for `StandardEncoding`, indexed by byte value.
/// `None` marks codes with no character assigned.
///
/// Codes 0xAA and 0xBA are the curly double quotes U+201C / U+201D. They are
/// written as escapes so they cannot be confused with the ASCII quote at 0x22.
#[rustfmt::skip]
const STANDARD_ENCODING: [Option<char>; 256] = [
    // 0x00-0x1F: unassigned.
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    // 0x20-0x23.
    Some(' '), Some('!'), Some('"'), Some('#'),
    // 0x24-0x2B (eight entries); 0x27 is the right single quote.
    Some('$'), Some('%'), Some('&'), Some('\u{2019}'), Some('('), Some(')'), Some('*'), Some('+'),
    Some(','), Some('-'), Some('.'), Some('/'),
    Some('0'), Some('1'), Some('2'), Some('3'),
    Some('4'), Some('5'), Some('6'), Some('7'),
    Some('8'), Some('9'), Some(':'), Some(';'),
    Some('<'), Some('='), Some('>'), Some('?'),
    Some('@'), Some('A'), Some('B'), Some('C'),
    Some('D'), Some('E'), Some('F'), Some('G'),
    Some('H'), Some('I'), Some('J'), Some('K'),
    Some('L'), Some('M'), Some('N'), Some('O'),
    Some('P'), Some('Q'), Some('R'), Some('S'),
    Some('T'), Some('U'), Some('V'), Some('W'),
    Some('X'), Some('Y'), Some('Z'), Some('['),
    Some('\\'), Some(']'), Some('^'), Some('_'),
    // 0x60 is the left single quote.
    Some('\u{2018}'), Some('a'), Some('b'), Some('c'),
    Some('d'), Some('e'), Some('f'), Some('g'),
    Some('h'), Some('i'), Some('j'), Some('k'),
    Some('l'), Some('m'), Some('n'), Some('o'),
    Some('p'), Some('q'), Some('r'), Some('s'),
    Some('t'), Some('u'), Some('v'), Some('w'),
    Some('x'), Some('y'), Some('z'), Some('{'),
    Some('|'), Some('}'), Some('~'), None,
    // 0x80-0x9F: unassigned.
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    None, None, None, None, None, None, None, None,
    // 0xA0: unassigned.
    None,
    // 0xA1-0xAC (twelve entries); 0xA9 is the ASCII apostrophe, 0xAA the left curly double quote.
    Some('¡'), Some('¢'), Some('£'), Some('⁄'), Some('¥'), Some('ƒ'), Some('§'), Some('¤'), Some('\''), Some('\u{201C}'), Some('«'), Some('‹'),
    // 0xAD-0xB3 (seven entries).
    Some('›'), Some('\u{FB01}'), Some('\u{FB02}'), None, Some('–'), Some('†'), Some('‡'),
    // 0xB4-0xB7.
    Some('·'), None, Some('¶'), Some('•'),
    // 0xB8-0xBB; 0xBA is the right curly double quote.
    Some('‚'), Some('„'), Some('\u{201D}'), Some('»'),
    Some('…'), Some('‰'), None, Some('¿'),
    // 0xC0-0xCF: accents.
    None, Some('`'), Some('´'), Some('ˆ'),
    Some('˜'), Some('¯'), Some('˘'), Some('˙'),
    Some('¨'), None, Some('˚'), Some('¸'),
    None, Some('˝'), Some('˛'), Some('ˇ'),
    // 0xD0-0xDF.
    Some('—'), None, None, None,
    None, None, None, None,
    None, None, None, None,
    None, None, None, None,
    // 0xE0-0xFF.
    None, Some('Æ'), None, Some('ª'),
    None, None, None, None,
    Some('Ł'), Some('Ø'), Some('Œ'), Some('º'),
    None, None, None, None,
    None, Some('æ'), None, None,
    None, Some('ı'), None, None,
    Some('ł'), Some('ø'), Some('œ'), Some('ß'),
    None, None, None, None,
];
/// Looks up the character for a glyph name in `AGL_TABLE`.
///
/// The lookup is a binary search, so it relies on the table being sorted by
/// name. Names that are not valid UTF-8 or not in the table yield `None`.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let wanted = std::str::from_utf8(name).ok()?;
    match AGL_TABLE.binary_search_by(|entry| entry.0.cmp(wanted)) {
        Ok(pos) => Some(AGL_TABLE[pos].1),
        Err(_) => None,
    }
}
/// Glyph name → character pairs used to interpret `/Differences` arrays.
///
/// The table MUST stay sorted by name in byte order because
/// `glyph_name_to_char` binary-searches it. The curly double quotes are
/// written as escapes so they cannot be confused with the ASCII `quotedbl`.
static AGL_TABLE: &[(&str, char)] = &[
    ("A", 'A'), ("AE", 'Æ'), ("Aacute", 'Á'), ("Acircumflex", 'Â'),
    ("Adieresis", 'Ä'), ("Agrave", 'À'), ("Aring", 'Å'), ("Atilde", 'Ã'),
    ("B", 'B'), ("C", 'C'), ("Ccedilla", 'Ç'), ("D", 'D'), ("Delta", '∆'),
    ("E", 'E'), ("Eacute", 'É'), ("Ecircumflex", 'Ê'), ("Edieresis", 'Ë'),
    ("Egrave", 'È'), ("Eth", 'Ð'), ("Euro", '€'),
    ("F", 'F'), ("G", 'G'), ("H", 'H'),
    ("I", 'I'), ("Iacute", 'Í'), ("Icircumflex", 'Î'), ("Idieresis", 'Ï'),
    ("Igrave", 'Ì'), ("J", 'J'), ("K", 'K'), ("L", 'L'), ("Lslash", 'Ł'),
    ("M", 'M'), ("N", 'N'), ("Ntilde", 'Ñ'),
    ("O", 'O'), ("OE", 'Œ'), ("Oacute", 'Ó'), ("Ocircumflex", 'Ô'),
    ("Odieresis", 'Ö'), ("Ograve", 'Ò'), ("Omega", '\u{2126}'),
    ("Oslash", 'Ø'), ("Otilde", 'Õ'),
    ("P", 'P'), ("Q", 'Q'), ("R", 'R'),
    ("S", 'S'), ("Scaron", 'Š'), ("T", 'T'), ("Thorn", 'Þ'),
    ("U", 'U'), ("Uacute", 'Ú'), ("Ucircumflex", 'Û'), ("Udieresis", 'Ü'),
    ("Ugrave", 'Ù'), ("V", 'V'), ("W", 'W'), ("X", 'X'),
    ("Y", 'Y'), ("Yacute", 'Ý'), ("Ydieresis", 'Ÿ'),
    ("Z", 'Z'), ("Zcaron", 'Ž'),
    ("a", 'a'), ("aacute", 'á'), ("acircumflex", 'â'), ("adieresis", 'ä'),
    ("ae", 'æ'), ("agrave", 'à'), ("ampersand", '&'), ("approxequal", '≈'),
    ("aring", 'å'), ("asciicircum", '^'), ("asciitilde", '~'),
    ("asterisk", '*'), ("at", '@'), ("atilde", 'ã'),
    ("b", 'b'), ("backslash", '\\'), ("bar", '|'), ("braceleft", '{'),
    ("braceright", '}'), ("bracketleft", '['), ("bracketright", ']'),
    ("breve", '˘'), ("brokenbar", '¦'), ("bullet", '•'),
    ("c", 'c'), ("caron", 'ˇ'), ("ccedilla", 'ç'), ("cedilla", '¸'),
    ("cent", '¢'), ("circumflex", 'ˆ'), ("colon", ':'), ("comma", ','),
    ("copyright", '©'), ("currency", '¤'),
    ("d", 'd'), ("dagger", '†'), ("daggerdbl", '‡'), ("degree", '°'),
    ("dieresis", '¨'), ("divide", '÷'), ("dollar", '$'),
    ("dotaccent", '˙'), ("dotlessi", 'ı'),
    ("e", 'e'), ("eacute", 'é'), ("ecircumflex", 'ê'), ("edieresis", 'ë'),
    ("egrave", 'è'), ("eight", '8'), ("ellipsis", '…'), ("emdash", '—'),
    ("endash", '–'), ("equal", '='), ("eth", 'ð'), ("exclam", '!'),
    ("exclamdown", '¡'),
    ("f", 'f'), ("fi", '\u{FB01}'), ("five", '5'), ("fl", '\u{FB02}'),
    ("florin", 'ƒ'), ("four", '4'), ("fraction", '⁄'),
    ("g", 'g'), ("germandbls", 'ß'), ("grave", '`'), ("greater", '>'),
    ("greaterequal", '≥'), ("guillemotleft", '«'), ("guillemotright", '»'),
    ("guilsinglleft", '‹'), ("guilsinglright", '›'),
    ("h", 'h'), ("hungarumlaut", '˝'), ("hyphen", '-'),
    ("i", 'i'), ("iacute", 'í'), ("icircumflex", 'î'), ("idieresis", 'ï'),
    ("igrave", 'ì'), ("infinity", '∞'), ("integral", '∫'),
    ("j", 'j'), ("k", 'k'),
    ("l", 'l'), ("less", '<'), ("lessequal", '≤'), ("logicalnot", '¬'),
    ("lozenge", '◊'), ("lslash", 'ł'),
    ("m", 'm'), ("macron", '¯'), ("mu", 'µ'), ("multiply", '×'),
    ("n", 'n'), ("nine", '9'), ("notequal", '≠'), ("ntilde", 'ñ'),
    ("numbersign", '#'),
    ("o", 'o'), ("oacute", 'ó'), ("ocircumflex", 'ô'), ("odieresis", 'ö'),
    ("oe", 'œ'), ("ogonek", '˛'), ("ograve", 'ò'), ("one", '1'),
    ("onehalf", '½'), ("onequarter", '¼'), ("onesuperior", '¹'),
    ("ordfeminine", 'ª'), ("ordmasculine", 'º'), ("oslash", 'ø'),
    ("otilde", 'õ'),
    ("p", 'p'), ("paragraph", '¶'), ("parenleft", '('), ("parenright", ')'),
    ("partialdiff", '∂'), ("percent", '%'), ("period", '.'),
    ("periodcentered", '·'), ("perthousand", '‰'), ("pi", 'π'),
    ("plus", '+'), ("plusminus", '±'), ("product", '∏'),
    ("q", 'q'), ("question", '?'), ("questiondown", '¿'),
    ("quotedbl", '"'), ("quotedblbase", '„'), ("quotedblleft", '\u{201C}'),
    ("quotedblright", '\u{201D}'), ("quoteleft", '\u{2018}'),
    ("quoteright", '\u{2019}'), ("quotesinglbase", '‚'),
    ("quotesingle", '\''),
    ("r", 'r'), ("radical", '√'), ("registered", '®'), ("ring", '˚'),
    ("s", 's'), ("scaron", 'š'), ("section", '§'), ("semicolon", ';'),
    ("seven", '7'), ("six", '6'), ("slash", '/'), ("space", ' '),
    ("sterling", '£'), ("summation", '∑'),
    ("t", 't'), ("thorn", 'þ'), ("three", '3'), ("threequarters", '¾'),
    ("threesuperior", '³'), ("tilde", '˜'), ("trademark", '™'),
    ("two", '2'), ("twosuperior", '²'),
    ("u", 'u'), ("uacute", 'ú'), ("ucircumflex", 'û'), ("udieresis", 'ü'),
    ("ugrave", 'ù'), ("underscore", '_'),
    ("v", 'v'), ("w", 'w'), ("x", 'x'),
    ("y", 'y'), ("yacute", 'ý'), ("ydieresis", 'ÿ'), ("yen", '¥'),
    ("z", 'z'), ("zcaron", 'ž'), ("zero", '0'),
];
/// Parses the `bfchar` / `bfrange` sections of a ToUnicode CMap.
///
/// The CMap is processed line by line: a line ending in `beginbfchar` or
/// `beginbfrange` opens a section, `endbfchar` / `endbfrange` closes it, and
/// every line inside a section is handed to the matching line parser.
///
/// The bytes are decoded lossily. A stray non-UTF-8 byte (for example in a
/// comment) therefore only affects the token it is in instead of discarding
/// the whole CMap.
fn parse_to_unicode_cmap(bytes: &[u8]) -> BTreeMap<u16, char> {
    enum Section {
        None,
        BfChar,
        BfRange,
    }

    let text = String::from_utf8_lossy(bytes);
    let mut map = BTreeMap::new();
    let mut section = Section::None;

    for line in text.lines().map(str::trim) {
        if line.ends_with("beginbfchar") {
            section = Section::BfChar;
        } else if line.ends_with("beginbfrange") {
            section = Section::BfRange;
        } else if line == "endbfchar" || line == "endbfrange" {
            section = Section::None;
        } else {
            match section {
                Section::BfChar => parse_bfchar_line(line, &mut map),
                Section::BfRange => parse_bfrange_line(line, &mut map),
                Section::None => {}
            }
        }
    }
    map
}
/// Parses one `<src> <dst>` line of a `bfchar` section into `map`.
///
/// The line is split at its first space. Lines whose source code is not a
/// 16-bit hex number, or whose destination cannot be decoded to a single
/// character, are ignored.
fn parse_bfchar_line(line: &str, map: &mut BTreeMap<u16, char>) {
    let Some((src_tok, dst_tok)) = line.split_once(' ') else {
        return;
    };
    let src_hex = src_tok.trim_start_matches('<').trim_end_matches('>');
    let dst_hex = dst_tok.trim().trim_start_matches('<').trim_end_matches('>');

    let Ok(code) = u16::from_str_radix(src_hex, 16) else {
        return;
    };
    if let Some(ch) = hex_to_char(dst_hex) {
        map.insert(code, ch);
    }
}
fn parse_bfrange_line(line: &str, map: &mut BTreeMap<u16, char>) {
let mut parts = line.splitn(3, ' ');
let lo_tok = match parts.next() { Some(s) => s, None => return };
let hi_tok = match parts.next() { Some(s) => s, None => return };
let rest = match parts.next() { Some(s) => s.trim(), None => return };
let lo_hex = lo_tok.trim_start_matches('<').trim_end_matches('>');
let hi_hex = hi_tok.trim_start_matches('<').trim_end_matches('>');
let Ok(lo) = u16::from_str_radix(lo_hex, 16) else { return };
let Ok(hi) = u16::from_str_radix(hi_hex, 16) else { return };
if lo > hi { return; }
if rest.starts_with('[') {
let inner = rest.trim_start_matches('[').trim_end_matches(']');
let mut code = lo;
for tok in inner.split_whitespace() {
if code > hi { break; }
let hex = tok.trim_start_matches('<').trim_end_matches('>');
if let Some(ch) = hex_to_char(hex) {
map.insert(code, ch);
}
code = code.saturating_add(1);
}
} else {
let dst_hex = rest.trim_start_matches('<').trim_end_matches('>');
let Ok(dst_start) = u32::from_str_radix(dst_hex, 16) else { return };
for i in 0..=(hi as u32).saturating_sub(lo as u32) {
let code = lo + i as u16;
let cp = dst_start + i;
if let Some(ch) = char::from_u32(cp) {
map.insert(code, ch);
}
}
}
}
/// Decodes the hex digits of a CMap destination into a single character.
///
/// * 1–4 digits: the value is taken as a code point;
/// * 8 digits: a UTF-16 surrogate pair is combined into its supplementary
///   character; any other value is taken as a 32-bit code point;
/// * any other length, or a value that is not a valid `char`: `None`.
fn hex_to_char(hex: &str) -> Option<char> {
    // Hex digits are ASCII. Rejecting other input here also guarantees that
    // splitting the 8-digit form below cannot land inside a multi-byte
    // character and panic.
    if !hex.is_ascii() {
        return None;
    }
    match hex.len() {
        1..=4 => char::from_u32(u32::from_str_radix(hex, 16).ok()?),
        8 => {
            let (hi_hex, lo_hex) = hex.split_at(4);
            let hi = u16::from_str_radix(hi_hex, 16).ok()?;
            let lo = u16::from_str_radix(lo_hex, 16).ok()?;
            let cp = if (0xD800..=0xDBFF).contains(&hi) && (0xDC00..=0xDFFF).contains(&lo) {
                0x10000u32 + ((u32::from(hi) - 0xD800) << 10) + (u32::from(lo) - 0xDC00)
            } else {
                u32::from_str_radix(hex, 16).ok()?
            };
            char::from_u32(cp)
        }
        _ => None,
    }
}
/// Parses a CID font `/W` array into explicit width runs.
///
/// Two entry forms are recognised:
/// * `first [w0 w1 ...]` — one width per consecutive code starting at `first`;
/// * `first last w` — the same width for every code in `first..=last`.
///
/// Widths may be integers or reals: integers are clamped to the `u32` range
/// and reals are rounded. A list is cut at its first non-numeric element so
/// the widths that were read keep their positions; codes after the cut fall
/// back to the font's default width. Elements that fit neither form are
/// skipped.
fn parse_w_array(arr: &[Object]) -> Vec<WidthRun> {
    /// Converts a numeric PDF object to a width, or `None` if it is not numeric.
    fn width_of(obj: &Object) -> Option<u32> {
        match obj {
            Object::Integer(n) => Some((*n).max(0).min(i64::from(u32::MAX)) as u32),
            // Float-to-int `as` saturates, so negative and NaN values become 0.
            Object::Real(r) => Some((*r as f64).round() as u32),
            _ => None,
        }
    }

    let mut runs = Vec::new();
    let mut i = 0;
    while i < arr.len() {
        let Ok(first) = arr[i].as_i64() else {
            i += 1;
            continue;
        };
        let start_gid = first as u16;
        i += 1;
        let Some(next) = arr.get(i) else { break };

        match next {
            Object::Array(widths_arr) => {
                let widths: Vec<u32> = widths_arr.iter().map_while(width_of).collect();
                runs.push(WidthRun { start_gid, widths });
                i += 1;
            }
            Object::Integer(last) => {
                let end_gid = *last as u16;
                i += 1;
                let Some(width_obj) = arr.get(i) else { break };
                i += 1;
                let Some(w) = width_of(width_obj) else { continue };
                let count = (end_gid as usize).saturating_sub(start_gid as usize) + 1;
                runs.push(WidthRun { start_gid, widths: vec![w; count] });
            }
            _ => {
                i += 1;
            }
        }
    }
    runs
}
/// Lexical token produced by `tokenize` from a content stream.
#[derive(Debug)]
enum Token {
/// Bytes of a hex string (`<...>`), already decoded from hex.
HexStr(Vec<u8>),
/// Bytes of a literal string (`(...)`) with escape sequences resolved.
LitStr(Vec<u8>),
/// A name, stored without its leading `/`.
Name(Vec<u8>),
/// Any bare word that parses as an `f32`.
Number(f32),
/// Any other bare word; operators end up here.
Keyword(Vec<u8>),
/// The tokens found between `[` and `]`.
Array(Vec<Token>),
}
/// Splits a content stream into tokens.
///
/// Whitespace and `%` comments are skipped, `<< ... >>` dictionaries are
/// skipped as a whole, and stray delimiters are dropped. Bare words become
/// `Number` when they parse as `f32` and `Keyword` otherwise.
fn tokenize(input: &[u8]) -> Vec<Token> {
    // End of the run of regular (non-whitespace, non-delimiter) bytes at `from`.
    let word_end = |from: usize| -> usize {
        from + input[from..]
            .iter()
            .take_while(|&&c| !is_pdf_whitespace(c) && !is_pdf_delimiter(c))
            .count()
    };

    let mut tokens = Vec::new();
    let mut pos = 0;
    while let Some(&byte) = input.get(pos) {
        match byte {
            b if is_pdf_whitespace(b) => pos += 1,
            b'%' => {
                // Skip the comment up to, but not including, the end of line.
                pos += input[pos..]
                    .iter()
                    .take_while(|&&c| c != b'\r' && c != b'\n')
                    .count();
            }
            b'<' if input.get(pos + 1) == Some(&b'<') => {
                // Skip the dictionary body up to and including the closing `>>`.
                pos += 2;
                while pos + 1 < input.len() && !(input[pos] == b'>' && input[pos + 1] == b'>') {
                    pos += 1;
                }
                if pos + 1 < input.len() {
                    pos += 2;
                }
            }
            b'<' => {
                let start = pos + 1;
                let end = input[start..]
                    .iter()
                    .position(|&c| c == b'>')
                    .map_or(input.len(), |off| start + off);
                tokens.push(Token::HexStr(decode_hex_bytes(&input[start..end])));
                // Step over the closing `>` when there is one.
                pos = if end < input.len() { end + 1 } else { end };
            }
            b'/' => {
                let start = pos + 1;
                let end = word_end(start);
                tokens.push(Token::Name(input[start..end].to_vec()));
                pos = end;
            }
            b'[' => {
                let (items, consumed) = parse_array_tokens(&input[pos + 1..]);
                pos += 1 + consumed;
                tokens.push(Token::Array(items));
            }
            b']' => pos += 1,
            b'(' => {
                let (bytes, next) = parse_literal_string(input, pos + 1);
                pos = next;
                tokens.push(Token::LitStr(bytes));
            }
            _ => {
                let end = word_end(pos);
                if end == pos {
                    // A delimiter with no special handling: drop it.
                    pos += 1;
                    continue;
                }
                let word = &input[pos..end];
                pos = end;
                let number = std::str::from_utf8(word)
                    .ok()
                    .and_then(|s| s.parse::<f32>().ok());
                tokens.push(match number {
                    Some(n) => Token::Number(n),
                    None => Token::Keyword(word.to_vec()),
                });
            }
        }
    }
    tokens
}
/// Tokenizes the inside of an array, starting just after the opening `[`.
///
/// Returns the collected tokens and the number of bytes consumed, including
/// the closing `]` when one is found. Only hex strings, literal strings and
/// numbers are kept; any other word or delimiter is silently dropped.
fn parse_array_tokens(input: &[u8]) -> (Vec<Token>, usize) {
    let mut items = Vec::new();
    let mut pos = 0;
    while let Some(&byte) = input.get(pos) {
        match byte {
            b']' => return (items, pos + 1),
            b if is_pdf_whitespace(b) => pos += 1,
            // A single `<` opens a hex string; `<<` falls through to the
            // catch-all arm and is dropped one byte at a time.
            b'<' if input.get(pos + 1) != Some(&b'<') => {
                let start = pos + 1;
                let end = input[start..]
                    .iter()
                    .position(|&c| c == b'>')
                    .map_or(input.len(), |off| start + off);
                items.push(Token::HexStr(decode_hex_bytes(&input[start..end])));
                pos = if end < input.len() { end + 1 } else { end };
            }
            b'(' => {
                let (bytes, next) = parse_literal_string(input, pos + 1);
                pos = next;
                items.push(Token::LitStr(bytes));
            }
            _ => {
                let end = pos
                    + input[pos..]
                        .iter()
                        .take_while(|&&c| !is_pdf_whitespace(c) && !is_pdf_delimiter(c))
                        .count();
                if end == pos {
                    pos += 1;
                    continue;
                }
                let number = std::str::from_utf8(&input[pos..end])
                    .ok()
                    .and_then(|s| s.parse::<f32>().ok());
                if let Some(n) = number {
                    items.push(Token::Number(n));
                }
                pos = end;
            }
        }
    }
    (items, pos)
}
/// Decodes a PDF literal string whose opening `(` has already been consumed.
///
/// `i` is the index of the first byte after the opening parenthesis. Returns
/// the decoded bytes and the index just past the matching `)`, or the end of
/// the input if the string is unterminated.
///
/// Balanced unescaped parentheses are kept in the output. Escapes handled:
/// `\n`, `\r`, `\t`, `\b`, `\f`, `\\`, `\(`, `\)`, one to three octal digits
/// (only the low eight bits are kept), and a backslash before a line break,
/// which is a line continuation. For any other escaped byte the backslash is
/// dropped and the byte kept.
fn parse_literal_string(input: &[u8], mut i: usize) -> (Vec<u8>, usize) {
    let mut depth = 1i32;
    let mut out = Vec::new();
    while i < input.len() && depth > 0 {
        match input[i] {
            b'\\' => {
                i += 1;
                let Some(&esc) = input.get(i) else { break };
                // In every arm `i` ends on the last byte of the escape
                // sequence; the shared `i += 1` below steps past it.
                match esc {
                    b'n' => out.push(b'\n'),
                    b'r' => out.push(b'\r'),
                    b't' => out.push(b'\t'),
                    b'b' => out.push(0x08),
                    b'f' => out.push(0x0C),
                    b'\r' => {
                        // Line continuation; treat CRLF as one line break.
                        if input.get(i + 1) == Some(&b'\n') {
                            i += 1;
                        }
                    }
                    b'\n' => {}
                    b'0'..=b'7' => {
                        let mut val = u16::from(esc - b'0');
                        let mut digits = 1;
                        while digits < 3 {
                            let Some(&d) = input.get(i + 1) else { break };
                            if !(b'0'..=b'7').contains(&d) {
                                break;
                            }
                            val = val * 8 + u16::from(d - b'0');
                            i += 1;
                            digits += 1;
                        }
                        out.push((val & 0xFF) as u8);
                    }
                    // Covers `\\`, `\(`, `\)` and unknown escapes alike.
                    other => out.push(other),
                }
                i += 1;
            }
            b'(' => {
                depth += 1;
                out.push(b'(');
                i += 1;
            }
            b')' => {
                depth -= 1;
                if depth > 0 {
                    out.push(b')');
                }
                i += 1;
            }
            b => {
                out.push(b);
                i += 1;
            }
        }
    }
    (out, i)
}
/// Decodes the body of a hex string (the bytes between `<` and `>`).
///
/// Whitespace is ignored and an odd number of digits is padded with a
/// trailing `0`. Digit pairs that are not valid hex are dropped.
fn decode_hex_bytes(hex: &[u8]) -> Vec<u8> {
    let mut digits: Vec<u8> = Vec::with_capacity(hex.len() + 1);
    for &b in hex {
        if !is_pdf_whitespace(b) {
            digits.push(b);
        }
    }
    if digits.len() % 2 == 1 {
        digits.push(b'0');
    }

    let mut decoded = Vec::with_capacity(digits.len() / 2);
    for pair in digits.chunks(2) {
        let byte = std::str::from_utf8(pair)
            .ok()
            .and_then(|s| u8::from_str_radix(s, 16).ok());
        if let Some(byte) = byte {
            decoded.push(byte);
        }
    }
    decoded
}
/// Returns `true` for the six PDF white-space bytes: NUL, tab, line feed,
/// form feed, carriage return and space.
fn is_pdf_whitespace(b: u8) -> bool {
    const WHITESPACE: [u8; 6] = [0x00, b'\t', b'\n', 0x0C, b'\r', b' '];
    WHITESPACE.contains(&b)
}
/// Returns `true` for the PDF delimiter bytes `( ) < > [ ] { } / %`.
fn is_pdf_delimiter(b: u8) -> bool {
    b"()<>[]{}/%".contains(&b)
}
/// Interprets one decoded content stream and appends a [`TextFragment`] to
/// `out` for every string shown with `Tj` or `TJ`.
///
/// Operands are collected on a stack until an operator keyword arrives; the
/// stack is cleared after every operator. Operators interpreted:
///
/// * `BT` / `ET` — open and close a text object; `BT` resets the position;
/// * `Tf` — selects font and size. It is honoured outside text objects as
///   well, because the font may be set before `BT`;
/// * `Td` / `TD` — move to the start of the next line, offset from the start
///   of the *current line*, not from wherever the previous glyphs ended;
/// * `Tm` — only the translation operands are used, as the new line start;
/// * `Tj` — shows one string and advances the position by its width;
/// * `TJ` — shows an array of strings; a number `n` in the array moves the
///   position by `-n / 1000 * font_size`.
///
/// All other operators only discard their operands.
fn parse_content_stream(
    bytes: &[u8],
    fonts: &HashMap<Vec<u8>, FontInfo>,
    out: &mut Vec<TextFragment>,
) {
    let mut stack: Vec<Token> = Vec::new();
    let mut in_bt = false;
    let mut font_name: Vec<u8> = Vec::new();
    let mut font_size: f32 = 12.0;
    // Current pen position.
    let mut x: f32 = 0.0;
    let mut y: f32 = 0.0;
    // Start of the current line; `Td`/`TD` offsets are relative to this.
    let mut line_x: f32 = 0.0;
    let mut line_y: f32 = 0.0;

    for token in tokenize(bytes) {
        let kw = match token {
            Token::Keyword(kw) => kw,
            operand => {
                stack.push(operand);
                continue;
            }
        };

        match kw.as_slice() {
            b"BT" => {
                in_bt = true;
                x = 0.0;
                y = 0.0;
                line_x = 0.0;
                line_y = 0.0;
            }
            b"ET" => {
                in_bt = false;
            }
            b"Tf" => {
                let size = stack.pop();
                let name = stack.pop();
                if let (Some(Token::Number(size)), Some(Token::Name(name))) = (size, name) {
                    font_name = name;
                    font_size = size;
                }
            }
            b"Td" | b"TD" if in_bt => {
                let ty = stack.pop();
                let tx = stack.pop();
                if let (Some(Token::Number(ty)), Some(Token::Number(tx))) = (ty, tx) {
                    line_x += tx;
                    line_y += ty;
                    x = line_x;
                    y = line_y;
                }
            }
            b"Tm" if in_bt => {
                // Operands are `a b c d e f`; only `e` and `f` are used.
                let f = stack.pop();
                let e = stack.pop();
                if let (Some(Token::Number(f)), Some(Token::Number(e))) = (f, e) {
                    line_x = e;
                    line_y = f;
                    x = e;
                    y = f;
                }
            }
            b"Tj" if in_bt => {
                let shown = match stack.pop() {
                    Some(Token::HexStr(b)) | Some(Token::LitStr(b)) => Some(b),
                    _ => None,
                };
                if let Some(char_bytes) = shown {
                    if let Some(frag) =
                        decode_chars_to_fragment(&char_bytes, &font_name, font_size, x, y, fonts)
                    {
                        x += frag.width;
                        out.push(frag);
                    }
                }
            }
            b"TJ" if in_bt => {
                if let Some(Token::Array(items)) = stack.pop() {
                    for item in items {
                        match item {
                            Token::HexStr(ref b) | Token::LitStr(ref b) => {
                                if let Some(frag) =
                                    decode_chars_to_fragment(b, &font_name, font_size, x, y, fonts)
                                {
                                    x += frag.width;
                                    out.push(frag);
                                }
                            }
                            Token::Number(kern) => {
                                x -= kern / 1000.0 * font_size;
                            }
                            _ => {}
                        }
                    }
                }
            }
            _ => {}
        }
        // Operands never carry over from one operator to the next.
        stack.clear();
    }
}
/// Decodes the bytes of one shown string into a [`TextFragment`] at (`x`, `y`).
///
/// The bytes are split into character codes according to the font's
/// `bytes_per_char` (big-endian pairs for two-byte fonts). Every code
/// contributes its advance width, whether or not it has a Unicode mapping, so
/// the fragment width covers unmapped glyphs too; only mapped codes contribute
/// text.
///
/// Returns `None` when the input is empty, the font is unknown, a two-byte
/// string has an odd length, or no code could be mapped to a character.
fn decode_chars_to_fragment(
    char_bytes: &[u8],
    font_name: &[u8],
    font_size: f32,
    x: f32,
    y: f32,
    fonts: &HashMap<Vec<u8>, FontInfo>,
) -> Option<TextFragment> {
    if char_bytes.is_empty() {
        return None;
    }
    let font_info = fonts.get(font_name)?;

    let mut text = String::new();
    let mut total_width = 0.0f32;
    let mut show = |code: u16| {
        total_width += font_info.advance_width(code) as f32 / 1000.0 * font_size;
        if let Some(&ch) = font_info.to_unicode.get(&code) {
            text.push(ch);
        }
    };

    if font_info.bytes_per_char == 2 {
        if char_bytes.len() % 2 != 0 {
            return None;
        }
        for pair in char_bytes.chunks_exact(2) {
            show(u16::from_be_bytes([pair[0], pair[1]]));
        }
    } else {
        for &b in char_bytes {
            show(u16::from(b));
        }
    }

    if text.is_empty() {
        return None;
    }
    Some(TextFragment { text, x, y, width: total_width, font_size })
}
// Unit tests for the CMap parser, the tokenizer, the width tables and the
// encoding / glyph-name tables defined in this file.
#[cfg(test)]
mod tests {
use super::*;
use lopdf::Object;
// A single bfchar entry inside a full CMap wrapper maps code 1 to U+65E5.
#[test]
fn parse_to_unicode_cmap_basic() {
let cmap = b"/CIDInit /ProcSet findresource begin\n\
12 dict begin\n\
begincmap\n\
1 beginbfchar\n\
<0001> <65E5>\n\
endbfchar\n\
endcmap\n\
end\nend\n";
let map = parse_to_unicode_cmap(cmap);
assert_eq!(map.get(&1u16), Some(&'日'));
}
// An eight-digit destination holding a UTF-16 surrogate pair decodes to U+20000.
#[test]
fn parse_to_unicode_cmap_surrogate() {
let cmap = b"1 beginbfchar\n<0001> <D840DC00>\nendbfchar\n";
let map = parse_to_unicode_cmap(cmap);
assert_eq!(map.get(&1u16), Some(&'\u{20000}'));
}
// `<lo> <hi> <dst>` maps consecutive codes to consecutive characters.
#[test]
fn parse_bfrange_contiguous() {
let cmap = b"1 beginbfrange\n<20> <7E> <0020>\nendbfrange\n";
let map = parse_to_unicode_cmap(cmap);
assert_eq!(map.get(&0x20), Some(&' '));
assert_eq!(map.get(&0x41), Some(&'A'));
assert_eq!(map.get(&0x7E), Some(&'~'));
}
// `<lo> <hi> [ ... ]` assigns the listed destinations one by one.
#[test]
fn parse_bfrange_explicit_array() {
let cmap = b"1 beginbfrange\n<20> <21> [<0048> <0069>]\nendbfrange\n";
let map = parse_to_unicode_cmap(cmap);
assert_eq!(map.get(&0x20), Some(&'H'));
assert_eq!(map.get(&0x21), Some(&'i'));
}
// Eight hex digits decode to four bytes.
#[test]
fn decode_hex_bytes_roundtrip() {
let hex = b"00010002";
let bytes = decode_hex_bytes(hex);
assert_eq!(bytes, vec![0x00, 0x01, 0x00, 0x02]);
}
// A plain literal string becomes one LitStr token.
#[test]
fn litstr_tokenizer_basic() {
let stream = b"(Hello)";
let tokens = tokenize(stream);
assert!(matches!(&tokens[0], Token::LitStr(b) if b == b"Hello"));
}
// `\n` and the octal escape `\041` ('!') are resolved inside a literal string.
#[test]
fn litstr_escapes() {
let stream = b"(He\\nllo\\041)"; let tokens = tokenize(stream);
match &tokens[0] {
Token::LitStr(b) => {
assert_eq!(b[0], b'H');
assert_eq!(b[1], b'e');
assert_eq!(b[2], b'\n');
assert_eq!(b[3], b'l');
assert_eq!(b[6], b'!');
}
_ => panic!("expected LitStr"),
}
}
// Strings and kerning numbers inside `[...]` are kept in order.
#[test]
fn litstr_in_array() {
let stream = b"[(Hel) -50 (lo)]";
let tokens = tokenize(stream);
if let Token::Array(items) = &tokens[0] {
assert!(matches!(&items[0], Token::LitStr(b) if b == b"Hel"));
assert!(matches!(&items[1], Token::Number(n) if (*n + 50.0).abs() < 0.1));
assert!(matches!(&items[2], Token::LitStr(b) if b == b"lo"));
} else {
panic!("expected Array");
}
}
// All operators of a small text object are emitted as keywords.
#[test]
fn tokenizer_smoke() {
let stream = b"BT\n/F0 12 Tf\n100 200 Td\n<0001> Tj\nET\n";
let tokens = tokenize(stream);
let keywords: Vec<&[u8]> = tokens
.iter()
.filter_map(|t| if let Token::Keyword(k) = t { Some(k.as_slice()) } else { None })
.collect();
assert!(keywords.contains(&b"BT".as_slice()));
assert!(keywords.contains(&b"Tf".as_slice()));
assert!(keywords.contains(&b"Td".as_slice()));
assert!(keywords.contains(&b"Tj".as_slice()));
assert!(keywords.contains(&b"ET".as_slice()));
}
// The `first [w0 w1 ...]` form of a /W array yields a single run.
#[test]
fn parse_w_array_run_format() {
let arr = vec![
Object::Integer(0),
Object::Array(vec![
Object::Integer(500),
Object::Integer(600),
Object::Integer(700),
]),
];
let runs = parse_w_array(&arr);
assert_eq!(runs.len(), 1);
assert_eq!(runs[0].start_gid, 0);
assert_eq!(runs[0].widths, vec![500, 600, 700]);
}
// Codes outside every run fall back to the default width.
#[test]
fn font_info_advance_width_fallback() {
let info = FontInfo {
to_unicode: BTreeMap::new(),
dw: 1000,
w_runs: vec![WidthRun { start_gid: 5, widths: vec![600] }],
bytes_per_char: 2,
};
assert_eq!(info.advance_width(5), 600);
assert_eq!(info.advance_width(0), 1000);
assert_eq!(info.advance_width(99), 1000);
}
// Landmark entries of the WinAnsi table, including the unassigned 0x7F.
#[test]
fn win_ansi_spot_checks() {
assert_eq!(WIN_ANSI_ENCODING[0x20], Some(' '));
assert_eq!(WIN_ANSI_ENCODING[0x41], Some('A'));
assert_eq!(WIN_ANSI_ENCODING[0x80], Some('€'));
assert_eq!(WIN_ANSI_ENCODING[0xE9], Some('é'));
assert_eq!(WIN_ANSI_ENCODING[0x7F], None);
}
// The glyph table must be strictly sorted because lookups binary-search it.
#[test]
fn agl_table_sorted() {
for i in 1..AGL_TABLE.len() {
assert!(
AGL_TABLE[i - 1].0 < AGL_TABLE[i].0,
"AGL_TABLE not sorted at index {i}: {:?} >= {:?}",
AGL_TABLE[i - 1].0,
AGL_TABLE[i].0
);
}
}
// Glyph name lookups are case-sensitive and return None for unknown names.
#[test]
fn glyph_name_lookup_spot_checks() {
assert_eq!(glyph_name_to_char(b"space"), Some(' '));
assert_eq!(glyph_name_to_char(b"eacute"), Some('é'));
assert_eq!(glyph_name_to_char(b"euro"), None); assert_eq!(glyph_name_to_char(b"Euro"), Some('€'));
assert_eq!(glyph_name_to_char(b"fi"), Some('\u{FB01}'));
assert_eq!(glyph_name_to_char(b"nonexistent"), None);
}
// Unassigned table slots do not appear in the resulting map.
#[test]
fn encoding_table_to_btree_basic() {
let map = encoding_table_to_btree(&WIN_ANSI_ENCODING);
assert_eq!(map.get(&0x41), Some(&'A'));
assert_eq!(map.get(&0x80), Some(&'€'));
assert!(!map.contains_key(&0x7F)); }
}