use crate::error::{ExtractError, Result};
use lopdf::content::{Content, Operation};
use lopdf::{Document, Object, ObjectId};
use std::collections::HashMap;
use std::sync::OnceLock;
/// Estimated glyph advance expressed as a fraction of the font size.
///
/// The text-showing operators multiply this by the current font size and the
/// horizontal scaling percentage to approximate the width of each glyph, since
/// no real font metrics are consulted in this file.
const APPROX_CHAR_WIDTH: f64 = 0.5;
/// One run of text emitted by a single text-showing operator (`Tj`, `TJ`, `'`
/// or `"`) of a page content stream.
#[derive(Debug, Clone)]
pub struct TextBlock {
/// Decoded text of the run (after ligature decomposition when enabled).
pub text: String,
/// Page number the extraction was invoked with.
pub page: u32,
/// `[x0, y0, x1, y1]`: starts at the text-matrix translation and extends by
/// the estimated text width horizontally and by the font size vertically.
pub bbox: [f64; 4],
/// Font resource name most recently selected with `Tf`.
pub font_name: String,
/// Font size most recently selected with `Tf`.
pub font_size: f64,
/// `ActualText` of the innermost enclosing marked-content sequence that
/// carries one, falling back to the value inherited from the caller.
pub actual_text: Option<String>,
}
/// A single character together with the page it was found on and its box.
///
/// Values of this type are returned by `extract_positioned_chars`.
#[derive(Debug, Clone)]
pub struct PositionedChar {
/// The decoded character.
pub ch: char,
/// Page number the character belongs to.
pub page: u32,
/// Bounding box of the character.
// NOTE(review): presumably `[x0, y0, x1, y1]` like `TextBlock::bbox`; the
// code that fills it is outside this part of the file, so confirm there.
pub bbox: [f64; 4],
}
/// Snapshot of the graphics state that is pushed by the `q` operator and
/// restored by the `Q` operator. Only the transformation matrix is tracked.
#[derive(Debug, Clone)]
struct GraphicsState {
/// Current transformation matrix at the time of the `q`.
ctm: [f64; 6],
}
/// Mutable interpreter state kept while walking a content stream.
#[derive(Debug, Clone)]
struct TextState {
/// Text matrix; reset by `BT`, replaced by `Tm`/`Td`/`TD`/`T*`, and advanced
/// horizontally as glyphs are shown.
tm: [f64; 6],
/// Text line matrix; updated together with `tm` by the positioning operators.
tlm: [f64; 6],
/// Font resource name selected by `Tf`.
font_name: String,
/// Font size selected by `Tf`.
font_size: f64,
/// Value of the `Tc` operator (also set by `"`); added to every glyph advance.
tc: f64,
/// Value of the `Tw` operator (also set by `"`).
tw: f64,
/// Value of the `Tz` operator, used as a percentage (`th / 100.0`).
th: f64,
/// Value of the `TL` operator (also set to `-ty` by `TD`); used by `T*`, `'`
/// and `"` to move to the next line.
tl: f64,
/// Value of the `Ts` operator.
ts: f64,
/// Stack of graphics states saved by `q` and restored by `Q`.
gs_stack: Vec<GraphicsState>,
/// Current transformation matrix, concatenated by `cm`.
ctm: [f64; 6],
}
impl Default for TextState {
    /// Initial interpreter state: identity matrices everywhere, a 12-unit font
    /// with no name, 100 % horizontal scaling, and every spacing/rise/leading
    /// parameter at zero.
    fn default() -> Self {
        const IDENTITY: [f64; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

        Self {
            // Matrices.
            ctm: IDENTITY,
            tm: IDENTITY,
            tlm: IDENTITY,
            gs_stack: Vec::new(),
            // Font selection.
            font_name: String::new(),
            font_size: 12.0,
            // Text parameters.
            th: 100.0,
            tc: 0.0,
            tw: 0.0,
            tl: 0.0,
            ts: 0.0,
        }
    }
}
/// Decoding information gathered for one font resource of a page.
#[derive(Clone)]
struct FontInfo {
/// `true` when the font's `Subtype` is `Type0`; such fonts are decoded two
/// bytes at a time.
is_cid: bool,
/// Character code → Unicode string, parsed from the font's `ToUnicode` CMap
/// (or, for `Type0` fonts, from a descendant font's CMap as a fallback).
to_unicode: HashMap<u32, String>,
/// Per-byte character table for simple fonts, built from the `Encoding` entry.
encoding_map: [Option<char>; 256],
/// Byte codes whose `Differences` glyph name is `ct`; they decode to the
/// `ct` ligature marker instead of a regular character.
ct_codes: [bool; 256],
}
fn build_font_map(doc: &Document, page_id: ObjectId) -> HashMap<String, FontInfo> {
let mut map = HashMap::new();
let resources = get_page_resources(doc, page_id);
let font_dict = match resources.and_then(|res| match res.get(b"Font").ok()? {
Object::Dictionary(d) => Some(d.clone()),
Object::Reference(r) => match doc.get_object(*r).ok()? {
Object::Dictionary(d) => Some(d.clone()),
_ => None,
},
_ => None,
}) {
Some(d) => d,
None => return map,
};
for (name_bytes, value) in font_dict.iter() {
let font_name = String::from_utf8_lossy(name_bytes).to_string();
let font = match value {
Object::Reference(r) => match doc.get_object(*r).ok() {
Some(Object::Dictionary(d)) => d.clone(),
_ => continue,
},
Object::Dictionary(d) => d.clone(),
_ => continue,
};
let subtype = font
.get(b"Subtype")
.ok()
.and_then(|o| match o {
Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
_ => None,
})
.unwrap_or_default();
let is_cid = subtype == "Type0";
let to_unicode = parse_to_unicode_from_font(doc, &font);
let to_unicode = if to_unicode.is_empty() && is_cid {
if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
descendants
.iter()
.find_map(|d| {
let desc_dict = match d {
Object::Reference(r) => match doc.get_object(*r).ok()? {
Object::Dictionary(d) => d,
_ => return None,
},
Object::Dictionary(d) => d,
_ => return None,
};
let tu = parse_to_unicode_from_font(doc, desc_dict);
if tu.is_empty() {
None
} else {
Some(tu)
}
})
.unwrap_or_default()
} else {
HashMap::new()
}
} else {
to_unicode
};
let (encoding_map, ct_codes) = if !is_cid {
build_encoding_map(doc, &font)
} else {
([None; 256], [false; 256])
};
map.insert(
font_name,
FontInfo {
is_cid,
to_unicode,
encoding_map,
ct_codes,
},
);
}
map
}
/// Loads and parses the `ToUnicode` CMap attached to `font`.
///
/// The entry may be a stream or a reference to one. The stream is decompressed
/// when possible; if decompression fails its raw content is parsed instead.
/// Returns an empty map when the entry is missing, cannot be resolved, or is
/// not a stream.
fn parse_to_unicode_from_font(doc: &Document, font: &lopdf::Dictionary) -> HashMap<u32, String> {
    let Ok(entry) = font.get(b"ToUnicode") else {
        return HashMap::new();
    };

    let resolved = match entry {
        Object::Reference(id) => doc.get_object(*id).ok(),
        direct => Some(direct),
    };
    let Some(Object::Stream(stream)) = resolved else {
        return HashMap::new();
    };

    let cmap_bytes = stream
        .decompressed_content()
        .unwrap_or_else(|_| stream.content.clone());
    parse_to_unicode_cmap(&cmap_bytes)
}
/// Parses the `bfchar` and `bfrange` sections of a ToUnicode CMap into a
/// character-code → Unicode-string table.
///
/// Only text enclosed by `beginbfchar`/`endbfchar` or
/// `beginbfrange`/`endbfrange` is interpreted. Everything else (for example the
/// `codespacerange` section) is ignored, so unrelated hex tokens can never be
/// mistaken for mappings.
///
/// The parser is tolerant of malformed input: invalid UTF-8, non-hex digits,
/// truncated sections and out-of-range numbers are skipped or mapped to an
/// empty string rather than causing a panic.
fn parse_to_unicode_cmap(data: &[u8]) -> HashMap<u32, String> {
    /// Upper bound on the number of codes a single `bfrange` entry is expanded
    /// to. Character codes in content streams are at most two bytes wide, and a
    /// hostile range such as `<00000000> <FFFFFFFF>` must not exhaust memory.
    const MAX_RANGE_SPAN: u32 = 0x1_0000;

    /// Splits a `bfrange` section into its flat hex tokens and its `[...]`
    /// destination arrays. Each array leaves an empty-string placeholder in the
    /// token list so that entries stay aligned in groups of three.
    fn tokenize_bfrange(section: &str) -> (Vec<String>, Vec<Vec<String>>) {
        let mut tokens = Vec::new();
        let mut arrays = Vec::new();
        let mut open_array: Option<Vec<String>> = None;

        let mut chars = section.chars();
        while let Some(ch) = chars.next() {
            match ch {
                '<' => {
                    let hex: String = chars
                        .by_ref()
                        .take_while(|&c| c != '>')
                        .filter(|c| !c.is_whitespace())
                        .collect();
                    match open_array.as_mut() {
                        Some(items) => items.push(hex),
                        None => tokens.push(hex),
                    }
                }
                '[' => open_array = Some(Vec::new()),
                ']' => {
                    arrays.push(open_array.take().unwrap_or_default());
                    tokens.push(String::new());
                }
                _ => {}
            }
        }
        (tokens, arrays)
    }

    let text = String::from_utf8_lossy(data);
    let mut map = HashMap::new();

    // `split` yields the text *before* the first marker as its first item;
    // that prefix is not part of any section, hence `skip(1)`.
    for chunk in text.split("beginbfchar").skip(1) {
        let section = chunk.split("endbfchar").next().unwrap_or(chunk);
        let tokens = extract_hex_tokens(section);
        for pair in tokens.chunks_exact(2) {
            map.insert(parse_hex_u32(&pair[0]), hex_to_unicode_string(&pair[1]));
        }
    }

    for chunk in text.split("beginbfrange").skip(1) {
        let section = chunk.split("endbfrange").next().unwrap_or(chunk);
        let (tokens, arrays) = tokenize_bfrange(section);
        let mut arrays = arrays.iter();

        for entry in tokens.chunks_exact(3) {
            let lo = parse_hex_u32(&entry[0]);
            let hi = parse_hex_u32(&entry[1]);
            let dst = &entry[2];

            if dst.is_empty() {
                // Array form: one explicit destination per consecutive code.
                let Some(destinations) = arrays.next() else {
                    continue;
                };
                for (code, item) in (lo..=hi).zip(destinations) {
                    map.insert(code, hex_to_unicode_string(item));
                }
            } else {
                // Scalar form: the destination is incremented along with the code.
                let dst_start = parse_hex_u32(dst);
                let dst_len = dst.len();
                for (code, offset) in (lo..=hi).zip(0..MAX_RANGE_SPAN) {
                    let Some(dst_val) = dst_start.checked_add(offset) else {
                        break;
                    };
                    let mapped = if dst_len <= 4 {
                        char::from_u32(dst_val)
                            .map(String::from)
                            .unwrap_or_default()
                    } else {
                        let hex = format!("{:0>width$X}", dst_val, width = dst_len);
                        hex_to_unicode_string(&hex)
                    };
                    map.insert(code, mapped);
                }
            }
        }
    }

    map
}

/// Collects the contents of every `<...>` token in `text`, with whitespace
/// inside the brackets removed. An unterminated trailing token is dropped.
fn extract_hex_tokens(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut current = String::new();
    let mut in_hex = false;

    for ch in text.chars() {
        match ch {
            '<' => {
                in_hex = true;
                current.clear();
            }
            '>' if in_hex => {
                in_hex = false;
                tokens.push(std::mem::take(&mut current));
            }
            _ if in_hex && !ch.is_whitespace() => current.push(ch),
            _ => {}
        }
    }
    tokens
}

/// Parses `hex` as a base-16 `u32`, yielding 0 when it is empty, contains
/// non-hex characters, or does not fit.
fn parse_hex_u32(hex: &str) -> u32 {
    u32::from_str_radix(hex, 16).unwrap_or(0)
}

/// Decodes a hex string into text.
///
/// The digits are read as bytes, two digits at a time (a trailing single digit
/// forms a byte of its own; undecodable pairs are skipped). An even number of
/// bytes is interpreted as UTF-16BE, a single byte as the code point of the
/// same value, and anything else yields an empty string.
fn hex_to_unicode_string(hex: &str) -> String {
    // Work on raw bytes: `hex` may contain multi-byte characters (it comes from
    // lossily decoded data), and slicing the `str` at fixed offsets would panic
    // on a non-boundary index.
    let bytes: Vec<u8> = hex
        .as_bytes()
        .chunks(2)
        .filter_map(|digits| std::str::from_utf8(digits).ok())
        .filter_map(|digits| u8::from_str_radix(digits, 16).ok())
        .collect();

    let units = bytes.chunks_exact(2);
    if bytes.len() >= 2 && units.remainder().is_empty() {
        let utf16: Vec<u16> = units
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&utf16)
    } else if let [single] = bytes.as_slice() {
        char::from(*single).to_string()
    } else {
        String::new()
    }
}
/// Finds the `Resources` dictionary that applies to a page.
///
/// The page's own entry is preferred; otherwise the `Parent` chain is followed
/// and the first ancestor with a resolvable `Resources` dictionary wins. The
/// walk is limited to 20 ancestors so that a cyclic parent chain terminates.
///
/// Returns `None` when the page is not a dictionary, the chain is broken, or
/// no dictionary on the way provides resources.
fn get_page_resources(doc: &Document, page_id: ObjectId) -> Option<lopdf::Dictionary> {
    const MAX_ANCESTORS: usize = 20;

    let Object::Dictionary(page) = doc.get_object(page_id).ok()? else {
        return None;
    };

    let mut node: &lopdf::Dictionary = page;
    // One iteration for the page itself plus one per permitted ancestor.
    for _ in 0..=MAX_ANCESTORS {
        if let Some(found) = resolve_dict(doc, node, b"Resources") {
            return Some(found);
        }
        let Ok(Object::Reference(parent_id)) = node.get(b"Parent") else {
            return None;
        };
        let Ok(Object::Dictionary(parent)) = doc.get_object(*parent_id) else {
            return None;
        };
        node = parent;
    }
    None
}
/// Returns a copy of the dictionary stored under `key` in `dict`.
///
/// The value may be an inline dictionary or a reference to one; a single level
/// of indirection is followed. Any other value, a missing key, or a dangling
/// reference yields `None`.
fn resolve_dict(doc: &Document, dict: &lopdf::Dictionary, key: &[u8]) -> Option<lopdf::Dictionary> {
    let target = match dict.get(key).ok()? {
        Object::Reference(id) => doc.get_object(*id).ok()?,
        direct => direct,
    };
    match target {
        Object::Dictionary(found) => Some(found.clone()),
        _ => None,
    }
}
/// Builds the byte → character table of a simple font from its `Encoding`
/// entry, together with the set of byte codes that denote the `ct` ligature.
///
/// The entry may be a base-encoding name or an encoding dictionary, either
/// inline or behind one reference. When the entry is absent or of any other
/// kind, both tables are returned empty.
fn build_encoding_map(
    doc: &Document,
    font: &lopdf::Dictionary,
) -> ([Option<char>; 256], [bool; 256]) {
    let mut table = [None::<char>; 256];
    let mut ct_codes = [false; 256];

    if let Ok(entry) = font.get(b"Encoding") {
        let resolved = match entry {
            Object::Reference(id) => doc.get_object(*id).ok(),
            direct => Some(direct),
        };
        match resolved {
            Some(Object::Name(name)) => {
                apply_base_encoding(&mut table, &String::from_utf8_lossy(name));
            }
            Some(Object::Dictionary(enc_dict)) => {
                parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
            }
            _ => {}
        }
    }

    (table, ct_codes)
}
fn parse_encoding_dict(
doc: &Document,
enc_dict: &lopdf::Dictionary,
table: &mut [Option<char>; 256],
ct_codes: &mut [bool; 256],
) {
if let Ok(Object::Name(base)) = enc_dict.get(b"BaseEncoding") {
let base_str = String::from_utf8_lossy(base);
apply_base_encoding(table, &base_str);
}
let diffs = match enc_dict.get(b"Differences").ok() {
Some(Object::Array(arr)) => arr.clone(),
Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
Some(Object::Array(arr)) => arr.clone(),
_ => return,
},
_ => return,
};
let mut code: Option<u32> = None;
for item in &diffs {
match item {
Object::Integer(n) => {
code = Some(*n as u32);
}
Object::Name(name) => {
if let Some(c) = code {
if c < 256 {
let glyph = String::from_utf8_lossy(name);
apply_glyph_name(&glyph, c as usize, table, ct_codes);
}
code = Some(c + 1);
}
}
Object::Reference(r) => {
if let Ok(Object::Name(name)) = doc.get_object(*r) {
if let Some(c) = code {
if c < 256 {
let glyph = String::from_utf8_lossy(name);
apply_glyph_name(&glyph, c as usize, table, ct_codes);
}
code = Some(c + 1);
}
}
}
_ => {}
}
}
}
/// Records the meaning of glyph name `glyph` for byte code `code`.
///
/// The name `ct` clears the table slot and flags the code as a `ct` ligature.
/// Any other name that maps to a Unicode character fills the slot and clears
/// the flag. Unknown names leave both tables untouched.
fn apply_glyph_name(
    glyph: &str,
    code: usize,
    table: &mut [Option<char>; 256],
    ct_codes: &mut [bool; 256],
) {
    match glyph {
        "ct" => {
            table[code] = None;
            ct_codes[code] = true;
        }
        other => {
            if let Some(ch) = glyph_name_to_unicode(other) {
                table[code] = Some(ch);
                ct_codes[code] = false;
            }
        }
    }
}
/// Overlays the named base encoding onto `table`.
///
/// `WinAnsiEncoding` and `MacRomanEncoding` are recognised; any other name
/// leaves the table unchanged. Codes the base encoding leaves undefined
/// (stored as `'\0'`) do not overwrite existing entries.
fn apply_base_encoding(table: &mut [Option<char>; 256], name: &str) {
    let source: &[char; 256] = match name {
        "WinAnsiEncoding" => winansi_encoding(),
        "MacRomanEncoding" => mac_roman_encoding(),
        _ => return,
    };

    for (slot, &ch) in table.iter_mut().zip(source.iter()) {
        if ch != '\0' {
            *slot = Some(ch);
        }
    }
}
/// Lazily built byte → character table used for `WinAnsiEncoding`.
///
/// Printable ASCII and `0xA0..=0xFF` map to the code point of the same value,
/// tab/LF/CR map to themselves, and the `0x80..=0x9F` block carries the
/// characters listed in `CP1252_HIGH`. Every other slot is `'\0'`, meaning
/// "undefined".
fn winansi_encoding() -> &'static [char; 256] {
    /// Characters assigned in the `0x80..=0x9F` block.
    const CP1252_HIGH: [(u8, char); 27] = [
        (0x80, '\u{20AC}'),
        (0x82, '\u{201A}'),
        (0x83, '\u{0192}'),
        (0x84, '\u{201E}'),
        (0x85, '\u{2026}'),
        (0x86, '\u{2020}'),
        (0x87, '\u{2021}'),
        (0x88, '\u{02C6}'),
        (0x89, '\u{2030}'),
        (0x8A, '\u{0160}'),
        (0x8B, '\u{2039}'),
        (0x8C, '\u{0152}'),
        (0x8E, '\u{017D}'),
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x98, '\u{02DC}'),
        (0x99, '\u{2122}'),
        (0x9A, '\u{0161}'),
        (0x9B, '\u{203A}'),
        (0x9C, '\u{0153}'),
        (0x9E, '\u{017E}'),
        (0x9F, '\u{0178}'),
    ];
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();

    TABLE.get_or_init(|| {
        let mut table = ['\0'; 256];

        // Bytes that coincide with the code point of the same value.
        for byte in (0x20..=0x7Eu8).chain(0xA0..=0xFFu8) {
            table[usize::from(byte)] = char::from(byte);
        }
        for (byte, ch) in [(0x09u8, '\t'), (0x0A, '\n'), (0x0D, '\r')] {
            table[usize::from(byte)] = ch;
        }
        for (byte, ch) in CP1252_HIGH {
            table[usize::from(byte)] = ch;
        }

        table
    })
}
/// Lazily built byte → character table used for `MacRomanEncoding`.
///
/// Printable ASCII maps to itself, tab/LF/CR map to themselves, and the upper
/// half (`0x80..=0xFF`) is taken from `UPPER_HALF`. Every other slot is `'\0'`,
/// meaning "undefined".
fn mac_roman_encoding() -> &'static [char; 256] {
    /// Characters for bytes `0x80..=0xFF`, in byte order (eight per row).
    const UPPER_HALF: [char; 128] = [
        // 0x80
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}',
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}',
        // 0x90
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}',
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}',
        // 0xA0
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}',
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}',
        // 0xB0
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}',
        // 0xC0
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}',
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}',
        // 0xD0
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}',
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{20AC}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
        // 0xE0
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}',
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        // 0xF0
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}',
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}',
    ];
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();

    TABLE.get_or_init(|| {
        let mut table = ['\0'; 256];

        for byte in 0x20..=0x7Eu8 {
            table[usize::from(byte)] = char::from(byte);
        }
        for (byte, ch) in [(0x09u8, '\t'), (0x0A, '\n'), (0x0D, '\r')] {
            table[usize::from(byte)] = ch;
        }
        table[0x80..].copy_from_slice(&UPPER_HALF);

        table
    })
}
/// Maps a glyph name to the Unicode character it stands for.
///
/// Resolution order:
/// 1. `uniXXXX` — the four characters after `uni` are read as a hex code
///    point; anything following them is ignored. A name of this shape that
///    does not yield a valid character resolves to `None`.
/// 2. `uXXXX…` — at least four characters after `u`, all hex digits, read as a
///    hex code point.
/// 3. The built-in glyph-name table.
/// 4. The ligature names `st` and `longst`.
///
/// Never panics, even when `name` contains multi-byte characters (names are
/// decoded lossily from raw PDF bytes).
fn glyph_name_to_unicode(name: &str) -> Option<char> {
    if name.starts_with("uni") && name.len() >= 7 {
        // `get` instead of indexing: byte 7 is not guaranteed to be a character
        // boundary, and `&name[3..7]` would panic in that case.
        return name
            .get(3..7)
            .and_then(|hex| u32::from_str_radix(hex, 16).ok())
            .and_then(char::from_u32);
    }

    if let Some(hex) = name.strip_prefix('u') {
        if hex.len() >= 4 && hex.chars().all(|c| c.is_ascii_hexdigit()) {
            return u32::from_str_radix(hex, 16)
                .ok()
                .and_then(char::from_u32);
        }
    }

    if let Some(&ch) = agl_table().get(name) {
        return Some(ch);
    }

    match name {
        "st" => Some('\u{FB06}'),
        "longst" => Some('\u{FB05}'),
        _ => None,
    }
}
/// Lazily built lookup table from glyph names to Unicode characters.
///
/// The ASCII letters, whose glyph name is the letter itself, are generated
/// from `ASCII_LETTERS`; every other name is listed explicitly in `NAMED`.
fn agl_table() -> &'static HashMap<&'static str, char> {
    /// Letters whose glyph name equals the character.
    const ASCII_LETTERS: &'static str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    /// All remaining glyph names.
    const NAMED: &'static [(&'static str, char)] = &[
        // ASCII punctuation and digits.
        ("space", ' '),
        ("exclam", '!'),
        ("quotedbl", '"'),
        ("numbersign", '#'),
        ("dollar", '$'),
        ("percent", '%'),
        ("ampersand", '&'),
        ("quotesingle", '\''),
        ("parenleft", '('),
        ("parenright", ')'),
        ("asterisk", '*'),
        ("plus", '+'),
        ("comma", ','),
        ("hyphen", '-'),
        ("period", '.'),
        ("slash", '/'),
        ("zero", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
        ("colon", ':'),
        ("semicolon", ';'),
        ("less", '<'),
        ("equal", '='),
        ("greater", '>'),
        ("question", '?'),
        ("at", '@'),
        ("bracketleft", '['),
        ("backslash", '\\'),
        ("bracketright", ']'),
        ("asciicircum", '^'),
        ("underscore", '_'),
        ("grave", '`'),
        ("braceleft", '{'),
        ("bar", '|'),
        ("braceright", '}'),
        ("asciitilde", '~'),
        // Accented capitals.
        ("Agrave", '\u{00C0}'),
        ("Aacute", '\u{00C1}'),
        ("Acircumflex", '\u{00C2}'),
        ("Atilde", '\u{00C3}'),
        ("Adieresis", '\u{00C4}'),
        ("Aring", '\u{00C5}'),
        ("AE", '\u{00C6}'),
        ("Ccedilla", '\u{00C7}'),
        ("Egrave", '\u{00C8}'),
        ("Eacute", '\u{00C9}'),
        ("Ecircumflex", '\u{00CA}'),
        ("Edieresis", '\u{00CB}'),
        ("Igrave", '\u{00CC}'),
        ("Iacute", '\u{00CD}'),
        ("Icircumflex", '\u{00CE}'),
        ("Idieresis", '\u{00CF}'),
        ("Eth", '\u{00D0}'),
        ("Ntilde", '\u{00D1}'),
        ("Ograve", '\u{00D2}'),
        ("Oacute", '\u{00D3}'),
        ("Ocircumflex", '\u{00D4}'),
        ("Otilde", '\u{00D5}'),
        ("Odieresis", '\u{00D6}'),
        ("Ugrave", '\u{00D9}'),
        ("Uacute", '\u{00DA}'),
        ("Ucircumflex", '\u{00DB}'),
        ("Udieresis", '\u{00DC}'),
        ("Yacute", '\u{00DD}'),
        ("Thorn", '\u{00DE}'),
        ("germandbls", '\u{00DF}'),
        // Accented lowercase.
        ("agrave", '\u{00E0}'),
        ("aacute", '\u{00E1}'),
        ("acircumflex", '\u{00E2}'),
        ("atilde", '\u{00E3}'),
        ("adieresis", '\u{00E4}'),
        ("aring", '\u{00E5}'),
        ("ae", '\u{00E6}'),
        ("ccedilla", '\u{00E7}'),
        ("egrave", '\u{00E8}'),
        ("eacute", '\u{00E9}'),
        ("ecircumflex", '\u{00EA}'),
        ("edieresis", '\u{00EB}'),
        ("igrave", '\u{00EC}'),
        ("iacute", '\u{00ED}'),
        ("icircumflex", '\u{00EE}'),
        ("idieresis", '\u{00EF}'),
        ("eth", '\u{00F0}'),
        ("ntilde", '\u{00F1}'),
        ("ograve", '\u{00F2}'),
        ("oacute", '\u{00F3}'),
        ("ocircumflex", '\u{00F4}'),
        ("otilde", '\u{00F5}'),
        ("odieresis", '\u{00F6}'),
        ("ugrave", '\u{00F9}'),
        ("uacute", '\u{00FA}'),
        ("ucircumflex", '\u{00FB}'),
        ("udieresis", '\u{00FC}'),
        ("yacute", '\u{00FD}'),
        ("thorn", '\u{00FE}'),
        ("ydieresis", '\u{00FF}'),
        // Ligatures.
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("ff", '\u{FB00}'),
        ("ffi", '\u{FB03}'),
        ("ffl", '\u{FB04}'),
        // Typographic punctuation.
        ("endash", '\u{2013}'),
        ("emdash", '\u{2014}'),
        ("bullet", '\u{2022}'),
        ("ellipsis", '\u{2026}'),
        ("quoteleft", '\u{2018}'),
        ("quoteright", '\u{2019}'),
        ("quotedblleft", '\u{201C}'),
        ("quotedblright", '\u{201D}'),
        ("quotesinglebase", '\u{201A}'),
        ("quotesinglbase", '\u{201A}'),
        ("quotedblbase", '\u{201E}'),
        ("dagger", '\u{2020}'),
        ("daggerdbl", '\u{2021}'),
        ("perthousand", '\u{2030}'),
        ("guilsinglleft", '\u{2039}'),
        ("guilsinglright", '\u{203A}'),
        ("guillemotleft", '\u{00AB}'),
        ("guillemotright", '\u{00BB}'),
        // Symbols and currency.
        ("trademark", '\u{2122}'),
        ("copyright", '\u{00A9}'),
        ("registered", '\u{00AE}'),
        ("degree", '\u{00B0}'),
        ("plusminus", '\u{00B1}'),
        ("multiply", '\u{00D7}'),
        ("divide", '\u{00F7}'),
        ("fraction", '\u{2044}'),
        ("Euro", '\u{20AC}'),
        ("sterling", '\u{00A3}'),
        ("yen", '\u{00A5}'),
        ("cent", '\u{00A2}'),
        ("currency", '\u{00A4}'),
        ("section", '\u{00A7}'),
        ("paragraph", '\u{00B6}'),
        ("brokenbar", '\u{00A6}'),
        ("ordfeminine", '\u{00AA}'),
        ("ordmasculine", '\u{00BA}'),
        ("exclamdown", '\u{00A1}'),
        ("questiondown", '\u{00BF}'),
        ("logicalnot", '\u{00AC}'),
        ("mu", '\u{00B5}'),
        // Spacing accents.
        ("macron", '\u{00AF}'),
        ("acute", '\u{00B4}'),
        ("cedilla", '\u{00B8}'),
        ("dieresis", '\u{00A8}'),
        ("circumflex", '\u{02C6}'),
        ("tilde", '\u{02DC}'),
        ("caron", '\u{02C7}'),
        ("ring", '\u{02DA}'),
        ("breve", '\u{02D8}'),
        ("dotaccent", '\u{02D9}'),
        ("hungarumlaut", '\u{02DD}'),
        ("ogonek", '\u{02DB}'),
        // Spaces, hyphens and dots.
        ("nbspace", '\u{00A0}'),
        ("nonbreakingspace", '\u{00A0}'),
        ("softhyphen", '\u{00AD}'),
        ("periodcentered", '\u{00B7}'),
        ("middot", '\u{00B7}'),
        // Additional Latin letters.
        ("florin", '\u{0192}'),
        ("OE", '\u{0152}'),
        ("oe", '\u{0153}'),
        ("Scaron", '\u{0160}'),
        ("scaron", '\u{0161}'),
        ("Zcaron", '\u{017D}'),
        ("zcaron", '\u{017E}'),
        ("Ydieresis", '\u{0178}'),
        ("Lslash", '\u{0141}'),
        ("lslash", '\u{0142}'),
        ("Oslash", '\u{00D8}'),
        ("oslash", '\u{00F8}'),
        ("dotlessi", '\u{0131}'),
        // Superscripts and fractions.
        ("onesuperior", '\u{00B9}'),
        ("twosuperior", '\u{00B2}'),
        ("threesuperior", '\u{00B3}'),
        ("onequarter", '\u{00BC}'),
        ("onehalf", '\u{00BD}'),
        ("threequarters", '\u{00BE}'),
        // Mathematical symbols and Greek.
        ("minus", '\u{2212}'),
        ("notequal", '\u{2260}'),
        ("lessequal", '\u{2264}'),
        ("greaterequal", '\u{2265}'),
        ("infinity", '\u{221E}'),
        ("partialdiff", '\u{2202}'),
        ("summation", '\u{2211}'),
        ("product", '\u{220F}'),
        ("integral", '\u{222B}'),
        ("radical", '\u{221A}'),
        ("approxequal", '\u{2248}'),
        ("Delta", '\u{0394}'),
        ("lozenge", '\u{25CA}'),
        ("pi", '\u{03C0}'),
        ("Omega", '\u{03A9}'),
    ];

    static TABLE: OnceLock<HashMap<&'static str, char>> = OnceLock::new();

    TABLE.get_or_init(|| {
        let mut table: HashMap<&'static str, char> = NAMED.iter().copied().collect();
        // Each letter is a one-byte slice of the static string, so the key is
        // `'static` without any allocation.
        table.extend(
            ASCII_LETTERS
                .char_indices()
                .map(|(at, letter)| (&ASCII_LETTERS[at..=at], letter)),
        );
        table
    })
}
/// Placeholder character (a Private Use Area code point) pushed for byte codes
/// flagged in `FontInfo::ct_codes`. It is replaced by the text `ct` during
/// ligature decomposition, but only for characters whose origin flag is set.
const CT_LIGATURE_MARKER: char = '\u{E007}';
/// Decoded text of a PDF string operand, with one flag per character telling
/// whether that character was produced by a `ct` ligature byte code.
///
/// `ct_origins` holds exactly one entry per `char` of `text`, in order.
#[derive(Clone, Default)]
struct DecodedPdfString {
    text: String,
    ct_origins: Vec<bool>,
}

impl DecodedPdfString {
    /// Wraps already-decoded text; none of its characters is a `ct` origin.
    fn from_text(text: String) -> Self {
        let ct_origins = text.chars().map(|_| false).collect();
        Self { text, ct_origins }
    }

    /// Appends one character together with its origin flag.
    fn push_char(&mut self, ch: char, is_ct_origin: bool) {
        self.ct_origins.push(is_ct_origin);
        self.text.push(ch);
    }

    /// Appends every character of `s`, none of them flagged as a `ct` origin.
    fn push_str(&mut self, s: &str) {
        self.ct_origins.extend(s.chars().map(|_| false));
        self.text.push_str(s);
    }

    /// Appends another decoded string, keeping its origin flags.
    fn extend(&mut self, other: DecodedPdfString) {
        let DecodedPdfString { text, ct_origins } = other;
        self.text.push_str(&text);
        self.ct_origins.extend(ct_origins);
    }

    /// `true` when no text has been decoded.
    fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Number of recorded characters (one flag is stored per character).
    fn glyph_count(&self) -> usize {
        self.ct_origins.len()
    }

    /// Iterates over `(character, is_ct_origin)` pairs in order.
    fn iter(&self) -> impl Iterator<Item = (char, bool)> + '_ {
        let flags = self.ct_origins.iter().copied();
        self.text.chars().zip(flags)
    }
}
/// Decodes the bytes of a string operand using the current font.
///
/// * Bytes starting with `FE FF` are decoded as UTF-16BE regardless of font.
/// * A composite font with a ToUnicode table is read as big-endian two-byte
///   codes; unmapped codes fall back to the code point of the same value
///   unless that is a control character other than tab or newline.
/// * A simple font is read byte by byte, consulting in order its ToUnicode
///   table, its `ct` ligature codes, its encoding table, and finally the code
///   point of the same value (if printable). Empty tables simply never match,
///   so a font without any table degrades to the raw fallback.
/// * Without a font, or for a composite font lacking a ToUnicode table, every
///   printable byte is taken as the code point of the same value.
fn decode_pdf_string_with_font_marked(
    bytes: &[u8],
    font_info: Option<&FontInfo>,
) -> DecodedPdfString {
    /// Last-resort decoding of a single byte.
    fn push_raw_byte(out: &mut DecodedPdfString, byte: u8) {
        let ch = char::from(byte);
        if is_printable_or_space(ch) {
            out.push_char(ch, false);
        }
    }

    if let [0xFE, 0xFF, payload @ ..] = bytes {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return DecodedPdfString::from_text(String::from_utf16_lossy(&units));
    }

    let mut decoded = DecodedPdfString::default();
    match font_info {
        Some(info) if info.is_cid && !info.to_unicode.is_empty() => {
            for pair in bytes.chunks_exact(2) {
                let code = u32::from(u16::from_be_bytes([pair[0], pair[1]]));
                match info.to_unicode.get(&code) {
                    Some(mapped) => decoded.push_str(mapped),
                    None => {
                        if let Some(ch) = char::from_u32(code) {
                            if !ch.is_control() || matches!(ch, ' ' | '\t' | '\n') {
                                decoded.push_char(ch, false);
                            }
                        }
                    }
                }
            }
        }
        Some(info) if !info.is_cid => {
            for &byte in bytes {
                let slot = usize::from(byte);
                if let Some(mapped) = info.to_unicode.get(&u32::from(byte)) {
                    decoded.push_str(mapped);
                } else if info.ct_codes[slot] {
                    decoded.push_char(CT_LIGATURE_MARKER, true);
                } else if let Some(ch) = info.encoding_map[slot] {
                    decoded.push_char(ch, false);
                } else {
                    push_raw_byte(&mut decoded, byte);
                }
            }
        }
        _ => {
            for &byte in bytes {
                push_raw_byte(&mut decoded, byte);
            }
        }
    }
    decoded
}
/// Returns `true` for tab, line feed, carriage return, and every character at
/// or above the space character (U+0020); `false` for all other characters
/// below U+0020.
fn is_printable_or_space(ch: char) -> bool {
    matches!(ch, '\t' | '\n' | '\r') || ch >= ' '
}
/// Compile-time switch: when `true`, extracted text has its ligature
/// characters expanded into their component letters; when `false`, the decoded
/// text is returned unchanged.
const LIGATURE_DECOMP: bool = true;
/// Returns the letters a Latin ligature character (U+FB00 through U+FB06)
/// stands for, or `None` for any other character.
fn decompose_ligature_char(c: char) -> Option<&'static str> {
    /// Expansions for U+FB00..=U+FB06, indexed by offset from U+FB00.
    const EXPANSIONS: [&'static str; 7] = ["ff", "fi", "fl", "ffi", "ffl", "st", "st"];

    let offset = u32::from(c).checked_sub(0xFB00)?;
    EXPANSIONS.get(offset as usize).copied()
}
fn decompose_ligature_char_with_origin(c: char, is_ct_origin: bool) -> Option<&'static str> {
if is_ct_origin && c == CT_LIGATURE_MARKER {
Some("ct")
} else {
decompose_ligature_char(c)
}
}
/// Returns the expanded form of a presentation-form character, if it has one.
///
/// Latin ligatures use the fixed table of `decompose_ligature_char`. Any other
/// character in U+FB00..=U+FB4F is replaced by its NFKD decomposition when
/// that differs from the character itself. All remaining characters yield
/// `None`.
fn decompose_glyph_to_string(c: char) -> Option<String> {
    use unicode_normalization::UnicodeNormalization;

    decompose_ligature_char(c).map(str::to_owned).or_else(|| {
        if !('\u{FB00}'..='\u{FB4F}').contains(&c) {
            return None;
        }
        let expanded: String = c.nfkd().collect();
        let unchanged = expanded.chars().eq(std::iter::once(c));
        if unchanged {
            None
        } else {
            Some(expanded)
        }
    })
}
/// Returns `s` with every decomposable character replaced by its expansion;
/// all other characters are copied unchanged.
fn decompose_ligatures(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut expanded, ch| {
            match decompose_glyph_to_string(ch) {
                Some(replacement) => expanded.push_str(&replacement),
                None => expanded.push(ch),
            }
            expanded
        })
}
/// Expands ligatures in a decoded string while honouring the per-character
/// `ct` origin flags: a flagged marker character becomes `ct`, other
/// ligatures use their regular expansion, and everything else is copied.
fn decompose_decoded_ligatures(s: &DecodedPdfString) -> String {
    let mut expanded = String::with_capacity(s.text.len());
    for (ch, from_ct) in s.iter() {
        match decompose_ligature_char_with_origin(ch, from_ct) {
            Some(fixed) => expanded.push_str(fixed),
            None => match decompose_glyph_to_string(ch) {
                Some(dynamic) => expanded.push_str(&dynamic),
                None => expanded.push(ch),
            },
        }
    }
    expanded
}
/// Produces the display text for a decoded string.
///
/// With ligature decomposition disabled the text is returned as is. Otherwise
/// the origin-aware expansion is used when at least one character is flagged
/// as a `ct` origin, and the plain expansion in all other cases.
fn maybe_decompose_decoded(s: &DecodedPdfString) -> String {
    if !LIGATURE_DECOMP {
        return s.text.clone();
    }

    let has_ct_origin = s.ct_origins.contains(&true);
    if has_ct_origin {
        decompose_decoded_ligatures(s)
    } else {
        decompose_ligatures(&s.text)
    }
}
pub fn extract_page_blocks(doc: &Document, page_num: u32) -> Vec<TextBlock> {
let pages = doc.get_pages();
let Some(&page_id) = pages.get(&page_num) else {
return Vec::new();
};
let font_map = build_font_map(doc, page_id);
let resources = get_page_resources(doc, page_id).unwrap_or_default();
if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
if let Ok(content) = Content::decode(&content_bytes) {
return extract_blocks_from_ops_inner(
&content.operations,
page_num,
&font_map,
Some((doc, &resources)),
0,
None,
);
}
}
Vec::new()
}
/// Extracts the text blocks of the page object `page_id`, labelling each block
/// with `page_num`.
///
/// Returns an empty vector when the page's content cannot be read or decoded.
pub fn extract_blocks_from_page_id(
    doc: &Document,
    page_id: lopdf::ObjectId,
    page_num: u32,
) -> Vec<TextBlock> {
    let font_map = build_font_map(doc, page_id);
    let resources = get_page_resources(doc, page_id).unwrap_or_default();

    let Ok(content_bytes) = get_page_content_bytes(doc, page_id) else {
        return Vec::new();
    };
    let Ok(content) = Content::decode(&content_bytes) else {
        return Vec::new();
    };

    extract_blocks_from_ops_inner(
        &content.operations,
        page_num,
        &font_map,
        Some((doc, &resources)),
        0,
        None,
    )
}
pub fn extract_text(doc: &Document) -> Vec<TextBlock> {
let pages = doc.get_pages();
let mut blocks = Vec::new();
for (&page_num, &page_id) in &pages {
let font_map = build_font_map(doc, page_id);
let resources = get_page_resources(doc, page_id).unwrap_or_default();
if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
if let Ok(content) = Content::decode(&content_bytes) {
let page_blocks = extract_blocks_from_ops_inner(
&content.operations,
page_num,
&font_map,
Some((doc, &resources)),
0,
None,
);
blocks.extend(page_blocks);
}
}
}
blocks
}
/// Returns the text of page `page_num` as the concatenation of its text
/// blocks, with no separator between blocks.
///
/// An unreadable content stream is treated as empty, and content that fails to
/// decode yields an empty string.
///
/// # Errors
///
/// Returns `ExtractError::PageOutOfRange` when `page_num` is zero, exceeds the
/// number of pages, or is missing from the page table.
pub fn extract_page_text(doc: &Document, page_num: u32) -> Result<String> {
    let pages = doc.get_pages();
    let total = pages.len() as u32;
    let out_of_range = || ExtractError::PageOutOfRange(page_num, total);

    if page_num == 0 || page_num > total {
        return Err(out_of_range());
    }
    let page_id = *pages.get(&page_num).ok_or_else(out_of_range)?;

    let font_map = build_font_map(doc, page_id);
    let resources = get_page_resources(doc, page_id).unwrap_or_default();
    let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
    let Ok(content) = Content::decode(&content_bytes) else {
        return Ok(String::new());
    };

    let blocks = extract_blocks_from_ops_inner(
        &content.operations,
        page_num,
        &font_map,
        Some((doc, &resources)),
        0,
        None,
    );
    Ok(blocks.iter().map(|block| block.text.as_str()).collect())
}
/// Returns the positioned characters of page `page_num`.
///
/// An unreadable content stream is treated as empty, and content that fails to
/// decode yields an empty vector.
///
/// # Errors
///
/// Returns `ExtractError::PageOutOfRange` when `page_num` is zero, exceeds the
/// number of pages, or is missing from the page table.
pub fn extract_positioned_chars(doc: &Document, page_num: u32) -> Result<Vec<PositionedChar>> {
    let pages = doc.get_pages();
    let total = pages.len() as u32;
    let out_of_range = || ExtractError::PageOutOfRange(page_num, total);

    if page_num == 0 || page_num > total {
        return Err(out_of_range());
    }
    let page_id = *pages.get(&page_num).ok_or_else(out_of_range)?;

    let font_map = build_font_map(doc, page_id);
    let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
    let Ok(content) = Content::decode(&content_bytes) else {
        return Ok(Vec::new());
    };

    Ok(extract_chars_from_ops(
        &content.operations,
        page_num,
        &font_map,
    ))
}
/// Reads the content bytes of a page, discarding the error detail on failure.
fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> std::result::Result<Vec<u8>, ()> {
    doc.get_page_content(page_id).or(Err(()))
}
/// One level of the marked-content stack, pushed by `BMC` (with no text) and
/// by `BDC` (with the text taken from its property list, if any).
#[derive(Debug, Clone, Default)]
struct MarkedContentEntry {
/// `ActualText` declared by this marked-content sequence, if present.
actual_text: Option<String>,
}
/// Resolves the property-list operand of a `BDC` operator to a dictionary.
///
/// The operand may be an inline dictionary, a reference to a dictionary, or a
/// name that is looked up in the `Properties` sub-dictionary of the supplied
/// resources. Reference and name operands need `doc_and_resources`; without
/// it, or when the lookup fails, `None` is returned.
fn resolve_bdc_properties(
    doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
    operand: &Object,
) -> Option<lopdf::Dictionary> {
    match operand {
        Object::Dictionary(inline) => Some(inline.clone()),
        Object::Reference(id) => {
            let (doc, _) = doc_and_resources?;
            match doc.get_object(*id) {
                Ok(Object::Dictionary(target)) => Some(target.clone()),
                _ => None,
            }
        }
        Object::Name(name) => {
            let (doc, resources) = doc_and_resources?;
            let properties = resolve_dict(doc, resources, b"Properties")?;
            resolve_dict(doc, &properties, name)
        }
        _ => None,
    }
}
/// Decodes a PDF text string (as opposed to a font-encoded content string).
///
/// A leading `FE FF` selects UTF-16BE (a trailing odd byte is ignored), a
/// leading `EF BB BF` selects UTF-8 (decoded lossily), and anything else is
/// read byte by byte as the code point of the same value, dropping bytes that
/// are not printable.
fn decode_pdf_text_string(bytes: &[u8]) -> String {
    match bytes {
        [0xFE, 0xFF, payload @ ..] => {
            let units: Vec<u16> = payload
                .chunks_exact(2)
                .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                .collect();
            String::from_utf16_lossy(&units)
        }
        [0xEF, 0xBB, 0xBF, payload @ ..] => String::from_utf8_lossy(payload).into_owned(),
        _ => bytes
            .iter()
            .map(|&byte| char::from(byte))
            .filter(|&ch| is_printable_or_space(ch))
            .collect(),
    }
}
/// Returns the decoded `ActualText` entry of a property list, or `None` when
/// the entry is missing or is not a string.
fn extract_actual_text(props: &lopdf::Dictionary) -> Option<String> {
    match props.get(b"ActualText") {
        Ok(Object::String(raw, _)) => Some(decode_pdf_text_string(raw)),
        _ => None,
    }
}
/// Returns the `ActualText` in effect: that of the innermost marked-content
/// entry carrying one, otherwise the value inherited from the caller.
fn current_actual_text(stack: &[MarkedContentEntry], inherited: Option<&str>) -> Option<String> {
    for entry in stack.iter().rev() {
        if let Some(text) = &entry.actual_text {
            return Some(text.clone());
        }
    }
    inherited.map(str::to_owned)
}
fn extract_blocks_from_ops_inner(
ops: &[Operation],
page: u32,
font_map: &HashMap<String, FontInfo>,
doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
depth: u32,
inherited_actual_text: Option<&str>,
) -> Vec<TextBlock> {
let mut state = TextState::default();
let mut blocks = Vec::new();
let mut mc_stack: Vec<MarkedContentEntry> = Vec::new();
for op in ops {
match op.operator.as_str() {
"q" => {
state.gs_stack.push(GraphicsState { ctm: state.ctm });
}
"Q" => {
if let Some(gs) = state.gs_stack.pop() {
state.ctm = gs.ctm;
}
}
"cm" => {
if let Some(m) = extract_matrix(&op.operands) {
state.ctm = multiply_matrix(&state.ctm, &m);
}
}
"BT" => {
state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
}
"Tf" => {
if op.operands.len() >= 2 {
if let Object::Name(ref name) = op.operands[0] {
state.font_name = String::from_utf8_lossy(name).to_string();
}
if let Some(size) = as_number(&op.operands[1]) {
state.font_size = size;
}
}
}
"Tc" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.tc = v;
}
}
"Tw" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.tw = v;
}
}
"Tz" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.th = v;
}
}
"TL" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.tl = v;
}
}
"Ts" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.ts = v;
}
}
"Td" => {
if op.operands.len() >= 2 {
let tx = as_number(&op.operands[0]).unwrap_or(0.0);
let ty = as_number(&op.operands[1]).unwrap_or(0.0);
let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
state.tlm = new_tlm;
state.tm = new_tlm;
}
}
"TD" => {
if op.operands.len() >= 2 {
let tx = as_number(&op.operands[0]).unwrap_or(0.0);
let ty = as_number(&op.operands[1]).unwrap_or(0.0);
state.tl = -ty;
let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
state.tlm = new_tlm;
state.tm = new_tlm;
}
}
"Tm" => {
if let Some(m) = extract_matrix(&op.operands) {
state.tm = m;
state.tlm = m;
}
}
"T*" => {
let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
state.tlm = new_tlm;
state.tm = new_tlm;
}
"Tj" => {
let fi = font_map.get(&state.font_name);
if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
if !text.is_empty() {
let x = state.tm[4];
let y = state.tm[5];
let text_width = text.glyph_count() as f64 * char_w;
let display_text = maybe_decompose_decoded(&text);
blocks.push(TextBlock {
text: display_text,
page,
bbox: [x, y, x + text_width, y + state.font_size],
font_name: state.font_name.clone(),
font_size: state.font_size,
actual_text: current_actual_text(&mc_stack, inherited_actual_text),
});
}
for _ in text.iter() {
state.tm[4] += char_w + state.tc;
}
}
}
"TJ" => {
if let Some(Object::Array(ref arr)) = op.operands.first() {
let x_start = state.tm[4];
let y = state.tm[5];
let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
let mut combined_text = DecodedPdfString::default();
let fi = font_map.get(&state.font_name);
for item in arr {
match item {
Object::String(bytes, _) => {
let text = decode_pdf_string_with_font_marked(bytes, fi);
for _ in text.iter() {
state.tm[4] += char_w + state.tc;
}
combined_text.extend(text);
}
_ => {
if let Some(adj) = as_number(item) {
state.tm[4] -= adj / 1000.0 * state.font_size;
}
}
}
}
if !combined_text.is_empty() {
let x_end = state.tm[4];
blocks.push(TextBlock {
text: maybe_decompose_decoded(&combined_text),
page,
bbox: [x_start, y, x_end, y + state.font_size],
font_name: state.font_name.clone(),
font_size: state.font_size,
actual_text: current_actual_text(&mc_stack, inherited_actual_text),
});
}
}
}
"'" => {
let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
state.tlm = new_tlm;
state.tm = new_tlm;
let fi = font_map.get(&state.font_name);
if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
if !text.is_empty() {
let x = state.tm[4];
let y = state.tm[5];
let text_width = text.glyph_count() as f64 * char_w;
let display_text = maybe_decompose_decoded(&text);
blocks.push(TextBlock {
text: display_text,
page,
bbox: [x, y, x + text_width, y + state.font_size],
font_name: state.font_name.clone(),
font_size: state.font_size,
actual_text: current_actual_text(&mc_stack, inherited_actual_text),
});
}
for _ in text.iter() {
state.tm[4] += char_w + state.tc;
}
}
}
"\"" => {
if op.operands.len() >= 3 {
if let Some(tw) = as_number(&op.operands[0]) {
state.tw = tw;
}
if let Some(tc) = as_number(&op.operands[1]) {
state.tc = tc;
}
let new_tlm =
multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
state.tlm = new_tlm;
state.tm = new_tlm;
let fi = font_map.get(&state.font_name);
if let Some(text) =
extract_decoded_string_operand_with_font(&op.operands[2..], fi)
{
let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
if !text.is_empty() {
let x = state.tm[4];
let y = state.tm[5];
let text_width = text.glyph_count() as f64 * char_w;
let display_text = maybe_decompose_decoded(&text);
blocks.push(TextBlock {
text: display_text,
page,
bbox: [x, y, x + text_width, y + state.font_size],
font_name: state.font_name.clone(),
font_size: state.font_size,
actual_text: current_actual_text(&mc_stack, inherited_actual_text),
});
}
for _ in text.iter() {
state.tm[4] += char_w + state.tc;
}
}
}
}
"BMC" => {
mc_stack.push(MarkedContentEntry::default());
}
"BDC" => {
let actual_text = op
.operands
.get(1)
.and_then(|o| resolve_bdc_properties(doc_and_resources, o))
.as_ref()
.and_then(extract_actual_text);
mc_stack.push(MarkedContentEntry { actual_text });
}
"EMC" => {
mc_stack.pop();
}
"Do" => {
if depth < 5 {
if let Some((doc, resources)) = doc_and_resources {
if let Some(Object::Name(ref xobj_name)) = op.operands.first() {
let xobj_name_str = String::from_utf8_lossy(xobj_name);
let inherited_for_child =
current_actual_text(&mc_stack, inherited_actual_text);
if let Some(xobj_blocks) = extract_form_xobject_text(
doc,
resources,
&xobj_name_str,
page,
font_map,
depth,
inherited_for_child.as_deref(),
) {
blocks.extend(xobj_blocks);
}
}
}
}
}
_ => {}
}
}
blocks
}
/// Extracts text blocks from the Form XObject called `name` in `resources`.
///
/// Returns `None` when the `/XObject` dictionary or the named entry cannot be
/// resolved, when the entry is not an indirect reference to a stream, when the
/// stream's `/Subtype` is not `Form`, or when its content cannot be decoded.
///
/// * `font_map` – fonts visible to the caller; fonts declared in the
///   XObject's own `/Resources` are layered on top, replacing same-named keys.
/// * `depth` – current nesting level; the nested extraction runs at
///   `depth + 1`.
/// * `inherited_actual_text` – forwarded unchanged to the nested extraction.
fn extract_form_xobject_text(
    doc: &Document,
    resources: &lopdf::Dictionary,
    name: &str,
    page: u32,
    font_map: &HashMap<String, FontInfo>,
    depth: u32,
    inherited_actual_text: Option<&str>,
) -> Option<Vec<TextBlock>> {
    // `/XObject` may be stored inline or behind an indirect reference.
    let xobjects = match resources.get(b"XObject").ok()? {
        Object::Dictionary(dict) => dict,
        Object::Reference(id) => match doc.get_object(*id).ok()? {
            Object::Dictionary(dict) => dict,
            _ => return None,
        },
        _ => return None,
    };

    // Only an indirect reference to a stream object is accepted here.
    let Object::Reference(target) = xobjects.get(name.as_bytes()).ok()? else {
        return None;
    };
    let Object::Stream(stream) = doc.get_object(*target).ok()? else {
        return None;
    };

    // Anything that is not a Form XObject is not interpreted.
    let is_form = matches!(
        stream.dict.get(b"Subtype"),
        Ok(Object::Name(kind)) if kind.as_slice() == b"Form"
    );
    if !is_form {
        return None;
    }

    // Fall back to the raw bytes when decompression fails.
    let raw = stream
        .decompressed_content()
        .unwrap_or_else(|_| stream.content.clone());
    let content = Content::decode(&raw).ok()?;

    // The XObject's own resources, if it declares any.
    let own_resources = resolve_dict(doc, &stream.dict, b"Resources");

    let mut fonts_in_scope = font_map.clone();
    let own_fonts = own_resources
        .as_ref()
        .and_then(|res| resolve_dict(doc, res, b"Font"));
    if let Some(fonts) = own_fonts {
        for (key, value) in fonts.iter() {
            if let Some(info) = build_font_info_from_value(doc, value) {
                fonts_in_scope.insert(String::from_utf8_lossy(key).to_string(), info);
            }
        }
    }

    // Without its own `/Resources` the XObject uses the caller's.
    let child_resources = own_resources.unwrap_or_else(|| resources.clone());
    Some(extract_blocks_from_ops_inner(
        &content.operations,
        page,
        &fonts_in_scope,
        Some((doc, &child_resources)),
        depth + 1,
        inherited_actual_text,
    ))
}
/// Builds a [`FontInfo`] from one entry of a `/Font` resource dictionary.
///
/// `value` may be the font dictionary itself or an indirect reference to it;
/// any other object, or an unresolvable reference, yields `None`.
///
/// A font whose `/Subtype` is `Type0` is flagged as CID. For such fonts an
/// empty ToUnicode table is replaced by the first non-empty table found among
/// the `/DescendantFonts`, and no single-byte encoding map is built.
fn build_font_info_from_value(doc: &Document, value: &Object) -> Option<FontInfo> {
    let font = match value {
        Object::Reference(id) => match doc.get_object(*id).ok()? {
            Object::Dictionary(dict) => dict,
            _ => return None,
        },
        Object::Dictionary(dict) => dict,
        _ => return None,
    };

    let is_cid = matches!(
        font.get(b"Subtype"),
        Ok(Object::Name(kind)) if kind.as_slice() == b"Type0"
    );

    let mut to_unicode = parse_to_unicode_from_font(doc, font);
    if to_unicode.is_empty() && is_cid {
        if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
            // Lazily walk the descendants and stop at the first usable table.
            let inherited = descendants
                .iter()
                .filter_map(|entry| match entry {
                    Object::Reference(id) => match doc.get_object(*id) {
                        Ok(Object::Dictionary(dict)) => Some(dict),
                        _ => None,
                    },
                    Object::Dictionary(dict) => Some(dict),
                    _ => None,
                })
                .map(|dict| parse_to_unicode_from_font(doc, dict))
                .find(|table| !table.is_empty());
            if let Some(table) = inherited {
                to_unicode = table;
            }
        }
    }

    let (encoding_map, ct_codes) = if is_cid {
        ([None; 256], [false; 256])
    } else {
        build_encoding_map(doc, font)
    };

    Some(FontInfo {
        is_cid,
        to_unicode,
        encoding_map,
        ct_codes,
    })
}
/// Appends the [`PositionedChar`]s for one shown glyph and advances the text
/// position.
///
/// The glyph origin is the current text position mapped through the CTM (see
/// [`apply_ctm`]). When `LIGATURE_DECOMP` is enabled and the glyph has a
/// decomposition, one entry is pushed per constituent character, each taking
/// an equal share of `char_w`; otherwise a single entry `char_w` wide is
/// pushed. Every box is `state.font_size` tall.
///
/// Whatever was emitted, `state.tm[4]` moves by `char_w + state.tc` exactly
/// once, i.e. a decomposed glyph still advances as a single glyph.
fn push_glyph_positioned(
    chars: &mut Vec<PositionedChar>,
    state: &mut TextState,
    page: u32,
    glyph: char,
    is_ct_origin: bool,
    char_w: f64,
) {
    let (origin_x, origin_y) = apply_ctm(state);
    let top = origin_y + state.font_size;

    // The ligature table is consulted first, then the generic decomposition.
    let expansion: Option<String> = if LIGATURE_DECOMP {
        decompose_ligature_char_with_origin(glyph, is_ct_origin)
            .map(|parts| parts.to_string())
            .or_else(|| decompose_glyph_to_string(glyph))
    } else {
        None
    };

    if let Some(parts) = expansion {
        let slice_w = char_w / parts.chars().count() as f64;
        chars.extend(parts.chars().enumerate().map(|(idx, part)| {
            let left = origin_x + slice_w * idx as f64;
            PositionedChar {
                ch: part,
                page,
                bbox: [left, origin_y, left + slice_w, top],
            }
        }));
    } else {
        chars.push(PositionedChar {
            ch: glyph,
            page,
            bbox: [origin_x, origin_y, origin_x + char_w, top],
        });
    }

    state.tm[4] += char_w + state.tc;
}
fn extract_chars_from_ops(
ops: &[Operation],
page: u32,
font_map: &HashMap<String, FontInfo>,
) -> Vec<PositionedChar> {
let mut state = TextState::default();
let mut chars = Vec::new();
for op in ops {
match op.operator.as_str() {
"q" => {
state.gs_stack.push(GraphicsState { ctm: state.ctm });
}
"Q" => {
if let Some(gs) = state.gs_stack.pop() {
state.ctm = gs.ctm;
}
}
"cm" => {
if let Some(m) = extract_matrix(&op.operands) {
state.ctm = multiply_matrix(&state.ctm, &m);
}
}
"BT" => {
state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
}
"Tf" => {
if op.operands.len() >= 2 {
if let Object::Name(ref name) = op.operands[0] {
state.font_name = String::from_utf8_lossy(name).to_string();
}
if let Some(size) = as_number(&op.operands[1]) {
state.font_size = size;
}
}
}
"Tc" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.tc = v;
}
}
"Tw" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.tw = v;
}
}
"Tz" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.th = v;
}
}
"TL" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.tl = v;
}
}
"Ts" => {
if let Some(v) = op.operands.first().and_then(as_number) {
state.ts = v;
}
}
"Td" => {
if op.operands.len() >= 2 {
let tx = as_number(&op.operands[0]).unwrap_or(0.0);
let ty = as_number(&op.operands[1]).unwrap_or(0.0);
let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
state.tlm = new_tlm;
state.tm = new_tlm;
}
}
"TD" => {
if op.operands.len() >= 2 {
let tx = as_number(&op.operands[0]).unwrap_or(0.0);
let ty = as_number(&op.operands[1]).unwrap_or(0.0);
state.tl = -ty;
let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
state.tlm = new_tlm;
state.tm = new_tlm;
}
}
"Tm" => {
if let Some(m) = extract_matrix(&op.operands) {
state.tm = m;
state.tlm = m;
}
}
"T*" => {
let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
state.tlm = new_tlm;
state.tm = new_tlm;
}
"Tj" => {
let fi = font_map.get(&state.font_name);
if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
for (ch, is_ct_origin) in text.iter() {
push_glyph_positioned(
&mut chars,
&mut state,
page,
ch,
is_ct_origin,
char_w,
);
}
}
}
"TJ" => {
if let Some(Object::Array(ref arr)) = op.operands.first() {
let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
let fi = font_map.get(&state.font_name);
for item in arr {
match item {
Object::String(bytes, _) => {
let text = decode_pdf_string_with_font_marked(bytes, fi);
for (ch, is_ct_origin) in text.iter() {
push_glyph_positioned(
&mut chars,
&mut state,
page,
ch,
is_ct_origin,
char_w,
);
}
}
_ => {
if let Some(adj) = as_number(item) {
state.tm[4] -= adj / 1000.0 * state.font_size;
}
}
}
}
}
}
"'" => {
let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
state.tlm = new_tlm;
state.tm = new_tlm;
let fi = font_map.get(&state.font_name);
if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
for (ch, is_ct_origin) in text.iter() {
push_glyph_positioned(
&mut chars,
&mut state,
page,
ch,
is_ct_origin,
char_w,
);
}
}
}
"\"" => {
if op.operands.len() >= 3 {
if let Some(tw) = as_number(&op.operands[0]) {
state.tw = tw;
}
if let Some(tc) = as_number(&op.operands[1]) {
state.tc = tc;
}
let new_tlm =
multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
state.tlm = new_tlm;
state.tm = new_tlm;
let fi = font_map.get(&state.font_name);
if let Some(text) =
extract_decoded_string_operand_with_font(&op.operands[2..], fi)
{
let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
for (ch, is_ct_origin) in text.iter() {
push_glyph_positioned(
&mut chars,
&mut state,
page,
ch,
is_ct_origin,
char_w,
);
}
}
}
}
_ => {}
}
}
chars
}
/// Maps the translation part of the text matrix, `(tm[4], tm[5])`, through
/// the current transformation matrix and returns the resulting `(x, y)`.
#[inline]
fn apply_ctm(state: &TextState) -> (f64, f64) {
    let [a, b, c, d, e, f] = state.ctm;
    let (tx, ty) = (state.tm[4], state.tm[5]);
    (a * tx + c * ty + e, b * tx + d * ty + f)
}
/// Decodes the first string operand in `operands` using `font_info`.
///
/// Non-string operands are skipped; `None` is returned when there is no
/// string operand at all.
fn extract_decoded_string_operand_with_font(
    operands: &[Object],
    font_info: Option<&FontInfo>,
) -> Option<DecodedPdfString> {
    operands.iter().find_map(|operand| match operand {
        Object::String(bytes, _) => Some(decode_pdf_string_with_font_marked(bytes, font_info)),
        _ => None,
    })
}
/// Returns the numeric value of an integer or real PDF object as `f64`,
/// or `None` for any other object kind.
fn as_number(obj: &Object) -> Option<f64> {
    let value = match *obj {
        Object::Integer(int) => int as f64,
        Object::Real(real) => real as f64,
        _ => return None,
    };
    Some(value)
}
/// Reads the first six operands as the matrix `[a, b, c, d, e, f]`.
///
/// Returns `None` when fewer than six operands are present or when any of
/// the first six is not a number. Extra operands are ignored.
fn extract_matrix(operands: &[Object]) -> Option<[f64; 6]> {
    let head = operands.get(..6)?;
    let mut matrix = [0.0_f64; 6];
    for (slot, operand) in matrix.iter_mut().zip(head) {
        *slot = as_number(operand)?;
    }
    Some(matrix)
}
/// Multiplies two affine matrices given in `[a, b, c, d, e, f]` form and
/// returns `m1 × m2`: applied to a row vector `[x, y, 1]`, the result
/// transforms by `m1` first and by `m2` second.
fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    let [a1, b1, c1, d1, e1, f1] = *m1;
    let [a2, b2, c2, d2, e2, f2] = *m2;
    [
        a1 * a2 + b1 * c2,
        a1 * b2 + b1 * d2,
        c1 * a2 + d1 * c2,
        c1 * b2 + d1 * d2,
        // m1's translation is carried through m2 before m2's own is added.
        e1 * a2 + f1 * c2 + e2,
        e1 * b2 + f1 * d2 + f2,
    ]
}
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};

    /// Completes a one-page document around an already-registered content
    /// stream: optionally registers `resources` as an indirect object, then
    /// adds the Page, Pages and Catalog objects and points the trailer's
    /// `Root` at the catalog. Shared by every `make_doc_with_*` builder.
    fn finish_single_page_doc(
        mut doc: Document,
        content_id: lopdf::ObjectId,
        resources: Option<lopdf::Dictionary>,
    ) -> Document {
        let mut page_dict = dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(content_id),
        };
        if let Some(resources) = resources {
            let resources_id = doc.add_object(Object::Dictionary(resources));
            page_dict.set("Resources", Object::Reference(resources_id));
        }
        let page_id = doc.add_object(Object::Dictionary(page_dict));
        let pages_dict = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![Object::Reference(page_id)],
            "Count" => 1_i64,
        };
        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
        // The page can only learn its parent once the Pages node has an id.
        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
            d.set("Parent", Object::Reference(pages_id));
        }
        let catalog = dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        };
        let catalog_id = doc.add_object(Object::Dictionary(catalog));
        doc.trailer.set("Root", Object::Reference(catalog_id));
        doc
    }

    /// One-page document with `content` as its content stream and no
    /// resources.
    fn make_doc_with_text(content: &[u8]) -> Document {
        let mut doc = Document::with_version("1.7");
        let content_stream = Stream::new(dictionary! {}, content.to_vec());
        let content_id = doc.add_object(Object::Stream(content_stream));
        finish_single_page_doc(doc, content_id, None)
    }

    #[test]
    fn extract_simple_text() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].text, "Hello World");
        assert_eq!(blocks[0].page, 1);
        assert_eq!(blocks[0].font_size, 12.0);
    }

    #[test]
    fn extract_page_text_single() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
        let text = extract_page_text(&doc, 1).unwrap();
        assert_eq!(text, "Hello");
    }

    #[test]
    fn extract_page_text_out_of_range() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
        let result = extract_page_text(&doc, 5);
        assert!(result.is_err());
    }

    #[test]
    fn extract_positioned_chars_basic() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf (AB) Tj ET");
        let chars = extract_positioned_chars(&doc, 1).unwrap();
        assert_eq!(chars.len(), 2);
        assert_eq!(chars[0].ch, 'A');
        assert_eq!(chars[1].ch, 'B');
        assert_eq!(chars[0].page, 1);
        // The second glyph must start to the right of the first.
        assert!(chars[1].bbox[0] > chars[0].bbox[0]);
    }

    #[test]
    fn extract_tj_array() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET");
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].text, "Hello");
    }

    #[test]
    fn empty_page_extracts_no_text() {
        let doc = make_doc_with_text(b"q Q");
        let blocks = extract_text(&doc);
        assert!(blocks.is_empty());
    }

    #[test]
    fn multiline_text_extraction() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf 12 TL (Line1) Tj T* (Line2) Tj ET");
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 2);
        assert_eq!(blocks[0].text, "Line1");
        assert_eq!(blocks[1].text, "Line2");
    }

    #[test]
    fn actual_text_basic_bdc_override() {
        let doc = make_doc_with_text(b"BT /F1 12 Tf /Span <</ActualText (fi)>> BDC (X) Tj EMC ET");
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        // The shown text is kept; the replacement is reported separately.
        assert_eq!(blocks[0].text, "X");
        assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
    }

    #[test]
    fn actual_text_nested_bdc_inner_overrides_outer() {
        let doc = make_doc_with_text(
            b"BT /F1 12 Tf \
              /Span <</ActualText (outer)>> BDC \
              (A) Tj \
              /Span <</ActualText (inner)>> BDC \
              (B) Tj \
              EMC \
              (C) Tj \
              EMC ET",
        );
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 3);
        assert_eq!(blocks[0].text, "A");
        assert_eq!(blocks[0].actual_text.as_deref(), Some("outer"));
        assert_eq!(blocks[1].text, "B");
        assert_eq!(blocks[1].actual_text.as_deref(), Some("inner"));
        // After the inner EMC the outer replacement applies again.
        assert_eq!(blocks[2].text, "C");
        assert_eq!(blocks[2].actual_text.as_deref(), Some("outer"));
    }

    #[test]
    fn actual_text_bmc_does_not_leak_emc() {
        let doc = make_doc_with_text(
            b"BT /F1 12 Tf \
              /Span <</ActualText (X)>> BDC \
              /Artifact BMC (in) Tj EMC \
              (out) Tj \
              EMC \
              (after) Tj ET",
        );
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 3);
        // The BMC's EMC must close only the BMC, not the enclosing BDC.
        assert_eq!(blocks[0].actual_text.as_deref(), Some("X"));
        assert_eq!(blocks[1].actual_text.as_deref(), Some("X"));
        assert_eq!(blocks[2].actual_text, None);
    }

    /// One-page document whose content wraps `page_pre`, a `/Fm0 Do`
    /// invocation and `page_post` in a single `/Span` BDC carrying
    /// `actual_text`. `/Fm0` is a Form XObject with `xobj_content` as its
    /// stream and no resources of its own.
    fn make_doc_with_xobj_inside_bdc(
        page_pre: &[u8],
        xobj_content: &[u8],
        page_post: &[u8],
        actual_text: &str,
    ) -> Document {
        let mut doc = Document::with_version("1.7");
        let xobj_dict = dictionary! {
            "Type" => "XObject",
            "Subtype" => "Form",
            "BBox" => vec![0.into(), 0.into(), 100.into(), 100.into()],
        };
        let xobj_stream = Stream::new(xobj_dict, xobj_content.to_vec());
        let xobj_id = doc.add_object(Object::Stream(xobj_stream));
        let mut page_content = Vec::<u8>::new();
        page_content.extend_from_slice(b"BT /F1 12 Tf /Span <</ActualText (");
        page_content.extend_from_slice(actual_text.as_bytes());
        page_content.extend_from_slice(b")>> BDC ");
        page_content.extend_from_slice(page_pre);
        page_content.extend_from_slice(b" /Fm0 Do ");
        page_content.extend_from_slice(page_post);
        page_content.extend_from_slice(b" EMC ET");
        let content_stream = Stream::new(dictionary! {}, page_content);
        let content_id = doc.add_object(Object::Stream(content_stream));
        let resources = dictionary! {
            "XObject" => dictionary! { "Fm0" => Object::Reference(xobj_id) },
        };
        finish_single_page_doc(doc, content_id, Some(resources))
    }

    #[test]
    fn actual_text_propagates_into_form_xobject_recursion() {
        let doc = make_doc_with_xobj_inside_bdc(
            b"(pre) Tj",
            b"BT /F1 12 Tf (inside) Tj ET",
            b"(post) Tj",
            "wrapped",
        );
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 3);
        assert_eq!(blocks[0].text, "pre");
        assert_eq!(blocks[0].actual_text.as_deref(), Some("wrapped"));
        assert_eq!(blocks[1].text, "inside");
        assert_eq!(
            blocks[1].actual_text.as_deref(),
            Some("wrapped"),
            "Form XObject text lost surrounding /ActualText (issue #1358)"
        );
        assert_eq!(blocks[2].text, "post");
        assert_eq!(blocks[2].actual_text.as_deref(), Some("wrapped"));
    }

    #[test]
    fn actual_text_inner_xobj_bdc_overrides_inherited() {
        let doc = make_doc_with_xobj_inside_bdc(
            b"",
            b"BT /F1 12 Tf \
              /Span <</ActualText (inner)>> BDC (B) Tj EMC \
              (after) Tj ET",
            b"",
            "outer",
        );
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 2);
        assert_eq!(blocks[0].text, "B");
        assert_eq!(blocks[0].actual_text.as_deref(), Some("inner"));
        // Outside the XObject's own BDC the inherited value applies.
        assert_eq!(blocks[1].text, "after");
        assert_eq!(blocks[1].actual_text.as_deref(), Some("outer"));
    }

    #[test]
    fn actual_text_utf16be_bom_decodes() {
        let doc = make_doc_with_text(
            b"BT /F1 12 Tf /Span <</ActualText <FEFF00660069> >> BDC (X) Tj EMC ET",
        );
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
    }

    #[test]
    fn ligature_ff_decomposes() {
        assert_eq!(decompose_ligature_char('\u{FB00}'), Some("ff"));
        assert_eq!(decompose_ligatures("\u{FB00}"), "ff");
        assert_eq!(decompose_ligatures("o\u{FB00}ice"), "office");
    }

    #[test]
    fn ligature_fi_decomposes() {
        assert_eq!(decompose_ligature_char('\u{FB01}'), Some("fi"));
        assert_eq!(decompose_ligatures("\u{FB01}"), "fi");
        assert_eq!(decompose_ligatures("of\u{FB01}ce"), "office");
    }

    #[test]
    fn ligature_fl_decomposes() {
        assert_eq!(decompose_ligature_char('\u{FB02}'), Some("fl"));
        assert_eq!(decompose_ligatures("\u{FB02}ame"), "flame");
    }

    #[test]
    fn ligature_ffi_decomposes() {
        assert_eq!(decompose_ligature_char('\u{FB03}'), Some("ffi"));
        assert_eq!(decompose_ligatures("o\u{FB03}ce"), "office");
    }

    #[test]
    fn ligature_ffl_decomposes() {
        assert_eq!(decompose_ligature_char('\u{FB04}'), Some("ffl"));
        assert_eq!(decompose_ligatures("ba\u{FB04}e"), "baffle");
    }

    #[test]
    fn ligature_st_decomposes() {
        assert_eq!(decompose_ligature_char('\u{FB06}'), Some("st"));
        assert_eq!(decompose_ligature_char('\u{FB05}'), Some("st"));
        assert_eq!(decompose_ligatures("fa\u{FB06}"), "fast");
        assert_eq!(glyph_name_to_unicode("st"), Some('\u{FB06}'));
    }

    #[test]
    fn ligature_ct_via_glyph_name_emits_string() {
        // "ct" has no single code point, yet the glyph must still yield text.
        assert_eq!(glyph_name_to_unicode("ct"), None);
        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6101> Tj ET", "ct");
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].text, "act");
    }

    #[test]
    fn tounicode_pua_e007_is_preserved() {
        let doc = make_doc_with_tounicode_cmap(b"BT /F1 12 Tf <01> Tj ET", 0x01, 0xE007);
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].text, "\u{E007}");
        assert_ne!(blocks[0].text, "ct");
    }

    #[test]
    fn ligature_ct_positioned_chars_advance_as_single_glyph() {
        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <610162> Tj ET", "ct");
        let chars = extract_positioned_chars(&doc, 1).unwrap();
        let extracted: String = chars.iter().map(|c| c.ch).collect();
        assert_eq!(extracted, "actb");
        assert_eq!(chars.len(), 4);
        let char_w = 12.0 * APPROX_CHAR_WIDTH;
        // 'c' starts one glyph in; 'b' starts two glyphs in, not three.
        assert!((chars[1].bbox[0] - char_w).abs() < 1e-6);
        assert!((chars[3].bbox[0] - (2.0 * char_w)).abs() < 1e-6);
    }

    /// One-page document whose font `/F1` is a Type1 font with a
    /// WinAnsi-based encoding in which code 1 is remapped to `glyph_name`
    /// through `/Differences`.
    fn make_doc_with_ligature_font(content: &[u8], glyph_name: &str) -> Document {
        let mut doc = Document::with_version("1.7");
        let content_stream = Stream::new(dictionary! {}, content.to_vec());
        let content_id = doc.add_object(Object::Stream(content_stream));
        let encoding = dictionary! {
            "Type" => "Encoding",
            "BaseEncoding" => "WinAnsiEncoding",
            "Differences" => vec![
                1_i64.into(),
                Object::Name(glyph_name.as_bytes().to_vec()),
            ],
        };
        let encoding_id = doc.add_object(Object::Dictionary(encoding));
        let font = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
            "Encoding" => Object::Reference(encoding_id),
        };
        let font_id = doc.add_object(Object::Dictionary(font));
        let resources = dictionary! {
            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
        };
        finish_single_page_doc(doc, content_id, Some(resources))
    }

    #[test]
    fn ligature_office_golden_ffi() {
        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6F016365> Tj ET", "ffi");
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].text, "office");
    }

    /// One-page document whose font `/F1` carries a ToUnicode CMap with a
    /// single `bfchar` entry mapping code `src` to code point `dst`.
    fn make_doc_with_tounicode_cmap(content: &[u8], src: u8, dst: u32) -> Document {
        let mut doc = Document::with_version("1.7");
        let content_stream = Stream::new(dictionary! {}, content.to_vec());
        let content_id = doc.add_object(Object::Stream(content_stream));
        let cmap_text = format!(
            "/CIDInit /ProcSet findresource begin\n\
             12 dict begin\n\
             begincmap\n\
             /CMapType 2 def\n\
             1 beginbfchar\n\
             <{:02X}> <{:04X}>\n\
             endbfchar\n\
             endcmap CMapName currentdict /CMap defineresource pop end end",
            src, dst
        );
        let cmap_stream = Stream::new(dictionary! {}, cmap_text.into_bytes());
        let cmap_id = doc.add_object(Object::Stream(cmap_stream));
        let font = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
            "Encoding" => "WinAnsiEncoding",
            "ToUnicode" => Object::Reference(cmap_id),
        };
        let font_id = doc.add_object(Object::Dictionary(font));
        let resources = dictionary! {
            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
        };
        finish_single_page_doc(doc, content_id, Some(resources))
    }

    #[test]
    fn ligature_positioned_chars_split_bbox() {
        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
        let chars = extract_positioned_chars(&doc, 1).unwrap();
        assert_eq!(chars.len(), 3, "ffi → 3 chars");
        assert_eq!(chars[0].ch, 'f');
        assert_eq!(chars[1].ch, 'f');
        assert_eq!(chars[2].ch, 'i');
        // Constituent boxes must not overlap ...
        assert!(chars[0].bbox[2] <= chars[1].bbox[0] + 1e-9);
        assert!(chars[1].bbox[2] <= chars[2].bbox[0] + 1e-9);
        // ... and together span exactly one glyph width.
        let total = chars[2].bbox[2] - chars[0].bbox[0];
        let expected = 12.0 * APPROX_CHAR_WIDTH;
        assert!((total - expected).abs() < 1e-6);
    }

    #[test]
    fn positioned_chars_match_extracted_text_for_armenian_ligature() {
        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB13");
        let blocks = extract_text(&doc);
        let chars = extract_positioned_chars(&doc, 1).unwrap();
        assert_eq!(blocks.len(), 1);
        assert_eq!(
            blocks[0].text.chars().count(),
            chars.len(),
            "PositionedChar count drifted from extract_text() char count \
             (issue #1359 problem 1): blocks[0].text = {:?}, chars = {:?}",
            blocks[0].text,
            chars.iter().map(|c| c.ch).collect::<String>(),
        );
        let chars_str: String = chars.iter().map(|c| c.ch).collect();
        assert_eq!(blocks[0].text, chars_str);
        let total = chars.last().unwrap().bbox[2] - chars.first().unwrap().bbox[0];
        let expected = 12.0 * APPROX_CHAR_WIDTH;
        assert!((total - expected).abs() < 1e-6);
    }

    #[test]
    fn hebrew_presentation_form_extracts_single_codepoint_nfkd_mapping() {
        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].text, "\u{05D0}");
        assert_eq!(
            decompose_glyph_to_string('\u{FB21}').as_deref(),
            Some("\u{05D0}")
        );
    }

    #[test]
    fn positioned_chars_match_extracted_text_for_hebrew_presentation_form() {
        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
        let blocks = extract_text(&doc);
        let chars = extract_positioned_chars(&doc, 1).unwrap();
        let chars_str: String = chars.iter().map(|c| c.ch).collect();
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].text, "\u{05D0}");
        assert_eq!(chars_str, blocks[0].text);
        assert_eq!(blocks[0].text.chars().count(), chars.len());
    }

    #[test]
    fn tj_block_width_matches_rendered_glyph_count() {
        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
        let blocks = extract_text(&doc);
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].text, "ffi");
        let width = blocks[0].bbox[2] - blocks[0].bbox[0];
        let expected = 12.0 * APPROX_CHAR_WIDTH;
        assert!(
            (width - expected).abs() < 1e-6,
            "Tj bbox width {} != one glyph's char_w {} (issue #1359 problem 2)",
            width,
            expected,
        );
    }

    #[test]
    fn tj_and_tj_array_bbox_widths_agree_for_ligature() {
        let tj_doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
        let tj_blocks = extract_text(&tj_doc);
        let tj_doc_arr = make_doc_with_ligature_font(b"BT /F1 12 Tf [<01>] TJ ET", "ffi");
        let tj_arr_blocks = extract_text(&tj_doc_arr);
        assert_eq!(tj_blocks.len(), 1);
        assert_eq!(tj_arr_blocks.len(), 1);
        let tj_w = tj_blocks[0].bbox[2] - tj_blocks[0].bbox[0];
        let tj_arr_w = tj_arr_blocks[0].bbox[2] - tj_arr_blocks[0].bbox[0];
        assert!(
            (tj_w - tj_arr_w).abs() < 1e-6,
            "Tj bbox width {} disagrees with TJ bbox width {} for the same \
             single-glyph ligature input (issue #1359 problem 2)",
            tj_w,
            tj_arr_w,
        );
    }
}