use crate::cache::MutexExt;
use crate::error::Result;
use regex::Regex;
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::{Arc, Mutex};
/// A span of source codes plus the code that the span resolves to.
///
/// Both bounds are treated as inclusive by `CMap::get`. `target` is only read
/// for notdef ranges, where it is the key looked up in the explicit character
/// table.
#[derive(Debug, Clone)]
struct RangeEntry {
    /// First code covered by the span (inclusive).
    start: u32,
    /// Last code covered by the span (inclusive).
    end: u32,
    /// Code whose explicit mapping stands in for every code of the span.
    target: u32,
}
/// Character map from source codes to Unicode strings.
///
/// Lookups consult `chars` first, then `ranges`, then `notdef_ranges`.
#[derive(Debug, Clone)]
pub struct CMap {
    /// Explicit code -> Unicode string mappings.
    chars: HashMap<u32, String>,
    /// Range entries; a code covered by one of these (and absent from `chars`)
    /// makes `get` return `None`.
    ranges: Vec<RangeEntry>,
    /// Fallback ranges; a covered code resolves through `chars[target]`.
    notdef_ranges: Vec<RangeEntry>,
}
impl CMap {
pub fn get(&self, code: &u32) -> Option<&String> {
if let Some(s) = self.chars.get(code) {
return Some(s);
}
for range in &self.ranges {
if range.start <= *code && *code <= range.end {
return None;
}
}
for range in &self.notdef_ranges {
if range.start <= *code && *code <= range.end {
if let Some(s) = self.chars.get(&range.target) {
return Some(s);
}
}
}
None
}
pub fn is_empty(&self) -> bool {
self.chars.is_empty() && self.ranges.is_empty() && self.notdef_ranges.is_empty()
}
pub fn len(&self) -> usize {
self.chars.len() + self.ranges.len() + self.notdef_ranges.len()
}
fn new() -> Self {
CMap {
chars: HashMap::new(),
ranges: Vec::new(),
notdef_ranges: Vec::new(),
}
}
fn insert(&mut self, code: u32, unicode: String) {
self.chars.insert(code, unicode);
}
}
/// Cache key for a CMap: the 64-bit hash of the raw stream bytes it was built from.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct CMapKey(u64);
/// Hashes the raw stream bytes with `DefaultHasher` and wraps the digest in a `CMapKey`.
fn compute_stream_hash(data: &[u8]) -> CMapKey {
    let mut state = DefaultHasher::new();
    Hash::hash(data, &mut state);
    let digest = state.finish();
    CMapKey(digest)
}
/// Capacity passed to the process-wide CMap cache when it is constructed.
const MAX_CMAP_CACHE_ENTRIES: usize = 1024;
lazy_static::lazy_static! {
// Process-wide cache of parsed CMaps, keyed by the hash of the raw stream
// bytes (`CMapKey`). Guarded by a mutex; callers access it through
// `lock_or_recover()`.
// NOTE(review): what happens once the capacity is reached is decided by
// `BoundedEntryCache`, which is not visible from this file.
static ref CMAP_CACHE: Mutex<crate::cache::BoundedEntryCache<CMapKey, Arc<CMap>>> =
Mutex::new(crate::cache::BoundedEntryCache::new(MAX_CMAP_CACHE_ENTRIES));
}
/// Removes every entry from the process-wide CMap cache.
pub fn clear_cmap_cache() {
    let mut cache = CMAP_CACHE.lock_or_recover();
    cache.clear();
}
/// Returns the number of entries currently held by the process-wide CMap cache.
pub fn cmap_cache_size() -> usize {
    let cache = CMAP_CACHE.lock_or_recover();
    cache.len()
}
/// A CMap that keeps its raw stream and parses it only when first requested.
///
/// Clones share the same `parsed` slot, so a parse performed through one clone
/// is visible to the others.
#[derive(Clone, Debug)]
pub struct LazyCMap {
    /// Unparsed CMap stream bytes.
    raw_stream: Vec<u8>,
    /// Hash of `raw_stream`, used as the key into the process-wide cache.
    cache_key: CMapKey,
    /// Parsed result, filled in on the first successful `get`.
    parsed: Arc<Mutex<Option<Arc<CMap>>>>,
}
impl LazyCMap {
    /// Wraps `raw_stream` without parsing it; only the cache key is computed here.
    pub fn new(raw_stream: Vec<u8>) -> Self {
        let cache_key = compute_stream_hash(&raw_stream);
        Self {
            raw_stream,
            cache_key,
            parsed: Arc::new(Mutex::new(None)),
        }
    }

    /// Returns the unparsed stream bytes.
    pub fn raw_data(&self) -> &[u8] {
        self.raw_stream.as_slice()
    }

    /// Returns the parsed CMap, parsing the stream on first use.
    ///
    /// Lookup order: this instance's own slot, then the process-wide cache,
    /// then a fresh parse whose result is stored in both places. The
    /// per-instance lock is held for the whole call, including the parse.
    ///
    /// Returns `None` when parsing fails; the failure is logged at `warn`
    /// level and nothing is cached, so a later call parses again.
    pub fn get(&self) -> Option<Arc<CMap>> {
        let mut local_slot = self.parsed.lock_or_recover();
        if let Some(ready) = local_slot.as_ref() {
            return Some(Arc::clone(ready));
        }

        // The global lock lives only for this block so it is released before
        // any parsing starts.
        {
            let mut shared_cache = CMAP_CACHE.lock_or_recover();
            if let Some(hit) = shared_cache.get(&self.cache_key) {
                let shared = Arc::clone(hit);
                *local_slot = Some(Arc::clone(&shared));
                log::debug!("CMap cache hit (global) for stream hash {:?}", self.cache_key);
                return Some(shared);
            }
        }

        let fresh = match parse_tounicode_cmap(&self.raw_stream) {
            Ok(table) => Arc::new(table),
            Err(e) => {
                log::warn!("Failed to parse lazy CMap: {}", e);
                return None;
            },
        };

        *local_slot = Some(Arc::clone(&fresh));
        {
            let mut shared_cache = CMAP_CACHE.lock_or_recover();
            shared_cache.insert(self.cache_key, Arc::clone(&fresh));
        }
        log::debug!("CMap parsed and cached (stream hash {:?})", self.cache_key);
        Some(fresh)
    }
}
/// Translates a named whitespace token into the character it stands for.
///
/// The token is trimmed, one surrounding `<...>` pair is removed if present,
/// and the remainder is compared case-insensitively (after trimming again)
/// against `space`, `tab`, `newline` and `carriage return`. Anything else
/// yields `None`.
fn parse_escape_sequence(token: &str) -> Option<String> {
    let trimmed = token.trim();
    let inner = trimmed
        .strip_prefix('<')
        .and_then(|rest| rest.strip_suffix('>'))
        .unwrap_or(trimmed);

    let normalized = inner.to_lowercase();
    let replacement = match normalized.trim() {
        "space" => " ",
        "tab" => "\t",
        "newline" => "\n",
        "carriage return" => "\r",
        _ => return None,
    };
    Some(replacement.to_string())
}
/// Decodes a 32-bit value as either a packed UTF-16 surrogate pair or a plain scalar.
///
/// When the upper 16 bits are a high surrogate and the lower 16 bits a low
/// surrogate, the pair is combined into one supplementary-plane character.
/// Otherwise the whole value is interpreted as a Unicode scalar. Returns `None`
/// when the resulting number is not a valid `char`.
fn decode_utf16_surrogate_pair(value: u32) -> Option<String> {
    let upper = (value >> 16) as u16;
    let lower = (value & 0xFFFF) as u16;

    let scalar = match (upper, lower) {
        (0xD800..=0xDBFF, 0xDC00..=0xDFFF) => {
            0x10000 + (u32::from(upper & 0x3FF) << 10) + u32::from(lower & 0x3FF)
        },
        _ => value,
    };

    char::from_u32(scalar).map(String::from)
}
/// Parses a ToUnicode CMap stream into a [`CMap`].
///
/// The bytes are decoded lossily as UTF-8 and then scanned section by section,
/// in this fixed order regardless of where the sections appear in the stream:
///
/// 1. every `beginbfchar … endbfchar` section,
/// 2. every `beginbfrange … endbfrange` section (later inserts overwrite
///    earlier ones for the same code),
/// 3. every `beginnotdefrange … endnotdefrange` section, which only fills
///    codes that have no mapping yet.
///
/// Each section is processed one line at a time. The visible code never
/// produces an `Err`; lines that cannot be parsed are skipped.
pub fn parse_tounicode_cmap(data: &[u8]) -> Result<CMap> {
    let text = String::from_utf8_lossy(data);
    let mut table = CMap::new();

    let bfchar_lines = extract_sections(&text, "beginbfchar", "endbfchar")
        .into_iter()
        .flat_map(|section| section.lines());
    for entry_line in bfchar_lines {
        for (code, unicode) in parse_bfchar_line(entry_line) {
            log::trace!("ToUnicode bfchar: 0x{:02X} -> {:?}", code, unicode);
            table.insert(code, unicode);
        }
    }

    let bfrange_lines = extract_sections(&text, "beginbfrange", "endbfrange")
        .into_iter()
        .flat_map(|section| section.lines());
    for entry_line in bfrange_lines {
        let mappings = match parse_bfrange_line(entry_line) {
            Some(found) => found,
            None => continue,
        };
        log::trace!("ToUnicode bfrange: {} mappings parsed", mappings.len());
        for (code, unicode) in mappings {
            table.insert(code, unicode);
        }
    }

    let notdef_lines = extract_sections(&text, "beginnotdefrange", "endnotdefrange")
        .into_iter()
        .flat_map(|section| section.lines());
    for entry_line in notdef_lines {
        let mappings = match parse_notdefrange_line(entry_line) {
            Some(found) => found,
            None => continue,
        };
        log::trace!("ToUnicode notdefrange: {} mappings parsed", mappings.len());
        for (code, unicode) in mappings {
            // Notdef entries never replace a mapping that already exists.
            table.chars.entry(code).or_insert(unicode);
        }
    }

    Ok(table)
}
/// Collects the text between each `begin` marker and the next `end` marker.
///
/// Scanning resumes right after every `end` marker, so sections are returned
/// in document order and never overlap. A `begin` without a following `end`
/// stops the scan and contributes nothing.
fn extract_sections<'a>(content: &'a str, begin: &str, end: &str) -> Vec<&'a str> {
    let mut found = Vec::new();
    let mut cursor = content;

    while let Some((_, body)) = cursor.split_once(begin) {
        match body.split_once(end) {
            Some((section, rest)) => {
                found.push(section);
                cursor = rest;
            },
            None => break,
        }
    }

    found
}
/// Parses every `<src> <dst>` pair found on one line of a `bfchar` section.
///
/// Each pair is handled independently: a pair whose source or destination
/// cannot be decoded is dropped without affecting the others. Whitespace inside
/// the angle brackets is ignored.
///
/// The destination is decoded as follows:
/// * a named escape recognised by `parse_escape_sequence` maps to that character;
/// * up to 6 hex digits are read as a single Unicode scalar;
/// * exactly 8 hex digits are tried as a surrogate pair / scalar via
///   `decode_utf16_surrogate_pair`, falling back to two 16-bit code units;
/// * any other length is read as a run of 4-digit UTF-16 code units (the last
///   group may be shorter).
///
/// Returns the decoded `(source code, unicode string)` pairs in line order.
fn parse_bfchar_line(line: &str) -> Vec<(u32, String)> {
    lazy_static::lazy_static! {
        static ref RE: Regex = Regex::new(r"<([^>]*)>\s*<([^>]*)>").unwrap();
    }

    /// Decodes `hex` as a run of 4-digit UTF-16 code units.
    ///
    /// Chunking is done on bytes rather than by slicing the `str`: the text
    /// comes from a lossy UTF-8 conversion and may contain multi-byte
    /// characters, and slicing at a fixed byte offset inside one would panic.
    /// Groups that are not valid hex are skipped; surrogate pairs are combined
    /// and unpaired surrogates are dropped. Returns `None` if nothing decodes.
    fn decode_hex_groups(hex: &str) -> Option<String> {
        let units = hex
            .as_bytes()
            .chunks(4)
            .filter_map(|group| std::str::from_utf8(group).ok())
            .filter_map(|digits| u16::from_str_radix(digits, 16).ok());
        let text: String = char::decode_utf16(units)
            .filter_map(|unit| unit.ok())
            .collect();
        if text.is_empty() {
            None
        } else {
            Some(text)
        }
    }

    /// Decodes the (already trimmed) content of a destination `<...>` token.
    fn decode_destination(dst_str: &str) -> Option<String> {
        if let Some(escape) = parse_escape_sequence(&format!("<{}>", dst_str)) {
            return Some(escape);
        }

        let dst_hex = dst_str.replace(char::is_whitespace, "");
        match dst_hex.len() {
            // Short destinations are a single scalar value.
            0..=6 => {
                let dst_code = u32::from_str_radix(&dst_hex, 16).ok()?;
                char::from_u32(dst_code).map(|ch| ch.to_string())
            },
            // Eight digits: surrogate pair or scalar first, then two code units
            // (e.g. the ligature `00660069` -> "fi").
            8 => {
                let dst_code = u32::from_str_radix(&dst_hex, 16).ok()?;
                decode_utf16_surrogate_pair(dst_code).or_else(|| decode_hex_groups(&dst_hex))
            },
            _ => decode_hex_groups(&dst_hex),
        }
    }

    RE.captures_iter(line)
        .filter_map(|caps| {
            let src_hex = caps[1].replace(char::is_whitespace, "");
            let src = u32::from_str_radix(&src_hex, 16).ok()?;
            let dst = decode_destination(caps[2].trim())?;
            Some((src, dst))
        })
        .collect()
}
/// Parses one line of a `bfrange` section into `(source code, unicode string)` pairs.
///
/// Two forms are recognised, tried in this order:
///
/// * **Array form** `<start> <end> [<d0> <d1> …]` — the n-th array entry is the
///   destination for code `start + n`. Entries beyond the range are ignored and
///   a size mismatch is logged at `warn` level. An entry that cannot be decoded
///   is skipped; the remaining entries are still returned.
/// * **Sequential form** `<start> <end> <dst>` — code `start + i` maps to the
///   character `dst + i`. At most 10001 codes are expanded per line, and
///   offsets whose destination is not a valid `char` are skipped.
///
/// In both forms an inverted range (`end < start`) is treated as covering only
/// `start`. Whitespace inside the angle brackets is ignored.
///
/// Returns `None` when the line matches neither form or when `start`, `end`
/// (or, in the sequential form, `dst`) is not a hex number that fits in `u32`.
fn parse_bfrange_line(line: &str) -> Option<Vec<(u32, String)>> {
    lazy_static::lazy_static! {
        static ref RE_SEQ: Regex = Regex::new(
            r"<([^>]*)>\s*<([^>]*)>\s*<([^>]*)>"
        ).unwrap();
        static ref RE_ARRAY: Regex = Regex::new(
            r"<([^>]*)>\s*<([^>]*)>\s*\[((?:\s*<[^>]+>\s*)+)\]"
        ).unwrap();
        static ref RE_HEX: Regex = Regex::new(r"<([^>]*)>").unwrap();
    }

    /// Removes every whitespace character from a bracketed token's content.
    fn strip_whitespace(raw: &str) -> String {
        raw.replace(char::is_whitespace, "")
    }

    /// Decodes `hex` as a run of 4-digit UTF-16 code units.
    ///
    /// Chunking is done on bytes rather than by slicing the `str`: the text
    /// comes from a lossy UTF-8 conversion and may contain multi-byte
    /// characters, and slicing at a fixed byte offset inside one would panic.
    /// Groups that are not valid hex are skipped; surrogate pairs are combined
    /// and unpaired surrogates are dropped. Returns `None` if nothing decodes.
    fn decode_hex_groups(hex: &str) -> Option<String> {
        let units = hex
            .as_bytes()
            .chunks(4)
            .filter_map(|group| std::str::from_utf8(group).ok())
            .filter_map(|digits| u16::from_str_radix(digits, 16).ok());
        let text: String = char::decode_utf16(units)
            .filter_map(|unit| unit.ok())
            .collect();
        if text.is_empty() {
            None
        } else {
            Some(text)
        }
    }

    /// Decodes one whitespace-free array entry into its Unicode string.
    fn decode_destination(dst_hex: &str) -> Option<String> {
        match dst_hex.len() {
            // Short destinations are a single scalar value.
            0..=6 => {
                let dst_code = u32::from_str_radix(dst_hex, 16).ok()?;
                char::from_u32(dst_code).map(|ch| ch.to_string())
            },
            // Eight digits: surrogate pair or scalar first, then two code units.
            8 => {
                let dst_code = u32::from_str_radix(dst_hex, 16).ok()?;
                decode_utf16_surrogate_pair(dst_code).or_else(|| decode_hex_groups(dst_hex))
            },
            _ => decode_hex_groups(dst_hex),
        }
    }

    if let Some(caps) = RE_ARRAY.captures(line) {
        let start = u32::from_str_radix(&strip_whitespace(&caps[1]), 16).ok()?;
        let end = u32::from_str_radix(&strip_whitespace(&caps[2]), 16).ok()?;

        let dst_hexes: Vec<String> = RE_HEX
            .captures_iter(&caps[3])
            .map(|cap| strip_whitespace(&cap[1]))
            .filter(|hex| !hex.is_empty())
            .collect();

        // `span` is the largest valid offset from `start`. Saturating keeps an
        // inverted range from underflowing, and counting in u64 keeps the
        // full-width range 0..=u32::MAX from overflowing.
        let span = end.saturating_sub(start);
        let expected = u64::from(span) + 1;
        if dst_hexes.len() as u64 != expected {
            log::warn!(
                "ToUnicode bfrange array size mismatch: expected {} entries for range 0x{:X}-0x{:X}, got {}",
                expected,
                start,
                end,
                dst_hexes.len()
            );
        }

        // `offset <= span <= end - start`, so `start + offset` cannot overflow.
        let result = (0..=span)
            .zip(dst_hexes.iter())
            .filter_map(|(offset, hex)| decode_destination(hex).map(|dst| (start + offset, dst)))
            .collect();
        return Some(result);
    }

    let caps = RE_SEQ.captures(line)?;
    let start = u32::from_str_radix(&strip_whitespace(&caps[1]), 16).ok()?;
    let end = u32::from_str_radix(&strip_whitespace(&caps[2]), 16).ok()?;
    let dst_start = u32::from_str_radix(&strip_whitespace(&caps[3]), 16).ok()?;

    // Cap the expansion so a hostile range cannot allocate without bound.
    let span = end.saturating_sub(start).min(10000);

    // A destination above 0xFFFF may be a packed surrogate pair; if so the
    // range counts up from the combined code point.
    let base_cp = if dst_start > 0xFFFF {
        decode_utf16_surrogate_pair(dst_start)
            .and_then(|decoded| decoded.chars().next())
            .map_or(dst_start, u32::from)
    } else {
        dst_start
    };

    let result = (0..=span)
        .filter_map(|offset| {
            // checked_add: a destination near u32::MAX must not wrap around
            // into low code points.
            let ch = char::from_u32(base_cp.checked_add(offset)?)?;
            Some((start + offset, ch.to_string()))
        })
        .collect();
    Some(result)
}
/// Parses one `<start> <end> <dst>` line of a `notdefrange` section.
///
/// Every code of the range maps to the same destination string. The
/// destination is either a named escape recognised by `parse_escape_sequence`
/// or a hex value; values above 0xFFFF are first tried as a packed surrogate
/// pair. At most 10001 codes are expanded per line.
///
/// Returns `None` when the line does not match or any of the three values
/// cannot be decoded.
fn parse_notdefrange_line(line: &str) -> Option<Vec<(u32, String)>> {
    lazy_static::lazy_static! {
        static ref RE_SEQ: Regex = Regex::new(
            r"<([^>]*)>\s*<([^>]*)>\s*<([^>]*)>"
        ).unwrap();
    }

    let caps = RE_SEQ.captures(line)?;
    let first_hex = caps[1].trim().replace(char::is_whitespace, "");
    let last_hex = caps[2].trim().replace(char::is_whitespace, "");
    let target = caps[3].trim();

    let start = u32::from_str_radix(&first_hex, 16).ok()?;
    let end = u32::from_str_radix(&last_hex, 16).ok()?;

    let dst = match parse_escape_sequence(&format!("<{}>", target)) {
        Some(escape) => escape,
        None => {
            let target_hex = target.replace(char::is_whitespace, "");
            let code = u32::from_str_radix(&target_hex, 16).ok()?;
            if code > 0xFFFF {
                decode_utf16_surrogate_pair(code)
                    .or_else(|| char::from_u32(code).map(|ch| ch.to_string()))?
            } else {
                char::from_u32(code)?.to_string()
            }
        },
    };

    // An inverted range saturates to a single code; long ranges are capped.
    let span = end.saturating_sub(start).min(10000);
    let expanded = (0..=span)
        .map(|offset| (start.wrapping_add(offset), dst.clone()))
        .collect();
    Some(expanded)
}
/// Parses CID-to-Unicode CMap data.
///
/// Delegates directly to `parse_tounicode_cmap`; both entry points accept the
/// same input and return the same result.
pub fn parse_cid_to_unicode(data: &[u8]) -> Result<CMap> {
parse_tounicode_cmap(data)
}
// Unit tests for the ToUnicode CMap parser: whole-stream parsing through
// `parse_tounicode_cmap` plus direct tests of the line-level helpers.
#[cfg(test)]
mod tests {
use super::*;
// A single bfchar entry is parsed and retrievable.
#[test]
fn test_parse_bfchar_single() {
let data = b"beginbfchar\n<0041> <0041>\nendbfchar";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x41), Some(&"A".to_string()));
}
// Several bfchar entries on separate lines are all parsed.
#[test]
fn test_parse_bfchar_multiple() {
let data = b"beginbfchar\n<0041> <0041>\n<0042> <0042>\n<0043> <0043>\nendbfchar";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x41), Some(&"A".to_string()));
assert_eq!(cmap.get(&0x42), Some(&"B".to_string()));
assert_eq!(cmap.get(&0x43), Some(&"C".to_string()));
}
// A destination outside ASCII (U+00E9) decodes to the right character.
#[test]
fn test_parse_bfchar_non_ascii() {
let data = b"beginbfchar\n<00E9> <00E9>\nendbfchar"; let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0xE9), Some(&"é".to_string()));
}
// Sequential bfrange: each code maps to the destination plus its offset.
#[test]
fn test_parse_bfrange_simple() {
let data = b"beginbfrange\n<0041> <0043> <0041>\nendbfrange";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x41), Some(&"A".to_string()));
assert_eq!(cmap.get(&0x42), Some(&"B".to_string()));
assert_eq!(cmap.get(&0x43), Some(&"C".to_string()));
}
// A bfrange spanning the printable ASCII block, checked at both ends and in between.
#[test]
fn test_parse_bfrange_ascii_printable() {
let data = b"beginbfrange\n<0020> <007E> <0020>\nendbfrange";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x20), Some(&" ".to_string()));
assert_eq!(cmap.get(&0x30), Some(&"0".to_string()));
assert_eq!(cmap.get(&0x41), Some(&"A".to_string()));
assert_eq!(cmap.get(&0x7A), Some(&"z".to_string()));
assert_eq!(cmap.get(&0x7E), Some(&"~".to_string()));
}
// bfchar and bfrange sections in the same stream both contribute mappings.
#[test]
fn test_parse_mixed_bfchar_bfrange() {
let data = b"beginbfchar\n<0041> <0058>\nendbfchar\nbeginbfrange\n<0042> <0044> <0042>\nendbfrange";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x41), Some(&"X".to_string())); assert_eq!(cmap.get(&0x42), Some(&"B".to_string())); assert_eq!(cmap.get(&0x43), Some(&"C".to_string()));
assert_eq!(cmap.get(&0x44), Some(&"D".to_string()));
}
// Empty input parses successfully into an empty map.
#[test]
fn test_parse_empty_cmap() {
let data = b"";
let cmap = parse_tounicode_cmap(data).unwrap();
assert!(cmap.is_empty());
}
// Leading, trailing and repeated whitespace around tokens is tolerated.
#[test]
fn test_parse_cmap_with_whitespace() {
let data = b"beginbfchar\n <0041> <0041> \n <0042> <0042>\nendbfchar";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x41), Some(&"A".to_string()));
assert_eq!(cmap.get(&0x42), Some(&"B".to_string()));
}
// Direct test of the bfchar line parser, including a line with no pairs.
#[test]
fn test_parse_bfchar_line() {
assert_eq!(parse_bfchar_line("<0041> <0041>"), vec![(0x41, "A".to_string())]);
assert_eq!(parse_bfchar_line("<00E9> <00E9>"), vec![(0xE9, "é".to_string())]);
assert!(parse_bfchar_line("invalid line").is_empty());
}
// Several pairs on one line are returned in order.
#[test]
fn test_parse_bfchar_multiple_pairs_per_line() {
let result = parse_bfchar_line("<01> <0041> <02> <0042> <03> <0043>");
assert_eq!(result.len(), 3);
assert_eq!(result[0], (0x01, "A".to_string()));
assert_eq!(result[1], (0x02, "B".to_string()));
assert_eq!(result[2], (0x03, "C".to_string()));
}
// Direct test of the sequential bfrange form.
#[test]
fn test_parse_bfrange_line() {
let result = parse_bfrange_line("<0041> <0043> <0041>").unwrap();
assert_eq!(result.len(), 3);
assert_eq!(result[0], (0x41, "A".to_string()));
assert_eq!(result[1], (0x42, "B".to_string()));
assert_eq!(result[2], (0x43, "C".to_string()));
}
// A range whose start equals its end yields exactly one mapping.
#[test]
fn test_parse_bfrange_line_single_char() {
let result = parse_bfrange_line("<0041> <0041> <0041>").unwrap();
assert_eq!(result.len(), 1);
assert_eq!(result[0], (0x41, "A".to_string()));
}
// A line matching neither bfrange form yields None.
#[test]
fn test_parse_bfrange_line_invalid() {
assert!(parse_bfrange_line("invalid").is_none());
}
// Two begin/end pairs in one document produce two sections in order.
#[test]
fn test_extract_sections() {
let content =
"before\nbeginbfchar\ndata1\nendbfchar\nmiddle\nbeginbfchar\ndata2\nendbfchar\nafter";
let sections = extract_sections(content, "beginbfchar", "endbfchar");
assert_eq!(sections.len(), 2);
assert!(sections[0].contains("data1"));
assert!(sections[1].contains("data2"));
}
// Content without markers produces no sections.
#[test]
fn test_extract_sections_none() {
let content = "no sections here";
let sections = extract_sections(content, "beginbfchar", "endbfchar");
assert_eq!(sections.len(), 0);
}
// The CID-to-Unicode entry point parses the same input as the ToUnicode one.
#[test]
fn test_parse_cid_to_unicode() {
let data = b"beginbfchar\n<0041> <0041>\nendbfchar";
let cmap = parse_cid_to_unicode(data).unwrap();
assert_eq!(cmap.get(&0x41), Some(&"A".to_string()));
}
// Hex digits are accepted in mixed upper and lower case.
#[test]
fn test_parse_hex_case_insensitive() {
let data = b"beginbfchar\n<00aB> <00Ab>\nendbfchar";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0xAB), Some(&"«".to_string()));
}
// Repeated bfchar sections are merged into one map.
#[test]
fn test_parse_multiple_sections() {
let data = b"beginbfchar\n<0041> <0041>\nendbfchar\nbeginbfchar\n<0042> <0042>\nendbfchar";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.len(), 2);
assert_eq!(cmap.get(&0x41), Some(&"A".to_string()));
assert_eq!(cmap.get(&0x42), Some(&"B".to_string()));
}
// An 8-digit destination that is not a surrogate pair decodes as two characters.
#[test]
fn test_parse_bfchar_ligature() {
let data = b"beginbfchar\n<000C> <00660069>\nendbfchar"; let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x0C), Some(&"fi".to_string()));
}
// Several two-character ligature destinations in one section.
#[test]
fn test_parse_bfchar_multiple_ligatures() {
let data =
b"beginbfchar\n<000B> <00660066>\n<000C> <00660069>\n<000D> <0066006C>\nendbfchar";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x0B), Some(&"ff".to_string())); assert_eq!(cmap.get(&0x0C), Some(&"fi".to_string())); assert_eq!(cmap.get(&0x0D), Some(&"fl".to_string())); }
// Array-form bfrange whose entries are multi-character strings, including a 12-digit one.
#[test]
fn test_parse_bfrange_array_ligatures() {
let data =
b"beginbfrange\n<005F> <0061> [<00660066> <00660069> <00660066006C>]\nendbfrange";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x5F), Some(&"ff".to_string())); assert_eq!(cmap.get(&0x60), Some(&"fi".to_string())); assert_eq!(cmap.get(&0x61), Some(&"ffl".to_string())); }
// Array-form bfrange mixing single-character and multi-character entries.
#[test]
fn test_parse_bfrange_array_mixed() {
let data = b"beginbfrange\n<0010> <0012> [<0041> <00660069> <0043>]\nendbfrange";
let cmap = parse_tounicode_cmap(data).unwrap();
assert_eq!(cmap.get(&0x10), Some(&"A".to_string())); assert_eq!(cmap.get(&0x11), Some(&"fi".to_string())); assert_eq!(cmap.get(&0x12), Some(&"C".to_string())); }
// A complete CMap program with interleaved bfrange and bfchar sections and
// the surrounding PostScript boilerplate; spot-checks three bfchar entries.
#[test]
fn test_parse_zekat_cmap() {
let cmap_data = r#"
/CIDInit /ProcSet findresource begin
19 dict begin
begincmap
/CIDSystemInfo
<< /Registry (Adobe)
/Ordering (UCS)
/Supplement 0
>> def
/CMapName /Adobe-Identity-UCS def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
1 beginbfrange
<0003> <0004> <0020>
endbfrange
3 beginbfchar
<000F> <002C>
<0011> <002E>
<0024> <0041>
endbfchar
1 beginbfrange
<0027> <0029> <0044>
endbfrange
2 beginbfchar
<002C> <0049>
<002E> <004B>
endbfchar
2 beginbfrange
<0030> <0032> <004D>
<0035> <0037> <0052>
endbfrange
2 beginbfchar
<0039> <0056>
<003D> <005A>
endbfchar
5 beginbfrange
<0044> <0048> <0061>
<004A> <004C> <0067>
<004E> <0053> <006B>
<0055> <0059> <0072>
<005C> <005D> <0079>
endbfrange
5 beginbfchar
<006B> <00E2>
<006F> <00E7>
<007C> <00F6>
<0081> <00FC>
<00AB> <2026>
endbfchar
1 beginbfrange
<00B3> <00B4> <201C>
endbfrange
4 beginbfchar
<00C6> <00C2>
<00D5> <0131>
<00F7> <011F>
<00FA> <015F>
endbfchar
endcmap
CMapName currentdict /CMap defineresource pop
end
end
"#
.as_bytes();
let cmap = parse_tounicode_cmap(cmap_data).expect("Failed to parse CMap");
assert_eq!(cmap.get(&0x3D), Some(&"Z".to_string()));
assert_eq!(cmap.get(&0x24), Some(&"A".to_string()));
assert_eq!(cmap.get(&0xC6), Some(&"\u{00C2}".to_string())); }
}