use std::collections::HashMap;
/// Decodes a raw byte string with no font/encoding information available.
///
/// Tries, in order:
/// 1. UTF-16BE if the bytes start with a `FE FF` BOM,
/// 2. UTF-16LE if the bytes start with a `FF FE` BOM,
/// 3. UTF-8,
/// 4. Latin-1 fallback (each byte mapped to U+0000..U+00FF),
/// so that every input yields *some* string. An odd trailing byte in the
/// UTF-16 paths is ignored; invalid UTF-16 yields an empty string.
pub fn decode_text_simple(bytes: &[u8]) -> String {
    if bytes.len() >= 2 {
        // Helper: decode the payload after the BOM as UTF-16 in the
        // indicated byte order. `chunks_exact(2)` drops a dangling byte.
        let decode_utf16 = |payload: &[u8], big_endian: bool| -> String {
            let units: Vec<u16> = payload
                .chunks_exact(2)
                .map(|c| {
                    if big_endian {
                        u16::from_be_bytes([c[0], c[1]])
                    } else {
                        u16::from_le_bytes([c[0], c[1]])
                    }
                })
                .collect();
            String::from_utf16(&units).unwrap_or_default()
        };
        if bytes[0] == 0xFE && bytes[1] == 0xFF {
            return decode_utf16(&bytes[2..], true);
        }
        if bytes[0] == 0xFF && bytes[1] == 0xFE {
            return decode_utf16(&bytes[2..], false);
        }
    }
    if let Ok(s) = String::from_utf8(bytes.to_vec()) {
        return s;
    }
    // Lossless Latin-1 fallback: every byte is a valid codepoint.
    bytes.iter().map(|&b| b as char).collect()
}
/// Maps character codes (1- or 2-byte) to Unicode strings, as extracted
/// from a PDF `/ToUnicode` CMap or an embedded font `cmap` table.
#[derive(Debug, Clone)]
pub(crate) struct ToUnicodeMap {
    /// Number of bytes per character code (1 or 2).
    pub(crate) code_width: usize,
    /// Character code → Unicode string.
    pub(crate) mappings: HashMap<u32, String>,
}

impl ToUnicodeMap {
    /// Decodes a raw string using this map; codes with no mapping are
    /// silently dropped. With `code_width == 2`, a trailing odd byte is
    /// looked up as a single-byte code.
    pub(crate) fn decode(&self, bytes: &[u8]) -> String {
        let mut out = String::new();
        let mut pos = 0;
        while pos < bytes.len() {
            // Pick the code and how many bytes it consumed.
            let (code, step) = if self.code_width == 2 && pos + 1 < bytes.len() {
                ((u32::from(bytes[pos]) << 8) | u32::from(bytes[pos + 1]), 2)
            } else {
                (u32::from(bytes[pos]), 1)
            };
            if let Some(mapped) = self.mappings.get(&code) {
                out.push_str(mapped);
            }
            pos += step;
        }
        out
    }
}
/// Parses a hex string (no prefix) into a `u32`; `None` on invalid input.
fn parse_hex(s: &str) -> Option<u32> {
    match u32::from_str_radix(s, 16) {
        Ok(value) => Some(value),
        Err(_) => None,
    }
}
/// Strips replacement characters, Unicode noncharacters, and private-use
/// codepoints from `s`. Returns `None` when nothing printable remains.
fn sanitize_unicode(mut s: String) -> Option<String> {
    s.retain(|c| {
        let cp = c as u32;
        // Reject: U+FFFD replacement char, the U+FFFE/U+FFFF noncharacters,
        // the U+FDD0..U+FDEF noncharacter block, the trailing two
        // noncharacters of every supplementary plane, and all three
        // private-use areas (BMP, plane 15, plane 16).
        !(cp == 0xFFFD
            || cp == 0xFFFE
            || cp == 0xFFFF
            || (0xFDD0..=0xFDEF).contains(&cp)
            || (cp >= 0x10000 && (cp & 0xFFFF) >= 0xFFFE)
            || (0xE000..=0xF8FF).contains(&cp)
            || (0xF0000..=0xFFFFD).contains(&cp)
            || (0x100000..=0x10FFFD).contains(&cp))
    });
    if s.is_empty() {
        None
    } else {
        Some(s)
    }
}
/// Converts the hex destination of a bfchar/bfrange entry to a Unicode
/// string, then sanitizes it (see [`sanitize_unicode`]).
///
/// A bare two-digit value is treated as a single codepoint (e.g. `41` →
/// "A"); anything else is read as UTF-16BE code units of four hex digits
/// each, so surrogate pairs decode to supplementary-plane characters.
/// A trailing partial unit is ignored.
fn hex_to_unicode(hex: &str) -> Option<String> {
    // Reject non-ASCII input up front: the byte-indexed slicing below
    // (`&hex[i..i + 4]`) would panic when an index falls inside a
    // multi-byte UTF-8 character.
    if !hex.is_ascii() {
        return None;
    }
    // Note: the original condition `hex.len() % 4 != 0 && hex.len() == 2`
    // reduces to `hex.len() == 2` (the first clause was redundant).
    if hex.len() == 2 {
        let cp = u32::from_str_radix(hex, 16).ok()?;
        let s = char::from_u32(cp).map(|c| c.to_string())?;
        return sanitize_unicode(s);
    }
    let mut units = Vec::with_capacity(hex.len() / 4);
    let mut i = 0;
    while i + 4 <= hex.len() {
        let val = u16::from_str_radix(&hex[i..i + 4], 16).ok()?;
        units.push(val);
        i += 4;
    }
    let s = String::from_utf16(&units).ok()?;
    sanitize_unicode(s)
}
/// Parses a PDF `/ToUnicode` CMap stream into a [`ToUnicodeMap`].
///
/// Reads the code width from the first `begincodespacerange` entry, then
/// collects `bfchar` (code → string) and `bfrange` (range → consecutive
/// codepoints, or explicit `[...]` array) mappings. Parsing is line-based
/// and lossy: a `bfrange` array spread over several lines is not
/// supported. Returns `None` when no mapping could be extracted.
pub(crate) fn parse_to_unicode_cmap(data: &[u8]) -> Option<ToUnicodeMap> {
    // Cap on a single bfrange span: a malformed range such as
    // <00000000> <FFFFFFFF> must not make us iterate billions of codes.
    const MAX_RANGE_SPAN: u32 = 0xFFFF;
    let text = String::from_utf8_lossy(data);
    let mut mappings = HashMap::new();
    let mut code_width: usize = 2;
    // Infer the code width (bytes per code) from the first <...> entry of
    // the codespacerange: <00> → 1 byte, <0000> → 2 bytes.
    if let Some(cs_start) = text.find("begincodespacerange") {
        if let Some(cs_end) = text[cs_start..].find("endcodespacerange") {
            let cs_block = &text[cs_start..cs_start + cs_end];
            if let Some(first_angle) = cs_block.find('<') {
                if let Some(close_angle) = cs_block[first_angle..].find('>') {
                    // `close_angle` is relative to the '<', so the hex
                    // digits between the brackets number `close_angle - 1`.
                    let hex_len = close_angle - 1;
                    code_width = hex_len / 2;
                    if code_width == 0 {
                        code_width = 1;
                    }
                }
            }
        }
    }
    // bfchar sections: one `<src> <dst>` pair per line.
    let mut search_pos = 0;
    while let Some(start) = text[search_pos..].find("beginbfchar") {
        let block_start = search_pos + start + "beginbfchar".len();
        if let Some(end) = text[block_start..].find("endbfchar") {
            let block = &text[block_start..block_start + end];
            for line in block.lines() {
                let line = line.trim();
                if line.is_empty() {
                    continue;
                }
                let parts: Vec<&str> = line
                    .split(['<', '>'])
                    .filter(|s| !s.trim().is_empty())
                    .collect();
                if parts.len() >= 2 {
                    if let Some(code) = parse_hex(parts[0].trim()) {
                        if let Some(unicode_str) = hex_to_unicode(parts[1].trim()) {
                            mappings.insert(code, unicode_str);
                        }
                    }
                }
            }
            search_pos = block_start + end;
        } else {
            break;
        }
    }
    // bfrange sections: `<lo> <hi> <dstStart>` maps to consecutive
    // destinations; `<lo> <hi> [<dst0> <dst1> ...]` lists them explicitly.
    search_pos = 0;
    while let Some(start) = text[search_pos..].find("beginbfrange") {
        let block_start = search_pos + start + "beginbfrange".len();
        if let Some(end) = text[block_start..].find("endbfrange") {
            let block = &text[block_start..block_start + end];
            for line in block.lines() {
                let line = line.trim();
                if line.is_empty() {
                    continue;
                }
                if line.contains('[') {
                    let parts: Vec<&str> = line
                        .split(['<', '>', '[', ']'])
                        .filter(|s| !s.trim().is_empty())
                        .collect();
                    if parts.len() >= 3 {
                        if let (Some(lo), Some(hi)) =
                            (parse_hex(parts[0].trim()), parse_hex(parts[1].trim()))
                        {
                            // Guard against huge/reversed ranges before
                            // iterating; previously a hostile span still
                            // looped hi - lo + 1 times.
                            if hi >= lo && hi - lo <= MAX_RANGE_SPAN {
                                for (i, code) in (lo..=hi).enumerate() {
                                    if let Some(unicode_str) =
                                        parts.get(2 + i).and_then(|h| hex_to_unicode(h.trim()))
                                    {
                                        mappings.insert(code, unicode_str);
                                    }
                                }
                            }
                        }
                    }
                } else {
                    let parts: Vec<&str> = line
                        .split(['<', '>'])
                        .filter(|s| !s.trim().is_empty())
                        .collect();
                    if parts.len() >= 3 {
                        if let (Some(lo), Some(hi), Some(dst_start)) = (
                            parse_hex(parts[0].trim()),
                            parse_hex(parts[1].trim()),
                            parse_hex(parts[2].trim()),
                        ) {
                            if hi >= lo && hi - lo <= MAX_RANGE_SPAN {
                                for (i, code) in (lo..=hi).enumerate() {
                                    // checked_add: the destination must not
                                    // wrap past u32::MAX (debug-build panic
                                    // in the original).
                                    let dst = match dst_start.checked_add(i as u32) {
                                        Some(d) => d,
                                        None => break,
                                    };
                                    if let Some(c) = char::from_u32(dst) {
                                        if let Some(s) = sanitize_unicode(c.to_string()) {
                                            mappings.insert(code, s);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            search_pos = block_start + end;
        } else {
            break;
        }
    }
    if mappings.is_empty() {
        return None;
    }
    // Many CMaps declare 2-byte codes but only ever use codes <= 0xFF;
    // demote to 1-byte decoding so ToUnicodeMap::decode matches reality.
    if code_width == 2 {
        let max_key = mappings.keys().copied().max().unwrap_or(0);
        if max_key <= 0xFF {
            code_width = 1;
        }
    }
    Some(ToUnicodeMap {
        code_width,
        mappings,
    })
}
/// Extracts a GID → Unicode map from an embedded TrueType/OpenType font by
/// locating its `cmap` table and parsing a format-4 or format-12 subtable.
///
/// All offsets and lengths come from untrusted font data, so every access
/// is bounds-checked against the *actual* buffer, never only the declared
/// table length.
pub(crate) fn parse_truetype_cmap_table(data: &[u8]) -> Option<ToUnicodeMap> {
    if data.len() < 12 {
        return None;
    }
    // Table directory: 12-byte header followed by 16-byte table records.
    let num_tables = u16::from_be_bytes([data[4], data[5]]) as usize;
    let mut cmap_offset = 0u32;
    let mut cmap_length = 0u32;
    for i in 0..num_tables {
        let record_offset = 12 + i * 16;
        if record_offset + 16 > data.len() {
            break;
        }
        let tag = &data[record_offset..record_offset + 4];
        if tag == b"cmap" {
            cmap_offset = u32::from_be_bytes([
                data[record_offset + 8],
                data[record_offset + 9],
                data[record_offset + 10],
                data[record_offset + 11],
            ]);
            cmap_length = u32::from_be_bytes([
                data[record_offset + 12],
                data[record_offset + 13],
                data[record_offset + 14],
                data[record_offset + 15],
            ]);
            break;
        }
    }
    if cmap_offset == 0 || cmap_offset as usize + 4 > data.len() {
        return None;
    }
    let cmap = &data[cmap_offset as usize..];
    // Clamp the declared table length to the bytes actually present; a
    // lying `cmap_length` previously let the record loop and the subtable
    // slice below index past the buffer and panic on truncated fonts.
    let cmap_len = (cmap_length as usize).min(cmap.len());
    if cmap_len < 4 {
        return None;
    }
    let num_subtables = u16::from_be_bytes([cmap[2], cmap[3]]) as usize;
    // Choose the best Unicode-capable subtable: (3,1) Windows/Unicode BMP,
    // then (0,3) Unicode 2.0 BMP, then any other Unicode-platform entry.
    let mut best_offset: Option<u32> = None;
    let mut best_priority = 0u8;
    for i in 0..num_subtables {
        let rec = 4 + i * 8;
        if rec + 8 > cmap_len {
            break;
        }
        let platform_id = u16::from_be_bytes([cmap[rec], cmap[rec + 1]]);
        let encoding_id = u16::from_be_bytes([cmap[rec + 2], cmap[rec + 3]]);
        let offset =
            u32::from_be_bytes([cmap[rec + 4], cmap[rec + 5], cmap[rec + 6], cmap[rec + 7]]);
        let priority = match (platform_id, encoding_id) {
            (3, 1) => 3,
            (0, 3) => 2,
            (0, _) => 1,
            _ => 0,
        };
        if priority > best_priority {
            best_priority = priority;
            best_offset = Some(offset);
        }
    }
    let subtable_offset = best_offset? as usize;
    // Check against the real slice length so `&cmap[subtable_offset..]`
    // cannot panic.
    if subtable_offset + 2 > cmap.len() {
        return None;
    }
    let subtable = &cmap[subtable_offset..];
    let format = u16::from_be_bytes([subtable[0], subtable[1]]);
    let unicode_to_gid = match format {
        4 => parse_cmap_format4(subtable)?,
        12 => parse_cmap_format12(subtable)?,
        _ => {
            log::debug!("Unsupported cmap subtable format {}", format);
            return None;
        }
    };
    // Invert to GID → Unicode, keeping the first codepoint seen per glyph
    // and dropping unsanitary codepoints (see `sanitize_unicode`).
    let mut gid_to_unicode: HashMap<u32, String> = HashMap::new();
    for (unicode_cp, gid) in &unicode_to_gid {
        if *gid > 0 {
            if let Some(s) = char::from_u32(*unicode_cp)
                .map(|c| c.to_string())
                .and_then(sanitize_unicode)
            {
                gid_to_unicode.entry(u32::from(*gid)).or_insert(s);
            }
        }
    }
    if gid_to_unicode.is_empty() {
        return None;
    }
    log::debug!(
        "Parsed embedded TrueType cmap: {} GID→Unicode mappings",
        gid_to_unicode.len()
    );
    // Glyph ids are looked up as 2-byte codes by ToUnicodeMap::decode.
    Some(ToUnicodeMap {
        code_width: 2,
        mappings: gid_to_unicode,
    })
}
/// Parses a TrueType `cmap` format-4 (segment mapping to delta values)
/// subtable into a Unicode codepoint → glyph-id table. Returns `None`
/// when the buffer is too short for the declared segment arrays; glyph
/// id 0 (missing glyph) is never recorded.
fn parse_cmap_format4(data: &[u8]) -> Option<HashMap<u32, u16>> {
    if data.len() < 14 {
        return None;
    }
    let seg_count_x2 = u16::from_be_bytes([data[6], data[7]]) as usize;
    let seg_count = seg_count_x2 / 2;
    // Four parallel arrays follow the 14-byte header; a 2-byte reservedPad
    // sits between endCode[] and startCode[].
    let end_base = 14usize;
    let start_base = end_base + seg_count_x2 + 2;
    let delta_base = start_base + seg_count_x2;
    let range_base = delta_base + seg_count_x2;
    if range_base + seg_count_x2 > data.len() {
        return None;
    }
    let read_u16 = |at: usize| u16::from_be_bytes([data[at], data[at + 1]]);
    let mut map = HashMap::new();
    for seg in 0..seg_count {
        let off = seg * 2;
        let end_code = read_u16(end_base + off);
        let start_code = read_u16(start_base + off);
        // idDelta is signed; the bit-cast matches i16::from_be_bytes.
        let id_delta = read_u16(delta_base + off) as i16;
        let id_range_offset = read_u16(range_base + off);
        // A start code of 0xFFFF marks the terminating sentinel segment.
        if start_code == 0xFFFF {
            break;
        }
        for code in start_code..=end_code {
            let gid = if id_range_offset == 0 {
                // Direct mapping: gid = code + delta (mod 2^16 via `as u16`).
                (code as i32 + id_delta as i32) as u16
            } else {
                // Indirect mapping: index into glyphIdArray, addressed
                // relative to this segment's idRangeOffset word.
                let idx = range_base
                    + off
                    + id_range_offset as usize
                    + (code - start_code) as usize * 2;
                if idx + 1 < data.len() {
                    let raw = read_u16(idx);
                    if raw != 0 {
                        (raw as i32 + id_delta as i32) as u16
                    } else {
                        0
                    }
                } else {
                    0
                }
            };
            if gid != 0 {
                map.insert(u32::from(code), gid);
            }
        }
    }
    Some(map)
}
/// Parses a TrueType `cmap` format-12 (segmented coverage) subtable into a
/// Unicode codepoint → glyph-id table.
///
/// Groups are 12-byte `(startChar, endChar, startGlyphId)` records. Groups
/// that are reversed (`endChar < startChar`), absurdly large, or that run
/// past the buffer are skipped; glyph ids above 0xFFFF or equal to 0 are
/// not recorded.
fn parse_cmap_format12(data: &[u8]) -> Option<HashMap<u32, u16>> {
    if data.len() < 16 {
        return None;
    }
    let n_groups = u32::from_be_bytes([data[12], data[13], data[14], data[15]]) as usize;
    let mut result = HashMap::new();
    for i in 0..n_groups {
        let offset = 16 + i * 12;
        if offset + 12 > data.len() {
            break;
        }
        let start_char = u32::from_be_bytes([
            data[offset],
            data[offset + 1],
            data[offset + 2],
            data[offset + 3],
        ]);
        let end_char = u32::from_be_bytes([
            data[offset + 4],
            data[offset + 5],
            data[offset + 6],
            data[offset + 7],
        ]);
        let start_gid = u32::from_be_bytes([
            data[offset + 8],
            data[offset + 9],
            data[offset + 10],
            data[offset + 11],
        ]);
        // Reject reversed groups explicitly: `end_char - start_char` would
        // underflow (debug-build panic) on malformed fonts. Oversized
        // groups are skipped as before to bound the work per group.
        if end_char < start_char || end_char - start_char > 0x10000 {
            continue;
        }
        for code in start_char..=end_char {
            // checked_add: a hostile startGlyphId near u32::MAX must not
            // wrap; such entries are simply ignored.
            if let Some(gid) = start_gid.checked_add(code - start_char) {
                if gid != 0 && gid <= 0xFFFF {
                    result.insert(code, gid as u16);
                }
            }
        }
    }
    Some(result)
}
/// Heuristic: does `text` look like mis-decoded binary rather than prose?
///
/// Counts "suspicious" characters — C0 controls other than `\n`/`\r`/`\t`,
/// C1 controls (U+0080..U+009F), and BMP private-use codepoints — and
/// returns `true` when they exceed 30% of all characters. Empty input is
/// not considered binary.
pub(crate) fn is_likely_binary(text: &str) -> bool {
    // A single count check suffices; the original's separate
    // `text.is_empty()` guard was dead code (implied by count == 0).
    let total_chars = text.chars().count();
    if total_chars == 0 {
        return false;
    }
    let suspicious_count = text
        .chars()
        .filter(|&c| {
            let code = c as u32;
            (code < 0x20 && !matches!(c, '\n' | '\r' | '\t'))
                || (0x80..0xA0).contains(&code)
                || (0xE000..=0xF8FF).contains(&code)
        })
        .count();
    suspicious_count as f32 / total_chars as f32 > 0.3
}
#[cfg(test)]
mod tests {
    use super::*;

    // decode_text_simple: plain ASCII decodes via the UTF-8 path.
    #[test]
    fn test_decode_text_simple_utf8() {
        assert_eq!(decode_text_simple(b"Hello"), "Hello");
    }

    // decode_text_simple: invalid UTF-8 (lone 0xE9) falls back to the
    // Latin-1 byte→codepoint mapping.
    #[test]
    fn test_decode_text_simple_latin1() {
        let bytes = vec![0x48, 0x65, 0x6C, 0x6C, 0xE9];
        let text = decode_text_simple(&bytes);
        assert_eq!(text, "Hellé");
    }

    // decode_text_simple: a FE FF byte-order mark selects UTF-16BE.
    #[test]
    fn test_decode_text_simple_utf16be() {
        let bytes = vec![0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
        assert_eq!(decode_text_simple(&bytes), "Hi");
    }

    // bfchar entries map single codes; since every code here is <= 0xFF,
    // the declared 2-byte width is demoted to 1 by parse_to_unicode_cmap.
    #[test]
    fn test_parse_to_unicode_cmap_bfchar() {
        let cmap = b"/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
1 begincodespacerange
<0000> <ffff>
endcodespacerange
3 beginbfchar
<0003> <0020>
<001C> <0039>
<0024> <0041>
endbfchar
endcmap";
        let map = parse_to_unicode_cmap(cmap).unwrap();
        assert_eq!(map.code_width, 1);
        assert_eq!(map.mappings.get(&0x0003), Some(&" ".to_string()));
        assert_eq!(map.mappings.get(&0x001C), Some(&"9".to_string()));
        assert_eq!(map.mappings.get(&0x0024), Some(&"A".to_string()));
    }

    // Codes above 0xFF keep the declared 2-byte width; destinations are
    // UTF-16BE (here Hangul syllables U+AC00/U+AD00).
    #[test]
    fn test_parse_to_unicode_cmap_bfchar_2byte() {
        let cmap = b"1 begincodespacerange
<0000> <ffff>
endcodespacerange
2 beginbfchar
<0100> <AC00>
<0200> <AD00>
endbfchar";
        let map = parse_to_unicode_cmap(cmap).unwrap();
        assert_eq!(map.code_width, 2);
        assert_eq!(map.mappings.get(&0x0100), Some(&"\u{AC00}".to_string()));
    }

    // bfrange with a scalar destination maps the whole span <20>..<7E>
    // onto consecutive codepoints starting at U+0020.
    #[test]
    fn test_parse_to_unicode_cmap_bfrange() {
        let cmap = b"1 begincodespacerange
<00> <FF>
endcodespacerange
1 beginbfrange
<20> <7E> <0020>
endbfrange";
        let map = parse_to_unicode_cmap(cmap).unwrap();
        assert_eq!(map.code_width, 1);
        assert_eq!(map.mappings.get(&0x20), Some(&" ".to_string()));
        assert_eq!(map.mappings.get(&0x41), Some(&"A".to_string()));
        assert_eq!(map.mappings.get(&0x7E), Some(&"~".to_string()));
    }

    // With the demoted 1-byte width, decode consumes one byte per code.
    #[test]
    fn test_to_unicode_map_decode_1byte() {
        let cmap = b"1 begincodespacerange
<0000> <ffff>
endcodespacerange
3 beginbfchar
<0003> <0020>
<001C> <0039>
<0024> <0041>
endbfchar";
        let map = parse_to_unicode_cmap(cmap).unwrap();
        assert_eq!(map.code_width, 1);
        let result = map.decode(&[0x03, 0x1C, 0x24]);
        assert_eq!(result, " 9A");
    }

    // With a genuine 2-byte width, decode consumes big-endian byte pairs.
    #[test]
    fn test_to_unicode_map_decode_2byte() {
        let cmap = b"1 begincodespacerange
<0000> <ffff>
endcodespacerange
2 beginbfchar
<0100> <AC00>
<0101> <AC01>
endbfchar";
        let map = parse_to_unicode_cmap(cmap).unwrap();
        assert_eq!(map.code_width, 2);
        let result = map.decode(&[0x01, 0x00, 0x01, 0x01]);
        assert_eq!(result, "\u{AC00}\u{AC01}");
    }
}