use crate::error::Result;
use indexmap::IndexMap;
/// Code-to-text table assembled from the `beginbfchar` / `beginbfrange`
/// sections of a CMap byte stream.
#[derive(Debug, Clone, Default)]
pub struct ToUnicodeCMap {
    /// Source code -> replacement text, in insertion order.
    pub(crate) mappings: IndexMap<u32, String>,
    /// When `true`, [`ToUnicodeCMap::decode`] consumes input two bytes at a
    /// time (big-endian); otherwise one byte at a time.
    pub(crate) is_two_byte: bool,
}

impl ToUnicodeCMap {
    /// Builds a map from raw CMap bytes.
    ///
    /// Every `beginbfchar … endbfchar` section is processed first, then every
    /// `beginbfrange … endbfrange` section, so a range entry overwrites a
    /// char entry for the same code. The map is flagged as two-byte when any
    /// parsed code exceeds `0xFF`, or when the first
    /// `begincodespacerange … endcodespacerange` section contains a hex
    /// string of two or more bytes.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`; malformed entries are
    /// skipped or end the section they appear in.
    pub fn parse(data: &[u8]) -> Result<Self> {
        let mut cmap = Self::default();
        let mut highest_code: u32 = 0;

        // Pass 1: individual code mappings.
        let mut offset = 0;
        loop {
            let Some((lo, hi)) = find_section(data, b"beginbfchar", b"endbfchar", offset) else {
                break;
            };
            parse_bfchar(&data[lo..hi], &mut cmap.mappings, &mut highest_code);
            offset = hi;
        }

        // Pass 2: range mappings, scanning again from the top of the stream.
        offset = 0;
        loop {
            let Some((lo, hi)) = find_section(data, b"beginbfrange", b"endbfrange", offset) else {
                break;
            };
            parse_bfrange(&data[lo..hi], &mut cmap.mappings, &mut highest_code);
            offset = hi;
        }

        // The codespace section is only consulted when the codes themselves
        // did not already prove the map is wider than one byte.
        cmap.is_two_byte = highest_code > 0xFF
            || match find_section(data, b"begincodespacerange", b"endcodespacerange", 0) {
                Some((lo, hi)) => codespace_is_two_byte(&data[lo..hi]),
                None => false,
            };

        Ok(cmap)
    }

    /// Translates `bytes` into text using the parsed mappings.
    ///
    /// Codes without a mapping contribute nothing to the output. In two-byte
    /// mode a trailing unpaired byte is ignored.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    pub fn decode(&self, bytes: &[u8]) -> Result<String> {
        let mut text = String::with_capacity(bytes.len());

        if self.is_two_byte {
            // `chunks_exact` leaves an odd trailing byte in the remainder,
            // which is deliberately not looked up.
            text.extend(bytes.chunks_exact(2).filter_map(|pair| {
                let code = u32::from(u16::from_be_bytes([pair[0], pair[1]]));
                self.mappings.get(&code).map(String::as_str)
            }));
        } else {
            text.extend(
                bytes
                    .iter()
                    .filter_map(|&byte| self.mappings.get(&u32::from(byte)).map(String::as_str)),
            );
        }

        Ok(text)
    }
}
/// Locates the next `start_marker … end_marker` section at or after `from`.
///
/// Returns `(body_start, body_end)` as absolute offsets into `data`:
/// `body_start` is the first byte after `start_marker`, and `body_end` is the
/// offset where the first following `end_marker` begins. Returns `None` when
/// either marker cannot be found. A `from` beyond the end of `data` is
/// treated as an empty search area.
///
/// Both markers must be non-empty (`slice::windows` panics on a zero size).
fn find_section(
    data: &[u8],
    start_marker: &[u8],
    end_marker: &[u8],
    from: usize,
) -> Option<(usize, usize)> {
    let search_area = &data[from.min(data.len())..];
    let body_start = search_area
        .windows(start_marker.len())
        .position(|candidate| candidate == start_marker)
        .map(|rel| from + rel + start_marker.len())?;

    let body_end = data[body_start..]
        .windows(end_marker.len())
        .position(|candidate| candidate == end_marker)
        .map(|rel| body_start + rel)?;

    Some((body_start, body_end))
}
/// Reads `<src> <dst>` hex-string pairs from a bfchar section body.
///
/// Each pair inserts `src` (as a big-endian code) -> `dst` (decoded as
/// UTF-16BE text) into `out`, replacing any earlier value for that code, and
/// raises `max_code` to the largest source code seen. Parsing stops at the
/// first incomplete pair.
fn parse_bfchar(body: &[u8], out: &mut IndexMap<u32, String>, max_code: &mut u32) {
    let mut scanner = HexIter::new(body);
    loop {
        let Some(source) = scanner.next_hex_string() else {
            return;
        };
        let Some(target) = scanner.next_hex_string() else {
            return;
        };

        let code = source.to_code();
        *max_code = (*max_code).max(code);
        out.insert(code, target.to_unicode_string());
    }
}
/// Reads `<first> <last> <dst>` / `<first> <last> [<dst0> <dst1> …]` entries
/// from a bfrange section body and expands them into `out`.
///
/// * With a single hex destination, code `first + i` maps to the destination
///   text with `i` added to its final code point. A resulting value that is
///   not a valid `char` is left out of the string.
/// * With an array destination, successive array items are assigned to
///   `first, first + 1, …`, never going past `last`; surplus items are
///   ignored.
///
/// `max_code` is raised to the largest `first`/`last` seen, including those
/// of entries that are subsequently rejected. Entries with `last < first` or
/// spanning more than 65 536 codes are skipped. Parsing stops at the first
/// incomplete entry.
fn parse_bfrange(body: &[u8], out: &mut IndexMap<u32, String>, max_code: &mut u32) {
    let mut iter = HexIter::new(body);
    while let Some(first) = iter.next_hex_string() {
        let Some(last) = iter.next_hex_string() else { break };
        // `next_third` takes a scratch position argument that it does not use.
        let mut scratch_pos = iter.pos();
        let Some(third) = iter.next_third(body, &mut scratch_pos) else { break };

        let start = first.to_code();
        let end = last.to_code();
        *max_code = (*max_code).max(start).max(end);

        // Reject inverted ranges and cap the expansion size so a hostile
        // entry cannot allocate an unbounded number of mappings.
        if end < start || end - start > 65_535 {
            continue;
        }

        match third {
            Third::Hex(h) => {
                let base_codes = h.to_unicode_codepoints();
                for i in 0..=(end - start) {
                    let mut buf = String::new();
                    if let Some((&last_cp, prefix)) = base_codes.split_last() {
                        buf.extend(prefix.iter().filter_map(|&cp| char::from_u32(cp)));
                        // Only the final code point advances through the range.
                        if let Some(c) = char::from_u32(last_cp + i) {
                            buf.push(c);
                        }
                    }
                    // `start + i <= end`, so this cannot overflow. The former
                    // running counter was bumped once more after the last
                    // insert and overflowed when `end == u32::MAX`.
                    out.insert(start + i, buf);
                }
            }
            Third::Array(items) => {
                // The inclusive range ends exactly at `end` without ever
                // computing `end + 1`.
                for (code, s) in (start..=end).zip(items) {
                    out.insert(code, s);
                }
            }
        }
    }
}
/// Third operand of a bfrange entry, as produced by `HexIter::next_third`.
enum Third {
    /// A single `<…>` hex string.
    Hex(HexString),
    /// A `[ <…> <…> … ]` array; each element has already been converted to
    /// text with `HexString::to_unicode_string`.
    Array(Vec<String>),
}
/// Raw bytes decoded from a `<…>` hex string token.
#[derive(Debug, Clone)]
struct HexString(Vec<u8>);

impl HexString {
    /// Interprets the bytes as one big-endian unsigned integer.
    ///
    /// With more than four bytes the leading ones are shifted out, so only
    /// the last four contribute to the result.
    fn to_code(&self) -> u32 {
        self.0
            .iter()
            .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte))
    }

    /// Decodes the bytes as UTF-16BE text.
    ///
    /// Values that are not valid Unicode scalar values (unpaired surrogates)
    /// are left out of the result.
    fn to_unicode_string(&self) -> String {
        self.to_unicode_codepoints()
            .into_iter()
            .filter_map(char::from_u32)
            .collect()
    }

    /// Decodes the bytes as UTF-16BE and returns the resulting code points.
    ///
    /// A trailing odd byte is ignored. A high/low surrogate pair is combined
    /// into one supplementary code point; an unpaired surrogate is returned
    /// unchanged as its 16-bit value.
    fn to_unicode_codepoints(&self) -> Vec<u32> {
        let units = self
            .0
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));

        char::decode_utf16(units)
            .map(|decoded| match decoded {
                Ok(ch) => u32::from(ch),
                // Keep the lone surrogate's raw value instead of dropping it.
                Err(lone) => u32::from(lone.unpaired_surrogate()),
            })
            .collect()
    }
}
/// Forward-only cursor over a section body that yields `<…>` hex tokens.
struct HexIter<'a> {
    /// Bytes being scanned.
    buf: &'a [u8],
    /// Offset of the next unread byte; never exceeds `buf.len()`.
    pos: usize,
}

impl<'a> HexIter<'a> {
    /// Creates a cursor positioned at the start of `buf`.
    fn new(buf: &'a [u8]) -> Self {
        Self { buf, pos: 0 }
    }

    /// Current read offset into the buffer.
    fn pos(&self) -> usize {
        self.pos
    }

    /// Skips ahead to the next `<`, decodes everything up to the matching
    /// `>` (or the end of the buffer when unterminated) and leaves the cursor
    /// just past the closing `>`.
    ///
    /// Returns `None`, with the cursor at the end of the buffer, when no `<`
    /// remains.
    fn next_hex_string(&mut self) -> Option<HexString> {
        let Some(open_rel) = self.buf[self.pos..].iter().position(|&byte| byte == b'<') else {
            self.pos = self.buf.len();
            return None;
        };

        let digits_start = self.pos + open_rel + 1;
        let digits_end = self.buf[digits_start..]
            .iter()
            .position(|&byte| byte == b'>')
            .map_or(self.buf.len(), |rel| digits_start + rel);

        // Step over the closing '>' only if there actually was one.
        self.pos = (digits_end + 1).min(self.buf.len());
        Some(HexString(decode_hex(&self.buf[digits_start..digits_end])))
    }

    /// Reads the third operand of a bfrange entry.
    ///
    /// After skipping ASCII whitespace, a `<` yields [`Third::Hex`] and a `[`
    /// yields [`Third::Array`] holding every hex string up to the closing `]`
    /// (or the end of the buffer). Any other byte, or running out of input,
    /// yields `None`.
    ///
    /// `body` and `_shared_pos` are accepted but not used.
    fn next_third(&mut self, body: &[u8], _shared_pos: &mut usize) -> Option<Third> {
        let _ = body;

        let blanks = self.buf[self.pos..]
            .iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count();
        self.pos += blanks;

        match self.buf.get(self.pos).copied() {
            Some(b'<') => self.next_hex_string().map(Third::Hex),
            Some(b'[') => {
                self.pos += 1;
                let mut items: Vec<String> = Vec::new();
                while let Some(&byte) = self.buf.get(self.pos) {
                    match byte {
                        b']' => {
                            self.pos += 1;
                            break;
                        }
                        b'<' => {
                            // The cursor sits on '<', so this advances it.
                            if let Some(hex) = self.next_hex_string() {
                                items.push(hex.to_unicode_string());
                            }
                        }
                        // Anything between elements is skipped.
                        _ => self.pos += 1,
                    }
                }
                Some(Third::Array(items))
            }
            Some(_) | None => None,
        }
    }
}
/// Decodes the digits found between `<` and `>` of a hex string into bytes.
///
/// ASCII whitespace between digits is ignored and both digit cases are
/// accepted. Decoding stops at the first byte that is neither whitespace nor
/// a hex digit. If the digits seen so far are odd in number, the missing
/// final digit is taken to be `0`, as PDF and PostScript hex strings require
/// (`901FA` decodes to `90 1F A0`).
fn decode_hex(input: &[u8]) -> Vec<u8> {
    // One extra slot for a possible zero-padded trailing digit.
    let mut out: Vec<u8> = Vec::with_capacity(input.len() / 2 + 1);
    // High nibble waiting for its partner, if any.
    let mut pending: Option<u8> = None;

    for &b in input {
        if b.is_ascii_whitespace() {
            continue;
        }
        let digit = match b {
            b'0'..=b'9' => b - b'0',
            b'a'..=b'f' => b - b'a' + 10,
            b'A'..=b'F' => b - b'A' + 10,
            _ => break,
        };
        match pending.take() {
            Some(high) => out.push((high << 4) | digit),
            None => pending = Some(digit),
        }
    }

    // Odd digit count: pad the low nibble with zero instead of dropping it.
    if let Some(high) = pending {
        out.push(high << 4);
    }
    out
}
/// Reports whether any hex string in a codespacerange section body decodes
/// to two or more bytes.
fn codespace_is_two_byte(body: &[u8]) -> bool {
    let mut scanner = HexIter::new(body);
    loop {
        match scanner.next_hex_string() {
            None => return false,
            Some(bound) if bound.0.len() >= 2 => return true,
            Some(_) => {}
        }
    }
}