use std::collections::HashMap;
/// Converts Unicode text into PDF string operands (hex strings and literal strings).
///
/// The encoder remembers every code point -> glyph ID pair that a lookup
/// resolved successfully, so repeated characters passed to
/// [`UnicodeEncoder::encode_identity_h`] only consult the caller-supplied
/// lookup once.
#[derive(Debug)]
pub struct UnicodeEncoder {
    /// Successfully resolved code point -> glyph ID pairs.
    glyph_cache: HashMap<u32, u16>,
}

impl UnicodeEncoder {
    /// Creates an encoder with an empty glyph cache.
    pub fn new() -> Self {
        Self {
            glyph_cache: HashMap::new(),
        }
    }

    /// Encodes `text` as a hex string of 16-bit glyph IDs, e.g. `<00010002>`.
    ///
    /// Every character is resolved through the cache first and through
    /// `glyph_lookup` second. Successful lookups are cached. A character for
    /// which `glyph_lookup` returns `None` is written as glyph `0000` and is
    /// *not* cached, so it is looked up again the next time it appears.
    pub fn encode_identity_h(
        &mut self,
        text: &str,
        glyph_lookup: impl Fn(u32) -> Option<u16>,
    ) -> String {
        // Four hex digits per character at most, plus the two delimiters.
        let mut hex = String::with_capacity(text.len() * 4 + 2);
        hex.push('<');
        for ch in text.chars() {
            let glyph_id = self
                .resolve_glyph(u32::from(ch), &glyph_lookup)
                .unwrap_or(0);
            hex.push_str(&format!("{:04X}", glyph_id));
        }
        hex.push('>');
        hex
    }

    /// Returns the glyph ID for `codepoint`, consulting the cache before
    /// `glyph_lookup` and caching only successful lookups.
    fn resolve_glyph<F>(&mut self, codepoint: u32, glyph_lookup: &F) -> Option<u16>
    where
        F: Fn(u32) -> Option<u16>,
    {
        if let Some(&cached) = self.glyph_cache.get(&codepoint) {
            return Some(cached);
        }
        let glyph_id = glyph_lookup(codepoint)?;
        self.glyph_cache.insert(codepoint, glyph_id);
        Some(glyph_id)
    }

    /// Encodes a single glyph ID as a four-digit hex string, e.g. `<001F>`.
    pub fn encode_char_identity_h(&self, glyph_id: u16) -> String {
        format!("<{:04X}>", glyph_id)
    }

    /// Encodes `text` as a parenthesised literal string.
    ///
    /// * `(`, `)` and `\` are backslash-escaped; LF, CR and TAB become
    ///   `\n`, `\r` and `\t`.
    /// * Remaining ASCII characters from space upwards are copied verbatim.
    /// * Any other code point below 256 is written as a three-digit octal
    ///   escape of the code point value itself.
    /// * Code points of 256 and above cannot be represented and are replaced
    ///   with `?`.
    pub fn encode_literal(text: &str) -> String {
        let mut literal = String::with_capacity(text.len() + 2);
        literal.push('(');
        for ch in text.chars() {
            match ch {
                '(' | ')' | '\\' => {
                    literal.push('\\');
                    literal.push(ch);
                }
                '\n' => literal.push_str("\\n"),
                '\r' => literal.push_str("\\r"),
                '\t' => literal.push_str("\\t"),
                ' '..='\u{7F}' => literal.push(ch),
                '\u{0}'..='\u{FF}' => {
                    literal.push_str(&format!("\\{:03o}", u32::from(ch)));
                }
                _ => literal.push('?'),
            }
        }
        literal.push(')');
        literal
    }

    /// Encodes `text` as a hex string of UTF-16BE code units prefixed with
    /// the byte-order mark `FEFF`, e.g. `<FEFF0041>` for `"A"`.
    ///
    /// Characters outside the Basic Multilingual Plane are written as a
    /// surrogate pair.
    pub fn encode_utf16be(text: &str) -> String {
        // Every UTF-8 byte yields at most four hex digits; six more cover
        // the delimiters and the BOM.
        let mut hex = String::with_capacity(text.len() * 4 + 6);
        hex.push_str("<FEFF");
        let mut units = [0u16; 2];
        for ch in text.chars() {
            for unit in ch.encode_utf16(&mut units).iter() {
                hex.push_str(&format!("{:04X}", unit));
            }
        }
        hex.push('>');
        hex
    }

    /// Picks the most compact encoding that represents `text` unambiguously.
    ///
    /// * Printable ASCII without `(`, `)` or `\` is wrapped in parentheses
    ///   as-is.
    /// * Text that is safe to store one byte per character (see below) goes
    ///   through [`UnicodeEncoder::encode_literal`].
    /// * Everything else goes through [`UnicodeEncoder::encode_utf16be`].
    ///
    /// A byte-oriented literal is read back as PDFDocEncoding, which differs
    /// from Latin-1 for the bytes 0x80-0xA0 and 0xAD (0xA0 is the Euro sign
    /// there, not a no-break space). Text containing U+0080-U+00A0 or U+00AD
    /// is therefore sent down the UTF-16BE path instead of being written as a
    /// literal that would decode to different characters. The same applies to
    /// text whose first characters would form a byte-order mark when written
    /// as bytes.
    pub fn encode_text(text: &str) -> String {
        let is_plain = |c: char| c.is_ascii() && c >= ' ' && !matches!(c, '(' | ')' | '\\');
        if text.chars().all(is_plain) {
            format!("({})", text)
        } else if Self::fits_pdfdoc_literal(text) {
            Self::encode_literal(text)
        } else {
            Self::encode_utf16be(text)
        }
    }

    /// Reports whether writing each character of `text` as the byte equal to
    /// its code point yields a literal that decodes back to the same text.
    fn fits_pdfdoc_literal(text: &str) -> bool {
        // Leading bytes FE FF (UTF-16BE) or EF BB BF (UTF-8) would be taken
        // for a byte-order mark rather than for the characters themselves.
        if text.starts_with("\u{FE}\u{FF}") || text.starts_with("\u{EF}\u{BB}\u{BF}") {
            return false;
        }
        text.chars().all(|c| match u32::from(c) {
            // Undefined in PDFDocEncoding.
            0xAD => false,
            // ASCII, and the range where PDFDocEncoding agrees with Latin-1.
            0x00..=0x7F | 0xA1..=0xFF => true,
            _ => false,
        })
    }

    /// Forgets every cached code point -> glyph ID pair.
    pub fn clear_cache(&mut self) {
        self.glyph_cache.clear();
    }

    /// Returns the number of code points currently held in the glyph cache.
    pub fn cache_size(&self) -> usize {
        self.glyph_cache.len()
    }
}

impl Default for UnicodeEncoder {
    /// Equivalent to [`UnicodeEncoder::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Maps a Unicode code point to its single-byte WinAnsi code.
///
/// Code points below 0x80 and in 0xA0..=0xFF map to the byte with the same
/// value. A fixed set of code points outside those ranges maps into
/// 0x80..=0x9F. Everything else yields `None`.
pub fn unicode_to_winansi(codepoint: u32) -> Option<u8> {
    let byte = match codepoint {
        // Ranges where the byte value equals the code point.
        0x00..=0x7F | 0xA0..=0xFF => codepoint as u8,
        0x20AC => 0x80, // euro sign
        0x201A => 0x82, // single low-9 quotation mark
        0x0192 => 0x83, // f with hook
        0x201E => 0x84, // double low-9 quotation mark
        0x2026 => 0x85, // horizontal ellipsis
        0x2020 => 0x86, // dagger
        0x2021 => 0x87, // double dagger
        0x02C6 => 0x88, // modifier circumflex
        0x2030 => 0x89, // per mille sign
        0x0160 => 0x8A, // S with caron
        0x2039 => 0x8B, // single left angle quotation mark
        0x0152 => 0x8C, // ligature OE
        0x017D => 0x8E, // Z with caron
        0x2018 => 0x91, // left single quotation mark
        0x2019 => 0x92, // right single quotation mark
        0x201C => 0x93, // left double quotation mark
        0x201D => 0x94, // right double quotation mark
        0x2022 => 0x95, // bullet
        0x2013 => 0x96, // en dash
        0x2014 => 0x97, // em dash
        0x02DC => 0x98, // small tilde
        0x2122 => 0x99, // trade mark sign
        0x0161 => 0x9A, // s with caron
        0x203A => 0x9B, // single right angle quotation mark
        0x0153 => 0x9C, // ligature oe
        0x017E => 0x9E, // z with caron
        0x0178 => 0x9F, // Y with diaeresis
        _ => return None,
    };
    Some(byte)
}
/// Returns `true` when `ch` has a WinAnsi byte according to
/// `unicode_to_winansi`.
pub fn is_winansi_char(ch: char) -> bool {
    let codepoint = u32::from(ch);
    match unicode_to_winansi(codepoint) {
        Some(_) => true,
        None => false,
    }
}
/// Renders one byte the way it must appear inside a parenthesised literal
/// string.
///
/// Delimiters and the backslash get a backslash prefix, the common control
/// characters use their letter shorthand, bytes 0x20..=0x7E are emitted
/// unchanged, and every other byte becomes a three-digit octal escape.
fn escape_byte_for_literal(b: u8) -> String {
    let shorthand = match b {
        b'(' => "\\(",
        b')' => "\\)",
        b'\\' => "\\\\",
        0x0A => "\\n",
        0x0D => "\\r",
        0x09 => "\\t",
        0x08 => "\\b",
        0x0C => "\\f",
        // Printable ASCII other than the delimiters handled above.
        0x20..=0x7E => return char::from(b).to_string(),
        _ => return format!("\\{:03o}", b),
    };
    shorthand.to_owned()
}
/// Encodes raw bytes as a parenthesised literal string, escaping each byte
/// with `escape_byte_for_literal`.
pub fn encode_bytes_as_literal(bytes: &[u8]) -> String {
    let mut literal = String::with_capacity(bytes.len() * 2 + 2);
    literal.push('(');
    literal.extend(bytes.iter().map(|&byte| escape_byte_for_literal(byte)));
    literal.push(')');
    literal
}
/// Encodes raw bytes as an angle-bracketed hex string with two uppercase
/// hex digits per byte, e.g. `<414243>`.
pub fn encode_bytes_as_hex(bytes: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let mut hex = String::with_capacity(bytes.len() * 2 + 2);
    hex.push('<');
    for &byte in bytes {
        // High nibble first, then low nibble.
        hex.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(HEX_DIGITS[usize::from(byte & 0x0F)]));
    }
    hex.push('>');
    hex
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Glyph IDs come from the lookup and are written as four hex digits each.
    #[test]
    fn test_encode_identity_h() {
        let mut encoder = UnicodeEncoder::new();
        let encoded = encoder.encode_identity_h("AB", |cp| match cp {
            0x41 => Some(1_u16),
            0x42 => Some(2_u16),
            _ => None,
        });
        assert_eq!(encoded, "<00010002>");
    }

    /// A character without a glyph falls back to glyph 0.
    #[test]
    fn test_encode_identity_h_missing_glyph() {
        let mut encoder = UnicodeEncoder::new();
        let encoded = encoder.encode_identity_h("A", |_| None);
        assert_eq!(encoded, "<0000>");
    }

    /// Plain ASCII is only wrapped in parentheses.
    #[test]
    fn test_encode_literal_simple() {
        assert_eq!(UnicodeEncoder::encode_literal("Hello"), "(Hello)");
    }

    /// Parentheses and backslashes are backslash-escaped.
    #[test]
    fn test_encode_literal_escapes() {
        let cases = [
            ("(test)", "(\\(test\\))"),
            ("back\\slash", "(back\\\\slash)"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(UnicodeEncoder::encode_literal(input), *expected);
        }
    }

    /// BMP characters become one UTF-16 code unit after the BOM.
    #[test]
    fn test_encode_utf16be() {
        let cases = [("A", "<FEFF0041>"), ("\u{20AC}", "<FEFF20AC>")];
        for (input, expected) in cases.iter() {
            assert_eq!(UnicodeEncoder::encode_utf16be(input), *expected);
        }
    }

    /// Characters above U+FFFF become a surrogate pair.
    #[test]
    fn test_encode_utf16be_supplementary() {
        assert_eq!(
            UnicodeEncoder::encode_utf16be("\u{1F600}"),
            "<FEFFD83DDE00>"
        );
    }

    /// ASCII stays a literal; text needing Unicode switches to UTF-16BE.
    #[test]
    fn test_encode_text_auto() {
        assert_eq!(UnicodeEncoder::encode_text("Hello"), "(Hello)");

        let unicode = UnicodeEncoder::encode_text("Hello\u{20AC}World");
        assert!(unicode.starts_with("<FEFF"));
    }

    /// Spot checks of the code point -> WinAnsi byte table.
    #[test]
    fn test_winansi_mapping() {
        let cases: [(u32, Option<u8>); 4] = [
            (0x41, Some(0x41)),
            (0x20AC, Some(0x80)),
            (0x2019, Some(0x92)),
            (0x10000, None),
        ];
        for &(codepoint, expected) in cases.iter() {
            assert_eq!(unicode_to_winansi(codepoint), expected);
        }
    }

    /// Membership test agrees with the mapping table.
    #[test]
    fn test_is_winansi_char() {
        assert!(is_winansi_char('A'));
        assert!(is_winansi_char('\u{20AC}'));
        assert!(!is_winansi_char('\u{4E2D}'));
    }

    /// Bytes are written as two uppercase hex digits each.
    #[test]
    fn test_encode_bytes_as_hex() {
        assert_eq!(encode_bytes_as_hex(&[0x41, 0x42, 0x43]), "<414243>");
    }

    /// Printable bytes pass through; delimiters are escaped.
    #[test]
    fn test_encode_bytes_as_literal() {
        assert_eq!(encode_bytes_as_literal(b"ABC"), "(ABC)");
        assert_eq!(encode_bytes_as_literal(&[0x28, 0x29]), "(\\(\\))");
    }

    /// The cache grows by one entry per distinct resolved character and can
    /// be emptied again.
    #[test]
    fn test_encoder_caching() {
        fn identity(cp: u32) -> Option<u16> {
            Some(cp as u16)
        }

        let mut encoder = UnicodeEncoder::new();

        encoder.encode_identity_h("AAA", identity);
        assert_eq!(encoder.cache_size(), 1);

        encoder.encode_identity_h("ABC", identity);
        assert_eq!(encoder.cache_size(), 3);

        encoder.clear_cache();
        assert_eq!(encoder.cache_size(), 0);
    }
}