use anyhow::{anyhow, Result};
/// Identifier string for this token dictionary.
///
/// NOTE(review): not referenced by the code visible in this file; presumably
/// callers use it to tag encoded payloads with the dictionary version — confirm.
pub const MQ2_UNI_DICT_ID: &str = "mq2-uni-v2-utf8safe";
/// Escape byte (`~`) that starts every three-byte token in the token table.
/// The decoder only attempts a token lookup at positions holding this byte.
const ESC: u8 = b'~';
/// Returns the substitution table as `(pattern, token)` pairs.
///
/// Every token is exactly three bytes long and starts with `~`.
///
/// Order matters to the encoder, which takes the first entry whose pattern
/// matches at the current position. A pattern that is a prefix of a longer
/// one must therefore come after it: `"\n\n\n"` precedes `"\n\n"` and
/// `` "```\n" `` precedes `` "```" ``. The decoder looks tokens up by
/// equality, so it is order-independent and still reads data that was
/// produced while `"\n\n"` shadowed `"\n\n\n"`.
fn get_token_map() -> Vec<(&'static [u8], &'static [u8])> {
    const TABLE: [(&[u8], &[u8]); 15] = [
        // Longest newline run first so that `~TB` is actually reachable.
        (b"\n\n\n", b"~TB"),
        (b"\n\n", b"~PP"),
        // NOTE(review): `~SP` and `~IN` carry the same pattern as written, so
        // the encoder can only ever emit `~SP`. Both are kept so either token
        // still decodes; confirm whether these were meant to be whitespace
        // runs of different widths.
        (b" ", b"~SP"),
        (b"\n- ", b"~LI"),
        (b"## ", b"~H2"),
        (b"# ", b"~H1"),
        (b"```\n", b"~CB"),
        (b"```", b"~CE"),
        (b"{\n", b"~OB"),
        (b"}\n", b"~CL"),
        (b"[\n", b"~OS"),
        (b"\n]", b"~CS"),
        (b": ", b"~CO"),
        (b", ", b"~CM"),
        (b" ", b"~IN"),
    ];
    TABLE.to_vec()
}
/// Returns the index just past the character that starts at `bytes[i]`.
///
/// The width is taken from the lead byte alone; continuation bytes are not
/// validated. ASCII bytes and bytes that are not a multi-byte lead (`0x80`
/// through `0xBF`) advance by one. The result is clamped to `bytes.len()`,
/// so a sequence truncated at the end of the slice never runs past it. If
/// `i` is already at or beyond the end, `i` is returned unchanged.
fn skip_utf8_char(bytes: &[u8], i: usize) -> usize {
    let lead = match bytes.get(i) {
        Some(&byte) => byte,
        None => return i,
    };
    let width = match lead {
        0xF0..=0xFF => 4,
        0xE0..=0xEF => 3,
        0xC0..=0xDF => 2,
        // ASCII, or a continuation byte with no lead in front of it.
        _ => 1,
    };
    bytes.len().min(i + width)
}
pub fn mq2_uni_encode(input: &[u8]) -> Result<Vec<u8>> {
let token_map = get_token_map();
let mut out = Vec::with_capacity(input.len());
let mut i = 0;
while i < input.len() {
if input[i] < 0x80 {
let mut matched = false;
for (pattern, token) in &token_map {
if i + pattern.len() <= input.len() && &input[i..i + pattern.len()] == *pattern {
out.extend_from_slice(token);
i += pattern.len();
matched = true;
break;
}
}
if matched {
continue;
}
}
let next_i = skip_utf8_char(input, i);
out.extend_from_slice(&input[i..next_i]);
i = next_i;
}
Ok(out)
}
/// Decodes `input` by expanding every three-byte `~XX` token found in the
/// token table back into its pattern.
///
/// A `~` that does not begin a known token, or that has fewer than two bytes
/// after it, is copied through unchanged. All other bytes are copied one
/// character at a time as delimited by `skip_utf8_char`.
///
/// The current implementation never returns an error.
pub fn mq2_uni_decode(input: &[u8]) -> Result<Vec<u8>> {
    let table = get_token_map();
    let mut decoded = Vec::with_capacity(input.len() * 2);
    let mut pos = 0;
    while pos < input.len() {
        if input[pos] == ESC {
            // `get` yields `None` when fewer than three bytes remain, which
            // sends a trailing `~` down the plain-copy path below.
            let entry = input
                .get(pos..pos + 3)
                .and_then(|candidate| table.iter().find(|(_, tok)| *tok == candidate));
            if let Some((pattern, _)) = entry {
                decoded.extend_from_slice(pattern);
                pos += 3;
                continue;
            }
        }
        let end = skip_utf8_char(input, pos);
        decoded.extend_from_slice(&input[pos..end]);
        pos = end;
    }
    Ok(decoded)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Non-ASCII test data is written with `\u{...}` escapes so that it stays
    // intact even if this file is re-saved in the wrong encoding.

    /// Encodes then decodes `text`, asserting that the bytes come back unchanged.
    fn assert_round_trip(text: &str) {
        let bytes = text.as_bytes();
        let encoded = mq2_uni_encode(bytes).unwrap();
        let decoded = mq2_uni_decode(&encoded).unwrap();
        assert_eq!(bytes, decoded.as_slice(), "Failed to preserve: {}", text);
        let reconstructed = String::from_utf8(decoded).unwrap();
        assert_eq!(text, reconstructed);
    }

    #[test]
    fn test_emoji_preservation() {
        let test_cases = [
            // Two emoji (globe, party popper) around ASCII text.
            "Hello \u{1F30D} World! \u{1F389}",
            // Crab and rocket.
            "Rust \u{1F980} is awesome! \u{1F680}",
            // Four consecutive face emoji with no separator.
            "\u{1F600}\u{1F622}\u{1F621}\u{1F970} emotions",
            // CJK (3-byte), emoji (4-byte) and Arabic (2-byte) mixed together.
            "Complex: \u{4F60}\u{597D}\u{4E16}\u{754C} \u{1F3AF} \
             \u{0645}\u{0631}\u{062D}\u{0628}\u{0627} \u{1F31F}",
        ];
        for original in test_cases {
            assert_round_trip(original);
        }
    }

    #[test]
    fn test_markdown_patterns() {
        let markdown = "# Title\n\n## Subtitle\n\n- Item 1\n- Item 2";
        let bytes = markdown.as_bytes();
        let encoded = mq2_uni_encode(bytes).unwrap();
        let decoded = mq2_uni_decode(&encoded).unwrap();
        assert_eq!(bytes, decoded.as_slice());
    }

    #[test]
    fn test_utf8_boundaries() {
        // Euro/pound/yen signs, a zero-width-joiner family emoji, and CJK.
        let text = "UTF-8: \u{20AC}\u{A3}\u{A5} Emoji: \
                    \u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466} \
                    Chinese: \u{4E2D}\u{6587}";
        let bytes = text.as_bytes();
        let encoded = mq2_uni_encode(bytes).unwrap();
        let decoded = mq2_uni_decode(&encoded).unwrap();
        assert_eq!(bytes, decoded.as_slice());
    }

    #[test]
    fn test_empty_and_blank_line_runs() {
        assert_round_trip("");
        assert_round_trip("a\n\nb");
        assert_round_trip("a\n\n\nb");
        assert_round_trip("a\n\n\n\n\nb\n");
    }
}