extern crate encoding;
use std::str;
use std::string::String;
// All three patterns are wrapped in `(?-u:...)`, which turns Unicode mode off so that the
// `\xNN` escapes match single raw bytes rather than code points.
//
// Six-byte surrogate pair: 0xED, a byte in A0..=AF, a byte in 80..=BF (a high surrogate,
// U+D800..=U+DBFF), then 0xED, a byte in B0..=BF and a byte in 80..=BF (a low surrogate,
// U+DC00..=U+DFFF). Every position after the first 0xED also accepts `$`, so a pair that
// is cut short by the end of the haystack still matches.
const CESU8_EXPR: &str = r"(?-u:\xED([\xA0-\xAF]|$)(?-u:[\x80-\xBF]|$)(?-u:\xED|$)(?-u:[\xB0-\xBF]|$)(?-u:[\x80-\xBF]|$))";
// A single three-byte surrogate (U+D800..=U+DFFF): 0xED, a byte in A0..=BF, a byte in
// 80..=BF. As above, `$` lets a sequence truncated by the end of the haystack match.
const SURROGATE_EXPR: &str = r"(?-u:\xed([\xa0-\xbf]|$)(?-u:[\x80-\xbf]|$))";
// The overlong two-byte encoding of U+0000, 0xC0 0x80, or a lone 0xC0 as the very last
// byte of the haystack.
const NULL_EXPR: &str = r"(?-u:\xc0(\x80|$))";
// Byte-oriented regexes, compiled once on first access. The `unwrap` calls panic at that
// point if a pattern fails to compile. Neither pattern is anchored, so a search reports
// the leftmost match anywhere in the haystack, not only at its start.
lazy_static! {
// Matches only the six-byte surrogate-pair form described by `CESU8_EXPR`.
static ref CESU_8_RE: regex::bytes::Regex = regex::bytes::Regex::new(CESU8_EXPR).unwrap();
// Alternation `NULL_EXPR | CESU8_EXPR | SURROGATE_EXPR`: matches any of the byte
// sequences that need handling beyond plain UTF-8 decoding.
static ref SPECIAL_BYTES_RE: regex::bytes::Regex = regex::bytes::Regex::new(
&(NULL_EXPR.to_string() + "|" + CESU8_EXPR + "|" + SURROGATE_EXPR)
)
.unwrap();
}
/// Decodes `data` into a `String` by repeatedly calling `buffer_decode_step` on the
/// not-yet-consumed tail of the input and concatenating the decoded segments.
///
/// # Errors
///
/// Propagates any error returned by `buffer_decode_step`. Also fails when a step reports
/// that it consumed no bytes while input remains: continuing is impossible, and stopping
/// quietly would return a string that silently omits the rest of `data`.
pub fn variant_decode(data: &[u8]) -> Result<String, Box<dyn std::error::Error>> {
    // Every special sequence decodes to fewer bytes than it occupies, so the input length
    // is an upper bound for the output length.
    let mut decoded = String::with_capacity(data.len());
    let mut position = 0;
    while position < data.len() {
        let (segment, consumed) = buffer_decode_step(&data[position..])?;
        if consumed == 0 {
            return Err("incomplete byte sequence at end of input".into());
        }
        decoded.push_str(&segment);
        position += consumed;
    }
    Ok(decoded)
}
/// Decodes one segment from the front of `slice`.
///
/// Returns the decoded text together with the number of input bytes it accounts for:
///
/// * no special sequence anywhere in `slice`: the whole slice, decoded as strict UTF-8;
/// * a special sequence further in: only the bytes before it, decoded as strict UTF-8;
/// * `slice` starts with `0xC0 0x80`: U+0000, consuming two bytes;
/// * `slice` starts with a `0xED` sequence: whatever `buffer_decode_surrogates` returns.
///
/// # Errors
///
/// Fails when the bytes to decode are not valid UTF-8. This includes input that ends in a
/// lone `0xC0`: the second byte of the pair is missing, so the input is truncated and is
/// rejected rather than being reported as "nothing consumed".
fn buffer_decode_step(slice: &[u8]) -> Result<(String, usize), Box<dyn std::error::Error>> {
    // Only the position of the first special sequence is needed, so `find` is enough.
    let cutoff = match SPECIAL_BYTES_RE.find(slice) {
        Some(special) => special.start(),
        None => return Ok((str::from_utf8(slice)?.to_owned(), slice.len())),
    };

    if cutoff > 0 {
        // Emit the ordinary text first; the special sequence is handled by the next call.
        return Ok((str::from_utf8(&slice[..cutoff])?.to_owned(), cutoff));
    }

    match slice {
        [0xC0, 0x80, ..] => Ok(("\u{0000}".to_owned(), 2)),
        [0xC0, ..] => {
            // 0xC0 never occurs in strict UTF-8, so validation yields the error that
            // describes the truncated input.
            let text = str::from_utf8(slice)?;
            Ok((text.to_owned(), slice.len()))
        }
        _ => buffer_decode_surrogates(slice),
    }
}
/// Decodes the bytes at the very front of `slice`, which are expected to begin with `0xED`.
///
/// * If the first six bytes are a surrogate pair (`ED A0..AF 80..BF ED B0..BF 80..BF`),
///   the pair is combined into the single code point it stands for (U+10000..=U+10FFFF)
///   and six bytes are consumed.
/// * If fewer than six bytes are available, the whole slice is decoded as strict UTF-8.
/// * Otherwise only the first three bytes are decoded as strict UTF-8.
///
/// The pair is recognised by inspecting the leading six bytes themselves, so a pair that
/// occurs later in `slice` can never cause unrelated leading bytes to be read as one.
///
/// # Errors
///
/// Fails when the bytes handed to the strict UTF-8 decoder are invalid, which is always
/// the case for an unpaired or truncated surrogate.
fn buffer_decode_surrogates(slice: &[u8]) -> Result<(String, usize), Box<dyn std::error::Error>> {
    match *slice {
        [
            0xED,
            high_lead @ 0xA0..=0xAF,
            high_trail @ 0x80..=0xBF,
            0xED,
            low_lead @ 0xB0..=0xBF,
            low_trail @ 0x80..=0xBF,
            ..
        ] => {
            // Each surrogate carries ten payload bits: four in its second byte and six in
            // its third. The high surrogate supplies the upper ten bits of the offset.
            let codepoint = 0x10000
                + ((u32::from(high_lead) & 0x0F) << 16)
                + ((u32::from(high_trail) & 0x3F) << 10)
                + ((u32::from(low_lead) & 0x0F) << 6)
                + (u32::from(low_trail) & 0x3F);
            let decoded = char::from_u32(codepoint)
                .expect("0x10000 plus a 20-bit offset is always a valid scalar value");
            Ok((decoded.to_string(), 6))
        }
        // Too short to hold a pair: let strict UTF-8 decide (ordinary text or an error).
        _ if slice.len() < 6 => Ok((str::from_utf8(slice)?.to_owned(), slice.len())),
        // Long enough, but not a pair: hand just the leading three bytes to strict UTF-8.
        _ => Ok((str::from_utf8(&slice[..3])?.to_owned(), 3)),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    // Plain ASCII contains no special sequence and passes through unchanged.
    #[test]
    fn test_decode_ascii() {
        let input = "Hello, world!".as_bytes();
        let output = variant_decode(input).unwrap();
        assert_eq!("Hello, world!", output);
    }

    // One six-byte surrogate pair becomes the four-byte UTF-8 encoding of U+10400.
    #[test]
    fn test_decode_cesu8() {
        let input = [0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80];
        let output = variant_decode(&input).unwrap();
        let output_bytes = output.as_bytes().to_vec();
        assert_eq!(vec![0xF0, 0x90, 0x90, 0x80], output_bytes);
    }

    #[test]
    fn test_decode_empty() {
        let input = [];
        let output = variant_decode(&input).unwrap();
        assert_eq!("", output);
    }

    // The two-byte form 0xC0 0x80 decodes to U+0000.
    #[test]
    fn test_decode_null() {
        let input = [0xC0, 0x80];
        let output = variant_decode(&input).unwrap();
        assert_eq!("\0", output);
    }

    #[test]
    fn test_decode_ascii_and_null() {
        let input = [0x41, 0x42, 0xC0, 0x80, 0x43];
        let output = variant_decode(&input).unwrap();
        assert_eq!("AB\0C", output);
    }

    #[test]
    fn test_decode_cesu8_and_null() {
        let input = [0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80, 0xC0, 0x80];
        let output = variant_decode(&input).unwrap();
        let expected_output = String::from_utf8(vec![0xF0, 0x90, 0x90, 0x80, 0x00]).unwrap();
        assert_eq!(expected_output, output);
    }

    #[test]
    fn test_decode_cesu8_and_ascii() {
        let input = [0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80, 0x41, 0x42, 0x43];
        let output = variant_decode(&input).unwrap();
        let expected_output =
            String::from_utf8(vec![0xF0, 0x90, 0x90, 0x80, 0x41, 0x42, 0x43]).unwrap();
        assert_eq!(expected_output, output);
    }

    #[test]
    fn test_decode_multiple_cesu8() {
        let input = [
            0xED, 0xA0, 0x81, 0xED, 0xB0, 0x80, 0xED, 0xA0, 0xA1, 0xED, 0xB1, 0x81,
        ];
        let output = variant_decode(&input).unwrap();
        let expected_output =
            String::from_utf8(vec![0xf0, 0x90, 0x90, 0x80, 0xf0, 0x98, 0x91, 0x81]).unwrap();
        assert_eq!(expected_output, output);
    }

    #[test]
    fn test_decode_multiple_null() {
        let input = [0xC0, 0x80, 0xC0, 0x80, 0xC0, 0x80];
        let output = variant_decode(&input).unwrap();
        assert_eq!("\0\0\0", output);
    }

    #[test]
    fn test_decode_multiple_ascii() {
        let input = [0x41, 0x41, 0x41];
        let output = variant_decode(&input).unwrap();
        assert_eq!("AAA", output);
    }

    // U+D55C encodes as ED 95 9C: it starts with 0xED but is ordinary UTF-8 and must not
    // be mistaken for a surrogate.
    #[test]
    fn test_decode_multibyte_utf8() {
        let input = [0xED, 0x95, 0x9C];
        let output = variant_decode(&input).unwrap();
        assert_eq!("\u{D55C}", output);
    }

    // A high surrogate with no low surrogate after it cannot be decoded.
    #[test]
    fn test_decode_lone_surrogate_is_error() {
        let input = [0xED, 0xA0, 0x81];
        assert!(variant_decode(&input).is_err());
    }

    // Bytes that are neither a special sequence nor valid UTF-8 are reported as an error.
    #[test]
    fn test_decode_invalid_utf8_is_error() {
        let input = [0x41, 0xFF];
        assert!(variant_decode(&input).is_err());
    }
}