#![no_std]
extern crate alloc;
use alloc::string::String;
use alloc::vec::Vec;
use core::ptr::copy_nonoverlapping;
/// The six 7-bit values that are never written to the output as a plain byte:
/// NUL, line feed, carriage return, `"`, `&` and `\`. The position of a value
/// in this array is the index carried by its escape sequence.
const ILLEGALS: [u8; 6] = [0x00, b'\n', b'\r', b'"', b'&', b'\\'];
/// Escape index (all three index bits set) marking an escape sequence that
/// carries a single 7-bit group rather than an illegal value plus a follower.
const SHORTENED: u8 = 7;
/// Lookup table from a 7-bit group value to its escape index (`0..=5`).
/// Every value that may be emitted verbatim maps to the sentinel `0xFF`.
const ILLEGAL_IDX_DIRECT: [u8; 128] = {
    // Values that need escaping, listed in escape-index order.
    const ESCAPED: [u8; 6] = [0, 10, 13, 34, 38, 92];
    let mut table = [0xFF_u8; 128];
    let mut slot = 0;
    while slot < ESCAPED.len() {
        table[ESCAPED[slot] as usize] = slot as u8;
        slot += 1;
    }
    table
};
/// Per-value flag table: `1` for the six 7-bit values that must be escaped,
/// `0` for everything else. OR-ing the flags of several groups tells whether
/// any of them needs escaping.
const ILLEGAL_FLAG: [u8; 128] = {
    // Values that need escaping.
    const ESCAPED: [u8; 6] = [0, 10, 13, 34, 38, 92];
    let mut flags = [0_u8; 128];
    let mut slot = 0;
    while slot < ESCAPED.len() {
        flags[ESCAPED[slot] as usize] = 1;
        slot += 1;
    }
    flags
};
/// Lead bytes of the two-byte escape sequence, indexed by
/// `(escape_index << 1) | top_bit_of_follower`.
///
/// Each entry has the bit layout `110i_ii1b`: `iii` is the escape index and
/// `b` is bit 6 of the 7-bit group stored in the sequence.
const ESCAPE_B1: [u8; 16] = {
    let mut table = [0_u8; 16];
    let mut key = 0;
    while key < 16 {
        let index_bits = ((key >> 1) as u8) << 2;
        let top_bit = (key & 1) as u8;
        table[key] = 0b1100_0010 | index_bits | top_bit;
        key += 1;
    }
    table
};
/// Writes the two-byte escape sequence for escape index `idx` followed by the
/// 7-bit group `next`, then advances `*out_pos` by two.
///
/// The first byte comes from `ESCAPE_B1` (index plus bit 6 of `next`); the
/// second byte is `0x80` combined with the low six bits of `next`.
///
/// # Safety
///
/// * `out_ptr.add(*out_pos)` and `out_ptr.add(*out_pos + 1)` must be valid for
///   writes.
/// * `(idx << 1) | (next >> 6)` must be below 16, because the table lookup is
///   unchecked.
#[inline(always)]
unsafe fn emit_escape_idx(out_ptr: *mut u8, out_pos: &mut usize, idx: u8, next: u8) {
    let lead_key = (usize::from(idx) << 1) | usize::from(next >> 6);
    let start = *out_pos;
    // SAFETY: the caller guarantees two writable bytes at `start` and a lookup
    // key that lies inside `ESCAPE_B1`.
    unsafe {
        let dst = out_ptr.add(start);
        dst.write(*ESCAPE_B1.get_unchecked(lead_key));
        dst.add(1).write(0x80 | (next & 0x3F));
    }
    *out_pos = start + 2;
}
#[inline(always)]
unsafe fn emit_shortened(out_ptr: *mut u8, out_pos: &mut usize, bits: u8) {
let key = ((SHORTENED as usize) << 1) | ((bits >> 6) as usize);
unsafe {
*out_ptr.add(*out_pos) = *ESCAPE_B1.get_unchecked(key);
*out_ptr.add(*out_pos + 1) = 0x80 | (bits & 0x3F);
}
*out_pos += 2;
}
/// Pulls the next 7-bit group out of `tail`, using `acc`/`acc_bits` as a bit
/// buffer and `pos` as the read cursor.
///
/// Returns:
/// * `Some(group)` with a full 7-bit group while enough bits are available;
/// * `Some(group)` with the leftover bits shifted to the top of the group and
///   zero-filled below, once the input is exhausted but bits remain buffered;
/// * `None` when both the input and the bit buffer are empty.
#[inline(always)]
fn pull7_tail(tail: &[u8], pos: &mut usize, acc: &mut u64, acc_bits: &mut u32) -> Option<u8> {
    // Refill the bit buffer one byte at a time until a full group is available
    // or the input runs out.
    while *acc_bits < 7 {
        match tail.get(*pos) {
            Some(&byte) => {
                *acc = (*acc << 8) | u64::from(byte);
                *pos += 1;
                *acc_bits += 8;
            }
            None => break,
        }
    }
    match *acc_bits {
        0 => None,
        have if have < 7 => {
            // Final partial group: left-align the remaining bits.
            let padded = ((*acc << (7 - have)) & 0x7F) as u8;
            *acc = 0;
            *acc_bits = 0;
            Some(padded)
        }
        have => {
            let left = have - 7;
            let group = ((*acc >> left) & 0x7F) as u8;
            // Keep only the bits that were not consumed (none when `left` is 0).
            *acc &= (1_u64 << left) - 1;
            *acc_bits = left;
            Some(group)
        }
    }
}
/// Splits seven bytes (56 bits, most significant bit first) into eight 7-bit
/// groups. Every returned value is below 128.
#[inline(always)]
fn split7(b: [u8; 7]) -> [u8; 8] {
    // Pack the bytes big-endian into the low 56 bits of one word.
    let mut packed = 0u64;
    for &byte in b.iter() {
        packed = (packed << 8) | u64::from(byte);
    }
    // Slice the word into 7-bit windows, highest window first.
    let mut groups = [0u8; 8];
    for (slot, group) in groups.iter_mut().enumerate() {
        *group = ((packed >> (49 - 7 * slot)) & 0x7F) as u8;
    }
    groups
}
/// Upper bound on the number of bytes `encode` writes for `input_len` input
/// bytes: one byte per 7-bit group (`ceil(8 * input_len / 7)`) plus one extra
/// byte for a trailing shortened escape. Returns 0 for empty input.
///
/// The group count is computed as `input_len + ceil(input_len / 7)` so the
/// intermediate value cannot overflow. Multiplying by 8 with a saturating
/// multiply and then dividing by 7 would silently yield a capacity that is
/// too small for very large inputs (for example above 2^29 bytes on a 32-bit
/// target), and the encoder writes through raw pointers. If the final sum
/// does saturate, the result is `usize::MAX`, which `Vec::with_capacity`
/// rejects with a panic instead of under-allocating.
#[inline(always)]
fn encoded_capacity(input_len: usize) -> usize {
    if input_len == 0 {
        return 0;
    }
    let partial_group = usize::from(input_len % 7 != 0);
    let groups = (input_len / 7)
        .saturating_add(partial_group)
        .saturating_add(input_len);
    groups.saturating_add(1)
}
/// Upper bound on the number of bytes `decode` writes for `encoded_len` input
/// bytes: `ceil(7 * encoded_len / 8)` rounded as in the original formula, plus
/// eight bytes of slack.
///
/// `floor((7 * n + 7) / 8)` equals `n - floor(n / 8)` for every `n`, so the
/// bound is computed without the multiplication. A saturating multiply
/// followed by the division would silently return a capacity that is too
/// small for very large inputs, and the decoder writes through raw pointers.
#[inline(always)]
fn decoded_capacity(encoded_len: usize) -> usize {
    (encoded_len - encoded_len / 8).saturating_add(8)
}
/// Slow-path emitter for one block of eight 7-bit groups.
///
/// * If an illegal value is pending from the previous block, it is emitted as
///   an escape sequence together with `groups[0]`, and the pending flag is
///   cleared.
/// * Each remaining group is written verbatim when it is legal; an illegal
///   group is emitted as an escape sequence together with the group after it.
/// * An illegal group in the last position has no follower inside this block;
///   it is stored in `pending_illegal_bits` and `has_pending_illegal` is set.
///
/// # Safety
///
/// * `out_ptr` must be valid for writes at every position from `*out_pos` up
///   to the position reached after emitting this block.
/// * Every element of `groups` and, when `*has_pending_illegal` is set,
///   `*pending_illegal_bits` must be below 128 (unchecked table lookups); a
///   pending value must additionally be one of the illegal values.
#[inline(always)]
unsafe fn process_groups8(
    groups: &[u8; 8],
    out_ptr: *mut u8,
    out_pos: &mut usize,
    pending_illegal_bits: &mut u8,
    has_pending_illegal: &mut bool,
) {
    let mut remaining = groups.iter().copied();
    if *has_pending_illegal {
        // The carried-over illegal value is paired with the first group.
        if let Some(follower) = remaining.next() {
            let carried = usize::from(*pending_illegal_bits);
            // SAFETY: `carried` < 128 and output space are guaranteed by the caller.
            unsafe {
                let idx = *ILLEGAL_IDX_DIRECT.get_unchecked(carried);
                emit_escape_idx(out_ptr, out_pos, idx, follower);
            }
        }
        *has_pending_illegal = false;
    }
    while let Some(group) = remaining.next() {
        // SAFETY: `group` < 128 is guaranteed by the caller.
        let idx = unsafe { *ILLEGAL_IDX_DIRECT.get_unchecked(usize::from(group)) };
        if idx == 0xFF {
            // SAFETY: output space is guaranteed by the caller.
            unsafe { out_ptr.add(*out_pos).write(group) };
            *out_pos += 1;
            continue;
        }
        match remaining.next() {
            // SAFETY: `idx` <= 5, `follower` < 128, output space per caller.
            Some(follower) => unsafe { emit_escape_idx(out_ptr, out_pos, idx, follower) },
            None => {
                // No follower in this block: defer to the next block or the tail.
                *pending_illegal_bits = group;
                *has_pending_illegal = true;
                break;
            }
        }
    }
}
/// Encodes `data` into a UTF-8 string.
///
/// The input is read as a big-endian bit stream and cut into 7-bit groups.
/// A group is written as a single byte unless its value is one of the six
/// illegal values; an illegal group is merged with the following group into a
/// two-byte escape sequence. An illegal group with no follower is written as
/// a "shortened" escape sequence. Empty input yields an empty string.
pub fn encode(data: &[u8]) -> String {
    if data.is_empty() {
        return String::new();
    }
    let mut out = Vec::<u8>::with_capacity(encoded_capacity(data.len()));
    let out_ptr = out.as_mut_ptr();
    let mut written = 0usize;
    // Illegal group left over from the end of the previous 7-byte block.
    let mut carried_bits = 0u8;
    let mut carrying = false;

    let mut blocks = data.chunks_exact(7);
    for block in blocks.by_ref() {
        let mut seven = [0u8; 7];
        seven.copy_from_slice(block);
        let groups = split7(seven);
        if !carrying {
            // Fast path: no group needs escaping, so all eight are copied as-is.
            let flagged = groups
                .iter()
                .fold(0u8, |seen, &g| seen | ILLEGAL_FLAG[usize::from(g)]);
            if flagged == 0 {
                // SAFETY: the buffer was sized by `encoded_capacity`, which
                // allows one byte per group.
                unsafe { copy_nonoverlapping(groups.as_ptr(), out_ptr.add(written), 8) };
                written += 8;
                continue;
            }
        }
        // SAFETY: `split7` only yields values below 128, `carried_bits` is only
        // ever set to such a value, and the buffer was sized by
        // `encoded_capacity`.
        unsafe {
            process_groups8(
                &groups,
                out_ptr,
                &mut written,
                &mut carried_bits,
                &mut carrying,
            );
        }
    }

    // Fewer than seven bytes remain; feed them through the bit buffer.
    let tail = blocks.remainder();
    let (mut tail_pos, mut acc, mut acc_bits) = (0usize, 0u64, 0u32);
    if carrying {
        match pull7_tail(tail, &mut tail_pos, &mut acc, &mut acc_bits) {
            // SAFETY: `carried_bits` < 128; output space per `encoded_capacity`.
            Some(follower) => unsafe {
                emit_escape_idx(
                    out_ptr,
                    &mut written,
                    *ILLEGAL_IDX_DIRECT.get_unchecked(usize::from(carried_bits)),
                    follower,
                );
            },
            // SAFETY: output space per `encoded_capacity`.
            None => unsafe { emit_shortened(out_ptr, &mut written, carried_bits) },
        }
    }
    loop {
        let group = match pull7_tail(tail, &mut tail_pos, &mut acc, &mut acc_bits) {
            Some(group) => group,
            None => break,
        };
        // SAFETY: `pull7_tail` only yields values below 128.
        let idx = unsafe { *ILLEGAL_IDX_DIRECT.get_unchecked(usize::from(group)) };
        if idx == 0xFF {
            // SAFETY: output space per `encoded_capacity`.
            unsafe { out_ptr.add(written).write(group) };
            written += 1;
            continue;
        }
        match pull7_tail(tail, &mut tail_pos, &mut acc, &mut acc_bits) {
            // SAFETY: `idx` <= 5, `follower` < 128; output space as above.
            Some(follower) => unsafe { emit_escape_idx(out_ptr, &mut written, idx, follower) },
            None => {
                // The illegal group is the very last one: use the short form.
                // SAFETY: output space per `encoded_capacity`.
                unsafe { emit_shortened(out_ptr, &mut written, group) };
                break;
            }
        }
    }
    // SAFETY: exactly `written` bytes were initialised, and every byte written
    // is either below 0x80 or part of a lead/continuation pair produced by the
    // escape emitters.
    unsafe {
        out.set_len(written);
        String::from_utf8_unchecked(out)
    }
}
/// Decodes eight consecutive 7-bit groups (56 bits) into seven output bytes.
///
/// The `*acc_bits` bits already buffered in `*acc` are placed in front of the
/// 56 new bits; seven bytes are taken from the top of the combined value and
/// the lowest `*acc_bits` bits stay buffered in `*acc`. `*acc_bits` itself is
/// unchanged and `*out_pos` advances by seven.
///
/// # Safety
///
/// * `groups_ptr` must be valid for reading eight bytes.
/// * `out_ptr.add(*out_pos)` must be valid for writing seven bytes.
/// * NOTE(review): the shifts assume `*acc_bits` < 8 and that `*acc` holds no
///   bits above `*acc_bits` — confirm at call sites.
#[inline(always)]
unsafe fn unpack8groups_scalar(
    groups_ptr: *const u8,
    out_ptr: *mut u8,
    out_pos: &mut usize,
    acc: &mut u64,
    acc_bits: &mut u32,
) {
    // Concatenate the eight groups, first group in the most significant place.
    let mut packed = 0u64;
    for lane in 0..8 {
        // SAFETY: the caller guarantees eight readable bytes at `groups_ptr`.
        let group = unsafe { *groups_ptr.add(lane) };
        packed = (packed << 7) | u64::from(group);
    }
    let carry = *acc_bits as usize;
    let window = (*acc << 56) | packed;
    let base = *out_pos;
    for n in 0..7 {
        // SAFETY: the caller guarantees seven writable bytes at `base`.
        unsafe { out_ptr.add(base + n).write((window >> (carry + 48 - 8 * n)) as u8) };
    }
    *out_pos = base + 7;
    // Keep the `carry` low bits that did not fit into the seven bytes.
    *acc = if carry == 0 {
        0
    } else {
        window & ((1u64 << carry) - 1)
    };
}
/// Appends the low seven bits of `bits` to the bit buffer `acc`/`acc_bits`.
/// When at least eight bits are buffered afterwards, the top eight are written
/// to `out_ptr.add(*out_pos)`, `*out_pos` advances by one and the remaining
/// bits stay in the buffer.
///
/// # Safety
///
/// `out_ptr.add(*out_pos)` must be valid for writing one byte.
#[inline(always)]
unsafe fn push7_scalar(
    out_ptr: *mut u8,
    out_pos: &mut usize,
    acc: &mut u64,
    acc_bits: &mut u32,
    bits: u8,
) {
    let widened = (*acc << 7) | u64::from(bits & 0x7F);
    let have = *acc_bits + 7;
    if have < 8 {
        // Not enough for a whole byte yet; just keep buffering.
        *acc = widened;
        *acc_bits = have;
        return;
    }
    let left = have - 8;
    // SAFETY: the caller guarantees one writable byte at `*out_pos`.
    unsafe { out_ptr.add(*out_pos).write((widened >> left) as u8) };
    *out_pos += 1;
    // Drop the emitted byte, keeping the `left` low bits (none when `left` is 0).
    *acc = widened & ((1_u64 << left) - 1);
    *acc_bits = left;
}
/// Decodes a string produced by [`encode`] back into bytes.
///
/// Bytes below 0x80 contribute seven bits each. A two-byte sequence
/// (`110xxxxx 10xxxxxx`) is an escape: unless its index field equals
/// `SHORTENED`, it first contributes the illegal value selected by the index,
/// and it always contributes the 7-bit group stored in the sequence. Bits
/// left over at the end that do not fill a whole byte are discarded.
///
/// # Errors
///
/// * `"Invalid lead byte"` — a byte >= 0x80 that is not of the form `110xxxxx`.
/// * `"Unexpected end of input"` — a lead byte with nothing after it.
/// * `"Invalid continuation byte"` — the byte after a lead byte is not of the
///   form `10xxxxxx`.
/// * `"Illegal index out of bounds"` — the escape index is neither `SHORTENED`
///   nor a valid position in `ILLEGALS`.
pub fn decode(encoded: &str) -> Result<Vec<u8>, &'static str> {
    let bytes = encoded.as_bytes();
    if bytes.is_empty() {
        return Ok(Vec::new());
    }
    let total = bytes.len();
    let mut out = Vec::<u8>::with_capacity(decoded_capacity(total));
    let dst = out.as_mut_ptr();
    let (mut written, mut acc, mut acc_bits) = (0usize, 0u64, 0u32);
    let mut cursor = 0usize;

    while cursor < total {
        // Fast path: eight plain bytes at a time become seven output bytes.
        while total - cursor >= 8 && bytes[cursor..cursor + 8].is_ascii() {
            // SAFETY: eight readable bytes start at `cursor`; the output
            // buffer was sized by `decoded_capacity`.
            unsafe {
                unpack8groups_scalar(
                    bytes.as_ptr().add(cursor),
                    dst,
                    &mut written,
                    &mut acc,
                    &mut acc_bits,
                );
            }
            cursor += 8;
        }
        // Plain bytes one by one, up to the next escape or the end.
        while let Some(&plain) = bytes.get(cursor) {
            if plain >= 0x80 {
                break;
            }
            cursor += 1;
            // SAFETY: output space per `decoded_capacity`.
            unsafe { push7_scalar(dst, &mut written, &mut acc, &mut acc_bits, plain) };
        }
        let lead = match bytes.get(cursor) {
            Some(&byte) => byte,
            None => break,
        };
        if (lead & 0xE0) != 0xC0 {
            return Err("Invalid lead byte");
        }
        let trail = match bytes.get(cursor + 1) {
            Some(&byte) => byte,
            None => return Err("Unexpected end of input"),
        };
        if (trail & 0xC0) != 0x80 {
            return Err("Invalid continuation byte");
        }
        cursor += 2;

        let escape_index = (lead >> 2) & 7;
        if escape_index != SHORTENED {
            let literal = match ILLEGALS.get(usize::from(escape_index)) {
                Some(&value) => value,
                None => return Err("Illegal index out of bounds"),
            };
            // SAFETY: output space per `decoded_capacity`.
            unsafe { push7_scalar(dst, &mut written, &mut acc, &mut acc_bits, literal) };
        }
        // The group stored in the escape: bit 6 from the lead byte, the low
        // six bits from the continuation byte.
        let follower = ((lead & 1) << 6) | (trail & 0x3F);
        // SAFETY: output space per `decoded_capacity`.
        unsafe { push7_scalar(dst, &mut written, &mut acc, &mut acc_bits, follower) };
    }
    // SAFETY: exactly `written` bytes of the buffer were initialised.
    unsafe { out.set_len(written) };
    Ok(out)
}
// Round-trip and error-path tests for `encode` / `decode`.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    // Empty input maps to the empty string and back.
    #[test]
    fn test_empty() {
        assert_eq!(encode(b""), "");
        assert_eq!(decode("").unwrap(), b"");
    }

    #[test]
    fn test_hello_world() {
        let data = b"hello world";
        let enc = encode(data);
        let dec = decode(&enc).expect("decoding failed");
        assert_eq!(dec, data);
    }

    // Every possible single-byte input must survive a round trip.
    #[test]
    fn test_single_byte_values() {
        for b in 0..=255u8 {
            let data = vec![b];
            let enc = encode(&data);
            // The message is only built on failure; `expect(&format!(..))`
            // would allocate a string on every iteration.
            let dec = decode(&enc)
                .unwrap_or_else(|err| panic!("decoding failed for byte {}: {:?}", b, err));
            assert_eq!(dec, data, "failed for byte {}", b);
        }
    }

    // Lengths around multiples of 7 and 8 exercise both the block path and
    // the tail path of the encoder and decoder.
    #[test]
    fn test_various_lengths_roundtrip() {
        for len in [
            0, 1, 2, 3, 6, 7, 8, 9, 14, 15, 16, 17, 31, 32, 33, 100, 255, 256, 511, 512,
        ] {
            let data: Vec<u8> = (0..len).map(|i| (i % 251) as u8).collect();
            let enc = encode(&data);
            let dec = decode(&enc).expect("decoding failed");
            assert_eq!(dec, data, "roundtrip failed for length {}", len);
        }
    }

    // Input bytes equal to the six illegal values.
    #[test]
    fn test_all_illegal_bytes_handling() {
        let data = b"\x00\x0A\x0D\x22\x26\x5C";
        let enc = encode(data);
        let dec = decode(&enc).expect("decoding failed");
        assert_eq!(dec, data.as_ref());
    }

    #[test]
    fn test_mixed_content() {
        let data: Vec<u8> = (0..=255).collect();
        let enc = encode(&data);
        let dec = decode(&enc).expect("decoding failed");
        assert_eq!(dec, data);
    }

    // All-zero input: every 7-bit group is an illegal value.
    #[test]
    fn test_repeated_illegal_bytes() {
        let data = vec![0u8; 100];
        let enc = encode(&data);
        let dec = decode(&enc).expect("decoding failed");
        assert_eq!(dec, data);
    }

    // NOTE(review): the three tests below build a `String` from bytes that are
    // not valid UTF-8 via `from_utf8_unchecked`, which breaks that function's
    // documented safety contract. They rely on `decode` only inspecting raw
    // bytes.
    #[test]
    fn test_decode_invalid_lead_byte() {
        let invalid = vec![0x80u8];
        let s = unsafe { String::from_utf8_unchecked(invalid) };
        assert!(decode(&s).is_err());
        let invalid2 = vec![0xFFu8];
        let s2 = unsafe { String::from_utf8_unchecked(invalid2) };
        assert!(decode(&s2).is_err());
    }

    #[test]
    fn test_decode_truncated_escape() {
        // A lead byte with nothing after it.
        let mut data = vec![0xC0u8];
        let s = unsafe { String::from_utf8_unchecked(data.clone()) };
        assert!(decode(&s).is_err());
        // A lead byte followed by a byte that is not a continuation byte.
        data.push(0x40);
        let s2 = unsafe { String::from_utf8_unchecked(data) };
        assert!(decode(&s2).is_err());
    }

    #[test]
    fn test_decode_invalid_continuation_byte() {
        let data = vec![0xC2u8, 0xFF];
        let s = unsafe { String::from_utf8_unchecked(data) };
        assert!(decode(&s).is_err());
    }

    // A single zero byte round-trips through the escape path.
    #[test]
    fn test_shortened_at_end() {
        let data = vec![0u8];
        let enc = encode(&data);
        let dec = decode(&enc).expect("decode failed");
        assert_eq!(dec, data);
    }

    #[test]
    fn test_very_long_input() {
        let data = vec![0x55u8; 100000];
        let enc = encode(&data);
        let dec = decode(&enc).expect("decode failed");
        assert_eq!(dec, data);
    }
}