use std::mem::size_of;
use std::ops::{BitOrAssign, ShlAssign};
use arbitrary::Unstructured;
#[rustfmt::skip]
/// Code-page tables and tag constants shared by the encoder and decoder.
///
/// Every page is exactly 64 entries so that an index fits in six bits.
/// Index 0 of each usable page is `'\0'`, which doubles as the in-band
/// end-of-string code, and unused trailing slots are padded with
/// `'\u{ffff}'`. Entries must stay in ascending code-point order because the
/// encoder locates characters with `binary_search`.
mod consts {
    /// ASCII digits, upper-case letters, underscore and lower-case letters.
    pub(crate) const LATIN : [char; 64] = [
        '\0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E',
        'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
        'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
        'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
    ];

    /// Greek letters, including the accented (tonos) vowels.
    pub(crate) const GREEK : [char; 64] = [
        '\0',
        'Ά', 'Έ', 'Ή', 'Ί', 'Ό', 'Ύ', 'Ώ',
        'Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', 'Θ', 'Ι', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', 'Π',
        'Ρ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', 'Ψ', 'Ω',
        'ά', 'έ', 'ή', 'ί',
        'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
        'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω',
        'ό', 'ύ', 'ώ'
    ];

    /// Cyrillic upper- and lower-case letters (lower-case hard sign omitted
    /// to fit in 64 slots).
    pub(crate) const CYRILLIC : [char; 64] = [
        '\0', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О',
        'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю',
        'Я', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о',
        'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ы', 'ь', 'э', 'ю', 'я'
    ];

    /// Hebrew points, punctuation, letters and ligatures; padded at the end.
    pub(crate) const HEBREW : [char; 64] = [
        '\0', 'ְ', 'ֱ', 'ֲ', 'ֳ', 'ִ', 'ֵ', 'ֶ', 'ַ', 'ָ', 'ֹ', 'ֺ', 'ֻ', 'ּ', 'ֽ', '־',
        'ֿ', '׀', 'ׁ', 'ׂ', '׃', 'ׄ', 'ׅ', '׆', 'ׇ', 'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז',
        'ח', 'ט', 'י', 'ך', 'כ', 'ל', 'ם', 'מ', 'ן', 'נ', 'ס', 'ע', 'ף', 'פ', 'ץ', 'צ',
        'ק', 'ר', 'ש', 'ת', 'װ', 'ױ', 'ײ', '׳', '״',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}',
    ];

    /// Arabic punctuation, letters, combining marks and a set of extended
    /// letters; padded at the end.
    pub(crate) const ARABIC : [char; 64] = [
        '\0',
        '،',
        '؛',
        '؟',
        'ء',
        'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ',
        'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض',
        'ط', 'ظ', 'ع', 'غ',
        'ف', 'ق',
        'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي',
        '\u{064e}', '\u{064f}', '\u{0650}', '\u{0651}',
        '\u{0653}', '\u{0654}',
        '\u{0656}',
        '\u{0657}',
        '\u{0670}',
        'ٹ',
        'پ',
        'چ',
        'ڈ',
        'ڑ',
        'ژ',
        'ک',
        'گ',
        'ں',
        'ھ',
        'ے',
        '۔',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}',
    ];

    /// Devanagari signs, vowels, consonants, vowel signs and dandas; one
    /// padding slot at the end.
    pub(crate) const DEVANAGARI : [char; 64] = [
        '\0',
        'ँ', 'ं', 'ः',
        'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ',
        'ए', 'ऐ',
        'ओ', 'औ',
        'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त',
        'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'ळ', 'व', 'श', 'ष',
        'स','ह',
        '़',
        'ा', 'ि', 'ी', 'ु', 'ू', 'ृ',
        'े', 'ै',
        'ो', 'ौ',
        '्',
        '।', '॥',
        '\u{ffff}',
    ];

    /// Hangul compatibility jamo; padded at the end.
    pub(crate) const HANGUL_COMPATIBILITY_JAMO : [char; 64] = [
        '\0', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ',
        'ㅀ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ', 'ㅏ',
        'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ',
        'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
    ];

    /// Halfwidth katakana and punctuation, U+FF61..=U+FF9F.
    ///
    /// The entries are written as escapes on purpose: width-normalising tools
    /// turn the literal halfwidth characters into their fullwidth
    /// counterparts, whose code points are NOT in this order and would break
    /// the binary search used by the encoder.
    pub(crate) const HALFWIDTH_KANA : [char; 64] = [
        // terminator, then U+FF61..=U+FF6F: punctuation, WO and the small kana
        '\0',       '\u{ff61}', '\u{ff62}', '\u{ff63}', '\u{ff64}', '\u{ff65}', '\u{ff66}', '\u{ff67}',
        '\u{ff68}', '\u{ff69}', '\u{ff6a}', '\u{ff6b}', '\u{ff6c}', '\u{ff6d}', '\u{ff6e}', '\u{ff6f}',
        // U+FF70..=U+FF7F: prolonged sound mark, A through SO
        '\u{ff70}', '\u{ff71}', '\u{ff72}', '\u{ff73}', '\u{ff74}', '\u{ff75}', '\u{ff76}', '\u{ff77}',
        '\u{ff78}', '\u{ff79}', '\u{ff7a}', '\u{ff7b}', '\u{ff7c}', '\u{ff7d}', '\u{ff7e}', '\u{ff7f}',
        // U+FF80..=U+FF8F: TA through MA
        '\u{ff80}', '\u{ff81}', '\u{ff82}', '\u{ff83}', '\u{ff84}', '\u{ff85}', '\u{ff86}', '\u{ff87}',
        '\u{ff88}', '\u{ff89}', '\u{ff8a}', '\u{ff8b}', '\u{ff8c}', '\u{ff8d}', '\u{ff8e}', '\u{ff8f}',
        // U+FF90..=U+FF9F: MI through N, voiced and semi-voiced sound marks
        '\u{ff90}', '\u{ff91}', '\u{ff92}', '\u{ff93}', '\u{ff94}', '\u{ff95}', '\u{ff96}', '\u{ff97}',
        '\u{ff98}', '\u{ff99}', '\u{ff9a}', '\u{ff9b}', '\u{ff9c}', '\u{ff9d}', '\u{ff9e}', '\u{ff9f}'
    ];

    /// Placeholder for page numbers that have no table: every slot is the
    /// `'\u{ffff}'` padding value, so no real character is ever found in it.
    pub(crate) const RESERVED : [char; 64] = [
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
        '\u{ffff}', '\u{ffff}', '\u{ffff}', '\u{ffff}',
    ];

    /// The ideograph page has no six-bit table: characters in
    /// `CHINESE_LO..=CHINESE_HI` are stored as 15-bit offsets instead.
    pub(crate) const CHINESE: [char; 64] = RESERVED;
    /// First code point handled by the 15-bit encoding (U+4E00).
    pub(crate) const CHINESE_LO: char = '\u{4e00}';
    /// Last code point handled by the 15-bit encoding (U+9FFF).
    pub(crate) const CHINESE_HI: char = '\u{9fff}';
    /// Tag written for ideograph strings when the packed type has a 2-bit tag.
    pub(crate) const CHINESE_2BIT_TAG: usize = 0b11;
    /// Tag written for ideograph strings when the packed type has a 4-bit tag;
    /// also the index of `CHINESE` in `PAGES`.
    pub(crate) const CHINESE_4BIT_TAG: usize = 0b1100;

    /// All pages, indexed by page number (the 4-bit tag value).
    pub(crate) const PAGES: [[char; 64]; 16] = [
        LATIN,
        GREEK,
        CYRILLIC,
        HEBREW,
        ARABIC,
        RESERVED,
        RESERVED,
        RESERVED,
        DEVANAGARI,
        RESERVED,
        RESERVED,
        HANGUL_COMPATIBILITY_JAMO,
        CHINESE,
        RESERVED,
        RESERVED,
        HALFWIDTH_KANA,
    ];
}
use consts::*;
/// An unsigned integer that can hold a packed string.
///
/// Reading from the most significant bit down, a packed value consists of a
/// code-page tag of `NTAGBITS` bits followed by `NCHARBITS` bits of character
/// codes: either `NCHARS` six-bit codes or, for ideographs, up to
/// `NWIDECHARS` fifteen-bit codes.
pub trait PackedValue
where
    Self: Copy,
    Self: ShlAssign<usize>,
    Self: BitOrAssign<Self>,
    Self: ::std::cmp::PartialOrd,
    Self: ::std::fmt::Debug,
    Self: ::std::fmt::LowerHex,
{
    /// Width of the type in bits.
    const NBITS: usize = size_of::<Self>() * 8;
    /// Number of six-bit characters that fit.
    const NCHARS: usize = Self::NBITS / 6;
    /// Bits left over after `NCHARS` characters; they hold the page tag.
    const NTAGBITS: usize = Self::NBITS - (Self::NCHARS * 6);
    /// Bits available for character codes.
    const NCHARBITS: usize = Self::NBITS - Self::NTAGBITS;
    /// Number of fifteen-bit characters that fit after the tag.
    const NWIDECHARS: usize = (Self::NBITS - Self::NTAGBITS) / 15;

    /// Converts `i` to `Self`, discarding any bits that do not fit.
    fn truncating_cast_from(i: usize) -> Self;

    /// Returns the top eight bits of the value.
    fn most_significant_byte(self) -> u8;

    /// Builds a valid packed value from fuzzer-provided bytes.
    ///
    /// A page is chosen first (only pages reachable with this type's tag
    /// width), then a random-length string of characters from that page is
    /// generated and encoded.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `u` while drawing values.
    ///
    /// # Panics
    ///
    /// Panics if the generated string fails to encode, which would indicate a
    /// bug in the generator or the page tables.
    fn arbitrary<'a>(u: &mut Unstructured<'a>) -> arbitrary::Result<Self> {
        let page_num = if Self::NTAGBITS == 2 {
            // A 2-bit tag can only address every fourth page.
            u.int_in_range::<usize>(0..=3)? * 4
        } else {
            *u.choose::<usize>(&[0, 1, 2, 3, 4, 8, 11, 12, 15])?
        };

        // Sized from the type rather than a fixed-length array, so that an
        // implementor wider than 128 bits cannot index out of bounds.
        let mut chars: Vec<char> = Vec::with_capacity(Self::NCHARS);
        if page_num == CHINESE_4BIT_TAG {
            let len = u.int_in_range(0..=Self::NWIDECHARS)?;
            for _ in 0..len {
                let code = u.int_in_range((CHINESE_LO as u32)..=(CHINESE_HI as u32))?;
                chars.push(
                    char::from_u32(code)
                        .expect("U+4E00..=U+9FFF contains only valid scalar values"),
                );
            }
        } else {
            let len = u.int_in_range(0..=Self::NCHARS)?;
            for _ in 0..len {
                let ch = PAGES[page_num][u.int_in_range(1..=63)?];
                // Landing on a padding slot ends the string early.
                if ch == '\u{ffff}' {
                    break;
                }
                chars.push(ch);
            }
        }

        Ok(encode::<Self, _>(chars.into_iter()).expect("sixbit::PackedValue::arbitrary"))
    }
}
/// Packed-string support for `u8`.
impl PackedValue for u8 {
    /// Keeps the low 8 bits of `i`.
    fn truncating_cast_from(i: usize) -> Self {
        i as Self
    }

    /// A `u8` is its own (only) byte.
    fn most_significant_byte(self) -> u8 {
        self.to_be_bytes()[0]
    }
}
/// Packed-string support for `u16`.
impl PackedValue for u16 {
    /// Keeps the low 16 bits of `i`.
    fn truncating_cast_from(i: usize) -> Self {
        i as Self
    }

    /// Returns bits 15..=8.
    fn most_significant_byte(self) -> u8 {
        // Big-endian byte order puts the high byte first.
        self.to_be_bytes()[0]
    }
}
/// Packed-string support for `u32`.
impl PackedValue for u32 {
    /// Keeps the low 32 bits of `i`.
    fn truncating_cast_from(i: usize) -> Self {
        i as Self
    }

    /// Returns bits 31..=24.
    fn most_significant_byte(self) -> u8 {
        // Big-endian byte order puts the high byte first.
        self.to_be_bytes()[0]
    }
}
/// Packed-string support for `u64`.
impl PackedValue for u64 {
    /// Converts `i` with an `as` cast (truncating if `usize` is wider).
    fn truncating_cast_from(i: usize) -> Self {
        i as Self
    }

    /// Returns bits 63..=56.
    fn most_significant_byte(self) -> u8 {
        // Big-endian byte order puts the high byte first.
        self.to_be_bytes()[0]
    }
}
/// Packed-string support for `u128`.
impl PackedValue for u128 {
    /// Widens `i` with an `as` cast.
    fn truncating_cast_from(i: usize) -> Self {
        i as Self
    }

    /// Returns bits 127..=120.
    fn most_significant_byte(self) -> u8 {
        // Big-endian byte order puts the high byte first.
        self.to_be_bytes()[0]
    }
}
/// Reasons a string cannot be packed into an integer.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum EncodeError {
    /// The string has more characters than the target integer can hold.
    TooLong,
    /// The first character does not belong to any code page.
    NoCodePageFor(char),
    /// The first character's page (the payload is its page number) cannot be
    /// selected by a target type that only has a 2-bit tag.
    PageUnavailable(usize),
    /// A later character is not in the page selected by the first character.
    MissingFromPage(char),
}

impl ::std::fmt::Display for EncodeError {
    fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result {
        match *self {
            EncodeError::TooLong => {
                write!(f, "string is too long for the packed value")
            }
            EncodeError::NoCodePageFor(c) => {
                write!(f, "no code page contains character {:?}", c)
            }
            EncodeError::PageUnavailable(page) => {
                write!(f, "code page {} cannot be selected with a 2-bit tag", page)
            }
            EncodeError::MissingFromPage(c) => {
                write!(f, "character {:?} is not in the selected code page", c)
            }
        }
    }
}

// Lets callers use `?` to convert into `Box<dyn Error>` and similar.
impl ::std::error::Error for EncodeError {}
/// Returns the offset of `c` from `CHINESE_LO`, or `None` when `c` lies
/// outside `CHINESE_LO..=CHINESE_HI`.
fn chinese_15bit_delta(c: char) -> Option<usize> {
    let code = c as usize;
    // `checked_sub` rejects everything below the range; the guard rejects
    // everything above it.
    match code.checked_sub(CHINESE_LO as usize) {
        Some(delta) if code <= CHINESE_HI as usize => Some(delta),
        _ => None,
    }
}
/// Packs the characters produced by `i` into an integer of type `N`.
///
/// The first character selects the encoding for the whole string:
///
/// * a character in `CHINESE_LO..=CHINESE_HI` selects the wide encoding, in
///   which every character is stored as a 15-bit value (offset from
///   `CHINESE_LO`, plus one so that zero can mean "end of string");
/// * any other character selects the first page in `PAGES` that contains it,
///   and every character is stored as its 6-bit index within that page.
///
/// The page tag occupies the most significant bits and the character codes
/// follow, left-aligned, with unused low bits left as zero. An empty input
/// encodes to zero.
///
/// # Errors
///
/// * [`EncodeError::TooLong`] if the characters do not fit in `N`.
/// * [`EncodeError::NoCodePageFor`] if the first character is in no page
///   (this includes the reserved characters `'\0'` and `'\u{ffff}'`).
/// * [`EncodeError::PageUnavailable`] if `N` has a 2-bit tag and the page
///   number is not a multiple of four.
/// * [`EncodeError::MissingFromPage`] if a later character is not part of the
///   encoding selected by the first (again including the reserved
///   characters).
pub fn encode<N, IT>(i: IT) -> Result<N, EncodeError>
where
    N: PackedValue,
    IT: Iterator<Item = char>,
{
    // '\0' occupies index 0 of each page, which is the terminator code, and
    // '\u{ffff}' pads unused page slots. Both are present in the tables, so a
    // plain lookup would accept them and yield a value that does not decode
    // back to the input. They are therefore never encodable.
    fn is_reserved(c: char) -> bool {
        c == '\0' || c == '\u{ffff}'
    }

    let mut pi = i.peekable();
    let mut out: N = N::truncating_cast_from(0);
    let init = match pi.peek() {
        None => return Ok(out),
        Some(&c) => c,
    };

    // Wide (15 bits per character) encoding.
    if N::NCHARBITS > 0 && chinese_15bit_delta(init).is_some() {
        let tag = if N::NTAGBITS == 2 {
            CHINESE_2BIT_TAG
        } else {
            CHINESE_4BIT_TAG
        };
        out |= N::truncating_cast_from(tag);
        let mut rembits: usize = N::NCHARBITS;
        for c in pi {
            if rembits < 15 {
                return Err(EncodeError::TooLong);
            }
            let delta = chinese_15bit_delta(c).ok_or(EncodeError::MissingFromPage(c))?;
            out <<= 15;
            // +1 keeps the all-zero code free to act as the terminator.
            out |= N::truncating_cast_from(delta + 1);
            rembits -= 15;
        }
        // Left-align so the tag ends up in the most significant bits.
        out <<= rembits;
        return Ok(out);
    }

    // Paged (6 bits per character) encoding.
    if is_reserved(init) {
        return Err(EncodeError::NoCodePageFor(init));
    }
    let page = PAGES
        .iter()
        .position(|p| p.binary_search(&init).is_ok())
        .ok_or(EncodeError::NoCodePageFor(init))?;

    let mut tag = page;
    if N::NTAGBITS == 2 {
        // Only pages 0, 4, 8 and 12 are addressable with two tag bits.
        if tag & 0b11 != 0 {
            return Err(EncodeError::PageUnavailable(tag));
        }
        tag >>= 2;
    }
    out |= N::truncating_cast_from(tag);

    let mut rem: usize = N::NCHARS;
    for c in pi {
        if rem == 0 {
            return Err(EncodeError::TooLong);
        }
        if is_reserved(c) {
            return Err(EncodeError::MissingFromPage(c));
        }
        let code = PAGES[page]
            .binary_search(&c)
            .map_err(|_| EncodeError::MissingFromPage(c))?;
        out <<= 6;
        out |= N::truncating_cast_from(code);
        rem -= 1;
    }
    // Left-align so the tag ends up in the most significant bits.
    out <<= 6 * rem;
    Ok(out)
}
/// Extension trait that adds a packing method to character iterators.
pub trait EncodeSixbit
where
    Self: Sized + Iterator<Item = char>,
{
    /// Consumes the iterator and packs its characters into an `N`.
    fn encode_sixbit<N>(self) -> Result<N, EncodeError>
    where
        N: PackedValue;
}
/// Every iterator over `char` can be packed.
impl<T: Iterator<Item = char>> EncodeSixbit for T {
    /// Forwards to the free function `encode`.
    fn encode_sixbit<N: PackedValue>(self) -> Result<N, EncodeError> {
        encode(self)
    }
}
/// Iterator yielding the characters stored in a packed value.
pub struct DecodeSixbitIter<N>
where
    N: PackedValue,
{
    /// Page number taken from the value's tag bits.
    tag: usize,
    /// Character bits not yet decoded, with the next code in the top bits.
    tmp: N,
}
/// Decodes one character per call from the top bits of the remaining value.
impl<N> Iterator for DecodeSixbitIter<N>
where
    N: PackedValue,
{
    type Item = char;

    /// Returns the next character, or `None` once a zero code is reached.
    ///
    /// For the ideograph tag a character is a 15-bit code holding its offset
    /// from `CHINESE_LO` plus one; for every other tag it is a 6-bit index
    /// into the tagged page.
    fn next(&mut self) -> Option<char> {
        if self.tag == CHINESE_4BIT_TAG {
            // A type with fewer than 15 character bits cannot hold a wide
            // character, and shifting it by 8 or 15 could overflow.
            if N::NWIDECHARS == 0 {
                return None;
            }

            // Assemble the full 15-bit code from the top two bytes before
            // deciding whether it is the terminator. Testing only the top
            // byte would wrongly stop at any code below 128, i.e. at every
            // character within the first 127 code points of the range.
            let hi = u32::from(self.tmp.most_significant_byte());
            let mut rest = self.tmp;
            rest <<= 8;
            let lo = u32::from(rest.most_significant_byte());
            let code = (hi << 7) | (lo >> 1);
            if code == 0 {
                return None;
            }

            self.tmp <<= 15;
            char::from_u32((CHINESE_LO as u32) + code - 1)
        } else {
            // The next six-bit code sits in the top six bits.
            let code = self.tmp.most_significant_byte() >> 2;
            if code == 0 {
                return None;
            }
            self.tmp <<= 6;
            Some(PAGES[self.tag][usize::from(code)])
        }
    }
}
/// Extension trait that adds an unpacking method to packed integers.
pub trait DecodeSixbit: PackedValue {
    /// Returns an iterator over the characters stored in `self`.
    fn decode_sixbit(self) -> DecodeSixbitIter<Self>;
}
/// Every `PackedValue` can be decoded.
impl<N: PackedValue> DecodeSixbit for N {
    /// Splits the value into its page tag and its character bits.
    fn decode_sixbit(self) -> DecodeSixbitIter<Self> {
        // The tag lives in the top `NTAGBITS` bits of the value.
        let raw_tag = self.most_significant_byte() >> (8 - N::NTAGBITS);
        // A 2-bit tag addresses every fourth page, so scale it back up to a
        // page number.
        let page = if N::NTAGBITS == 2 {
            raw_tag << 2
        } else {
            raw_tag
        };

        // Drop the tag so the first character code is left-aligned.
        let mut remaining = self;
        remaining <<= N::NTAGBITS;

        DecodeSixbitIter {
            tag: usize::from(page),
            tmp: remaining,
        }
    }
}
#[cfg(test)]
mod tests {
    use rand::RngCore;
    use super::*;

    /// Checks the structural assumptions the codec makes about `PAGES`:
    /// pages appear in ascending order of their first real character, each
    /// page starts with `'\0'` or padding, and the real characters within a
    /// page are strictly ascending.
    #[test]
    fn misc_invariants() {
        for pair in PAGES.windows(2) {
            if pair[0][1] != '\u{ffff}' && pair[1][1] != '\u{ffff}' {
                if pair[0][1] >= pair[1][1] {
                    println!(
                        "mis-ordered page pair: {:?} >= {:?}",
                        pair[0][1], pair[1][1]
                    );
                }
                assert!(pair[0][1] < pair[1][1]);
            }
        }
        for p in PAGES.iter() {
            assert!(p[0] == '\0' || p[0] == '\u{ffff}');
            for pair in p.windows(2) {
                if pair[0] != '\0'
                    && pair[1] != '\0'
                    && pair[0] != '\u{ffff}'
                    && pair[1] != '\u{ffff}'
                {
                    if pair[0] >= pair[1] {
                        println!("mis-ordered char pair: {:?} >= {:?}", pair[0], pair[1]);
                    }
                    assert!(pair[0] < pair[1]);
                }
            }
        }
    }

    /// Encodes `s` into an `N`, and if that succeeds asserts that decoding
    /// gives `s` back. Returns the encoding result.
    fn round_trip<N: PackedValue>(s: &str) -> Result<N, EncodeError> {
        let enc = s.chars().encode_sixbit::<N>()?;
        let dec: String = enc.decode_sixbit().collect();
        println!("roundtrip: {:?} => {:x} => {:?}", s, enc, dec);
        assert!(dec == s);
        Ok(enc)
    }

    #[test]
    fn test_latin() {
        assert!(round_trip::<u128>("PRINTER_is_on_FIRE").is_ok());
        assert!(round_trip::<u64>("NO_CARRIER").is_ok());
        assert!(round_trip::<u32>("_CAT_").is_ok());
        assert!(round_trip::<u16>("OK").is_ok());
        assert!(round_trip::<u8>("1").is_ok());
        assert!(round_trip::<u128>("Printer_Working").is_ok());
        assert!(round_trip::<u64>("ATDT_123").is_ok());
        assert!(round_trip::<u32>("Uwu").is_ok());
        assert!(round_trip::<u16>("A").is_ok());
        assert!(round_trip::<u8>("").is_ok());
        assert!(round_trip::<u128>("PRINTER_FULLY_OPERATIONAL") == Err(EncodeError::TooLong));
        assert!(round_trip::<u64>("ATDT_123_4567") == Err(EncodeError::TooLong));
        assert!(round_trip::<u32>("aaaaaaa") == Err(EncodeError::TooLong));
        assert!(round_trip::<u16>("aba") == Err(EncodeError::TooLong));
        assert!(round_trip::<u8>("OOH") == Err(EncodeError::TooLong));
        assert!(round_trip::<u128>("©2018") == Err(EncodeError::NoCodePageFor('©')));
        assert!(round_trip::<u128>("ΨΩ") == Err(EncodeError::PageUnavailable(1)));
        assert!(round_trip::<u64>("sh@rk") == Err(EncodeError::MissingFromPage('@')));
    }

    /// Asserts that `a` sorts before `b` both as strings and as packed values.
    fn check_order<N: PackedValue>(a: &str, b: &str) {
        assert!(a < b);
        assert!(a.chars().encode_sixbit::<N>().unwrap() < b.chars().encode_sixbit::<N>().unwrap());
    }

    #[test]
    fn sorting() {
        check_order::<u32>("", "AB");
        check_order::<u64>("abcd", "abcde");
        check_order::<u64>("abcde", "abcdf");
        check_order::<u64>("α", "αβγ");
        check_order::<u64>("αβ", "αβγ");
        check_order::<u64>("αβγ", "αβδ");
        check_order::<u64>("怎么", "怎么样");
        check_order::<u64>("以前", "以后");
        check_order::<u64>("abc", "αβγ");
        check_order::<u64>("αβγ", "абв");
        check_order::<u64>("абв", "אבג");
        check_order::<u64>("אבג", "ابة");
        check_order::<u64>("ابة", "कखग");
        check_order::<u64>("कखग", "ㄱㄲㄳ");
        check_order::<u64>("ㄱㄲㄳ", "合伙人");
        // Halfwidth WO, small A, small I (U+FF66..=U+FF68). The escapes keep
        // the halfwidth code points intact: the fullwidth look-alikes sort
        // *before* the ideographs and would fail the `a < b` precondition.
        check_order::<u64>("合伙人", "\u{ff66}\u{ff67}\u{ff68}");
    }

    #[test]
    fn test_greek() {
        assert!(round_trip::<u64>("αλήθεια").is_ok());
        assert!(round_trip::<u16>("γη").is_ok());
    }

    #[test]
    fn test_cyrillic() {
        assert!(round_trip::<u64>("содержать").is_ok());
        assert!(round_trip::<u16>("же").is_ok());
    }

    #[test]
    fn test_hebrew() {
        assert!(round_trip::<u64>("לעשות").is_ok());
        assert!(round_trip::<u16>("כל").is_ok());
    }

    #[test]
    fn test_arabic() {
        assert!(round_trip::<u128>("محافظت").is_ok());
        assert!(round_trip::<u64>("العاصمة").is_ok());
        assert!(round_trip::<u32>("البعض").is_ok());
        assert!(round_trip::<u16>("از").is_ok());
        assert!(round_trip::<u8>("و").is_ok());
    }

    #[test]
    fn test_devanagari() {
        assert!(round_trip::<u128>("किंकर्तव्यविमूढ़").is_ok());
        assert!(round_trip::<u64>("आवश्यकता").is_ok());
        assert!(round_trip::<u32>("सपना").is_ok());
        assert!(round_trip::<u16>("पल").is_ok());
        assert!(round_trip::<u8>("आ").is_ok());
    }

    #[test]
    fn test_chinese() {
        assert!(round_trip::<u128>("高速火车站").is_ok());
        assert!(round_trip::<u64>("合伙人").is_ok());
        assert!(round_trip::<u32>("同事").is_ok());
    }

    #[test]
    fn test_compatibility_hangul_jamo() {
        assert!(round_trip::<u64>("ㅇㅜㅁㅈㅣㄱㅇㅣㅁ").is_ok());
        assert!(round_trip::<u16>("ㅅㅜ").is_ok());
    }

    #[test]
    fn test_halfwidth_kana() {
        // Halfwidth I, KU, TU, KA, NO — written as escapes so that the
        // halfwidth code points cannot be width-normalised away.
        assert!(round_trip::<u64>("\u{ff72}\u{ff78}\u{ff82}\u{ff76}\u{ff89}").is_ok());
        // Halfwidth YA, RU.
        assert!(round_trip::<u16>("\u{ff94}\u{ff99}").is_ok());
    }

    /// Smoke test: generating and decoding arbitrary packed values must not
    /// panic. The values are only printed, not checked.
    #[test]
    fn test_arbitrary() {
        for _ in 0..64 {
            let mut bytes = [0u8; 1024];
            rand::thread_rng().fill_bytes(&mut bytes);
            let mut u = arbitrary::Unstructured::new(&bytes);
            let packed = <u128 as PackedValue>::arbitrary(&mut u).expect("arbitrary");
            dbg!(packed);
            let decoded: String = packed.decode_sixbit().collect();
            dbg!(decoded);
        }
    }
}