use crate::unicode_encoding::UnicodeEncodingError::*;
use crate::unicode_encoding::UnicodeEncodingError;
use crate::unicode_encoding::UnicodeEncoding;
use crate::utf_32::Utf32;
/// A byte string holding text encoded as UTF-1.
///
/// The bytes are stored exactly as they would be serialized; UTF-1 is a
/// byte-oriented encoding, so no endianness applies to `data`.
pub struct Utf1 {
    /// Raw UTF-1 encoded bytes, one glyph sequence after another.
    pub data: Vec<u8>,
}
impl Utf1 {
    /// Walks `self.data` one UTF-1 sequence at a time and reports the first
    /// decoding error met.
    ///
    /// Returns `NoError` when every sequence decodes, otherwise the error
    /// produced by `utf_1_glyph_to_utf_32` for the first offending sequence.
    fn check_sanity_utf1(&self) -> UnicodeEncodingError {
        let mut cursor: usize = 0;
        while cursor < self.data.len() {
            match utf_1_glyph_to_utf_32(&self.data, cursor) {
                // Only the sequence length matters here; the glyph is discarded.
                Ok((_, consumed)) => cursor += consumed,
                Err(error) => return error,
            }
            // Defensive: a successful decode must never step past the buffer.
            check_index_ok(cursor, &self.data);
        }
        NoError
    }
}
impl UnicodeEncoding for Utf1 {
    /// Encodes every glyph of `data_utf_32`, in order, into one UTF-1 byte string.
    fn from_utf_32(data_utf_32: &Utf32) -> Self {
        let encoded: Vec<u8> = data_utf_32
            .data
            .iter()
            .flat_map(|&glyph| utf_32_glyph_to_utf_1(glyph))
            .collect();
        Utf1 { data: encoded }
    }

    /// Decodes the stored bytes back into UTF-32 glyphs.
    ///
    /// # Panics
    ///
    /// Panics if a sequence cannot be decoded. Data built through
    /// `from_bytes_no_check` or `from_utf_32` has already been through the
    /// same decoder/encoder, but `data` is a public field and can be set to
    /// anything.
    fn to_utf_32(&self) -> Utf32 {
        let mut glyphs: Vec<u32> = Vec::new();
        let mut cursor: usize = 0;
        while cursor < self.data.len() {
            match utf_1_glyph_to_utf_32(&self.data, cursor) {
                Ok((glyph, consumed)) => {
                    glyphs.push(glyph);
                    cursor += consumed;
                }
                Err(_) => {
                    panic!("[UNICODE ENCODING ERROR] Invalid UTF-1 glyph. This should not have happen as a MissingEncodedBytes should have been raised earlier..");
                }
            }
            check_index_ok(cursor, &self.data);
        }
        Utf32 { data: glyphs }
    }

    /// Returns a copy of the encoded bytes.
    ///
    /// `_big_endian` is ignored by this implementation.
    fn to_bytes(&self, _big_endian: bool) -> Vec<u8> {
        self.data.clone()
    }

    /// Wraps `bytes` as UTF-1 after verifying that every sequence decodes.
    ///
    /// `_big_endian` is ignored by this implementation.
    ///
    /// # Errors
    ///
    /// Returns the error reported by `check_sanity_utf1` when it is anything
    /// other than `NoError`.
    fn from_bytes_no_check(bytes: &[u8], _big_endian: bool) -> Result<Self, UnicodeEncodingError> {
        let candidate = Utf1 { data: bytes.to_vec() };
        match candidate.check_sanity_utf1() {
            NoError => Ok(candidate),
            error => Err(error),
        }
    }
}
/// Radix of the "digits" carried by the trailing bytes of multi-byte sequences.
const UTF_1_MODULO: u32 = 0xBE;

/// Lowest lead byte of a two-byte sequence whose trailing byte is a radix digit.
const THIRD_CATEGORY_GLYPH_TERM: u32 = 0xA1;
/// Lowest lead byte of a three-byte sequence.
const FOURTH_CATEGORY_GLYPH_TERM: u32 = 0xF6;
/// Lowest lead byte of a five-byte sequence.
const FIFTH_CATEGORY_GLYPH_TERM: u32 = 0xFC;

/// Glyphs below this value are encoded as a single byte equal to the glyph.
/// The same value is also emitted as the prefix byte of the next category.
const FIRST_CATEGORY_GLYPH_LIMIT: u32 = 0xA0;
/// Glyphs below this value are encoded as the prefix byte followed by the glyph byte.
const SECOND_CATEGORY_GLYPH_LIMIT: u32 = 0x100;
/// Glyphs below this value are encoded on two bytes (equals 0x4016): one radix
/// digit for each lead byte between the two-byte and three-byte terms.
const THIRD_CATEGORY_GLYPH_LIMIT: u32 = SECOND_CATEGORY_GLYPH_LIMIT
    + (FOURTH_CATEGORY_GLYPH_TERM - THIRD_CATEGORY_GLYPH_TERM) * UTF_1_MODULO;
/// Glyphs below this value are encoded on three bytes (equals 0x38E2E): two
/// radix digits for each lead byte between the three-byte and five-byte terms.
/// Everything at or above it takes five bytes.
const FOURTH_CATEGORY_GLYPH_LIMIT: u32 = THIRD_CATEGORY_GLYPH_LIMIT
    + (FIFTH_CATEGORY_GLYPH_TERM - FOURTH_CATEGORY_GLYPH_TERM) * UTF_1_MODULO * UTF_1_MODULO;

/// Lead bytes below this value form a complete one-byte sequence; a lead byte
/// equal to it is the two-byte prefix.
const FIRST_CATEGORY_SEQUENCE_LIMIT: u32 = FIRST_CATEGORY_GLYPH_LIMIT;
/// Lead bytes below this value (and above the prefix) start a two-byte sequence.
const SECOND_CATEGORY_SEQUENCE_LIMIT: u32 = FOURTH_CATEGORY_GLYPH_TERM;
/// Lead bytes below this value start a three-byte sequence; the rest start five bytes.
const THIRD_CATEGORY_SEQUENCE_LIMIT: u32 = FIFTH_CATEGORY_GLYPH_TERM;
/// Maps a value `z` to the byte used to carry it as a trailing byte.
///
/// Values `0x00..=0xBD` land on `0x21..=0x7E` and `0xA0..=0xFF`; the remaining
/// values land on `0x00..=0x20` and `0x7F..=0x9F`. The mapping is a bijection
/// on `u8`, so no arm can overflow or underflow.
fn t(z: u8) -> u8 {
    match z {
        0x00..=0x5D => z + 0x21,
        0x5E..=0xBD => z + 0x42,
        0xBE..=0xDE => z - 0xBE,
        0xDF..=0xFF => z - 0x60,
    }
}
fn utf_32_glyph_to_utf_1(glyph: u32) -> Vec<u8> {
let mut ret: Vec<u8> = Vec::new();
if glyph < FIRST_CATEGORY_GLYPH_LIMIT {
ret.push(glyph as u8);
} else if glyph < SECOND_CATEGORY_GLYPH_LIMIT {
ret.push(FIRST_CATEGORY_GLYPH_LIMIT as u8);
ret.push(glyph as u8);
} else if glyph < THIRD_CATEGORY_GLYPH_LIMIT {
let y = glyph - SECOND_CATEGORY_GLYPH_LIMIT;
ret.push((THIRD_CATEGORY_GLYPH_TERM + y / UTF_1_MODULO) as u8);
ret.push(t((y % UTF_1_MODULO) as u8));
} else if glyph < FOURTH_CATEGORY_GLYPH_LIMIT {
let y = glyph - THIRD_CATEGORY_GLYPH_LIMIT;
ret.push((FOURTH_CATEGORY_GLYPH_TERM + y / UTF_1_MODULO / UTF_1_MODULO) as u8);
ret.push(t((y / UTF_1_MODULO % UTF_1_MODULO) as u8));
ret.push(t((y % UTF_1_MODULO) as u8));
} else {
let y = glyph - FOURTH_CATEGORY_GLYPH_LIMIT;
ret.push((FIFTH_CATEGORY_GLYPH_TERM + y / UTF_1_MODULO / UTF_1_MODULO / UTF_1_MODULO / UTF_1_MODULO) as u8);
ret.push(t((y / UTF_1_MODULO / UTF_1_MODULO / UTF_1_MODULO % UTF_1_MODULO) as u8));
ret.push(t((y / UTF_1_MODULO / UTF_1_MODULO % UTF_1_MODULO) as u8));
ret.push(t((y / UTF_1_MODULO % UTF_1_MODULO) as u8));
ret.push(t((y % UTF_1_MODULO) as u8));
}
return ret;
}
/// Maps a trailing byte `z` back to the value it carries.
///
/// Bytes `0x21..=0x7E` and `0xA0..=0xFF` yield `0x00..=0xBD`; the remaining
/// bytes yield `0xBE..=0xFF`. The mapping is a bijection on `u8`, so no arm
/// can overflow or underflow.
fn u(z: u8) -> u8 {
    match z {
        0x00..=0x20 => z + 0xBE,
        0x21..=0x7E => z - 0x21,
        0x7F..=0x9F => z + 0x60,
        0xA0..=0xFF => z - 0x42,
    }
}
/// Same as `u`, with the result widened to `u32` for use in glyph arithmetic.
fn uu(z: u8) -> u32 {
    u32::from(u(z))
}
fn utf_1_glyph_to_utf_32(utf1_data: &Vec<u8>, start: usize) -> Result<(u32, usize), UnicodeEncodingError> {
let first_byte = utf1_data[start] as u8;
if first_byte < FIRST_CATEGORY_SEQUENCE_LIMIT as u8 {
return Ok((first_byte as u32, 1));
} else if first_byte == FIRST_CATEGORY_SEQUENCE_LIMIT as u8 {
if utf1_data.len() < start + 2 {
return Err(MissingEncodedBytes);
}
return Ok((utf1_data[start+1] as u32, 2));
} else if first_byte < SECOND_CATEGORY_SEQUENCE_LIMIT as u8 {
if utf1_data.len() < start + 2 {
return Err(MissingEncodedBytes);
}
let mut ret = ((first_byte as u32)- 1 - FIRST_CATEGORY_SEQUENCE_LIMIT) * UTF_1_MODULO;
ret += uu(utf1_data[start+1]);
ret += SECOND_CATEGORY_GLYPH_LIMIT;
return Ok((ret, 2));
} else if first_byte < THIRD_CATEGORY_SEQUENCE_LIMIT as u8 {
if utf1_data.len() < start + 3 {
return Err(MissingEncodedBytes);
}
let mut ret = ((first_byte as u32) - SECOND_CATEGORY_SEQUENCE_LIMIT) * UTF_1_MODULO * UTF_1_MODULO;
ret += uu(utf1_data[start+1]) * UTF_1_MODULO;
ret += uu(utf1_data[start+2]);
ret += THIRD_CATEGORY_GLYPH_LIMIT;
return Ok((ret, 3));
} else {
if utf1_data.len() < start + 5 {
return Err(MissingEncodedBytes);
}
let mut ret = ((first_byte as u32) - THIRD_CATEGORY_SEQUENCE_LIMIT) * UTF_1_MODULO * UTF_1_MODULO * UTF_1_MODULO * UTF_1_MODULO;
ret += uu(utf1_data[start+1]) * UTF_1_MODULO * UTF_1_MODULO * UTF_1_MODULO;
ret += uu(utf1_data[start+2]) * UTF_1_MODULO * UTF_1_MODULO;
ret += uu(utf1_data[start+3]) * UTF_1_MODULO;
ret += uu(utf1_data[start+4]);
ret += FOURTH_CATEGORY_GLYPH_LIMIT;
return Ok((ret, 5));
}
}
/// Internal invariant check: `index` must not have moved past the end of `bytes`.
///
/// # Panics
///
/// Panics when `index > bytes.len()`. That would mean a decoder consumed more
/// bytes than exist, which the decoder's own length checks should prevent.
fn check_index_ok(index: usize, bytes: &Vec<u8>) {
    assert!(
        index <= bytes.len(),
        "[UNICODE ENCODING ERROR] Some bytes are missing to encode of glyph. This is a library issue as a MissingEncodedBytes error should have been raised earlier in that case.\n"
    );
}
/// Checks the encoder against known UTF-1 byte sequences, including the
/// values on either side of every category boundary.
#[test]
fn test_utf_32_to_utf_1_raw() {
    // (UTF-32 glyph, expected UTF-1 bytes)
    let cases: Vec<(u32, Vec<u8>)> = vec![
        (0x000045, vec![0x45]),
        (0x00007F, vec![0x7F]),
        (0x000080, vec![0x80]),
        (0x00009F, vec![0x9F]),
        (0x0000A0, vec![0xA0, 0xA0]),
        (0x0000BF, vec![0xA0, 0xBF]),
        (0x0000C0, vec![0xA0, 0xC0]),
        (0x0000FF, vec![0xA0, 0xFF]),
        (0x000100, vec![0xA1, 0x21]),
        (0x00015D, vec![0xA1, 0x7E]),
        (0x0001BD, vec![0xA1, 0xFF]),
        (0x0001BE, vec![0xA2, 0x21]),
        (0x0007FF, vec![0xAA, 0x72]),
        (0x000800, vec![0xAA, 0x73]),
        (0x000FFF, vec![0xB5, 0x48]),
        (0x001000, vec![0xB5, 0x49]),
        (0x00D7FF, vec![0xF7, 0x2F, 0xC3]),
        (0x00FDEF, vec![0xF7, 0x62, 0xD9]),
        (0x038E2D, vec![0xFB, 0xFF, 0xFF]),
        (0x038E2E, vec![0xFC, 0x21, 0x21, 0x21, 0x21]),
        (0x10FFFF, vec![0xFC, 0x21, 0x39, 0x6E, 0x6C]),
    ];
    for (glyph, expected) in cases {
        assert_eq!(utf_32_glyph_to_utf_1(glyph), expected);
    }
}
/// Checks that `t` undoes `u` for every possible byte value.
#[test]
fn test_u_v() {
    for byte in 0..=0xFFu8 {
        assert_eq!(t(u(byte)), byte);
    }
}
/// Checks that encoding a glyph and decoding the result returns the same
/// glyph and consumes exactly the bytes that were produced.
#[test]
fn test_utf_32_to_utf_1_and_back() {
    let glyphs: &[u32] = &[
        0x000045, 0x00007F, 0x000080, 0x00009F,
        0x0000A0, 0x0000BF, 0x0000C0, 0x0000FF,
        0x000100, 0x00015D, 0x0001BD, 0x0001BE,
        0x0007FF, 0x000800, 0x000FFF, 0x001000,
        0x00D7FF, 0x00FDEF, 0x038E2D, 0x038E2E,
        0x10FFFF,
    ];
    for &glyph in glyphs.iter() {
        let encoded = utf_32_glyph_to_utf_1(glyph);
        let (decoded, consumed) = utf_1_glyph_to_utf_32(&encoded, 0).unwrap();
        assert_eq!(glyph, decoded);
        assert_eq!(consumed, encoded.len());
    }
}