use std::{
error::{self, Error},
fmt::Display,
sync::Arc,
};
use bytecount::num_chars;
pub const BYTES_PER_CHAR: usize = 4;
const VALID_ONE_BYTE_UTF8_FIRST_BYTE_MAX_VALUE: u8 = 0b01111111_u8;
const VALID_TWO_BYTE_UTF8_FIRST_BYTE_RANGE: (u8, u8) = (0b11000000_u8, 0b11011111_u8);
const VALID_TWO_BYTE_UTF8_SECOND_BYTE_RANGE: (u8, u8) = (0b10000000_u8, 0b10111111_u8);
const VALID_THREE_BYTE_UTF8_FIRST_BYTE_RANGE: (u8, u8) = (0b11100000_u8, 0b11110111_u8);
const VALID_THREE_BYTE_UTF8_SECOND_BYTE_RANGE: (u8, u8) = (0b10000000_u8, 0b10111111_u8);
const VALID_THREE_BYTE_UTF8_THIRD_BYTE_RANGE: (u8, u8) = (0b10000000_u8, 0b10111111_u8);
const VALID_FOUR_BYTE_UTF8_FIRST_BYTE_RANGE: (u8, u8) = (0b11110000_u8, 0b11110111_u8);
const VALID_FOUR_BYTE_UTF8_SECOND_BYTE_RANGE: (u8, u8) = (0b10000000_u8, 0b10111111_u8);
const VALID_FOUR_BYTE_UTF8_THIRD_BYTE_RANGE: (u8, u8) = (0b10000000_u8, 0b10111111_u8);
const VALID_FOUR_BYTE_UTF8_FOURTH_BYTE_RANGE: (u8, u8) = (0b10000000_u8, 0b10111111_u8);
const SPACE_BYTE: &[u8] = &[0, 0, 0, 32];
type PreparedCustomBytes = (Option<u8>, Option<u8>, Option<u8>, Option<u8>);
pub type CustomStringBytesVec = Vec<u8>;
pub type CustomStringBytesSlice = [u8];
fn is_in_range<T: PartialEq + PartialOrd>(value: T, range: (T, T)) -> bool {
value >= range.0 && value <= range.1
}
#[derive(Debug, Clone)]
enum InvalidCustomStringErrorType {
InvalidLength(usize),
InvalidFormat,
}
#[derive(Debug, Clone)]
struct InvalidCustomStringByteError {
error_type: InvalidCustomStringErrorType,
invalid_sequence: Option<Vec<u8>>,
}
impl Display for InvalidCustomStringByteError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self.error_type {
InvalidCustomStringErrorType::InvalidFormat => {
write!(
f,
"Invalid custom bytes: {:?}",
self.invalid_sequence.as_ref().unwrap()
)
}
InvalidCustomStringErrorType::InvalidLength(length) => {
write!(f, "Invalid bytes length: {}", length)
}
}
}
}
impl InvalidCustomStringByteError {
pub fn new_invalid_length(invalid_data: &[u8]) -> Self {
Self {
error_type: InvalidCustomStringErrorType::InvalidLength(invalid_data.len()),
invalid_sequence: None,
}
}
pub fn new_invalid_format(invalid_data: &[u8]) -> Self {
Self {
error_type: InvalidCustomStringErrorType::InvalidFormat,
invalid_sequence: Some(invalid_data.into()),
}
}
}
impl Error for InvalidCustomStringByteError {}
pub trait FixedCharsLengthByteSlice {
fn slice_by_char_indice(&self, start: usize, end: usize) -> Self;
fn chars_len(&self) -> usize;
fn is_valid_custom_str_bytes(&self) -> bool;
}
impl FixedCharsLengthByteSlice for &CustomStringBytesSlice {
fn slice_by_char_indice(&self, start: usize, end: usize) -> Self {
self.get((start * BYTES_PER_CHAR)..(end * BYTES_PER_CHAR))
.unwrap()
}
fn chars_len(&self) -> usize {
self.len() / BYTES_PER_CHAR
}
fn is_valid_custom_str_bytes(&self) -> bool {
if self.len() % 4 != 0 {
return false;
}
for index in 0..self.chars_len() {
let current_slice = self.slice_by_char_indice(index, index + 1);
match current_slice {
[0, 0, 0, one_byte_char]
if one_byte_char <= &VALID_ONE_BYTE_UTF8_FIRST_BYTE_MAX_VALUE => {}
[0, 0, first_byte, second_byte]
if is_in_range(*first_byte, VALID_TWO_BYTE_UTF8_FIRST_BYTE_RANGE)
&& is_in_range(*second_byte, VALID_TWO_BYTE_UTF8_SECOND_BYTE_RANGE) => {}
[0, first_byte, second_byte, third_byte]
if is_in_range(*first_byte, VALID_THREE_BYTE_UTF8_FIRST_BYTE_RANGE)
&& is_in_range(*second_byte, VALID_THREE_BYTE_UTF8_SECOND_BYTE_RANGE)
&& is_in_range(*third_byte, VALID_THREE_BYTE_UTF8_THIRD_BYTE_RANGE) => {}
[first_byte, second_byte, third_byte, fourth_byte]
if is_in_range(*first_byte, VALID_FOUR_BYTE_UTF8_FIRST_BYTE_RANGE)
&& is_in_range(*second_byte, VALID_FOUR_BYTE_UTF8_SECOND_BYTE_RANGE)
&& is_in_range(*third_byte, VALID_FOUR_BYTE_UTF8_THIRD_BYTE_RANGE)
&& is_in_range(*fourth_byte, VALID_FOUR_BYTE_UTF8_FOURTH_BYTE_RANGE) => {}
_ => {
return false;
}
}
}
true
}
}
pub fn rfind_space_char_index(custom_text: &CustomStringBytesSlice) -> Option<usize> {
assert_eq!(custom_text.len() % 4, 0);
for index in (0..(custom_text.len() / BYTES_PER_CHAR)).rev() {
if let SPACE_BYTE = &custom_text[(index) * BYTES_PER_CHAR..(index + 1) * BYTES_PER_CHAR] {
return Some(index);
}
}
None
}
fn is_whitespace(custom_bytes: &CustomStringBytesSlice) -> bool {
matches!(
custom_bytes,
[0, 0, 0, 9] | [0, 0, 0, 10] | [0, 0, 0, 11] | [0, 0, 0, 12] | [0, 0, 0, 13] | [0, 0, 0, 32] | [0, 0, 194, 133] | [0, 0, 194, 160] | [0, 0xe1, 0x9a, 0x80] | [0, 0xe1, 0xa0, 0x8e] | [0, 0xe2, 0x80, 0x80] | [0, 0xe2, 0x80, 0x81] | [0, 0xe2, 0x80, 0x82] | [0, 0xe2, 0x80, 0x83] | [0, 0xe2, 0x80, 0x84] | [0, 0xe2, 0x80, 0x85] | [0, 0xe2, 0x80, 0x86] | [0, 0xe2, 0x80, 0x87] | [0, 0xe2, 0x80, 0x88] | [0, 0xe2, 0x80, 0x89] | [0, 0xe2, 0x80, 0x8a] | [0, 0xe2, 0x80, 0x8b] | [0, 0xe2, 0x80, 0x8c] | [0, 0xe2, 0x80, 0x8d] | [0, 0xe2, 0x80, 0x8e] | [0, 0xe2, 0x80, 0x8f] | [0, 0xe2, 0x80, 0xa8] | [0, 0xe2, 0x80, 0xa9] | [0, 0xe2, 0x80, 0xaf] | [0, 0xe2, 0x81, 0x9f] | [0, 0xe2, 0x81, 0xa0] | [0, 0xe3, 0x80, 0x80] | [0, 0xef, 0xbb, 0xbf] )
}
fn to_four_bytes(input: &str) -> CustomStringBytesVec {
let output_size = num_chars(input.as_bytes());
let mut output_vec: Vec<u8> = Vec::with_capacity(output_size * 2);
for character in input.chars() {
let mut bytes_buffer: [u8; 4] = [0; 4];
character.encode_utf8(&mut bytes_buffer);
let arranged_buffer = match bytes_buffer {
[a, 0, 0, 0] => [0, 0, 0, a],
[a, b, 0, 0] => [0, 0, a, b],
[a, b, c, 0] => [0, a, b, c],
_ => bytes_buffer,
};
output_vec.extend_from_slice(&arranged_buffer);
}
output_vec
}
fn trim_to_std_utf8(
input: &CustomStringBytesSlice,
) -> Result<PreparedCustomBytes, Box<dyn error::Error>> {
if input.len() % 4 != 0 {
Err(InvalidCustomStringByteError::new_invalid_length(input).into())
} else {
match input {
[0, 0, 0, one_byte_char]
if one_byte_char <= &VALID_ONE_BYTE_UTF8_FIRST_BYTE_MAX_VALUE =>
{
Ok((None, None, None, Some(*one_byte_char)))
}
[0, 0, first_byte, second_byte]
if is_in_range(*first_byte, VALID_TWO_BYTE_UTF8_FIRST_BYTE_RANGE)
&& is_in_range(*second_byte, VALID_TWO_BYTE_UTF8_SECOND_BYTE_RANGE) =>
{
Ok((None, None, Some(*first_byte), Some(*second_byte)))
}
[0, first_byte, second_byte, third_byte]
if is_in_range(*first_byte, VALID_THREE_BYTE_UTF8_FIRST_BYTE_RANGE)
&& is_in_range(*second_byte, VALID_THREE_BYTE_UTF8_SECOND_BYTE_RANGE)
&& is_in_range(*third_byte, VALID_THREE_BYTE_UTF8_THIRD_BYTE_RANGE) =>
{
Ok((
None,
Some(*first_byte),
Some(*second_byte),
Some(*third_byte),
))
}
[first_byte, second_byte, third_byte, fourth_byte]
if is_in_range(*first_byte, VALID_FOUR_BYTE_UTF8_FIRST_BYTE_RANGE)
&& is_in_range(*second_byte, VALID_FOUR_BYTE_UTF8_SECOND_BYTE_RANGE)
&& is_in_range(*third_byte, VALID_FOUR_BYTE_UTF8_THIRD_BYTE_RANGE)
&& is_in_range(*fourth_byte, VALID_FOUR_BYTE_UTF8_FOURTH_BYTE_RANGE) =>
{
Ok((
Some(*first_byte),
Some(*second_byte),
Some(*third_byte),
Some(*fourth_byte),
))
}
_ => Err(InvalidCustomStringByteError::new_invalid_format(input).into()),
}
}
}
pub trait FixedLengthCustomString<T: Sized + FixedLengthCustomString<T>> {
fn substring(&self, start: usize, end: usize) -> T;
fn get_original_string(&self) -> &[u8];
}
#[derive(Clone, Debug)]
pub struct CustomString {
content: Arc<CustomStringBytesVec>,
chars_content: Arc<Vec<char>>,
start: usize,
end: usize,
}
impl CustomString {
pub fn new(base_string: &str) -> Self {
let content = to_four_bytes(base_string);
let chars_content = Arc::new(base_string.chars().collect::<Vec<char>>());
let length = content.len() / BYTES_PER_CHAR;
Self {
content: Arc::new(content),
start: 0,
end: length,
chars_content,
}
}
pub fn raw_content(&self) -> &[u8] {
self.content
.as_slice()
.slice_by_char_indice(self.start, self.end)
}
pub fn is_full_string(&self) -> bool {
self.start == 0 && self.end == self.content.len() / BYTES_PER_CHAR
}
pub fn chars_len(&self) -> usize {
self.end - self.start
}
pub fn full_string_bytes_len(&self) -> usize {
self.content.len()
}
pub fn is_empty(&self) -> bool {
self.chars_len() == 0
}
pub fn trim(&self) -> Self {
let mut new_content: &[u8] = &self.content;
while !new_content.is_empty() && is_whitespace(&new_content[0..BYTES_PER_CHAR]) {
new_content = &new_content[BYTES_PER_CHAR..];
}
while !new_content.is_empty()
&& is_whitespace(&new_content[(new_content.len() - BYTES_PER_CHAR)..])
{
new_content = &new_content[..(new_content.len() - BYTES_PER_CHAR)];
}
let length = new_content.len() / BYTES_PER_CHAR;
Self {
content: Arc::new(Vec::from(new_content)),
chars_content: self.chars_content.clone(),
start: 0,
end: length,
}
}
pub fn get_chars_content(&self) -> &[char] {
self.chars_content
.as_slice()
.get(self.start..self.end)
.unwrap()
}
pub fn get_char_at(&self, index: usize) -> char {
*self
.chars_content
.as_slice()
.get(index + self.start)
.unwrap()
}
pub fn convert_raw_bytes_to_std_string(input: &[u8]) -> String {
let mut output_content: Vec<u8> = Vec::with_capacity(input.len() / 100);
for index in 0..input.chars_len() {
let extracted_bytes =
trim_to_std_utf8(input.slice_by_char_indice(index, index + 1)).unwrap();
match extracted_bytes {
(None, None, None, Some(first_byte)) => {
output_content.push(first_byte);
}
(None, None, Some(first_byte), Some(second_byte)) => {
output_content.push(first_byte);
output_content.push(second_byte);
}
(None, Some(first_byte), Some(second_byte), Some(third_byte)) => {
output_content.push(first_byte);
output_content.push(second_byte);
output_content.push(third_byte);
}
(Some(first_byte), Some(second_byte), Some(third_byte), Some(fourth_byte)) => {
output_content.push(first_byte);
output_content.push(second_byte);
output_content.push(third_byte);
output_content.push(fourth_byte);
}
_ => panic!("error"),
}
}
let output =
unsafe { String::from(std::str::from_utf8_unchecked(output_content.as_slice())) };
output
}
pub fn convert_raw_bytes_to_utf8_bytes(input: &[u8]) -> Vec<u8> {
let mut output_content: Vec<u8> = Vec::with_capacity(input.len() / 100);
for index in 0..input.chars_len() {
let extracted_bytes =
trim_to_std_utf8(input.slice_by_char_indice(index, index + 1)).unwrap();
match extracted_bytes {
(None, None, None, Some(first_byte)) => {
output_content.push(first_byte);
}
(None, None, Some(first_byte), Some(second_byte)) => {
output_content.push(first_byte);
output_content.push(second_byte);
}
(Some(first_byte), Some(second_byte), Some(third_byte), None) => {
output_content.push(first_byte);
output_content.push(second_byte);
output_content.push(third_byte);
}
(Some(first_byte), Some(second_byte), Some(third_byte), Some(fourth_byte)) => {
output_content.push(first_byte);
output_content.push(second_byte);
output_content.push(third_byte);
output_content.push(fourth_byte);
}
_ => panic!("error"),
}
}
output_content.shrink_to_fit();
output_content
}
pub fn substring(&self, start: usize, end: usize) -> Self {
let new_start = self.start + start;
let new_end = self.start + end;
let full_content = self.content.clone();
Self {
content: full_content,
chars_content: self.chars_content.clone(),
start: new_start,
end: new_end,
}
}
pub fn substring_as_bytes(&self, char_start: usize, char_end: usize) -> &[u8] {
self.content
.as_slice()
.slice_by_char_indice(char_start, char_end)
}
}
#[test]
fn check_slice() {
let ex: &[u8] = &[255, 255, 255, 255, 0, 255, 111, 0];
assert_eq!(ex.slice_by_char_indice(0, 1), &[255, 255, 255, 255]);
assert_eq!(ex.slice_by_char_indice(1, 2), &[0, 255, 111, 0]);
assert!("".is_empty());
}
#[test]
fn test_bytes() {
let text = [
"ไต้หวัน (แป่ะเอ๋ยี้: Tâi-oân; ไต่อวัน) หรือ ไถวาน ",
"(อักษรโรมัน: Taiwan; จีนตัวย่อ: 台湾; จีนตัวเต็ม: 臺灣/台灣; พินอิน: ",
"Táiwān; ไถวาน) หรือชื่อทางการว่า สาธารณรัฐจีน (จีนตัวย่อ: 中华民国; ",
"จีนตัวเต็ม: 中華民國; พินอิน: Zhōnghuá ",
"Mínguó) เป็นรัฐในทวีปเอเชียตะวันออก[7][8][9] ปัจจุบันประกอบด้วย",
"เกาะใหญ่ 5 แห่ง คือ จินเหมิน (金門), ไต้หวัน, เผิงหู (澎湖), หมาจู่ ",
"(馬祖), และอูชิว (烏坵) กับทั้งเกาะเล็กเกาะน้อยอีกจำนวนหนึ่ง ",
"ท้องที่ดังกล่าวเรียกรวมกันว่า \"พื้นที่ไต้หวัน\" (臺灣地區)\n",
]
.join("");
let custom_string = CustomString::new(&text);
assert_eq!(custom_string.full_string_bytes_len() % 4, 0);
}
#[test]
fn test_trim() {
assert!(CustomString::new(" ").trim().is_empty());
assert!(CustomString::new(" ").trim().is_empty());
assert!(CustomString::new("\n").trim().is_empty());
assert!(CustomString::new(" \t\n ").trim().is_empty());
assert_eq!(CustomString::new(" abc ").trim().chars_len(), 3);
assert_eq!(CustomString::new(" aก ").trim().full_string_bytes_len(), 8); }