/// A unit that a string can be split into and reassembled from.
///
/// Implementors provide conversions between `&str` and sequences of the unit,
/// plus a mapping of a single unit to a `usize` offset.
pub trait CharUnit:
    Copy
    + Clone
    + Default
    + Eq
    + PartialEq
    + Ord
    + std::hash::Hash
    + std::fmt::Debug
    + Send
    + Sync
    + 'static
{
    /// Splits `s` into an owned vector of units.
    fn from_str(s: &str) -> Vec<Self>;

    /// Rebuilds a `String` from a slice of units.
    fn to_string(units: &[Self]) -> String;

    /// Returns a boxed iterator over the units of `s`; the iterator may borrow `s`.
    fn iter_str(s: &str) -> Box<dyn Iterator<Item = Self> + '_>;

    /// Hashes this unit with a default-constructed `FxHasher` and returns the
    /// resulting 64-bit digest.
    fn hash_to_u64(&self) -> u64 {
        use rustc_hash::FxHasher;
        use std::hash::{Hash, Hasher};

        let mut state = FxHasher::default();
        Hash::hash(self, &mut state);
        state.finish()
    }

    /// Converts this unit into a `usize` offset.
    // NOTE(review): "dat" presumably refers to a double-array trie — confirm with callers.
    fn to_dat_offset(&self) -> usize;
}
/// Byte-granular units: a string is treated as its raw UTF-8 bytes.
impl CharUnit for u8 {
    /// Copies the UTF-8 bytes of `s` into a new vector.
    #[inline]
    fn from_str(s: &str) -> Vec<Self> {
        Vec::from(s.as_bytes())
    }

    /// Decodes `units` as UTF-8; invalid sequences become U+FFFD (lossy decoding).
    #[inline]
    fn to_string(units: &[Self]) -> String {
        String::from(String::from_utf8_lossy(units))
    }

    /// Iterates over the bytes of `s` without copying the string.
    #[inline]
    fn iter_str(s: &str) -> Box<dyn Iterator<Item = Self> + '_> {
        let raw = s.as_bytes();
        Box::new(raw.iter().copied())
    }

    /// The byte value itself, widened to `usize`.
    #[inline]
    fn to_dat_offset(&self) -> usize {
        usize::from(*self)
    }
}
/// Scalar-value units: a string is treated as its sequence of `char`s.
impl CharUnit for char {
    /// Collects every `char` of `s` into a vector.
    #[inline]
    fn from_str(s: &str) -> Vec<Self> {
        let mut scalars = Vec::new();
        scalars.extend(s.chars());
        scalars
    }

    /// Concatenates `units` into a `String`.
    #[inline]
    fn to_string(units: &[Self]) -> String {
        let mut text = String::new();
        text.extend(units.iter());
        text
    }

    /// Iterates over the `char`s of `s` without allocating a vector.
    #[inline]
    fn iter_str(s: &str) -> Box<dyn Iterator<Item = Self> + '_> {
        let scalars = s.chars();
        Box::new(scalars)
    }

    /// The scalar value (code point) of the character, widened to `usize`.
    #[inline]
    fn to_dat_offset(&self) -> usize {
        u32::from(*self) as usize
    }
}
/// Packs `chunk` into a little-endian `u64`, zero-filling the unused high bytes.
///
/// Callers pass slices produced by `chunks(8)`, so `chunk.len() <= 8` always
/// holds; a longer slice would panic in the slice index below.
#[inline]
fn pack_le_chunk(chunk: &[u8]) -> u64 {
    let mut padded = [0u8; 8];
    padded[..chunk.len()].copy_from_slice(chunk);
    u64::from_le_bytes(padded)
}

/// Packed units: every unit holds up to eight consecutive UTF-8 bytes of the
/// string, stored little-endian, with the final unit zero-padded.
impl CharUnit for u64 {
    /// Splits the UTF-8 bytes of `s` into 8-byte groups and packs each group
    /// into one `u64`. An empty string yields an empty vector.
    #[inline]
    fn from_str(s: &str) -> Vec<Self> {
        s.as_bytes().chunks(8).map(pack_le_chunk).collect()
    }

    /// Unpacks `units` back into bytes and decodes them as UTF-8 (lossily).
    ///
    /// All trailing zero bytes are removed before decoding, because they are
    /// indistinguishable from the padding added by `from_str`. Consequently a
    /// string that genuinely ends in NUL bytes does not round-trip.
    #[inline]
    fn to_string(units: &[Self]) -> String {
        // Each unit contributes exactly eight bytes, so size the buffer once.
        let mut bytes: Vec<u8> = Vec::with_capacity(units.len() * 8);
        for unit in units {
            bytes.extend_from_slice(&unit.to_le_bytes());
        }
        // One past the last non-zero byte; 0 when every byte is zero or there are none.
        let end = bytes.iter().rposition(|&b| b != 0).map_or(0, |last| last + 1);
        String::from_utf8_lossy(&bytes[..end]).into_owned()
    }

    /// Lazily yields the packed units of `s`, in the same order and with the
    /// same values as `from_str`, without building an intermediate vector.
    #[inline]
    fn iter_str(s: &str) -> Box<dyn Iterator<Item = Self> + '_> {
        Box::new(s.as_bytes().chunks(8).map(pack_le_chunk))
    }

    /// Uses only the low 32 bits of the unit (the first four packed bytes) as
    /// the offset; the upper 32 bits are deliberately discarded by the cast.
    #[inline]
    fn to_dat_offset(&self) -> usize {
        (*self as u32) as usize
    }
}
/// Round-trip and unit-count checks for the `u8`, `char` and `u64` implementations.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- u8: one unit per UTF-8 byte ----

    #[test]
    fn test_u8_ascii() {
        let input = "hello";
        let bytes = <u8 as CharUnit>::from_str(input);
        assert_eq!(bytes, vec![b'h', b'e', b'l', b'l', b'o']);
        assert_eq!(<u8 as CharUnit>::to_string(&bytes), input);
    }

    #[test]
    fn test_u8_unicode() {
        let input = "café";
        let bytes = <u8 as CharUnit>::from_str(input);
        // 'é' occupies two bytes in UTF-8, hence five bytes for four characters.
        assert_eq!(bytes.len(), 5);
        assert_eq!(<u8 as CharUnit>::to_string(&bytes), input);
    }

    // ---- char: one unit per Unicode scalar value ----

    #[test]
    fn test_char_ascii() {
        let input = "hello";
        let scalars = <char as CharUnit>::from_str(input);
        assert_eq!(scalars, vec!['h', 'e', 'l', 'l', 'o']);
        assert_eq!(<char as CharUnit>::to_string(&scalars), input);
    }

    #[test]
    fn test_char_unicode() {
        let input = "café";
        let scalars = <char as CharUnit>::from_str(input);
        assert_eq!(scalars, vec!['c', 'a', 'f', 'é']);
        assert_eq!(scalars.len(), 4);
        assert_eq!(<char as CharUnit>::to_string(&scalars), input);
    }

    #[test]
    fn test_char_emoji() {
        let input = "hello 🎉 world";
        let scalars = <char as CharUnit>::from_str(input);
        assert_eq!(scalars.len(), 13);
        assert!(scalars.contains(&'🎉'));
        assert_eq!(<char as CharUnit>::to_string(&scalars), input);
    }

    #[test]
    fn test_char_cjk() {
        let input = "中文";
        let scalars = <char as CharUnit>::from_str(input);
        assert_eq!(scalars, vec!['中', '文']);
        assert_eq!(scalars.len(), 2);
        assert_eq!(<char as CharUnit>::to_string(&scalars), input);
    }

    // ---- iter_str for u8 and char ----

    #[test]
    fn test_iter_u8() {
        let input = "hi";
        let seen: Vec<u8> = <u8 as CharUnit>::iter_str(input).collect();
        assert_eq!(seen, vec![b'h', b'i']);
    }

    #[test]
    fn test_iter_char() {
        let input = "café";
        let seen: Vec<char> = <char as CharUnit>::iter_str(input).collect();
        assert_eq!(seen, vec!['c', 'a', 'f', 'é']);
    }

    // ---- u64: up to eight UTF-8 bytes packed per unit ----

    #[test]
    fn test_u64_short_string() {
        let input = "hello";
        let packed = <u64 as CharUnit>::from_str(input);
        assert_eq!(packed.len(), 1);
        assert_eq!(<u64 as CharUnit>::to_string(&packed), input);
    }

    #[test]
    fn test_u64_exact_8_bytes() {
        let input = "12345678";
        let packed = <u64 as CharUnit>::from_str(input);
        assert_eq!(packed.len(), 1);
        assert_eq!(<u64 as CharUnit>::to_string(&packed), input);
    }

    #[test]
    fn test_u64_multi_unit() {
        // Twelve bytes spill over into a second unit.
        let input = "hello world!";
        let packed = <u64 as CharUnit>::from_str(input);
        assert_eq!(packed.len(), 2);
        assert_eq!(<u64 as CharUnit>::to_string(&packed), input);
    }

    #[test]
    fn test_u64_empty() {
        let input = "";
        let packed = <u64 as CharUnit>::from_str(input);
        assert!(packed.is_empty());
        assert_eq!(<u64 as CharUnit>::to_string(&packed), input);
    }

    #[test]
    fn test_u64_unicode() {
        // Five UTF-8 bytes still fit in a single unit.
        let input = "café";
        let packed = <u64 as CharUnit>::from_str(input);
        assert_eq!(packed.len(), 1);
        assert_eq!(<u64 as CharUnit>::to_string(&packed), input);
    }

    #[test]
    fn test_iter_u64() {
        let input = "hello world!";
        let seen: Vec<u64> = <u64 as CharUnit>::iter_str(input).collect();
        assert_eq!(seen.len(), 2);
        assert_eq!(<u64 as CharUnit>::to_string(&seen), input);
    }
}