use crate::{Error, Result};
use byteorder::{ByteOrder, LittleEndian};
use memmap2::Mmap;
use std::sync::Arc;
/// Character class identifier, as carried in the `default_type` bits of a `CharInfo`.
///
/// Each variant's discriminant is the raw `u8` id that `From<u8>` maps back to it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[repr(u8)]
pub enum CharCategory {
    /// Fallback class; also what every unrecognised raw id converts to.
    #[default]
    Default = 0,
    Space = 1,
    Kanji = 2,
    Symbol = 3,
    Numeric = 4,
    Alpha = 5,
    Hiragana = 6,
    Katakana = 7,
    Kanjinumeric = 8,
    Greek = 9,
    Cyrillic = 10,
}

impl From<u8> for CharCategory {
    /// Converts a raw id into its category.
    ///
    /// Ids `0..=10` map to the variant with that discriminant; anything else yields
    /// [`CharCategory::Default`].
    fn from(value: u8) -> Self {
        // Indexed by discriminant: slot `i` must hold the variant whose value is `i`.
        const BY_ID: [CharCategory; 11] = [
            CharCategory::Default,
            CharCategory::Space,
            CharCategory::Kanji,
            CharCategory::Symbol,
            CharCategory::Numeric,
            CharCategory::Alpha,
            CharCategory::Hiragana,
            CharCategory::Katakana,
            CharCategory::Kanjinumeric,
            CharCategory::Greek,
            CharCategory::Cyrillic,
        ];
        BY_ID
            .get(usize::from(value))
            .copied()
            .unwrap_or(Self::Default)
    }
}
/// One 32-bit packed character-property record.
///
/// Bit layout of `packed`, least significant bit first:
///
/// | bits     | accessor       |
/// |----------|----------------|
/// | `0..18`  | `type_mask`    |
/// | `18..26` | `default_type` |
/// | `26..30` | `length`       |
/// | `30`     | `group`        |
/// | `31`     | `invoke`       |
///
/// NOTE(review): the field names look like MeCab's `CharInfo` bitfield; confirm against the
/// dictionary format before relying on the meaning of `length`, `group` and `invoke`.
#[derive(Debug, Clone, Copy, Default)]
#[repr(C)]
pub struct CharInfo {
    packed: u32,
}

impl CharInfo {
    /// Size in bytes of one packed record.
    pub const SIZE: usize = 4;

    /// Extracts the `width`-bit unsigned field that starts at bit `shift` of `packed`.
    #[inline]
    fn field(&self, shift: u32, width: u32) -> u32 {
        let mask = (1u32 << width) - 1;
        (self.packed >> shift) & mask
    }

    /// Returns the low 18 bits: a bit set with one bit per character class.
    #[inline]
    pub fn type_mask(&self) -> u32 {
        self.field(0, 18)
    }

    /// Returns the 8-bit raw id of the record's default character class.
    #[inline]
    pub fn default_type(&self) -> u8 {
        self.field(18, 8) as u8
    }

    /// Returns the 4-bit `length` field.
    #[inline]
    pub fn length(&self) -> u8 {
        self.field(26, 4) as u8
    }

    /// Returns `true` when the `group` bit (bit 30) is set.
    #[inline]
    pub fn group(&self) -> bool {
        self.field(30, 1) != 0
    }

    /// Returns `true` when the `invoke` bit (bit 31) is set.
    #[inline]
    pub fn invoke(&self) -> bool {
        self.field(31, 1) != 0
    }

    /// Returns `default_type` converted to a [`CharCategory`].
    #[inline]
    pub fn category(&self) -> CharCategory {
        let raw_id = self.default_type();
        CharCategory::from(raw_id)
    }

    /// Returns `true` when this record and `other` share at least one bit in `type_mask`.
    #[inline]
    pub fn is_kind_of(&self, other: CharInfo) -> bool {
        let shared = self.type_mask() & other.type_mask();
        shared != 0
    }
}
/// Character definition table backed by a memory-mapped file.
///
/// Built by `CharDef::from_mmap`; per-character lookups read `CharInfo` records through
/// `map_ptr`, which points into the mapping held by `_mmap`.
pub struct CharDef {
// Never read directly: holding the `Arc` keeps the mapping, and therefore `map_ptr`, alive.
_mmap: Arc<Mmap>,
// Category names decoded from the file header, in file order.
categories: Vec<String>,
// Start of the `CharInfo` table inside the mapping owned by `_mmap`.
map_ptr: *const CharInfo,
}
// SAFETY: the raw `map_ptr` field is what suppresses the auto traits. It points into the
// mapping that `_mmap` keeps alive, and the code in this file only ever reads through it, so
// moving or sharing a `CharDef` across threads introduces no data race of its own.
// NOTE(review): this relies on `Arc<Mmap>` being `Send + Sync` and on the mapped file not
// being modified while it is mapped; confirm both hold for the callers.
unsafe impl Send for CharDef {}
unsafe impl Sync for CharDef {}
impl std::fmt::Debug for CharDef {
    /// Formats only the decoded category names; the mapping and the raw table pointer are
    /// left out of the output.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("CharDef");
        out.field("categories", &self.categories);
        out.finish()
    }
}
impl CharDef {
pub const TABLE_SIZE: usize = 0xFFFF;
pub fn from_mmap(mmap: Arc<Mmap>) -> Result<Self> {
let data = &mmap[..];
if data.len() < 4 {
return Err(Error::CharDefError(
"Character definition file too small".to_string(),
));
}
let csize = LittleEndian::read_u32(&data[0..4]) as usize;
let expected_size = 4 + (csize * 32) + (Self::TABLE_SIZE * CharInfo::SIZE);
if data.len() != expected_size {
return Err(Error::CharDefError(format!(
"Character definition file size mismatch: expected {}, got {}",
expected_size,
data.len()
)));
}
let mut categories = Vec::with_capacity(csize);
let mut offset = 4;
for _ in 0..csize {
let name_bytes = &data[offset..offset + 32];
let name_end = name_bytes
.iter()
.position(|&b| b == 0)
.unwrap_or(name_bytes.len());
let name = String::from_utf8_lossy(&name_bytes[..name_end]).to_string();
categories.push(name);
offset += 32;
}
let map_ptr = data[offset..].as_ptr() as *const CharInfo;
Ok(Self {
_mmap: mmap,
categories,
map_ptr,
})
}
#[inline]
pub fn get_char_info(&self, c: char) -> CharInfo {
let code = c as u32;
if code < Self::TABLE_SIZE as u32 {
unsafe { *self.map_ptr.add(code as usize) }
} else {
CharInfo::default()
}
}
pub fn get_char_info_from_bytes(&self, bytes: &[u8]) -> (CharInfo, usize) {
if bytes.is_empty() {
return (CharInfo::default(), 0);
}
let s = match std::str::from_utf8(bytes) {
Ok(s) => s,
Err(_) => return (CharInfo::default(), 1),
};
if let Some(c) = s.chars().next() {
let len = c.len_utf8();
(self.get_char_info(c), len)
} else {
(CharInfo::default(), 0)
}
}
pub fn category_name(&self, id: usize) -> Option<&str> {
self.categories.get(id).map(String::as_str)
}
pub fn category_count(&self) -> usize {
self.categories.len()
}
pub fn category_id(&self, name: &str) -> Option<usize> {
self.categories.iter().position(|n| n == name)
}
pub fn should_group(&self, category: CharCategory) -> bool {
let sample_char = match category {
CharCategory::Default => ' ',
CharCategory::Space => ' ',
CharCategory::Kanji => '漢',
CharCategory::Symbol => '!',
CharCategory::Numeric => '0',
CharCategory::Alpha => 'A',
CharCategory::Hiragana => 'あ',
CharCategory::Katakana => 'ア',
CharCategory::Kanjinumeric => '一',
CharCategory::Greek => 'Α',
CharCategory::Cyrillic => 'А',
};
self.get_char_info(sample_char).group()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A `CharInfo` must occupy exactly the four bytes that `CharInfo::SIZE` advertises.
    #[test]
    fn test_charinfo_size() {
        let in_memory = std::mem::size_of::<CharInfo>();
        assert_eq!(in_memory, 4);
        assert_eq!(CharInfo::SIZE, 4);
    }

    /// The default record is all zero bits, so every field reads as zero / unset.
    #[test]
    fn test_charinfo_default() {
        let zeroed = CharInfo::default();
        assert_eq!(zeroed.type_mask(), 0);
        assert_eq!(zeroed.default_type(), 0);
        assert_eq!(zeroed.length(), 0);
        assert!(!(zeroed.group() || zeroed.invoke()));
    }

    /// Known raw ids map to their variants; unknown ids fall back to `Default`.
    #[test]
    fn test_char_category() {
        let expectations = [
            (0u8, CharCategory::Default),
            (1, CharCategory::Space),
            (6, CharCategory::Hiragana),
            (7, CharCategory::Katakana),
            (255, CharCategory::Default),
        ];
        for (raw, expected) in expectations {
            assert_eq!(CharCategory::from(raw), expected);
        }
    }
}