use crate::{Error, Result};
use byteorder::{ByteOrder, LittleEndian};
use memmap2::Mmap;
use std::sync::Arc;
use super::DictionaryEntry;
use super::double_array_trie::{DartsResult, DoubleArrayTrie};
/// Magic constant XOR-ed with the total file size and stored in the first
/// header word; recovering the size via XOR doubles as a format check.
pub const DICTIONARY_MAGIC_ID: u32 = 0xef71_8f77;
/// Binary dictionary format version this reader accepts.
pub const DIC_VERSION: u32 = 102;
/// Fixed size in bytes of the on-disk header preceding the trie section.
pub const HEADER_SIZE: usize = 72;
/// Maximum number of trie matches collected per common-prefix search
/// (size of the stack-allocated result buffer).
const MAX_RESULTS: usize = 512;
/// One fixed-size lexicon record from the token section of the dictionary,
/// laid out exactly as on disk (`#[repr(C)]`, 16 bytes).
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct Token {
    /// Left context id (copied into `DictionaryEntry::left_id`).
    pub left_id: u16,
    /// Right context id (copied into `DictionaryEntry::right_id`).
    pub right_id: u16,
    /// Part-of-speech id.
    pub pos_id: u16,
    /// Word cost.
    pub wcost: i16,
    /// Byte offset of this token's NUL-terminated feature string
    /// within the feature section.
    pub feature_offset: u32,
    /// Compound-word field; semantics defined by the dictionary builder
    /// (unused by this reader).
    pub compound: u32,
}

impl Token {
    /// On-disk record size in bytes. Derived from the actual struct layout
    /// instead of a hard-coded literal so it can never drift from the
    /// `#[repr(C)]` definition above (the unit test pins it to 16).
    pub const SIZE: usize = std::mem::size_of::<Token>();
}
/// A memory-mapped system dictionary: header metadata, a double-array trie
/// for surface-form lookup, fixed-size [`Token`] records, and a pool of
/// NUL-terminated feature strings.
pub struct SysDic {
    // Keeps the mapping alive: `tokens_ptr` and `features_ptr` point into
    // this mmap, so it must not be dropped before the pointers.
    _mmap: Arc<Mmap>,
    trie: DoubleArrayTrie,
    // Start of the token section inside `_mmap`.
    tokens_ptr: *const Token,
    // Number of `Token` records in the token section.
    tokens_count: usize,
    // Start of the feature-string section inside `_mmap`.
    features_ptr: *const u8,
    // Size in bytes of the feature-string section.
    features_size: usize,
    version: u32,
    dict_type: u32,
    lexicon_size: u32,
    left_size: u32,
    right_size: u32,
    charset: String,
}
// SAFETY: the raw pointers reference only the mapping owned by `_mmap`,
// which lives at least as long as this struct, and all access through them
// is read-only — so sending/sharing across threads is sound.
unsafe impl Send for SysDic {}
unsafe impl Sync for SysDic {}
impl std::fmt::Debug for SysDic {
    /// Manual `Debug` impl: the struct holds raw pointers (not `Debug`-friendly),
    /// so only the plain metadata fields and derived sizes are printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut dbg = f.debug_struct("SysDic");
        dbg.field("version", &self.version);
        dbg.field("dict_type", &self.dict_type);
        dbg.field("lexicon_size", &self.lexicon_size);
        dbg.field("left_size", &self.left_size);
        dbg.field("right_size", &self.right_size);
        dbg.field("charset", &self.charset);
        dbg.field("trie_size", &self.trie.size());
        dbg.field("tokens_count", &self.tokens_count);
        dbg.finish()
    }
}
impl SysDic {
pub fn from_mmap(mmap: Arc<Mmap>) -> Result<Self> {
let data = &mmap[..];
if data.len() < HEADER_SIZE {
return Err(Error::CorruptedDictionary(format!(
"Dictionary file too small: {} bytes (minimum {} bytes)",
data.len(),
HEADER_SIZE
)));
}
let magic = LittleEndian::read_u32(&data[0..4]);
let expected_size = magic ^ DICTIONARY_MAGIC_ID;
if expected_size != data.len() as u32 {
return Err(Error::InvalidDictionaryFormat(format!(
"Magic number mismatch: expected file size {}, got {}",
expected_size,
data.len()
)));
}
let version = LittleEndian::read_u32(&data[4..8]);
if version != DIC_VERSION {
return Err(Error::InvalidDictionaryFormat(format!(
"Incompatible dictionary version: expected {}, got {}",
DIC_VERSION, version
)));
}
let dict_type = LittleEndian::read_u32(&data[8..12]);
let lexicon_size = LittleEndian::read_u32(&data[12..16]);
let left_size = LittleEndian::read_u32(&data[16..20]);
let right_size = LittleEndian::read_u32(&data[20..24]);
let da_size = LittleEndian::read_u32(&data[24..28]) as usize;
let token_size = LittleEndian::read_u32(&data[28..32]) as usize;
let feature_size = LittleEndian::read_u32(&data[32..36]) as usize;
let charset_bytes = &data[40..72];
let charset_end = charset_bytes
.iter()
.position(|&b| b == 0)
.unwrap_or(charset_bytes.len());
let charset = String::from_utf8_lossy(&charset_bytes[..charset_end]).to_string();
let expected_total = HEADER_SIZE + da_size + token_size + feature_size;
if data.len() < expected_total {
return Err(Error::CorruptedDictionary(format!(
"Dictionary file truncated: expected {} bytes, got {}",
expected_total,
data.len()
)));
}
let da_offset = HEADER_SIZE;
let trie = DoubleArrayTrie::from_bytes(&data[da_offset..], da_size)?;
let token_offset = da_offset + da_size;
let tokens_ptr = data[token_offset..].as_ptr() as *const Token;
let tokens_count = token_size / Token::SIZE;
let feature_offset = token_offset + token_size;
let features_ptr = data[feature_offset..].as_ptr();
let features_size = feature_size;
Ok(Self {
_mmap: mmap,
trie,
tokens_ptr,
tokens_count,
features_ptr,
features_size,
version,
dict_type,
lexicon_size,
left_size,
right_size,
charset,
})
}
pub fn common_prefix_search(&self, key: &str) -> Vec<DictionaryEntry> {
let key_bytes = key.as_bytes();
let mut results = [DartsResult::default(); MAX_RESULTS];
let num_results = self.trie.common_prefix_search(key_bytes, &mut results);
let mut entries = Vec::new();
for result in results.iter().take(num_results) {
let value = result.value as u32;
let token_start = (value >> 8) as usize;
let token_count = (value & 0xff) as usize;
for i in 0..token_count {
let token_idx = token_start + i;
if let Some(token) = self.get_token(token_idx) {
let feature = self.get_feature(token).to_string();
entries.push(DictionaryEntry {
length: result.length,
word_id: token_idx as u32,
left_id: token.left_id,
right_id: token.right_id,
pos_id: token.pos_id,
wcost: token.wcost,
feature,
});
}
}
}
entries
}
#[inline]
fn get_token(&self, index: usize) -> Option<&Token> {
if index < self.tokens_count {
Some(unsafe { &*self.tokens_ptr.add(index) })
} else {
None
}
}
pub fn get_feature(&self, token: &Token) -> &str {
let offset = token.feature_offset as usize;
if offset >= self.features_size {
return "";
}
let ptr = unsafe { self.features_ptr.add(offset) };
let mut len = 0;
while len < self.features_size - offset {
if unsafe { *ptr.add(len) } == 0 {
break;
}
len += 1;
}
let slice = unsafe { std::slice::from_raw_parts(ptr, len) };
std::str::from_utf8(slice).unwrap_or("")
}
pub fn charset(&self) -> &str {
&self.charset
}
pub fn lexicon_size(&self) -> usize {
self.lexicon_size as usize
}
pub fn version(&self) -> u32 {
self.version
}
pub fn token_count(&self) -> usize {
self.tokens_count
}
pub fn token_at(&self, index: usize) -> Option<&Token> {
self.get_token(index)
}
pub fn dict_type(&self) -> u32 {
self.dict_type
}
pub fn left_size(&self) -> usize {
self.left_size as usize
}
pub fn right_size(&self) -> usize {
self.right_size as usize
}
pub fn trie_size(&self) -> usize {
self.trie.size()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Token` must be exactly 16 bytes: the token-section pointer arithmetic
    /// in `SysDic` assumes this fixed record size.
    #[test]
    fn test_token_size() {
        let layout_size = std::mem::size_of::<Token>();
        assert_eq!(layout_size, 16);
        assert_eq!(Token::SIZE, 16);
    }

    /// The magic id is part of the on-disk format and must never change.
    #[test]
    fn test_magic_id() {
        let expected: u32 = 0xef71_8f77;
        assert_eq!(DICTIONARY_MAGIC_ID, expected);
    }

    /// The header is a fixed 72-byte prefix; offsets elsewhere depend on it.
    #[test]
    fn test_header_size() {
        let expected: usize = 72;
        assert_eq!(HEADER_SIZE, expected);
    }
}