mod char_def;
mod connection_matrix;
mod double_array_trie;
mod feature;
mod overlay;
mod sys_dic;
mod unknown;
pub mod user_dict;
pub use char_def::{CharCategory, CharDef, CharInfo};
pub use connection_matrix::ConnectionMatrix;
pub use double_array_trie::{DartsResult, DoubleArrayTrie};
pub use feature::FeatureTable;
pub use overlay::{OverlayDictionary, OverlayEntry};
pub use sys_dic::{SysDic, Token};
pub use unknown::UnknownDictionary;
pub use user_dict::{DictFormat, UserDictManager, UserDictStats, UserEntry, ValidationResult};
use crate::{Error, Result};
use memmap2::Mmap;
use std::fs::File;
use std::path::Path;
use std::sync::Arc;
/// File name of the system dictionary inside a dictionary directory.
pub const SYS_DIC_FILE: &str = "sys.dic";
/// File name of the unknown-word dictionary inside a dictionary directory.
pub const UNK_DIC_FILE: &str = "unk.dic";
/// File name of the connection matrix inside a dictionary directory.
pub const MATRIX_FILE: &str = "matrix.bin";
/// File name of the character definition data inside a dictionary directory.
pub const CHAR_BIN_FILE: &str = "char.bin";
/// Type code for a system dictionary.
/// NOTE(review): presumably mirrors MeCab's constant of the same name — confirm.
pub const MECAB_SYS_DIC: u32 = 0;
/// Type code for a user dictionary.
/// NOTE(review): presumably mirrors MeCab's constant of the same name — confirm.
pub const MECAB_USR_DIC: u32 = 1;
/// Type code for an unknown-word dictionary.
/// NOTE(review): presumably mirrors MeCab's constant of the same name — confirm.
pub const MECAB_UNK_DIC: u32 = 2;
/// Map from a string key to a list of `(String, f32)` pairs; `Dictionary::load_with_semantics`
/// deserializes it from `surface_map.json`.
/// NOTE(review): the key is presumably a surface form and the pairs weighted targets — confirm
/// against the code that produces the JSON file.
pub type SurfaceMap = std::collections::HashMap<String, Vec<(String, f32)>>;
/// A loaded dictionary: the memory-mapped component files of a dictionary directory plus an
/// in-memory overlay and optional semantic data.
pub struct Dictionary {
/// System dictionary parsed from the memory-mapped `sys.dic` file.
pub sys_dic: SysDic,
/// Unknown-word dictionary parsed from the memory-mapped `unk.dic` file.
pub unknown: UnknownDictionary,
/// Connection matrix parsed from the memory-mapped `matrix.bin` file.
pub matrix: ConnectionMatrix,
/// Character definitions parsed from the memory-mapped `char.bin` file.
pub char_def: CharDef,
/// Overlay dictionary; created fresh by `load`, and `lookup` consults it before `sys_dic`.
pub overlay: OverlayDictionary,
/// Semantic pool; `None` unless `load_with_semantics` found and parsed the semantic file.
pub semantic_pool: Option<Arc<crate::semantic::pool::SemanticPool>>,
/// Surface map; `None` unless `load_with_semantics` found a `surface_map.json` to parse.
pub surface_map: Option<Arc<SurfaceMap>>,
// Handles to the memory maps opened by `load`. The field is never read; holding the `Arc`s
// keeps the mappings alive for as long as this `Dictionary` exists.
_mmaps: Vec<Arc<Mmap>>,
}
/// Debug output lists the main components and marks the struct as non-exhaustive.
impl std::fmt::Debug for Dictionary {
    /// Formats the system dictionary, matrix, character definitions and overlay.
    ///
    /// The remaining fields (`unknown`, `semantic_pool`, `surface_map`, `_mmaps`) are not
    /// printed, so the output ends with `..` to make it obvious that the listing is partial
    /// rather than pretending these four fields are the whole struct.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Dictionary")
            .field("sys_dic", &self.sys_dic)
            .field("matrix", &self.matrix)
            .field("char_def", &self.char_def)
            .field("overlay", &self.overlay)
            .finish_non_exhaustive()
    }
}
impl Dictionary {
pub fn load(path: &Path) -> Result<Self> {
if !path.exists() {
return Err(Error::DictionaryNotFound(path.to_path_buf()));
}
let mut mmaps = Vec::new();
let sys_path = path.join(SYS_DIC_FILE);
let sys_mmap = Arc::new(Self::open_mmap(&sys_path)?);
let sys_dic = SysDic::from_mmap(Arc::clone(&sys_mmap))?;
mmaps.push(sys_mmap);
let matrix_path = path.join(MATRIX_FILE);
let matrix_mmap = Arc::new(Self::open_mmap(&matrix_path)?);
let matrix = ConnectionMatrix::from_mmap(Arc::clone(&matrix_mmap))?;
mmaps.push(matrix_mmap);
let char_path = path.join(CHAR_BIN_FILE);
let char_mmap = Arc::new(Self::open_mmap(&char_path)?);
let char_def = CharDef::from_mmap(Arc::clone(&char_mmap))?;
mmaps.push(char_mmap);
let unk_path = path.join(UNK_DIC_FILE);
let unk_mmap = Arc::new(Self::open_mmap(&unk_path)?);
let unknown = UnknownDictionary::from_mmap(Arc::clone(&unk_mmap))?;
mmaps.push(unk_mmap);
Ok(Self {
sys_dic,
unknown,
matrix,
char_def,
overlay: OverlayDictionary::new(),
semantic_pool: None,
surface_map: None,
_mmaps: mmaps,
})
}
pub fn load_with_semantics(path: &Path, semantic_path: &Path) -> Result<Self> {
let mut dict = Self::load(path)?;
if semantic_path.exists() {
let pool_file = File::open(semantic_path)?;
let pool_data = unsafe { Mmap::map(&pool_file)? };
let pool = crate::semantic::pool::SemanticPool::from_bytes(&pool_data)?;
dict.semantic_pool = Some(Arc::new(pool));
let semantic_dir = semantic_path.parent().unwrap_or(Path::new("."));
let map_path = semantic_dir.join("surface_map.json");
if map_path.exists() {
let map_data = std::fs::read_to_string(&map_path)?;
let map: SurfaceMap = serde_json::from_str(&map_data)?;
dict.surface_map = Some(Arc::new(map));
}
}
Ok(dict)
}
fn open_mmap(path: &Path) -> Result<Mmap> {
if !path.exists() {
return Err(Error::DictionaryNotFound(path.to_path_buf()));
}
let file = File::open(path)?;
let mmap = unsafe { Mmap::map(&file)? };
Ok(mmap)
}
pub fn default_dictionary() -> Result<Self> {
let locations = [
"/var/lib/mecab/dic/ipadic-utf8",
"/usr/lib/mecab/dic/ipadic-utf8",
"/usr/local/lib/mecab/dic/ipadic-utf8",
"/usr/share/mecab/dic/ipadic-utf8",
"/usr/lib/mecab/dic/ipadic",
"/usr/local/lib/mecab/dic/ipadic",
"/usr/share/mecab/dic/ipadic",
"/usr/lib64/mecab/dic/ipadic",
];
if let Some(home) = std::env::var_os("HOME") {
let home_path = Path::new(&home).join(".local/share/mecrab/dic/ipadic");
if home_path.exists() {
return Self::load(&home_path);
}
}
for location in &locations {
let path = Path::new(location);
if path.exists() {
return Self::load(path);
}
}
Err(Error::DefaultDictionaryNotFound)
}
pub fn lookup(&self, key: &str) -> Vec<DictionaryEntry> {
let mut results = self.overlay.lookup(key);
results.extend(self.sys_dic.common_prefix_search(key));
results
}
pub fn add_word(&self, surface: &str, entry: OverlayEntry) {
self.overlay.add_word(surface, entry);
}
pub fn add_simple_word(&self, surface: &str, reading: &str, pronunciation: &str, wcost: i16) {
self.overlay
.add_simple(surface, reading, pronunciation, wcost);
}
pub fn remove_word(&self, surface: &str) -> bool {
self.overlay.remove_word(surface)
}
pub fn overlay_size(&self) -> usize {
self.overlay.len()
}
#[inline]
pub fn connection_cost(&self, right_id: u16, left_id: u16) -> i16 {
self.matrix.cost(right_id, left_id)
}
#[inline]
pub unsafe fn connection_cost_unchecked(&self, right_id: u16, left_id: u16) -> i16 {
unsafe { self.matrix.cost_unchecked(right_id, left_id) }
}
pub fn get_feature(&self, token: &Token) -> &str {
self.sys_dic.get_feature(token)
}
pub fn char_info(&self, c: char) -> CharInfo {
self.char_def.get_char_info(c)
}
pub fn char_category(&self, c: char) -> CharCategory {
self.char_def.get_char_info(c).category()
}
pub fn charset(&self) -> &str {
self.sys_dic.charset()
}
pub fn size(&self) -> usize {
self.sys_dic.lexicon_size()
}
}
/// A single hit returned by `Dictionary::lookup`.
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
/// Length of the match.
/// NOTE(review): presumably the byte length of the matched prefix of the lookup key — confirm.
pub length: usize,
/// Identifier of the word.
/// NOTE(review): the id space (per dictionary or global) is not visible here — confirm.
pub word_id: u32,
/// Left context id.
/// NOTE(review): presumably usable as `left_id` for `Dictionary::connection_cost` — confirm.
pub left_id: u16,
/// Right context id.
/// NOTE(review): presumably usable as `right_id` for `Dictionary::connection_cost` — confirm.
pub right_id: u16,
/// Part-of-speech id.
pub pos_id: u16,
/// Word cost.
pub wcost: i16,
/// Feature string of the entry.
pub feature: String,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the file names the loader looks for inside a dictionary directory.
    #[test]
    fn test_dictionary_file_names() {
        let expectations = [
            (SYS_DIC_FILE, "sys.dic"),
            (MATRIX_FILE, "matrix.bin"),
            (CHAR_BIN_FILE, "char.bin"),
            (UNK_DIC_FILE, "unk.dic"),
        ];
        for (actual, wanted) in expectations {
            assert_eq!(actual, wanted);
        }
    }

    /// Pins the numeric values of the dictionary type codes.
    #[test]
    fn test_dictionary_types() {
        let expectations = [(MECAB_SYS_DIC, 0), (MECAB_USR_DIC, 1), (MECAB_UNK_DIC, 2)];
        for (actual, wanted) in expectations {
            assert_eq!(actual, wanted);
        }
    }
}
#[cfg(test)]
mod integration_tests {
    use super::*;
    use std::path::Path;

    /// Smoke test against an IPADIC installed at `/var/lib/mecab/dic/ipadic-utf8`.
    ///
    /// The test returns early (and passes) when that directory is absent. Otherwise it loads
    /// the dictionary and checks that a few common surfaces yield at least one entry each.
    #[test]
    fn test_load_ipadic() {
        let path = Path::new("/var/lib/mecab/dic/ipadic-utf8");
        if !path.exists() {
            eprintln!("IPADIC not found, skipping test");
            return;
        }

        let dict = Dictionary::load(path).expect("Failed to load dictionary");
        eprintln!("Dictionary loaded:");
        eprintln!(" Size: {} entries", dict.size());
        eprintln!(" Charset: {}", dict.charset());

        // Each surface is looked up, its first three hits are printed, and it must be found.
        for surface in ["の", "テスト", "東京"] {
            let entries = dict.lookup(surface);
            eprintln!("\nLookup '{}': {} entries", surface, entries.len());
            for entry in entries.iter().take(3) {
                eprintln!(
                    " length={}, wcost={}, feature={}",
                    entry.length, entry.wcost, entry.feature
                );
            }
            assert!(
                !entries.is_empty(),
                "Expected to find '{}' in dictionary",
                surface
            );
        }
    }
}