use std::io::{BufRead, BufReader, Read as _, Write as _};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use crate::entry_store::{EagerStore, EntryStore, LazyStore};
use crate::error::{DictError, Result};
use crate::lazy_entries::LazyEntries;
use crate::matrix::{ConnectionMatrix, Matrix};
use crate::trie::Trie;
use crate::user_dict::UserDictionary;
use crate::{Dictionary, Entry};
#[cfg(feature = "hot-reload-v2")]
use crate::hot_reload_v2::HotReloadDictV2;
/// Directories probed, in order, by `DictionaryLoader::find_dicdir` once the
/// `MECAB_DICDIR` environment variable has been tried.
const DEFAULT_DICDIR_PATHS: &[&str] = &[
"/usr/local/lib/mecab/dic/mecab-ko-dic",
"/usr/lib/mecab/dic/mecab-ko-dic",
"/opt/mecab/dic/mecab-ko-dic",
"./dic/mecab-ko-dic",
];
/// File name of the trie inside a dictionary directory. The loaders also
/// accept the same name with a `.zst` suffix.
const TRIE_FILE: &str = "sys.dic";
/// File name of the binary connection matrix. The loaders also accept the
/// same name with a `.zst` suffix, or `matrix.def`.
const MATRIX_FILE: &str = "matrix.bin";
/// File name of the binary entry table.
const ENTRIES_BIN_FILE: &str = "entries.bin";
/// File name of the CSV entry table, consulted only when `entries.bin` is absent.
const ENTRIES_CSV_FILE: &str = "entries.csv";
/// Magic bytes that open an `entries.bin` file in the layout written by
/// `save_entries_bin`.
const ENTRIES_MAGIC: &[u8; 4] = b"MKED";
/// Layout version written after `ENTRIES_MAGIC`; the only version
/// `load_entries_bin` accepts.
const ENTRIES_VERSION: u32 = 1;
/// A loaded system dictionary: trie index, connection matrix and entry
/// storage, plus an optional user dictionary and (with the `hot-reload-v2`
/// feature) an optional hot-reload overlay.
pub struct SystemDictionary {
/// Directory the dictionary was loaded from.
dicdir: PathBuf,
/// Trie mapping surface forms to the index of their first entry.
trie: Trie<'static>,
/// Connection-cost matrix consulted by `get_connection_cost`.
matrix: ConnectionMatrix,
/// Backing storage for entries (eager or lazy, chosen at load time).
entry_store: Arc<dyn EntryStore>,
/// Optional user dictionary merged in by `lookup_combined`.
user_dict: Option<Arc<UserDictionary>>,
/// Optional hot-reloadable overlay consulted by `common_prefix_search`
/// and `lookup_combined`.
#[cfg(feature = "hot-reload-v2")]
hot_reload: Option<Arc<HotReloadDictV2>>,
}
/// One dictionary record as stored in `entries.bin` / `entries.csv`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DictEntry {
/// Surface form (the text that is matched).
pub surface: String,
/// Left context ID.
pub left_id: u16,
/// Right context ID.
pub right_id: u16,
/// Cost associated with this entry.
pub cost: i16,
/// Feature string; in this file's fixtures it is a comma-separated list
/// such as `NNG,*,T,...`.
pub feature: String,
}
impl DictEntry {
    /// Builds an entry from its five components.
    ///
    /// `surface` and `feature` accept anything convertible into `String`,
    /// so both `&str` and `String` can be passed.
    pub fn new(
        surface: impl Into<String>,
        left_id: u16,
        right_id: u16,
        cost: i16,
        feature: impl Into<String>,
    ) -> Self {
        let surface = surface.into();
        let feature = feature.into();
        Self {
            surface,
            left_id,
            right_id,
            cost,
            feature,
        }
    }

    /// Produces an [`Entry`] carrying a copy of every field of `self`.
    #[must_use]
    pub fn to_entry(&self) -> Entry {
        let Self {
            surface,
            left_id,
            right_id,
            cost,
            feature,
        } = self;
        Entry {
            surface: surface.clone(),
            left_id: *left_id,
            right_id: *right_id,
            cost: *cost,
            feature: feature.clone(),
        }
    }
}
impl From<Entry> for DictEntry {
fn from(entry: Entry) -> Self {
Self {
surface: entry.surface,
left_id: entry.left_id,
right_id: entry.right_id,
cost: entry.cost,
feature: entry.feature,
}
}
}
/// Knobs controlling how `SystemDictionary::load_with_options` materialises
/// the matrix and the entry table.
#[derive(Debug, Clone, Copy)]
pub struct LoadOptions {
    /// When `true` and `matrix.bin` exists, the matrix is opened with
    /// `ConnectionMatrix::from_mmap_file` instead of `from_bin_file`.
    pub use_mmap_matrix: bool,
    /// When `true`, `entries.bin` is opened through `LazyEntries` if possible;
    /// otherwise all entries are loaded eagerly.
    pub use_lazy_entries: bool,
    /// Cache size handed to the lazy entry reader; `None` leaves the reader's
    /// own setting untouched.
    pub lazy_cache_size: Option<usize>,
}

impl Default for LoadOptions {
    /// Lazy entries with a 10000-entry cache, matrix read without mmap.
    fn default() -> Self {
        // Identical to the memory-optimised preset except for the matrix.
        Self {
            use_mmap_matrix: false,
            ..Self::memory_optimized()
        }
    }
}

impl LoadOptions {
    /// Preset that enables both the mmap matrix and lazy entries
    /// (cache size 10000).
    #[must_use]
    pub const fn memory_optimized() -> Self {
        Self {
            lazy_cache_size: Some(10000),
            use_lazy_entries: true,
            use_mmap_matrix: true,
        }
    }

    /// Preset that disables both the mmap matrix and lazy entries.
    #[must_use]
    pub const fn speed_optimized() -> Self {
        Self {
            lazy_cache_size: None,
            use_lazy_entries: false,
            use_mmap_matrix: false,
        }
    }

    /// Alias for [`LoadOptions::speed_optimized`].
    #[must_use]
    pub const fn eager() -> Self {
        Self::speed_optimized()
    }
}
impl SystemDictionary {
/// Locates a dictionary directory with `DictionaryLoader::find_dicdir` and
/// loads it with [`SystemDictionary::load`].
///
/// # Errors
/// Propagates the error from directory discovery or from loading.
pub fn load_default() -> Result<Self> {
let dicdir = DictionaryLoader::find_dicdir()?;
Self::load(dicdir)
}
/// Locates a dictionary directory with `DictionaryLoader::find_dicdir` and
/// loads it using `LoadOptions::memory_optimized()`.
///
/// # Errors
/// Propagates the error from directory discovery or from loading.
pub fn load_memory_optimized() -> Result<Self> {
let dicdir = DictionaryLoader::find_dicdir()?;
Self::load_with_options(dicdir, LoadOptions::memory_optimized())
}
/// Loads the dictionary found in `dicdir`, honouring `options`.
///
/// Resolution order:
/// * trie: `sys.dic`, then `sys.dic.zst`;
/// * matrix: `matrix.bin` (mmap or plain read, per `options.use_mmap_matrix`),
///   then `matrix.bin.zst`, then `matrix.def`;
/// * entries: a lazy reader over `entries.bin` when requested and openable,
///   otherwise everything `load_entries` returns, held eagerly.
///
/// # Errors
/// Returns `DictError::Format` when no trie or no matrix file is present, and
/// propagates any error raised while reading the files that are present.
pub fn load_with_options<P: AsRef<Path>>(dicdir: P, options: LoadOptions) -> Result<Self> {
    let dicdir = dicdir.as_ref().to_path_buf();

    // Trie: prefer the plain file over the compressed sibling.
    let trie_path = dicdir.join(TRIE_FILE);
    let trie_zst_path = dicdir.join(format!("{TRIE_FILE}.zst"));
    let trie = if trie_path.exists() {
        Trie::from_file(&trie_path)?
    } else if trie_zst_path.exists() {
        Trie::from_compressed_file(&trie_zst_path)?
    } else {
        return Err(DictError::Format(format!(
            "Trie file not found: {}",
            trie_path.display()
        )));
    };

    // Matrix: binary, then compressed binary, then the textual definition.
    let matrix_path = dicdir.join(MATRIX_FILE);
    let matrix_zst_path = dicdir.join(format!("{MATRIX_FILE}.zst"));
    let matrix_def_path = dicdir.join("matrix.def");
    let matrix = if matrix_path.exists() {
        if options.use_mmap_matrix {
            ConnectionMatrix::from_mmap_file(&matrix_path)?
        } else {
            ConnectionMatrix::from_bin_file(&matrix_path)?
        }
    } else if matrix_zst_path.exists() {
        ConnectionMatrix::from_compressed_file(&matrix_zst_path)?
    } else if matrix_def_path.exists() {
        ConnectionMatrix::from_def_file(&matrix_def_path)?
    } else {
        return Err(DictError::Format(format!(
            "Matrix file not found: {}",
            matrix_path.display()
        )));
    };

    // Entries: try the lazy reader only when asked for and when entries.bin
    // exists. A failure to open it lazily is deliberately not an error; the
    // eager path below takes over.
    let lazy_reader = if options.use_lazy_entries {
        let entries_path = dicdir.join(ENTRIES_BIN_FILE);
        if entries_path.exists() {
            LazyEntries::from_file(&entries_path).ok()
        } else {
            None
        }
    } else {
        None
    };
    let entry_store: Arc<dyn EntryStore> = match lazy_reader {
        Some(lazy) => {
            if let Some(cache_size) = options.lazy_cache_size {
                lazy.set_cache_size(cache_size);
            }
            Arc::new(LazyStore::new(lazy))
        }
        None => Arc::new(EagerStore::new(Self::load_entries(&dicdir)?)),
    };

    Ok(Self {
        dicdir,
        trie,
        matrix,
        entry_store,
        user_dict: None,
        #[cfg(feature = "hot-reload-v2")]
        hot_reload: None,
    })
}
pub fn load<P: AsRef<Path>>(dicdir: P) -> Result<Self> {
let dicdir = dicdir.as_ref().to_path_buf();
let trie_path = dicdir.join(TRIE_FILE);
let trie = if trie_path.exists() {
Trie::from_file(&trie_path)?
} else {
let compressed_path = dicdir.join(format!("{TRIE_FILE}.zst"));
if compressed_path.exists() {
Trie::from_compressed_file(&compressed_path)?
} else {
return Err(DictError::Format(format!(
"Trie file not found: {}",
trie_path.display()
)));
}
};
let matrix_path = dicdir.join(MATRIX_FILE);
let matrix = if matrix_path.exists() {
ConnectionMatrix::from_bin_file(&matrix_path)?
} else {
let compressed_path = dicdir.join(format!("{MATRIX_FILE}.zst"));
if compressed_path.exists() {
ConnectionMatrix::from_compressed_file(&compressed_path)?
} else {
let def_path = dicdir.join("matrix.def");
if def_path.exists() {
ConnectionMatrix::from_def_file(&def_path)?
} else {
return Err(DictError::Format(format!(
"Matrix file not found: {}",
matrix_path.display()
)));
}
}
};
let entries = Self::load_entries(&dicdir)?;
let entry_store: Arc<dyn EntryStore> = Arc::new(EagerStore::new(entries));
Ok(Self {
dicdir,
trie,
matrix,
entry_store,
user_dict: None,
#[cfg(feature = "hot-reload-v2")]
hot_reload: None,
})
}
/// Reads every entry from `dicdir`, preferring `entries.bin` over
/// `entries.csv`.
///
/// A directory containing neither file yields an empty vector rather than an
/// error.
fn load_entries(dicdir: &Path) -> Result<Vec<DictEntry>> {
    let binary = dicdir.join(ENTRIES_BIN_FILE);
    let textual = dicdir.join(ENTRIES_CSV_FILE);
    if binary.exists() {
        Self::load_entries_bin(&binary)
    } else if textual.exists() {
        Self::load_entries_csv(&textual)
    } else {
        Ok(Vec::new())
    }
}
/// Parses a CSV entry table of the form
/// `surface,left_id,right_id,cost,feature`.
///
/// Each line is trimmed; blank lines and lines starting with `#` are skipped.
/// The line is split on the first four commas only, so `feature` keeps its
/// own commas, while a surface that itself contains a comma cannot be
/// represented. A missing `feature` column becomes the empty string.
///
/// # Errors
/// `DictError::Io` on read failures; `DictError::Format` (with a 1-based line
/// number) when a numeric column is missing or does not parse.
fn load_entries_csv(path: &Path) -> Result<Vec<DictEntry>> {
    let reader = BufReader::new(std::fs::File::open(path).map_err(DictError::Io)?);
    let mut entries = Vec::new();
    for (idx, raw) in reader.lines().enumerate() {
        // Error messages use 1-based line numbers.
        let line_no = idx + 1;
        let raw = raw.map_err(DictError::Io)?;
        let record = raw.trim();
        if record.is_empty() || record.starts_with('#') {
            continue;
        }
        let mut columns = record.splitn(5, ',');

        let surface = match columns.next() {
            Some(text) => text.to_string(),
            None => {
                return Err(DictError::Format(format!(
                    "line {}: missing surface",
                    line_no
                )))
            }
        };
        let left_id = match columns.next() {
            Some(text) => match text.parse::<u16>() {
                Ok(value) => value,
                Err(_) => {
                    return Err(DictError::Format(format!(
                        "line {}: invalid left_id",
                        line_no
                    )))
                }
            },
            None => {
                return Err(DictError::Format(format!(
                    "line {}: missing left_id",
                    line_no
                )))
            }
        };
        let right_id = match columns.next() {
            Some(text) => match text.parse::<u16>() {
                Ok(value) => value,
                Err(_) => {
                    return Err(DictError::Format(format!(
                        "line {}: invalid right_id",
                        line_no
                    )))
                }
            },
            None => {
                return Err(DictError::Format(format!(
                    "line {}: missing right_id",
                    line_no
                )))
            }
        };
        let cost = match columns.next() {
            Some(text) => match text.parse::<i16>() {
                Ok(value) => value,
                Err(_) => {
                    return Err(DictError::Format(format!(
                        "line {}: invalid cost",
                        line_no
                    )))
                }
            },
            None => {
                return Err(DictError::Format(format!(
                    "line {}: missing cost",
                    line_no
                )))
            }
        };
        // Whatever follows the fourth comma, commas included, is the feature.
        let feature = match columns.next() {
            Some(text) => text.to_string(),
            None => String::new(),
        };

        entries.push(DictEntry {
            surface,
            left_id,
            right_id,
            cost,
            feature,
        });
    }
    Ok(entries)
}
fn load_entries_bin(path: &Path) -> Result<Vec<DictEntry>> {
let data = std::fs::read(path).map_err(DictError::Io)?;
let mut cursor = std::io::Cursor::new(&data);
let mut magic = [0u8; 4];
cursor
.read_exact(&mut magic)
.map_err(|e| DictError::Format(format!("entries.bin magic: {e}")))?;
if &magic == b"MKE2" {
return Self::load_entries_bin_v2(path);
}
if &magic != ENTRIES_MAGIC {
return Err(DictError::Format(
"entries.bin: invalid magic number (expected MKED or MKE2)".into(),
));
}
let version = cursor
.read_u32::<LittleEndian>()
.map_err(|e| DictError::Format(format!("entries.bin version: {e}")))?;
if version != ENTRIES_VERSION {
return Err(DictError::Format(format!(
"entries.bin: unsupported version {version}"
)));
}
let count = cursor
.read_u32::<LittleEndian>()
.map_err(|e| DictError::Format(format!("entries.bin count: {e}")))?;
let mut entries = Vec::with_capacity(count as usize);
for i in 0..count {
let left_id = cursor
.read_u16::<LittleEndian>()
.map_err(|e| DictError::Format(format!("entries.bin entry {i} left_id: {e}")))?;
let right_id = cursor
.read_u16::<LittleEndian>()
.map_err(|e| DictError::Format(format!("entries.bin entry {i} right_id: {e}")))?;
let cost = cursor
.read_i16::<LittleEndian>()
.map_err(|e| DictError::Format(format!("entries.bin entry {i} cost: {e}")))?;
let surface_len = cursor
.read_u16::<LittleEndian>()
.map_err(|e| DictError::Format(format!("entries.bin entry {i} surface_len: {e}")))?
as usize;
let feature_len = cursor
.read_u16::<LittleEndian>()
.map_err(|e| DictError::Format(format!("entries.bin entry {i} feature_len: {e}")))?
as usize;
let mut surface_bytes = vec![0u8; surface_len];
cursor
.read_exact(&mut surface_bytes)
.map_err(|e| DictError::Format(format!("entries.bin entry {i} surface: {e}")))?;
let surface = String::from_utf8(surface_bytes).map_err(|e| {
DictError::Format(format!("entries.bin entry {i} surface utf8: {e}"))
})?;
let mut feature_bytes = vec![0u8; feature_len];
cursor
.read_exact(&mut feature_bytes)
.map_err(|e| DictError::Format(format!("entries.bin entry {i} feature: {e}")))?;
let feature = String::from_utf8(feature_bytes).map_err(|e| {
DictError::Format(format!("entries.bin entry {i} feature utf8: {e}"))
})?;
entries.push(DictEntry {
surface,
left_id,
right_id,
cost,
feature,
});
}
Ok(entries)
}
fn load_entries_bin_v2(path: &Path) -> Result<Vec<DictEntry>> {
let lazy = LazyEntries::from_file(path)?;
let count = lazy.len();
let mut entries = Vec::with_capacity(count);
for i in 0..count {
let entry = lazy.get(i as u32)?;
entries.push((*entry).clone());
}
Ok(entries)
}
/// Writes `entries` to `path` in the `MKED` version-1 binary layout that
/// `load_entries_bin` reads (little-endian; per record: `left_id`,
/// `right_id`, `cost`, `surface_len`, `feature_len`, surface bytes, feature
/// bytes).
///
/// # Errors
/// `DictError::Format` if there are more than `u32::MAX` entries or a surface
/// or feature is longer than `u16::MAX` bytes; `DictError::Io` for any
/// failure creating, writing or flushing the file.
pub fn save_entries_bin(entries: &[DictEntry], path: &Path) -> Result<()> {
    // Check the count before touching the filesystem, so an oversized input
    // does not truncate an existing file.
    let count = u32::try_from(entries.len())
        .map_err(|_| DictError::Format("too many entries".into()))?;

    // Records are made of many tiny writes; buffering avoids one syscall per
    // field.
    let file = std::fs::File::create(path).map_err(DictError::Io)?;
    let mut out = std::io::BufWriter::new(file);

    out.write_all(ENTRIES_MAGIC).map_err(DictError::Io)?;
    out.write_u32::<LittleEndian>(ENTRIES_VERSION)
        .map_err(DictError::Io)?;
    out.write_u32::<LittleEndian>(count)
        .map_err(DictError::Io)?;

    for entry in entries {
        let surface_bytes = entry.surface.as_bytes();
        let surface_len = u16::try_from(surface_bytes.len())
            .map_err(|_| DictError::Format("surface too long".into()))?;
        let feature_bytes = entry.feature.as_bytes();
        let feature_len = u16::try_from(feature_bytes.len())
            .map_err(|_| DictError::Format("feature too long".into()))?;

        out.write_u16::<LittleEndian>(entry.left_id)
            .map_err(DictError::Io)?;
        out.write_u16::<LittleEndian>(entry.right_id)
            .map_err(DictError::Io)?;
        out.write_i16::<LittleEndian>(entry.cost)
            .map_err(DictError::Io)?;
        out.write_u16::<LittleEndian>(surface_len)
            .map_err(DictError::Io)?;
        out.write_u16::<LittleEndian>(feature_len)
            .map_err(DictError::Io)?;
        out.write_all(surface_bytes).map_err(DictError::Io)?;
        out.write_all(feature_bytes).map_err(DictError::Io)?;
    }

    // Dropping a BufWriter discards flush errors; flush explicitly so they
    // reach the caller.
    out.flush().map_err(DictError::Io)
}
/// Writes `entries` to `path`, one `surface,left_id,right_id,cost,feature`
/// line per entry.
///
/// No quoting or escaping is applied. `load_entries_csv` splits on the first
/// four commas, so a surface containing a comma will not round-trip.
///
/// # Errors
/// `DictError::Io` for any failure creating, writing or flushing the file.
pub fn save_entries_csv(entries: &[DictEntry], path: &Path) -> Result<()> {
    let file = std::fs::File::create(path).map_err(DictError::Io)?;
    // Buffer the output: one unbuffered write per line is a syscall per entry.
    let mut out = std::io::BufWriter::new(file);
    for entry in entries {
        writeln!(
            out,
            "{},{},{},{},{}",
            entry.surface, entry.left_id, entry.right_id, entry.cost, entry.feature
        )
        .map_err(DictError::Io)?;
    }
    // Dropping a BufWriter discards flush errors; flush explicitly so they
    // reach the caller.
    out.flush().map_err(DictError::Io)
}
/// Forwards to `EntryStore::get_entries_at` on the backing store.
/// Judging by the tests in this file, it returns the run of entries starting
/// at `first_index` whose surface is `surface` — NOTE(review): confirm against
/// the `EntryStore` contract.
fn get_entries_at(&self, first_index: u32, surface: &str) -> Result<Vec<Arc<DictEntry>>> {
self.entry_store.get_entries_at(first_index, surface)
}
/// Builder-style setter: attaches `user_dict` (replacing any previous one)
/// and returns `self`.
#[must_use]
pub fn with_user_dictionary(mut self, user_dict: UserDictionary) -> Self {
self.user_dict = Some(Arc::new(user_dict));
self
}
/// Attaches `user_dict` in place, replacing any previous user dictionary.
pub fn set_user_dictionary(&mut self, user_dict: UserDictionary) {
self.user_dict = Some(Arc::new(user_dict));
}
/// Directory stored at construction time.
#[must_use]
pub fn dicdir(&self) -> &Path {
&self.dicdir
}
/// Borrow of the underlying trie.
#[must_use]
pub const fn trie(&self) -> &Trie<'static> {
&self.trie
}
/// Borrow of the underlying connection matrix.
#[must_use]
pub const fn matrix(&self) -> &ConnectionMatrix {
&self.matrix
}
/// Number of entries reported by the backing entry store.
#[must_use]
pub fn entry_count(&self) -> usize {
self.entry_store.len()
}
/// Borrow of the shared handle to the backing entry store.
#[must_use]
pub fn entry_store(&self) -> &Arc<dyn EntryStore> {
&self.entry_store
}
/// The attached user dictionary, or `None` if none has been set.
#[must_use]
pub fn user_dictionary(&self) -> Option<&UserDictionary> {
self.user_dict.as_deref()
}
/// Builder-style setter: attaches the hot-reload handle `hr` (replacing any
/// previous one) and returns `self`. Only available with the
/// `hot-reload-v2` feature.
#[cfg(feature = "hot-reload-v2")]
#[must_use]
pub fn with_hot_reload(mut self, hr: Arc<HotReloadDictV2>) -> Self {
self.hot_reload = Some(hr);
self
}
/// Attaches the hot-reload handle `hr` in place, replacing any previous one.
/// Only available with the `hot-reload-v2` feature.
#[cfg(feature = "hot-reload-v2")]
pub fn set_hot_reload(&mut self, hr: Arc<HotReloadDictV2>) {
self.hot_reload = Some(hr);
}
/// The attached hot-reload handle, or `None` if none has been set.
/// Only available with the `hot-reload-v2` feature.
#[cfg(feature = "hot-reload-v2")]
#[must_use]
pub const fn hot_reload(&self) -> Option<&Arc<HotReloadDictV2>> {
self.hot_reload.as_ref()
}
/// Fetches the entry at `index` from the backing store.
///
/// # Errors
/// Propagates the store's error; the tests in this file show an
/// out-of-range index is reported as `Err`.
pub fn get_entry(&self, index: u32) -> Result<Arc<DictEntry>> {
self.entry_store.get(index)
}
/// Finds every dictionary entry whose surface is a prefix of `text`.
///
/// Each result pairs the entry with the byte length of the matched prefix.
/// Trie hits come first. With the `hot-reload-v2` feature and an attached
/// handle, matches from the current snapshot's domain stack are appended.
/// The user dictionary is not consulted here.
///
/// # Errors
/// Propagates any error from the entry store.
pub fn common_prefix_search(&self, text: &str) -> Result<Vec<(Arc<DictEntry>, usize)>> {
    let mut matches = Vec::new();
    for (first_index, prefix_len) in self.trie.common_prefix_search(text) {
        let hits = self.get_entries_at(first_index, &text[..prefix_len])?;
        matches.extend(hits.into_iter().map(|hit| (hit, prefix_len)));
    }
    #[cfg(feature = "hot-reload-v2")]
    if let Some(hot) = self.hot_reload.as_ref() {
        let snapshot = hot.load();
        let overlay_hits = snapshot.domain_stack.common_prefix_search(text);
        for overlay in overlay_hits {
            // The overlay reports no length of its own; use the surface's.
            let span = overlay.surface.len();
            let converted = DictEntry::new(
                &overlay.surface,
                overlay.left_id,
                overlay.right_id,
                overlay.cost,
                &overlay.feature,
            );
            matches.push((Arc::new(converted), span));
        }
    }
    Ok(matches)
}
/// Like `common_prefix_search`, but matching starts at byte offset
/// `start_byte` of `text`.
///
/// Each result pairs the entry with the byte length of the match, i.e. the
/// trie's end offset minus `start_byte`. Only the trie is consulted here.
/// NOTE(review): unlike `common_prefix_search`, the hot-reload overlay is not
/// searched; confirm that is intended.
///
/// # Errors
/// Propagates any error from the entry store.
pub fn common_prefix_search_at(
    &self,
    text: &str,
    start_byte: usize,
) -> Result<Vec<(Arc<DictEntry>, usize)>> {
    let mut matches = Vec::new();
    for (first_index, end_byte) in self.trie.common_prefix_search_at(text, start_byte) {
        let span = end_byte - start_byte;
        let hits = self.get_entries_at(first_index, &text[start_byte..end_byte])?;
        matches.extend(hits.into_iter().map(|hit| (hit, span)));
    }
    Ok(matches)
}
/// Exact-match lookup across every attached source.
///
/// Results are ordered: system dictionary entries, then user dictionary
/// entries (if one is attached), then, with the `hot-reload-v2` feature and
/// an attached handle, entries from the current snapshot's domain stack.
#[must_use]
pub fn lookup_combined(&self, surface: &str) -> Vec<Entry> {
    let mut merged = self.lookup(surface);
    if let Some(user_dict) = self.user_dict.as_ref() {
        let user_entries = user_dict.lookup(surface);
        for user_entry in user_entries.iter() {
            merged.push(user_entry.to_entry());
        }
    }
    #[cfg(feature = "hot-reload-v2")]
    if let Some(hot) = self.hot_reload.as_ref() {
        let snapshot = hot.load();
        let domain_entries = snapshot.domain_stack.lookup(surface);
        for ue in domain_entries.iter() {
            merged.push(Entry {
                surface: ue.surface.clone(),
                left_id: ue.left_id,
                right_id: ue.right_id,
                cost: ue.cost,
                feature: ue.feature.clone(),
            });
        }
    }
    merged
}
#[doc(hidden)]
#[must_use]
pub fn new_test(
dicdir: PathBuf,
trie: Trie<'static>,
matrix: ConnectionMatrix,
entries: Vec<DictEntry>,
) -> Self {
Self {
dicdir,
trie,
matrix,
entry_store: Arc::new(EagerStore::new(entries)),
user_dict: None,
#[cfg(feature = "hot-reload-v2")]
hot_reload: None,
}
}
}
impl Dictionary for SystemDictionary {
    /// Exact-match lookup in the system dictionary only.
    ///
    /// Returns an empty vector when the trie has no exact match and also when
    /// the entry store reports an error; this signature cannot carry an
    /// error, so it is dropped.
    fn lookup(&self, surface: &str) -> Vec<Entry> {
        let index = match self.trie.exact_match(surface) {
            Some(index) => index,
            None => return Vec::new(),
        };
        match self.get_entries_at(index, surface) {
            Ok(found) => found.iter().map(|e| e.to_entry()).collect(),
            Err(_) => Vec::new(),
        }
    }

    /// Connection cost between two context IDs.
    ///
    /// The matrix is queried as `get(right_id, left_id)`, i.e. with the
    /// arguments swapped relative to this method's parameter order.
    /// NOTE(review): confirm this matches the matrix's index convention.
    /// A value that does not fit in `i16` is reported as `i16::MAX`.
    fn get_connection_cost(&self, left_id: u16, right_id: u16) -> i16 {
        let raw_cost = self.matrix.get(right_id, left_id);
        i16::try_from(raw_cost).unwrap_or(i16::MAX)
    }
}
/// Stateless namespace for locating, validating and loading dictionary
/// directories.
pub struct DictionaryLoader;
impl DictionaryLoader {
/// Locates a dictionary directory.
///
/// Candidates are tried in this order; the first that is an existing
/// directory wins:
/// 1. the `MECAB_DICDIR` environment variable,
/// 2. each path in `DEFAULT_DICDIR_PATHS`,
/// 3. `test-fixtures/mini-dict` relative to the crate's compile-time
///    `CARGO_MANIFEST_DIR` (a warning is printed to stderr when this
///    fallback is used).
///
/// A `MECAB_DICDIR` value that is not a directory is skipped silently.
///
/// # Errors
/// `DictError::Format` when none of the candidates exists.
pub fn find_dicdir() -> Result<PathBuf> {
    // `var_os` rather than `var`: a path that is not valid Unicode is still
    // a legitimate directory name and must not be ignored.
    if let Some(raw) = std::env::var_os("MECAB_DICDIR") {
        let path = PathBuf::from(raw);
        if path.is_dir() {
            return Ok(path);
        }
    }
    for &path_str in DEFAULT_DICDIR_PATHS {
        let path = PathBuf::from(path_str);
        if path.is_dir() {
            return Ok(path);
        }
    }
    {
        let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        let test_dict = manifest_dir.join("../../test-fixtures/mini-dict");
        if test_dict.is_dir() {
            eprintln!(
                "[mecab-ko WARNING] No system dictionary found; falling back to sparse \
                 test dictionary at '{}'. Most Korean words will NOT be tokenized. \
                 Set MECAB_DICDIR to a full mecab-ko-dic installation path.",
                test_dict.display()
            );
            return Ok(test_dict);
        }
    }
    // List the probed paths from the constant so the message cannot go stale.
    Err(DictError::Format(format!(
        "Dictionary directory not found. Set MECAB_DICDIR environment variable or \
         install mecab-ko-dic to one of: {}",
        DEFAULT_DICDIR_PATHS.join(", ")
    )))
}
/// Convenience wrapper around `SystemDictionary::load`.
///
/// # Errors
/// Propagates the error from `SystemDictionary::load`.
pub fn load_system<P: AsRef<Path>>(dicdir: P) -> Result<SystemDictionary> {
SystemDictionary::load(dicdir)
}
/// Convenience wrapper around `SystemDictionary::load_default`.
///
/// # Errors
/// Propagates the error from `SystemDictionary::load_default`.
pub fn load_default() -> Result<SystemDictionary> {
SystemDictionary::load_default()
}
pub fn validate_dicdir<P: AsRef<Path>>(dicdir: P) -> Result<()> {
let dicdir = dicdir.as_ref();
if !dicdir.is_dir() {
return Err(DictError::Format(format!(
"Dictionary directory does not exist: {}",
dicdir.display()
)));
}
let has_trie =
dicdir.join(TRIE_FILE).exists() || dicdir.join(format!("{TRIE_FILE}.zst")).exists();
let has_matrix = dicdir.join(MATRIX_FILE).exists() || dicdir.join("matrix.def").exists();
if !has_trie {
return Err(DictError::Format(format!(
"Trie file not found in {}",
dicdir.display()
)));
}
if !has_matrix {
return Err(DictError::Format(format!(
"Matrix file not found in {}",
dicdir.display()
)));
}
Ok(())
}
}
// Unit tests for entry conversion, lookups, accessors and the
// entries.bin / entries.csv round trips.
#[cfg(test)]
#[allow(
clippy::expect_used,
clippy::unwrap_used,
clippy::items_after_statements
)]
mod tests {
use super::*;
use crate::matrix::DenseMatrix;
use crate::trie::TrieBuilder;
// Builds an in-memory dictionary with five single-entry surfaces and a
// 10x10 dense matrix constructed with the value 100.
fn create_test_dictionary() -> SystemDictionary {
let entries = vec![
("가", 0u32),
("가다", 1),
("가방", 2),
("나", 3),
("나다", 4),
];
let trie_bytes = TrieBuilder::build(&entries).expect("should build trie");
let trie = Trie::from_vec(trie_bytes);
let matrix = DenseMatrix::new(10, 10, 100);
let matrix = ConnectionMatrix::Dense(matrix);
let dict_entries = vec![
DictEntry::new("가", 1, 1, 100, "NNG,*,T,가,*,*,*,*"),
DictEntry::new("가다", 2, 2, 200, "VV,*,F,가다,*,*,*,*"),
DictEntry::new("가방", 3, 3, 300, "NNG,*,T,가방,*,*,*,*"),
DictEntry::new("나", 4, 4, 400, "NP,*,F,나,*,*,*,*"),
DictEntry::new("나다", 5, 5, 500, "VV,*,F,나다,*,*,*,*"),
];
SystemDictionary {
dicdir: PathBuf::from("./test_dic"),
trie,
matrix,
entry_store: Arc::new(EagerStore::new(dict_entries)),
user_dict: None,
#[cfg(feature = "hot-reload-v2")]
hot_reload: None,
}
}
// `DictEntry::new` stores its arguments unchanged.
#[test]
fn test_dict_entry_creation() {
let entry = DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*");
assert_eq!(entry.surface, "안녕");
assert_eq!(entry.left_id, 1);
assert_eq!(entry.right_id, 1);
assert_eq!(entry.cost, 100);
}
// `to_entry` copies the fields into an `Entry`.
#[test]
fn test_dict_entry_to_entry() {
let dict_entry = DictEntry::new("테스트", 5, 5, 200, "NNG,*,T,테스트,*,*,*,*");
let entry = dict_entry.to_entry();
assert_eq!(entry.surface, "테스트");
assert_eq!(entry.left_id, 5);
assert_eq!(entry.cost, 200);
}
// Exact-match lookup finds known surfaces and returns nothing for unknown ones.
#[test]
fn test_system_dictionary_lookup() {
let dict = create_test_dictionary();
let entries = dict.lookup("가");
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].surface, "가");
let entries = dict.lookup("가다");
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].surface, "가다");
let entries = dict.lookup("없음");
assert!(entries.is_empty());
}
// The fixture matrix was built with 100, so this connection cost is 100.
#[test]
fn test_system_dictionary_get_connection_cost() {
let dict = create_test_dictionary();
let cost = dict.get_connection_cost(1, 2);
assert_eq!(cost, 100); }
// Both "가" and "가방" are prefixes of the query.
#[test]
fn test_common_prefix_search() {
let dict = create_test_dictionary();
let results = dict
.common_prefix_search("가방에")
.expect("search should work");
assert_eq!(results.len(), 2);
let surfaces: Vec<_> = results.iter().map(|(e, _)| e.surface.as_str()).collect();
assert!(surfaces.contains(&"가"));
assert!(surfaces.contains(&"가방"));
}
// Searching from the byte offset after "나" finds "가" and "가다".
#[test]
fn test_common_prefix_search_at() {
let dict = create_test_dictionary();
let text = "나가다";
let start = "나".len();
let results = dict
.common_prefix_search_at(text, start)
.expect("search should work");
assert_eq!(results.len(), 2);
let surfaces: Vec<_> = results.iter().map(|(e, _)| e.surface.as_str()).collect();
assert!(surfaces.contains(&"가"));
assert!(surfaces.contains(&"가다"));
}
// A surface present only in the user dictionary is found by `lookup_combined`.
#[test]
fn test_with_user_dictionary() {
let mut dict = create_test_dictionary();
let mut user_dict = UserDictionary::new();
user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);
user_dict.add_entry("머신러닝", "NNG", Some(-1000), None);
dict.set_user_dictionary(user_dict);
let entries = dict.lookup_combined("딥러닝");
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].surface, "딥러닝");
}
// A surface present in both dictionaries yields one entry from each.
#[test]
fn test_lookup_combined_system_and_user() {
let mut dict = create_test_dictionary();
let mut user_dict = UserDictionary::new();
user_dict.add_entry("가", "JKS", Some(-500), None);
dict.set_user_dictionary(user_dict);
let entries = dict.lookup_combined("가");
assert_eq!(entries.len(), 2);
}
// `get_entry` succeeds in range and errors out of range.
#[test]
fn test_get_entry() {
let dict = create_test_dictionary();
let entry = dict.get_entry(0);
assert!(entry.is_ok());
assert_eq!(entry.unwrap().surface, "가");
let entry = dict.get_entry(100);
assert!(entry.is_err());
}
// `dicdir` returns the path given at construction.
#[test]
fn test_dicdir() {
let dict = create_test_dictionary();
assert_eq!(dict.dicdir(), Path::new("./test_dic"));
}
// The trie accessor exposes the trie that was installed.
#[test]
fn test_trie_reference() {
let dict = create_test_dictionary();
let trie = dict.trie();
assert!(trie.exact_match("가").is_some());
}
// The matrix accessor exposes the 10x10 fixture matrix.
#[test]
fn test_matrix_reference() {
let dict = create_test_dictionary();
let matrix = dict.matrix();
assert_eq!(matrix.left_size(), 10);
assert_eq!(matrix.right_size(), 10);
}
// `entry_count` reports the five fixture entries.
#[test]
fn test_entry_count() {
let dict = create_test_dictionary();
assert_eq!(dict.entry_count(), 5);
}
// Environment-dependent: either a real directory is found or the
// documented error message is produced.
#[test]
fn test_dictionary_loader_find_dicdir() {
let result = DictionaryLoader::find_dicdir();
match result {
Ok(path) => {
assert!(path.is_dir());
}
Err(e) => {
assert!(e.to_string().contains("Dictionary directory not found"));
}
}
}
// `From<Entry>` moves every field across.
#[test]
fn test_dict_entry_from_entry() {
let entry = Entry {
surface: "테스트".to_string(),
left_id: 10,
right_id: 20,
cost: 300,
feature: "NNG,*,T,테스트,*,*,*,*".to_string(),
};
let dict_entry: DictEntry = entry.into();
assert_eq!(dict_entry.surface, "테스트");
assert_eq!(dict_entry.left_id, 10);
assert_eq!(dict_entry.right_id, 20);
assert_eq!(dict_entry.cost, 300);
}
// Entries written by `save_entries_bin` are read back by `load_entries_bin`.
#[test]
fn test_entries_bin_roundtrip() {
let entries = vec![
DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*"),
DictEntry::new("하세요", 2, 2, 50, "VV,*,F,하세요,*,*,*,*"),
DictEntry::new("감사", 3, 3, 80, "NNG,*,F,감사,*,*,*,*"),
];
let temp = tempfile::NamedTempFile::new().expect("create temp file");
let path = temp.path();
SystemDictionary::save_entries_bin(&entries, path).expect("save should work");
let loaded = SystemDictionary::load_entries_bin(path).expect("load should work");
assert_eq!(loaded.len(), 3);
assert_eq!(loaded[0].surface, "안녕");
assert_eq!(loaded[0].left_id, 1);
assert_eq!(loaded[0].cost, 100);
assert_eq!(loaded[0].feature, "NNG,*,T,안녕,*,*,*,*");
assert_eq!(loaded[1].surface, "하세요");
assert_eq!(loaded[2].surface, "감사");
}
// Entries written by `save_entries_csv` are read back by `load_entries_csv`.
#[test]
fn test_entries_csv_roundtrip() {
let entries = vec![
DictEntry::new("형태소", 10, 20, 150, "NNG,*,F,형태소,*,*,*,*"),
DictEntry::new("분석", 11, 21, 200, "NNG,*,T,분석,*,*,*,*"),
];
let temp = tempfile::NamedTempFile::new().expect("create temp file");
let path = temp.path();
SystemDictionary::save_entries_csv(&entries, path).expect("save should work");
let loaded = SystemDictionary::load_entries_csv(path).expect("load should work");
assert_eq!(loaded.len(), 2);
assert_eq!(loaded[0].surface, "형태소");
assert_eq!(loaded[0].left_id, 10);
assert_eq!(loaded[0].right_id, 20);
assert_eq!(loaded[0].cost, 150);
assert_eq!(loaded[1].surface, "분석");
}
// Two entries share the surface "가"; the trie stores the index of the
// first, and both must be returned in stored order.
#[test]
fn test_get_entries_at_multi() {
let trie_input = vec![("가", 0u32), ("나", 2u32)];
let trie_bytes = TrieBuilder::build(&trie_input).expect("build trie");
let trie = Trie::from_vec(trie_bytes);
let matrix = ConnectionMatrix::Dense(DenseMatrix::new(5, 5, 100));
let dict_entries = vec![
DictEntry::new("가", 1, 1, 100, "VV,*,F,가,*,*,*,*"),
DictEntry::new("가", 2, 2, 50, "JKS,*,F,가,*,*,*,*"),
DictEntry::new("나", 3, 3, 200, "NP,*,F,나,*,*,*,*"),
];
let dict = SystemDictionary {
dicdir: PathBuf::from("./test"),
trie,
matrix,
entry_store: Arc::new(EagerStore::new(dict_entries)),
user_dict: None,
#[cfg(feature = "hot-reload-v2")]
hot_reload: None,
};
let results = dict.get_entries_at(0, "가").expect("should get entries");
assert_eq!(results.len(), 2);
assert_eq!(results[0].feature, "VV,*,F,가,*,*,*,*");
assert_eq!(results[1].feature, "JKS,*,F,가,*,*,*,*");
use crate::Dictionary;
let entries = dict.lookup("가");
assert_eq!(entries.len(), 2);
}
}