1use std::io::{BufRead, BufReader, Read as _, Write as _};
31use std::path::{Path, PathBuf};
32use std::sync::Arc;
33
34use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
35
36use crate::entry_store::{EagerStore, EntryStore, LazyStore};
37use crate::error::{DictError, Result};
38use crate::lazy_entries::LazyEntries;
39use crate::matrix::{ConnectionMatrix, Matrix};
40use crate::trie::Trie;
41use crate::user_dict::UserDictionary;
42use crate::{Dictionary, Entry};
43
44#[cfg(feature = "hot-reload-v2")]
45use crate::hot_reload_v2::HotReloadDictV2;
46
/// Well-known installation directories probed, in order, by
/// `DictionaryLoader::find_dicdir` when `MECAB_DICDIR` does not name a directory.
const DEFAULT_DICDIR_PATHS: &[&str] = &[
    "/usr/local/lib/mecab/dic/mecab-ko-dic",
    "/usr/lib/mecab/dic/mecab-ko-dic",
    "/opt/mecab/dic/mecab-ko-dic",
    "./dic/mecab-ko-dic",
];

/// File name of the trie inside a dictionary directory; the loaders also
/// accept the same name with a `.zst` suffix.
const TRIE_FILE: &str = "sys.dic";
/// File name of the binary connection matrix; the loaders fall back to the
/// `.zst` variant and then to `matrix.def`.
const MATRIX_FILE: &str = "matrix.bin";
/// File name of the binary entry table (tried before the CSV form).
const ENTRIES_BIN_FILE: &str = "entries.bin";
/// File name of the CSV entry table, read only when `entries.bin` is absent.
const ENTRIES_CSV_FILE: &str = "entries.csv";

/// Magic bytes that open a version-1 `entries.bin` file.
const ENTRIES_MAGIC: &[u8; 4] = b"MKED";
/// Format version written right after the magic and required when reading it back.
const ENTRIES_VERSION: u32 = 1;
65
/// System dictionary assembled from a dictionary directory: a trie, a
/// connection matrix and an entry store, with an optional user dictionary
/// (and, behind the `hot-reload-v2` feature, an optional hot-reload handle).
pub struct SystemDictionary {
    /// Directory this dictionary was loaded from.
    dicdir: PathBuf,
    /// Trie over surface forms; its match values are handed to the entry
    /// store as entry indices.
    trie: Trie<'static>,
    /// Connection matrix queried by `get_connection_cost`.
    matrix: ConnectionMatrix,
    /// Backing store for entries (either an `EagerStore` or a `LazyStore`).
    entry_store: Arc<dyn EntryStore>,
    /// Optional user dictionary consulted by `lookup_combined`.
    user_dict: Option<Arc<UserDictionary>>,
    /// Optional hot-reload handle consulted by `common_prefix_search` and
    /// `lookup_combined`.
    #[cfg(feature = "hot-reload-v2")]
    hot_reload: Option<Arc<HotReloadDictV2>>,
}
85
/// One dictionary record as stored in `entries.bin` / `entries.csv`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DictEntry {
    /// Surface form (the text that is matched).
    pub surface: String,
    /// Left ID of the entry.
    pub left_id: u16,
    /// Right ID of the entry.
    pub right_id: u16,
    /// Cost of the entry.
    pub cost: i16,
    /// Feature string; the CSV loader keeps everything after the fourth comma
    /// verbatim, so it may itself contain commas.
    pub feature: String,
}
102
103impl DictEntry {
104 pub fn new(
106 surface: impl Into<String>,
107 left_id: u16,
108 right_id: u16,
109 cost: i16,
110 feature: impl Into<String>,
111 ) -> Self {
112 Self {
113 surface: surface.into(),
114 left_id,
115 right_id,
116 cost,
117 feature: feature.into(),
118 }
119 }
120
121 #[must_use]
123 pub fn to_entry(&self) -> Entry {
124 Entry {
125 surface: self.surface.clone(),
126 left_id: self.left_id,
127 right_id: self.right_id,
128 cost: self.cost,
129 feature: self.feature.clone(),
130 }
131 }
132}
133
134impl From<Entry> for DictEntry {
135 fn from(entry: Entry) -> Self {
136 Self {
137 surface: entry.surface,
138 left_id: entry.left_id,
139 right_id: entry.right_id,
140 cost: entry.cost,
141 feature: entry.feature,
142 }
143 }
144}
145
/// Switches that control how `SystemDictionary::load_with_options` reads a
/// dictionary directory.
#[derive(Debug, Clone, Copy)]
pub struct LoadOptions {
    /// When `true`, an existing `matrix.bin` is opened with
    /// `ConnectionMatrix::from_mmap_file` instead of `from_bin_file`.
    pub use_mmap_matrix: bool,
    /// When `true`, `entries.bin` is opened through `LazyEntries` if possible;
    /// otherwise all entries are loaded eagerly.
    pub use_lazy_entries: bool,
    /// Cache size passed to `LazyEntries::set_cache_size` when lazy loading is
    /// used; `None` leaves the cache size untouched.
    pub lazy_cache_size: Option<usize>,
}
159
160impl Default for LoadOptions {
161 fn default() -> Self {
167 Self {
168 use_mmap_matrix: false,
169 use_lazy_entries: true,
170 lazy_cache_size: Some(10000),
171 }
172 }
173}
174
175impl LoadOptions {
176 #[must_use]
178 pub const fn memory_optimized() -> Self {
179 Self {
180 use_mmap_matrix: true,
181 use_lazy_entries: true,
182 lazy_cache_size: Some(10000),
183 }
184 }
185
186 #[must_use]
191 pub const fn speed_optimized() -> Self {
192 Self {
193 use_mmap_matrix: false,
194 use_lazy_entries: false,
195 lazy_cache_size: None,
196 }
197 }
198
199 #[must_use]
203 pub const fn eager() -> Self {
204 Self::speed_optimized()
205 }
206}
207
208impl SystemDictionary {
209 pub fn load_default() -> Result<Self> {
220 let dicdir = DictionaryLoader::find_dicdir()?;
221 Self::load(dicdir)
222 }
223
224 pub fn load_memory_optimized() -> Result<Self> {
233 let dicdir = DictionaryLoader::find_dicdir()?;
234 Self::load_with_options(dicdir, LoadOptions::memory_optimized())
235 }
236
237 pub fn load_with_options<P: AsRef<Path>>(dicdir: P, options: LoadOptions) -> Result<Self> {
244 let dicdir = dicdir.as_ref().to_path_buf();
245
246 let trie_path = dicdir.join(TRIE_FILE);
248 let trie = if trie_path.exists() {
249 Trie::from_file(&trie_path)?
250 } else {
251 let compressed_path = dicdir.join(format!("{TRIE_FILE}.zst"));
253 if compressed_path.exists() {
254 Trie::from_compressed_file(&compressed_path)?
255 } else {
256 return Err(DictError::Format(format!(
257 "Trie file not found: {}",
258 trie_path.display()
259 )));
260 }
261 };
262
263 let matrix_path = dicdir.join(MATRIX_FILE);
265 let matrix = if matrix_path.exists() {
266 if options.use_mmap_matrix {
267 ConnectionMatrix::from_mmap_file(&matrix_path)?
268 } else {
269 ConnectionMatrix::from_bin_file(&matrix_path)?
270 }
271 } else {
272 let compressed_path = dicdir.join(format!("{MATRIX_FILE}.zst"));
274 if compressed_path.exists() {
275 ConnectionMatrix::from_compressed_file(&compressed_path)?
276 } else {
277 let def_path = dicdir.join("matrix.def");
279 if def_path.exists() {
280 ConnectionMatrix::from_def_file(&def_path)?
281 } else {
282 return Err(DictError::Format(format!(
283 "Matrix file not found: {}",
284 matrix_path.display()
285 )));
286 }
287 }
288 };
289
290 let entry_store: Arc<dyn EntryStore> = if options.use_lazy_entries {
292 let entries_path = dicdir.join(ENTRIES_BIN_FILE);
293 if entries_path.exists() {
294 if let Ok(lazy) = LazyEntries::from_file(&entries_path) {
296 if let Some(cache_size) = options.lazy_cache_size {
297 lazy.set_cache_size(cache_size);
298 }
299 Arc::new(LazyStore::new(lazy))
300 } else {
301 let entries = Self::load_entries(&dicdir)?;
303 Arc::new(EagerStore::new(entries))
304 }
305 } else {
306 let entries = Self::load_entries(&dicdir)?;
308 Arc::new(EagerStore::new(entries))
309 }
310 } else {
311 let entries = Self::load_entries(&dicdir)?;
312 Arc::new(EagerStore::new(entries))
313 };
314
315 Ok(Self {
316 dicdir,
317 trie,
318 matrix,
319 entry_store,
320 user_dict: None,
321 #[cfg(feature = "hot-reload-v2")]
322 hot_reload: None,
323 })
324 }
325
326 pub fn load<P: AsRef<Path>>(dicdir: P) -> Result<Self> {
337 let dicdir = dicdir.as_ref().to_path_buf();
338
339 let trie_path = dicdir.join(TRIE_FILE);
341 let trie = if trie_path.exists() {
342 Trie::from_file(&trie_path)?
343 } else {
344 let compressed_path = dicdir.join(format!("{TRIE_FILE}.zst"));
346 if compressed_path.exists() {
347 Trie::from_compressed_file(&compressed_path)?
348 } else {
349 return Err(DictError::Format(format!(
350 "Trie file not found: {}",
351 trie_path.display()
352 )));
353 }
354 };
355
356 let matrix_path = dicdir.join(MATRIX_FILE);
358 let matrix = if matrix_path.exists() {
359 ConnectionMatrix::from_bin_file(&matrix_path)?
360 } else {
361 let compressed_path = dicdir.join(format!("{MATRIX_FILE}.zst"));
363 if compressed_path.exists() {
364 ConnectionMatrix::from_compressed_file(&compressed_path)?
365 } else {
366 let def_path = dicdir.join("matrix.def");
368 if def_path.exists() {
369 ConnectionMatrix::from_def_file(&def_path)?
370 } else {
371 return Err(DictError::Format(format!(
372 "Matrix file not found: {}",
373 matrix_path.display()
374 )));
375 }
376 }
377 };
378
379 let entries = Self::load_entries(&dicdir)?;
381 let entry_store: Arc<dyn EntryStore> = Arc::new(EagerStore::new(entries));
382
383 Ok(Self {
384 dicdir,
385 trie,
386 matrix,
387 entry_store,
388 user_dict: None,
389 #[cfg(feature = "hot-reload-v2")]
390 hot_reload: None,
391 })
392 }
393
394 fn load_entries(dicdir: &Path) -> Result<Vec<DictEntry>> {
400 let bin_path = dicdir.join(ENTRIES_BIN_FILE);
402 if bin_path.exists() {
403 return Self::load_entries_bin(&bin_path);
404 }
405
406 let csv_path = dicdir.join(ENTRIES_CSV_FILE);
408 if csv_path.exists() {
409 return Self::load_entries_csv(&csv_path);
410 }
411
412 Ok(Vec::new())
414 }
415
416 fn load_entries_csv(path: &Path) -> Result<Vec<DictEntry>> {
420 let file = std::fs::File::open(path).map_err(DictError::Io)?;
421 let reader = BufReader::new(file);
422 let mut entries = Vec::new();
423
424 for (line_num, line_result) in reader.lines().enumerate() {
425 let line = line_result.map_err(DictError::Io)?;
426 let line = line.trim();
427 if line.is_empty() || line.starts_with('#') {
428 continue;
429 }
430
431 let mut fields = line.splitn(5, ',');
433 let surface = fields
434 .next()
435 .ok_or_else(|| {
436 DictError::Format(format!("line {}: missing surface", line_num + 1))
437 })?
438 .to_string();
439 let left_id: u16 = fields
440 .next()
441 .ok_or_else(|| {
442 DictError::Format(format!("line {}: missing left_id", line_num + 1))
443 })?
444 .parse()
445 .map_err(|_| {
446 DictError::Format(format!("line {}: invalid left_id", line_num + 1))
447 })?;
448 let right_id: u16 = fields
449 .next()
450 .ok_or_else(|| {
451 DictError::Format(format!("line {}: missing right_id", line_num + 1))
452 })?
453 .parse()
454 .map_err(|_| {
455 DictError::Format(format!("line {}: invalid right_id", line_num + 1))
456 })?;
457 let cost: i16 = fields
458 .next()
459 .ok_or_else(|| DictError::Format(format!("line {}: missing cost", line_num + 1)))?
460 .parse()
461 .map_err(|_| DictError::Format(format!("line {}: invalid cost", line_num + 1)))?;
462 let feature = fields.next().unwrap_or("").to_string();
463
464 entries.push(DictEntry {
465 surface,
466 left_id,
467 right_id,
468 cost,
469 feature,
470 });
471 }
472
473 Ok(entries)
474 }
475
476 fn load_entries_bin(path: &Path) -> Result<Vec<DictEntry>> {
481 let data = std::fs::read(path).map_err(DictError::Io)?;
482 let mut cursor = std::io::Cursor::new(&data);
483
484 let mut magic = [0u8; 4];
486 cursor
487 .read_exact(&mut magic)
488 .map_err(|e| DictError::Format(format!("entries.bin magic: {e}")))?;
489
490 if &magic == b"MKE2" {
492 return Self::load_entries_bin_v2(path);
493 }
494
495 if &magic != ENTRIES_MAGIC {
497 return Err(DictError::Format(
498 "entries.bin: invalid magic number (expected MKED or MKE2)".into(),
499 ));
500 }
501
502 let version = cursor
504 .read_u32::<LittleEndian>()
505 .map_err(|e| DictError::Format(format!("entries.bin version: {e}")))?;
506 if version != ENTRIES_VERSION {
507 return Err(DictError::Format(format!(
508 "entries.bin: unsupported version {version}"
509 )));
510 }
511
512 let count = cursor
514 .read_u32::<LittleEndian>()
515 .map_err(|e| DictError::Format(format!("entries.bin count: {e}")))?;
516
517 let mut entries = Vec::with_capacity(count as usize);
518 for i in 0..count {
519 let left_id = cursor
520 .read_u16::<LittleEndian>()
521 .map_err(|e| DictError::Format(format!("entries.bin entry {i} left_id: {e}")))?;
522 let right_id = cursor
523 .read_u16::<LittleEndian>()
524 .map_err(|e| DictError::Format(format!("entries.bin entry {i} right_id: {e}")))?;
525 let cost = cursor
526 .read_i16::<LittleEndian>()
527 .map_err(|e| DictError::Format(format!("entries.bin entry {i} cost: {e}")))?;
528 let surface_len = cursor
529 .read_u16::<LittleEndian>()
530 .map_err(|e| DictError::Format(format!("entries.bin entry {i} surface_len: {e}")))?
531 as usize;
532 let feature_len = cursor
533 .read_u16::<LittleEndian>()
534 .map_err(|e| DictError::Format(format!("entries.bin entry {i} feature_len: {e}")))?
535 as usize;
536
537 let mut surface_bytes = vec![0u8; surface_len];
538 cursor
539 .read_exact(&mut surface_bytes)
540 .map_err(|e| DictError::Format(format!("entries.bin entry {i} surface: {e}")))?;
541 let surface = String::from_utf8(surface_bytes).map_err(|e| {
542 DictError::Format(format!("entries.bin entry {i} surface utf8: {e}"))
543 })?;
544
545 let mut feature_bytes = vec![0u8; feature_len];
546 cursor
547 .read_exact(&mut feature_bytes)
548 .map_err(|e| DictError::Format(format!("entries.bin entry {i} feature: {e}")))?;
549 let feature = String::from_utf8(feature_bytes).map_err(|e| {
550 DictError::Format(format!("entries.bin entry {i} feature utf8: {e}"))
551 })?;
552
553 entries.push(DictEntry {
554 surface,
555 left_id,
556 right_id,
557 cost,
558 feature,
559 });
560 }
561
562 Ok(entries)
563 }
564
565 fn load_entries_bin_v2(path: &Path) -> Result<Vec<DictEntry>> {
569 let lazy = LazyEntries::from_file(path)?;
570 let count = lazy.len();
571 let mut entries = Vec::with_capacity(count);
572
573 for i in 0..count {
574 let entry = lazy.get(i as u32)?;
575 entries.push((*entry).clone());
576 }
577
578 Ok(entries)
579 }
580
581 pub fn save_entries_bin(entries: &[DictEntry], path: &Path) -> Result<()> {
587 let mut file = std::fs::File::create(path).map_err(DictError::Io)?;
588
589 file.write_all(ENTRIES_MAGIC).map_err(DictError::Io)?;
590 file.write_u32::<LittleEndian>(ENTRIES_VERSION)
591 .map_err(DictError::Io)?;
592
593 let count = u32::try_from(entries.len())
594 .map_err(|_| DictError::Format("too many entries".into()))?;
595 file.write_u32::<LittleEndian>(count)
596 .map_err(DictError::Io)?;
597
598 for entry in entries {
599 file.write_u16::<LittleEndian>(entry.left_id)
600 .map_err(DictError::Io)?;
601 file.write_u16::<LittleEndian>(entry.right_id)
602 .map_err(DictError::Io)?;
603 file.write_i16::<LittleEndian>(entry.cost)
604 .map_err(DictError::Io)?;
605
606 let surface_bytes = entry.surface.as_bytes();
607 let surface_len = u16::try_from(surface_bytes.len())
608 .map_err(|_| DictError::Format("surface too long".into()))?;
609 file.write_u16::<LittleEndian>(surface_len)
610 .map_err(DictError::Io)?;
611
612 let feature_bytes = entry.feature.as_bytes();
613 let feature_len = u16::try_from(feature_bytes.len())
614 .map_err(|_| DictError::Format("feature too long".into()))?;
615 file.write_u16::<LittleEndian>(feature_len)
616 .map_err(DictError::Io)?;
617
618 file.write_all(surface_bytes).map_err(DictError::Io)?;
619 file.write_all(feature_bytes).map_err(DictError::Io)?;
620 }
621
622 Ok(())
623 }
624
625 pub fn save_entries_csv(entries: &[DictEntry], path: &Path) -> Result<()> {
631 let mut file = std::fs::File::create(path).map_err(DictError::Io)?;
632
633 for entry in entries {
634 writeln!(
635 file,
636 "{},{},{},{},{}",
637 entry.surface, entry.left_id, entry.right_id, entry.cost, entry.feature
638 )
639 .map_err(DictError::Io)?;
640 }
641
642 Ok(())
643 }
644
/// Fetches the entries for `surface` starting at `first_index` by delegating
/// to the entry store's `get_entries_at`.
fn get_entries_at(&self, first_index: u32, surface: &str) -> Result<Vec<Arc<DictEntry>>> {
    self.entry_store.get_entries_at(first_index, surface)
}
656
657 #[must_use]
663 pub fn with_user_dictionary(mut self, user_dict: UserDictionary) -> Self {
664 self.user_dict = Some(Arc::new(user_dict));
665 self
666 }
667
668 pub fn set_user_dictionary(&mut self, user_dict: UserDictionary) {
670 self.user_dict = Some(Arc::new(user_dict));
671 }
672
673 #[must_use]
675 pub fn dicdir(&self) -> &Path {
676 &self.dicdir
677 }
678
/// Returns a reference to the underlying trie.
#[must_use]
pub const fn trie(&self) -> &Trie<'static> {
    &self.trie
}
684
/// Returns a reference to the connection matrix.
#[must_use]
pub const fn matrix(&self) -> &ConnectionMatrix {
    &self.matrix
}
690
/// Returns the number of entries reported by the entry store.
#[must_use]
pub fn entry_count(&self) -> usize {
    self.entry_store.len()
}
696
/// Returns the shared handle to the entry store.
#[must_use]
pub fn entry_store(&self) -> &Arc<dyn EntryStore> {
    &self.entry_store
}
702
/// Returns the attached user dictionary, or `None` when none has been set.
#[must_use]
pub fn user_dictionary(&self) -> Option<&UserDictionary> {
    self.user_dict.as_deref()
}
708
/// Builder-style variant of [`Self::set_hot_reload`]: attaches the hot-reload
/// handle `hr` (replacing any previous one) and returns the dictionary.
#[cfg(feature = "hot-reload-v2")]
#[must_use]
pub fn with_hot_reload(mut self, hr: Arc<HotReloadDictV2>) -> Self {
    self.set_hot_reload(hr);
    self
}
720
/// Attaches the hot-reload handle `hr`, replacing any handle set earlier.
#[cfg(feature = "hot-reload-v2")]
pub fn set_hot_reload(&mut self, hr: Arc<HotReloadDictV2>) {
    self.hot_reload = Some(hr);
}
730
/// Returns the attached hot-reload handle, or `None` when none has been set.
#[cfg(feature = "hot-reload-v2")]
#[must_use]
pub const fn hot_reload(&self) -> Option<&Arc<HotReloadDictV2>> {
    self.hot_reload.as_ref()
}
737
/// Fetches the entry stored at `index` from the entry store.
///
/// # Errors
///
/// Propagates the entry store's error (the tests expect one for an
/// out-of-range index).
pub fn get_entry(&self, index: u32) -> Result<Arc<DictEntry>> {
    self.entry_store.get(index)
}
751
752 pub fn common_prefix_search(&self, text: &str) -> Result<Vec<(Arc<DictEntry>, usize)>> {
769 let mut results = Vec::new();
770 for (index, byte_len) in self.trie.common_prefix_search(text) {
771 let surface = &text[..byte_len];
772 let entries = self.get_entries_at(index, surface)?;
773 for entry in entries {
774 results.push((entry, byte_len));
775 }
776 }
777
778 #[cfg(feature = "hot-reload-v2")]
780 if let Some(hr) = &self.hot_reload {
781 let snapshot = hr.load();
782 let domain_entries = snapshot.domain_stack.common_prefix_search(text);
783 for user_entry in domain_entries {
784 let byte_len = user_entry.surface.len();
785 let dict_entry = Arc::new(DictEntry::new(
786 &user_entry.surface,
787 user_entry.left_id,
788 user_entry.right_id,
789 user_entry.cost,
790 &user_entry.feature,
791 ));
792 results.push((dict_entry, byte_len));
793 }
794 }
795
796 Ok(results)
797 }
798
799 pub fn common_prefix_search_at(
810 &self,
811 text: &str,
812 start_byte: usize,
813 ) -> Result<Vec<(Arc<DictEntry>, usize)>> {
814 let mut results = Vec::new();
815 for (index, end_byte) in self.trie.common_prefix_search_at(text, start_byte) {
816 let byte_len = end_byte - start_byte;
817 let surface = &text[start_byte..end_byte];
818 let entries = self.get_entries_at(index, surface)?;
819 for entry in entries {
820 results.push((entry, byte_len));
821 }
822 }
823 Ok(results)
824 }
825
826 #[must_use]
832 pub fn lookup_combined(&self, surface: &str) -> Vec<Entry> {
833 let mut results = self.lookup(surface);
834
835 if let Some(user_dict) = &self.user_dict {
837 let user_entries = user_dict.lookup(surface);
838 results.extend(user_entries.iter().map(|e| e.to_entry()));
839 }
840
841 #[cfg(feature = "hot-reload-v2")]
843 if let Some(hr) = &self.hot_reload {
844 let snapshot = hr.load();
845 let domain_entries = snapshot.domain_stack.lookup(surface);
846 results.extend(domain_entries.iter().map(|ue| Entry {
847 surface: ue.surface.clone(),
848 left_id: ue.left_id,
849 right_id: ue.right_id,
850 cost: ue.cost,
851 feature: ue.feature.clone(),
852 }));
853 }
854
855 results
856 }
857
858 #[doc(hidden)]
860 #[must_use]
861 pub fn new_test(
862 dicdir: PathBuf,
863 trie: Trie<'static>,
864 matrix: ConnectionMatrix,
865 entries: Vec<DictEntry>,
866 ) -> Self {
867 Self {
868 dicdir,
869 trie,
870 matrix,
871 entry_store: Arc::new(EagerStore::new(entries)),
872 user_dict: None,
873 #[cfg(feature = "hot-reload-v2")]
874 hot_reload: None,
875 }
876 }
877}
878
879impl Dictionary for SystemDictionary {
880 fn lookup(&self, surface: &str) -> Vec<Entry> {
881 if let Some(index) = self.trie.exact_match(surface) {
883 if let Ok(entries) = self.get_entries_at(index, surface) {
884 if !entries.is_empty() {
885 return entries.iter().map(|e| e.to_entry()).collect();
886 }
887 }
888 }
889
890 Vec::new()
891 }
892
893 fn get_connection_cost(&self, left_id: u16, right_id: u16) -> i16 {
894 i16::try_from(self.matrix.get(right_id, left_id)).unwrap_or(i16::MAX)
895 }
896}
897
/// Namespace for locating, validating and loading dictionary directories.
pub struct DictionaryLoader;
902
903impl DictionaryLoader {
904 pub fn find_dicdir() -> Result<PathBuf> {
914 if let Ok(dicdir) = std::env::var("MECAB_DICDIR") {
916 let path = PathBuf::from(dicdir);
917 if path.is_dir() {
918 return Ok(path);
919 }
920 }
921
922 for &path_str in DEFAULT_DICDIR_PATHS {
924 let path = PathBuf::from(path_str);
925 if path.is_dir() {
926 return Ok(path);
927 }
928 }
929
930 {
939 let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
940 let test_dict = manifest_dir.join("../../test-fixtures/mini-dict");
941 if test_dict.is_dir() {
942 eprintln!(
943 "[mecab-ko WARNING] No system dictionary found; falling back to sparse \
944 test dictionary at '{}'. Most Korean words will NOT be tokenized. \
945 Set MECAB_DICDIR to a full mecab-ko-dic installation path.",
946 test_dict.display()
947 );
948 return Ok(test_dict);
949 }
950 }
951
952 Err(DictError::Format(
953 "Dictionary directory not found. Set MECAB_DICDIR environment variable or \
954 install mecab-ko-dic to one of: /usr/local/lib/mecab/dic/mecab-ko-dic, \
955 /usr/lib/mecab/dic/mecab-ko-dic, /opt/mecab/dic/mecab-ko-dic, \
956 ./dic/mecab-ko-dic"
957 .to_string(),
958 ))
959 }
960
961 pub fn load_system<P: AsRef<Path>>(dicdir: P) -> Result<SystemDictionary> {
967 SystemDictionary::load(dicdir)
968 }
969
970 pub fn load_default() -> Result<SystemDictionary> {
976 SystemDictionary::load_default()
977 }
978
979 pub fn validate_dicdir<P: AsRef<Path>>(dicdir: P) -> Result<()> {
989 let dicdir = dicdir.as_ref();
990
991 if !dicdir.is_dir() {
992 return Err(DictError::Format(format!(
993 "Dictionary directory does not exist: {}",
994 dicdir.display()
995 )));
996 }
997
998 let has_trie =
1000 dicdir.join(TRIE_FILE).exists() || dicdir.join(format!("{TRIE_FILE}.zst")).exists();
1001
1002 let has_matrix = dicdir.join(MATRIX_FILE).exists() || dicdir.join("matrix.def").exists();
1003
1004 if !has_trie {
1005 return Err(DictError::Format(format!(
1006 "Trie file not found in {}",
1007 dicdir.display()
1008 )));
1009 }
1010
1011 if !has_matrix {
1012 return Err(DictError::Format(format!(
1013 "Matrix file not found in {}",
1014 dicdir.display()
1015 )));
1016 }
1017
1018 Ok(())
1019 }
1020}
1021
#[cfg(test)]
#[allow(
    clippy::expect_used,
    clippy::unwrap_used,
    clippy::items_after_statements
)]
mod tests {
    use super::*;
    use crate::matrix::DenseMatrix;
    use crate::trie::TrieBuilder;

    /// Builds a five-word in-memory dictionary (one entry per trie key) with
    /// a 10x10 dense matrix whose default cost is 100.
    fn create_test_dictionary() -> SystemDictionary {
        let keys = vec![
            ("가", 0u32),
            ("가다", 1),
            ("가방", 2),
            ("나", 3),
            ("나다", 4),
        ];
        let trie = Trie::from_vec(TrieBuilder::build(&keys).expect("should build trie"));

        let matrix = ConnectionMatrix::Dense(DenseMatrix::new(10, 10, 100));

        let records = vec![
            DictEntry::new("가", 1, 1, 100, "NNG,*,T,가,*,*,*,*"),
            DictEntry::new("가다", 2, 2, 200, "VV,*,F,가다,*,*,*,*"),
            DictEntry::new("가방", 3, 3, 300, "NNG,*,T,가방,*,*,*,*"),
            DictEntry::new("나", 4, 4, 400, "NP,*,F,나,*,*,*,*"),
            DictEntry::new("나다", 5, 5, 500, "VV,*,F,나다,*,*,*,*"),
        ];

        SystemDictionary {
            dicdir: PathBuf::from("./test_dic"),
            trie,
            matrix,
            entry_store: Arc::new(EagerStore::new(records)),
            user_dict: None,
            #[cfg(feature = "hot-reload-v2")]
            hot_reload: None,
        }
    }

    /// Collects the surfaces of a prefix-search result, in result order.
    fn surfaces_of(results: &[(Arc<DictEntry>, usize)]) -> Vec<&str> {
        results
            .iter()
            .map(|(entry, _)| entry.surface.as_str())
            .collect()
    }

    #[test]
    fn test_dict_entry_creation() {
        let created = DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*");
        assert_eq!(created.surface, "안녕");
        assert_eq!(created.left_id, 1);
        assert_eq!(created.right_id, 1);
        assert_eq!(created.cost, 100);
    }

    #[test]
    fn test_dict_entry_to_entry() {
        let converted = DictEntry::new("테스트", 5, 5, 200, "NNG,*,T,테스트,*,*,*,*").to_entry();

        assert_eq!(converted.surface, "테스트");
        assert_eq!(converted.left_id, 5);
        assert_eq!(converted.cost, 200);
    }

    #[test]
    fn test_system_dictionary_lookup() {
        let dict = create_test_dictionary();

        // Words present in the trie resolve to exactly one entry each.
        for word in ["가", "가다"] {
            let found = dict.lookup(word);
            assert_eq!(found.len(), 1);
            assert_eq!(found[0].surface, word);
        }

        // A word absent from the trie yields nothing.
        assert!(dict.lookup("없음").is_empty());
    }

    #[test]
    fn test_system_dictionary_get_connection_cost() {
        let dict = create_test_dictionary();
        // The dense matrix was created with a default cost of 100.
        assert_eq!(dict.get_connection_cost(1, 2), 100);
    }

    #[test]
    fn test_common_prefix_search() {
        let dict = create_test_dictionary();

        let results = dict
            .common_prefix_search("가방에")
            .expect("search should work");
        assert_eq!(results.len(), 2);

        let surfaces = surfaces_of(&results);
        assert!(surfaces.contains(&"가"));
        assert!(surfaces.contains(&"가방"));
    }

    #[test]
    fn test_common_prefix_search_at() {
        let dict = create_test_dictionary();

        let text = "나가다";
        // Start right after the first character.
        let start = "나".len();
        let results = dict
            .common_prefix_search_at(text, start)
            .expect("search should work");
        assert_eq!(results.len(), 2);

        let surfaces = surfaces_of(&results);
        assert!(surfaces.contains(&"가"));
        assert!(surfaces.contains(&"가다"));
    }

    #[test]
    fn test_with_user_dictionary() {
        let mut dict = create_test_dictionary();

        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-1000), None);
        user_dict.add_entry("머신러닝", "NNG", Some(-1000), None);
        dict.set_user_dictionary(user_dict);

        let found = dict.lookup_combined("딥러닝");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].surface, "딥러닝");
    }

    #[test]
    fn test_lookup_combined_system_and_user() {
        let mut dict = create_test_dictionary();

        // The same surface exists in the system dictionary as well.
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("가", "JKS", Some(-500), None);
        dict.set_user_dictionary(user_dict);

        assert_eq!(dict.lookup_combined("가").len(), 2);
    }

    #[test]
    fn test_get_entry() {
        let dict = create_test_dictionary();

        let first = dict.get_entry(0);
        assert!(first.is_ok());
        assert_eq!(first.unwrap().surface, "가");

        let out_of_range = dict.get_entry(100);
        assert!(out_of_range.is_err());
    }

    #[test]
    fn test_dicdir() {
        let dict = create_test_dictionary();
        assert_eq!(dict.dicdir(), Path::new("./test_dic"));
    }

    #[test]
    fn test_trie_reference() {
        let dict = create_test_dictionary();
        assert!(dict.trie().exact_match("가").is_some());
    }

    #[test]
    fn test_matrix_reference() {
        let dict = create_test_dictionary();
        let matrix = dict.matrix();
        assert_eq!(matrix.left_size(), 10);
        assert_eq!(matrix.right_size(), 10);
    }

    #[test]
    fn test_entry_count() {
        let dict = create_test_dictionary();
        assert_eq!(dict.entry_count(), 5);
    }

    #[test]
    fn test_dictionary_loader_find_dicdir() {
        // The outcome depends on the machine: either a real directory is
        // found, or the "not found" error is returned.
        match DictionaryLoader::find_dicdir() {
            Ok(path) => {
                assert!(path.is_dir());
            }
            Err(e) => {
                assert!(e.to_string().contains("Dictionary directory not found"));
            }
        }
    }

    #[test]
    fn test_dict_entry_from_entry() {
        let source = Entry {
            surface: "테스트".to_string(),
            left_id: 10,
            right_id: 20,
            cost: 300,
            feature: "NNG,*,T,테스트,*,*,*,*".to_string(),
        };

        let converted = DictEntry::from(source);
        assert_eq!(converted.surface, "테스트");
        assert_eq!(converted.left_id, 10);
        assert_eq!(converted.right_id, 20);
        assert_eq!(converted.cost, 300);
    }

    #[test]
    fn test_entries_bin_roundtrip() {
        let originals = vec![
            DictEntry::new("안녕", 1, 1, 100, "NNG,*,T,안녕,*,*,*,*"),
            DictEntry::new("하세요", 2, 2, 50, "VV,*,F,하세요,*,*,*,*"),
            DictEntry::new("감사", 3, 3, 80, "NNG,*,F,감사,*,*,*,*"),
        ];

        let temp = tempfile::NamedTempFile::new().expect("create temp file");
        let path = temp.path();

        SystemDictionary::save_entries_bin(&originals, path).expect("save should work");
        let reloaded = SystemDictionary::load_entries_bin(path).expect("load should work");

        assert_eq!(reloaded.len(), 3);
        assert_eq!(reloaded[0].surface, "안녕");
        assert_eq!(reloaded[0].left_id, 1);
        assert_eq!(reloaded[0].cost, 100);
        assert_eq!(reloaded[0].feature, "NNG,*,T,안녕,*,*,*,*");
        assert_eq!(reloaded[1].surface, "하세요");
        assert_eq!(reloaded[2].surface, "감사");
    }

    #[test]
    fn test_entries_csv_roundtrip() {
        let originals = vec![
            DictEntry::new("형태소", 10, 20, 150, "NNG,*,F,형태소,*,*,*,*"),
            DictEntry::new("분석", 11, 21, 200, "NNG,*,T,분석,*,*,*,*"),
        ];

        let temp = tempfile::NamedTempFile::new().expect("create temp file");
        let path = temp.path();

        SystemDictionary::save_entries_csv(&originals, path).expect("save should work");
        let reloaded = SystemDictionary::load_entries_csv(path).expect("load should work");

        assert_eq!(reloaded.len(), 2);
        assert_eq!(reloaded[0].surface, "형태소");
        assert_eq!(reloaded[0].left_id, 10);
        assert_eq!(reloaded[0].right_id, 20);
        assert_eq!(reloaded[0].cost, 150);
        assert_eq!(reloaded[1].surface, "분석");
    }

    #[test]
    fn test_get_entries_at_multi() {
        use crate::Dictionary;

        // "가" maps to index 0 and owns two consecutive entries; "나" starts
        // at index 2.
        let keys = vec![("가", 0u32), ("나", 2u32)];
        let trie = Trie::from_vec(TrieBuilder::build(&keys).expect("build trie"));
        let matrix = ConnectionMatrix::Dense(DenseMatrix::new(5, 5, 100));

        let records = vec![
            DictEntry::new("가", 1, 1, 100, "VV,*,F,가,*,*,*,*"),
            DictEntry::new("가", 2, 2, 50, "JKS,*,F,가,*,*,*,*"),
            DictEntry::new("나", 3, 3, 200, "NP,*,F,나,*,*,*,*"),
        ];

        let dict = SystemDictionary {
            dicdir: PathBuf::from("./test"),
            trie,
            matrix,
            entry_store: Arc::new(EagerStore::new(records)),
            user_dict: None,
            #[cfg(feature = "hot-reload-v2")]
            hot_reload: None,
        };

        let grouped = dict.get_entries_at(0, "가").expect("should get entries");
        assert_eq!(grouped.len(), 2);
        assert_eq!(grouped[0].feature, "VV,*,F,가,*,*,*,*");
        assert_eq!(grouped[1].feature, "JKS,*,F,가,*,*,*,*");

        let looked_up = dict.lookup("가");
        assert_eq!(looked_up.len(), 2);
    }
}
1326}