1use crate::error::{DictError, Result};
6use crate::matrix::{DenseMatrix, Matrix};
7use crate::trie::Trie;
8use crate::{Dictionary, Entry};
9use std::fs::File;
10use std::path::{Path, PathBuf};
11
/// Options controlling how dictionary files are located and loaded.
///
/// All fields are plain flags, so the type is `Copy` and can be compared
/// with `==` (useful when asserting on builder output).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LoaderConfig {
    /// Requests memory-mapped loading.
    ///
    /// NOTE(review): no loader visible in this file reads this flag; it is
    /// only stored by the `DictionaryLoader::use_mmap` setter.
    pub use_mmap: bool,
    /// When `true` and the crate is built with the `zstd` feature, a `.zst`
    /// file is preferred over its uncompressed sibling if both exist.
    pub auto_decompress: bool,
    /// When `true`, `DictionaryLoader::load` returns a `LazyDictionary`
    /// instead of loading the files immediately.
    pub lazy_load: bool,
}

impl Default for LoaderConfig {
    /// Returns the default configuration: `use_mmap` and `auto_decompress`
    /// enabled, `lazy_load` disabled.
    fn default() -> Self {
        Self {
            use_mmap: true,
            auto_decompress: true,
            lazy_load: false,
        }
    }
}
32
33pub struct MmapDictionary {
37 trie: Trie<'static>,
39 matrix: DenseMatrix,
41 dict_dir: PathBuf,
43 entries: Vec<Entry>,
45}
46
47impl MmapDictionary {
48 pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
64 Self::load_with_config(path, LoaderConfig::default())
65 }
66
67 pub fn load_with_config<P: AsRef<Path>>(path: P, config: LoaderConfig) -> Result<Self> {
73 let dict_dir = path.as_ref().to_path_buf();
74
75 let trie_data = Self::load_trie(&dict_dir, config)?;
77 let trie = Trie::from_vec(trie_data);
78
79 let matrix = Self::load_matrix(&dict_dir, config)?;
81
82 let entries = Self::load_entries(&dict_dir, config)?;
84
85 Ok(Self {
86 trie,
87 matrix,
88 dict_dir,
89 entries,
90 })
91 }
92
/// Reads the raw trie bytes from `dict_dir`.
///
/// `sys.dic.zst` is decompressed when `config.auto_decompress` is set and
/// the file exists; otherwise `sys.dic` is read as-is.
///
/// # Errors
///
/// Returns `DictError::Format` when neither file is usable, or the
/// converted I/O / decompression error.
#[cfg(feature = "zstd")]
fn load_trie(dict_dir: &Path, config: LoaderConfig) -> Result<Vec<u8>> {
    use std::io::Read;

    let zst_path = dict_dir.join("sys.dic.zst");
    if config.auto_decompress && zst_path.exists() {
        let mut bytes = Vec::new();
        zstd::Decoder::new(File::open(&zst_path)?)?.read_to_end(&mut bytes)?;
        return Ok(bytes);
    }

    let plain_path = dict_dir.join("sys.dic");
    if plain_path.exists() {
        return Ok(std::fs::read(&plain_path)?);
    }

    Err(DictError::Format(
        "sys.dic or sys.dic.zst not found".to_string(),
    ))
}
116
117 #[cfg(not(feature = "zstd"))]
119 fn load_trie(dict_dir: &Path, _config: LoaderConfig) -> Result<Vec<u8>> {
120 let uncompressed_path = dict_dir.join("sys.dic");
121
122 if uncompressed_path.exists() {
123 Ok(std::fs::read(&uncompressed_path)?)
124 } else {
125 Err(DictError::Format(
126 "sys.dic not found (zstd feature disabled, compressed files not supported)"
127 .to_string(),
128 ))
129 }
130 }
131
/// Loads the connection-cost matrix from `dict_dir`.
///
/// `matrix.bin.zst` is used when `config.auto_decompress` is set and the
/// file exists; otherwise `matrix.bin` is used.
///
/// # Errors
///
/// Returns `DictError::Format` when neither file is usable, or the error
/// reported by the `DenseMatrix` constructor.
#[cfg(feature = "zstd")]
fn load_matrix(dict_dir: &Path, config: LoaderConfig) -> Result<DenseMatrix> {
    let zst_path = dict_dir.join("matrix.bin.zst");
    if config.auto_decompress && zst_path.exists() {
        return DenseMatrix::from_compressed_file(&zst_path);
    }

    let plain_path = dict_dir.join("matrix.bin");
    if plain_path.exists() {
        return DenseMatrix::from_bin_file(&plain_path);
    }

    Err(DictError::Format(
        "matrix.bin or matrix.bin.zst not found".to_string(),
    ))
}
148
149 #[cfg(not(feature = "zstd"))]
151 fn load_matrix(dict_dir: &Path, _config: LoaderConfig) -> Result<DenseMatrix> {
152 let uncompressed_path = dict_dir.join("matrix.bin");
153
154 if uncompressed_path.exists() {
155 DenseMatrix::from_bin_file(&uncompressed_path)
156 } else {
157 Err(DictError::Format(
158 "matrix.bin not found (zstd feature disabled, compressed files not supported)"
159 .to_string(),
160 ))
161 }
162 }
163
/// Loads the entry table from `dict_dir`, trying sources in priority order:
/// `entries.bin.zst` (only when `config.auto_decompress` is set), then
/// `entries.bin`, then `entries.csv`.
///
/// A directory with none of these files is not an error: an empty table is
/// returned.
///
/// # Errors
///
/// Returns the error of whichever source-specific loader was selected.
#[cfg(feature = "zstd")]
fn load_entries(dict_dir: &Path, config: LoaderConfig) -> Result<Vec<Entry>> {
    let zst_path = dict_dir.join("entries.bin.zst");
    let bin_path = dict_dir.join("entries.bin");
    let csv_path = dict_dir.join("entries.csv");

    if config.auto_decompress && zst_path.exists() {
        Self::load_entries_from_compressed_bin(&zst_path)
    } else if bin_path.exists() {
        Self::load_entries_from_bin(&bin_path)
    } else if csv_path.exists() {
        Self::load_entries_from_csv(&csv_path)
    } else {
        Ok(Vec::new())
    }
}
220
221 #[cfg(not(feature = "zstd"))]
222 fn load_entries(dict_dir: &Path, _config: LoaderConfig) -> Result<Vec<Entry>> {
223 let bin_path = dict_dir.join("entries.bin");
225 let csv_path = dict_dir.join("entries.csv");
226
227 if bin_path.exists() {
229 return Self::load_entries_from_bin(&bin_path);
230 }
231
232 if csv_path.exists() {
234 return Self::load_entries_from_csv(&csv_path);
235 }
236
237 Ok(Vec::new())
239 }
240
241 fn load_entries_from_bin(path: &Path) -> Result<Vec<Entry>> {
243 use std::io::Read;
244
245 let mut file = File::open(path)?;
246 let mut buffer = Vec::new();
247 file.read_to_end(&mut buffer)?;
248
249 Self::parse_entries_binary(&buffer)
250 }
251
/// Decompresses the zstd-compressed binary entry file at `path` fully into
/// memory and parses it.
///
/// # Errors
///
/// Returns the converted I/O / decoder error, or the error from
/// `parse_entries_binary` when the decompressed data is malformed.
#[cfg(feature = "zstd")]
fn load_entries_from_compressed_bin(path: &Path) -> Result<Vec<Entry>> {
    let mut decoder = zstd::Decoder::new(File::open(path)?)?;

    let mut decompressed = Vec::new();
    std::io::Read::read_to_end(&mut decoder, &mut decompressed)?;

    Self::parse_entries_binary(&decompressed)
}
264
265 #[cfg(not(feature = "zstd"))]
267 #[allow(dead_code)]
268 fn load_entries_from_compressed_bin(_path: &Path) -> Result<Vec<Entry>> {
269 Err(DictError::Format(
270 "zstd feature is not enabled. Use uncompressed files or enable the 'zstd' feature."
271 .to_string(),
272 ))
273 }
274
275 fn parse_entries_binary(data: &[u8]) -> Result<Vec<Entry>> {
277 use std::io::{Cursor, Read};
278
279 let mut cursor = Cursor::new(data);
280 let mut count_bytes = [0u8; 4];
281 cursor.read_exact(&mut count_bytes).map_err(|_| {
282 DictError::Format("Failed to read entry count from binary file".to_string())
283 })?;
284
285 let count = u32::from_le_bytes(count_bytes) as usize;
286 let mut entries = Vec::with_capacity(count);
287
288 for _ in 0..count {
289 let mut buf = [0u8; 2];
291 cursor.read_exact(&mut buf).map_err(|_| {
292 DictError::Format("Failed to read left_id from binary file".to_string())
293 })?;
294 let left_id = u16::from_le_bytes(buf);
295
296 cursor.read_exact(&mut buf).map_err(|_| {
298 DictError::Format("Failed to read right_id from binary file".to_string())
299 })?;
300 let right_id = u16::from_le_bytes(buf);
301
302 cursor.read_exact(&mut buf).map_err(|_| {
304 DictError::Format("Failed to read cost from binary file".to_string())
305 })?;
306 let cost = i16::from_le_bytes(buf);
307
308 cursor.read_exact(&mut buf).map_err(|_| {
310 DictError::Format("Failed to read surface length from binary file".to_string())
311 })?;
312 let surface_len = u16::from_le_bytes(buf) as usize;
313 let mut surface_bytes = vec![0u8; surface_len];
314 cursor.read_exact(&mut surface_bytes).map_err(|_| {
315 DictError::Format("Failed to read surface from binary file".to_string())
316 })?;
317 let surface = String::from_utf8(surface_bytes)
318 .map_err(|_| DictError::Format("Invalid UTF-8 in surface field".to_string()))?;
319
320 cursor.read_exact(&mut buf).map_err(|_| {
322 DictError::Format("Failed to read feature length from binary file".to_string())
323 })?;
324 let feature_len = u16::from_le_bytes(buf) as usize;
325 let mut feature_bytes = vec![0u8; feature_len];
326 cursor.read_exact(&mut feature_bytes).map_err(|_| {
327 DictError::Format("Failed to read feature from binary file".to_string())
328 })?;
329 let feature = String::from_utf8(feature_bytes)
330 .map_err(|_| DictError::Format("Invalid UTF-8 in feature field".to_string()))?;
331
332 entries.push(Entry {
333 surface,
334 left_id,
335 right_id,
336 cost,
337 feature,
338 });
339 }
340
341 Ok(entries)
342 }
343
344 fn load_entries_from_csv(path: &Path) -> Result<Vec<Entry>> {
346 use std::io::{BufRead, BufReader};
347
348 let file = File::open(path)?;
349 let reader = BufReader::new(file);
350 let mut entries = Vec::new();
351
352 for (line_num, line_result) in reader.lines().enumerate() {
353 let line = line_result?;
354
355 if line.trim().is_empty() || line.starts_with('#') {
357 continue;
358 }
359
360 let entry = Self::parse_csv_line(&line)
361 .map_err(|e| DictError::Format(format!("Failed to parse line {line_num}: {e}")))?;
362
363 entries.push(entry);
364 }
365
366 Ok(entries)
367 }
368
369 fn parse_csv_line(line: &str) -> Result<Entry> {
374 let parts: Vec<&str> = line.split(',').collect();
375
376 if parts.len() < 5 {
377 return Err(DictError::Format(format!(
378 "Invalid CSV line: expected at least 5 fields, got {}",
379 parts.len()
380 )));
381 }
382
383 let surface = parts[0].to_string();
384
385 let left_id = parts[1]
386 .parse::<u16>()
387 .map_err(|_| DictError::Format(format!("Invalid left_id: {}", parts[1])))?;
388
389 let right_id = parts[2]
390 .parse::<u16>()
391 .map_err(|_| DictError::Format(format!("Invalid right_id: {}", parts[2])))?;
392
393 let cost = parts[3]
394 .parse::<i16>()
395 .map_err(|_| DictError::Format(format!("Invalid cost: {}", parts[3])))?;
396
397 let feature = parts[4..].join(",");
399
400 Ok(Entry {
401 surface,
402 left_id,
403 right_id,
404 cost,
405 feature,
406 })
407 }
408
409 #[must_use]
411 pub const fn trie(&self) -> &Trie<'static> {
412 &self.trie
413 }
414
415 #[must_use]
417 pub const fn matrix(&self) -> &DenseMatrix {
418 &self.matrix
419 }
420
421 #[must_use]
423 pub fn dict_dir(&self) -> &Path {
424 &self.dict_dir
425 }
426
427 #[must_use]
429 pub fn entries(&self) -> &[Entry] {
430 &self.entries
431 }
432
433 #[must_use]
439 pub fn get_entry(&self, index: u32) -> Option<&Entry> {
440 self.entries.get(index as usize)
441 }
442}
443
444impl Dictionary for MmapDictionary {
445 fn lookup(&self, surface: &str) -> Vec<Entry> {
446 self.trie
448 .exact_match(surface)
449 .map_or_else(Vec::new, |index| {
450 self.entries.get(index as usize).map_or_else(
451 || {
452 vec![Entry {
455 surface: surface.to_string(),
456 left_id: 0,
457 right_id: 0,
458 cost: 0,
459 feature: "UNK,*,*,*,*,*,*,*".to_string(),
460 }]
461 },
462 |entry| vec![entry.clone()],
463 )
464 })
465 }
466
467 fn get_connection_cost(&self, left_id: u16, right_id: u16) -> i16 {
468 let cost = self.matrix.get(left_id, right_id);
471 cost.clamp(i32::from(i16::MIN), i32::from(i16::MAX))
472 .try_into()
473 .unwrap_or(i16::MAX)
474 }
475}
476
477pub struct LazyDictionary {
481 dict_path: PathBuf,
482 config: LoaderConfig,
483 dict: std::sync::Mutex<Option<MmapDictionary>>,
484}
485
486impl LazyDictionary {
487 pub fn new<P: AsRef<Path>>(path: P) -> Self {
489 Self::new_with_config(path, LoaderConfig::default())
490 }
491
492 pub fn new_with_config<P: AsRef<Path>>(path: P, config: LoaderConfig) -> Self {
494 Self {
495 dict_path: path.as_ref().to_path_buf(),
496 config,
497 dict: std::sync::Mutex::new(None),
498 }
499 }
500
501 fn ensure_loaded(&self) -> Result<()> {
503 let mut dict = self.dict.lock().map_err(|_| {
504 DictError::Format("Failed to acquire lock for lazy dictionary".to_string())
505 })?;
506
507 if dict.is_some() {
508 return Ok(());
509 }
510
511 let loaded_dict = MmapDictionary::load_with_config(&self.dict_path, self.config)?;
512 *dict = Some(loaded_dict);
513 drop(dict);
514
515 Ok(())
516 }
517}
518
519impl Dictionary for LazyDictionary {
520 fn lookup(&self, surface: &str) -> Vec<Entry> {
521 if self.ensure_loaded().is_err() {
522 return Vec::new();
523 }
524
525 let Ok(dict) = self.dict.lock() else {
526 return Vec::new();
527 };
528
529 dict.as_ref().map_or_else(Vec::new, |d| d.lookup(surface))
530 }
531
532 fn get_connection_cost(&self, left_id: u16, right_id: u16) -> i16 {
533 if self.ensure_loaded().is_err() {
534 return 0;
535 }
536
537 let Ok(dict) = self.dict.lock() else {
538 return 0;
539 };
540
541 dict.as_ref()
542 .map_or(0, |d| d.get_connection_cost(left_id, right_id))
543 }
544}
545
546pub struct DictionaryLoader {
548 path: PathBuf,
549 config: LoaderConfig,
550}
551
552impl DictionaryLoader {
553 pub fn new<P: AsRef<Path>>(path: P) -> Self {
555 Self {
556 path: path.as_ref().to_path_buf(),
557 config: LoaderConfig::default(),
558 }
559 }
560
561 #[must_use]
563 pub const fn use_mmap(mut self, use_mmap: bool) -> Self {
564 self.config.use_mmap = use_mmap;
565 self
566 }
567
568 #[must_use]
570 pub const fn auto_decompress(mut self, auto: bool) -> Self {
571 self.config.auto_decompress = auto;
572 self
573 }
574
575 #[must_use]
577 pub const fn lazy_load(mut self, lazy: bool) -> Self {
578 self.config.lazy_load = lazy;
579 self
580 }
581
582 pub fn load(self) -> Result<Box<dyn Dictionary>> {
588 if self.config.lazy_load {
589 Ok(Box::new(LazyDictionary::new_with_config(
590 self.path,
591 self.config,
592 )))
593 } else {
594 Ok(Box::new(MmapDictionary::load_with_config(
595 self.path,
596 self.config,
597 )?))
598 }
599 }
600}
601
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::trie::TrieBuilder;

    /// Builds a trie from `words` and writes it as `sys.dic` inside `dir`.
    fn write_trie(dir: &std::path::Path, words: &[(&str, u32)]) {
        let trie_entries = words.to_vec();
        let trie_bytes = TrieBuilder::build(&trie_entries).expect("build trie");
        std::fs::write(dir.join("sys.dic"), trie_bytes).expect("write trie");
    }

    /// Creates a temporary dictionary directory holding a three-word trie,
    /// a 10x10 matrix and a matching `entries.csv`.
    fn create_test_dict() -> tempfile::TempDir {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");
        let dir = temp_dir.path();

        write_trie(dir, &[("가", 0u32), ("가다", 1u32), ("가방", 2u32)]);

        DenseMatrix::new(10, 10, 100)
            .to_bin_file(dir.join("matrix.bin"))
            .expect("write matrix");

        let entries_csv = "가,1,1,100,NNG,*,T,가,*,*,*,*\n\
                           가다,2,2,200,VV,*,F,가다,*,*,*,*\n\
                           가방,3,3,300,NNG,*,T,가방,*,*,*,*\n";
        std::fs::write(dir.join("entries.csv"), entries_csv).expect("write entries");

        temp_dir
    }

    #[test]
    fn test_mmap_dictionary_load() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");
        let trie = dict.trie();

        assert!(trie.exact_match("가").is_some());
        assert!(trie.exact_match("가다").is_some());
        assert!(trie.exact_match("없음").is_none());
    }

    #[test]
    fn test_dictionary_lookup() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        let found = dict.lookup("가");
        assert!(!found.is_empty());
        let first = &found[0];
        assert_eq!(first.surface, "가");
        assert_eq!(first.left_id, 1);
        assert_eq!(first.right_id, 1);
        assert_eq!(first.cost, 100);
        assert!(first.feature.starts_with("NNG"));

        let missing = dict.lookup("없음");
        assert!(missing.is_empty());
    }

    #[test]
    fn test_connection_cost() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        assert_eq!(dict.get_connection_cost(0, 0), 100);
    }

    #[test]
    fn test_loader_builder() {
        let temp_dir = create_test_dict();

        let dict = DictionaryLoader::new(temp_dir.path())
            .use_mmap(true)
            .auto_decompress(true)
            .load()
            .expect("load failed");

        assert!(!dict.lookup("가").is_empty());
    }

    #[test]
    fn test_lazy_dictionary() {
        let temp_dir = create_test_dict();
        let dict = LazyDictionary::new(temp_dir.path());

        // The first lookup triggers the load; the second reuses the result.
        let first = dict.lookup("가");
        assert!(!first.is_empty());

        let second = dict.lookup("가다");
        assert!(!second.is_empty());
    }

    #[test]
    fn test_missing_dictionary() {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");

        assert!(MmapDictionary::load(temp_dir.path()).is_err());
    }

    #[test]
    fn test_get_entry_by_index() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        let first = dict.get_entry(0);
        assert!(first.is_some());
        assert_eq!(first.unwrap().surface, "가");

        let second = dict.get_entry(1);
        assert!(second.is_some());
        assert_eq!(second.unwrap().surface, "가다");

        let out_of_range = dict.get_entry(100);
        assert!(out_of_range.is_none());
    }

    #[test]
    fn test_entries_accessor() {
        let temp_dir = create_test_dict();
        let dict = MmapDictionary::load(temp_dir.path()).expect("load failed");

        let all = dict.entries();
        assert_eq!(all.len(), 3);
        assert_eq!(all[0].surface, "가");
        assert_eq!(all[1].surface, "가다");
        assert_eq!(all[2].surface, "가방");
    }

    #[test]
    fn test_csv_parsing() {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");
        let dir = temp_dir.path();

        let entries_csv = "안녕,10,20,500,NNG,*,T,안녕,*,*,*,*\n\
                           하세요,15,25,600,VV+EC,*,F,하세요,*,*,*,*\n";
        std::fs::write(dir.join("entries.csv"), entries_csv).expect("write entries");

        write_trie(dir, &[("안녕", 0u32), ("하세요", 1u32)]);

        DenseMatrix::new(30, 30, 100)
            .to_bin_file(dir.join("matrix.bin"))
            .expect("write matrix");

        let dict = MmapDictionary::load(dir).expect("load failed");
        let found = dict.lookup("안녕");
        assert!(!found.is_empty());
        assert_eq!(found[0].surface, "안녕");
        assert_eq!(found[0].left_id, 10);
        assert_eq!(found[0].right_id, 20);
        assert_eq!(found[0].cost, 500);
    }

    #[test]
    fn test_dict_without_entries() {
        let temp_dir = tempfile::TempDir::new().expect("create temp dir");
        let dir = temp_dir.path();

        write_trie(dir, &[("테스트", 0u32)]);

        DenseMatrix::new(2, 2, 100)
            .to_bin_file(dir.join("matrix.bin"))
            .expect("write matrix");

        let dict = MmapDictionary::load(dir).expect("load failed");
        assert_eq!(dict.entries().len(), 0);

        // The trie hit has no backing entry, so a placeholder is returned.
        let found = dict.lookup("테스트");
        assert!(!found.is_empty());
        assert_eq!(found[0].surface, "테스트");
        assert_eq!(found[0].feature, "UNK,*,*,*,*,*,*,*");
    }
}