1use std::collections::HashMap;
27use std::io::{BufRead, BufReader};
28use std::path::Path;
29
30use crate::error::{DictError, Result};
31use crate::trie::{Trie, TrieBuilder};
32use crate::Entry;
33
/// POS tags accepted by [`is_valid_pos_tag`].
///
/// The tags keep the order and grouping of the original listing; only
/// membership matters to the lookup.
const VALID_POS_TAGS: &[&str] = &[
    "NNG", "NNP", "NNB", "NR", "NP",
    "VV", "VA", "VX", "VCP", "VCN",
    "MM", "MAG", "MAJ", "IC",
    "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
    "EP", "EF", "EC", "ETN", "ETM",
    "XPN", "XSN", "XSV", "XSA", "XR",
    "SF", "SE", "SS", "SP", "SO", "SW",
    "SL", "SH", "SN",
    "NA",
];

/// Returns `true` when `pos` is a known tag or a `+`-joined sequence of
/// known tags (for example `"NNG+JX"`).
///
/// Every `+`-separated component must be listed in [`VALID_POS_TAGS`], so
/// `""`, `"+"` and `"NNG+"` are rejected: each contains an empty component.
#[must_use]
pub fn is_valid_pos_tag(pos: &str) -> bool {
    // A tag without `+` is simply a one-component sequence, so a single pass
    // over the components handles both the plain and the compound form.
    pos.split('+')
        .all(|component| VALID_POS_TAGS.contains(&component))
}
71
/// Guesses a POS tag for `surface` from its characters alone.
///
/// The rules are checked in order and the first match wins:
///
/// 1. empty string -> `"NA"`
/// 2. only ASCII digits -> `"SN"`
/// 3. only ASCII letters and digits -> `"SL"`
/// 4. first character is Hangul:
///    - last character is `다`, `하` or `되` -> `"VV"`
///    - any ASCII letter is present -> `"NNP"`
///    - otherwise -> `"NNG"`
/// 5. first character is ASCII punctuation -> `"SW"`
/// 6. first character is Hanja -> `"SH"`
/// 7. anything else -> `"NNG"`
///
/// This is a heuristic; the returned tag is always one of the literals above.
#[must_use]
pub fn estimate_pos(surface: &str) -> &'static str {
    let first_char = match surface.chars().next() {
        Some(c) => c,
        None => return "NA",
    };
    // `first_char` exists, so the fallback is never taken; it only avoids a panic path.
    let last_char = surface.chars().next_back().unwrap_or(first_char);

    if surface.chars().all(|c| c.is_ascii_digit()) {
        return "SN";
    }

    // Digit-only input was handled above, so this covers pure-letter and
    // mixed letter/digit words alike.
    if surface.chars().all(|c| c.is_ascii_alphanumeric()) {
        return "SL";
    }

    if is_hangul(first_char) {
        // NOTE: adverb-like endings (`이`, `히`, `게`, `로`, `리`) get no special
        // treatment; such words fall through to the noun rules below.
        return if matches!(last_char, '다' | '하' | '되') {
            "VV"
        } else if surface.chars().any(|c| c.is_ascii_alphabetic()) {
            "NNP"
        } else {
            "NNG"
        };
    }

    if first_char.is_ascii_punctuation() {
        return "SW";
    }

    if is_hanja(first_char) {
        return "SH";
    }

    "NNG"
}

/// Returns `true` for code points in the Hangul Syllables (U+AC00..=U+D7A3),
/// Hangul Jamo (U+1100..=U+11FF) or Hangul Compatibility Jamo
/// (U+3130..=U+318F) ranges.
fn is_hangul(c: char) -> bool {
    matches!(
        c,
        '\u{AC00}'..='\u{D7A3}' | '\u{1100}'..='\u{11FF}' | '\u{3130}'..='\u{318F}'
    )
}

/// Returns `true` for code points in the CJK Unified Ideographs
/// (U+4E00..=U+9FFF) or Extension A (U+3400..=U+4DBF) ranges.
fn is_hanja(c: char) -> bool {
    matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}')
}
162
/// Outcome of a dictionary validation run.
///
/// Note that the derived `Default` yields `is_valid == false` with empty
/// message lists.
#[derive(Debug, Clone, Default)]
pub struct ValidationResult {
    /// Overall validity flag.
    pub is_valid: bool,
    /// Non-fatal findings.
    pub warnings: Vec<String>,
    /// Fatal findings.
    pub errors: Vec<String>,
}

impl ValidationResult {
    /// Returns `true` only when the result is valid *and* carries no warnings.
    #[must_use]
    pub fn is_ok(&self) -> bool {
        if !self.is_valid {
            return false;
        }
        self.warnings.is_empty()
    }

    /// Total number of recorded messages (warnings plus errors).
    #[must_use]
    pub fn issue_count(&self) -> usize {
        let warning_count = self.warnings.len();
        let error_count = self.errors.len();
        warning_count + error_count
    }
}
187
/// Summary figures describing a dictionary.
#[derive(Debug, Clone)]
pub struct DictionaryStats {
    /// Total number of entries.
    pub entry_count: usize,
    /// Number of distinct surface forms.
    pub unique_surfaces: usize,
    /// Entry count per POS tag.
    pub pos_distribution: HashMap<String, usize>,
    /// Mean entry cost.
    pub average_cost: f64,
}

impl std::fmt::Display for DictionaryStats {
    /// Writes a multi-line report: the headline figures followed by at most
    /// the ten most frequent POS tags.
    ///
    /// Tags are ordered by descending count; equal counts are ordered by tag
    /// name so the report is identical from run to run (plain `HashMap`
    /// iteration order is not).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Dictionary Statistics:")?;
        writeln!(f, " Total entries: {}", self.entry_count)?;
        writeln!(f, " Unique surfaces: {}", self.unique_surfaces)?;
        writeln!(f, " Average cost: {:.2}", self.average_cost)?;
        writeln!(f, " POS distribution:")?;

        let mut pos_sorted: Vec<(&String, &usize)> = self.pos_distribution.iter().collect();
        // Keys are unique, so the comparator is a total order and an unstable
        // sort yields a deterministic result.
        pos_sorted.sort_unstable_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));

        for (pos, count) in pos_sorted.iter().take(10) {
            writeln!(f, " {pos}: {count}")?;
        }

        Ok(())
    }
}
219
/// A single user-dictionary record.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UserEntry {
    /// Surface form (the text to be matched).
    pub surface: String,
    /// Left context id.
    pub left_id: u16,
    /// Right context id.
    pub right_id: u16,
    /// Entry cost.
    pub cost: i16,
    /// POS tag.
    pub pos: String,
    /// Optional reading.
    pub reading: Option<String>,
    /// Optional lemma.
    pub lemma: Option<String>,
    /// Comma-separated feature string.
    pub feature: String,
}
240
241impl UserEntry {
242 pub fn new(
244 surface: impl Into<String>,
245 pos: impl Into<String>,
246 cost: i16,
247 reading: Option<String>,
248 ) -> Self {
249 let surface = surface.into();
250 let pos = pos.into();
251 let feature = format!("{},*,*,{},*,*,*,*", pos, reading.as_deref().unwrap_or("*"));
252 Self {
253 surface,
254 left_id: 0, right_id: 0,
256 cost,
257 pos,
258 reading,
259 lemma: None,
260 feature,
261 }
262 }
263
264 #[must_use]
266 pub const fn with_context_ids(mut self, left_id: u16, right_id: u16) -> Self {
267 self.left_id = left_id;
268 self.right_id = right_id;
269 self
270 }
271
272 #[must_use]
274 pub fn with_lemma(mut self, lemma: impl Into<String>) -> Self {
275 self.lemma = Some(lemma.into());
276 self
277 }
278
279 #[must_use]
281 pub fn to_entry(&self) -> Entry {
282 let feature = format!(
283 "{},*,*,*,*,*,{},*",
284 self.pos,
285 self.reading.as_deref().unwrap_or("*")
286 );
287
288 Entry {
289 surface: self.surface.clone(),
290 left_id: self.left_id,
291 right_id: self.right_id,
292 cost: self.cost,
293 feature,
294 }
295 }
296}
297
298#[derive(Clone)]
302pub struct UserDictionary {
303 entries: Vec<UserEntry>,
305 surface_map: HashMap<String, Vec<usize>>,
307 trie_cache: Option<Vec<u8>>,
309 default_cost: i16,
311}
312
313impl Default for UserDictionary {
314 fn default() -> Self {
315 Self::new()
316 }
317}
318
319impl UserDictionary {
320 #[must_use]
322 pub fn new() -> Self {
323 Self {
324 entries: Vec::new(),
325 surface_map: HashMap::new(),
326 trie_cache: None,
327 default_cost: -1000, }
329 }
330
331 #[must_use]
333 pub const fn with_default_cost(mut self, cost: i16) -> Self {
334 self.default_cost = cost;
335 self
336 }
337
338 pub fn add_entry(
347 &mut self,
348 surface: impl Into<String>,
349 pos: impl Into<String>,
350 cost: Option<i16>,
351 reading: Option<String>,
352 ) -> &mut Self {
353 let surface = surface.into();
354 let cost = cost.unwrap_or(self.default_cost);
355 let entry = UserEntry::new(surface.clone(), pos, cost, reading);
356
357 let idx = self.entries.len();
358 self.entries.push(entry);
359
360 self.surface_map.entry(surface).or_default().push(idx);
361
362 self.trie_cache = None;
364
365 self
366 }
367
368 pub fn add_entry_with_ids(
370 &mut self,
371 surface: impl Into<String>,
372 pos: impl Into<String>,
373 cost: i16,
374 left_id: u16,
375 right_id: u16,
376 reading: Option<String>,
377 ) -> &mut Self {
378 let surface = surface.into();
379 let entry =
380 UserEntry::new(surface.clone(), pos, cost, reading).with_context_ids(left_id, right_id);
381
382 let idx = self.entries.len();
383 self.entries.push(entry);
384
385 self.surface_map.entry(surface).or_default().push(idx);
386
387 self.trie_cache = None;
388
389 self
390 }
391
392 pub fn load_from_csv<P: AsRef<Path>>(&mut self, path: P) -> Result<&mut Self> {
410 let file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
411 let reader = BufReader::new(file);
412
413 for (line_num, line_result) in reader.lines().enumerate() {
414 let line = line_result.map_err(DictError::Io)?;
415 let line = line.trim();
416
417 if line.is_empty() || line.starts_with('#') {
419 continue;
420 }
421
422 self.parse_csv_line(line, line_num + 1)?;
423 }
424
425 Ok(self)
426 }
427
428 pub fn load_from_str(&mut self, content: &str) -> Result<&mut Self> {
434 for (line_num, line) in content.lines().enumerate() {
435 let line = line.trim();
436
437 if line.is_empty() || line.starts_with('#') {
438 continue;
439 }
440
441 self.parse_csv_line(line, line_num + 1)?;
442 }
443
444 Ok(self)
445 }
446
447 fn parse_csv_line(&mut self, line: &str, line_num: usize) -> Result<()> {
449 let parts: Vec<&str> = line.split(',').collect();
450
451 if parts.len() < 2 {
452 return Err(DictError::Format(format!(
453 "Invalid user dictionary format at line {line_num}: expected at least 2 fields"
454 )));
455 }
456
457 let surface = parts[0].trim();
458 let pos = parts[1].trim();
459
460 if surface.is_empty() || pos.is_empty() {
461 return Err(DictError::Format(format!(
462 "Empty surface or POS at line {line_num}"
463 )));
464 }
465
466 let cost = if parts.len() > 2 && !parts[2].trim().is_empty() {
467 parts[2].trim().parse::<i16>().map_err(|_| {
468 DictError::Format(format!("Invalid cost at line {}: {}", line_num, parts[2]))
469 })?
470 } else {
471 self.default_cost
472 };
473
474 let reading = if parts.len() > 3 && !parts[3].trim().is_empty() {
475 Some(parts[3].trim().to_string())
476 } else {
477 None
478 };
479
480 self.add_entry(surface, pos, Some(cost), reading);
481
482 Ok(())
483 }
484
485 #[must_use]
487 pub fn lookup(&self, surface: &str) -> Vec<&UserEntry> {
488 self.surface_map
489 .get(surface)
490 .map(|indices| {
491 indices
492 .iter()
493 .filter_map(|&idx| self.entries.get(idx))
494 .collect()
495 })
496 .unwrap_or_default()
497 }
498
499 #[must_use]
511 pub fn common_prefix_search(&self, text: &str) -> Vec<&UserEntry> {
512 let mut results = Vec::new();
513
514 for entry in &self.entries {
516 if text.starts_with(&entry.surface) {
517 results.push(entry);
518 }
519 }
520
521 results
522 }
523
524 #[must_use]
526 pub fn entries(&self) -> &[UserEntry] {
527 &self.entries
528 }
529
530 #[must_use]
532 pub fn len(&self) -> usize {
533 self.entries.len()
534 }
535
536 #[must_use]
538 pub fn is_empty(&self) -> bool {
539 self.entries.is_empty()
540 }
541
542 pub fn build_trie(&mut self) -> Result<&[u8]> {
550 if let Some(ref cache) = self.trie_cache {
551 return Ok(cache);
552 }
553
554 if self.entries.is_empty() {
555 return Err(DictError::Format(
556 "Cannot build Trie from empty user dictionary".to_string(),
557 ));
558 }
559
560 #[allow(clippy::cast_possible_truncation)]
562 let mut trie_entries: Vec<(&str, u32)> = self
563 .surface_map
564 .iter()
565 .filter_map(|(surface, indices)| {
566 indices.first().map(|&idx| (surface.as_str(), idx as u32))
567 })
568 .collect();
569
570 trie_entries.sort_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()));
572
573 let bytes = TrieBuilder::build(&trie_entries)?;
574 self.trie_cache = Some(bytes);
575
576 Ok(self.trie_cache.as_ref().unwrap_or_else(|| unreachable!()))
578 }
579
580 #[must_use]
582 pub fn get_trie(&self) -> Option<Trie<'_>> {
583 self.trie_cache.as_ref().map(|bytes| Trie::new(bytes))
584 }
585
586 #[must_use]
588 pub fn to_entries(&self) -> Vec<Entry> {
589 self.entries.iter().map(UserEntry::to_entry).collect()
590 }
591
592 pub fn clear(&mut self) {
594 self.entries.clear();
595 self.surface_map.clear();
596 self.trie_cache = None;
597 }
598
599 #[must_use]
607 pub fn validate(&self) -> ValidationResult {
608 let mut warnings = Vec::new();
609 let mut errors = Vec::new();
610
611 for (idx, entry) in self.entries.iter().enumerate() {
612 if entry.surface.is_empty() {
614 errors.push(format!("Entry {idx}: empty surface"));
615 }
616
617 if entry.pos.is_empty() {
619 errors.push(format!("Entry {idx}: empty POS tag"));
620 }
621
622 if entry.cost == i16::MIN || entry.cost == i16::MAX {
624 warnings.push(format!(
625 "Entry {} ({}): cost {} is at extreme value",
626 idx, entry.surface, entry.cost
627 ));
628 }
629
630 if !is_valid_pos_tag(&entry.pos) {
632 warnings.push(format!(
633 "Entry {} ({}): unknown POS tag '{}'",
634 idx, entry.surface, entry.pos
635 ));
636 }
637 }
638
639 let mut seen: HashMap<(&str, &str), usize> = HashMap::new();
641 for (idx, entry) in self.entries.iter().enumerate() {
642 let key = (entry.surface.as_str(), entry.pos.as_str());
643 if let Some(&prev_idx) = seen.get(&key) {
644 warnings.push(format!(
645 "Duplicate entry at {} and {}: {} ({})",
646 prev_idx, idx, entry.surface, entry.pos
647 ));
648 } else {
649 seen.insert(key, idx);
650 }
651 }
652
653 ValidationResult {
654 is_valid: errors.is_empty(),
655 warnings,
656 errors,
657 }
658 }
659
660 pub fn remove_duplicates(&mut self) {
664 let mut seen: HashMap<(String, String), bool> = HashMap::new();
665 let mut new_entries = Vec::new();
666
667 for entry in self.entries.drain(..) {
668 let key = (entry.surface.clone(), entry.pos.clone());
669 if seen.contains_key(&key) {
670 continue;
671 }
672 seen.insert(key, true);
673 new_entries.push(entry);
674 }
675
676 self.entries = new_entries;
677 self.rebuild_surface_map();
678 self.trie_cache = None;
679 }
680
681 fn rebuild_surface_map(&mut self) {
683 self.surface_map.clear();
684 for (idx, entry) in self.entries.iter().enumerate() {
685 self.surface_map
686 .entry(entry.surface.clone())
687 .or_default()
688 .push(idx);
689 }
690 }
691
692 pub fn remove_surface(&mut self, surface: &str) -> usize {
698 if let Some(indices) = self.surface_map.remove(surface) {
699 let count = indices.len();
700
701 let mut indices_sorted = indices;
703 indices_sorted.sort_by(|a, b| b.cmp(a));
704
705 for idx in indices_sorted {
706 if idx < self.entries.len() {
707 self.entries.remove(idx);
708 }
709 }
710
711 self.rebuild_surface_map();
712 self.trie_cache = None;
713 count
714 } else {
715 0
716 }
717 }
718
719 pub fn check_csv_duplicates<P: AsRef<Path>>(
733 path: P,
734 ) -> Result<Vec<(usize, String, String)>> {
735 let file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
736 let reader = BufReader::new(file);
737
738 let mut seen: HashMap<(String, String), usize> = HashMap::new();
739 let mut duplicates = Vec::new();
740
741 for (line_num, line_result) in reader.lines().enumerate() {
742 let line = line_result.map_err(DictError::Io)?;
743 let line = line.trim();
744
745 if line.is_empty() || line.starts_with('#') {
746 continue;
747 }
748
749 let parts: Vec<&str> = line.split(',').collect();
750 if parts.len() >= 2 {
751 let surface = parts[0].trim().to_string();
752 let pos = parts[1].trim().to_string();
753 let key = (surface.clone(), pos.clone());
754
755 if let Some(&prev_line) = seen.get(&key) {
756 duplicates.push((line_num + 1, surface, pos));
757 duplicates.push((prev_line, key.0.clone(), key.1.clone()));
758 } else {
759 seen.insert(key, line_num + 1);
760 }
761 }
762 }
763
764 Ok(duplicates)
765 }
766
767 pub fn add_entry_auto_pos(
771 &mut self,
772 surface: impl Into<String>,
773 cost: Option<i16>,
774 reading: Option<String>,
775 ) -> &mut Self {
776 let surface = surface.into();
777 let pos = estimate_pos(&surface);
778 self.add_entry(surface, pos, cost, reading)
779 }
780
781 #[must_use]
793 pub fn check_system_conflicts<S: std::hash::BuildHasher>(
794 &self,
795 system_surfaces: &std::collections::HashSet<String, S>,
796 ) -> Vec<(usize, String, String)> {
797 let mut conflicts = Vec::new();
798
799 for (idx, entry) in self.entries.iter().enumerate() {
800 if system_surfaces.contains(&entry.surface) {
801 conflicts.push((idx, entry.surface.clone(), entry.pos.clone()));
802 }
803 }
804
805 conflicts
806 }
807
808 #[must_use]
810 pub fn stats(&self) -> DictionaryStats {
811 let mut pos_counts: HashMap<String, usize> = HashMap::new();
812 let mut total_cost: i64 = 0;
813
814 for entry in &self.entries {
815 *pos_counts.entry(entry.pos.clone()).or_insert(0) += 1;
816 total_cost += i64::from(entry.cost);
817 }
818
819 DictionaryStats {
820 entry_count: self.entries.len(),
821 unique_surfaces: self.surface_map.len(),
822 pos_distribution: pos_counts,
823 #[allow(clippy::cast_precision_loss)]
824 average_cost: if self.entries.is_empty() {
825 0.0
826 } else {
827 (total_cost as f64) / (self.entries.len() as f64)
829 },
830 }
831 }
832
833 pub fn save_to_csv<P: AsRef<Path>>(&self, path: P) -> Result<()> {
839 use std::io::Write;
840
841 let mut file = std::fs::File::create(path.as_ref()).map_err(DictError::Io)?;
842
843 writeln!(file, "# 사용자 정의 사전").map_err(DictError::Io)?;
844 writeln!(file, "# 표면형,품사,비용,읽기").map_err(DictError::Io)?;
845
846 for entry in &self.entries {
847 let reading = entry.reading.as_deref().unwrap_or("");
848 writeln!(
849 file,
850 "{},{},{},{}",
851 entry.surface, entry.pos, entry.cost, reading
852 )
853 .map_err(DictError::Io)?;
854 }
855
856 Ok(())
857 }
858}
859
860pub struct UserDictionaryBuilder {
862 dict: UserDictionary,
863}
864
865impl Default for UserDictionaryBuilder {
866 fn default() -> Self {
867 Self::new()
868 }
869}
870
871impl UserDictionaryBuilder {
872 #[must_use]
874 pub fn new() -> Self {
875 Self {
876 dict: UserDictionary::new(),
877 }
878 }
879
880 #[must_use]
882 pub fn default_cost(mut self, cost: i16) -> Self {
883 self.dict = self.dict.with_default_cost(cost);
884 self
885 }
886
887 #[must_use]
889 pub fn add(mut self, surface: &str, pos: &str) -> Self {
890 self.dict.add_entry(surface, pos, None, None);
891 self
892 }
893
894 #[must_use]
896 pub fn add_with_cost(mut self, surface: &str, pos: &str, cost: i16) -> Self {
897 self.dict.add_entry(surface, pos, Some(cost), None);
898 self
899 }
900
901 #[must_use]
903 pub fn add_full(mut self, surface: &str, pos: &str, cost: i16, reading: Option<&str>) -> Self {
904 self.dict
905 .add_entry(surface, pos, Some(cost), reading.map(String::from));
906 self
907 }
908
909 pub fn load_csv<P: AsRef<Path>>(mut self, path: P) -> Result<Self> {
915 self.dict.load_from_csv(path)?;
916 Ok(self)
917 }
918
919 pub fn load_str(mut self, content: &str) -> Result<Self> {
925 self.dict.load_from_str(content)?;
926 Ok(self)
927 }
928
929 #[must_use]
931 pub fn build(self) -> UserDictionary {
932 self.dict
933 }
934
935 pub fn build_with_trie(mut self) -> Result<UserDictionary> {
941 if !self.dict.is_empty() {
942 self.dict.build_trie()?;
943 }
944 Ok(self.dict)
945 }
946}
947
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Entries with explicit and defaulted cost are both stored.
    #[test]
    fn test_add_entry() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-500), None);
        user_dict.add_entry("머신러닝", "NNG", None, Some(String::from("머신러닝")));

        assert_eq!(user_dict.len(), 2);
    }

    /// Several entries may share one surface; lookup returns them in insertion order.
    #[test]
    fn test_lookup() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("딥러닝", "NNG", Some(-500), None);
        user_dict.add_entry("딥러닝", "NNP", Some(-300), None);

        let found = user_dict.lookup("딥러닝");
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].pos, "NNG");
        assert_eq!(found[1].pos, "NNP");
    }

    /// Comment and blank lines are skipped; a blank cost falls back to the default.
    #[test]
    fn test_load_from_str() {
        let csv = r"
# 사용자 사전
형태소분석,NNG,-1000,형태소분석
딥러닝,NNG,-500,
자연어처리,NNG,,자연어처리
";
        let mut user_dict = UserDictionary::new();
        user_dict.load_from_str(csv).expect("should load");

        assert_eq!(user_dict.len(), 3);

        let found = user_dict.lookup("형태소분석");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].cost, -1000);
        assert_eq!(found[0].reading.as_deref(), Some("형태소분석"));

        let found = user_dict.lookup("딥러닝");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].cost, -500);

        let found = user_dict.lookup("자연어처리");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].cost, -1000);
    }

    /// The built trie answers exact-match queries for every surface.
    #[test]
    fn test_build_trie() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("가", "NNG", Some(0), None);
        user_dict.add_entry("가다", "VV", Some(0), None);
        user_dict.add_entry("가방", "NNG", Some(0), None);

        let bytes = user_dict.build_trie().expect("should build");
        assert!(!bytes.is_empty());

        let trie = user_dict.get_trie().expect("should have trie");
        assert!(trie.exact_match("가").is_some());
        assert!(trie.exact_match("가다").is_some());
        assert!(trie.exact_match("가방").is_some());
        assert!(trie.exact_match("없음").is_none());
    }

    /// The builder applies its default cost only where no cost is given.
    #[test]
    fn test_builder_pattern() {
        let built = UserDictionaryBuilder::new()
            .default_cost(-500)
            .add("딥러닝", "NNG")
            .add_with_cost("머신러닝", "NNG", -300)
            .add_full("자연어처리", "NNG", -400, Some("자연어처리"))
            .build();

        assert_eq!(built.len(), 3);

        let found = built.lookup("딥러닝");
        assert_eq!(found[0].cost, -500);

        let found = built.lookup("머신러닝");
        assert_eq!(found[0].cost, -300);
    }

    /// Conversion keeps surface and cost and embeds POS and reading in the feature.
    #[test]
    fn test_to_entry() {
        let user_entry = UserEntry::new("테스트", "NNG", -100, Some(String::from("테스트")));
        let converted = user_entry.to_entry();

        assert_eq!(converted.surface, "테스트");
        assert_eq!(converted.cost, -100);
        assert!(converted.feature.contains("NNG"));
        assert!(converted.feature.contains("테스트"));
    }

    /// Mixed Hangul/ASCII surfaces and readings are stored verbatim.
    #[test]
    fn test_korean_entries() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("챗GPT", "NNP", Some(-1000), Some(String::from("챗지피티")));
        user_dict.add_entry("클로드", "NNP", Some(-1000), None);
        user_dict.add_entry("라마", "NNP", Some(-1000), None);
        user_dict.add_entry("메타", "NNP", Some(-800), None);
        user_dict.add_entry("앤트로픽", "NNP", Some(-1000), None);

        assert_eq!(user_dict.len(), 5);

        let found = user_dict.lookup("챗GPT");
        assert_eq!(found[0].reading.as_deref(), Some("챗지피티"));
    }

    /// `clear` empties the dictionary.
    #[test]
    fn test_clear() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("테스트", "NNG", None, None);
        assert_eq!(user_dict.len(), 1);

        user_dict.clear();
        assert!(user_dict.is_empty());
    }

    /// A line with only a surface is rejected.
    #[test]
    fn test_invalid_csv() {
        let csv = "표면형만";
        let mut user_dict = UserDictionary::new();
        let outcome = user_dict.load_from_str(csv);
        assert!(outcome.is_err());
    }

    /// The trie finds all three nested prefixes of the query.
    #[test]
    fn test_common_prefix_search() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("형태", "NNG", Some(0), None);
        user_dict.add_entry("형태소", "NNG", Some(0), None);
        user_dict.add_entry("형태소분석", "NNG", Some(0), None);

        user_dict.build_trie().expect("should build");

        let trie = user_dict.get_trie().expect("should have trie");

        assert_eq!(trie.common_prefix_search("형태소분석기").count(), 3);
    }

    /// Explicit context ids are stored on the entry.
    #[test]
    fn test_with_context_ids() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry_with_ids("테스트", "NNG", -100, 1234, 5678, None);

        let found = user_dict.lookup("테스트");
        assert_eq!(found[0].left_id, 1234);
        assert_eq!(found[0].right_id, 5678);
    }

    /// Well-formed entries validate cleanly.
    #[test]
    fn test_validate() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("테스트", "NNG", Some(-100), None);
        user_dict.add_entry("유효", "VV", Some(-200), None);

        let report = user_dict.validate();
        assert!(report.is_valid);
    }

    /// An unknown POS tag is a warning, not an error.
    #[test]
    fn test_validate_with_invalid_pos() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("테스트", "INVALID_POS", Some(-100), None);

        let report = user_dict.validate();
        assert!(report.is_valid);
        assert!(!report.warnings.is_empty());
    }

    /// Duplicates are keyed on (surface, pos); a different POS is kept.
    #[test]
    fn test_remove_duplicates() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("테스트", "NNG", Some(-100), None);
        user_dict.add_entry("테스트", "NNG", Some(-200), None);
        user_dict.add_entry("테스트", "VV", Some(-300), None);
        assert_eq!(user_dict.len(), 3);

        user_dict.remove_duplicates();
        assert_eq!(user_dict.len(), 2);
    }

    /// Removing a surface drops all of its entries and nothing else.
    #[test]
    fn test_remove_surface() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("삭제", "NNG", Some(-100), None);
        user_dict.add_entry("삭제", "VV", Some(-200), None);
        user_dict.add_entry("유지", "NNG", Some(-100), None);

        let removed = user_dict.remove_surface("삭제");
        assert_eq!(removed, 2);
        assert_eq!(user_dict.len(), 1);
        assert!(user_dict.lookup("삭제").is_empty());
    }

    /// Statistics count entries, surfaces and per-POS totals.
    #[test]
    fn test_stats() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("명사1", "NNG", Some(-100), None);
        user_dict.add_entry("명사2", "NNG", Some(-200), None);
        user_dict.add_entry("동사", "VV", Some(-150), None);

        let summary = user_dict.stats();
        assert_eq!(summary.entry_count, 3);
        assert_eq!(summary.unique_surfaces, 3);
        assert_eq!(summary.pos_distribution.get("NNG"), Some(&2));
        assert_eq!(summary.pos_distribution.get("VV"), Some(&1));
    }

    /// Plain and `+`-joined tags are accepted; unknown tags are not.
    #[test]
    fn test_is_valid_pos_tag() {
        assert!(is_valid_pos_tag("NNG"));
        assert!(is_valid_pos_tag("VV"));
        assert!(is_valid_pos_tag("NNG+JX"));
        assert!(!is_valid_pos_tag("INVALID"));
    }

    /// One representative input per estimation rule.
    #[test]
    fn test_estimate_pos() {
        assert_eq!(estimate_pos("GPT"), "SL");
        assert_eq!(estimate_pos("BTS"), "SL");

        assert_eq!(estimate_pos("123"), "SN");

        assert_eq!(estimate_pos("챗GPT"), "NNP");

        assert_eq!(estimate_pos("하다"), "VV");
        assert_eq!(estimate_pos("먹다"), "VV");

        assert_eq!(estimate_pos("메타버스"), "NNG");
        assert_eq!(estimate_pos("사과"), "NNG");

        assert_eq!(estimate_pos(""), "NA");
    }

    /// Auto-POS entries carry the estimated tag.
    #[test]
    fn test_add_entry_auto_pos() {
        let mut user_dict = UserDictionary::new();
        user_dict.add_entry_auto_pos("GPT", None, None);
        user_dict.add_entry_auto_pos("챗GPT", None, None);
        user_dict.add_entry_auto_pos("메타버스", None, None);

        let found = user_dict.lookup("GPT");
        assert_eq!(found[0].pos, "SL");

        let found = user_dict.lookup("챗GPT");
        assert_eq!(found[0].pos, "NNP");

        let found = user_dict.lookup("메타버스");
        assert_eq!(found[0].pos, "NNG");
    }

    /// Only surfaces present in the system set are reported as conflicts.
    #[test]
    fn test_check_system_conflicts() {
        use std::collections::HashSet;

        let mut user_dict = UserDictionary::new();
        user_dict.add_entry("사과", "NNG", None, None);
        user_dict.add_entry("챗GPT", "NNP", None, None);
        user_dict.add_entry("바나나", "NNG", None, None);

        let system_surfaces: HashSet<String> = ["사과", "바나나", "포도"]
            .iter()
            .map(|s| (*s).to_string())
            .collect();

        let conflicts = user_dict.check_system_conflicts(&system_surfaces);
        assert_eq!(conflicts.len(), 2);

        let surfaces: Vec<&str> = conflicts.iter().map(|(_, s, _)| s.as_str()).collect();
        assert!(surfaces.contains(&"사과"));
        assert!(surfaces.contains(&"바나나"));
    }
}