1use std::collections::HashMap;
27use std::io::{BufRead, BufReader};
28use std::path::Path;
29
30use crate::error::{DictError, Result};
31use crate::trie::{Trie, TrieBuilder};
32use crate::Entry;
33
/// Part-of-speech tags accepted by [`is_valid_pos_tag`].
///
/// Compound tags are spelled by joining members of this list with `+`.
/// The grouping comments below only describe the shared spelling of each
/// group of tags.
const VALID_POS_TAGS: &[&str] = &[
    // Tags starting with `N`.
    "NNG", "NNP", "NNB", "NR", "NP",
    // Tags starting with `V`.
    "VV", "VA", "VX", "VCP", "VCN",
    // `MM`, the `MA*` tags and `IC`.
    "MM", "MAG", "MAJ", "IC",
    // Tags starting with `J`.
    "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
    // Tags starting with `E`.
    "EP", "EF", "EC", "ETN", "ETM",
    // Tags starting with `X`.
    "XPN", "XSN", "XSV", "XSA", "XR",
    // Tags starting with `S`; `estimate_pos` returns `SL`, `SH`, `SN` and `SW`.
    "SF", "SE", "SS", "SP", "SO", "SW",
    "SL", "SH", "SN",
    // Returned by `estimate_pos` for an empty surface.
    "NA",
];
55
56#[must_use]
58pub fn is_valid_pos_tag(pos: &str) -> bool {
59 if VALID_POS_TAGS.contains(&pos) {
61 return true;
62 }
63
64 if pos.contains('+') {
66 return pos.split('+').all(|p| VALID_POS_TAGS.contains(&p));
67 }
68
69 false
70}
71
72#[must_use]
84pub fn estimate_pos(surface: &str) -> &'static str {
85 if surface.is_empty() {
87 return "NA";
88 }
89
90 let chars: Vec<char> = surface.chars().collect();
91 let first_char = chars[0];
92 let last_char = *chars.last().unwrap_or(&first_char);
93
94 if surface.chars().all(|c| c.is_ascii_digit()) {
96 return "SN";
97 }
98
99 if surface.chars().all(|c| c.is_ascii_alphabetic()) {
101 if surface.chars().all(|c| c.is_ascii_uppercase()) {
103 return "SL"; }
105 return "SL"; }
107
108 if surface.chars().all(|c| c.is_ascii_alphanumeric()) {
110 return "SL";
111 }
112
113 if is_hangul(first_char) {
115 if matches!(last_char, '다' | '하' | '되') {
117 return "VV"; }
119
120 if matches!(last_char, '이' | '히' | '게' | '로' | '리') && chars.len() >= 2 {
122 }
125
126 if surface.chars().any(|c| c.is_ascii_alphabetic()) {
129 return "NNP"; }
131
132 return "NNG";
134 }
135
136 if first_char.is_ascii_punctuation() {
138 return "SW";
139 }
140
141 if is_hanja(first_char) {
143 return "SH";
144 }
145
146 "NNG"
148}
149
/// Returns `true` when `c` lies in one of three Hangul ranges:
/// U+AC00–U+D7A3 (precomposed syllables), U+1100–U+11FF (Hangul Jamo) or
/// U+3130–U+318F (Hangul Compatibility Jamo).
fn is_hangul(c: char) -> bool {
    matches!(
        c,
        '\u{AC00}'..='\u{D7A3}' | '\u{1100}'..='\u{11FF}' | '\u{3130}'..='\u{318F}'
    )
}
156
/// Returns `true` when `c` lies in U+4E00–U+9FFF (CJK Unified Ideographs) or
/// U+3400–U+4DBF (CJK Unified Ideographs Extension A).
fn is_hanja(c: char) -> bool {
    matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}')
}
162
/// Outcome of a dictionary validation pass.
///
/// Note that the derived `Default` yields `is_valid == false` with no
/// warnings and no errors.
#[derive(Debug, Clone, Default)]
pub struct ValidationResult {
    /// Overall verdict; `UserDictionary::validate` sets it to `true` exactly
    /// when it recorded no errors.
    pub is_valid: bool,
    /// Non-fatal findings.
    pub warnings: Vec<String>,
    /// Fatal findings.
    pub errors: Vec<String>,
}

impl ValidationResult {
    /// Returns `true` only when the result is valid *and* free of warnings.
    #[must_use]
    pub fn is_ok(&self) -> bool {
        if !self.is_valid {
            return false;
        }
        self.warnings.is_empty()
    }

    /// Total number of recorded findings (warnings plus errors).
    #[must_use]
    pub fn issue_count(&self) -> usize {
        [&self.warnings, &self.errors]
            .iter()
            .map(|findings| findings.len())
            .sum()
    }
}
187
/// Summary figures for a user dictionary.
#[derive(Debug, Clone)]
pub struct DictionaryStats {
    /// Total number of entries.
    pub entry_count: usize,
    /// Number of distinct surface forms.
    pub unique_surfaces: usize,
    /// Number of entries per POS tag.
    pub pos_distribution: HashMap<String, usize>,
    /// Arithmetic mean of the entry costs.
    pub average_cost: f64,
}

impl std::fmt::Display for DictionaryStats {
    /// Writes a multi-line report: the totals, then at most ten POS tags,
    /// most frequent first.
    ///
    /// Tags with the same count are ordered by tag name so the report does
    /// not depend on `HashMap` iteration order.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Dictionary Statistics:")?;
        writeln!(f, " Total entries: {}", self.entry_count)?;
        writeln!(f, " Unique surfaces: {}", self.unique_surfaces)?;
        writeln!(f, " Average cost: {:.2}", self.average_cost)?;
        writeln!(f, " POS distribution:")?;

        let mut pos_sorted: Vec<(&String, &usize)> = self.pos_distribution.iter().collect();
        // Descending by count, then ascending by tag. Keys are unique, so the
        // order is total and an unstable sort gives a deterministic result.
        pos_sorted.sort_unstable_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));

        for (pos, count) in pos_sorted.into_iter().take(10) {
            writeln!(f, " {pos}: {count}")?;
        }

        Ok(())
    }
}
219
220#[derive(Debug, Clone, PartialEq, Eq)]
222pub struct UserEntry {
223 pub surface: String,
225 pub left_id: u16,
227 pub right_id: u16,
229 pub cost: i16,
231 pub pos: String,
233 pub reading: Option<String>,
235 pub lemma: Option<String>,
237 pub feature: String,
239}
240
241impl UserEntry {
242 pub fn new(
244 surface: impl Into<String>,
245 pos: impl Into<String>,
246 cost: i16,
247 reading: Option<String>,
248 ) -> Self {
249 let surface = surface.into();
250 let pos = pos.into();
251 let feature = format!("{},*,*,{},*,*,*,*", pos, reading.as_deref().unwrap_or("*"));
252 Self {
253 surface,
254 left_id: 0, right_id: 0,
256 cost,
257 pos,
258 reading,
259 lemma: None,
260 feature,
261 }
262 }
263
264 #[must_use]
266 pub const fn with_context_ids(mut self, left_id: u16, right_id: u16) -> Self {
267 self.left_id = left_id;
268 self.right_id = right_id;
269 self
270 }
271
272 #[must_use]
274 pub fn with_lemma(mut self, lemma: impl Into<String>) -> Self {
275 self.lemma = Some(lemma.into());
276 self
277 }
278
279 #[must_use]
281 pub fn to_entry(&self) -> Entry {
282 let feature = format!(
283 "{},*,*,*,*,*,{},*",
284 self.pos,
285 self.reading.as_deref().unwrap_or("*")
286 );
287
288 Entry {
289 surface: self.surface.clone(),
290 left_id: self.left_id,
291 right_id: self.right_id,
292 cost: self.cost,
293 feature,
294 }
295 }
296}
297
298#[derive(Clone)]
302pub struct UserDictionary {
303 entries: Vec<UserEntry>,
305 surface_map: HashMap<String, Vec<usize>>,
307 trie_cache: Option<Vec<u8>>,
309 default_cost: i16,
311}
312
313impl Default for UserDictionary {
314 fn default() -> Self {
315 Self::new()
316 }
317}
318
319impl UserDictionary {
320 #[must_use]
322 pub fn new() -> Self {
323 Self {
324 entries: Vec::new(),
325 surface_map: HashMap::new(),
326 trie_cache: None,
327 default_cost: -1000, }
329 }
330
331 #[must_use]
333 pub const fn with_default_cost(mut self, cost: i16) -> Self {
334 self.default_cost = cost;
335 self
336 }
337
338 pub fn add_entry(
347 &mut self,
348 surface: impl Into<String>,
349 pos: impl Into<String>,
350 cost: Option<i16>,
351 reading: Option<String>,
352 ) -> &mut Self {
353 let surface = surface.into();
354 let cost = cost.unwrap_or(self.default_cost);
355 let entry = UserEntry::new(surface.clone(), pos, cost, reading);
356
357 let idx = self.entries.len();
358 self.entries.push(entry);
359
360 self.surface_map.entry(surface).or_default().push(idx);
361
362 self.trie_cache = None;
364
365 self
366 }
367
368 pub fn add_entry_with_ids(
370 &mut self,
371 surface: impl Into<String>,
372 pos: impl Into<String>,
373 cost: i16,
374 left_id: u16,
375 right_id: u16,
376 reading: Option<String>,
377 ) -> &mut Self {
378 let surface = surface.into();
379 let entry =
380 UserEntry::new(surface.clone(), pos, cost, reading).with_context_ids(left_id, right_id);
381
382 let idx = self.entries.len();
383 self.entries.push(entry);
384
385 self.surface_map.entry(surface).or_default().push(idx);
386
387 self.trie_cache = None;
388
389 self
390 }
391
392 pub fn load_from_csv<P: AsRef<Path>>(&mut self, path: P) -> Result<&mut Self> {
410 let file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
411 let reader = BufReader::new(file);
412
413 for (line_num, line_result) in reader.lines().enumerate() {
414 let line = line_result.map_err(DictError::Io)?;
415 let line = line.trim();
416
417 if line.is_empty() || line.starts_with('#') {
419 continue;
420 }
421
422 self.parse_csv_line(line, line_num + 1)?;
423 }
424
425 Ok(self)
426 }
427
428 pub fn load_from_str(&mut self, content: &str) -> Result<&mut Self> {
434 for (line_num, line) in content.lines().enumerate() {
435 let line = line.trim();
436
437 if line.is_empty() || line.starts_with('#') {
438 continue;
439 }
440
441 self.parse_csv_line(line, line_num + 1)?;
442 }
443
444 Ok(self)
445 }
446
447 fn parse_csv_line(&mut self, line: &str, line_num: usize) -> Result<()> {
453 let parts: Vec<&str> = line.split(',').collect();
454
455 if parts.len() < 2 {
456 return Err(DictError::Format(format!(
457 "Invalid user dictionary format at line {line_num}: expected at least 2 fields"
458 )));
459 }
460
461 let surface = parts[0].trim();
462 let pos = parts[1].trim();
463
464 if surface.is_empty() || pos.is_empty() {
465 return Err(DictError::Format(format!(
466 "Empty surface or POS at line {line_num}"
467 )));
468 }
469
470 let cost = if parts.len() > 2 && !parts[2].trim().is_empty() {
471 parts[2].trim().parse::<i16>().map_err(|_| {
472 DictError::Format(format!("Invalid cost at line {}: {}", line_num, parts[2]))
473 })?
474 } else {
475 self.default_cost
476 };
477
478 let reading = if parts.len() > 3 && !parts[3].trim().is_empty() {
479 Some(parts[3].trim().to_string())
480 } else {
481 None
482 };
483
484 if parts.len() >= 6 && !parts[4].trim().is_empty() && !parts[5].trim().is_empty() {
486 let left_id = parts[4].trim().parse::<u16>().map_err(|_| {
487 DictError::Format(format!("Invalid left_id at line {}: {}", line_num, parts[4]))
488 })?;
489 let right_id = parts[5].trim().parse::<u16>().map_err(|_| {
490 DictError::Format(format!("Invalid right_id at line {}: {}", line_num, parts[5]))
491 })?;
492 self.add_entry_with_ids(surface, pos, cost, left_id, right_id, reading);
493 } else {
494 self.add_entry(surface, pos, Some(cost), reading);
495 }
496
497 Ok(())
498 }
499
500 #[must_use]
502 pub fn lookup(&self, surface: &str) -> Vec<&UserEntry> {
503 self.surface_map
504 .get(surface)
505 .map(|indices| {
506 indices
507 .iter()
508 .filter_map(|&idx| self.entries.get(idx))
509 .collect()
510 })
511 .unwrap_or_default()
512 }
513
514 #[must_use]
526 pub fn common_prefix_search(&self, text: &str) -> Vec<&UserEntry> {
527 let mut results = Vec::new();
528
529 for entry in &self.entries {
531 if text.starts_with(&entry.surface) {
532 results.push(entry);
533 }
534 }
535
536 results
537 }
538
539 #[must_use]
541 pub fn entries(&self) -> &[UserEntry] {
542 &self.entries
543 }
544
545 #[must_use]
547 pub fn len(&self) -> usize {
548 self.entries.len()
549 }
550
551 #[must_use]
553 pub fn is_empty(&self) -> bool {
554 self.entries.is_empty()
555 }
556
557 pub fn build_trie(&mut self) -> Result<&[u8]> {
565 if let Some(ref cache) = self.trie_cache {
566 return Ok(cache);
567 }
568
569 if self.entries.is_empty() {
570 return Err(DictError::Format(
571 "Cannot build Trie from empty user dictionary".to_string(),
572 ));
573 }
574
575 #[allow(clippy::cast_possible_truncation)]
577 let mut trie_entries: Vec<(&str, u32)> = self
578 .surface_map
579 .iter()
580 .filter_map(|(surface, indices)| {
581 indices.first().map(|&idx| (surface.as_str(), idx as u32))
582 })
583 .collect();
584
585 trie_entries.sort_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()));
587
588 let bytes = TrieBuilder::build(&trie_entries)?;
589 self.trie_cache = Some(bytes);
590
591 Ok(self.trie_cache.as_ref().unwrap_or_else(|| unreachable!()))
593 }
594
595 #[must_use]
597 pub fn get_trie(&self) -> Option<Trie<'_>> {
598 self.trie_cache.as_ref().map(|bytes| Trie::new(bytes))
599 }
600
601 #[must_use]
603 pub fn to_entries(&self) -> Vec<Entry> {
604 self.entries.iter().map(UserEntry::to_entry).collect()
605 }
606
607 pub fn clear(&mut self) {
609 self.entries.clear();
610 self.surface_map.clear();
611 self.trie_cache = None;
612 }
613
614 #[must_use]
622 pub fn validate(&self) -> ValidationResult {
623 let mut warnings = Vec::new();
624 let mut errors = Vec::new();
625
626 for (idx, entry) in self.entries.iter().enumerate() {
627 if entry.surface.is_empty() {
629 errors.push(format!("Entry {idx}: empty surface"));
630 }
631
632 if entry.pos.is_empty() {
634 errors.push(format!("Entry {idx}: empty POS tag"));
635 }
636
637 if entry.cost == i16::MIN || entry.cost == i16::MAX {
639 warnings.push(format!(
640 "Entry {} ({}): cost {} is at extreme value",
641 idx, entry.surface, entry.cost
642 ));
643 }
644
645 if !is_valid_pos_tag(&entry.pos) {
647 warnings.push(format!(
648 "Entry {} ({}): unknown POS tag '{}'",
649 idx, entry.surface, entry.pos
650 ));
651 }
652 }
653
654 let mut seen: HashMap<(&str, &str), usize> = HashMap::new();
656 for (idx, entry) in self.entries.iter().enumerate() {
657 let key = (entry.surface.as_str(), entry.pos.as_str());
658 if let Some(&prev_idx) = seen.get(&key) {
659 warnings.push(format!(
660 "Duplicate entry at {} and {}: {} ({})",
661 prev_idx, idx, entry.surface, entry.pos
662 ));
663 } else {
664 seen.insert(key, idx);
665 }
666 }
667
668 ValidationResult {
669 is_valid: errors.is_empty(),
670 warnings,
671 errors,
672 }
673 }
674
675 pub fn remove_duplicates(&mut self) {
679 let mut seen: HashMap<(String, String), bool> = HashMap::new();
680 let mut new_entries = Vec::new();
681
682 for entry in self.entries.drain(..) {
683 let key = (entry.surface.clone(), entry.pos.clone());
684 if seen.contains_key(&key) {
685 continue;
686 }
687 seen.insert(key, true);
688 new_entries.push(entry);
689 }
690
691 self.entries = new_entries;
692 self.rebuild_surface_map();
693 self.trie_cache = None;
694 }
695
696 fn rebuild_surface_map(&mut self) {
698 self.surface_map.clear();
699 for (idx, entry) in self.entries.iter().enumerate() {
700 self.surface_map
701 .entry(entry.surface.clone())
702 .or_default()
703 .push(idx);
704 }
705 }
706
707 pub fn remove_surface(&mut self, surface: &str) -> usize {
713 if let Some(indices) = self.surface_map.remove(surface) {
714 let count = indices.len();
715
716 let mut indices_sorted = indices;
718 indices_sorted.sort_by(|a, b| b.cmp(a));
719
720 for idx in indices_sorted {
721 if idx < self.entries.len() {
722 self.entries.remove(idx);
723 }
724 }
725
726 self.rebuild_surface_map();
727 self.trie_cache = None;
728 count
729 } else {
730 0
731 }
732 }
733
734 pub fn check_csv_duplicates<P: AsRef<Path>>(
748 path: P,
749 ) -> Result<Vec<(usize, String, String)>> {
750 let file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
751 let reader = BufReader::new(file);
752
753 let mut seen: HashMap<(String, String), usize> = HashMap::new();
754 let mut duplicates = Vec::new();
755
756 for (line_num, line_result) in reader.lines().enumerate() {
757 let line = line_result.map_err(DictError::Io)?;
758 let line = line.trim();
759
760 if line.is_empty() || line.starts_with('#') {
761 continue;
762 }
763
764 let parts: Vec<&str> = line.split(',').collect();
765 if parts.len() >= 2 {
766 let surface = parts[0].trim().to_string();
767 let pos = parts[1].trim().to_string();
768 let key = (surface.clone(), pos.clone());
769
770 if let Some(&prev_line) = seen.get(&key) {
771 duplicates.push((line_num + 1, surface, pos));
772 duplicates.push((prev_line, key.0.clone(), key.1.clone()));
773 } else {
774 seen.insert(key, line_num + 1);
775 }
776 }
777 }
778
779 Ok(duplicates)
780 }
781
782 pub fn add_entry_auto_pos(
786 &mut self,
787 surface: impl Into<String>,
788 cost: Option<i16>,
789 reading: Option<String>,
790 ) -> &mut Self {
791 let surface = surface.into();
792 let pos = estimate_pos(&surface);
793 self.add_entry(surface, pos, cost, reading)
794 }
795
796 #[must_use]
808 pub fn check_system_conflicts<S: std::hash::BuildHasher>(
809 &self,
810 system_surfaces: &std::collections::HashSet<String, S>,
811 ) -> Vec<(usize, String, String)> {
812 let mut conflicts = Vec::new();
813
814 for (idx, entry) in self.entries.iter().enumerate() {
815 if system_surfaces.contains(&entry.surface) {
816 conflicts.push((idx, entry.surface.clone(), entry.pos.clone()));
817 }
818 }
819
820 conflicts
821 }
822
823 #[must_use]
825 pub fn stats(&self) -> DictionaryStats {
826 let mut pos_counts: HashMap<String, usize> = HashMap::new();
827 let mut total_cost: i64 = 0;
828
829 for entry in &self.entries {
830 *pos_counts.entry(entry.pos.clone()).or_insert(0) += 1;
831 total_cost += i64::from(entry.cost);
832 }
833
834 DictionaryStats {
835 entry_count: self.entries.len(),
836 unique_surfaces: self.surface_map.len(),
837 pos_distribution: pos_counts,
838 #[allow(clippy::cast_precision_loss)]
839 average_cost: if self.entries.is_empty() {
840 0.0
841 } else {
842 (total_cost as f64) / (self.entries.len() as f64)
844 },
845 }
846 }
847
848 pub fn save_to_csv<P: AsRef<Path>>(&self, path: P) -> Result<()> {
854 use std::io::Write;
855
856 let mut file = std::fs::File::create(path.as_ref()).map_err(DictError::Io)?;
857
858 writeln!(file, "# 사용자 정의 사전").map_err(DictError::Io)?;
859 writeln!(file, "# 표면형,품사,비용,읽기").map_err(DictError::Io)?;
860
861 for entry in &self.entries {
862 let reading = entry.reading.as_deref().unwrap_or("");
863 writeln!(
864 file,
865 "{},{},{},{}",
866 entry.surface, entry.pos, entry.cost, reading
867 )
868 .map_err(DictError::Io)?;
869 }
870
871 Ok(())
872 }
873}
874
875pub struct UserDictionaryBuilder {
877 dict: UserDictionary,
878}
879
880impl Default for UserDictionaryBuilder {
881 fn default() -> Self {
882 Self::new()
883 }
884}
885
886impl UserDictionaryBuilder {
887 #[must_use]
889 pub fn new() -> Self {
890 Self {
891 dict: UserDictionary::new(),
892 }
893 }
894
895 #[must_use]
897 pub fn default_cost(mut self, cost: i16) -> Self {
898 self.dict = self.dict.with_default_cost(cost);
899 self
900 }
901
902 #[must_use]
904 pub fn add(mut self, surface: &str, pos: &str) -> Self {
905 self.dict.add_entry(surface, pos, None, None);
906 self
907 }
908
909 #[must_use]
911 pub fn add_with_cost(mut self, surface: &str, pos: &str, cost: i16) -> Self {
912 self.dict.add_entry(surface, pos, Some(cost), None);
913 self
914 }
915
916 #[must_use]
918 pub fn add_full(mut self, surface: &str, pos: &str, cost: i16, reading: Option<&str>) -> Self {
919 self.dict
920 .add_entry(surface, pos, Some(cost), reading.map(String::from));
921 self
922 }
923
924 pub fn load_csv<P: AsRef<Path>>(mut self, path: P) -> Result<Self> {
930 self.dict.load_from_csv(path)?;
931 Ok(self)
932 }
933
934 pub fn load_str(mut self, content: &str) -> Result<Self> {
940 self.dict.load_from_str(content)?;
941 Ok(self)
942 }
943
944 #[must_use]
946 pub fn build(self) -> UserDictionary {
947 self.dict
948 }
949
950 pub fn build_with_trie(mut self) -> Result<UserDictionary> {
956 if !self.dict.is_empty() {
957 self.dict.build_trie()?;
958 }
959 Ok(self.dict)
960 }
961}
962
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Two distinct surfaces produce two entries.
    #[test]
    fn test_add_entry() {
        let mut dict = UserDictionary::new();
        dict.add_entry("딥러닝", "NNG", Some(-500), None)
            .add_entry("머신러닝", "NNG", None, Some(String::from("머신러닝")));

        assert_eq!(dict.len(), 2);
    }

    /// Entries sharing a surface are returned in insertion order.
    #[test]
    fn test_lookup() {
        let mut dict = UserDictionary::new();
        dict.add_entry("딥러닝", "NNG", Some(-500), None);
        dict.add_entry("딥러닝", "NNP", Some(-300), None);

        let found = dict.lookup("딥러닝");
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].pos, "NNG");
        assert_eq!(found[1].pos, "NNP");
    }

    /// Comments and blank lines are skipped; a blank cost uses the default.
    #[test]
    fn test_load_from_str() {
        let csv = "\n# 사용자 사전\n형태소분석,NNG,-1000,형태소분석\n딥러닝,NNG,-500,\n자연어처리,NNG,,자연어처리\n";
        let mut dict = UserDictionary::new();
        dict.load_from_str(csv).expect("should load");

        assert_eq!(dict.len(), 3);

        let found = dict.lookup("형태소분석");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].cost, -1000);
        assert_eq!(found[0].reading.as_deref(), Some("형태소분석"));

        let found = dict.lookup("딥러닝");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].cost, -500);

        let found = dict.lookup("자연어처리");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].cost, -1000);
    }

    /// The built trie answers exact-match queries for every surface.
    #[test]
    fn test_build_trie() {
        let mut dict = UserDictionary::new();
        dict.add_entry("가", "NNG", Some(0), None);
        dict.add_entry("가다", "VV", Some(0), None);
        dict.add_entry("가방", "NNG", Some(0), None);

        assert!(!dict.build_trie().expect("should build").is_empty());

        let trie = dict.get_trie().expect("should have trie");
        assert!(trie.exact_match("가").is_some());
        assert!(trie.exact_match("가다").is_some());
        assert!(trie.exact_match("가방").is_some());
        assert!(trie.exact_match("없음").is_none());
    }

    /// The builder applies the default cost and explicit costs.
    #[test]
    fn test_builder_pattern() {
        let dict = UserDictionaryBuilder::new()
            .default_cost(-500)
            .add("딥러닝", "NNG")
            .add_with_cost("머신러닝", "NNG", -300)
            .add_full("자연어처리", "NNG", -400, Some("자연어처리"))
            .build();

        assert_eq!(dict.len(), 3);
        assert_eq!(dict.lookup("딥러닝")[0].cost, -500);
        assert_eq!(dict.lookup("머신러닝")[0].cost, -300);
    }

    /// Conversion keeps surface and cost and mentions POS and reading.
    #[test]
    fn test_to_entry() {
        let converted =
            UserEntry::new("테스트", "NNG", -100, Some(String::from("테스트"))).to_entry();

        assert_eq!(converted.surface, "테스트");
        assert_eq!(converted.cost, -100);
        assert!(converted.feature.contains("NNG"));
        assert!(converted.feature.contains("테스트"));
    }

    /// Mixed Hangul/ASCII surfaces and readings are stored as given.
    #[test]
    fn test_korean_entries() {
        let mut dict = UserDictionary::new();
        dict.add_entry("챗GPT", "NNP", Some(-1000), Some(String::from("챗지피티")));
        dict.add_entry("클로드", "NNP", Some(-1000), None);
        dict.add_entry("라마", "NNP", Some(-1000), None);
        dict.add_entry("메타", "NNP", Some(-800), None);
        dict.add_entry("앤트로픽", "NNP", Some(-1000), None);

        assert_eq!(dict.len(), 5);
        assert_eq!(dict.lookup("챗GPT")[0].reading.as_deref(), Some("챗지피티"));
    }

    /// `clear` empties the dictionary.
    #[test]
    fn test_clear() {
        let mut dict = UserDictionary::new();
        dict.add_entry("테스트", "NNG", None, None);
        assert_eq!(dict.len(), 1);

        dict.clear();
        assert!(dict.is_empty());
    }

    /// A line with a single field is rejected.
    #[test]
    fn test_invalid_csv() {
        let mut dict = UserDictionary::new();
        assert!(dict.load_from_str("표면형만").is_err());
    }

    /// The trie finds all three nested prefixes.
    #[test]
    fn test_common_prefix_search() {
        let mut dict = UserDictionary::new();
        dict.add_entry("형태", "NNG", Some(0), None);
        dict.add_entry("형태소", "NNG", Some(0), None);
        dict.add_entry("형태소분석", "NNG", Some(0), None);

        dict.build_trie().expect("should build");
        let trie = dict.get_trie().expect("should have trie");

        assert_eq!(trie.common_prefix_search("형태소분석기").count(), 3);
    }

    /// Explicit context IDs are stored on the entry.
    #[test]
    fn test_with_context_ids() {
        let mut dict = UserDictionary::new();
        dict.add_entry_with_ids("테스트", "NNG", -100, 1234, 5678, None);

        let found = dict.lookup("테스트");
        assert_eq!(found[0].left_id, 1234);
        assert_eq!(found[0].right_id, 5678);
    }

    /// Well-formed entries validate.
    #[test]
    fn test_validate() {
        let mut dict = UserDictionary::new();
        dict.add_entry("테스트", "NNG", Some(-100), None);
        dict.add_entry("유효", "VV", Some(-200), None);

        assert!(dict.validate().is_valid);
    }

    /// An unknown POS tag is a warning, not an error.
    #[test]
    fn test_validate_with_invalid_pos() {
        let mut dict = UserDictionary::new();
        dict.add_entry("테스트", "INVALID_POS", Some(-100), None);

        let report = dict.validate();
        assert!(report.is_valid);
        assert!(!report.warnings.is_empty());
    }

    /// Duplicates are identified by surface and POS; cost is ignored.
    #[test]
    fn test_remove_duplicates() {
        let mut dict = UserDictionary::new();
        dict.add_entry("테스트", "NNG", Some(-100), None);
        dict.add_entry("테스트", "NNG", Some(-200), None);
        dict.add_entry("테스트", "VV", Some(-300), None);
        assert_eq!(dict.len(), 3);

        dict.remove_duplicates();
        assert_eq!(dict.len(), 2);
    }

    /// Removing a surface drops all of its entries and nothing else.
    #[test]
    fn test_remove_surface() {
        let mut dict = UserDictionary::new();
        dict.add_entry("삭제", "NNG", Some(-100), None);
        dict.add_entry("삭제", "VV", Some(-200), None);
        dict.add_entry("유지", "NNG", Some(-100), None);

        assert_eq!(dict.remove_surface("삭제"), 2);
        assert_eq!(dict.len(), 1);
        assert!(dict.lookup("삭제").is_empty());
    }

    /// Counts and the per-tag distribution are reported.
    #[test]
    fn test_stats() {
        let mut dict = UserDictionary::new();
        dict.add_entry("명사1", "NNG", Some(-100), None);
        dict.add_entry("명사2", "NNG", Some(-200), None);
        dict.add_entry("동사", "VV", Some(-150), None);

        let summary = dict.stats();
        assert_eq!(summary.entry_count, 3);
        assert_eq!(summary.unique_surfaces, 3);
        assert_eq!(summary.pos_distribution.get("NNG"), Some(&2));
        assert_eq!(summary.pos_distribution.get("VV"), Some(&1));
    }

    /// Simple and compound tags are accepted; unknown ones are rejected.
    #[test]
    fn test_is_valid_pos_tag() {
        assert!(is_valid_pos_tag("NNG"));
        assert!(is_valid_pos_tag("VV"));
        assert!(is_valid_pos_tag("NNG+JX"));
        assert!(!is_valid_pos_tag("INVALID"));
    }

    /// One representative surface per estimation rule.
    #[test]
    fn test_estimate_pos() {
        assert_eq!(estimate_pos("GPT"), "SL");
        assert_eq!(estimate_pos("BTS"), "SL");

        assert_eq!(estimate_pos("123"), "SN");

        assert_eq!(estimate_pos("챗GPT"), "NNP");

        assert_eq!(estimate_pos("하다"), "VV");
        assert_eq!(estimate_pos("먹다"), "VV");

        assert_eq!(estimate_pos("메타버스"), "NNG");
        assert_eq!(estimate_pos("사과"), "NNG");

        assert_eq!(estimate_pos(""), "NA");
    }

    /// The estimated tag is stored on the added entry.
    #[test]
    fn test_add_entry_auto_pos() {
        let mut dict = UserDictionary::new();
        dict.add_entry_auto_pos("GPT", None, None);
        dict.add_entry_auto_pos("챗GPT", None, None);
        dict.add_entry_auto_pos("메타버스", None, None);

        assert_eq!(dict.lookup("GPT")[0].pos, "SL");
        assert_eq!(dict.lookup("챗GPT")[0].pos, "NNP");
        assert_eq!(dict.lookup("메타버스")[0].pos, "NNG");
    }

    /// Only surfaces present in the system set are reported.
    #[test]
    fn test_check_system_conflicts() {
        use std::collections::HashSet;

        let mut dict = UserDictionary::new();
        dict.add_entry("사과", "NNG", None, None);
        dict.add_entry("챗GPT", "NNP", None, None);
        dict.add_entry("바나나", "NNG", None, None);

        let system_surfaces: HashSet<String> = ["사과", "바나나", "포도"]
            .iter()
            .copied()
            .map(String::from)
            .collect();

        let conflicts = dict.check_system_conflicts(&system_surfaces);
        assert_eq!(conflicts.len(), 2);

        let surfaces: Vec<&str> = conflicts
            .iter()
            .map(|(_, surface, _)| surface.as_str())
            .collect();
        assert!(surfaces.contains(&"사과"));
        assert!(surfaces.contains(&"바나나"));
    }
}