1use std::collections::HashMap;
27use std::io::{BufRead, BufReader};
28use std::path::Path;
29
30use crate::error::{DictError, Result};
31use crate::trie::{Trie, TrieBuilder};
32use crate::Entry;
33
/// POS tags that [`is_valid_pos_tag`] accepts as individual components.
const VALID_POS_TAGS: &[&str] = &[
    "NNG", "NNP", "NNB", "NR", "NP",
    "VV", "VA", "VX", "VCP", "VCN",
    "MM", "MAG", "MAJ", "IC",
    "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
    "EP", "EF", "EC", "ETN", "ETM",
    "XPN", "XSN", "XSV", "XSA", "XR",
    "SF", "SE", "SS", "SP", "SO", "SW", "SL", "SH", "SN",
    "NA",
];

/// Returns `true` when `pos` is a known tag or a `+`-joined sequence of known tags.
///
/// Every `+`-separated component must appear in [`VALID_POS_TAGS`], so the empty
/// string, a leading or trailing `+`, and a doubled `+` are all rejected.
#[must_use]
pub fn is_valid_pos_tag(pos: &str) -> bool {
    // Without a `+`, `split` yields `pos` itself, so a single pass covers both
    // the plain and the compound form.
    pos.split('+').all(|tag| VALID_POS_TAGS.contains(&tag))
}
63
/// Guesses a POS tag for `surface` from the characters it contains.
///
/// The rules are tried in this order and the first match wins:
///
/// 1. empty string → `"NA"`
/// 2. only ASCII digits → `"SN"`
/// 3. only ASCII letters and digits → `"SL"`
/// 4. first character is Hangul:
///    - last character is `다`, `하` or `되` → `"VV"`
///    - contains an ASCII letter → `"NNP"`
///    - otherwise → `"NNG"`
/// 5. first character is ASCII punctuation → `"SW"`
/// 6. first character is Hanja → `"SH"`
/// 7. anything else → `"NNG"`
///
/// This is a heuristic: only the first and last characters (plus the
/// whole-string ASCII checks) are inspected.
#[must_use]
pub fn estimate_pos(surface: &str) -> &'static str {
    let mut chars = surface.chars();
    let first_char = match chars.next() {
        Some(c) => c,
        None => return "NA",
    };
    // For a one-character surface the first character is also the last.
    let last_char = chars.next_back().unwrap_or(first_char);

    if surface.chars().all(|c| c.is_ascii_digit()) {
        return "SN";
    }

    // Pure-letter input is a special case of this check, so no separate
    // alphabetic branch is needed.
    if surface.chars().all(|c| c.is_ascii_alphanumeric()) {
        return "SL";
    }

    if is_hangul(first_char) {
        if matches!(last_char, '다' | '하' | '되') {
            return "VV";
        }
        if surface.chars().any(|c| c.is_ascii_alphabetic()) {
            return "NNP";
        }
        return "NNG";
    }

    if first_char.is_ascii_punctuation() {
        return "SW";
    }

    if is_hanja(first_char) {
        return "SH";
    }

    "NNG"
}

/// Returns `true` for Hangul Syllables (U+AC00–U+D7A3), Hangul Jamo
/// (U+1100–U+11FF) and Hangul Compatibility Jamo (U+3130–U+318F).
fn is_hangul(c: char) -> bool {
    matches!(
        c,
        '\u{AC00}'..='\u{D7A3}' | '\u{1100}'..='\u{11FF}' | '\u{3130}'..='\u{318F}'
    )
}

/// Returns `true` for CJK Unified Ideographs (U+4E00–U+9FFF), Extension A
/// (U+3400–U+4DBF) and CJK Compatibility Ideographs (U+F900–U+FAFF).
///
/// The compatibility block is included because it holds the KS X 1001
/// pronunciation-variant hanja, which appear in text converted from legacy
/// Korean encodings.
fn is_hanja(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}'
    )
}
154
/// Findings collected while validating a dictionary.
///
/// Note that the derived `Default` yields `is_valid == false` with no messages.
#[derive(Debug, Clone, Default)]
pub struct ValidationResult {
    /// Overall verdict; `UserDictionary::validate` sets it to `errors.is_empty()`.
    pub is_valid: bool,
    /// Non-fatal findings.
    pub warnings: Vec<String>,
    /// Fatal findings.
    pub errors: Vec<String>,
}

impl ValidationResult {
    /// Returns `true` only when the result is valid *and* carries no warnings.
    #[must_use]
    pub fn is_ok(&self) -> bool {
        if !self.is_valid {
            return false;
        }
        self.warnings.is_empty()
    }

    /// Total number of recorded messages, warnings and errors combined.
    #[must_use]
    pub fn issue_count(&self) -> usize {
        let error_total = self.errors.len();
        let warning_total = self.warnings.len();
        warning_total + error_total
    }
}
179
/// Summary figures for a user dictionary.
#[derive(Debug, Clone)]
pub struct DictionaryStats {
    /// Number of entries.
    pub entry_count: usize,
    /// Number of distinct surface forms.
    pub unique_surfaces: usize,
    /// Entry count per POS tag.
    pub pos_distribution: HashMap<String, usize>,
    /// Arithmetic mean of the entry costs.
    pub average_cost: f64,
}

impl std::fmt::Display for DictionaryStats {
    /// Writes a multi-line report listing at most the ten most frequent POS tags.
    ///
    /// Tags are ordered by descending count; equal counts are ordered by tag
    /// name so the output does not depend on `HashMap` iteration order.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Dictionary Statistics:")?;
        writeln!(f, " Total entries: {}", self.entry_count)?;
        writeln!(f, " Unique surfaces: {}", self.unique_surfaces)?;
        writeln!(f, " Average cost: {:.2}", self.average_cost)?;
        writeln!(f, " POS distribution:")?;

        let mut ranked: Vec<(&String, &usize)> = self.pos_distribution.iter().collect();
        // Tags are unique map keys, so the comparison is total and an unstable
        // sort gives a fully deterministic order.
        ranked.sort_unstable_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));

        for (pos, count) in ranked.into_iter().take(10) {
            writeln!(f, " {pos}: {count}")?;
        }

        Ok(())
    }
}
211
212#[derive(Debug, Clone, PartialEq, Eq)]
214pub struct UserEntry {
215 pub surface: String,
217 pub left_id: u16,
219 pub right_id: u16,
221 pub cost: i16,
223 pub pos: String,
225 pub reading: Option<String>,
227 pub lemma: Option<String>,
229 pub feature: String,
231}
232
233impl UserEntry {
234 pub fn new(
236 surface: impl Into<String>,
237 pos: impl Into<String>,
238 cost: i16,
239 reading: Option<String>,
240 ) -> Self {
241 let surface = surface.into();
242 let pos = pos.into();
243 let feature = format!("{},*,*,{},*,*,*,*", pos, reading.as_deref().unwrap_or("*"));
244 Self {
245 surface,
246 left_id: 0, right_id: 0,
248 cost,
249 pos,
250 reading,
251 lemma: None,
252 feature,
253 }
254 }
255
256 #[must_use]
258 pub const fn with_context_ids(mut self, left_id: u16, right_id: u16) -> Self {
259 self.left_id = left_id;
260 self.right_id = right_id;
261 self
262 }
263
264 #[must_use]
266 pub fn with_lemma(mut self, lemma: impl Into<String>) -> Self {
267 self.lemma = Some(lemma.into());
268 self
269 }
270
271 #[must_use]
273 pub fn to_entry(&self) -> Entry {
274 let feature = format!(
275 "{},*,*,*,*,*,{},*",
276 self.pos,
277 self.reading.as_deref().unwrap_or("*")
278 );
279
280 Entry {
281 surface: self.surface.clone(),
282 left_id: self.left_id,
283 right_id: self.right_id,
284 cost: self.cost,
285 feature,
286 }
287 }
288}
289
290#[derive(Clone)]
294pub struct UserDictionary {
295 entries: Vec<UserEntry>,
297 surface_map: HashMap<String, Vec<usize>>,
299 trie_cache: Option<Vec<u8>>,
301 default_cost: i16,
303}
304
305impl Default for UserDictionary {
306 fn default() -> Self {
307 Self::new()
308 }
309}
310
311impl UserDictionary {
312 #[must_use]
314 pub fn new() -> Self {
315 Self {
316 entries: Vec::new(),
317 surface_map: HashMap::new(),
318 trie_cache: None,
319 default_cost: -1000, }
321 }
322
323 #[must_use]
325 pub const fn with_default_cost(mut self, cost: i16) -> Self {
326 self.default_cost = cost;
327 self
328 }
329
330 pub fn add_entry(
339 &mut self,
340 surface: impl Into<String>,
341 pos: impl Into<String>,
342 cost: Option<i16>,
343 reading: Option<String>,
344 ) -> &mut Self {
345 let surface = surface.into();
346 let cost = cost.unwrap_or(self.default_cost);
347 let entry = UserEntry::new(surface.clone(), pos, cost, reading);
348
349 let idx = self.entries.len();
350 self.entries.push(entry);
351
352 self.surface_map.entry(surface).or_default().push(idx);
353
354 self.trie_cache = None;
356
357 self
358 }
359
360 pub fn add_entry_with_ids(
362 &mut self,
363 surface: impl Into<String>,
364 pos: impl Into<String>,
365 cost: i16,
366 left_id: u16,
367 right_id: u16,
368 reading: Option<String>,
369 ) -> &mut Self {
370 let surface = surface.into();
371 let entry =
372 UserEntry::new(surface.clone(), pos, cost, reading).with_context_ids(left_id, right_id);
373
374 let idx = self.entries.len();
375 self.entries.push(entry);
376
377 self.surface_map.entry(surface).or_default().push(idx);
378
379 self.trie_cache = None;
380
381 self
382 }
383
384 pub fn load_from_csv<P: AsRef<Path>>(&mut self, path: P) -> Result<&mut Self> {
402 let file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
403 let reader = BufReader::new(file);
404
405 for (line_num, line_result) in reader.lines().enumerate() {
406 let line = line_result.map_err(DictError::Io)?;
407 let line = line.trim();
408
409 if line.is_empty() || line.starts_with('#') {
411 continue;
412 }
413
414 self.parse_csv_line(line, line_num + 1)?;
415 }
416
417 Ok(self)
418 }
419
420 pub fn load_from_str(&mut self, content: &str) -> Result<&mut Self> {
426 for (line_num, line) in content.lines().enumerate() {
427 let line = line.trim();
428
429 if line.is_empty() || line.starts_with('#') {
430 continue;
431 }
432
433 self.parse_csv_line(line, line_num + 1)?;
434 }
435
436 Ok(self)
437 }
438
439 fn parse_csv_line(&mut self, line: &str, line_num: usize) -> Result<()> {
445 let parts: Vec<&str> = line.split(',').collect();
446
447 if parts.len() < 2 {
448 return Err(DictError::Format(format!(
449 "Invalid user dictionary format at line {line_num}: expected at least 2 fields"
450 )));
451 }
452
453 let surface = parts[0].trim();
454 let pos = parts[1].trim();
455
456 if surface.is_empty() || pos.is_empty() {
457 return Err(DictError::Format(format!(
458 "Empty surface or POS at line {line_num}"
459 )));
460 }
461
462 let cost = if parts.len() > 2 && !parts[2].trim().is_empty() {
463 parts[2].trim().parse::<i16>().map_err(|_| {
464 DictError::Format(format!("Invalid cost at line {}: {}", line_num, parts[2]))
465 })?
466 } else {
467 self.default_cost
468 };
469
470 let reading = if parts.len() > 3 && !parts[3].trim().is_empty() {
471 Some(parts[3].trim().to_string())
472 } else {
473 None
474 };
475
476 if parts.len() >= 6 && !parts[4].trim().is_empty() && !parts[5].trim().is_empty() {
478 let left_id = parts[4].trim().parse::<u16>().map_err(|_| {
479 DictError::Format(format!(
480 "Invalid left_id at line {}: {}",
481 line_num, parts[4]
482 ))
483 })?;
484 let right_id = parts[5].trim().parse::<u16>().map_err(|_| {
485 DictError::Format(format!(
486 "Invalid right_id at line {}: {}",
487 line_num, parts[5]
488 ))
489 })?;
490 self.add_entry_with_ids(surface, pos, cost, left_id, right_id, reading);
491 } else {
492 self.add_entry(surface, pos, Some(cost), reading);
493 }
494
495 Ok(())
496 }
497
498 #[must_use]
500 pub fn lookup(&self, surface: &str) -> Vec<&UserEntry> {
501 self.surface_map
502 .get(surface)
503 .map(|indices| {
504 indices
505 .iter()
506 .filter_map(|&idx| self.entries.get(idx))
507 .collect()
508 })
509 .unwrap_or_default()
510 }
511
512 #[must_use]
524 pub fn common_prefix_search(&self, text: &str) -> Vec<&UserEntry> {
525 let mut results = Vec::new();
526
527 for entry in &self.entries {
529 if text.starts_with(&entry.surface) {
530 results.push(entry);
531 }
532 }
533
534 results
535 }
536
537 #[must_use]
539 pub fn entries(&self) -> &[UserEntry] {
540 &self.entries
541 }
542
543 #[must_use]
545 pub fn len(&self) -> usize {
546 self.entries.len()
547 }
548
549 #[must_use]
551 pub fn is_empty(&self) -> bool {
552 self.entries.is_empty()
553 }
554
555 pub fn build_trie(&mut self) -> Result<&[u8]> {
563 if let Some(ref cache) = self.trie_cache {
564 return Ok(cache);
565 }
566
567 if self.entries.is_empty() {
568 return Err(DictError::Format(
569 "Cannot build Trie from empty user dictionary".to_string(),
570 ));
571 }
572
573 #[allow(clippy::cast_possible_truncation)]
575 let mut trie_entries: Vec<(&str, u32)> = self
576 .surface_map
577 .iter()
578 .filter_map(|(surface, indices)| {
579 indices.first().map(|&idx| (surface.as_str(), idx as u32))
580 })
581 .collect();
582
583 trie_entries.sort_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()));
585
586 let bytes = TrieBuilder::build(&trie_entries)?;
587 self.trie_cache = Some(bytes);
588
589 Ok(self.trie_cache.as_ref().unwrap_or_else(|| unreachable!()))
591 }
592
593 #[must_use]
595 pub fn get_trie(&self) -> Option<Trie<'_>> {
596 self.trie_cache.as_ref().map(|bytes| Trie::new(bytes))
597 }
598
599 #[must_use]
601 pub fn to_entries(&self) -> Vec<Entry> {
602 self.entries.iter().map(UserEntry::to_entry).collect()
603 }
604
605 pub fn clear(&mut self) {
607 self.entries.clear();
608 self.surface_map.clear();
609 self.trie_cache = None;
610 }
611
612 #[must_use]
620 pub fn validate(&self) -> ValidationResult {
621 let mut warnings = Vec::new();
622 let mut errors = Vec::new();
623
624 for (idx, entry) in self.entries.iter().enumerate() {
625 if entry.surface.is_empty() {
627 errors.push(format!("Entry {idx}: empty surface"));
628 }
629
630 if entry.pos.is_empty() {
632 errors.push(format!("Entry {idx}: empty POS tag"));
633 }
634
635 if entry.cost == i16::MIN || entry.cost == i16::MAX {
637 warnings.push(format!(
638 "Entry {} ({}): cost {} is at extreme value",
639 idx, entry.surface, entry.cost
640 ));
641 }
642
643 if !is_valid_pos_tag(&entry.pos) {
645 warnings.push(format!(
646 "Entry {} ({}): unknown POS tag '{}'",
647 idx, entry.surface, entry.pos
648 ));
649 }
650 }
651
652 let mut seen: HashMap<(&str, &str), usize> = HashMap::new();
654 for (idx, entry) in self.entries.iter().enumerate() {
655 let key = (entry.surface.as_str(), entry.pos.as_str());
656 if let Some(&prev_idx) = seen.get(&key) {
657 warnings.push(format!(
658 "Duplicate entry at {} and {}: {} ({})",
659 prev_idx, idx, entry.surface, entry.pos
660 ));
661 } else {
662 seen.insert(key, idx);
663 }
664 }
665
666 ValidationResult {
667 is_valid: errors.is_empty(),
668 warnings,
669 errors,
670 }
671 }
672
673 pub fn remove_duplicates(&mut self) {
677 let mut seen: HashMap<(String, String), bool> = HashMap::new();
678 let mut new_entries = Vec::new();
679
680 for entry in self.entries.drain(..) {
681 let key = (entry.surface.clone(), entry.pos.clone());
682 if seen.contains_key(&key) {
683 continue;
684 }
685 seen.insert(key, true);
686 new_entries.push(entry);
687 }
688
689 self.entries = new_entries;
690 self.rebuild_surface_map();
691 self.trie_cache = None;
692 }
693
694 fn rebuild_surface_map(&mut self) {
696 self.surface_map.clear();
697 for (idx, entry) in self.entries.iter().enumerate() {
698 self.surface_map
699 .entry(entry.surface.clone())
700 .or_default()
701 .push(idx);
702 }
703 }
704
705 pub fn remove_surface(&mut self, surface: &str) -> usize {
711 if let Some(indices) = self.surface_map.remove(surface) {
712 let count = indices.len();
713
714 let mut indices_sorted = indices;
716 indices_sorted.sort_by(|a, b| b.cmp(a));
717
718 for idx in indices_sorted {
719 if idx < self.entries.len() {
720 self.entries.remove(idx);
721 }
722 }
723
724 self.rebuild_surface_map();
725 self.trie_cache = None;
726 count
727 } else {
728 0
729 }
730 }
731
732 pub fn check_csv_duplicates<P: AsRef<Path>>(path: P) -> Result<Vec<(usize, String, String)>> {
746 let file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
747 let reader = BufReader::new(file);
748
749 let mut seen: HashMap<(String, String), usize> = HashMap::new();
750 let mut duplicates = Vec::new();
751
752 for (line_num, line_result) in reader.lines().enumerate() {
753 let line = line_result.map_err(DictError::Io)?;
754 let line = line.trim();
755
756 if line.is_empty() || line.starts_with('#') {
757 continue;
758 }
759
760 let parts: Vec<&str> = line.split(',').collect();
761 if parts.len() >= 2 {
762 let surface = parts[0].trim().to_string();
763 let pos = parts[1].trim().to_string();
764 let key = (surface.clone(), pos.clone());
765
766 if let Some(&prev_line) = seen.get(&key) {
767 duplicates.push((line_num + 1, surface, pos));
768 duplicates.push((prev_line, key.0.clone(), key.1.clone()));
769 } else {
770 seen.insert(key, line_num + 1);
771 }
772 }
773 }
774
775 Ok(duplicates)
776 }
777
778 pub fn add_entry_auto_pos(
782 &mut self,
783 surface: impl Into<String>,
784 cost: Option<i16>,
785 reading: Option<String>,
786 ) -> &mut Self {
787 let surface = surface.into();
788 let pos = estimate_pos(&surface);
789 self.add_entry(surface, pos, cost, reading)
790 }
791
792 #[must_use]
804 pub fn check_system_conflicts<S: std::hash::BuildHasher>(
805 &self,
806 system_surfaces: &std::collections::HashSet<String, S>,
807 ) -> Vec<(usize, String, String)> {
808 let mut conflicts = Vec::new();
809
810 for (idx, entry) in self.entries.iter().enumerate() {
811 if system_surfaces.contains(&entry.surface) {
812 conflicts.push((idx, entry.surface.clone(), entry.pos.clone()));
813 }
814 }
815
816 conflicts
817 }
818
819 #[must_use]
821 pub fn stats(&self) -> DictionaryStats {
822 let mut pos_counts: HashMap<String, usize> = HashMap::new();
823 let mut total_cost: i64 = 0;
824
825 for entry in &self.entries {
826 *pos_counts.entry(entry.pos.clone()).or_insert(0) += 1;
827 total_cost += i64::from(entry.cost);
828 }
829
830 DictionaryStats {
831 entry_count: self.entries.len(),
832 unique_surfaces: self.surface_map.len(),
833 pos_distribution: pos_counts,
834 #[allow(clippy::cast_precision_loss)]
835 average_cost: if self.entries.is_empty() {
836 0.0
837 } else {
838 (total_cost as f64) / (self.entries.len() as f64)
840 },
841 }
842 }
843
844 pub fn save_to_csv<P: AsRef<Path>>(&self, path: P) -> Result<()> {
850 use std::io::Write;
851
852 let mut file = std::fs::File::create(path.as_ref()).map_err(DictError::Io)?;
853
854 writeln!(file, "# 사용자 정의 사전").map_err(DictError::Io)?;
855 writeln!(file, "# 표면형,품사,비용,읽기").map_err(DictError::Io)?;
856
857 for entry in &self.entries {
858 let reading = entry.reading.as_deref().unwrap_or("");
859 writeln!(
860 file,
861 "{},{},{},{}",
862 entry.surface, entry.pos, entry.cost, reading
863 )
864 .map_err(DictError::Io)?;
865 }
866
867 Ok(())
868 }
869}
870
871pub struct UserDictionaryBuilder {
873 dict: UserDictionary,
874}
875
876impl Default for UserDictionaryBuilder {
877 fn default() -> Self {
878 Self::new()
879 }
880}
881
882impl UserDictionaryBuilder {
883 #[must_use]
885 pub fn new() -> Self {
886 Self {
887 dict: UserDictionary::new(),
888 }
889 }
890
891 #[must_use]
893 pub fn default_cost(mut self, cost: i16) -> Self {
894 self.dict = self.dict.with_default_cost(cost);
895 self
896 }
897
898 #[must_use]
900 pub fn add(mut self, surface: &str, pos: &str) -> Self {
901 self.dict.add_entry(surface, pos, None, None);
902 self
903 }
904
905 #[must_use]
907 pub fn add_with_cost(mut self, surface: &str, pos: &str, cost: i16) -> Self {
908 self.dict.add_entry(surface, pos, Some(cost), None);
909 self
910 }
911
912 #[must_use]
914 pub fn add_full(mut self, surface: &str, pos: &str, cost: i16, reading: Option<&str>) -> Self {
915 self.dict
916 .add_entry(surface, pos, Some(cost), reading.map(String::from));
917 self
918 }
919
920 pub fn load_csv<P: AsRef<Path>>(mut self, path: P) -> Result<Self> {
926 self.dict.load_from_csv(path)?;
927 Ok(self)
928 }
929
930 pub fn load_str(mut self, content: &str) -> Result<Self> {
936 self.dict.load_from_str(content)?;
937 Ok(self)
938 }
939
940 #[must_use]
942 pub fn build(self) -> UserDictionary {
943 self.dict
944 }
945
946 pub fn build_with_trie(mut self) -> Result<UserDictionary> {
952 if !self.dict.is_empty() {
953 self.dict.build_trie()?;
954 }
955 Ok(self.dict)
956 }
957}
958
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Every added entry is counted.
    #[test]
    fn test_add_entry() {
        let mut dict = UserDictionary::new();
        dict.add_entry("딥러닝", "NNG", Some(-500), None)
            .add_entry("머신러닝", "NNG", None, Some("머신러닝".to_string()));

        assert_eq!(dict.len(), 2);
    }

    /// Entries sharing a surface are all returned, in insertion order.
    #[test]
    fn test_lookup() {
        let mut dict = UserDictionary::new();
        dict.add_entry("딥러닝", "NNG", Some(-500), None);
        dict.add_entry("딥러닝", "NNP", Some(-300), None);

        let found = dict.lookup("딥러닝");
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].pos, "NNG");
        assert_eq!(found[1].pos, "NNP");
    }

    /// Comments and blank lines are skipped; a blank cost uses the default.
    #[test]
    fn test_load_from_str() {
        let csv = r"
# 사용자 사전
형태소분석,NNG,-1000,형태소분석
딥러닝,NNG,-500,
자연어처리,NNG,,자연어처리
";
        let mut dict = UserDictionary::new();
        dict.load_from_str(csv).expect("should load");

        assert_eq!(dict.len(), 3);

        let first = dict.lookup("형태소분석");
        assert_eq!(first.len(), 1);
        assert_eq!(first[0].cost, -1000);
        assert_eq!(first[0].reading.as_deref(), Some("형태소분석"));

        let second = dict.lookup("딥러닝");
        assert_eq!(second.len(), 1);
        assert_eq!(second[0].cost, -500);

        let third = dict.lookup("자연어처리");
        assert_eq!(third.len(), 1);
        assert_eq!(third[0].cost, -1000);
    }

    /// The built trie finds exactly the stored surfaces.
    #[test]
    fn test_build_trie() {
        let mut dict = UserDictionary::new();
        dict.add_entry("가", "NNG", Some(0), None);
        dict.add_entry("가다", "VV", Some(0), None);
        dict.add_entry("가방", "NNG", Some(0), None);

        let bytes = dict.build_trie().expect("should build");
        assert!(!bytes.is_empty());

        let trie = dict.get_trie().expect("should have trie");
        for present in ["가", "가다", "가방"] {
            assert!(trie.exact_match(present).is_some());
        }
        assert!(trie.exact_match("없음").is_none());
    }

    /// The builder applies the default cost and explicit costs.
    #[test]
    fn test_builder_pattern() {
        let dict = UserDictionaryBuilder::new()
            .default_cost(-500)
            .add("딥러닝", "NNG")
            .add_with_cost("머신러닝", "NNG", -300)
            .add_full("자연어처리", "NNG", -400, Some("자연어처리"))
            .build();

        assert_eq!(dict.len(), 3);

        let with_default = dict.lookup("딥러닝");
        assert_eq!(with_default[0].cost, -500);

        let with_explicit = dict.lookup("머신러닝");
        assert_eq!(with_explicit[0].cost, -300);
    }

    /// Conversion keeps surface and cost and embeds POS and reading in the feature.
    #[test]
    fn test_to_entry() {
        let user_entry = UserEntry::new("테스트", "NNG", -100, Some("테스트".to_string()));
        let converted = user_entry.to_entry();

        assert_eq!(converted.surface, "테스트");
        assert_eq!(converted.cost, -100);
        assert!(converted.feature.contains("NNG"));
        assert!(converted.feature.contains("테스트"));
    }

    /// Mixed Hangul/ASCII surfaces are stored and looked up verbatim.
    #[test]
    fn test_korean_entries() {
        let mut dict = UserDictionary::new();
        dict.add_entry("챗GPT", "NNP", Some(-1000), Some("챗지피티".to_string()));
        dict.add_entry("클로드", "NNP", Some(-1000), None);
        dict.add_entry("라마", "NNP", Some(-1000), None);
        dict.add_entry("메타", "NNP", Some(-800), None);
        dict.add_entry("앤트로픽", "NNP", Some(-1000), None);

        assert_eq!(dict.len(), 5);

        let found = dict.lookup("챗GPT");
        assert_eq!(found[0].reading.as_deref(), Some("챗지피티"));
    }

    /// `clear` empties the dictionary.
    #[test]
    fn test_clear() {
        let mut dict = UserDictionary::new();
        dict.add_entry("테스트", "NNG", None, None);
        assert_eq!(dict.len(), 1);

        dict.clear();
        assert!(dict.is_empty());
    }

    /// A line with a single column is rejected.
    #[test]
    fn test_invalid_csv() {
        let mut dict = UserDictionary::new();
        let outcome = dict.load_from_str("표면형만");
        assert!(outcome.is_err());
    }

    /// The trie reports every stored prefix of the query.
    #[test]
    fn test_common_prefix_search() {
        let mut dict = UserDictionary::new();
        dict.add_entry("형태", "NNG", Some(0), None);
        dict.add_entry("형태소", "NNG", Some(0), None);
        dict.add_entry("형태소분석", "NNG", Some(0), None);

        dict.build_trie().expect("should build");

        let trie = dict.get_trie().expect("should have trie");
        let hits = trie.common_prefix_search("형태소분석기").count();
        assert_eq!(hits, 3);
    }

    /// Explicit context IDs are stored on the entry.
    #[test]
    fn test_with_context_ids() {
        let mut dict = UserDictionary::new();
        dict.add_entry_with_ids("테스트", "NNG", -100, 1234, 5678, None);

        let found = dict.lookup("테스트");
        assert_eq!(found[0].left_id, 1234);
        assert_eq!(found[0].right_id, 5678);
    }

    /// Well-formed entries validate cleanly.
    #[test]
    fn test_validate() {
        let mut dict = UserDictionary::new();
        dict.add_entry("테스트", "NNG", Some(-100), None);
        dict.add_entry("유효", "VV", Some(-200), None);

        let report = dict.validate();
        assert!(report.is_valid);
    }

    /// An unknown POS tag is a warning, not an error.
    #[test]
    fn test_validate_with_invalid_pos() {
        let mut dict = UserDictionary::new();
        dict.add_entry("테스트", "INVALID_POS", Some(-100), None);

        let report = dict.validate();
        assert!(report.is_valid);
        assert!(!report.warnings.is_empty());
    }

    /// Only repeated (surface, pos) pairs are removed.
    #[test]
    fn test_remove_duplicates() {
        let mut dict = UserDictionary::new();
        dict.add_entry("테스트", "NNG", Some(-100), None);
        dict.add_entry("테스트", "NNG", Some(-200), None);
        dict.add_entry("테스트", "VV", Some(-300), None);
        assert_eq!(dict.len(), 3);

        dict.remove_duplicates();
        assert_eq!(dict.len(), 2);
    }

    /// Removing a surface drops all of its entries and nothing else.
    #[test]
    fn test_remove_surface() {
        let mut dict = UserDictionary::new();
        dict.add_entry("삭제", "NNG", Some(-100), None);
        dict.add_entry("삭제", "VV", Some(-200), None);
        dict.add_entry("유지", "NNG", Some(-100), None);

        let removed = dict.remove_surface("삭제");
        assert_eq!(removed, 2);
        assert_eq!(dict.len(), 1);
        assert!(dict.lookup("삭제").is_empty());
    }

    /// Statistics count entries, surfaces and tags.
    #[test]
    fn test_stats() {
        let mut dict = UserDictionary::new();
        dict.add_entry("명사1", "NNG", Some(-100), None);
        dict.add_entry("명사2", "NNG", Some(-200), None);
        dict.add_entry("동사", "VV", Some(-150), None);

        let summary = dict.stats();
        assert_eq!(summary.entry_count, 3);
        assert_eq!(summary.unique_surfaces, 3);
        assert_eq!(summary.pos_distribution.get("NNG"), Some(&2));
        assert_eq!(summary.pos_distribution.get("VV"), Some(&1));
    }

    /// Plain and compound tags are recognised; unknown tags are not.
    #[test]
    fn test_is_valid_pos_tag() {
        assert!(is_valid_pos_tag("NNG"));
        assert!(is_valid_pos_tag("VV"));
        assert!(is_valid_pos_tag("NNG+JX"));
        assert!(!is_valid_pos_tag("INVALID"));
    }

    /// One representative surface per heuristic outcome.
    #[test]
    fn test_estimate_pos() {
        assert_eq!(estimate_pos("GPT"), "SL");
        assert_eq!(estimate_pos("BTS"), "SL");

        assert_eq!(estimate_pos("123"), "SN");

        assert_eq!(estimate_pos("챗GPT"), "NNP");

        assert_eq!(estimate_pos("하다"), "VV");
        assert_eq!(estimate_pos("먹다"), "VV");

        assert_eq!(estimate_pos("메타버스"), "NNG");
        assert_eq!(estimate_pos("사과"), "NNG");

        assert_eq!(estimate_pos(""), "NA");
    }

    /// The estimated tag is stored on the added entry.
    #[test]
    fn test_add_entry_auto_pos() {
        let mut dict = UserDictionary::new();
        dict.add_entry_auto_pos("GPT", None, None);
        dict.add_entry_auto_pos("챗GPT", None, None);
        dict.add_entry_auto_pos("메타버스", None, None);

        let latin = dict.lookup("GPT");
        assert_eq!(latin[0].pos, "SL");

        let mixed = dict.lookup("챗GPT");
        assert_eq!(mixed[0].pos, "NNP");

        let hangul = dict.lookup("메타버스");
        assert_eq!(hangul[0].pos, "NNG");
    }

    /// Only surfaces present in the system set are reported.
    #[test]
    fn test_check_system_conflicts() {
        use std::collections::HashSet;

        let mut dict = UserDictionary::new();
        dict.add_entry("사과", "NNG", None, None);
        dict.add_entry("챗GPT", "NNP", None, None);
        dict.add_entry("바나나", "NNG", None, None);

        let system_surfaces: HashSet<String> = ["사과", "바나나", "포도"]
            .iter()
            .map(|s| (*s).to_string())
            .collect();

        let conflicts = dict.check_system_conflicts(&system_surfaces);
        assert_eq!(conflicts.len(), 2);

        let surfaces: Vec<&str> = conflicts.iter().map(|(_, s, _)| s.as_str()).collect();
        assert!(surfaces.contains(&"사과"));
        assert!(surfaces.contains(&"바나나"));
    }
}