use std::collections::HashMap;
use std::io::{BufRead, BufReader};
use std::path::Path;
use crate::error::{DictError, Result};
use crate::trie::{Trie, TrieBuilder};
use crate::Entry;
/// POS (part-of-speech) tags accepted by [`is_valid_pos_tag`], following the
/// Sejong-style tag set used by Korean morphological analyzers.
const VALID_POS_TAGS: &[&str] = &[
    "NNG", "NNP", "NNB", "NR", "NP", "VV", "VA", "VX", "VCP", "VCN", "MM", "MAG", "MAJ", "IC",
    "JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC", "EP", "EF", "EC", "ETN", "ETM",
    "XPN", "XSN", "XSV", "XSA", "XR", "SF", "SE", "SS", "SP", "SO", "SW", "SL", "SH", "SN", "NA",
];

/// Returns `true` when `pos` is a known POS tag, or a `+`-joined compound
/// (e.g. `"NNG+JX"`) whose every component is a known tag.
#[must_use]
pub fn is_valid_pos_tag(pos: &str) -> bool {
    // A plain tag is simply a one-component "compound": `split('+')` yields
    // the whole string when no '+' is present, so a single split-and-check
    // handles both the simple and the compound case.
    pos.split('+').all(|component| VALID_POS_TAGS.contains(&component))
}
/// Heuristically guesses a POS tag for `surface` from its script
/// (digits, Latin, Hangul, Hanja, punctuation) and its final syllable.
///
/// Returns `"NA"` for an empty string. The heuristics are deliberately
/// coarse; they exist to auto-tag user-dictionary entries when the caller
/// did not supply a tag (see `add_entry_auto_pos`).
///
/// Cleanup vs. the previous version (behavior unchanged): removed a dead,
/// empty `if` block on adverb-like endings, collapsed a nested uppercase
/// check whose both arms returned `"SL"`, and dropped the intermediate
/// `Vec<char>` allocation.
#[must_use]
pub fn estimate_pos(surface: &str) -> &'static str {
    if surface.is_empty() {
        return "NA";
    }
    // Pure ASCII digits: numeral.
    if surface.chars().all(|c| c.is_ascii_digit()) {
        return "SN";
    }
    // Pure ASCII letters, or letters mixed with digits: foreign (Latin) word.
    // All-letter strings are also all-alphanumeric, so one check suffices;
    // all-digit strings were already caught above.
    if surface.chars().all(|c| c.is_ascii_alphanumeric()) {
        return "SL";
    }
    // `surface` is non-empty here, so `next()` always yields a char.
    let first_char = surface.chars().next().unwrap_or(' ');
    if is_hangul(first_char) {
        let last_char = surface.chars().last().unwrap_or(first_char);
        // Typical verb/adjective citation endings (…다/하/되) → verb.
        if matches!(last_char, '다' | '하' | '되') {
            return "VV";
        }
        // Hangul mixed with Latin letters (e.g. "챗GPT") → proper noun.
        if surface.chars().any(|c| c.is_ascii_alphabetic()) {
            return "NNP";
        }
        return "NNG";
    }
    if first_char.is_ascii_punctuation() {
        return "SW";
    }
    if is_hanja(first_char) {
        return "SH";
    }
    // Fallback: treat anything else as a common noun.
    "NNG"
}

/// True if `c` is a Hangul syllable, Hangul Jamo, or compatibility Jamo.
fn is_hangul(c: char) -> bool {
    ('\u{AC00}'..='\u{D7A3}').contains(&c)
        || ('\u{1100}'..='\u{11FF}').contains(&c)
        || ('\u{3130}'..='\u{318F}').contains(&c)
}

/// True if `c` is a CJK unified ideograph (base block or extension A).
fn is_hanja(c: char) -> bool {
    ('\u{4E00}'..='\u{9FFF}').contains(&c) || ('\u{3400}'..='\u{4DBF}').contains(&c)
}
/// Outcome of dictionary validation: hard `errors` plus softer `warnings`,
/// with `is_valid` reflecting whether any errors were found.
#[derive(Debug, Clone, Default)]
pub struct ValidationResult {
    pub is_valid: bool,
    pub warnings: Vec<String>,
    pub errors: Vec<String>,
}

impl ValidationResult {
    /// True only when the dictionary is valid *and* produced no warnings.
    #[must_use]
    pub fn is_ok(&self) -> bool {
        if !self.is_valid {
            return false;
        }
        self.warnings.is_empty()
    }

    /// Total number of reported issues (errors plus warnings).
    #[must_use]
    pub fn issue_count(&self) -> usize {
        self.errors.len() + self.warnings.len()
    }
}
/// Aggregate statistics over a user dictionary, as produced by `stats()`.
#[derive(Debug, Clone)]
pub struct DictionaryStats {
    pub entry_count: usize,
    pub unique_surfaces: usize,
    pub pos_distribution: HashMap<String, usize>,
    pub average_cost: f64,
}

impl std::fmt::Display for DictionaryStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Dictionary Statistics:")?;
        writeln!(f, " Total entries: {}", self.entry_count)?;
        writeln!(f, " Unique surfaces: {}", self.unique_surfaces)?;
        writeln!(f, " Average cost: {:.2}", self.average_cost)?;
        writeln!(f, " POS distribution:")?;
        // Show at most the ten most frequent tags, highest count first.
        let mut by_count: Vec<_> = self.pos_distribution.iter().collect();
        by_count.sort_by_key(|&(_, count)| std::cmp::Reverse(*count));
        for (pos, count) in by_count.into_iter().take(10) {
            writeln!(f, " {pos}: {count}")?;
        }
        Ok(())
    }
}
/// A single user-dictionary entry prior to export into the analyzer's
/// [`Entry`] format.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UserEntry {
    // Surface form exactly as it appears in text.
    pub surface: String,
    // Left context id; 0 lets the analyzer choose.
    pub left_id: u16,
    // Right context id; 0 lets the analyzer choose.
    pub right_id: u16,
    // Word cost; more negative = more strongly preferred.
    pub cost: i16,
    // POS tag, e.g. "NNG".
    pub pos: String,
    // Optional reading (pronunciation).
    pub reading: Option<String>,
    // Optional lemma; informational only, not folded into `feature`.
    pub lemma: Option<String>,
    // Pre-built mecab-ko-dic style feature string: `pos,*,*,reading,*,*,*,*`
    // (reading in the 4th field).
    pub feature: String,
}

impl UserEntry {
    /// Creates an entry with zeroed context ids and a feature string in the
    /// mecab-ko-dic layout (reading in the 4th field, `*` when absent).
    pub fn new(
        surface: impl Into<String>,
        pos: impl Into<String>,
        cost: i16,
        reading: Option<String>,
    ) -> Self {
        let surface = surface.into();
        let pos = pos.into();
        let feature = format!("{},*,*,{},*,*,*,*", pos, reading.as_deref().unwrap_or("*"));
        Self {
            surface,
            left_id: 0,
            right_id: 0,
            cost,
            pos,
            reading,
            lemma: None,
            feature,
        }
    }

    /// Sets explicit left/right context ids (builder style).
    #[must_use]
    pub const fn with_context_ids(mut self, left_id: u16, right_id: u16) -> Self {
        self.left_id = left_id;
        self.right_id = right_id;
        self
    }

    /// Sets the lemma (builder style). Note: does not update `feature`.
    #[must_use]
    pub fn with_lemma(mut self, lemma: impl Into<String>) -> Self {
        self.lemma = Some(lemma.into());
        self
    }

    /// Converts to the analyzer's [`Entry`] form.
    ///
    /// Reuses the stored `feature` string so the exported entry matches the
    /// layout produced by [`UserEntry::new`]. (Previously this method
    /// rebuilt a feature string with the reading in the 7th field instead of
    /// the 4th, so `self.feature` and the exported entry disagreed.)
    #[must_use]
    pub fn to_entry(&self) -> Entry {
        Entry {
            surface: self.surface.clone(),
            left_id: self.left_id,
            right_id: self.right_id,
            cost: self.cost,
            feature: self.feature.clone(),
        }
    }
}
/// An in-memory user dictionary: a list of entries plus a surface-to-index
/// map for O(1) exact lookup and an optional cached trie for prefix search.
#[derive(Clone)]
pub struct UserDictionary {
// All entries, in insertion order; indices here are referenced by `surface_map`.
entries: Vec<UserEntry>,
// Maps a surface form to the indices of every entry with that surface.
surface_map: HashMap<String, Vec<usize>>,
// Serialized trie over the surfaces; `None` until built or after any mutation.
trie_cache: Option<Vec<u8>>,
// Cost assigned to entries added without an explicit cost.
default_cost: i16,
}
impl Default for UserDictionary {
fn default() -> Self {
Self::new()
}
}
impl UserDictionary {
/// Creates an empty dictionary with a default entry cost of -1000.
#[must_use]
pub fn new() -> Self {
Self {
entries: Vec::new(),
surface_map: HashMap::new(),
trie_cache: None,
// Strongly negative so user entries tend to win over system entries.
default_cost: -1000, }
}
/// Overrides the cost used for entries added without an explicit cost.
#[must_use]
pub const fn with_default_cost(mut self, cost: i16) -> Self {
self.default_cost = cost;
self
}
/// Adds an entry with context ids left at 0; a `cost` of `None` falls back
/// to the default cost. Invalidates the cached trie.
pub fn add_entry(
&mut self,
surface: impl Into<String>,
pos: impl Into<String>,
cost: Option<i16>,
reading: Option<String>,
) -> &mut Self {
let surface = surface.into();
let cost = cost.unwrap_or(self.default_cost);
let entry = UserEntry::new(surface.clone(), pos, cost, reading);
let idx = self.entries.len();
self.entries.push(entry);
// Record the new entry's index under its surface; duplicates are allowed.
self.surface_map.entry(surface).or_default().push(idx);
self.trie_cache = None;
self
}
/// Adds an entry with explicit left/right context ids.
/// Invalidates the cached trie.
pub fn add_entry_with_ids(
&mut self,
surface: impl Into<String>,
pos: impl Into<String>,
cost: i16,
left_id: u16,
right_id: u16,
reading: Option<String>,
) -> &mut Self {
let surface = surface.into();
let entry =
UserEntry::new(surface.clone(), pos, cost, reading).with_context_ids(left_id, right_id);
let idx = self.entries.len();
self.entries.push(entry);
self.surface_map.entry(surface).or_default().push(idx);
self.trie_cache = None;
self
}
/// Loads entries from a CSV file; blank lines and `#` comment lines are
/// skipped. Line numbers in error messages are 1-based.
///
/// # Errors
/// Returns an error on I/O failure or a malformed line.
pub fn load_from_csv<P: AsRef<Path>>(&mut self, path: P) -> Result<&mut Self> {
let file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
let reader = BufReader::new(file);
for (line_num, line_result) in reader.lines().enumerate() {
let line = line_result.map_err(DictError::Io)?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
self.parse_csv_line(line, line_num + 1)?;
}
Ok(self)
}
/// Loads entries from CSV text; same format and skipping rules as
/// [`Self::load_from_csv`].
///
/// # Errors
/// Returns an error on a malformed line.
pub fn load_from_str(&mut self, content: &str) -> Result<&mut Self> {
for (line_num, line) in content.lines().enumerate() {
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
self.parse_csv_line(line, line_num + 1)?;
}
Ok(self)
}
/// Parses one CSV line of the form
/// `surface,pos[,cost[,reading[,left_id,right_id]]]` and adds the entry.
/// Empty cost/reading fields fall back to the default cost / no reading;
/// context ids are used only when both are present and non-empty.
fn parse_csv_line(&mut self, line: &str, line_num: usize) -> Result<()> {
let parts: Vec<&str> = line.split(',').collect();
if parts.len() < 2 {
return Err(DictError::Format(format!(
"Invalid user dictionary format at line {line_num}: expected at least 2 fields"
)));
}
let surface = parts[0].trim();
let pos = parts[1].trim();
if surface.is_empty() || pos.is_empty() {
return Err(DictError::Format(format!(
"Empty surface or POS at line {line_num}"
)));
}
let cost = if parts.len() > 2 && !parts[2].trim().is_empty() {
parts[2].trim().parse::<i16>().map_err(|_| {
DictError::Format(format!("Invalid cost at line {}: {}", line_num, parts[2]))
})?
} else {
self.default_cost
};
let reading = if parts.len() > 3 && !parts[3].trim().is_empty() {
Some(parts[3].trim().to_string())
} else {
None
};
if parts.len() >= 6 && !parts[4].trim().is_empty() && !parts[5].trim().is_empty() {
let left_id = parts[4].trim().parse::<u16>().map_err(|_| {
DictError::Format(format!(
"Invalid left_id at line {}: {}",
line_num, parts[4]
))
})?;
let right_id = parts[5].trim().parse::<u16>().map_err(|_| {
DictError::Format(format!(
"Invalid right_id at line {}: {}",
line_num, parts[5]
))
})?;
self.add_entry_with_ids(surface, pos, cost, left_id, right_id, reading);
} else {
self.add_entry(surface, pos, Some(cost), reading);
}
Ok(())
}
/// Returns all entries whose surface exactly matches `surface`,
/// in insertion order. Empty when there is no match.
#[must_use]
pub fn lookup(&self, surface: &str) -> Vec<&UserEntry> {
self.surface_map
.get(surface)
.map(|indices| {
indices
.iter()
.filter_map(|&idx| self.entries.get(idx))
.collect()
})
.unwrap_or_default()
}
/// Returns every entry whose surface is a prefix of `text`.
/// Linear scan over all entries; for large dictionaries, prefer building
/// the trie and using its prefix search.
#[must_use]
pub fn common_prefix_search(&self, text: &str) -> Vec<&UserEntry> {
let mut results = Vec::new();
for entry in &self.entries {
if text.starts_with(&entry.surface) {
results.push(entry);
}
}
results
}
/// All entries, in insertion order.
#[must_use]
pub fn entries(&self) -> &[UserEntry] {
&self.entries
}
/// Number of entries (duplicates counted).
#[must_use]
pub fn len(&self) -> usize {
self.entries.len()
}
/// True when the dictionary has no entries.
#[must_use]
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
/// Builds (or returns the cached) serialized trie over all surfaces.
/// Each surface is mapped to the index of its *first* entry only;
/// remaining duplicates are reachable via [`Self::lookup`].
///
/// # Errors
/// Fails on an empty dictionary or if trie construction fails.
pub fn build_trie(&mut self) -> Result<&[u8]> {
if let Some(ref cache) = self.trie_cache {
return Ok(cache);
}
if self.entries.is_empty() {
return Err(DictError::Format(
"Cannot build Trie from empty user dictionary".to_string(),
));
}
// Entry indices comfortably fit in u32 for any realistic dictionary.
#[allow(clippy::cast_possible_truncation)]
let mut trie_entries: Vec<(&str, u32)> = self
.surface_map
.iter()
.filter_map(|(surface, indices)| {
indices.first().map(|&idx| (surface.as_str(), idx as u32))
})
.collect();
// Trie construction requires keys in byte-lexicographic order.
trie_entries.sort_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()));
let bytes = TrieBuilder::build(&trie_entries)?;
self.trie_cache = Some(bytes);
// The cache was just populated above, so this cannot be `None`.
Ok(self.trie_cache.as_ref().unwrap_or_else(|| unreachable!()))
}
/// Returns a trie view over the cached bytes, or `None` if
/// [`Self::build_trie`] has not been called since the last mutation.
#[must_use]
pub fn get_trie(&self) -> Option<Trie<'_>> {
self.trie_cache.as_ref().map(|bytes| Trie::new(bytes))
}
/// Exports every entry into the analyzer's [`Entry`] format.
#[must_use]
pub fn to_entries(&self) -> Vec<Entry> {
self.entries.iter().map(UserEntry::to_entry).collect()
}
/// Removes all entries and invalidates the cached trie.
pub fn clear(&mut self) {
self.entries.clear();
self.surface_map.clear();
self.trie_cache = None;
}
/// Validates all entries: empty surface/POS are errors; extreme costs,
/// unknown POS tags, and duplicate (surface, POS) pairs are warnings.
#[must_use]
pub fn validate(&self) -> ValidationResult {
let mut warnings = Vec::new();
let mut errors = Vec::new();
for (idx, entry) in self.entries.iter().enumerate() {
if entry.surface.is_empty() {
errors.push(format!("Entry {idx}: empty surface"));
}
if entry.pos.is_empty() {
errors.push(format!("Entry {idx}: empty POS tag"));
}
if entry.cost == i16::MIN || entry.cost == i16::MAX {
warnings.push(format!(
"Entry {} ({}): cost {} is at extreme value",
idx, entry.surface, entry.cost
));
}
if !is_valid_pos_tag(&entry.pos) {
warnings.push(format!(
"Entry {} ({}): unknown POS tag '{}'",
idx, entry.surface, entry.pos
));
}
}
// Second pass: flag repeated (surface, POS) pairs, keyed on the first
// occurrence's index.
let mut seen: HashMap<(&str, &str), usize> = HashMap::new();
for (idx, entry) in self.entries.iter().enumerate() {
let key = (entry.surface.as_str(), entry.pos.as_str());
if let Some(&prev_idx) = seen.get(&key) {
warnings.push(format!(
"Duplicate entry at {} and {}: {} ({})",
prev_idx, idx, entry.surface, entry.pos
));
} else {
seen.insert(key, idx);
}
}
ValidationResult {
is_valid: errors.is_empty(),
warnings,
errors,
}
}
/// Removes all but the first entry for each (surface, POS) pair,
/// then rebuilds the surface map and invalidates the trie.
// NOTE(review): `seen` only ever stores `true`; a HashSet would convey the
// intent better — behavior is unchanged either way.
pub fn remove_duplicates(&mut self) {
let mut seen: HashMap<(String, String), bool> = HashMap::new();
let mut new_entries = Vec::new();
for entry in self.entries.drain(..) {
let key = (entry.surface.clone(), entry.pos.clone());
if seen.contains_key(&key) {
continue;
}
seen.insert(key, true);
new_entries.push(entry);
}
self.entries = new_entries;
self.rebuild_surface_map();
self.trie_cache = None;
}
/// Recomputes `surface_map` from scratch after `entries` has been reordered
/// or had elements removed (indices would otherwise be stale).
fn rebuild_surface_map(&mut self) {
self.surface_map.clear();
for (idx, entry) in self.entries.iter().enumerate() {
self.surface_map
.entry(entry.surface.clone())
.or_default()
.push(idx);
}
}
/// Removes every entry with the given surface, returning how many were
/// removed. Rebuilds the surface map and invalidates the trie.
pub fn remove_surface(&mut self, surface: &str) -> usize {
if let Some(indices) = self.surface_map.remove(surface) {
let count = indices.len();
let mut indices_sorted = indices;
// Remove from highest index down so earlier removals don't shift
// the positions of indices still to be removed.
indices_sorted.sort_by(|a, b| b.cmp(a));
for idx in indices_sorted {
if idx < self.entries.len() {
self.entries.remove(idx);
}
}
self.rebuild_surface_map();
self.trie_cache = None;
count
} else {
0
}
}
/// Scans a CSV file (same skipping rules as [`Self::load_from_csv`]) and
/// reports duplicate (surface, POS) pairs as `(line_number, surface, pos)`
/// tuples with 1-based line numbers.
// NOTE(review): the first occurrence is re-pushed for *every* later repeat,
// so a pair appearing three times yields the first line twice — confirm
// whether callers expect deduplicated output.
pub fn check_csv_duplicates<P: AsRef<Path>>(path: P) -> Result<Vec<(usize, String, String)>> {
let file = std::fs::File::open(path.as_ref()).map_err(DictError::Io)?;
let reader = BufReader::new(file);
let mut seen: HashMap<(String, String), usize> = HashMap::new();
let mut duplicates = Vec::new();
for (line_num, line_result) in reader.lines().enumerate() {
let line = line_result.map_err(DictError::Io)?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
let parts: Vec<&str> = line.split(',').collect();
if parts.len() >= 2 {
let surface = parts[0].trim().to_string();
let pos = parts[1].trim().to_string();
let key = (surface.clone(), pos.clone());
if let Some(&prev_line) = seen.get(&key) {
duplicates.push((line_num + 1, surface, pos));
duplicates.push((prev_line, key.0.clone(), key.1.clone()));
} else {
seen.insert(key, line_num + 1);
}
}
}
Ok(duplicates)
}
/// Adds an entry whose POS tag is guessed from the surface via
/// [`estimate_pos`].
pub fn add_entry_auto_pos(
&mut self,
surface: impl Into<String>,
cost: Option<i16>,
reading: Option<String>,
) -> &mut Self {
let surface = surface.into();
let pos = estimate_pos(&surface);
self.add_entry(surface, pos, cost, reading)
}
/// Returns `(index, surface, pos)` for every entry whose surface also
/// exists in `system_surfaces` (i.e. would shadow a system entry).
#[must_use]
pub fn check_system_conflicts<S: std::hash::BuildHasher>(
&self,
system_surfaces: &std::collections::HashSet<String, S>,
) -> Vec<(usize, String, String)> {
let mut conflicts = Vec::new();
for (idx, entry) in self.entries.iter().enumerate() {
if system_surfaces.contains(&entry.surface) {
conflicts.push((idx, entry.surface.clone(), entry.pos.clone()));
}
}
conflicts
}
/// Computes aggregate statistics; `average_cost` is 0.0 for an empty
/// dictionary.
#[must_use]
pub fn stats(&self) -> DictionaryStats {
let mut pos_counts: HashMap<String, usize> = HashMap::new();
// Accumulate in i64 so many extreme i16 costs cannot overflow the sum.
let mut total_cost: i64 = 0;
for entry in &self.entries {
*pos_counts.entry(entry.pos.clone()).or_insert(0) += 1;
total_cost += i64::from(entry.cost);
}
DictionaryStats {
entry_count: self.entries.len(),
unique_surfaces: self.surface_map.len(),
pos_distribution: pos_counts,
#[allow(clippy::cast_precision_loss)]
average_cost: if self.entries.is_empty() {
0.0
} else {
(total_cost as f64) / (self.entries.len() as f64)
},
}
}
/// Writes the dictionary as CSV (`surface,pos,cost,reading`) with a
/// commented header; an absent reading is written as an empty field.
// NOTE(review): left/right context ids are not written, so a save/load
// round trip drops them — confirm this is acceptable.
///
/// # Errors
/// Returns an error on I/O failure.
pub fn save_to_csv<P: AsRef<Path>>(&self, path: P) -> Result<()> {
use std::io::Write;
let mut file = std::fs::File::create(path.as_ref()).map_err(DictError::Io)?;
writeln!(file, "# 사용자 정의 사전").map_err(DictError::Io)?;
writeln!(file, "# 표면형,품사,비용,읽기").map_err(DictError::Io)?;
for entry in &self.entries {
let reading = entry.reading.as_deref().unwrap_or("");
writeln!(
file,
"{},{},{},{}",
entry.surface, entry.pos, entry.cost, reading
)
.map_err(DictError::Io)?;
}
Ok(())
}
}
/// Fluent, consuming builder over [`UserDictionary`].
pub struct UserDictionaryBuilder {
    dict: UserDictionary,
}

impl Default for UserDictionaryBuilder {
    fn default() -> Self {
        Self::new()
    }
}

impl UserDictionaryBuilder {
    /// Starts from an empty dictionary.
    #[must_use]
    pub fn new() -> Self {
        Self { dict: UserDictionary::new() }
    }

    /// Sets the cost applied to entries added without an explicit cost.
    #[must_use]
    pub fn default_cost(self, cost: i16) -> Self {
        Self { dict: self.dict.with_default_cost(cost) }
    }

    /// Adds an entry using the default cost and no reading.
    #[must_use]
    pub fn add(mut self, surface: &str, pos: &str) -> Self {
        self.dict.add_entry(surface, pos, None, None);
        self
    }

    /// Adds an entry with an explicit cost and no reading.
    #[must_use]
    pub fn add_with_cost(mut self, surface: &str, pos: &str, cost: i16) -> Self {
        self.dict.add_entry(surface, pos, Some(cost), None);
        self
    }

    /// Adds an entry with an explicit cost and an optional reading.
    #[must_use]
    pub fn add_full(mut self, surface: &str, pos: &str, cost: i16, reading: Option<&str>) -> Self {
        let reading = reading.map(String::from);
        self.dict.add_entry(surface, pos, Some(cost), reading);
        self
    }

    /// Loads additional entries from a CSV file.
    ///
    /// # Errors
    /// Returns an error if the file cannot be read or parsed.
    pub fn load_csv<P: AsRef<Path>>(mut self, path: P) -> Result<Self> {
        self.dict.load_from_csv(path)?;
        Ok(self)
    }

    /// Loads additional entries from CSV text.
    ///
    /// # Errors
    /// Returns an error if the content cannot be parsed.
    pub fn load_str(mut self, content: &str) -> Result<Self> {
        self.dict.load_from_str(content)?;
        Ok(self)
    }

    /// Finishes building and returns the dictionary.
    #[must_use]
    pub fn build(self) -> UserDictionary {
        self.dict
    }

    /// Finishes building, pre-computing the lookup trie when the dictionary
    /// is non-empty (an empty dictionary is returned as-is).
    ///
    /// # Errors
    /// Returns an error if trie construction fails.
    pub fn build_with_trie(mut self) -> Result<UserDictionary> {
        if self.dict.is_empty() {
            return Ok(self.dict);
        }
        self.dict.build_trie()?;
        Ok(self.dict)
    }
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
use super::*;
// Entries added explicitly or with defaults are both counted.
#[test]
fn test_add_entry() {
let mut dict = UserDictionary::new();
dict.add_entry("딥러닝", "NNG", Some(-500), None);
dict.add_entry("머신러닝", "NNG", None, Some("머신러닝".to_string()));
assert_eq!(dict.len(), 2);
}
// Duplicate surfaces are all returned by lookup, in insertion order.
#[test]
fn test_lookup() {
let mut dict = UserDictionary::new();
dict.add_entry("딥러닝", "NNG", Some(-500), None);
dict.add_entry("딥러닝", "NNP", Some(-300), None);
let entries = dict.lookup("딥러닝");
assert_eq!(entries.len(), 2);
assert_eq!(entries[0].pos, "NNG");
assert_eq!(entries[1].pos, "NNP");
}
// CSV text parsing: comments skipped, empty cost field → default (-1000),
// trailing empty reading field → no reading.
#[test]
fn test_load_from_str() {
let csv = r"
# 사용자 사전
형태소분석,NNG,-1000,형태소분석
딥러닝,NNG,-500,
자연어처리,NNG,,자연어처리
";
let mut dict = UserDictionary::new();
dict.load_from_str(csv).expect("should load");
assert_eq!(dict.len(), 3);
let entries = dict.lookup("형태소분석");
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].cost, -1000);
assert_eq!(entries[0].reading.as_deref(), Some("형태소분석"));
let entries = dict.lookup("딥러닝");
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].cost, -500);
let entries = dict.lookup("자연어처리");
assert_eq!(entries.len(), 1);
// Empty cost field falls back to the default cost.
assert_eq!(entries[0].cost, -1000); }
// Trie round-trip: exact matches succeed for all inserted surfaces only.
#[test]
fn test_build_trie() {
let mut dict = UserDictionary::new();
dict.add_entry("가", "NNG", Some(0), None);
dict.add_entry("가다", "VV", Some(0), None);
dict.add_entry("가방", "NNG", Some(0), None);
let bytes = dict.build_trie().expect("should build");
assert!(!bytes.is_empty());
let trie = dict.get_trie().expect("should have trie");
assert!(trie.exact_match("가").is_some());
assert!(trie.exact_match("가다").is_some());
assert!(trie.exact_match("가방").is_some());
assert!(trie.exact_match("없음").is_none());
}
// Builder chain: default_cost applies to `add`, explicit costs override it.
#[test]
fn test_builder_pattern() {
let dict = UserDictionaryBuilder::new()
.default_cost(-500)
.add("딥러닝", "NNG")
.add_with_cost("머신러닝", "NNG", -300)
.add_full("자연어처리", "NNG", -400, Some("자연어처리"))
.build();
assert_eq!(dict.len(), 3);
let entries = dict.lookup("딥러닝");
assert_eq!(entries[0].cost, -500);
let entries = dict.lookup("머신러닝");
assert_eq!(entries[0].cost, -300);
}
// Exported Entry keeps surface/cost and carries POS + reading in `feature`.
#[test]
fn test_to_entry() {
let user_entry = UserEntry::new("테스트", "NNG", -100, Some("테스트".to_string()));
let entry = user_entry.to_entry();
assert_eq!(entry.surface, "테스트");
assert_eq!(entry.cost, -100);
assert!(entry.feature.contains("NNG"));
assert!(entry.feature.contains("테스트"));
}
// Mixed Hangul/Latin surfaces and readings survive storage and lookup.
#[test]
fn test_korean_entries() {
let mut dict = UserDictionary::new();
dict.add_entry("챗GPT", "NNP", Some(-1000), Some("챗지피티".to_string()));
dict.add_entry("클로드", "NNP", Some(-1000), None);
dict.add_entry("라마", "NNP", Some(-1000), None);
dict.add_entry("메타", "NNP", Some(-800), None);
dict.add_entry("앤트로픽", "NNP", Some(-1000), None);
assert_eq!(dict.len(), 5);
let entries = dict.lookup("챗GPT");
assert_eq!(entries[0].reading.as_deref(), Some("챗지피티"));
}
// clear() empties the dictionary.
#[test]
fn test_clear() {
let mut dict = UserDictionary::new();
dict.add_entry("테스트", "NNG", None, None);
assert_eq!(dict.len(), 1);
dict.clear();
assert!(dict.is_empty());
}
// A line with fewer than two fields is a format error.
#[test]
fn test_invalid_csv() {
let csv = "표면형만";
let mut dict = UserDictionary::new();
let result = dict.load_from_str(csv);
assert!(result.is_err());
}
// Trie prefix search finds every stored surface that prefixes the query.
#[test]
fn test_common_prefix_search() {
let mut dict = UserDictionary::new();
dict.add_entry("형태", "NNG", Some(0), None);
dict.add_entry("형태소", "NNG", Some(0), None);
dict.add_entry("형태소분석", "NNG", Some(0), None);
dict.build_trie().expect("should build");
let trie = dict.get_trie().expect("should have trie");
assert_eq!(trie.common_prefix_search("형태소분석기").count(), 3); }
// Explicit context ids are preserved on the stored entry.
#[test]
fn test_with_context_ids() {
let mut dict = UserDictionary::new();
dict.add_entry_with_ids("테스트", "NNG", -100, 1234, 5678, None);
let entries = dict.lookup("테스트");
assert_eq!(entries[0].left_id, 1234);
assert_eq!(entries[0].right_id, 5678);
}
// Well-formed entries validate cleanly.
#[test]
fn test_validate() {
let mut dict = UserDictionary::new();
dict.add_entry("테스트", "NNG", Some(-100), None);
dict.add_entry("유효", "VV", Some(-200), None);
let result = dict.validate();
assert!(result.is_valid);
}
// An unknown POS tag is a warning, not an error.
#[test]
fn test_validate_with_invalid_pos() {
let mut dict = UserDictionary::new();
dict.add_entry("테스트", "INVALID_POS", Some(-100), None);
let result = dict.validate();
assert!(result.is_valid); assert!(!result.warnings.is_empty());
}
// Deduplication keys on (surface, POS): same surface with a different POS
// is kept.
#[test]
fn test_remove_duplicates() {
let mut dict = UserDictionary::new();
dict.add_entry("테스트", "NNG", Some(-100), None);
dict.add_entry("테스트", "NNG", Some(-200), None); dict.add_entry("테스트", "VV", Some(-300), None);
assert_eq!(dict.len(), 3);
dict.remove_duplicates();
assert_eq!(dict.len(), 2); }
// remove_surface removes every entry for that surface and reports the count.
#[test]
fn test_remove_surface() {
let mut dict = UserDictionary::new();
dict.add_entry("삭제", "NNG", Some(-100), None);
dict.add_entry("삭제", "VV", Some(-200), None);
dict.add_entry("유지", "NNG", Some(-100), None);
let removed = dict.remove_surface("삭제");
assert_eq!(removed, 2);
assert_eq!(dict.len(), 1);
assert!(dict.lookup("삭제").is_empty());
}
// stats() counts entries, unique surfaces, and the per-POS distribution.
#[test]
fn test_stats() {
let mut dict = UserDictionary::new();
dict.add_entry("명사1", "NNG", Some(-100), None);
dict.add_entry("명사2", "NNG", Some(-200), None);
dict.add_entry("동사", "VV", Some(-150), None);
let stats = dict.stats();
assert_eq!(stats.entry_count, 3);
assert_eq!(stats.unique_surfaces, 3);
assert_eq!(stats.pos_distribution.get("NNG"), Some(&2));
assert_eq!(stats.pos_distribution.get("VV"), Some(&1));
}
// Compound tags joined by '+' are valid when every component is.
#[test]
fn test_is_valid_pos_tag() {
assert!(is_valid_pos_tag("NNG"));
assert!(is_valid_pos_tag("VV"));
assert!(is_valid_pos_tag("NNG+JX")); assert!(!is_valid_pos_tag("INVALID"));
}
// Script-based POS estimation heuristics.
#[test]
fn test_estimate_pos() {
assert_eq!(estimate_pos("GPT"), "SL");
assert_eq!(estimate_pos("BTS"), "SL");
assert_eq!(estimate_pos("123"), "SN");
assert_eq!(estimate_pos("챗GPT"), "NNP");
assert_eq!(estimate_pos("하다"), "VV");
assert_eq!(estimate_pos("먹다"), "VV");
assert_eq!(estimate_pos("메타버스"), "NNG");
assert_eq!(estimate_pos("사과"), "NNG");
assert_eq!(estimate_pos(""), "NA");
}
// add_entry_auto_pos stores the estimated tag on the entry.
#[test]
fn test_add_entry_auto_pos() {
let mut dict = UserDictionary::new();
dict.add_entry_auto_pos("GPT", None, None);
dict.add_entry_auto_pos("챗GPT", None, None);
dict.add_entry_auto_pos("메타버스", None, None);
let entries = dict.lookup("GPT");
assert_eq!(entries[0].pos, "SL");
let entries = dict.lookup("챗GPT");
assert_eq!(entries[0].pos, "NNP");
let entries = dict.lookup("메타버스");
assert_eq!(entries[0].pos, "NNG");
}
// Only surfaces also present in the system set are reported as conflicts.
#[test]
fn test_check_system_conflicts() {
use std::collections::HashSet;
let mut dict = UserDictionary::new();
dict.add_entry("사과", "NNG", None, None); dict.add_entry("챗GPT", "NNP", None, None); dict.add_entry("바나나", "NNG", None, None);
let system_surfaces: HashSet<String> = ["사과", "바나나", "포도"]
.iter()
.map(|s| (*s).to_string())
.collect();
let conflicts = dict.check_system_conflicts(&system_surfaces);
assert_eq!(conflicts.len(), 2);
let surfaces: Vec<&str> = conflicts.iter().map(|(_, s, _)| s.as_str()).collect();
assert!(surfaces.contains(&"사과"));
assert!(surfaces.contains(&"바나나"));
}
}