use std::collections::HashMap;
use std::io::BufRead;
use mecab_ko_hangul::{classify_char, CharType};
use crate::error::{Error, Result};
use crate::lattice::{Lattice, NodeBuilder, NodeType};
/// Numeric identifier of a character category.
pub type CategoryId = u8;
/// Category id returned by `CharCategoryMap::get_category` when no range override
/// or character-type mapping applies; also the id of "DEFAULT" in the built-in defaults.
pub const DEFAULT_CATEGORY: CategoryId = 0;
/// Id registered for the "SPACE" category in the built-in Korean defaults.
pub const SPACE_CATEGORY: CategoryId = 1;
/// Id registered for the "HANGUL" category in the built-in Korean defaults.
pub const HANGUL_CATEGORY: CategoryId = 2;
/// Id registered for the "HANJA" category in the built-in Korean defaults.
pub const HANJA_CATEGORY: CategoryId = 3;
/// Id registered for the "ALPHA" category in the built-in Korean defaults.
pub const ALPHA_CATEGORY: CategoryId = 4;
/// Id registered for the "NUMERIC" category in the built-in Korean defaults.
pub const NUMERIC_CATEGORY: CategoryId = 5;
/// Id registered for the "SYMBOL" category in the built-in Korean defaults.
pub const SYMBOL_CATEGORY: CategoryId = 6;
/// Definition of one character category and how unknown-word candidates are
/// generated for characters belonging to it.
#[derive(Debug, Clone)]
pub struct CharCategoryDef {
/// Category name; used as the lookup key for `CharCategoryMap::get_id_by_name`.
pub name: String,
/// Numeric id of the category.
pub id: CategoryId,
/// When `false`, candidate generation is skipped for positions that already
/// have a dictionary entry; when `true`, candidates are generated regardless.
pub invoke: bool,
/// When `true`, candidates are taken from the run of consecutive characters
/// that share this category.
pub group: bool,
/// Maximum candidate length in characters. `0` means "the whole run" for
/// grouped categories and "a single character" for non-grouped ones.
pub length: usize,
}
impl CharCategoryDef {
    /// Builds a category definition, copying `name` into an owned `String`.
    ///
    /// The remaining arguments are stored unchanged in the fields of the same name.
    #[must_use]
    pub fn new(name: &str, id: CategoryId, invoke: bool, group: bool, length: usize) -> Self {
        let name = String::from(name);
        Self {
            name,
            id,
            invoke,
            group,
            length,
        }
    }
}
/// One unknown-word template attached to a character category.
#[derive(Debug, Clone)]
pub struct UnknownDef {
/// Category this template applies to.
pub category_id: CategoryId,
/// Left id copied onto generated candidates and lattice nodes
/// (presumably a connection-matrix context id — confirm against the lattice code).
pub left_id: u16,
/// Right id copied onto generated candidates and lattice nodes.
pub right_id: u16,
/// Base word cost before pattern-based adjustment.
pub cost: i16,
/// Part-of-speech tag used when no pattern-specific tag is estimated.
pub pos: String,
/// Feature string associated with the template.
pub feature: String,
}
/// Surface-form pattern detected for an unknown-word candidate; it drives the
/// cost adjustment and part-of-speech estimate in `UnknownHandler`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum WordPattern {
/// None of the other patterns matched.
Plain,
/// Alphabetic surface longer than one character whose only uppercase character is the first.
ProperNoun,
/// Alphabetic surface with an uppercase character after the first position.
CamelCase,
/// Surface containing both Hangul-category and alpha-category characters.
HangulAlphaMix,
/// Surface containing a numeric-category character together with a Hangul- or alpha-category one.
NumberUnit,
/// Surface containing at least one character in the emoji code point ranges.
Emoji,
}
impl UnknownDef {
    /// Builds an unknown-word template, copying `pos` and `feature` into owned strings.
    ///
    /// All other arguments are stored unchanged in the fields of the same name.
    #[must_use]
    pub fn new(
        category_id: CategoryId,
        left_id: u16,
        right_id: u16,
        cost: i16,
        pos: &str,
        feature: &str,
    ) -> Self {
        let pos = String::from(pos);
        let feature = String::from(feature);
        Self {
            category_id,
            left_id,
            right_id,
            cost,
            pos,
            feature,
        }
    }
}
/// Maps characters to category ids and stores the definition of each category.
#[derive(Debug, Clone)]
pub struct CharCategoryMap {
// Registered category definitions, in insertion order.
categories: Vec<CharCategoryDef>,
// Category name -> id lookup.
name_to_id: HashMap<String, CategoryId>,
// Fallback mapping from the classifier's character type to a category id.
type_to_category: HashMap<CharType, CategoryId>,
// Inclusive (start, end, category) code point ranges; checked before the
// character-type mapping, first match wins.
range_overrides: Vec<(u32, u32, CategoryId)>,
}
// The default map is the built-in Korean configuration, not an empty map.
impl Default for CharCategoryMap {
fn default() -> Self {
Self::korean_default()
}
}
impl CharCategoryMap {
#[must_use]
pub fn new() -> Self {
Self {
categories: Vec::new(),
name_to_id: HashMap::new(),
type_to_category: HashMap::new(),
range_overrides: Vec::new(),
}
}
#[must_use]
pub fn korean_default() -> Self {
let mut map = Self::new();
let defaults = [
("DEFAULT", DEFAULT_CATEGORY, false, true, 0),
("SPACE", SPACE_CATEGORY, false, true, 0),
("HANGUL", HANGUL_CATEGORY, false, true, 2), ("HANJA", HANJA_CATEGORY, false, false, 1),
("ALPHA", ALPHA_CATEGORY, true, true, 0), ("NUMERIC", NUMERIC_CATEGORY, true, true, 0), ("SYMBOL", SYMBOL_CATEGORY, true, true, 0),
];
for (name, id, invoke, group, length) in defaults {
map.add_category(CharCategoryDef::new(name, id, invoke, group, length));
}
map.type_to_category
.insert(CharType::HangulSyllable, HANGUL_CATEGORY);
map.type_to_category
.insert(CharType::HangulJamo, HANGUL_CATEGORY);
map.type_to_category.insert(CharType::Hanja, HANJA_CATEGORY);
map.type_to_category
.insert(CharType::Katakana, ALPHA_CATEGORY);
map.type_to_category
.insert(CharType::Hiragana, ALPHA_CATEGORY);
map.type_to_category
.insert(CharType::Alphabet, ALPHA_CATEGORY);
map.type_to_category
.insert(CharType::Digit, NUMERIC_CATEGORY);
map.type_to_category
.insert(CharType::Whitespace, SPACE_CATEGORY);
map.type_to_category
.insert(CharType::Punctuation, SYMBOL_CATEGORY);
map.type_to_category
.insert(CharType::Other, DEFAULT_CATEGORY);
map
}
pub fn add_category(&mut self, def: CharCategoryDef) {
self.name_to_id.insert(def.name.clone(), def.id);
self.categories.push(def);
}
pub fn add_range(&mut self, start: u32, end: u32, category_id: CategoryId) {
self.range_overrides.push((start, end, category_id));
}
#[must_use]
pub fn get_category(&self, c: char) -> CategoryId {
let code = c as u32;
for &(start, end, cat_id) in &self.range_overrides {
if code >= start && code <= end {
return cat_id;
}
}
let char_type = classify_char(c);
self.type_to_category
.get(&char_type)
.copied()
.unwrap_or(DEFAULT_CATEGORY)
}
#[must_use]
pub fn get_category_def(&self, id: CategoryId) -> Option<&CharCategoryDef> {
self.categories.iter().find(|c| c.id == id)
}
#[must_use]
pub fn get_id_by_name(&self, name: &str) -> Option<CategoryId> {
self.name_to_id.get(name).copied()
}
pub fn from_char_def<R: BufRead>(reader: R) -> Result<Self> {
let mut map = Self::new();
let mut next_id: CategoryId = 0;
for line in reader.lines() {
let line = line.map_err(|e| Error::Init(e.to_string()))?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') {
continue;
}
if !line.starts_with("0x") && !line.chars().next().is_some_and(|c| c.is_ascii_digit()) {
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 4 {
let name = parts[0];
let invoke = parts[1] == "1";
let group = parts[2] == "1";
let length: usize = parts[3].parse().unwrap_or(0);
map.add_category(CharCategoryDef::new(name, next_id, invoke, group, length));
next_id += 1;
}
}
else if line.starts_with("0x") {
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 2 {
let range_part = parts[0];
let category_name = parts[1];
if let Some(cat_id) = map.get_id_by_name(category_name) {
if let Some((start, end)) = parse_unicode_range(range_part) {
map.add_range(start, end, cat_id);
}
}
}
}
}
Ok(map)
}
}
/// Parses a code point specification of the form `0xSTART..0xEND` or `0xVALUE`.
///
/// Returns the inclusive `(start, end)` pair; a single value yields
/// `(value, value)`. Returns `None` when a bound is not valid hexadecimal or
/// when `start` is greater than `end`.
fn parse_unicode_range(s: &str) -> Option<(u32, u32)> {
    let (start, end) = match s.split_once("..") {
        Some((start_str, end_str)) => (parse_hex(start_str)?, parse_hex(end_str)?),
        None => {
            let value = parse_hex(s)?;
            (value, value)
        }
    };
    // An inverted range can never contain a code point, so treat it as malformed.
    (start <= end).then_some((start, end))
}

/// Parses a hexadecimal number with an optional single `0x`/`0X` prefix.
///
/// Returns `None` for empty input, non-hex characters (including a sign or a
/// repeated prefix) and values that do not fit in `u32`.
fn parse_hex(s: &str) -> Option<u32> {
    // Strip at most one prefix; `trim_start_matches` would strip repetitions.
    let digits = s
        .strip_prefix("0x")
        .or_else(|| s.strip_prefix("0X"))
        .unwrap_or(s);
    // `from_str_radix` tolerates a leading '+', so validate the digits first.
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }
    u32::from_str_radix(digits, 16).ok()
}
/// Reports whether `c` lies in one of the code point ranges this module treats
/// as emoji: U+2600..=U+27BF or U+1F300..=U+1F9FF.
#[must_use]
const fn is_emoji(c: char) -> bool {
    matches!(c, '\u{2600}'..='\u{27BF}' | '\u{1F300}'..='\u{1F9FF}')
}
/// Unknown-word templates grouped by character category.
#[derive(Debug, Clone, Default)]
pub struct UnknownDictionary {
// Templates per category id, in insertion order.
entries: HashMap<CategoryId, Vec<UnknownDef>>,
}
impl UnknownDictionary {
    /// Creates a dictionary with no templates.
    #[must_use]
    pub fn new() -> Self {
        Self {
            entries: HashMap::new(),
        }
    }

    /// Creates the built-in Korean dictionary: one template per built-in category.
    #[must_use]
    pub fn korean_default() -> Self {
        // (category, left_id, right_id, cost, pos, feature)
        let defaults = [
            (DEFAULT_CATEGORY, 1800, 3562, 7000, "SY", "SY,*,*,*,*,*,*,*"),
            (SPACE_CATEGORY, 1799, 3559, 0, "SP", "SP,*,*,*,*,*,*,*"),
            (
                HANGUL_CATEGORY,
                1800,
                3565,
                5000,
                "UNKNOWN",
                "UNKNOWN,*,*,*,*,*,*,*",
            ),
            (HANJA_CATEGORY, 1800, 3560, 6000, "SH", "SH,*,*,*,*,*,*,*"),
            (ALPHA_CATEGORY, 1800, 3558, 4000, "SL", "SL,*,*,*,*,*,*,*"),
            (NUMERIC_CATEGORY, 1800, 3561, 3000, "SN", "SN,*,*,*,*,*,*,*"),
            (SYMBOL_CATEGORY, 1800, 3562, 7000, "SY", "SY,*,*,*,*,*,*,*"),
        ];

        let mut dict = Self::new();
        for (cat_id, left_id, right_id, cost, pos, feature) in defaults {
            dict.add_entry(UnknownDef::new(
                cat_id, left_id, right_id, cost, pos, feature,
            ));
        }
        dict
    }

    /// Appends `def` to the templates of its category.
    pub fn add_entry(&mut self, def: UnknownDef) {
        self.entries.entry(def.category_id).or_default().push(def);
    }

    /// Returns the templates registered for `category_id`, or an empty slice.
    #[must_use]
    pub fn get_entries(&self, category_id: CategoryId) -> &[UnknownDef] {
        self.entries
            .get(&category_id)
            .map_or(&[], std::vec::Vec::as_slice)
    }

    /// Builds a dictionary from `unk.def`-style text.
    ///
    /// Each non-blank, non-`#` line has the form
    /// `CATEGORY,LEFT_ID,RIGHT_ID,COST,POS[,FEATURE...]`. The stored `feature`
    /// is everything from `POS` onwards, the same shape `korean_default` uses.
    /// Lines with fewer than five fields or with a category name unknown to
    /// `category_map` are skipped. Unparsable numeric fields become 0.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Init`] if reading a line fails.
    pub fn from_unk_def<R: BufRead>(reader: R, category_map: &CharCategoryMap) -> Result<Self> {
        let mut dict = Self::new();
        for line in reader.lines() {
            let line = line.map_err(|e| Error::Init(e.to_string()))?;
            let line = line.trim();
            if line.is_empty() || line.starts_with('#') {
                continue;
            }

            // Split off the four leading fields; the remainder is the feature string.
            let mut fields = line.splitn(5, ',');
            let (Some(category_name), Some(left_id), Some(right_id), Some(cost), Some(feature)) = (
                fields.next(),
                fields.next(),
                fields.next(),
                fields.next(),
                fields.next(),
            ) else {
                continue;
            };
            let Some(cat_id) = category_map.get_id_by_name(category_name) else {
                continue;
            };

            // Best-effort parsing: malformed numbers fall back to 0.
            let left_id: u16 = left_id.parse().unwrap_or(0);
            let right_id: u16 = right_id.parse().unwrap_or(0);
            let cost: i16 = cost.parse().unwrap_or(0);
            // The POS tag is the first element of the feature string.
            let pos = feature.split(',').next().unwrap_or(feature);

            dict.add_entry(UnknownDef::new(
                cat_id, left_id, right_id, cost, pos, feature,
            ));
        }
        Ok(dict)
    }
}
/// An unknown-word candidate produced by `UnknownHandler::generate_candidates`.
#[derive(Debug, Clone)]
pub struct UnknownCandidate {
/// Text covered by the candidate.
pub surface: String,
/// Start of the candidate, as a character index into the analyzed text.
pub start_pos: usize,
/// End of the candidate (exclusive), as a character index into the analyzed text.
pub end_pos: usize,
/// Left id copied from the template.
pub left_id: u16,
/// Right id copied from the template.
pub right_id: u16,
/// Template cost after pattern-based adjustment.
pub cost: i16,
/// Part-of-speech tag: the template's tag or a pattern-specific estimate.
pub pos: String,
/// Category of the candidate's first character.
pub category_id: CategoryId,
/// Pattern detected on `surface`.
pub pattern: WordPattern,
}
/// Generates unknown-word candidates from a category map and a template dictionary.
#[derive(Debug, Clone)]
pub struct UnknownHandler {
/// Character-to-category mapping and category definitions.
pub category_map: CharCategoryMap,
/// Unknown-word templates per category.
pub unknown_dict: UnknownDictionary,
}
// The default handler uses the built-in Korean category map and dictionary.
impl Default for UnknownHandler {
fn default() -> Self {
Self::korean_default()
}
}
impl UnknownHandler {
#[must_use]
pub const fn new(category_map: CharCategoryMap, unknown_dict: UnknownDictionary) -> Self {
Self {
category_map,
unknown_dict,
}
}
#[must_use]
pub fn korean_default() -> Self {
Self::new(
CharCategoryMap::korean_default(),
UnknownDictionary::korean_default(),
)
}
#[must_use]
fn detect_pattern(&self, surface: &str) -> WordPattern {
let chars: Vec<char> = surface.chars().collect();
if chars.is_empty() {
return WordPattern::Plain;
}
if chars.iter().any(|&c| is_emoji(c)) {
return WordPattern::Emoji;
}
let has_hangul = chars.iter().any(|&c| {
let cat = self.category_map.get_category(c);
cat == HANGUL_CATEGORY
});
let has_alpha = chars.iter().any(|&c| {
let cat = self.category_map.get_category(c);
cat == ALPHA_CATEGORY
});
if has_hangul && has_alpha {
return WordPattern::HangulAlphaMix;
}
let has_digit = chars.iter().any(|&c| {
let cat = self.category_map.get_category(c);
cat == NUMERIC_CATEGORY
});
if has_digit && (has_hangul || has_alpha) {
return WordPattern::NumberUnit;
}
if has_alpha && !has_hangul {
if chars.len() > 1 {
let mut has_internal_uppercase = false;
for (i, &c) in chars.iter().enumerate() {
if i > 0 && c.is_uppercase() {
has_internal_uppercase = true;
break;
}
}
if has_internal_uppercase {
return WordPattern::CamelCase;
}
}
if chars[0].is_uppercase() && chars.len() > 1 {
return WordPattern::ProperNoun;
}
}
WordPattern::Plain
}
#[must_use]
#[allow(clippy::unused_self)]
fn adjust_cost_by_pattern(&self, base_cost: i16, pattern: WordPattern, length: usize) -> i16 {
let mut cost = i32::from(base_cost);
match pattern {
WordPattern::Plain => {
if length > 6 {
#[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
let penalty = ((length - 6) * 80) as i32; cost += penalty;
}
}
WordPattern::ProperNoun => {
cost -= 600; }
WordPattern::CamelCase => {
cost -= 400; }
WordPattern::HangulAlphaMix => {
cost -= 100; }
WordPattern::NumberUnit => {
cost -= 300; }
WordPattern::Emoji => {
cost += 1500; }
}
#[allow(clippy::cast_possible_truncation)]
{
cost.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16
}
}
#[must_use]
#[allow(clippy::unused_self)]
fn estimate_pos(
&self,
pattern: WordPattern,
category_id: CategoryId,
base_pos: &str,
) -> String {
match pattern {
WordPattern::ProperNoun | WordPattern::CamelCase if category_id == ALPHA_CATEGORY => {
return "NNP".to_string();
}
WordPattern::HangulAlphaMix if category_id == HANGUL_CATEGORY => {
return "NNG".to_string();
}
_ => {}
}
base_pos.to_string()
}
#[must_use]
pub fn generate_candidates(
&self,
text: &str,
start_pos: usize,
has_dict_entry: bool,
has_space_before: impl Fn(usize) -> bool,
) -> Vec<UnknownCandidate> {
let start_byte = text
.char_indices()
.nth(start_pos)
.map_or(text.len(), |(b, _)| b);
let suffix = &text[start_byte..];
let Some(first_char) = suffix.chars().next() else {
return Vec::new();
};
let category_id = self.category_map.get_category(first_char);
let Some(category_def) = self.category_map.get_category_def(category_id) else {
return Vec::new();
};
if !category_def.invoke && has_dict_entry {
return Vec::new();
}
let unknown_defs = self.unknown_dict.get_entries(category_id);
if unknown_defs.is_empty() {
return Vec::new();
}
let mut candidates = Vec::new();
if category_def.group {
let mut char_count = 0usize;
let mut byte_end = 0usize;
for c in suffix.chars() {
if self.category_map.get_category(c) != category_id {
break;
}
if char_count > 0 && has_space_before(start_pos + char_count) {
break;
}
byte_end += c.len_utf8();
char_count += 1;
}
let group_char_count = char_count; let max_len = if category_def.length > 0 {
category_def.length.min(group_char_count)
} else {
group_char_count
};
let mut byte_offset = 0usize;
let mut char_iter = suffix.chars();
for len in 1..=max_len {
if let Some(c) = char_iter.next() {
byte_offset += c.len_utf8();
} else {
break;
}
let end_pos = start_pos + len;
let surface = &suffix[..byte_offset];
let pattern = self.detect_pattern(surface);
for def in unknown_defs {
let adjusted_cost = self.adjust_cost_by_pattern(def.cost, pattern, len);
let estimated_pos = self.estimate_pos(pattern, category_id, &def.pos);
candidates.push(UnknownCandidate {
surface: surface.to_string(),
start_pos,
end_pos,
left_id: def.left_id,
right_id: def.right_id,
cost: adjusted_cost,
pos: estimated_pos,
category_id,
pattern,
});
}
}
let _ = byte_end; } else {
let char_total = suffix.chars().count();
let max_len = if category_def.length > 0 {
category_def.length.min(char_total)
} else {
1
};
let mut byte_offset = 0usize;
let mut char_iter = suffix.chars();
for len in 1..=max_len {
if let Some(c) = char_iter.next() {
byte_offset += c.len_utf8();
} else {
break;
}
let end_pos = start_pos + len;
let surface = &suffix[..byte_offset];
let pattern = self.detect_pattern(surface);
for def in unknown_defs {
let adjusted_cost = self.adjust_cost_by_pattern(def.cost, pattern, len);
let estimated_pos = self.estimate_pos(pattern, category_id, &def.pos);
candidates.push(UnknownCandidate {
surface: surface.to_string(),
start_pos,
end_pos,
left_id: def.left_id,
right_id: def.right_id,
cost: adjusted_cost,
pos: estimated_pos,
category_id,
pattern,
});
}
}
}
candidates
}
#[cfg(test)]
fn find_group_end(&self, chars: &[char], start_pos: usize, category_id: CategoryId) -> usize {
let mut pos = start_pos;
while pos < chars.len() {
if self.category_map.get_category(chars[pos]) != category_id {
break;
}
pos += 1;
}
pos
}
pub fn add_unknown_nodes(
&self,
lattice: &mut Lattice,
start_pos: usize,
has_dict_entry: bool,
) -> usize {
let text = lattice.text();
let candidates = self.generate_candidates(text, start_pos, has_dict_entry, |pos| {
lattice.has_space_at(pos)
});
let mut count = 0;
for candidate in candidates {
lattice.add_node(
NodeBuilder::new(&candidate.surface, candidate.start_pos, candidate.end_pos)
.left_id(candidate.left_id)
.right_id(candidate.right_id)
.word_cost(i32::from(candidate.cost))
.node_type(NodeType::Unknown),
);
count += 1;
}
count
}
}
// Unit tests for category mapping, definition-file parsing, pattern detection,
// cost/POS adjustment and candidate generation.
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::needless_collect)]
mod tests {
use super::*;
impl UnknownHandler {
// Test helper: candidate generation with no inter-character spaces reported.
fn generate_candidates_no_space(
&self,
text: &str,
start_pos: usize,
has_dict_entry: bool,
) -> Vec<UnknownCandidate> {
self.generate_candidates(text, start_pos, has_dict_entry, |_| false)
}
}
// Built-in map classifies one representative character per category.
#[test]
fn test_category_map_default() {
let map = CharCategoryMap::korean_default();
assert_eq!(map.get_category('가'), HANGUL_CATEGORY);
assert_eq!(map.get_category('A'), ALPHA_CATEGORY);
assert_eq!(map.get_category('1'), NUMERIC_CATEGORY);
assert_eq!(map.get_category(' '), SPACE_CATEGORY);
assert_eq!(map.get_category('.'), SYMBOL_CATEGORY);
assert_eq!(map.get_category('韓'), HANJA_CATEGORY);
}
// Built-in category definitions carry the expected flags and length.
#[test]
fn test_category_def() {
let map = CharCategoryMap::korean_default();
let hangul_def = map.get_category_def(HANGUL_CATEGORY).unwrap();
assert_eq!(hangul_def.name, "HANGUL");
assert!(!hangul_def.invoke);
assert!(hangul_def.group);
assert_eq!(hangul_def.length, 2);
let alpha_def = map.get_category_def(ALPHA_CATEGORY).unwrap();
assert!(alpha_def.invoke); }
// Built-in dictionary has templates with the expected POS tags.
#[test]
fn test_unknown_dict_default() {
let dict = UnknownDictionary::korean_default();
let hangul_entries = dict.get_entries(HANGUL_CATEGORY);
assert!(!hangul_entries.is_empty());
assert_eq!(hangul_entries[0].pos, "UNKNOWN");
let alpha_entries = dict.get_entries(ALPHA_CATEGORY);
assert!(!alpha_entries.is_empty());
assert_eq!(alpha_entries[0].pos, "SL");
}
// Hangul is grouped with length 2, so prefixes of one and two characters appear.
#[test]
fn test_generate_candidates_hangul() {
let handler = UnknownHandler::korean_default();
let candidates = handler.generate_candidates_no_space("가나다라", 0, false);
assert!(!candidates.is_empty());
let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
assert!(surfaces.contains(&"가"));
assert!(surfaces.contains(&"가나"));
}
// Alpha is grouped with length 0, so every prefix of the run appears.
#[test]
fn test_generate_candidates_alpha() {
let handler = UnknownHandler::korean_default();
let candidates = handler.generate_candidates_no_space("ABC", 0, false);
let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
assert!(surfaces.contains(&"A"));
assert!(surfaces.contains(&"AB"));
assert!(surfaces.contains(&"ABC"));
}
// The invoke flag decides whether a dictionary hit suppresses candidates.
#[test]
fn test_generate_candidates_with_dict_entry() {
let handler = UnknownHandler::korean_default();
let candidates = handler.generate_candidates_no_space("가나다", 0, true);
assert!(candidates.is_empty());
let candidates = handler.generate_candidates_no_space("ABC", 0, true);
assert!(!candidates.is_empty());
}
// Candidates take the category of the character at the start position.
#[test]
fn test_generate_candidates_mixed() {
let handler = UnknownHandler::korean_default();
let text = "가ABC";
let candidates = handler.generate_candidates_no_space(text, 0, false);
assert!(candidates.iter().all(|c| c.category_id == HANGUL_CATEGORY));
let candidates = handler.generate_candidates_no_space(text, 1, false);
assert!(candidates.iter().all(|c| c.category_id == ALPHA_CATEGORY));
}
// Group end is the index of the first character of a different category.
#[test]
fn test_find_group_end() {
let handler = UnknownHandler::korean_default();
let chars: Vec<char> = "가나다ABC".chars().collect();
let end = handler.find_group_end(&chars, 0, HANGUL_CATEGORY);
assert_eq!(end, 3);
let end = handler.find_group_end(&chars, 3, ALPHA_CATEGORY);
assert_eq!(end, 6);
}
// Candidates end up in the lattice as nodes starting at the requested position.
#[test]
fn test_add_unknown_nodes() {
let handler = UnknownHandler::korean_default();
let mut lattice = Lattice::new("테스트ABC");
let count = handler.add_unknown_nodes(&mut lattice, 0, false);
assert!(count > 0);
let nodes_at_0: Vec<_> = lattice.nodes_starting_at(0).collect();
assert!(!nodes_at_0.is_empty());
}
// Leading uppercase followed by lowercase only is a proper noun.
#[test]
fn test_pattern_detection_proper_noun() {
let handler = UnknownHandler::korean_default();
let pattern = handler.detect_pattern("Apple");
assert_eq!(pattern, WordPattern::ProperNoun);
let pattern = handler.detect_pattern("Google");
assert_eq!(pattern, WordPattern::ProperNoun);
}
// Any uppercase after the first character is camel case.
#[test]
fn test_pattern_detection_camel_case() {
let handler = UnknownHandler::korean_default();
let pattern = handler.detect_pattern("iPhone");
assert_eq!(pattern, WordPattern::CamelCase);
let pattern = handler.detect_pattern("HelloWorld");
assert_eq!(pattern, WordPattern::CamelCase);
let pattern = handler.detect_pattern("iPad");
assert_eq!(pattern, WordPattern::CamelCase);
}
// Pure Hangul is plain; Hangul mixed with Latin letters is a mix.
#[test]
fn test_pattern_detection_hangul_alpha_mix() {
let handler = UnknownHandler::korean_default();
let pattern = handler.detect_pattern("카카오톡");
assert_eq!(pattern, WordPattern::Plain);
let pattern = handler.detect_pattern("API키");
assert_eq!(pattern, WordPattern::HangulAlphaMix);
}
// Digits combined with letters or Hangul form a number+unit pattern.
#[test]
fn test_pattern_detection_number_unit() {
let handler = UnknownHandler::korean_default();
let pattern = handler.detect_pattern("15kg");
assert_eq!(pattern, WordPattern::NumberUnit);
let pattern = handler.detect_pattern("3개");
assert_eq!(pattern, WordPattern::NumberUnit);
let pattern = handler.detect_pattern("100원");
assert_eq!(pattern, WordPattern::NumberUnit);
}
// A single emoji anywhere in the surface wins over every other pattern.
#[test]
fn test_pattern_detection_emoji() {
let handler = UnknownHandler::korean_default();
let pattern = handler.detect_pattern("😀");
assert_eq!(pattern, WordPattern::Emoji);
let pattern = handler.detect_pattern("안녕😊");
assert_eq!(pattern, WordPattern::Emoji);
}
// All-lowercase letters are plain; the "test123" result is computed but not asserted.
#[test]
fn test_pattern_detection_plain() {
let handler = UnknownHandler::korean_default();
let pattern = handler.detect_pattern("hello");
assert_eq!(pattern, WordPattern::Plain);
let _pattern = handler.detect_pattern("test123");
}
// Proper nouns and camel case get cheaper, emoji get more expensive.
#[test]
fn test_cost_adjustment_by_pattern() {
let handler = UnknownHandler::korean_default();
let base_cost = 4000i16;
let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::ProperNoun, 5);
assert!(adjusted < base_cost);
let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::CamelCase, 5);
assert!(adjusted < base_cost);
let adjusted = handler.adjust_cost_by_pattern(base_cost, WordPattern::Emoji, 1);
assert!(adjusted > base_cost);
}
// Plain surfaces longer than six characters cost more than short ones.
#[test]
fn test_cost_adjustment_by_length() {
let handler = UnknownHandler::korean_default();
let base_cost = 5000i16;
let cost_short = handler.adjust_cost_by_pattern(base_cost, WordPattern::Plain, 3);
let cost_long = handler.adjust_cost_by_pattern(base_cost, WordPattern::Plain, 10);
assert!(cost_long > cost_short);
}
// Alpha-category proper nouns and camel case are tagged NNP.
#[test]
fn test_pos_estimation_proper_noun() {
let handler = UnknownHandler::korean_default();
let pos = handler.estimate_pos(WordPattern::ProperNoun, ALPHA_CATEGORY, "SL");
assert_eq!(pos, "NNP");
let pos = handler.estimate_pos(WordPattern::CamelCase, ALPHA_CATEGORY, "SL");
assert_eq!(pos, "NNP");
}
// Hangul-category Hangul/alpha mixes are tagged NNG.
#[test]
fn test_pos_estimation_hangul_alpha_mix() {
let handler = UnknownHandler::korean_default();
let pos = handler.estimate_pos(WordPattern::HangulAlphaMix, HANGUL_CATEGORY, "UNKNOWN");
assert_eq!(pos, "NNG");
}
// End-to-end: "Apple" yields at least one proper-noun candidate tagged NNP.
#[test]
fn test_generate_candidates_with_patterns() {
let handler = UnknownHandler::korean_default();
let candidates = handler.generate_candidates_no_space("Apple", 0, false);
assert!(!candidates.is_empty());
let has_proper_noun = candidates
.iter()
.any(|c| c.pattern == WordPattern::ProperNoun);
assert!(has_proper_noun);
let proper_noun_candidates: Vec<_> = candidates
.iter()
.filter(|c| c.pattern == WordPattern::ProperNoun)
.collect();
assert!(proper_noun_candidates.iter().any(|c| c.pos == "NNP"));
}
// An all-caps abbreviation produces candidates covering it or its first letter.
#[test]
fn test_generate_candidates_abbreviation() {
let handler = UnknownHandler::korean_default();
let candidates = handler.generate_candidates_no_space("API", 0, false);
assert!(!candidates.is_empty());
let surfaces: Vec<_> = candidates.iter().map(|c| c.surface.as_str()).collect();
assert!(surfaces.contains(&"API") || surfaces.contains(&"A"));
}
// End-to-end: "iPhone" yields at least one camel-case candidate.
#[test]
fn test_generate_candidates_camel_case() {
let handler = UnknownHandler::korean_default();
let candidates = handler.generate_candidates_no_space("iPhone", 0, false);
assert!(!candidates.is_empty());
let has_camel = candidates
.iter()
.any(|c| c.pattern == WordPattern::CamelCase);
assert!(has_camel);
}
// Unknown Hangul text yields Hangul-category candidates only.
#[test]
fn test_unknown_korean_word() {
let handler = UnknownHandler::korean_default();
let candidates = handler.generate_candidates_no_space("테스트", 0, false);
assert!(!candidates.is_empty());
assert!(candidates.iter().all(|c| c.category_id == HANGUL_CATEGORY));
}
// Emoji ranges accept pictographs and symbols, reject letters, Hangul and digits.
#[test]
fn test_is_emoji() {
assert!(is_emoji('😀'));
assert!(is_emoji('😊'));
assert!(is_emoji('🚀'));
assert!(is_emoji('❤'));
assert!(!is_emoji('a'));
assert!(!is_emoji('가'));
assert!(!is_emoji('1'));
}
// Range parsing handles both "start..end" and single-value forms.
#[test]
fn test_parse_unicode_range() {
assert_eq!(
parse_unicode_range("0xAC00..0xD7A3"),
Some((0xAC00, 0xD7A3))
);
assert_eq!(parse_unicode_range("0xAC00"), Some((0xAC00, 0xAC00)));
assert_eq!(parse_unicode_range("0x0020"), Some((0x0020, 0x0020)));
}
// char.def parsing registers categories and applies range overrides.
#[test]
fn test_char_def_parsing() {
let char_def = r"
# Comment line
DEFAULT 0 1 0
SPACE 0 1 0
HANGUL 0 1 2
ALPHA 1 1 0
0xAC00..0xD7A3 HANGUL
0x0041..0x005A ALPHA
";
let map = CharCategoryMap::from_char_def(char_def.as_bytes()).unwrap();
assert!(map.get_id_by_name("DEFAULT").is_some());
assert!(map.get_id_by_name("HANGUL").is_some());
assert!(map.get_id_by_name("ALPHA").is_some());
assert_eq!(
map.get_category('가'),
map.get_id_by_name("HANGUL").unwrap()
);
assert_eq!(map.get_category('A'), map.get_id_by_name("ALPHA").unwrap());
}
// unk.def parsing resolves category names through the map and keeps POS and cost.
#[test]
fn test_unk_def_parsing() {
let char_def = "DEFAULT 0 1 0\nHANGUL 0 1 2\n";
let map = CharCategoryMap::from_char_def(char_def.as_bytes()).unwrap();
let unk_def = r"
DEFAULT,1800,3562,7000,SY,*,*,*,*,*,*,*
HANGUL,1800,3565,5000,UNKNOWN,*,*,*,*,*,*,*
";
let dict = UnknownDictionary::from_unk_def(unk_def.as_bytes(), &map).unwrap();
let hangul_id = map.get_id_by_name("HANGUL").unwrap();
let entries = dict.get_entries(hangul_id);
assert!(!entries.is_empty());
assert_eq!(entries[0].pos, "UNKNOWN");
assert_eq!(entries[0].cost, 5000);
}
}