use crate::pos_tag::PosTag;
use crate::tokenizer::{Token, Tokenizer};
use crate::Result;
use std::collections::HashSet;
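/// Built-in decomposition dictionary for frequent compound nouns, keyed by
/// surface form. Entries with a single part mark words that should be kept
/// whole rather than split by the suffix/prefix heuristics.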
const COMPOUND_DICT: &[(&str, &[(&str, &str)])] = &[
("형태소분석기", &[("형태소", "NNG"), ("분석기", "NNG")]),
("형태소분석", &[("형태소", "NNG"), ("분석", "NNG")]),
("자연어처리", &[("자연어", "NNG"), ("처리", "NNG")]),
("인공지능", &[("인공", "NNG"), ("지능", "NNG")]),
("기계학습", &[("기계", "NNG"), ("학습", "NNG")]),
("딥러닝", &[("딥", "NNG"), ("러닝", "NNG")]),
("데이터베이스", &[("데이터", "NNG"), ("베이스", "NNG")]),
("운영체제", &[("운영", "NNG"), ("체제", "NNG")]),
("프로그래밍", &[("프로그램", "NNG"), ("밍", "XSN")]),
("소프트웨어", &[("소프트", "NNG"), ("웨어", "NNG")]),
("하드웨어", &[("하드", "NNG"), ("웨어", "NNG")]),
("대한민국", &[("대한", "NNP"), ("민국", "NNG")]),
("국립국어원", &[("국립", "NNG"), ("국어원", "NNP")]),
("대통령", &[("대", "XPN"), ("통령", "NNG")]),
("국무총리", &[("국무", "NNG"), ("총리", "NNG")]),
("대법원", &[("대", "XPN"), ("법원", "NNG")]),
("헌법재판소", &[("헌법", "NNG"), ("재판소", "NNG")]),
("국회의원", &[("국회", "NNG"), ("의원", "NNG")]),
(
"지방자치단체",
&[("지방", "NNG"), ("자치", "NNG"), ("단체", "NNG")],
),
("대학교", &[("대학", "NNG"), ("교", "NNG")]),
("초등학교", &[("초등", "NNG"), ("학교", "NNG")]),
("중학교", &[("중", "XPN"), ("학교", "NNG")]),
("고등학교", &[("고등", "NNG"), ("학교", "NNG")]),
("운동장", &[("운동", "NNG"), ("장", "NNG")]),
("도서관", &[("도서", "NNG"), ("관", "NNG")]),
("교과서", &[("교과", "NNG"), ("서", "NNG")]),
("아파트", &[("아파트", "NNG")]),
("백화점", &[("백화", "NNG"), ("점", "NNG")]),
("주차장", &[("주차", "NNG"), ("장", "NNG")]),
("병원", &[("병원", "NNG")]),
("약국", &[("약국", "NNG")]),
("편의점", &[("편의", "NNG"), ("점", "NNG")]),
("공항", &[("공항", "NNG")]),
("지하철", &[("지하", "NNG"), ("철", "NNG")]),
("버스정류장", &[("버스", "NNG"), ("정류장", "NNG")]),
("주식시장", &[("주식", "NNG"), ("시장", "NNG")]),
("부동산", &[("부동", "NNG"), ("산", "NNG")]),
("신용카드", &[("신용", "NNG"), ("카드", "NNG")]),
("은행계좌", &[("은행", "NNG"), ("계좌", "NNG")]),
("지구온난화", &[("지구", "NNG"), ("온난화", "NNG")]),
("환경오염", &[("환경", "NNG"), ("오염", "NNG")]),
("태양광", &[("태양", "NNG"), ("광", "NNG")]),
("풍력발전", &[("풍력", "NNG"), ("발전", "NNG")]),
("건강보험", &[("건강", "NNG"), ("보험", "NNG")]),
("의료기관", &[("의료", "NNG"), ("기관", "NNG")]),
("응급실", &[("응급", "NNG"), ("실", "NNG")]),
("수술실", &[("수술", "NNG"), ("실", "NNG")]),
];
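/// Noun prefixes (XPN) recognized by `try_extract_prefix`.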
const PREFIXES: &[(&str, &str)] = &[
("신", "XPN"), ("구", "XPN"), ("총", "XPN"), ("부", "XPN"), ("대", "XPN"), ("소", "XPN"), ("중", "XPN"), ("고", "XPN"), ("저", "XPN"), ("최", "XPN"), ("초", "XPN"), ("준", "XPN"), ("범", "XPN"), ("반", "XPN"), ("비", "XPN"), ("미", "XPN"), ("재", "XPN"), ("전", "XPN"), ("후", "XPN"), ("무", "XPN"), ("유", "XPN"), ("친", "XPN"), ("반", "XPN"), ];
const SUFFIXES: &[(&str, &str)] = &[
("들", "XSN"), ("님", "XSN"), ("씨", "XSN"), ("꾼", "XSN"), ("쟁이", "XSN"), ("치", "XSN"), ("가", "XSN"), ("자", "XSN"), ("사", "XSN"), ("원", "XSN"), ("인", "XSN"), ("생", "XSN"), ("장", "XSN"), ("실", "XSN"), ("관", "XSN"), ("소", "XSN"), ("점", "XSN"), ("기", "XSN"), ("화", "XSN"), ("적", "XSN"), ("성", "XSN"), ("율", "XSN"), ("도", "XSN"), ("비", "XSN"), ("권", "XSN"), ("론", "XSN"), ("학", "XSN"), ("계", "XSN"), ];
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DecompoundMode {
None,
Discard,
Mixed,
}
impl DecompoundMode {
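    /// Parses a mode name (case-insensitive): "none", "discard", or "mixed".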
#[must_use]
pub fn parse(s: &str) -> Option<Self> {
match s.to_lowercase().as_str() {
"none" => Some(Self::None),
"discard" => Some(Self::Discard),
"mixed" => Some(Self::Mixed),
_ => None,
}
}
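    /// Alias for [`Self::parse`], kept for `from_str`-style call sites.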
#[must_use]
#[allow(clippy::should_implement_trait)]
pub fn from_str(s: &str) -> Option<Self> {
Self::parse(s)
}
#[must_use]
pub const fn as_str(&self) -> &'static str {
match self {
Self::None => "none",
Self::Discard => "discard",
Self::Mixed => "mixed",
}
}
}
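/// An analyzed token in Nori-compatible form: a coarse Nori POS tag,
/// character-based offsets into the original text, and decompound
/// bookkeeping.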
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NoriToken {
pub surface: String,
pub pos_tag: String,
pub start_offset: usize,
pub end_offset: usize,
pub lemma: Option<String>,
pub reading: Option<String>,
pub word_type: WordType,
pub is_decompound: bool,
}
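/// Where a token came from: the system dictionary, unknown-word handling,
/// or a user dictionary.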
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WordType {
Known,
Unknown,
User,
}
impl WordType {
#[must_use]
pub const fn as_str(&self) -> &'static str {
match self {
Self::Known => "KNOWN",
Self::Unknown => "UNKNOWN",
Self::User => "USER",
}
}
}
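/// Tokenizer that adapts MeCab output to Nori semantics: POS tag mapping,
/// optional compound-noun decomposition, and optional unigram output for
/// unknown words.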
pub struct NoriTokenizer {
tokenizer: Tokenizer,
decompound_mode: DecompoundMode,
output_unknown_unigrams: bool,
}
impl NoriTokenizer {
pub fn new(decompound_mode: DecompoundMode, output_unknown_unigrams: bool) -> Result<Self> {
Ok(Self {
tokenizer: Tokenizer::new()?,
decompound_mode,
output_unknown_unigrams,
})
}
pub fn with_dict(
dict_path: &str,
decompound_mode: DecompoundMode,
output_unknown_unigrams: bool,
) -> Result<Self> {
Ok(Self {
tokenizer: Tokenizer::with_dict(dict_path)?,
decompound_mode,
output_unknown_unigrams,
})
}
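    /// Tokenizes `text` into Nori-compatible tokens, applying the configured
    /// decompound mode and unknown-unigram handling.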
pub fn tokenize(&mut self, text: &str) -> Result<Vec<NoriToken>> {
let mecab_tokens = self.tokenizer.tokenize(text);
let mut nori_tokens = Vec::new();
for token in &mecab_tokens {
let nori_token = self.convert_token(token, text);
nori_tokens.extend(nori_token);
}
Ok(nori_tokens)
}
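    /// Converts one MeCab token into Nori tokens: map the POS tag, decompound
    /// common/proper nouns, and split unknown tokens into unigrams when
    /// requested.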
fn convert_token(&self, token: &Token, text: &str) -> Vec<NoriToken> {
let pos_tag = token.pos.parse::<PosTag>().unwrap_or(PosTag::Unknown);
let nori_tag = pos_tag.to_nori_compat();
let mut tokens = vec![NoriToken {
surface: token.surface.clone(),
pos_tag: nori_tag.as_str().to_string(),
start_offset: char_offset(text, token.start_byte),
end_offset: char_offset(text, token.end_byte),
lemma: token.lemma.clone(),
reading: token.reading.clone(),
word_type: if pos_tag == PosTag::Unknown {
WordType::Unknown
} else {
WordType::Known
},
is_decompound: false,
}];
if self.should_decompound(pos_tag) {
let decompounded = Self::decompound_token_enhanced(token, text);
tokens = self.apply_decompound_mode(tokens, decompounded);
}
if self.output_unknown_unigrams && pos_tag == PosTag::Unknown {
tokens = Self::split_unknown_to_unigrams(token, text);
}
tokens
}
fn should_decompound(&self, pos_tag: PosTag) -> bool {
self.decompound_mode != DecompoundMode::None && matches!(pos_tag, PosTag::NNG | PosTag::NNP)
}
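    /// Decomposition strategies in priority order: exact dictionary lookup,
    /// suffix stripping, prefix stripping, then the jongseong heuristic.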
fn decompound_token_enhanced(token: &Token, text: &str) -> Vec<NoriToken> {
if let Some(tokens) = Self::try_dict_decompose(token, text) {
return tokens;
}
if let Some(tokens) = Self::try_extract_suffix(token, text) {
return tokens;
}
if let Some(tokens) = Self::try_extract_prefix(token, text) {
return tokens;
}
Self::decompound_token(token, text)
}
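    /// Looks the surface up in `COMPOUND_DICT` and lays the parts out over
    /// the token's byte range; parts are assumed to concatenate to the
    /// compound's surface form.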
fn try_dict_decompose(token: &Token, text: &str) -> Option<Vec<NoriToken>> {
let surface = &token.surface;
for (compound, parts) in COMPOUND_DICT {
if *compound == surface {
if parts.len() <= 1 {
return None;
}
let mut result = Vec::with_capacity(parts.len());
let mut byte_offset = token.start_byte;
for (part_surface, part_pos) in *parts {
let part_bytes = part_surface.len();
result.push(NoriToken {
surface: (*part_surface).to_string(),
pos_tag: (*part_pos).to_string(),
start_offset: char_offset(text, byte_offset),
end_offset: char_offset(text, byte_offset + part_bytes),
lemma: None,
reading: None,
word_type: WordType::Known,
is_decompound: true,
});
byte_offset += part_bytes;
}
return Some(result);
}
}
None
}
fn try_extract_suffix(token: &Token, text: &str) -> Option<Vec<NoriToken>> {
let surface = &token.surface;
let chars: Vec<char> = surface.chars().collect();
if chars.len() < 2 {
return None;
}
let mut sorted_suffixes: Vec<_> = SUFFIXES.iter().collect();
sorted_suffixes.sort_by_key(|b| std::cmp::Reverse(b.0.len()));
for (suffix, suffix_tag) in sorted_suffixes {
let suffix_chars: Vec<char> = suffix.chars().collect();
if chars.len() > suffix_chars.len()
&& chars[chars.len() - suffix_chars.len()..] == suffix_chars[..]
{
let stem_len = chars.len() - suffix_chars.len();
let stem: String = chars[..stem_len].iter().collect();
let stem_bytes = stem.len();
if stem_len >= 1 {
let result = vec![
NoriToken {
surface: stem,
pos_tag: token.pos.clone(),
start_offset: char_offset(text, token.start_byte),
end_offset: char_offset(text, token.start_byte + stem_bytes),
lemma: None,
reading: None,
word_type: WordType::Known,
is_decompound: true,
},
NoriToken {
surface: (*suffix).to_string(),
pos_tag: (*suffix_tag).to_string(),
start_offset: char_offset(text, token.start_byte + stem_bytes),
end_offset: char_offset(text, token.end_byte),
lemma: None,
reading: None,
word_type: WordType::Known,
is_decompound: true,
},
];
return Some(result);
}
}
}
None
}
fn try_extract_prefix(token: &Token, text: &str) -> Option<Vec<NoriToken>> {
let surface = &token.surface;
let chars: Vec<char> = surface.chars().collect();
if chars.len() < 2 {
return None;
}
let mut sorted_prefixes: Vec<_> = PREFIXES.iter().collect();
sorted_prefixes.sort_by_key(|b| std::cmp::Reverse(b.0.len()));
for (prefix, prefix_tag) in sorted_prefixes {
let prefix_chars: Vec<char> = prefix.chars().collect();
if chars.len() > prefix_chars.len() && chars[..prefix_chars.len()] == prefix_chars[..] {
let rest: String = chars[prefix_chars.len()..].iter().collect();
let prefix_bytes = prefix.len();
let rest_len = chars.len() - prefix_chars.len();
if rest_len >= 2 {
let result = vec![
NoriToken {
surface: (*prefix).to_string(),
pos_tag: (*prefix_tag).to_string(),
start_offset: char_offset(text, token.start_byte),
end_offset: char_offset(text, token.start_byte + prefix_bytes),
lemma: None,
reading: None,
word_type: WordType::Known,
is_decompound: true,
},
NoriToken {
surface: rest,
pos_tag: token.pos.clone(),
start_offset: char_offset(text, token.start_byte + prefix_bytes),
end_offset: char_offset(text, token.end_byte),
lemma: None,
reading: None,
word_type: WordType::Known,
is_decompound: true,
},
];
return Some(result);
}
}
}
None
}
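    /// Heuristic fallback: split an all-Hangul noun of three or more
    /// syllables at jongseong transitions.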
fn decompound_token(token: &Token, text: &str) -> Vec<NoriToken> {
use mecab_ko_hangul::{has_jongseong, is_hangul_syllable};
let surface = &token.surface;
let chars: Vec<char> = surface.chars().collect();
if chars.len() < 3 {
return Vec::new();
}
if !chars.iter().all(|&c| is_hangul_syllable(c)) {
return Vec::new();
}
        // Collect candidate split points where the jongseong (final
        // consonant) pattern changes between adjacent syllables; such
        // transitions often coincide with morpheme boundaries in Sino-Korean
        // compounds.
        let mut split_positions = Vec::new();
        for i in 1..chars.len() - 1 {
            let prev_has_jong = has_jongseong(chars[i - 1]) == Some(true);
            let curr_has_jong = has_jongseong(chars[i]) == Some(true);
            let is_boundary = if prev_has_jong != curr_has_jong {
                true
            } else if prev_has_jong && curr_has_jong && i >= 2 {
                has_jongseong(chars[i - 2]) == Some(true)
            } else {
                false
            };
            if is_boundary {
                split_positions.push(i);
            }
        }
        // Fall back to a middle split when no boundary was found; the length
        // check above guarantees at least three syllables, so both halves
        // are non-empty.
        if split_positions.is_empty() {
            split_positions.push(chars.len() / 2);
        }
        // Keep at most the first and last split points (three parts) to
        // avoid over-segmentation.
        if split_positions.len() > 2 {
            let first = split_positions[0];
            let last = split_positions[split_positions.len() - 1];
            split_positions = vec![first, last];
        }
let mut result = Vec::new();
let mut start_idx = 0;
let mut byte_offset = token.start_byte;
for &split_pos in &split_positions {
if split_pos <= start_idx {
continue;
}
let part: String = chars[start_idx..split_pos].iter().collect();
let part_len_bytes = part.len();
if !part.is_empty() && split_pos - start_idx >= 1 {
result.push(NoriToken {
surface: part,
pos_tag: token.pos.clone(),
start_offset: char_offset(text, byte_offset),
end_offset: char_offset(text, byte_offset + part_len_bytes),
lemma: None,
reading: None,
word_type: WordType::Known,
is_decompound: true,
});
}
byte_offset += part_len_bytes;
start_idx = split_pos;
}
if start_idx < chars.len() {
let part: String = chars[start_idx..].iter().collect();
let part_len_bytes = part.len();
if !part.is_empty() {
result.push(NoriToken {
surface: part,
pos_tag: token.pos.clone(),
start_offset: char_offset(text, byte_offset),
end_offset: char_offset(text, byte_offset + part_len_bytes),
lemma: None,
reading: None,
word_type: WordType::Known,
is_decompound: true,
});
}
}
result
}
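    /// Combines the original token with its decomposition according to the
    /// mode: `None` keeps the original, `Discard` replaces it with the parts
    /// when any exist, and `Mixed` emits the original followed by the parts.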
fn apply_decompound_mode(
&self,
original: Vec<NoriToken>,
decompounded: Vec<NoriToken>,
) -> Vec<NoriToken> {
match self.decompound_mode {
DecompoundMode::None => original,
DecompoundMode::Discard => {
if decompounded.is_empty() {
original
} else {
decompounded
}
}
DecompoundMode::Mixed => {
let mut result = original;
result.extend(decompounded);
result
}
}
}
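    /// Emits one token per character of an unknown word; used when the
    /// `output_unknown_unigrams` flag is set.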
fn split_unknown_to_unigrams(token: &Token, text: &str) -> Vec<NoriToken> {
let chars: Vec<char> = token.surface.chars().collect();
let mut tokens = Vec::new();
        // Byte cursor into the original text, advanced per character.
        let mut byte_pos = token.start_byte;
        for ch in chars {
            let surface = ch.to_string();
            let char_len = ch.len_utf8();
            tokens.push(NoriToken {
                surface,
                pos_tag: "UNKNOWN".to_string(),
                start_offset: char_offset(text, byte_pos),
                end_offset: char_offset(text, byte_pos + char_len),
                lemma: None,
                reading: None,
                word_type: WordType::Unknown,
                is_decompound: false,
            });
            byte_pos += char_len;
        }
tokens
}
}
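/// Analyzer that combines [`NoriTokenizer`] with Nori-style stop-tag
/// filtering: tokens whose POS tag is in the stop-tag set are dropped.
///
/// A minimal usage sketch (`ignore`d in doc tests because it needs the MeCab
/// dictionary available at runtime):
///
/// ```ignore
/// let mut analyzer = NoriAnalyzer::default_with_decompound(DecompoundMode::Discard)?;
/// for tok in analyzer.analyze("자연어처리")? {
///     println!("{} [{}] {}..{}", tok.surface, tok.pos_tag, tok.start_offset, tok.end_offset);
/// }
/// ```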
pub struct NoriAnalyzer {
tokenizer: NoriTokenizer,
stoptags: HashSet<String>,
_user_dictionary: Option<String>,
}
impl NoriAnalyzer {
pub fn new(
user_dictionary: Option<String>,
decompound_mode: DecompoundMode,
stoptags: Vec<String>,
output_unknown_unigrams: bool,
) -> Result<Self> {
Ok(Self {
tokenizer: NoriTokenizer::new(decompound_mode, output_unknown_unigrams)?,
stoptags: stoptags.into_iter().collect(),
_user_dictionary: user_dictionary,
})
}
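    /// Builds an analyzer with a minimal default stop-tag set: particles
    /// (`J`) and endings (`E`).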
pub fn default_with_decompound(decompound_mode: DecompoundMode) -> Result<Self> {
Self::new(
None,
decompound_mode,
vec!["J".to_string(), "E".to_string()],
false,
)
}
pub fn analyze(&mut self, text: &str) -> Result<Vec<NoriToken>> {
let tokens = self.tokenizer.tokenize(text)?;
Ok(self.filter_stoptags(tokens))
}
fn filter_stoptags(&self, tokens: Vec<NoriToken>) -> Vec<NoriToken> {
if self.stoptags.is_empty() {
return tokens;
}
tokens
.into_iter()
.filter(|token| !self.stoptags.contains(&token.pos_tag))
.collect()
}
pub fn add_stoptag(&mut self, tag: String) {
self.stoptags.insert(tag);
}
pub fn remove_stoptag(&mut self, tag: &str) -> bool {
self.stoptags.remove(tag)
}
#[must_use]
pub fn stoptags(&self) -> Vec<&str> {
self.stoptags.iter().map(String::as_str).collect()
}
}
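/// Maps a fine-grained MeCab POS tag to its coarse Nori-compatible tag
/// (e.g. `JKS` -> `J`); unrecognized tags pass through unchanged.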
#[must_use]
pub fn mecab_to_nori_tag(mecab_tag: &str) -> String {
mecab_tag.parse::<PosTag>().map_or_else(
|_| mecab_tag.to_string(),
|tag| tag.to_nori_compat().as_str().to_string(),
)
}
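/// Maps a coarse Nori tag back to a representative MeCab tag. The mapping is
/// lossy: `J` and `E` each cover several MeCab tags, so one member is chosen.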
#[must_use]
pub fn nori_to_mecab_tag(nori_tag: &str) -> String {
match nori_tag {
"J" => "JX".to_string(),
"E" => "EF".to_string(),
_ => nori_tag.to_string(),
}
}
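/// Converts a byte offset into a character offset. Offsets past the end are
/// clamped, but the offset must lie on a UTF-8 character boundary or the
/// slice below panics.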
fn char_offset(text: &str, byte_offset: usize) -> usize {
text[..byte_offset.min(text.len())].chars().count()
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
use super::*;
#[test]
fn test_decompound_mode_from_str() {
assert_eq!(DecompoundMode::parse("none"), Some(DecompoundMode::None));
assert_eq!(
DecompoundMode::parse("discard"),
Some(DecompoundMode::Discard)
);
assert_eq!(DecompoundMode::parse("mixed"), Some(DecompoundMode::Mixed));
assert_eq!(DecompoundMode::parse("NONE"), Some(DecompoundMode::None));
assert_eq!(DecompoundMode::parse("invalid"), None);
}
#[test]
fn test_decompound_mode_as_str() {
assert_eq!(DecompoundMode::None.as_str(), "none");
assert_eq!(DecompoundMode::Discard.as_str(), "discard");
assert_eq!(DecompoundMode::Mixed.as_str(), "mixed");
}
#[test]
fn test_word_type_as_str() {
assert_eq!(WordType::Known.as_str(), "KNOWN");
assert_eq!(WordType::Unknown.as_str(), "UNKNOWN");
assert_eq!(WordType::User.as_str(), "USER");
}
#[test]
fn test_mecab_to_nori_tag() {
assert_eq!(mecab_to_nori_tag("JKS"), "J");
assert_eq!(mecab_to_nori_tag("JKO"), "J");
assert_eq!(mecab_to_nori_tag("JX"), "J");
assert_eq!(mecab_to_nori_tag("EF"), "E");
assert_eq!(mecab_to_nori_tag("EC"), "E");
assert_eq!(mecab_to_nori_tag("ETM"), "E");
assert_eq!(mecab_to_nori_tag("NNG"), "NNG");
assert_eq!(mecab_to_nori_tag("VV"), "VV");
assert_eq!(mecab_to_nori_tag("MAG"), "MAG");
}
#[test]
fn test_nori_to_mecab_tag() {
assert_eq!(nori_to_mecab_tag("J"), "JX");
assert_eq!(nori_to_mecab_tag("E"), "EF");
assert_eq!(nori_to_mecab_tag("NNG"), "NNG");
assert_eq!(nori_to_mecab_tag("VV"), "VV");
}
#[test]
fn test_char_offset() {
let text = "안녕하세요";
assert_eq!(char_offset(text, 0), 0);
        assert_eq!(char_offset(text, 3), 1); // each Hangul syllable is 3 UTF-8 bytes
        assert_eq!(char_offset(text, 6), 2);
        assert_eq!(char_offset(text, 100), 5); // out-of-range offsets clamp to the end
    }
#[test]
fn test_nori_tokenizer_creation() {
let tokenizer = NoriTokenizer::new(DecompoundMode::None, false);
assert!(tokenizer.is_ok());
let tokenizer = NoriTokenizer::new(DecompoundMode::Mixed, true);
assert!(tokenizer.is_ok());
}
#[test]
fn test_nori_analyzer_creation() {
let analyzer = NoriAnalyzer::new(
None,
DecompoundMode::None,
vec!["J".to_string(), "E".to_string()],
false,
);
assert!(analyzer.is_ok());
}
#[test]
fn test_nori_analyzer_default() {
let analyzer = NoriAnalyzer::default_with_decompound(DecompoundMode::Mixed);
assert!(analyzer.is_ok());
let analyzer = analyzer.unwrap();
let stoptags = analyzer.stoptags();
assert_eq!(stoptags.len(), 2);
assert!(stoptags.contains(&"J"));
assert!(stoptags.contains(&"E"));
}
#[test]
fn test_nori_analyzer_stoptag_management() {
let mut analyzer = NoriAnalyzer::default_with_decompound(DecompoundMode::None).unwrap();
assert_eq!(analyzer.stoptags().len(), 2);
analyzer.add_stoptag("SF".to_string());
assert_eq!(analyzer.stoptags().len(), 3);
assert!(analyzer.stoptags().contains(&"SF"));
assert!(analyzer.remove_stoptag("SF"));
assert_eq!(analyzer.stoptags().len(), 2);
assert!(!analyzer.stoptags().contains(&"SF"));
assert!(!analyzer.remove_stoptag("NONEXISTENT"));
}
#[test]
fn test_pos_tag_nori_mapping() {
assert_eq!(PosTag::JKS.to_nori_compat().as_str(), "J");
assert_eq!(PosTag::JKO.to_nori_compat().as_str(), "J");
assert_eq!(PosTag::JX.to_nori_compat().as_str(), "J");
assert_eq!(PosTag::EF.to_nori_compat().as_str(), "E");
assert_eq!(PosTag::EC.to_nori_compat().as_str(), "E");
assert_eq!(PosTag::ETM.to_nori_compat().as_str(), "E");
assert_eq!(PosTag::NNG.to_nori_compat().as_str(), "NNG");
assert_eq!(PosTag::VV.to_nori_compat().as_str(), "VV");
}
#[test]
fn test_tokenizer_basic_functionality() {
let mut tokenizer = NoriTokenizer::new(DecompoundMode::None, false).unwrap();
let result = tokenizer.tokenize("안녕");
assert!(result.is_ok());
let tokens = result.unwrap();
assert!(!tokens.is_empty());
}
#[test]
fn test_analyzer_basic_functionality() {
let mut analyzer = NoriAnalyzer::default_with_decompound(DecompoundMode::None).unwrap();
let result = analyzer.analyze("테스트");
assert!(result.is_ok());
}
#[test]
fn test_decompound_token_basic() {
let token = Token {
surface: "형태소분석".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 5,
start_byte: 0,
            end_byte: 15,
            reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, "형태소분석");
assert!(!result.is_empty(), "Should decompose compound noun");
for part in &result {
assert!(
part.is_decompound,
"All parts should be marked as decompound"
);
assert_eq!(part.pos_tag, "NNG");
assert_eq!(part.word_type, WordType::Known);
}
}
#[test]
fn test_decompound_token_short_word() {
let token = Token {
surface: "사과".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 2,
start_byte: 0,
end_byte: 6,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, "사과");
assert!(result.is_empty(), "Short words should not be decomposed");
}
#[test]
fn test_decompound_token_non_hangul() {
let token = Token {
surface: "ABC".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 3,
start_byte: 0,
end_byte: 3,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, "ABC");
assert!(
result.is_empty(),
"Non-Hangul words should not be decomposed"
);
}
#[test]
fn test_decompound_token_mixed_jongseong() {
let token = Token {
surface: "학교운동장".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 5,
start_byte: 0,
end_byte: 15,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, "학교운동장");
if !result.is_empty() {
for part in &result {
assert!(part.is_decompound);
assert!(!part.surface.is_empty());
assert_eq!(part.pos_tag, "NNG");
}
}
}
#[test]
fn test_decompound_modes_with_compound() {
use super::DecompoundMode;
let test_token = Token {
surface: "형태소분석".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 5,
start_byte: 0,
end_byte: 15,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let tokenizer = NoriTokenizer::new(DecompoundMode::None, false).unwrap();
let pos_tag = test_token.pos.parse::<PosTag>().unwrap();
assert!(!tokenizer.should_decompound(pos_tag));
let tokenizer = NoriTokenizer::new(DecompoundMode::Discard, false).unwrap();
assert!(tokenizer.should_decompound(pos_tag));
let tokenizer = NoriTokenizer::new(DecompoundMode::Mixed, false).unwrap();
assert!(tokenizer.should_decompound(pos_tag));
}
#[test]
fn test_compound_noun_patterns() {
let token = Token {
surface: "대한민국".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 4,
start_byte: 0,
end_byte: 12,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, "대한민국");
assert!(!result.is_empty(), "Should decompose 대한민국");
let token = Token {
surface: "국립국어원".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 5,
start_byte: 0,
end_byte: 15,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, "국립국어원");
assert!(!result.is_empty(), "Should decompose 국립국어원");
}
#[test]
fn test_decompound_offset_accuracy() {
let token = Token {
surface: "형태소분석".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 5,
start_byte: 0,
end_byte: 15,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, "형태소분석");
if !result.is_empty() {
let mut prev_end = 0;
for part in &result {
assert!(
part.start_offset >= prev_end,
"Offsets should not overlap: {} >= {}",
part.start_offset,
prev_end
);
assert!(
part.end_offset > part.start_offset,
"End should be after start: {} > {}",
part.end_offset,
part.start_offset
);
prev_end = part.end_offset;
}
assert_eq!(
result.last().unwrap().end_offset,
5,
"Last token should end at original token end"
);
}
}
#[test]
fn test_decompound_min_syllable_constraint() {
let short_words = vec![
("한글", 2), ("사과", 2), ("바나나", 3), ];
for (word, len) in short_words {
let token = Token {
surface: word.to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: len,
start_byte: 0,
end_byte: word.len(),
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, word);
if len < 3 {
assert!(
result.is_empty(),
"Words with {len} syllables should not decompose: {word}"
);
}
}
}
#[test]
fn test_decompound_preserves_wordtype() {
let token = Token {
surface: "형태소분석".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 5,
start_byte: 0,
end_byte: 15,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token(&token, "형태소분석");
for part in result {
assert_eq!(part.word_type, WordType::Known);
assert!(part.is_decompound);
}
}
#[test]
fn test_mixed_mode_returns_both() {
let mut tokenizer = NoriTokenizer::new(DecompoundMode::Mixed, false).unwrap();
let text = "형태소";
let result = tokenizer.tokenize(text);
assert!(result.is_ok());
}
#[test]
fn test_discard_mode_returns_only_parts() {
let mut tokenizer = NoriTokenizer::new(DecompoundMode::Discard, false).unwrap();
let text = "형태소";
let result = tokenizer.tokenize(text);
assert!(result.is_ok());
}
#[test]
fn test_dict_decompose_basic() {
let token = Token {
surface: "형태소분석기".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 6,
start_byte: 0,
end_byte: 18,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::try_dict_decompose(&token, "형태소분석기");
assert!(result.is_some(), "Should find compound in dictionary");
let parts = result.unwrap();
assert_eq!(parts.len(), 2);
assert_eq!(parts[0].surface, "형태소");
assert_eq!(parts[1].surface, "분석기");
}
#[test]
fn test_dict_decompose_대한민국() {
let token = Token {
surface: "대한민국".to_string(),
pos: "NNP".to_string(),
start_pos: 0,
end_pos: 4,
start_byte: 0,
end_byte: 12,
reading: None,
lemma: None,
cost: 0,
features: "NNP,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::try_dict_decompose(&token, "대한민국");
assert!(result.is_some(), "Should find 대한민국 in dictionary");
let parts = result.unwrap();
assert_eq!(parts.len(), 2);
assert_eq!(parts[0].surface, "대한");
assert_eq!(parts[0].pos_tag, "NNP");
assert_eq!(parts[1].surface, "민국");
}
#[test]
fn test_enhanced_suffix_extraction() {
let token = Token {
surface: "현대화".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 3,
start_byte: 0,
end_byte: 9,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::try_extract_suffix(&token, "현대화");
assert!(result.is_some(), "Should extract suffix 화");
let parts = result.unwrap();
assert_eq!(parts.len(), 2);
assert_eq!(parts[0].surface, "현대");
assert_eq!(parts[1].surface, "화");
assert_eq!(parts[1].pos_tag, "XSN");
}
#[test]
fn test_enhanced_prefix_extraction() {
let token = Token {
surface: "초고속".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 3,
start_byte: 0,
end_byte: 9,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::try_extract_prefix(&token, "초고속");
assert!(result.is_some(), "Should extract prefix 초");
let parts = result.unwrap();
assert_eq!(parts.len(), 2);
assert_eq!(parts[0].surface, "초");
assert_eq!(parts[0].pos_tag, "XPN");
assert_eq!(parts[1].surface, "고속");
}
#[test]
fn test_decompound_enhanced_priority() {
let token = Token {
surface: "형태소분석".to_string(),
pos: "NNG".to_string(),
start_pos: 0,
end_pos: 5,
start_byte: 0,
end_byte: 15,
reading: None,
lemma: None,
cost: 0,
features: "NNG,*,*,*,*,*,*,*".to_string(),
normalized: None,
};
let result = NoriTokenizer::decompound_token_enhanced(&token, "형태소분석");
assert_eq!(result.len(), 2);
assert_eq!(result[0].surface, "형태소");
assert_eq!(result[1].surface, "분석");
}
#[test]
fn test_multiple_suffix_entries() {
assert!(SUFFIXES.len() > 10, "Should have many suffix entries");
assert!(
SUFFIXES.iter().any(|(s, _)| *s == "화"),
"Should contain 화"
);
assert!(
SUFFIXES.iter().any(|(s, _)| *s == "적"),
"Should contain 적"
);
assert!(
SUFFIXES.iter().any(|(s, _)| *s == "쟁이"),
"Should contain 쟁이"
);
}
#[test]
fn test_multiple_prefix_entries() {
assert!(PREFIXES.len() > 10, "Should have many prefix entries");
assert!(
PREFIXES.iter().any(|(p, _)| *p == "초"),
"Should contain 초"
);
assert!(
PREFIXES.iter().any(|(p, _)| *p == "최"),
"Should contain 최"
);
assert!(
PREFIXES.iter().any(|(p, _)| *p == "친"),
"Should contain 친"
);
}
}