use std::collections::HashMap;
use super::hangul::{remove_jongseong_bieup, remove_jongseong_nieun, remove_jongseong_rieul};
use super::types::EndingRule;
/// Reports whether `pos` is a compound tag, i.e. whether it contains at least one `+`
/// joining several component tags (as in `VV+EF`).
#[must_use]
pub(super) fn is_compound_tag(pos: &str) -> bool {
    // `+` is ASCII, so scanning the UTF-8 bytes is equivalent to searching for the character.
    pos.bytes().any(|byte| byte == b'+')
}
/// Resolves a POS tag into its component tags.
///
/// An entry for `pos` in `tag_map` always wins and is returned as a copy. Without an entry,
/// the tag is cut at every `+`; a tag with no `+` therefore comes back as a one-element
/// vector holding the tag itself.
#[must_use]
pub(super) fn split_compound_tag<S: std::hash::BuildHasher>(
    tag_map: &HashMap<String, Vec<String>, S>,
    pos: &str,
) -> Vec<String> {
    match tag_map.get(pos) {
        Some(components) => components.clone(),
        // `split` yields the whole input as a single piece when there is no `+`.
        None => pos.split('+').map(str::to_owned).collect(),
    }
}
/// Splits `surface`, tagged with the (possibly compound) tag `pos`, into `(surface, tag)` pairs.
///
/// A tag without `+` is returned unchanged as a single pair. For a compound tag the checks
/// below are tried strictly in source order and the first one that matches decides the
/// result: hard-coded surface/tag overrides, the contracted-stem helpers, the caller-supplied
/// `ending_rules`, and finally single-syllable `VV+ETM` / `VA+ETM` forms. When nothing
/// matches, the whole surface is returned under the first component tag.
///
/// * `surface` - the text of the token to split.
/// * `pos` - its POS tag; component tags are joined with `+`.
/// * `tag_map` - overrides for how a compound tag decomposes (see `split_compound_tag`).
/// * `ending_rules` - rules consulted after all built-in special cases have failed.
///
/// Note that the number of returned pairs does not always equal the number of component
/// tags: several branches merge or retag components.
#[must_use]
pub(super) fn split_morpheme<S: std::hash::BuildHasher>(
surface: &str,
pos: &str,
tag_map: &HashMap<String, Vec<String>, S>,
ending_rules: &[EndingRule],
) -> Vec<(String, String)> {
// Simple (non-compound) tags need no splitting.
if !is_compound_tag(pos) {
return vec![(surface.to_string(), pos.to_string())];
}
// The single syllable 셨 tagged EP+EP is expanded to 시 + 었.
if pos == "EP+EP" {
if surface == "셨" {
return vec![
("시".to_string(), "EP".to_string()),
("었".to_string(), "EP".to_string()),
];
}
}
// When the first two component tags are identical the surface is kept whole under that
// tag. EP+EP is excluded here, so other EP+EP surfaces continue to the checks below.
let tags = split_compound_tag(tag_map, pos);
if tags.len() >= 2 && tags[0] == tags[1] && pos != "EP+EP" {
return vec![(surface.to_string(), tags[0].clone())];
}
// 입니다 / 입니까 are emitted as 이/VCP + 습니다|습니까/EF, both when tagged EP+EF ...
if pos == "EP+EF" {
if surface == "입니다" {
return vec![
("이".to_string(), "VCP".to_string()),
("습니다".to_string(), "EF".to_string()),
];
} else if surface == "입니까" {
return vec![
("이".to_string(), "VCP".to_string()),
("습니까".to_string(), "EF".to_string()),
];
}
}
// ... and when tagged VCP+EF.
if pos == "VCP+EF" {
if surface == "입니다" {
return vec![
("이".to_string(), "VCP".to_string()),
("습니다".to_string(), "EF".to_string()),
];
} else if surface == "입니까" {
return vec![
("이".to_string(), "VCP".to_string()),
("습니까".to_string(), "EF".to_string()),
];
}
}
// Fixed overrides that keep the surface whole but retag it as a single ending.
if pos == "VV+EC" && surface == "는다" {
return vec![("는다".to_string(), "EF".to_string())];
}
if pos == "EP+EC" && surface == "며" {
return vec![("며".to_string(), "EC".to_string())];
}
// "...야겠다" under VV+EC+EP+EF: everything before the last three characters is the stem,
// and the EC and EP slots are merged into one 아야겠/EP morpheme (three pairs, not four).
if pos == "VV+EC+EP+EF" && surface.ends_with("야겠다") && surface.chars().count() >= 4 {
let stem_len = surface.chars().count() - 3; let stem: String = surface.chars().take(stem_len).collect();
return vec![
(stem, "VV".to_string()),
("아야겠".to_string(), "EP".to_string()),
("다".to_string(), "EF".to_string()),
];
}
// "...어지다" / "...아지다" under VV+VX+EF: stem + 어지|아지/VX + 다/EF.
// The length check guarantees a non-empty stem.
if pos == "VV+VX+EF" {
if surface.ends_with("어지다") && surface.chars().count() >= 4 {
let stem: String = surface.chars().take(surface.chars().count() - 3).collect();
return vec![
(stem, "VV".to_string()),
("어지".to_string(), "VX".to_string()),
("다".to_string(), "EF".to_string()),
];
}
if surface.ends_with("아지다") && surface.chars().count() >= 4 {
let stem: String = surface.chars().take(surface.chars().count() - 3).collect();
return vec![
(stem, "VV".to_string()),
("아지".to_string(), "VX".to_string()),
("다".to_string(), "EF".to_string()),
];
}
}
// Special cases that apply only to VV+EF.
if pos == "VV+EF" {
// Dictionary forms listed here are split by cutting off the final 다 only, so the
// stem keeps its full spelling.
let causative_verbs = [
"웃기다",
"울리다",
"높이다",
"낮추다",
"늘이다",
"줄이다",
"살리다",
"죽이다",
"알리다",
"먹이다",
"재우다",
"깨우다",
"들리다",
"놀리다",
];
if causative_verbs.contains(&surface) {
let stem_len = surface.chars().count() - 1;
let stem: String = surface.chars().take(stem_len).collect();
return vec![
(stem, "VV".to_string()),
("다".to_string(), "EF".to_string()),
];
}
// "?니다" / "?니까": only surfaces of exactly three characters are split, and only when
// `remove_jongseong_bieup` accepts the first syllable.
// NOTE(review): presumably that helper returns the syllable without its final ㅂ
// (e.g. 갑 -> 가) — confirm in `hangul`.
if surface.ends_with("니다") && surface.chars().count() >= 3 {
let chars: Vec<char> = surface.chars().collect();
let first_char = chars[0];
if let Some(stem) = remove_jongseong_bieup(first_char) {
if chars.len() == 3 {
return vec![
(stem.to_string(), "VV".to_string()),
("ㅂ니다".to_string(), "EF".to_string()),
];
}
}
}
if surface.ends_with("니까") && surface.chars().count() >= 3 {
let chars: Vec<char> = surface.chars().collect();
let first_char = chars[0];
if let Some(stem) = remove_jongseong_bieup(first_char) {
if chars.len() == 3 {
return vec![
(stem.to_string(), "VV".to_string()),
("ㅂ니까".to_string(), "EF".to_string()),
];
}
}
}
// 게요 / 까요 / 까 / 래요: the syllable immediately before the ending is passed to
// `remove_jongseong_rieul`; on success the stem is the preceding text plus the returned
// syllable, and the ending is emitted with a leading ㄹ.
// NOTE(review): presumably that helper strips a final ㄹ (e.g. 할 -> 하) — confirm in `hangul`.
if surface.ends_with("게요") && surface.chars().count() >= 3 {
let chars: Vec<char> = surface.chars().collect();
let stem_char = chars[chars.len() - 3]; if let Some(stem) = remove_jongseong_rieul(stem_char) {
let prefix: String = chars[..chars.len() - 3].iter().collect();
let full_stem = format!("{prefix}{stem}");
return vec![
(full_stem, "VV".to_string()),
("ㄹ게요".to_string(), "EF".to_string()),
];
}
}
if surface.ends_with("까요") && surface.chars().count() >= 3 {
let chars: Vec<char> = surface.chars().collect();
let stem_char = chars[chars.len() - 3]; if let Some(stem) = remove_jongseong_rieul(stem_char) {
let prefix: String = chars[..chars.len() - 3].iter().collect();
let full_stem = format!("{prefix}{stem}");
return vec![
(full_stem, "VV".to_string()),
("ㄹ까요".to_string(), "EF".to_string()),
];
}
}
// Bare 까 (not 까요, which was handled above): the stem syllable is two from the end.
if surface.ends_with("까") && !surface.ends_with("까요") && surface.chars().count() >= 2
{
let chars: Vec<char> = surface.chars().collect();
let stem_char = chars[chars.len() - 2]; if let Some(stem) = remove_jongseong_rieul(stem_char) {
let prefix: String = chars[..chars.len() - 2].iter().collect();
let full_stem = format!("{prefix}{stem}");
return vec![
(full_stem, "VV".to_string()),
("ㄹ까".to_string(), "EF".to_string()),
];
}
}
if surface.ends_with("래요") && surface.chars().count() >= 3 {
let chars: Vec<char> = surface.chars().collect();
let stem_char = chars[chars.len() - 3]; if let Some(stem) = remove_jongseong_rieul(stem_char) {
let prefix: String = chars[..chars.len() - 3].iter().collect();
let full_stem = format!("{prefix}{stem}");
return vec![
(full_stem, "VV".to_string()),
("ㄹ래요".to_string(), "EF".to_string()),
];
}
}
// Exact-match contracted forms with hard-coded expansions.
if surface == "해요" {
return vec![
("하".to_string(), "VV".to_string()),
("어요".to_string(), "EF".to_string()),
];
}
if surface == "봐요" {
return vec![
("보".to_string(), "VV".to_string()),
("아요".to_string(), "EF".to_string()),
];
}
if surface == "와요" {
return vec![
("오".to_string(), "VV".to_string()),
("아요".to_string(), "EF".to_string()),
];
}
if surface == "해" {
return vec![
("하".to_string(), "VV".to_string()),
("어".to_string(), "EF".to_string()),
];
}
if surface == "돼요" {
return vec![
("되".to_string(), "VV".to_string()),
("어요".to_string(), "EF".to_string()),
];
}
if surface == "돼" {
return vec![
("되".to_string(), "VV".to_string()),
("어".to_string(), "EF".to_string()),
];
}
}
// Contracted-stem helpers: the first handles tags with three components, the second tags
// with two. Each returns `None` when it does not apply.
if let Some(result) = try_split_contracted(surface, pos, tag_map) {
return result;
}
if let Some(result) = try_split_contracted_two_tags(surface, pos, tag_map) {
return result;
}
// Caller-supplied rules: the first rule whose `pos_pattern` equals `pos` and one of whose
// endings is a suffix of the surface wins, provided a non-empty stem remains.
for rule in ending_rules {
if rule.pos_pattern == pos {
for ending in &rule.endings {
if surface.ends_with(ending.as_str()) {
let stem_len = surface.chars().count() - ending.chars().count();
if stem_len > 0 {
let stem: String = surface.chars().take(stem_len).collect();
return create_split_morphemes(&stem, ending, &rule.target_tags);
}
}
}
}
}
// Single-syllable VV+ETM / VA+ETM: the ㄹ helper is tried first, then the ㄴ helper; on
// success the returned syllable becomes the stem and the consonant becomes the ETM.
if (pos == "VV+ETM" || pos == "VA+ETM") && surface.chars().count() == 1 {
// Cannot fail: the condition above guarantees exactly one character.
let ch = surface.chars().next().unwrap();
if let Some(stem_char) = remove_jongseong_rieul(ch) {
return vec![
(
stem_char.to_string(),
if pos.starts_with("VV") {
"VV".to_string()
} else {
"VA".to_string()
},
),
("ㄹ".to_string(), "ETM".to_string()),
];
}
if let Some(stem_char) = remove_jongseong_nieun(ch) {
return vec![
(
stem_char.to_string(),
if pos.starts_with("VV") {
"VV".to_string()
} else {
"VA".to_string()
},
),
("ㄴ".to_string(), "ETM".to_string()),
];
}
}
// Fallback: keep the surface whole under the first component tag.
let tags = split_compound_tag(tag_map, pos);
if tags.len() > 1 {
return vec![(surface.to_string(), tags[0].clone())];
}
// Reached only when `tag_map` resolves a compound `pos` to fewer than two tags.
vec![(surface.to_string(), pos.to_string())]
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::sejong::ending_rules::init_ending_rules;
    use crate::sejong::tag_map::tag_map;

    /// Tag map obtained from `crate::sejong::tag_map`.
    fn make_tag_map() -> &'static HashMap<String, Vec<String>> {
        tag_map()
    }

    /// Ending rules obtained from `crate::sejong::ending_rules`.
    fn make_rules() -> Vec<crate::sejong::types::EndingRule> {
        init_ending_rules()
    }

    /// Builds an owned `(surface, tag)` pair for comparisons.
    fn morph(surface: &str, tag: &str) -> (String, String) {
        (surface.to_string(), tag.to_string())
    }

    /// Builds an owned tag list for comparisons.
    fn owned_tags(tags: &[&str]) -> Vec<String> {
        tags.iter().map(|tag| (*tag).to_string()).collect()
    }

    #[test]
    fn test_is_compound_tag_with_plus() {
        for pos in &["VV+EF", "VA+EP+EF", "NNG+JKS"] {
            assert!(is_compound_tag(pos));
        }
    }

    #[test]
    fn test_is_compound_tag_without_plus() {
        for pos in &["NNG", "VV", "EF"] {
            assert!(!is_compound_tag(pos));
        }
    }

    #[test]
    fn test_split_compound_tag_known_pattern() {
        let tag_map = make_tag_map();
        assert_eq!(
            split_compound_tag(tag_map, "VV+EF"),
            owned_tags(&["VV", "EF"])
        );
        assert_eq!(
            split_compound_tag(tag_map, "VA+EP+EF"),
            owned_tags(&["VA", "EP", "EF"])
        );
    }

    #[test]
    fn test_split_compound_tag_unknown_falls_back_to_simple_split() {
        let components = split_compound_tag(make_tag_map(), "XSV+EF");
        assert_eq!(components, owned_tags(&["XSV", "EF"]));
    }

    #[test]
    fn test_split_morpheme_simple_non_compound() {
        let ending_rules = make_rules();
        let morphemes = split_morpheme("먹", "VV", make_tag_map(), &ending_rules);
        assert_eq!(morphemes, vec![morph("먹", "VV")]);
    }

    #[test]
    fn test_split_morpheme_haeyo_contraction() {
        let ending_rules = make_rules();
        let morphemes = split_morpheme("해요", "VV+EF", make_tag_map(), &ending_rules);
        assert_eq!(morphemes, vec![morph("하", "VV"), morph("어요", "EF")]);
    }

    #[test]
    fn test_split_morpheme_imnida_vcp() {
        let ending_rules = make_rules();
        let morphemes = split_morpheme("입니다", "VCP+EF", make_tag_map(), &ending_rules);
        assert_eq!(morphemes, vec![morph("이", "VCP"), morph("습니다", "EF")]);
    }

    #[test]
    fn test_split_morpheme_ep_ep_syeosseo() {
        let ending_rules = make_rules();
        let morphemes = split_morpheme("셨", "EP+EP", make_tag_map(), &ending_rules);
        assert_eq!(morphemes, vec![morph("시", "EP"), morph("었", "EP")]);
    }
}
/// Expands a contracted past-tense syllable inside `surface` for tags with exactly three
/// components, producing `stem / pre-final ending / ending`.
///
/// The surface is scanned left to right. At the first position whose syllable is one of the
/// known contractions (e.g. 했 = 하 + 았) and whose remaining text ends with one of the
/// accepted ending suffixes, the function returns:
///
/// 1. the text before the syllable plus the uncontracted stem, tagged with the first tag;
/// 2. the restored pre-final ending, tagged with the second tag;
/// 3. everything after the syllable, tagged with the third tag.
///
/// Returns `None` when `pos` does not resolve to three tags or no position qualifies.
fn try_split_contracted<S: std::hash::BuildHasher>(
    surface: &str,
    pos: &str,
    tag_map: &HashMap<String, Vec<String>, S>,
) -> Option<Vec<(String, String)>> {
    // (contracted syllable, uncontracted stem syllable, restored pre-final ending)
    const CONTRACTED_STEMS: [(&str, &str, &str); 10] = [
        ("했", "하", "았"),
        ("갔", "가", "았"),
        ("왔", "오", "았"),
        ("봤", "보", "았"),
        ("샀", "사", "았"),
        ("잤", "자", "았"),
        ("됐", "되", "었"),
        ("났", "나", "았"),
        ("랐", "라", "았"),
        ("섰", "서", "었"),
    ];
    // Suffixes the text after the contracted syllable must end with.
    const ENDING_SUFFIXES: [&str; 12] = [
        "어요",
        "어",
        "다",
        "지",
        "니",
        "나",
        "습니다",
        "습니까",
        "다고",
        "라고",
        "냐고",
        "자고",
    ];

    let tags = split_compound_tag(tag_map, pos);
    let (stem_tag, prefinal_tag, ending_tag) = match tags.as_slice() {
        [first, second, third] => (first, second, third),
        _ => return None,
    };

    for (start, syllable) in surface.char_indices() {
        let end = start + syllable.len_utf8();
        let current = &surface[start..end];
        let suffix = &surface[end..];

        let expansion = CONTRACTED_STEMS.iter().find(|entry| entry.0 == current);
        if let Some(&(_, stem, prefinal)) = expansion {
            // An empty suffix can never end with a (non-empty) pattern, so a contracted
            // syllable in final position is skipped automatically.
            if ENDING_SUFFIXES
                .iter()
                .any(|pattern| suffix.ends_with(*pattern))
            {
                let prefix = &surface[..start];
                return Some(vec![
                    (format!("{prefix}{stem}"), stem_tag.clone()),
                    (prefinal.to_string(), prefinal_tag.clone()),
                    (suffix.to_string(), ending_tag.clone()),
                ]);
            }
        }
    }
    None
}
/// Expands a contracted first syllable of `surface` for tags with exactly two components.
///
/// Two families of contraction are recognised, both keyed on the first syllable only:
///
/// * Vowel contractions (해, 돼, 봬): returns `stem` under the first tag and the restored
///   vowel followed by the rest of the surface under the second tag
///   (e.g. 해요 -> 하 + 어요; bare 해 -> 하 + 어).
/// * Past-tense contractions (봤, 갔, 왔, 샀, 잤, 됐, 했): returns `stem` under the first tag,
///   the restored pre-final ending tagged `EP`, and, if any text follows the contracted
///   syllable, that text under the second tag.
///
/// When the surface is only the contracted past syllable there is no text left for the
/// second tag; in that case just the stem and the `EP` morpheme are returned, so callers
/// never receive a morpheme with an empty surface.
///
/// Returns `None` when `pos` does not resolve to two tags, when `surface` is empty, or when
/// the first syllable is not a known contraction.
fn try_split_contracted_two_tags<S: std::hash::BuildHasher>(
    surface: &str,
    pos: &str,
    tag_map: &HashMap<String, Vec<String>, S>,
) -> Option<Vec<(String, String)>> {
    // (contracted syllable, uncontracted stem syllable, restored ending vowel)
    const VOWEL_CONTRACTIONS: [(&str, &str, &str); 3] = [
        ("해", "하", "어"),
        ("돼", "되", "어"),
        ("봬", "뵈", "어"),
    ];
    // (contracted syllable, uncontracted stem syllable, restored pre-final ending)
    const PAST_CONTRACTIONS: [(&str, &str, &str); 7] = [
        ("봤", "보", "았"),
        ("갔", "가", "았"),
        ("왔", "오", "았"),
        ("샀", "사", "았"),
        ("잤", "자", "았"),
        ("됐", "되", "었"),
        ("했", "하", "았"),
    ];

    let tags = split_compound_tag(tag_map, pos);
    let (stem_tag, ending_tag) = match tags.as_slice() {
        [first, second] => (first, second),
        _ => return None,
    };

    let mut chars = surface.chars();
    let first_syllable = chars.next()?;
    let head = &surface[..first_syllable.len_utf8()];
    let rest = chars.as_str();

    if let Some(&(_, stem, vowel)) = VOWEL_CONTRACTIONS.iter().find(|entry| entry.0 == head) {
        // With an empty `rest` this yields just the vowel, which covers the bare 해/돼/봬 case.
        return Some(vec![
            (stem.to_string(), stem_tag.clone()),
            (format!("{vowel}{rest}"), ending_tag.clone()),
        ]);
    }

    let &(_, stem, prefinal) = PAST_CONTRACTIONS.iter().find(|entry| entry.0 == head)?;
    let mut morphemes = vec![
        (stem.to_string(), stem_tag.clone()),
        (prefinal.to_string(), "EP".to_string()),
    ];
    // Only emit the trailing morpheme when there is text for it; an empty surface is not a
    // morpheme.
    if !rest.is_empty() {
        morphemes.push((rest.to_string(), ending_tag.clone()));
    }
    Some(morphemes)
}
/// Pairs `stem` and the pieces of `ending` with the given target `tags`.
///
/// The stem always takes `tags[0]`. How `ending` is distributed depends on the tag count:
///
/// * 2 tags - the whole ending takes the second tag;
/// * 3 tags - the ending is cut in two by `split_causative_ending` when the last two tags
///   are `VX` and `EF`, otherwise by `split_prefinal_ending`;
/// * 4 tags - the ending is cut in three by `split_honorific_past_ending`;
/// * 1 tag - only the stem is returned and the ending is dropped;
/// * 5 or more tags - the whole ending takes the last tag.
///
/// # Panics
///
/// Panics if `tags` is empty.
fn create_split_morphemes(stem: &str, ending: &str, tags: &[String]) -> Vec<(String, String)> {
    let stem_morpheme = (stem.to_string(), tags[0].clone());

    match tags {
        [_, ending_tag] => vec![stem_morpheme, (ending.to_string(), ending_tag.clone())],
        [_, middle_tag, last_tag] => {
            let (middle, last) = if middle_tag.as_str() == "VX" && last_tag.as_str() == "EF" {
                split_causative_ending(ending)
            } else {
                split_prefinal_ending(ending)
            };
            vec![
                stem_morpheme,
                (middle, middle_tag.clone()),
                (last, last_tag.clone()),
            ]
        }
        [_, first_tag, second_tag, final_tag] => {
            let (first, second, last) = split_honorific_past_ending(ending);
            vec![
                stem_morpheme,
                (first, first_tag.clone()),
                (second, second_tag.clone()),
                (last, final_tag.clone()),
            ]
        }
        [_] => vec![stem_morpheme],
        // Five or more tags: no finer split is attempted.
        _ => vec![
            stem_morpheme,
            (ending.to_string(), tags[tags.len() - 1].clone()),
        ],
    }
}
/// Cuts `ending` into a leading part and the remainder; the caller tags them `VX` and `EF`.
///
/// If the ending begins with one of the suffixes 이, 히, 리, 기 and more text follows, that
/// suffix is the leading part. Otherwise the first character is the leading part, as long as
/// at least one more character follows. An ending of fewer than two characters is returned
/// whole with an empty remainder.
fn split_causative_ending(ending: &str) -> (String, String) {
    const VOICE_SUFFIXES: [&str; 4] = ["이", "히", "리", "기"];

    let by_suffix = VOICE_SUFFIXES.iter().find_map(|suffix| {
        ending
            .strip_prefix(*suffix)
            .filter(|remainder| !remainder.is_empty())
            .map(|remainder| ((*suffix).to_string(), remainder.to_string()))
    });
    if let Some(parts) = by_suffix {
        return parts;
    }

    // Generic fallback: first character versus everything after it.
    let mut rest = ending.chars();
    match rest.next() {
        Some(head) if !rest.as_str().is_empty() => (head.to_string(), rest.as_str().to_string()),
        _ => (ending.to_string(), String::new()),
    }
}
/// Separates a leading pre-final ending from the final ending that follows it.
///
/// Compound pre-final endings (으셨, 셨, 으시었, 시었, 으시겠, 시겠) are tried before the
/// simple ones (었, 았, 였, 겠, 으시, 시, 더), each list in the order given. The first
/// candidate that is a prefix of `ending` and leaves a non-empty remainder wins, and
/// `(pre-final, remainder)` is returned. If no candidate qualifies, the whole `ending` is
/// returned in the first position with an empty second element.
pub(super) fn split_prefinal_ending(ending: &str) -> (String, String) {
    const COMPOUND_PREFINALS: [&str; 6] = ["으셨", "셨", "으시었", "시었", "으시겠", "시겠"];
    const SIMPLE_PREFINALS: [&str; 7] = ["었", "았", "였", "겠", "으시", "시", "더"];

    COMPOUND_PREFINALS
        .iter()
        .chain(SIMPLE_PREFINALS.iter())
        .find_map(|prefinal| {
            ending
                .strip_prefix(*prefinal)
                .filter(|final_part| !final_part.is_empty())
                .map(|final_part| ((*prefinal).to_string(), final_part.to_string()))
        })
        .unwrap_or_else(|| (ending.to_string(), String::new()))
}
/// Cuts an honorific past ending into `(honorific, past, final ending)`.
///
/// An ending that starts with a contracted or uncontracted honorific-past sequence and has
/// further text after it is decomposed into its underlying morphemes:
///
/// * `셨…` / `시었…` -> `시`, `었`, remainder
/// * `으셨…` / `으시었…` -> `으시`, `었`, remainder
///
/// This covers any final ending (셨습니다, 셨지, 으셨어요, …), not just a fixed list, and
/// mirrors the prefixes `split_prefinal_ending` recognises.
///
/// Endings that do not start with one of these sequences fall back to a positional split:
/// first character, second character, and the rest when there are at least three
/// characters; otherwise the whole ending with two empty strings.
fn split_honorific_past_ending(ending: &str) -> (String, String, String) {
    // (surface prefix, honorific morpheme, past morpheme)
    const HONORIFIC_PAST_PREFIXES: [(&str, &str, &str); 4] = [
        ("으시었", "으시", "었"),
        ("으셨", "으시", "었"),
        ("시었", "시", "었"),
        ("셨", "시", "었"),
    ];

    for (prefix, honorific, past) in &HONORIFIC_PAST_PREFIXES {
        if let Some(final_ending) = ending.strip_prefix(*prefix) {
            // A bare prefix leaves nothing for the final-ending slot; let the fallback
            // handle it.
            if !final_ending.is_empty() {
                return (
                    (*honorific).to_string(),
                    (*past).to_string(),
                    final_ending.to_string(),
                );
            }
        }
    }

    let chars: Vec<char> = ending.chars().collect();
    if chars.len() >= 3 {
        (
            chars[0].to_string(),
            chars[1].to_string(),
            chars[2..].iter().collect(),
        )
    } else {
        (ending.to_string(), String::new(), String::new())
    }
}