use crate::sejong::hangul::extract_vowel;
use crate::sejong::types::SejongToken;
use std::collections::HashMap;
use std::sync::LazyLock;
/// Lazily-built table mapping Korean particle surface forms to their
/// Sejong POS tags: JKS (subject), JKO (object), JKB (adverbial),
/// JKG (genitive), JKV (vocative), JX (auxiliary), JC (conjunctive).
static PARTICLE_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        // Subject markers
        ("이", "JKS"),
        ("가", "JKS"),
        ("께서", "JKS"),
        // Object markers
        ("을", "JKO"),
        ("를", "JKO"),
        // Adverbial case markers
        ("에", "JKB"),
        ("에서", "JKB"),
        ("에게", "JKB"),
        ("로", "JKB"),
        ("으로", "JKB"),
        ("한테", "JKB"),
        ("보다", "JKB"),
        ("처럼", "JKB"),
        ("같이", "JKB"),
        // Genitive
        ("의", "JKG"),
        // Vocative
        ("야", "JKV"),
        ("여", "JKV"),
        ("이여", "JKV"),
        // Auxiliary particles
        ("은", "JX"),
        ("는", "JX"),
        ("도", "JX"),
        ("만", "JX"),
        ("까지", "JX"),
        ("부터", "JX"),
        ("마저", "JX"),
        ("조차", "JX"),
        ("라도", "JX"),
        ("밖에", "JX"),
        ("요", "JX"),
        // Conjunctive particles
        ("와", "JC"),
        ("과", "JC"),
        ("이랑", "JC"),
        ("랑", "JC"),
        ("하고", "JC"),
    ])
});
/// Sejong POS tags treated as nominal for the particle-correction pass:
/// common noun, proper noun, bound noun, pronoun, numeral.
const NOUN_POSES: &[&str] = &["NNG", "NNP", "NNB", "NP", "NR"];
/// Interrogative words; after one of these, an ending reading is kept
/// instead of forcing a particle retag.
const INTERROGATIVES: &[&str] = &[
    "어디", "언제", "뭐", "무엇", "누구", "어느", "어떤", "왜", "어찌",
];
/// POS tags treated as verbal (verb, adjective, auxiliary) when deciding
/// whether a following token should be retagged as an adnominal ending.
const VERB_POSES: &[&str] = &["VV", "VA", "VX"];
/// Surface forms that, when they follow a verbal token but were tagged as
/// a particle or final/connective ending, should be retagged as ETM
/// (adnominal ending). Every entry maps to "ETM"; a map is kept (rather
/// than a set) so the lookup yields the replacement tag directly.
static ETM_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        ("는", "ETM"),
        ("ㄴ", "ETM"),
        ("은", "ETM"),
        ("ㄹ", "ETM"),
        ("을", "ETM"),
        ("던", "ETM"),
    ])
});
/// Noun tags after which 하/되-type tokens are retagged as XSV
/// (verb-derivational suffix).
const XSV_TRIGGER_POSES: &[&str] = &["NNG", "NNP", "NNB"];
/// Surface forms of the 하다/되다 light-verb family eligible for XSV retagging.
const XSV_PATTERNS: &[&str] = &["하", "해", "했", "되", "됐"];
/// Ending surfaces that, when a sentence-final token is tagged EC, are
/// promoted to EF (sentence-final ending).
const FINAL_ENDINGS: &[&str] = &["니", "다", "요", "죠", "지", "나", "자"];
/// Common verbs in dictionary (…다) form; when one appears as a single VV
/// token it is split into stem (VV) + "다" (EF).
const BASE_VERBS: &[&str] = &[
    "가다", "오다", "보다", "먹다", "되다", "주다", "받다", "쓰다", "읽다", "듣다", "말다", "살다",
    "죽다", "자다", "일다", "앉다", "서다", "놓다", "두다", "치다", "잡다", "놀다", "울다",
];
/// Fused possessive forms (pronoun + 의) that should be split into
/// NP + JKG when mis-tagged as a single common noun.
const POSSESSIVE_PRONOUNS: &[&str] = &["나의", "너의", "우리의", "저의", "그의", "그녀의"];
/// Conjunctive adverbs mis-tagged MAJ that are retagged as general
/// adverbs (MAG).
const MAJ_TO_MAG: &[&str] = &["또한", "따라서", "그러므로"];
/// Fused clock expressions ("열시" = "ten o'clock") mapped to their
/// numeral/unit split: value is (number word, "시"). Used to split a
/// single NNG token into NR + NNB.
static TIME_WORDS: LazyLock<HashMap<&'static str, (&'static str, &'static str)>> =
    LazyLock::new(|| {
        HashMap::from([
            ("열시", ("열", "시")),
            ("세시", ("세", "시")),
            ("한시", ("한", "시")),
            ("두시", ("두", "시")),
            ("네시", ("네", "시")),
            ("다섯시", ("다섯", "시")),
            ("여섯시", ("여섯", "시")),
            ("일곱시", ("일곱", "시")),
            ("여덟시", ("여덟", "시")),
            ("아홉시", ("아홉", "시")),
        ])
    });
/// Post-processes a tagged token sequence with a fixed series of heuristic
/// correction passes for Korean particles, endings, and morpheme boundaries.
///
/// Each pass scans `tokens` and either retags, rewrites surfaces, merges,
/// splits, inserts, or deletes tokens. Passes run in a deliberate order —
/// later passes rely on tags produced by earlier ones — so do not reorder
/// them. Splits/removals collect indices first and apply them in reverse so
/// earlier indices stay valid.
///
/// NOTE(review): the position arithmetic below (`start + stem_len`, `end - 1`)
/// assumes `start_pos`/`end_pos` are measured in characters, not bytes —
/// consistent with the `chars().count()` usage here, but confirm against
/// `SejongToken`'s definition.
pub(super) fn apply_particle_and_ending_corrections(tokens: &mut Vec<SejongToken>) {
    // Pass 1: retag particles mis-labeled after nouns. A token following a
    // noun whose surface is a known particle gets its PARTICLE_MAP tag,
    // unless context favors the ending reading: a following EP/EF/EC token,
    // or an interrogative noun before it. "께서" is unambiguous and is
    // always retagged.
    let mut corrections: Vec<(usize, String)> = Vec::new();
    for i in 1..tokens.len() {
        let prev_surface = &tokens[i - 1].surface;
        let prev_pos = &tokens[i - 1].pos;
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        if NOUN_POSES.contains(&prev_pos.as_str())
            && (curr_pos == "EF"
                || curr_pos == "EC"
                || curr_pos == "ETN"
                || curr_pos == "EP"
                || curr_pos == "VV"
                || curr_pos == "VA"
                || curr_pos == "JKB"
                || curr_pos == "NNG")
        {
            let next_is_ep = i + 1 < tokens.len() && tokens[i + 1].pos == "EP";
            let next_is_ending =
                i + 1 < tokens.len() && (tokens[i + 1].pos == "EF" || tokens[i + 1].pos == "EC");
            let prev_is_interrogative = INTERROGATIVES.contains(&prev_surface.as_str());
            let is_definite_particle = curr_surface == "께서";
            if is_definite_particle || (!next_is_ep && !next_is_ending && !prev_is_interrogative) {
                if let Some(&correct_pos) = PARTICLE_MAP.get(curr_surface.as_str()) {
                    corrections.push((i, correct_pos.to_string()));
                }
            }
        }
    }
    for (idx, new_pos) in corrections {
        tokens[idx].pos = new_pos;
    }
    // Pass 2: after a verbal token (VV/VA/VX), surfaces like 는/ㄴ/은/ㄹ/을/던
    // tagged JX/EF/EC are actually adnominal endings — retag via ETM_MAP.
    let mut etm_corrections: Vec<(usize, String)> = Vec::new();
    for i in 1..tokens.len() {
        let prev_pos = &tokens[i - 1].pos;
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        if VERB_POSES.contains(&prev_pos.as_str())
            && (curr_pos == "JX" || curr_pos == "EF" || curr_pos == "EC")
        {
            if let Some(&correct_pos) = ETM_MAP.get(curr_surface.as_str()) {
                etm_corrections.push((i, correct_pos.to_string()));
            }
        }
    }
    for (idx, new_pos) in etm_corrections {
        tokens[idx].pos = new_pos;
    }
    // Pass 3: 하/해/했/되/됐 directly after a noun is the light-verb
    // derivational suffix, not a full verb — retag as XSV.
    let mut xsv_corrections: Vec<(usize, String)> = Vec::new();
    for i in 1..tokens.len() {
        let prev_pos = &tokens[i - 1].pos;
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        if XSV_TRIGGER_POSES.contains(&prev_pos.as_str())
            && (curr_pos == "VV" || curr_pos == "EF" || curr_pos == "VA")
            && XSV_PATTERNS.contains(&curr_surface.as_str())
        {
            xsv_corrections.push((i, "XSV".to_string()));
        }
    }
    for (idx, new_pos) in xsv_corrections {
        tokens[idx].pos = new_pos;
    }
    // Pass 4: restore the full connective ending from a bare "서" EC after a
    // verb stem: stems whose last syllable carries ㅏ/ㅗ take "아서",
    // otherwise "어서" (vowel-harmony choice, via extract_vowel).
    let mut ec_restorations: Vec<(usize, String)> = Vec::new();
    for i in 1..tokens.len() {
        let prev_surface = &tokens[i - 1].surface;
        let prev_pos = &tokens[i - 1].pos;
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        if (prev_pos == "VV" || prev_pos == "VA") && curr_surface == "서" && curr_pos == "EC" {
            if let Some(last_char) = prev_surface.chars().last() {
                let vowel = extract_vowel(last_char);
                let restored = if vowel == 'ㅏ' || vowel == 'ㅗ' {
                    "아서"
                } else {
                    "어서"
                };
                ec_restorations.push((i, restored.to_string()));
            }
        }
    }
    for (idx, new_surface) in ec_restorations {
        tokens[idx].surface = new_surface;
    }
    // Pass 5: same vowel-harmony restoration for a bare polite final "요" EF
    // after a verb stem: "아요" vs "어요".
    let mut ef_restorations: Vec<(usize, String)> = Vec::new();
    for i in 1..tokens.len() {
        let prev_surface = &tokens[i - 1].surface;
        let prev_pos = &tokens[i - 1].pos;
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        if (prev_pos == "VV" || prev_pos == "VA") && curr_surface == "요" && curr_pos == "EF" {
            if let Some(last_char) = prev_surface.chars().last() {
                let vowel = extract_vowel(last_char);
                let restored = if vowel == 'ㅏ' || vowel == 'ㅗ' {
                    "아요"
                } else {
                    "어요"
                };
                ef_restorations.push((i, restored.to_string()));
            }
        }
    }
    for (idx, new_surface) in ef_restorations {
        tokens[idx].surface = new_surface;
    }
    // Pass 6: if a verbal token ends in "면" and is followed by "서" EC, move
    // the "면" into the ending, producing stem + "면서" ("while ...-ing").
    // NOTE(review): trim_end_matches strips *all* trailing "면" occurrences
    // (strip_suffix would remove exactly one), and when the stem would become
    // empty the previous surface is kept as-is, which leaves "면" + "면서" —
    // confirm both edge cases are intended.
    let mut ec_merge_corrections: Vec<(usize, String, String)> = Vec::new();
    for i in 1..tokens.len() {
        let prev_surface = &tokens[i - 1].surface;
        let prev_pos = &tokens[i - 1].pos;
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        if (prev_pos == "XSV" || prev_pos == "VV" || prev_pos == "VA")
            && curr_surface == "서"
            && curr_pos == "EC"
        {
            if prev_surface.ends_with("면") {
                let new_prev = prev_surface.trim_end_matches("면").to_string();
                ec_merge_corrections.push((i - 1, new_prev, "면서".to_string()));
            }
        }
    }
    for (prev_idx, new_prev_surface, new_curr_surface) in ec_merge_corrections {
        if !new_prev_surface.is_empty() {
            tokens[prev_idx].surface = new_prev_surface;
        }
        tokens[prev_idx + 1].surface = new_curr_surface;
    }
    // Pass 7: fuse an over-split "합니" (VV) + "다" (EF) back into a single
    // "합니다" EF token; spans are merged and the second token removed
    // (reverse order keeps pending indices valid).
    let mut merge_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        let next_surface = &tokens[i + 1].surface;
        let next_pos = &tokens[i + 1].pos;
        if curr_surface == "합니" && curr_pos == "VV" && next_surface == "다" && next_pos == "EF"
        {
            merge_indices.push(i);
        }
    }
    for idx in merge_indices.into_iter().rev() {
        let merged = format!("{}{}", tokens[idx].surface, tokens[idx + 1].surface);
        tokens[idx].surface = merged;
        tokens[idx].pos = "EF".to_string();
        tokens[idx].end_pos = tokens[idx + 1].end_pos;
        tokens.remove(idx + 1);
    }
    // Pass 8: a sentence cannot end on a connective; if the last token is EC
    // with a typical sentence-final surface, promote it to EF.
    if let Some(last) = tokens.last_mut() {
        if last.pos == "EC" {
            if FINAL_ENDINGS.contains(&last.surface.as_str()) {
                last.pos = "EF".to_string();
            }
        }
    }
    // Pass 9: 하-family tokens tagged XSV that precede obligative "-야"
    // connectives or honorific imperative finals are actually main verbs —
    // retag as VV.
    let mut xsv_to_vv_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        let next_surface = &tokens[i + 1].surface;
        let next_pos = &tokens[i + 1].pos;
        if (curr_surface == "하" || curr_surface == "해" || curr_surface == "했")
            && curr_pos == "XSV"
            && next_pos == "EC"
            && (next_surface == "아야" || next_surface == "어야" || next_surface == "야")
        {
            xsv_to_vv_indices.push(i);
        }
        if curr_surface == "하"
            && curr_pos == "XSV"
            && next_pos == "EF"
            && (next_surface == "세요" || next_surface == "시오" || next_surface == "십시오")
        {
            xsv_to_vv_indices.push(i);
        }
    }
    for idx in xsv_to_vv_indices {
        tokens[idx].pos = "VV".to_string();
    }
    // Pass 10: a pronoun (NP) directly followed by 세요/에요/예요 is missing
    // the copula — insert a zero-width "이" VCP token before the ending and
    // promote an EC ending to EF.
    let mut vcp_insert_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let curr_pos = &tokens[i].pos;
        let next_surface = &tokens[i + 1].surface;
        let next_pos = &tokens[i + 1].pos;
        if curr_pos == "NP"
            && (next_pos == "EF" || next_pos == "EC")
            && (next_surface == "세요" || next_surface == "에요" || next_surface == "예요")
        {
            vcp_insert_indices.push(i + 1);
        }
    }
    for idx in vcp_insert_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        if tokens[idx].pos == "EC" {
            tokens[idx].pos = "EF".to_string();
        }
        // Inserted copula has an empty span (start..start) at the ending's
        // start — it has no surface presence in the input.
        tokens.insert(idx, SejongToken::new("이", "VCP", start, start));
    }
    // Pass 11: "지" before "않" (long-form negation "-지 않다") is the EC
    // connective, not a bound noun — retag NNB → EC.
    let mut nnb_to_ec_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        let next_surface = &tokens[i + 1].surface;
        let next_pos = &tokens[i + 1].pos;
        if curr_surface == "지" && curr_pos == "NNB" && next_surface == "않" && next_pos == "VX" {
            nnb_to_ec_indices.push(i);
        }
    }
    for idx in nnb_to_ec_indices {
        tokens[idx].pos = "EC".to_string();
    }
    // Pass 12: a known dictionary-form verb kept as one VV token is split
    // into stem (VV) + "다" (EF), with the span divided at the stem length.
    let mut verb_split_indices: Vec<usize> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos == "VV" && BASE_VERBS.contains(&token.surface.as_str()) {
            verb_split_indices.push(i);
        }
    }
    for idx in verb_split_indices.into_iter().rev() {
        let surface = &tokens[idx].surface;
        if let Some(stem) = surface.strip_suffix("다") {
            if !stem.is_empty() {
                let start = tokens[idx].start_pos;
                let end = tokens[idx].end_pos;
                let stem_len = stem.chars().count();
                tokens[idx] = SejongToken::new(stem, "VV", start, start + stem_len);
                tokens.insert(idx + 1, SejongToken::new("다", "EF", start + stem_len, end));
            }
        }
    }
    // Pass 13: "하" XSV immediately followed by plain final "다" reads as the
    // main verb 하다 — retag XSV → VV.
    let mut xsv_da_to_vv_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        let next_surface = &tokens[i + 1].surface;
        let next_pos = &tokens[i + 1].pos;
        if curr_surface == "하" && curr_pos == "XSV" && next_surface == "다" && next_pos == "EF" {
            xsv_da_to_vv_indices.push(i);
        }
    }
    for idx in xsv_da_to_vv_indices {
        tokens[idx].pos = "VV".to_string();
    }
    // Pass 14: a multi-character NNG ending in the nominalizer "기" before
    // 전/위해/시작 ("before V-ing" / "in order to V" / "start V-ing") is a
    // nominalized verb — split into stem (VV) + "기" (ETN).
    let mut gi_split_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        let next_surface = &tokens[i + 1].surface;
        if curr_pos == "NNG"
            && curr_surface.ends_with("기")
            && curr_surface.chars().count() >= 2
            && (next_surface == "전" || next_surface == "위해" || next_surface == "시작")
        {
            gi_split_indices.push(i);
        }
    }
    for idx in gi_split_indices.into_iter().rev() {
        let surface = &tokens[idx].surface;
        if let Some(stem) = surface.strip_suffix("기") {
            if !stem.is_empty() {
                let start = tokens[idx].start_pos;
                let end = tokens[idx].end_pos;
                let stem_len = stem.chars().count();
                tokens[idx] = SejongToken::new(stem, "VV", start, start + stem_len);
                tokens.insert(
                    idx + 1,
                    SejongToken::new("기", "ETN", start + stem_len, end),
                );
            }
        }
    }
    // Pass 15: drop a spurious "는" JX when followed by the plural suffix
    // "들" XSN and subject marker "이" JKS (pattern "...는들이").
    let mut jx_delete_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        let next_surface = &tokens[i + 1].surface;
        let next_pos = &tokens[i + 1].pos;
        let next2_surface = &tokens[i + 2].surface;
        let next2_pos = &tokens[i + 2].pos;
        if curr_surface == "는"
            && curr_pos == "JX"
            && next_surface == "들"
            && next_pos == "XSN"
            && next2_surface == "이"
            && next2_pos == "JKS"
        {
            jx_delete_indices.push(i);
        }
    }
    for idx in jx_delete_indices.into_iter().rev() {
        tokens.remove(idx);
    }
    // Pass 16: fused possessive forms ("나의", "그녀의", ...) tagged as one
    // NNG are split into pronoun (NP) + genitive "의" (JKG).
    let mut possessive_split_indices: Vec<usize> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos == "NNG" && POSSESSIVE_PRONOUNS.contains(&token.surface.as_str()) {
            possessive_split_indices.push(i);
        }
    }
    for idx in possessive_split_indices.into_iter().rev() {
        let surface = &tokens[idx].surface;
        if let Some(stem) = surface.strip_suffix("의") {
            if !stem.is_empty() {
                let start = tokens[idx].start_pos;
                let end = tokens[idx].end_pos;
                let stem_len = stem.chars().count();
                tokens[idx] = SejongToken::new(stem, "NP", start, start + stem_len);
                tokens.insert(
                    idx + 1,
                    SejongToken::new("의", "JKG", start + stem_len, end),
                );
            }
        }
    }
    // Pass 17: an NNG that *starts* with "의" right after a pronoun absorbed
    // the genitive marker — split off "의" (JKG) and keep the remainder as
    // the NNG.
    let mut genitive_split_indices: Vec<usize> = Vec::new();
    for i in 1..tokens.len() {
        let prev_pos = &tokens[i - 1].pos;
        let curr_surface = &tokens[i].surface;
        let curr_pos = &tokens[i].pos;
        if prev_pos == "NP"
            && curr_pos == "NNG"
            && curr_surface.starts_with("의")
            && curr_surface.chars().count() >= 2
        {
            genitive_split_indices.push(i);
        }
    }
    for idx in genitive_split_indices.into_iter().rev() {
        let surface = tokens[idx].surface.clone();
        if let Some(rest) = surface.strip_prefix("의") {
            if !rest.is_empty() {
                let start = tokens[idx].start_pos;
                let end = tokens[idx].end_pos;
                let rest_owned = rest.to_string();
                // "의" is one character, hence the +1 span split.
                tokens[idx] = SejongToken::new("의", "JKG", start, start + 1);
                tokens.insert(
                    idx + 1,
                    SejongToken::new(&rest_owned, "NNG", start + 1, end),
                );
            }
        }
    }
    // Pass 18: honorific re-bracketing — NNG followed by a "님의" token
    // becomes "X님" (NNG) plus a separate genitive "의" (JKG). The pair is
    // rewritten in place; the boundary is placed one character before the
    // second token's end ("의" being a single character).
    let mut honorific_merge_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let curr_pos = &tokens[i].pos;
        let next_surface = &tokens[i + 1].surface;
        let next_pos = &tokens[i + 1].pos;
        if curr_pos == "NNG" && next_surface == "님의" && (next_pos == "NNP" || next_pos == "NNG")
        {
            honorific_merge_indices.push(i);
        }
    }
    for idx in honorific_merge_indices.into_iter().rev() {
        let merged = format!("{}님", tokens[idx].surface);
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new(&merged, "NNG", start, end - 1);
        tokens[idx + 1] = SejongToken::new("의", "JKG", end - 1, end);
    }
    // Pass 19: certain conjunctive adverbs tagged MAJ are retagged as
    // general adverbs MAG.
    for token in tokens.iter_mut() {
        if token.pos == "MAJ" && MAJ_TO_MAG.contains(&token.surface.as_str()) {
            token.pos = "MAG".to_string();
        }
    }
    // Pass 20: "이" after a noun tagged EP (pre-final ending) is actually the
    // copula — retag as VCP.
    for i in 1..tokens.len() {
        let prev_pos = &tokens[i - 1].pos;
        let curr_pos = &tokens[i].pos;
        let curr_surface = &tokens[i].surface;
        if (prev_pos == "NNG" || prev_pos == "NNP" || prev_pos == "NP")
            && curr_pos == "EP"
            && curr_surface == "이"
        {
            tokens[i].pos = "VCP".to_string();
        }
    }
    // Pass 21: fused clock expressions ("열시") tagged NNG are split into
    // numeral (NR) + bound noun "시" (NNB) using the TIME_WORDS table.
    let mut time_split_indices: Vec<(usize, String, String)> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos == "NNG" {
            if let Some(&(num, unit)) = TIME_WORDS.get(token.surface.as_str()) {
                time_split_indices.push((i, num.to_string(), unit.to_string()));
            }
        }
    }
    for (idx, num, unit) in time_split_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        let mid = start + num.chars().count();
        tokens[idx] = SejongToken::new(&num, "NR", start, mid);
        tokens.insert(idx + 1, SejongToken::new(&unit, "NNB", mid, end));
    }
    // Pass 22: "그렇다면" tagged MAJ is decomposed into "그렇" (VA, 2 chars)
    // + "다면" (EC, conditional connective).
    let mut maj_split_indices: Vec<usize> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos == "MAJ" && token.surface == "그렇다면" {
            maj_split_indices.push(i);
        }
    }
    for idx in maj_split_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        tokens[idx] = SejongToken::new("그렇", "VA", start, start + 2);
        tokens.insert(idx + 1, SejongToken::new("다면", "EC", start + 2, end));
    }
}