use crate::sejong::hangul::has_jongseong;
use crate::sejong::types::SejongToken;
/// Applies a fixed sequence of post-analysis corrections to a Sejong-tagged
/// token stream: merging over-split suffixes, splitting contracted endings,
/// and retagging tokens whose POS the analyzer commonly mislabels.
///
/// The passes run in a deliberate order — several later passes match surfaces
/// or tags that earlier passes produce or consume (e.g. the adnominal split in
/// [`split_adnominal_contractions`] shares surfaces with
/// [`split_vv_etm_contractions`]). Do not reorder the calls.
pub(super) fn apply_suffix_and_dependency_corrections(tokens: &mut Vec<SejongToken>) {
    merge_jeok_suffix(tokens);
    retag_genitive_ui(tokens);
    split_sieot(tokens);
    split_gesseumnida(tokens);
    merge_hamnida(tokens);
    retag_jung_as_bound_noun(tokens);
    retag_ji_as_bound_noun(tokens);
    retag_counters_after_numeral(tokens);
    retag_geot_as_bound_noun(tokens);
    split_adnominal_contractions(tokens);
    restore_rieul_stems_before_seyo(tokens);
    split_derived_nouns(tokens);
    split_vv_etm_contractions(tokens);
    split_xsv_contractions(tokens);
    retag_i_as_nominalizer(tokens);
    merge_seyo(tokens);
    normalize_a_to_eo(tokens);
    retag_eo_as_connective(tokens);
    retag_i_as_subject_particle(tokens);
    retag_e_as_adverbial_particle(tokens);
    split_copula_ya(tokens);
    retag_ine_exclamative(tokens);
    split_purpose_reo(tokens);
}

/// Merges "NNG + 적/XSN" into a single NNG token (e.g. 문화 + 적 → 문화적).
fn merge_jeok_suffix(tokens: &mut Vec<SejongToken>) {
    let mut merge_at: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].pos == "NNG" && tokens[i + 1].pos == "XSN" && tokens[i + 1].surface == "적" {
            merge_at.push(i);
        }
    }
    // Process right-to-left so earlier indices stay valid across removals.
    for idx in merge_at.into_iter().rev() {
        let merged = format!("{}적", tokens[idx].surface);
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new(&merged, "NNG", start, end);
        tokens.remove(idx + 1);
    }
}

/// Retags 의 after a nominal from adverbial particle (JKB) to genitive (JKG).
fn retag_genitive_ui(tokens: &mut [SejongToken]) {
    for i in 0..tokens.len().saturating_sub(1) {
        let prev_pos = &tokens[i].pos;
        if (prev_pos == "NNG" || prev_pos == "NNP" || prev_pos == "NP" || prev_pos == "XSN")
            && tokens[i + 1].pos == "JKB"
            && tokens[i + 1].surface == "의"
        {
            tokens[i + 1].pos = "JKG".to_string();
        }
    }
}

/// Splits the fused pre-final ending 시었/EP into 시/EP + 었/EP.
fn split_sieot(tokens: &mut Vec<SejongToken>) {
    let mut split_at: Vec<usize> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos == "EP" && token.surface == "시었" {
            split_at.push(i);
        }
    }
    for idx in split_at.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        tokens[idx] = SejongToken::new("시", "EP", start, start + 1);
        tokens.insert(idx + 1, SejongToken::new("었", "EP", start + 1, end));
    }
}

/// Splits the fused 겠습니다/EP+EF into 겠/EP + 습니다/EF.
fn split_gesseumnida(tokens: &mut Vec<SejongToken>) {
    let mut split_at: Vec<usize> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos == "EP+EF" && token.surface == "겠습니다" {
            split_at.push(i);
        }
    }
    for idx in split_at.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        tokens[idx] = SejongToken::new("겠", "EP", start, start + 1);
        tokens.insert(idx + 1, SejongToken::new("습니다", "EF", start + 1, end));
    }
}

/// Merges 하/XSV + ㅂ니다/EF (when preceded by an EC) into 합니다/EF.
fn merge_hamnida(tokens: &mut Vec<SejongToken>) {
    let mut merge_at: Vec<usize> = Vec::new();
    for i in 1..tokens.len().saturating_sub(1) {
        if tokens[i - 1].pos == "EC"
            && tokens[i].surface == "하"
            && tokens[i].pos == "XSV"
            && tokens[i + 1].surface == "ㅂ니다"
            && tokens[i + 1].pos == "EF"
        {
            merge_at.push(i);
        }
    }
    for idx in merge_at.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new("합니다", "EF", start, end);
        tokens.remove(idx + 1);
    }
}

/// Retags 중/NNG after a noun as a bound noun (NNB) when a particle or
/// ending follows — the "in the middle of ..." construction.
fn retag_jung_as_bound_noun(tokens: &mut [SejongToken]) {
    for i in 1..tokens.len() {
        let is_candidate = tokens[i].surface == "중"
            && tokens[i].pos == "NNG"
            && (tokens[i - 1].pos == "NNG" || tokens[i - 1].pos == "NNP");
        if !is_candidate {
            continue;
        }
        let followed_by_support = match tokens.get(i + 1) {
            Some(next) => {
                next.pos == "VCP" || next.pos == "JX" || next.pos == "JKS" || next.pos == "EF"
            }
            None => false,
        };
        if followed_by_support {
            tokens[i].pos = "NNB".to_string();
        }
    }
}

/// Retags 지 after an adnominal ending (ETM) as the bound noun NNB
/// (the "time since ..." reading), overriding a VX/EC analysis.
fn retag_ji_as_bound_noun(tokens: &mut [SejongToken]) {
    for i in 1..tokens.len() {
        if tokens[i].surface == "지"
            && (tokens[i].pos == "VX" || tokens[i].pos == "EC")
            && tokens[i - 1].pos == "ETM"
        {
            tokens[i].pos = "NNB".to_string();
        }
    }
}

/// Retags unit/counter nouns (분, 시, 원, …) after a numeral (NR) as NNB.
fn retag_counters_after_numeral(tokens: &mut [SejongToken]) {
    const UNIT_NOUNS: [&str; 9] = ["분", "시", "원", "년", "월", "일", "개", "명", "번"];
    for i in 1..tokens.len() {
        if UNIT_NOUNS.contains(&tokens[i].surface.as_str())
            && (tokens[i].pos == "XSN" || tokens[i].pos == "NNG")
            && tokens[i - 1].pos == "NR"
        {
            tokens[i].pos = "NNB".to_string();
        }
    }
}

/// Retags 것/NP as the bound noun NNB in three contexts, checked in order:
/// a supporting particle/copula follows, an adnominal ending precedes,
/// or the token is sentence-final.
fn retag_geot_as_bound_noun(tokens: &mut [SejongToken]) {
    for i in 0..tokens.len() {
        if !(tokens[i].surface == "것" && tokens[i].pos == "NP") {
            continue;
        }
        let next_supports = match tokens.get(i + 1) {
            Some(next) => {
                next.pos == "VCP"
                    || next.pos == "JKS"
                    || next.pos == "JX"
                    || next.pos == "JKO"
                    || next.pos == "NNB"
            }
            None => false,
        };
        if next_supports {
            tokens[i].pos = "NNB".to_string();
        }
        if tokens[i].pos == "NP" && i > 0 && tokens[i - 1].pos == "ETM" {
            tokens[i].pos = "NNB".to_string();
        }
        // Last token in the stream (also covers a single-token stream).
        if tokens[i].pos == "NP" && i == tokens.len() - 1 {
            tokens[i].pos = "NNB".to_string();
        }
    }
}

/// Splits contracted adnominal verb forms (간/온/한/갈/올/할 tagged VV)
/// before a noun into stem/VV + ㄴ·ㄹ/ETM.
fn split_adnominal_contractions(tokens: &mut Vec<SejongToken>) {
    let mut splits: Vec<(usize, &'static str, &'static str)> = Vec::new();
    for i in 0..tokens.len() {
        if tokens[i].pos != "VV" {
            continue;
        }
        let next_is_noun = match tokens.get(i + 1) {
            Some(next) => {
                next.pos == "NNG" || next.pos == "NNP" || next.pos == "NNB" || next.pos == "NP"
            }
            None => false,
        };
        if !next_is_noun {
            continue;
        }
        // (restored stem, adnominal ending) for each contracted surface.
        let pattern = match tokens[i].surface.as_str() {
            "간" => Some(("가", "ㄴ")),
            "온" => Some(("오", "ㄴ")),
            "한" => Some(("하", "ㄴ")),
            "갈" => Some(("가", "ㄹ")),
            "올" => Some(("오", "ㄹ")),
            "할" => Some(("하", "ㄹ")),
            _ => None,
        };
        if let Some((stem, ending)) = pattern {
            splits.push((i, stem, ending));
        }
    }
    for (idx, stem, ending) in splits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        // The stem keeps the full original span; the ETM token is deliberately
        // zero-width at `end` (the ending has no surface extent of its own).
        tokens[idx] = SejongToken::new(stem, "VV", start, end);
        tokens.insert(idx + 1, SejongToken::new(ending, "ETM", end, end));
    }
}

/// Restores ㄹ-dropped verb stems before 세요/EF (드→들, 아→알).
fn restore_rieul_stems_before_seyo(tokens: &mut [SejongToken]) {
    const RIEUL_VERBS: [(&str, &str); 2] = [("드", "들"), ("아", "알")];
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].pos == "VV" && tokens[i + 1].surface == "세요" && tokens[i + 1].pos == "EF" {
            for (dropped, original) in RIEUL_VERBS {
                if tokens[i].surface == dropped {
                    tokens[i].surface = original.to_string();
                    break;
                }
            }
        }
    }
}

/// Splits lexicalized deverbal nouns (웃음, 놀이, 잠, …) tagged NNG into
/// verb stem/VV + nominalizing suffix/ETN.
fn split_derived_nouns(tokens: &mut Vec<SejongToken>) {
    let mut splits: Vec<(usize, &'static str, &'static str)> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos != "NNG" {
            continue;
        }
        // (verb stem, nominalizer). 걸음 maps to the irregular stem 걷.
        let pattern = match token.surface.as_str() {
            "웃음" => Some(("웃", "음")),
            "울음" => Some(("울", "음")),
            "걸음" => Some(("걷", "음")),
            "놀이" => Some(("놀", "이")),
            "먹이" => Some(("먹", "이")),
            "잠" => Some(("자", "ㅁ")),
            "꿈" => Some(("꾸", "ㅁ")),
            _ => None,
        };
        if let Some((stem, suffix)) = pattern {
            splits.push((i, stem, suffix));
        }
    }
    for (idx, stem, suffix) in splits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        let stem_len = stem.chars().count();
        tokens[idx] = SejongToken::new(stem, "VV", start, start + stem_len);
        tokens.insert(
            idx + 1,
            SejongToken::new(suffix, "ETN", start + stem_len, end),
        );
    }
}

/// Splits single-syllable contracted VV/VA forms (간, 본, 쓴, 살, …) into
/// stem + ㄴ·ㄹ/ETM when a noun, verb, determiner, or adverb follows
/// (or the token is stream-final).
fn split_vv_etm_contractions(tokens: &mut Vec<SejongToken>) {
    let mut splits: Vec<(usize, &'static str, &'static str)> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if !((token.pos == "VV" || token.pos == "VA") && token.surface.chars().count() == 1) {
            continue;
        }
        // 살/알 keep their full stems (ㄹ-final verbs): the ending is zero-width.
        let pattern = match token.surface.as_str() {
            "간" => Some(("가", "ㄴ")),
            "온" => Some(("오", "ㄴ")),
            "한" => Some(("하", "ㄴ")),
            "본" => Some(("보", "ㄴ")),
            "잔" => Some(("자", "ㄴ")),
            "산" => Some(("사", "ㄴ")),
            "된" => Some(("되", "ㄴ")),
            "쓴" => Some(("쓰", "ㄴ")),
            "갈" => Some(("가", "ㄹ")),
            "올" => Some(("오", "ㄹ")),
            "할" => Some(("하", "ㄹ")),
            "볼" => Some(("보", "ㄹ")),
            "살" => Some(("살", "ㄹ")),
            "알" => Some(("알", "ㄹ")),
            "될" => Some(("되", "ㄹ")),
            _ => None,
        };
        let (stem, etm) = match pattern {
            Some(pair) => pair,
            None => None?,
        };
        let should_split = match tokens.get(i + 1) {
            Some(next) => {
                next.pos.starts_with("NN")
                    || next.pos == "NP"
                    || next.pos == "VV"
                    || next.pos == "VA"
                    || next.pos == "MM"
                    || next.pos == "MAG"
            }
            // Stream-final contracted forms are always split.
            None => true,
        };
        if should_split {
            splits.push((i, stem, etm));
        }
    }
    for (idx, stem, etm) in splits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        let stem_len = stem.chars().count();
        // NOTE(review): the stem is always retagged VV even when the source
        // token was VA (e.g. 쓴 "bitter") — confirm this is intended.
        tokens[idx] = SejongToken::new(stem, "VV", start, start + stem_len);
        tokens.insert(
            idx + 1,
            SejongToken::new(etm, "ETM", start + stem_len, end),
        );
    }
}

/// Splits fused XSV forms like 되었다/하였으면 into stem/XSV + 었/EP +
/// final or connective ending. 였 is normalized to 었 in the output.
fn split_xsv_contractions(tokens: &mut Vec<SejongToken>) {
    let mut splits: Vec<(usize, &'static str, &'static str, &'static str)> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos != "XSV" {
            continue;
        }
        let pattern = match token.surface.as_str() {
            "되었다" => Some(("되", "었", "다")),
            "하였다" => Some(("하", "었", "다")),
            "되었어" => Some(("되", "었", "어")),
            "하였어" => Some(("하", "었", "어")),
            "되었으면" => Some(("되", "었", "으면")),
            "하였으면" => Some(("하", "었", "으면")),
            _ => None,
        };
        if let Some((stem, ep, ending)) = pattern {
            splits.push((i, stem, ep, ending));
        }
    }
    for (idx, stem, ep, ending) in splits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        let stem_len = stem.chars().count();
        let ep_len = ep.chars().count();
        tokens[idx] = SejongToken::new(stem, "XSV", start, start + stem_len);
        tokens.insert(
            idx + 1,
            SejongToken::new(ep, "EP", start + stem_len, start + stem_len + ep_len),
        );
        // 다/어 terminate the sentence (EF); 으면 connects clauses (EC).
        let ending_pos = if ending == "다" || ending == "어" {
            "EF"
        } else {
            "EC"
        };
        tokens.insert(
            idx + 2,
            SejongToken::new(ending, ending_pos, start + stem_len + ep_len, end),
        );
    }
}

/// Retags 이/MM after specific verb/adjective stems as the nominalizer ETN.
fn retag_i_as_nominalizer(tokens: &mut [SejongToken]) {
    const ETN_TRIGGERS: [&str; 9] = ["먹", "놀", "알", "살", "높", "낮", "깊", "넓", "짧"];
    for i in 1..tokens.len() {
        if tokens[i].surface == "이" && tokens[i].pos == "MM" && tokens[i - 1].pos == "VV" {
            let prev_surface = &tokens[i - 1].surface;
            if ETN_TRIGGERS.iter().any(|&s| prev_surface == s) {
                tokens[i].pos = "ETN".to_string();
            }
        }
    }
}

/// Merges 시 + 어요 after a copula (VCP) into the polite ending 세요/EF.
fn merge_seyo(tokens: &mut Vec<SejongToken>) {
    let mut merge_at: Vec<usize> = Vec::new();
    for i in 2..tokens.len() {
        if tokens[i - 2].pos == "VCP"
            && tokens[i - 1].surface == "시"
            && (tokens[i - 1].pos == "NNB" || tokens[i - 1].pos == "EP")
            && tokens[i].surface == "어요"
            && tokens[i].pos == "EF"
        {
            merge_at.push(i - 1);
        }
    }
    for idx in merge_at.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new("세요", "EF", start, end);
        tokens.remove(idx + 1);
    }
}

/// Normalizes the connective ending 아/EC to its canonical form 어.
fn normalize_a_to_eo(tokens: &mut [SejongToken]) {
    for token in tokens.iter_mut() {
        if token.pos == "EC" && token.surface == "아" {
            token.surface = "어".to_string();
        }
    }
}

/// Retags 어/IC after a verb, adjective, or verbal suffix as the
/// connective ending EC.
fn retag_eo_as_connective(tokens: &mut [SejongToken]) {
    for i in 1..tokens.len() {
        if tokens[i].surface == "어"
            && tokens[i].pos == "IC"
            && (tokens[i - 1].pos == "VV"
                || tokens[i - 1].pos == "VA"
                || tokens[i - 1].pos == "XSV")
        {
            tokens[i].pos = "EC".to_string();
        }
    }
}

/// Retags 이/MM after a jongseong-final nominal as the subject particle JKS.
fn retag_i_as_subject_particle(tokens: &mut [SejongToken]) {
    for i in 1..tokens.len() {
        let is_candidate = tokens[i].surface == "이"
            && tokens[i].pos == "MM"
            && (tokens[i - 1].pos == "NNG"
                || tokens[i - 1].pos == "NNP"
                || tokens[i - 1].pos == "NNB"
                || tokens[i - 1].pos == "NP"
                || tokens[i - 1].pos == "XSN");
        if !is_candidate {
            continue;
        }
        // 이 is only a subject particle after a closed (jongseong) syllable.
        if let Some(last_char) = tokens[i - 1].surface.chars().last() {
            if has_jongseong(last_char) {
                tokens[i].pos = "JKS".to_string();
            }
        }
    }
}

/// Retags 에/IC after a noun as the adverbial particle JKB.
fn retag_e_as_adverbial_particle(tokens: &mut [SejongToken]) {
    for i in 1..tokens.len() {
        if tokens[i].surface == "에"
            && tokens[i].pos == "IC"
            && (tokens[i - 1].pos == "NNG"
                || tokens[i - 1].pos == "NNP"
                || tokens[i - 1].pos == "NNB")
        {
            tokens[i].pos = "JKB".to_string();
        }
    }
}

/// Splits 야/IC after an open-syllable pronoun into the contracted copula
/// 이/VCP (zero-width) plus 야/EF.
fn split_copula_ya(tokens: &mut Vec<SejongToken>) {
    let mut split_at: Vec<usize> = Vec::new();
    for i in 1..tokens.len() {
        if tokens[i].surface == "야" && tokens[i].pos == "IC" && tokens[i - 1].pos == "NP" {
            if let Some(last_char) = tokens[i - 1].surface.chars().last() {
                if !has_jongseong(last_char) {
                    split_at.push(i);
                }
            }
        }
    }
    for idx in split_at.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        // The copula has no surface of its own: zero-width span at `start`.
        tokens[idx] = SejongToken::new("이", "VCP", start, start);
        tokens.insert(idx + 1, SejongToken::new("야", "EF", start, end));
    }
}

/// Retags NNG + 이/NP + 네/XSN as copula (VCP) + exclamative ending (EF).
fn retag_ine_exclamative(tokens: &mut [SejongToken]) {
    for i in 1..tokens.len().saturating_sub(1) {
        if tokens[i].surface == "이"
            && tokens[i].pos == "NP"
            && tokens[i - 1].pos == "NNG"
            && tokens[i + 1].surface == "네"
            && tokens[i + 1].pos == "XSN"
        {
            tokens[i].pos = "VCP".to_string();
            tokens[i + 1].pos = "EF".to_string();
        }
    }
}

/// Splits a 러-final token (mistagged JKB/NNP) before a verb into
/// stem/VV + 러/EC — the purposive "-(으)러" construction.
fn split_purpose_reo(tokens: &mut Vec<SejongToken>) {
    let mut splits: Vec<(usize, String)> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let surface = &tokens[i].surface;
        let pos = &tokens[i].pos;
        let next_pos = &tokens[i + 1].pos;
        if surface.ends_with("러")
            && surface.chars().count() >= 2
            && (pos == "JKB" || pos == "NNP")
            && (next_pos == "VV+EF" || next_pos == "VV")
        {
            // Everything before the final 러 syllable is the verb stem.
            let stem: String = surface.chars().take(surface.chars().count() - 1).collect();
            splits.push((i, stem));
        }
    }
    for (idx, stem) in splits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        let stem_len = stem.chars().count();
        tokens[idx] = SejongToken::new(&stem, "VV", start, start + stem_len);
        tokens.insert(idx + 1, SejongToken::new("러", "EC", start + stem_len, end));
    }
}