use crate::sejong::types::SejongToken;
/// Applies a sequence of heuristic corrections to sentence-final endings and
/// related POS mis-taggings in a Sejong-tagged token stream.
///
/// The passes run in a fixed order because later passes observe the merges,
/// removals and retags performed by earlier ones. All passes mutate `tokens`
/// in place; index-based removals are always applied back-to-front so indices
/// collected during the scan remain valid.
pub(super) fn apply_sentence_final_endings_corrections(tokens: &mut Vec<SejongToken>) {
    // Pass 1: drop a spurious honorific "시/EP" wedged between a copula (VCP)
    // and the quotative ending "라고/EC".
    let mut remove_si_ep_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].pos == "VCP"
            && tokens[i + 1].surface == "시"
            && tokens[i + 1].pos == "EP"
            && tokens[i + 2].surface == "라고"
            && tokens[i + 2].pos == "EC"
        {
            remove_si_ep_indices.push(i + 1);
        }
    }
    for idx in remove_si_ep_indices.into_iter().rev() {
        tokens.remove(idx);
    }

    // Pass 2: "ㄴ/는(ETM) + 다(NNG)" directly before a verb/adjective is the
    // declarative ending "~ㄴ다/는다"; merge the pair into one EF token.
    // The loop bound already guarantees `i + 2` is in range, so the original
    // extra bounds check was redundant and is dropped.
    let mut nda_merge_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        if (tokens[i].surface == "ㄴ" || tokens[i].surface == "는")
            && tokens[i].pos == "ETM"
            && tokens[i + 1].surface == "다"
            && tokens[i + 1].pos == "NNG"
            && (tokens[i + 2].pos == "VV" || tokens[i + 2].pos == "VA")
        {
            nda_merge_indices.push(i);
        }
    }
    for idx in nda_merge_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        let merged_surface = format!("{}다", tokens[idx].surface);
        tokens[idx] = SejongToken::new(&merged_surface, "EF", start, end);
        tokens.remove(idx + 1);
    }

    // (The original built an `xpn_prefixes` set and scanned for VA/ETM +
    // NNG pairs in a loop with an EMPTY body — dead code with no effect;
    // removed.)

    // Pass 3: "크/작(VA) + ㄴ(ETM)" before a noun is the determiner form
    // "큰/작은"; merge stem + ending into a single XPN token.
    let mut xpn_merge_indices: Vec<(usize, String)> = Vec::new();
    let xpn_stem_map: std::collections::HashMap<&str, &str> =
        [("크", "큰"), ("작", "작은")].into_iter().collect();
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].pos == "VA"
            && tokens[i + 1].surface == "ㄴ"
            && tokens[i + 1].pos == "ETM"
            && tokens[i + 2].pos == "NNG"
        {
            if let Some(merged) = xpn_stem_map.get(tokens[i].surface.as_str()) {
                xpn_merge_indices.push((i, (*merged).to_string()));
            }
        }
    }
    for (idx, merged) in xpn_merge_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new(&merged, "XPN", start, end);
        tokens.remove(idx + 1);
    }

    // Pass 4: retag a few spurious verb/noun analyses as dependent nouns
    // (NNB), merging the trailing particle/ending into the surface.
    let mut nnb_fix_indices: Vec<(usize, &str)> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        // "채(VV) + 어(EC)" -> dependent noun "채".
        if tokens[i].surface == "채"
            && tokens[i].pos == "VV"
            && tokens[i + 1].surface == "어"
            && tokens[i + 1].pos == "EC"
        {
            nnb_fix_indices.push((i, "채"));
        }
        // "대(NNG) + 로(JKB)" -> dependent noun "대로".
        if tokens[i].surface == "대"
            && tokens[i].pos == "NNG"
            && tokens[i + 1].surface == "로"
            && tokens[i + 1].pos == "JKB"
        {
            nnb_fix_indices.push((i, "대로"));
        }
        // "따르(VV) + 어(EC)" right after an adnominal ending (ETM) -> "따라".
        if tokens[i].surface == "따르"
            && tokens[i].pos == "VV"
            && tokens[i + 1].surface == "어"
            && tokens[i + 1].pos == "EC"
            && i > 0
            && tokens[i - 1].pos == "ETM"
        {
            nnb_fix_indices.push((i, "따라"));
        }
    }
    for (idx, surface) in nnb_fix_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new(surface, "NNB", start, end);
        tokens.remove(idx + 1);
    }

    // Pass 5: "다/NNG" is really the declarative ending "다/EF" when it closes
    // the sentence, or directly follows a verbal element (VV/VA/VX), a
    // verb-deriving suffix / pre-final ending (XSV/EP), or the copula (VCP).
    // (The original ran three separate loops over disjoint predecessor sets;
    // since none of them ever produces a pos that another matches, a single
    // combined loop is behaviorally identical.)
    if let Some(last) = tokens.last_mut() {
        if last.surface == "다" && last.pos == "NNG" {
            last.pos = "EF".to_string();
        }
    }
    for i in 1..tokens.len() {
        if tokens[i].surface == "다"
            && tokens[i].pos == "NNG"
            && matches!(
                tokens[i - 1].pos.as_str(),
                "VV" | "VA" | "VX" | "XSV" | "EP" | "VCP"
            )
        {
            tokens[i].pos = "EF".to_string();
        }
    }

    // Pass 6: interrogative words mis-tagged as common nouns are pronouns.
    let question_pronouns = [
        "얼마",
        "뭐",
        "무엇",
        "누구",
        "어디",
        "언제",
        "어느",
        "왜",
        "어떻게",
    ];
    for token in tokens.iter_mut() {
        if token.pos == "NNG" && question_pronouns.contains(&token.surface.as_str()) {
            token.pos = "NP".to_string();
        }
    }

    // Pass 7: normalize jamo-decomposed past-tense EP surfaces to syllables.
    for token in tokens.iter_mut() {
        if token.pos == "EP" {
            if token.surface == "ㅓㅆ" {
                token.surface = "었".to_string();
            } else if token.surface == "ㅏㅆ" {
                token.surface = "았".to_string();
            }
        }
    }

    // Pass 8: noun + 있/없 compounds ("재미있-", "맛없-", …) act as single
    // adjectives; merge the pair into one VA token.
    let compound_va_nouns = [
        "재미", "맛", "멋", "값", "뜻", "힘", "흥미", "의미", "가치", "효과", "보람", "관심",
        "정", "맥", "볼",
    ];
    let mut va_merge_indices: Vec<(usize, String)> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        if compound_va_nouns.contains(&tokens[i].surface.as_str())
            && tokens[i].pos == "NNG"
            && (tokens[i + 1].surface == "있" || tokens[i + 1].surface == "없")
            && (tokens[i + 1].pos == "VV" || tokens[i + 1].pos == "VX")
        {
            let merged = format!("{}{}", tokens[i].surface, tokens[i + 1].surface);
            va_merge_indices.push((i, merged));
        }
    }
    for (idx, merged) in va_merge_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new(&merged, "VA", start, end);
        tokens.remove(idx + 1);
    }

    // Pass 9: drop a spurious "ㄹ/ETM" sandwiched between an adjective and a
    // sentence-final ending.
    let mut spurious_etm_indices: Vec<usize> = Vec::new();
    for i in 1..tokens.len().saturating_sub(1) {
        if tokens[i].surface == "ㄹ"
            && tokens[i].pos == "ETM"
            && tokens[i - 1].pos == "VA"
            && tokens[i + 1].pos == "EF"
        {
            spurious_etm_indices.push(i);
        }
    }
    for idx in spurious_etm_indices.into_iter().rev() {
        tokens.remove(idx);
    }

    // Pass 10: verb + "어디(NP)" + "서(JKB)" is a misparse of the connective
    // "어서/EC"; merge the two trailing tokens into it.
    let mut eoseo_fix_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].pos == "VV"
            && tokens[i + 1].surface == "어디"
            && tokens[i + 1].pos == "NP"
            && tokens[i + 2].surface == "서"
            && tokens[i + 2].pos == "JKB"
        {
            eoseo_fix_indices.push(i + 1);
        }
    }
    for idx in eoseo_fix_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new("어서", "EC", start, end);
        tokens.remove(idx + 1);
    }

    // Pass 11: a sentence-final bare vowel connective (ㅏ/아, ㅓ/어 as EC/IC)
    // is actually the informal final ending; normalize surface and retag EF.
    if let Some(last) = tokens.last_mut() {
        if last.pos == "EC" || last.pos == "IC" {
            if last.surface == "ㅏ" || last.surface == "아" {
                last.surface = "아".to_string();
                last.pos = "EF".to_string();
            } else if last.surface == "ㅓ" || last.surface == "어" {
                last.surface = "어".to_string();
                last.pos = "EF".to_string();
            }
        }
    }

    // Pass 12: sentence-final "네/IC" after a verbal stem is the ending "네/EF".
    if tokens.len() >= 2 {
        let last_idx = tokens.len() - 1;
        if tokens[last_idx].surface == "네"
            && tokens[last_idx].pos == "IC"
            && (tokens[last_idx - 1].pos == "VV"
                || tokens[last_idx - 1].pos == "VA"
                || tokens[last_idx - 1].pos == "VX")
        {
            tokens[last_idx].pos = "EF".to_string();
        }
    }

    // Pass 13: normalize jamo-decomposed EF surfaces to full syllables.
    for token in tokens.iter_mut() {
        if token.pos == "EF" {
            match token.surface.as_str() {
                "ㅔ요" => token.surface = "에요".to_string(),
                "ㅐ요" => token.surface = "애요".to_string(),
                "ㅔ" => token.surface = "에".to_string(),
                "ㅐ" => token.surface = "애".to_string(),
                _ => {}
            }
        }
    }

    // Pass 14: a sentence-final connective "ㄴ데(요)" / "네" is a final ending;
    // also normalize the trailing-jamo (U+11AB) spelling to compatibility jamo.
    if let Some(last) = tokens.last_mut() {
        if last.pos == "EC" {
            if last.surface == "ᆫ데요" || last.surface == "ㄴ데요" {
                last.surface = "ㄴ데요".to_string();
                last.pos = "EF".to_string();
            } else if last.surface == "ᆫ데" || last.surface == "ㄴ데" {
                last.surface = "ㄴ데".to_string();
                last.pos = "EF".to_string();
            } else if last.surface == "네" {
                last.pos = "EF".to_string();
            }
        }
    }

    // Pass 15: join sino-Korean digit + place-value unit pairs
    // ("이" + "십" -> "이십") into a single NR token. `continue` without
    // advancing lets runs like 일+천, (일천)+... collapse left-to-right.
    const SINO_DIGITS: [&str; 9] = ["일", "이", "삼", "사", "오", "육", "칠", "팔", "구"];
    const PLACE_UNITS: [&str; 4] = ["십", "백", "천", "만"];
    let mut idx = 0;
    while idx + 1 < tokens.len() {
        if tokens[idx].pos == "NR"
            && tokens[idx + 1].pos == "NR"
            && PLACE_UNITS.contains(&tokens[idx + 1].surface.as_str())
            && SINO_DIGITS.contains(&tokens[idx].surface.as_str())
        {
            let merged = format!("{}{}", tokens[idx].surface, tokens[idx + 1].surface);
            tokens[idx].surface = merged;
            // Bug fix: the merged token previously kept the digit's end_pos;
            // adopt the absorbed unit's end so the span covers both pieces,
            // consistent with every other merge pass in this function.
            tokens[idx].end_pos = tokens[idx + 1].end_pos;
            tokens.remove(idx + 1);
            continue;
        }
        idx += 1;
    }

    // Pass 16: a lone single-syllable sino numeral left as NR is a plain
    // number (SN). Runs after Pass 15 so merged multi-syllable NRs keep NR.
    let single_sino_numerals: std::collections::HashSet<&str> = [
        "일", "이", "삼", "사", "오", "육", "칠", "팔", "구", "영", "공",
    ]
    .into_iter()
    .collect();
    for token in tokens.iter_mut() {
        if token.pos == "NR"
            && token.surface.chars().count() == 1
            && single_sino_numerals.contains(token.surface.as_str())
        {
            token.pos = "SN".to_string();
        }
    }

    // Pass 17: fold the polite particle "요/JX" into a preceding "어/EF",
    // yielding the polite final ending "어요/EF".
    let mut i = 0;
    while i + 1 < tokens.len() {
        if tokens[i].surface == "어"
            && tokens[i].pos == "EF"
            && tokens[i + 1].surface == "요"
            && tokens[i + 1].pos == "JX"
        {
            let start = tokens[i].start_pos;
            let end = tokens[i + 1].end_pos;
            tokens[i] = SejongToken::new("어요", "EF", start, end);
            tokens.remove(i + 1);
            continue;
        }
        i += 1;
    }

    // Pass 18: a sentence ending in "채(VV) + 아/ㅏ(EF)" is the dependent
    // noun "채"; retag and drop the spurious ending.
    let len = tokens.len();
    if len >= 2
        && tokens[len - 2].surface == "채"
        && tokens[len - 2].pos == "VV"
        && (tokens[len - 1].surface == "아" || tokens[len - 1].surface == "ㅏ")
        && tokens[len - 1].pos == "EF"
    {
        tokens[len - 2].pos = "NNB".to_string();
        tokens.remove(len - 1);
    }
}