use crate::sejong::types::SejongToken;
use super::xsv_and_ec_ef::apply_xsv_and_ec_ef_corrections;
use super::xsv_morpheme_split::apply_xsv_morpheme_split_corrections;
use super::sentence_final_endings::apply_sentence_final_endings_corrections;
/// Runs the full battery of sentence-final correction passes over the token
/// stream produced by the upstream analyzer.
///
/// Each pass scans for one specific mis-tagging pattern and repairs it in
/// place (retagging, splitting, merging, or deleting tokens). The passes run
/// in a fixed order because later passes rely on shapes produced by earlier
/// ones (e.g. the quotative fixes expect the EF retagging to have happened).
pub(super) fn apply_sentence_final_corrections(tokens: &mut Vec<SejongToken>) {
    apply_xsv_and_ec_ef_corrections(tokens);
    apply_xsv_morpheme_split_corrections(tokens);
    split_standalone_verb_base_nouns(tokens);
    retag_ha_xsv_after_ef(tokens);
    merge_se_ayo_into_seyo(tokens);
    split_fused_negative_adverbs(tokens);
    merge_jago_quotative(tokens);
    split_goha_after_da(tokens);
    promote_independent_vx_to_vv(tokens);
    drop_redundant_si_before_endings(tokens);
    merge_ddohan(tokens);
    merge_hanguk_compounds(tokens);
    retag_sentence_final_ja(tokens);
    merge_abeoji(tokens);
    merge_eomeona(tokens);
    merge_quotative_go_ha(tokens);
    apply_sentence_final_endings_corrections(tokens);
}

/// Splits a dictionary-form verb ("하다", "가다", …) mis-tagged as a bare
/// noun (NNG) into a verb stem (VV) plus the final ending "다" (EF). Only
/// fires when the token is "standalone": last in the stream, or not followed
/// by a verbal/ending token that would indicate a correct analysis.
fn split_standalone_verb_base_nouns(tokens: &mut Vec<SejongToken>) {
    let verb_base_forms: std::collections::HashSet<&str> = [
        "하다", "가다", "오다", "보다", "사다", "주다", "타다", "서다", "나다",
    ]
    .into_iter()
    .collect();
    let mut split_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len() {
        if tokens[i].pos == "NNG" && verb_base_forms.contains(tokens[i].surface.as_str()) {
            let is_standalone = match tokens.get(i + 1) {
                Some(next) => {
                    !next.pos.starts_with("VV")
                        && !next.pos.starts_with("VA")
                        && !next.pos.starts_with("EC")
                        && !next.pos.starts_with("EF")
                        && !next.pos.starts_with("EP")
                }
                None => true,
            };
            if is_standalone {
                split_indices.push(i);
            }
        }
    }
    // Process right-to-left so earlier indices stay valid across inserts.
    for idx in split_indices.into_iter().rev() {
        let surface = tokens[idx].surface.clone();
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        // Drop the trailing "다" syllable to obtain the stem.
        let stem: String = surface.chars().take(surface.chars().count() - 1).collect();
        tokens[idx].surface = stem;
        tokens[idx].pos = "VV".to_string();
        // NOTE(review): the inserted "다" token inherits the full word span
        // (start..end), same as the stem token — confirm this is intentional.
        tokens.insert(idx + 1, SejongToken::new("다", "EF", start, end));
    }
}

/// Retags "하" mis-tagged as a verbalizing suffix (XSV) to a full verb (VV)
/// when it directly follows a final ending (EF).
fn retag_ha_xsv_after_ef(tokens: &mut Vec<SejongToken>) {
    // The original scan skipped index 1; start at 2 to preserve behavior.
    for i in 2..tokens.len() {
        if tokens[i - 1].pos == "EF" && tokens[i].surface == "하" && tokens[i].pos == "XSV" {
            tokens[i].pos = "VV".to_string();
        }
    }
}

/// Rebuilds the honorific ending "세요": a VV stem ending in "세" followed by
/// "아요" (EF) is corrected to a shortened stem plus "세요" (EF).
fn merge_se_ayo_into_seyo(tokens: &mut Vec<SejongToken>) {
    let mut fixes: Vec<(usize, String)> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        let curr = &tokens[i];
        let next = &tokens[i + 1];
        if curr.pos == "VV"
            && curr.surface.ends_with("세")
            && curr.surface.chars().count() >= 2
            && next.surface == "아요"
            && next.pos == "EF"
        {
            // The trailing "세" leaves the stem and joins the ending.
            let stem: String = curr
                .surface
                .chars()
                .take(curr.surface.chars().count() - 1)
                .collect();
            fixes.push((i, stem));
        }
    }
    for (idx, stem) in fixes.into_iter().rev() {
        let start = tokens[idx].start_pos;
        // Defensive: idx + 1 always exists given how matches were collected,
        // but keep the guard from the original.
        let end = if idx + 1 < tokens.len() {
            tokens[idx + 1].end_pos
        } else {
            tokens[idx].end_pos
        };
        tokens[idx].surface = stem;
        // NOTE(review): the ending token is widened to span from the stem's
        // start, overlapping the stem token — confirm the span convention.
        tokens[idx + 1].surface = "세요".to_string();
        tokens[idx + 1].start_pos = start;
        tokens[idx + 1].end_pos = end;
    }
}

/// Splits a two-syllable VV whose first syllable is a fused negation adverb
/// ("안", "못") into a MAG adverb token plus the remaining verb stem.
fn split_fused_negative_adverbs(tokens: &mut Vec<SejongToken>) {
    let neg_adverbs: std::collections::HashSet<&str> = ["안", "못"].into_iter().collect();
    let mut splits: Vec<(usize, String, String)> = Vec::new();
    for (i, token) in tokens.iter().enumerate() {
        if token.pos == "VV" && token.surface.chars().count() == 2 {
            let first_char: String = token.surface.chars().take(1).collect();
            if neg_adverbs.contains(first_char.as_str()) {
                let verb_stem: String = token.surface.chars().skip(1).collect();
                splits.push((i, first_char, verb_stem));
            }
        }
    }
    for (idx, adv, stem) in splits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        tokens[idx].surface = adv;
        tokens[idx].pos = "MAG".to_string();
        // NOTE(review): this gives the adverb a zero-width span (end == start)
        // while the verb keeps the full word span — confirm intended.
        tokens[idx].end_pos = start;
        tokens.insert(
            idx + 1,
            SejongToken {
                surface: stem.clone(),
                pos: "VV".to_string(),
                start_pos: start,
                end_pos: end,
                original_surface: Some(stem),
                original_pos: Some("VV".to_string()),
            },
        );
    }
}

/// Collapses the quotative sequence "자" (EF) + "이" (VCP) + "고" (EC) into a
/// single "자고" (EC) token.
fn merge_jago_quotative(tokens: &mut Vec<SejongToken>) {
    let mut merge_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].pos == "EF"
            && tokens[i].surface == "자"
            && tokens[i + 1].surface == "이"
            && tokens[i + 1].pos == "VCP"
            && tokens[i + 2].surface == "고"
            && tokens[i + 2].pos == "EC"
        {
            merge_indices.push(i);
        }
    }
    for idx in merge_indices.into_iter().rev() {
        tokens[idx].surface = "자고".to_string();
        tokens[idx].pos = "EC".to_string();
        // Guards retained: an earlier (right-to-left) merge may have
        // shortened the vector if matches overlapped.
        if idx + 2 < tokens.len() {
            tokens.remove(idx + 2);
        }
        if idx + 1 < tokens.len() {
            tokens.remove(idx + 1);
        }
    }
}

/// Splits "고하" (VV) preceded by "다" (EF) into "다고" (EC) + "하" (VV):
/// the quotative particle belongs with the preceding ending, not the verb.
fn split_goha_after_da(tokens: &mut Vec<SejongToken>) {
    let mut split_indices: Vec<usize> = Vec::new();
    for i in 1..tokens.len() {
        if tokens[i].surface == "고하"
            && tokens[i].pos == "VV"
            && tokens[i - 1].surface == "다"
            && tokens[i - 1].pos == "EF"
        {
            split_indices.push(i);
        }
    }
    for idx in split_indices.into_iter().rev() {
        // idx >= 1 is guaranteed by the scan above.
        tokens[idx - 1].surface = "다고".to_string();
        tokens[idx - 1].pos = "EC".to_string();
        tokens[idx].surface = "하".to_string();
    }
}

/// Promotes an auxiliary verb (VX) to a main verb (VV) when the preceding EC
/// is not the connective "어"/"아" that marks a true auxiliary construction.
fn promote_independent_vx_to_vv(tokens: &mut Vec<SejongToken>) {
    let independent_vx: std::collections::HashSet<&str> =
        ["보", "하", "가", "오"].into_iter().collect();
    for i in 1..tokens.len() {
        if tokens[i].pos == "VX"
            && independent_vx.contains(tokens[i].surface.as_str())
            && tokens[i - 1].pos == "EC"
        {
            let prev_surface = &tokens[i - 1].surface;
            if prev_surface != "어" && prev_surface != "아" {
                tokens[i].pos = "VV".to_string();
            }
        }
    }
}

/// Removes a spurious honorific "시" (EP) directly before endings that do not
/// combine with a bare "시" in this form ("니까", "면", "니", and variants).
fn drop_redundant_si_before_endings(tokens: &mut Vec<SejongToken>) {
    let ec_after_si: std::collections::HashSet<&str> =
        ["니까", "면", "니", "으니까", "으면", "으니"]
            .into_iter()
            .collect();
    let mut remove_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].surface == "시"
            && tokens[i].pos == "EP"
            && ec_after_si.contains(tokens[i + 1].surface.as_str())
        {
            remove_indices.push(i);
        }
    }
    for idx in remove_indices.into_iter().rev() {
        tokens.remove(idx);
    }
}

/// Merges the mis-split sequence "또" (MAG) + "하" (VV) + "ㄴ" (ETM) back
/// into the single adverb "또한" (MAG).
fn merge_ddohan(tokens: &mut Vec<SejongToken>) {
    let mut merge_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].surface == "또"
            && tokens[i].pos == "MAG"
            && tokens[i + 1].surface == "하"
            && tokens[i + 1].pos == "VV"
            && tokens[i + 2].surface == "ㄴ"
            && tokens[i + 2].pos == "ETM"
        {
            merge_indices.push(i);
        }
    }
    for idx in merge_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 2].end_pos;
        tokens[idx] = SejongToken::new("또한", "MAG", start, end);
        tokens.remove(idx + 2);
        tokens.remove(idx + 1);
    }
}

/// Reassembles "한국"-family proper nouns mis-split as "하" (VV) + "ㄴ" (ETM)
/// + a "국…" noun. "국의" is special-cased: it yields "한국" (NNP) plus the
/// genitive marker "의" (JKG); the other suffixes merge into a single NNP.
fn merge_hanguk_compounds(tokens: &mut Vec<SejongToken>) {
    let hanguk_patterns: [(&str, &str); 4] = [
        ("국", "NNG"),
        ("국의", "NNG"),
        ("국어", "NNG"),
        ("국인", "NNG"),
    ];
    let mut merges: Vec<(usize, String)> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].surface == "하"
            && tokens[i].pos == "VV"
            && tokens[i + 1].surface == "ㄴ"
            && tokens[i + 1].pos == "ETM"
        {
            for (suffix, pos) in &hanguk_patterns {
                if tokens[i + 2].surface == *suffix && tokens[i + 2].pos == *pos {
                    merges.push((i, (*suffix).to_string()));
                    break;
                }
            }
        }
    }
    for (idx, suffix) in merges.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 2].end_pos;
        if suffix == "국의" {
            // NOTE(review): `start + 2` assumes "한국" always occupies two
            // position units from `start` — confirm the offset convention.
            tokens[idx] = SejongToken::new("한국", "NNP", start, start + 2);
            tokens[idx + 1] = SejongToken::new("의", "JKG", start + 2, end);
            tokens.remove(idx + 2);
        } else {
            let merged_surface = format!("한{suffix}");
            tokens[idx] = SejongToken::new(&merged_surface, "NNP", start, end);
            tokens.remove(idx + 2);
            tokens.remove(idx + 1);
        }
    }
}

/// Retags a sentence-final "자" mis-tagged as a noun (NNG) after a verb stem
/// to the propositive final ending (EF).
fn retag_sentence_final_ja(tokens: &mut Vec<SejongToken>) {
    let n = tokens.len();
    if n >= 2
        && tokens[n - 1].surface == "자"
        && tokens[n - 1].pos == "NNG"
        && tokens[n - 2].pos == "VV"
    {
        tokens[n - 1].pos = "EF".to_string();
    }
}

/// Merges "아버" (NNP) + "지" (VX) back into the common noun "아버지" (NNG).
fn merge_abeoji(tokens: &mut Vec<SejongToken>) {
    let mut merge_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].surface == "아버"
            && tokens[i].pos == "NNP"
            && tokens[i + 1].surface == "지"
            && tokens[i + 1].pos == "VX"
        {
            merge_indices.push(i);
        }
    }
    for idx in merge_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new("아버지", "NNG", start, end);
        tokens.remove(idx + 1);
    }
}

/// Merges "어머" (IC) + "나" (NP) back into the interjection "어머나" (IC).
fn merge_eomeona(tokens: &mut Vec<SejongToken>) {
    let mut merge_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].surface == "어머"
            && tokens[i].pos == "IC"
            && tokens[i + 1].surface == "나"
            && tokens[i + 1].pos == "NP"
        {
            merge_indices.push(i);
        }
    }
    for idx in merge_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new("어머나", "IC", start, end);
        tokens.remove(idx + 1);
    }
}

/// Folds a quotative "고" mis-tagged as a noun (NNG) into the preceding final
/// ending (EF becomes EC "…고"), then retags the following "하" (XSV) to VV
/// since it now acts as the main quoting verb.
fn merge_quotative_go_ha(tokens: &mut Vec<SejongToken>) {
    let mut merge_indices: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].pos == "EF"
            && tokens[i + 1].surface == "고"
            && tokens[i + 1].pos == "NNG"
            && tokens[i + 2].surface == "하"
            && tokens[i + 2].pos == "XSV"
        {
            merge_indices.push(i);
        }
    }
    for idx in merge_indices.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        let merged_surface = format!("{}고", tokens[idx].surface);
        tokens[idx] = SejongToken::new(&merged_surface, "EC", start, end);
        tokens.remove(idx + 1);
        // After the removal, the former "하" token sits at idx + 1.
        if idx + 1 < tokens.len()
            && tokens[idx + 1].surface == "하"
            && tokens[idx + 1].pos == "XSV"
        {
            tokens[idx + 1].pos = "VV".to_string();
        }
    }
}