use crate::sejong::types::SejongToken;
/// Rewrites `tokens` in place by running a fixed sequence of correction passes.
///
/// Each pass looks for one hard-coded surface/POS pattern and either merges the
/// matched tokens into a single noun, retags a token, splits a token in two, or
/// drops a token. The passes run in this order, and every pass sees the output
/// of the previous one:
///
/// 1. `하 + ㄹ + 머|아버 + 님` → `할머님` / `할아버님`
/// 2. `시 + 가 + ㄴ` → `시간`
/// 3. `주 + 말` → `주말`
/// 4. `가 + ㄹ + 등` → `갈등`
/// 5. `가/VV` directly after an `SL` token → `가/JKS`
/// 6. `지 + ㄴ + 행` → `진행`
/// 7. `어|아/EC` after a verb at an eojeol boundary → `EF`
/// 8. `아` after a stem ending in `하` → `어`
/// 9. `주/VX + 워|우면/NNG` → `줍/VV + 어|으면`
/// 10. `무거/NNG + 우면/NNG` → `무겁/VA + 으면/EC`
/// 11. `이르면/MAJ` after `VV`/`EF` → `이르/VV + 면/EC`
/// 12. `노래/NNG` after `노랗/VA + 다/EF` → `노랗/VA + 아/EF`
/// 13. `안/MAG` between a `VX` token and `으며/EC` is removed
///
/// Within a single pass all matches are collected against the unmodified token
/// list first and then applied from the last match to the first, so indices of
/// pending matches are never invalidated by an earlier edit.
pub(super) fn apply_compound_and_irregular_corrections(tokens: &mut Vec<SejongToken>) {
    merge_family_honorifics(tokens);

    merge_matches(tokens, 3, "시간", |w| {
        w[0].surface == "시"
            && matches!(w[0].pos.as_str(), "NNG" | "NNB")
            && w[1].surface == "가"
            && matches!(w[1].pos.as_str(), "VV" | "JKS")
            && is_morph(&w[2], "ㄴ", "ETM")
    });

    merge_matches(tokens, 2, "주말", |w| {
        is_morph(&w[0], "주", "VX") && is_morph(&w[1], "말", "NNG")
    });

    merge_matches(tokens, 3, "갈등", |w| {
        is_morph(&w[0], "가", "VV")
            && is_morph(&w[1], "ㄹ", "ETM")
            && w[2].surface == "등"
            && matches!(w[2].pos.as_str(), "NNG" | "NNB")
    });

    retag_ga_after_foreign_letters(tokens);

    merge_matches(tokens, 3, "진행", |w| {
        is_morph(&w[0], "지", "VX") && is_morph(&w[1], "ㄴ", "ETM") && is_morph(&w[2], "행", "NNG")
    });

    promote_eojeol_final_endings(tokens);
    normalize_ending_after_ha(tokens);
    repair_jup_conjugation(tokens);
    repair_mugeop_conjugation(tokens);
    split_ireumyeon(tokens);
    split_norae(tokens);
    drop_an_before_eumyeo(tokens);
}

/// Returns `true` when `token` has exactly the given surface form and POS tag.
fn is_morph(token: &SejongToken, surface: &str, pos: &str) -> bool {
    token.surface == surface && token.pos == pos
}

/// Slides a window of `width` tokens over `tokens` and returns, for every window
/// accepted by `is_match`, the window's start index plus `offset`.
///
/// `width` must be non-zero (`slice::windows` panics on zero). Lists shorter
/// than `width` yield no hits.
fn window_hits(
    tokens: &[SejongToken],
    width: usize,
    offset: usize,
    is_match: impl Fn(&[SejongToken]) -> bool,
) -> Vec<usize> {
    tokens
        .windows(width)
        .enumerate()
        .filter(|&(_, window)| is_match(window))
        .map(|(at, _)| at + offset)
        .collect()
}

/// Replaces `tokens[first..=last]` with one `NNG` token carrying `surface`,
/// spanning from the first token's `start_pos` to the last token's `end_pos`.
fn merge_span(tokens: &mut Vec<SejongToken>, first: usize, last: usize, surface: &str) {
    let start = tokens[first].start_pos;
    let end = tokens[last].end_pos;
    tokens[first] = SejongToken::new(surface, "NNG", start, end);
    // One drain shifts the tail once instead of once per removed token.
    tokens.drain(first + 1..=last);
}

/// Merges every run of `width` tokens accepted by `is_match` into a single
/// `NNG` token with surface `merged`.
fn merge_matches(
    tokens: &mut Vec<SejongToken>,
    width: usize,
    merged: &str,
    is_match: impl Fn(&[SejongToken]) -> bool,
) {
    let hits = window_hits(tokens, width, 0, is_match);
    // Back to front, so earlier start indices stay valid while we shrink the list.
    for first in hits.into_iter().rev() {
        merge_span(tokens, first, first + width - 1, merged);
    }
}

/// Merges `하/XSV|VV + ㄹ/ETM + 머/NP + 님/XSN` into `할머님/NNG` and
/// `하/XSV|VV + ㄹ/ETM + 아버/NNG + 님/XSN` into `할아버님/NNG`.
fn merge_family_honorifics(tokens: &mut Vec<SejongToken>) {
    let hits: Vec<(usize, &'static str)> = tokens
        .windows(4)
        .enumerate()
        .filter_map(|(at, w)| {
            // Both patterns share everything except the third token.
            let framed = w[0].surface == "하"
                && matches!(w[0].pos.as_str(), "XSV" | "VV")
                && is_morph(&w[1], "ㄹ", "ETM")
                && is_morph(&w[3], "님", "XSN");
            if !framed {
                None
            } else if is_morph(&w[2], "머", "NP") {
                Some((at, "할머님"))
            } else if is_morph(&w[2], "아버", "NNG") {
                Some((at, "할아버님"))
            } else {
                None
            }
        })
        .collect();

    for (first, merged) in hits.into_iter().rev() {
        merge_span(tokens, first, first + 3, merged);
    }
}

/// Retags `가/VV` as `가/JKS` when the preceding token is tagged `SL`.
///
/// When the `가` is additionally followed by `어|아/EC`, that ending token is
/// removed first; the retagged token keeps its own positions.
fn retag_ga_after_foreign_letters(tokens: &mut Vec<SejongToken>) {
    let with_ending = window_hits(tokens, 3, 1, |w| {
        w[0].pos == "SL"
            && is_morph(&w[1], "가", "VV")
            && matches!(w[2].surface.as_str(), "어" | "아")
            && w[2].pos == "EC"
    });
    for at in with_ending.into_iter().rev() {
        tokens[at].pos = "JKS".to_string();
        tokens.remove(at + 1);
    }

    // Remaining `SL + 가/VV` pairs that had no trailing ending to drop.
    for at in 1..tokens.len() {
        if is_morph(&tokens[at], "가", "VV") && tokens[at - 1].pos == "SL" {
            tokens[at].pos = "JKS".to_string();
        }
    }
}

/// Retags `어|아/EC` as `EF` when it follows a `VV`/`VA`/`VX` token and closes
/// its eojeol.
///
/// The ending is left alone after the stems listed in `EC_KEEP_VERBS`. An
/// ending counts as eojeol-final when it is the last token, when its
/// `original_surface` differs from the next token's, or when it has an
/// `original_surface` and the next token has none. An ending without an
/// `original_surface` is never promoted unless it is the last token.
fn promote_eojeol_final_endings(tokens: &mut [SejongToken]) {
    const EC_KEEP_VERBS: [&str; 7] = ["위하", "대하", "인하", "관하", "의하", "통하", "비하"];

    for at in 0..tokens.len() {
        if tokens[at].pos != "EC" || !matches!(tokens[at].surface.as_str(), "어" | "아") {
            continue;
        }
        if at == 0 {
            continue;
        }
        let prev = &tokens[at - 1];
        if !matches!(prev.pos.as_str(), "VV" | "VA" | "VX") {
            continue;
        }
        if EC_KEEP_VERBS.iter().any(|kept| prev.surface == *kept) {
            continue;
        }

        let eojeol_final = match tokens.get(at + 1) {
            None => true,
            Some(next) => match (&tokens[at].original_surface, &next.original_surface) {
                (Some(current), Some(following)) => current != following,
                (Some(_), None) => true,
                (None, _) => false,
            },
        };
        if eojeol_final {
            tokens[at].pos = "EF".to_string();
        }
    }
}

/// Rewrites the ending `아` (`EC` or `EF`) to `어` when the preceding `VV`/`XSV`
/// token's surface ends in `하`.
fn normalize_ending_after_ha(tokens: &mut [SejongToken]) {
    for at in 1..tokens.len() {
        let is_a_ending =
            matches!(tokens[at].pos.as_str(), "EC" | "EF") && tokens[at].surface == "아";
        let follows_ha = matches!(tokens[at - 1].pos.as_str(), "VV" | "XSV")
            && tokens[at - 1].surface.ends_with("하");
        if is_a_ending && follows_ha {
            tokens[at].surface = "어".to_string();
        }
    }
}

/// Rewrites `주/VX + 워/NNG` to `줍/VV + 어` and `주/VX + 우면/NNG` to
/// `줍/VV + 으면/EC`, keeping each token's original positions.
///
/// The `어` ending is tagged `EF` when it is the last token or the token after
/// it is tagged `VV`, `VX`, `NNG`, `NNP`, `NP` or `MAG`; otherwise `EC`.
fn repair_jup_conjugation(tokens: &mut [SejongToken]) {
    let hits: Vec<(usize, &'static str)> = tokens
        .windows(2)
        .enumerate()
        .filter_map(|(at, w)| {
            if !is_morph(&w[0], "주", "VX") || w[1].pos != "NNG" {
                return None;
            }
            match w[1].surface.as_str() {
                "워" => Some((at, "어")),
                "우면" => Some((at, "으면")),
                _ => None,
            }
        })
        .collect();

    for (at, ending) in hits.into_iter().rev() {
        let (stem_start, stem_end) = (tokens[at].start_pos, tokens[at].end_pos);
        let (ending_start, ending_end) = (tokens[at + 1].start_pos, tokens[at + 1].end_pos);
        tokens[at] = SejongToken::new("줍", "VV", stem_start, stem_end);

        let eojeol_final = if ending == "으면" {
            // The conditional ending is always connective.
            false
        } else {
            match tokens.get(at + 2) {
                None => true,
                Some(next) => {
                    matches!(next.pos.as_str(), "VV" | "VX" | "NNG" | "NNP" | "NP" | "MAG")
                }
            }
        };
        let ending_pos = if eojeol_final { "EF" } else { "EC" };
        tokens[at + 1] = SejongToken::new(ending, ending_pos, ending_start, ending_end);
    }
}

/// Rewrites `무거/NNG + 우면/NNG` to `무겁/VA + 으면/EC`, keeping each token's
/// original positions.
fn repair_mugeop_conjugation(tokens: &mut [SejongToken]) {
    let hits = window_hits(tokens, 2, 0, |w| {
        is_morph(&w[0], "무거", "NNG") && is_morph(&w[1], "우면", "NNG")
    });
    for at in hits.into_iter().rev() {
        let (stem_start, stem_end) = (tokens[at].start_pos, tokens[at].end_pos);
        let (ending_start, ending_end) = (tokens[at + 1].start_pos, tokens[at + 1].end_pos);
        tokens[at] = SejongToken::new("무겁", "VA", stem_start, stem_end);
        tokens[at + 1] = SejongToken::new("으면", "EC", ending_start, ending_end);
    }
}

/// Splits `이르면/MAJ` into `이르/VV + 면/EC` when the preceding token is tagged
/// `VV` or `EF`.
fn split_ireumyeon(tokens: &mut Vec<SejongToken>) {
    let hits = window_hits(tokens, 2, 1, |w| {
        matches!(w[0].pos.as_str(), "VV" | "EF") && is_morph(&w[1], "이르면", "MAJ")
    });
    for at in hits.into_iter().rev() {
        let start = tokens[at].start_pos;
        let end = tokens[at].end_pos;
        // The stem is given a fixed width of 2 position units.
        // NOTE(review): assumes positions count characters, not bytes — confirm.
        tokens[at] = SejongToken::new("이르", "VV", start, start + 2);
        tokens.insert(at + 1, SejongToken::new("면", "EC", start + 2, end));
    }
}

/// Splits `노래/NNG` into `노랗/VA + 아/EF` when it directly follows
/// `노랗/VA + 다/EF`.
fn split_norae(tokens: &mut Vec<SejongToken>) {
    let hits = window_hits(tokens, 3, 2, |w| {
        is_morph(&w[0], "노랗", "VA") && is_morph(&w[1], "다", "EF") && is_morph(&w[2], "노래", "NNG")
    });
    for at in hits.into_iter().rev() {
        let start = tokens[at].start_pos;
        let end = tokens[at].end_pos;
        // Same fixed stem width of 2 position units as in `split_ireumyeon`.
        tokens[at] = SejongToken::new("노랗", "VA", start, start + 2);
        tokens.insert(at + 1, SejongToken::new("아", "EF", start + 2, end));
    }
}

/// Removes `안/MAG` when it sits between a `VX` token and `으며/EC`.
fn drop_an_before_eumyeo(tokens: &mut Vec<SejongToken>) {
    let hits = window_hits(tokens, 3, 1, |w| {
        w[0].pos == "VX" && is_morph(&w[1], "안", "MAG") && is_morph(&w[2], "으며", "EC")
    });
    for at in hits.into_iter().rev() {
        tokens.remove(at);
    }
}