use super::types::SejongToken;
mod compound_and_irregular;
mod compound_noun;
mod conjugation;
mod particle_and_ending;
mod pos_reclassification;
mod post_conjugation;
mod sentence_final;
mod sentence_final_endings;
mod suffix_and_dependency;
mod tag_normalization;
mod verb_and_morpheme;
mod verb_splitting;
mod xsv_and_ec_ef;
mod xsv_morpheme_split;
#[cfg(test)]
mod tests;
use compound_and_irregular::apply_compound_and_irregular_corrections;
use compound_noun::apply_compound_noun_corrections;
use conjugation::apply_conjugation_corrections;
use particle_and_ending::apply_particle_and_ending_corrections;
use pos_reclassification::apply_pos_reclassification_corrections;
use post_conjugation::apply_post_conjugation_corrections;
use sentence_final::apply_sentence_final_corrections;
use suffix_and_dependency::apply_suffix_and_dependency_corrections;
use tag_normalization::apply_tag_normalization_corrections;
use verb_and_morpheme::apply_verb_and_morpheme_corrections;
#[allow(clippy::too_many_lines)]
pub(super) fn apply_context_corrections(tokens: &mut Vec<SejongToken>) {
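    // A verbalizing suffix (XSV) cannot open a sentence; an initial 하 is the
    // verb 하다.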
    if let Some(first) = tokens.first_mut() {
        if first.surface == "하" && first.pos == "XSV" {
            first.pos = "VV".to_string();
        }
    }
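    // Split the proper noun 서울특별시 into 서울/NNP + 특별시/NNG. The split
    // point advances by UTF-8 byte length, matching the `str::len` arithmetic
    // used throughout this pass.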
let mut i = 0;
while i < tokens.len() {
if tokens[i].surface == "서울특별시" && tokens[i].pos == "NNP" {
let original_start = tokens[i].start_pos;
let original_end = tokens[i].end_pos;
let original_surface = tokens[i].surface.clone();
let original_pos = tokens[i].pos.clone();
tokens[i] = SejongToken::from_split(
"서울",
"NNP",
original_start,
original_start + "서울".len(),
&original_surface,
&original_pos,
);
tokens.insert(
i + 1,
SejongToken::from_split(
"특별시",
"NNG",
original_start + "서울".len(),
original_end,
&original_surface,
&original_pos,
),
);
i += 2;
continue;
}
i += 1;
}
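    // Normalize the contracted form 그래 back to the verb stem 그러.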
for token in tokens.iter_mut() {
if token.surface == "그래" && token.pos == "VV" {
token.surface = "그러".to_string();
}
}
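    // Replace the conjoining jamo U+1106 (choseong mieum) with the
    // compatibility jamo ㅁ (U+3141) so nominalizer tokens compare equal.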
for token in tokens.iter_mut() {
if token.pos == "ETN" && token.surface == "\u{1106}" {
token.surface = "ㅁ".to_string();
}
}
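    // Re-merge mis-segmented 따라/NNB + 서/VV + 어/EC into the conjunctive
    // adverb 따라서/MAG.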
let mut i = 0;
while i + 2 < tokens.len() {
if tokens[i].surface == "따라"
&& tokens[i].pos == "NNB"
&& tokens[i + 1].surface == "서"
&& tokens[i + 1].pos == "VV"
&& tokens[i + 2].surface == "어"
&& tokens[i + 2].pos == "EC"
{
tokens[i].surface = "따라서".to_string();
tokens[i].pos = "MAG".to_string();
tokens[i].end_pos = tokens[i + 2].end_pos;
tokens.remove(i + 2);
tokens.remove(i + 1);
i += 1;
continue;
}
i += 1;
}
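    // 아 right after a common noun is a particle (JX), not an interjection.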
for i in 1..tokens.len() {
if tokens[i].surface == "아" && tokens[i].pos == "IC" && tokens[i - 1].pos == "NNG" {
tokens[i].pos = "JX".to_string();
}
}
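    // In 하여, the 여 after the suffix 하 is the connective ending 어, not a
    // noun suffix.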
for i in 0..tokens.len().saturating_sub(1) {
if tokens[i].surface == "하"
&& tokens[i].pos == "XSV"
&& tokens[i + 1].surface == "여"
&& tokens[i + 1].pos == "XSN"
{
tokens[i + 1].surface = "어".to_string();
tokens[i + 1].pos = "EC".to_string();
}
}
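    // Hand off compound-noun merging and POS reclassification to their modules.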
apply_compound_noun_corrections(tokens);
apply_pos_reclassification_corrections(tokens);
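    // Fuse adjective stem + nominalizer ㅁ into one noun when another noun
    // follows (e.g. 나쁘 + ㅁ -> 나쁨).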
let mut i = 0;
while i + 2 < tokens.len() {
if tokens[i].pos == "VA"
&& tokens[i + 1].surface == "ㅁ"
&& tokens[i + 1].pos == "ETN"
&& tokens[i + 2].pos == "NNG"
{
let start = tokens[i].start_pos;
let end = tokens[i + 1].end_pos;
            let merged_surface = if let Some(stem) = tokens[i].surface.strip_suffix("쁘") {
                // Contract 쁘 + ㅁ into the syllable 쁨 (나쁘 -> 나쁨, 예쁘 -> 예쁨)
                // rather than hard-coding 나쁨 for every 쁘-final stem.
                format!("{}쁨", stem)
            } else {
                format!("{}ㅁ", tokens[i].surface)
            };
tokens[i] = SejongToken::new(&merged_surface, "NNG", start, end);
tokens.remove(i + 1);
continue;
}
i += 1;
}
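    // 서 after 어디 is the contracted locative particle; restore 에서.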
for i in 0..tokens.len().saturating_sub(1) {
if tokens[i].surface == "어디"
&& tokens[i].pos == "NP"
&& tokens[i + 1].surface == "서"
&& tokens[i + 1].pos == "JKB"
{
tokens[i + 1].surface = "에서".to_string();
}
}
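    // After a pre-final ending (EP), the sequence 늘/VV + ㄴ데/EC is an
    // over-segmentation of the single connective ending 는데.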
let mut i = 0;
while i + 2 < tokens.len() {
if tokens[i].pos == "EP"
&& tokens[i + 1].surface == "늘"
&& tokens[i + 1].pos == "VV"
&& tokens[i + 2].surface == "ㄴ데"
&& tokens[i + 2].pos == "EC"
{
let start = tokens[i + 1].start_pos;
let end = tokens[i + 2].end_pos;
tokens[i + 1] = SejongToken::new("는데", "EC", start, end);
tokens.remove(i + 2);
i += 2;
continue;
}
i += 1;
}
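    // 그 + 동안 is lexicalized as the single noun 그동안.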
let mut i = 0;
while i + 1 < tokens.len() {
if tokens[i].surface == "그"
&& tokens[i].pos == "NP"
&& tokens[i + 1].surface == "동안"
&& tokens[i + 1].pos == "NNG"
{
let start = tokens[i].start_pos;
let end = tokens[i + 1].end_pos;
tokens[i] = SejongToken::new("그동안", "NNG", start, end);
tokens.remove(i + 1);
continue;
}
i += 1;
}
apply_tag_normalization_corrections(tokens);
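    // Conjunctive adverbs that legitimately stand alone as MAJ. A MAJ token
    // ending in 지만 that is not adjacent to one of these is treated as a
    // mis-tagged verb plus the ending 지만 and split below.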
let standalone_maj = [
"그러나",
"그래서",
"따라서",
"그리고",
"또한",
"그런데",
"또는",
"혹은",
];
let mut i = 0;
while i < tokens.len() {
if tokens[i].pos == "MAJ" && tokens[i].surface.ends_with("지만") {
            let prev_is_maj = i > 0
                && (tokens[i - 1].pos == "MAJ"
                    || standalone_maj.contains(&tokens[i - 1].surface.as_str()));
            let next_is_maj = i + 1 < tokens.len()
                && (tokens[i + 1].pos == "MAJ"
                    || standalone_maj.contains(&tokens[i + 1].surface.as_str()));
if prev_is_maj || next_is_maj {
i += 1;
continue;
}
            // The `ends_with` check above guarantees `strip_suffix` matches.
            if let Some(stem) = tokens[i].surface.strip_suffix("지만") {
                if !stem.is_empty() {
                    let start = tokens[i].start_pos;
                    let end = tokens[i].end_pos;
                    let stem_end = start + stem.len();
                    tokens[i] = SejongToken::new(stem, "VV", start, stem_end);
                    tokens.insert(i + 1, SejongToken::new("지만", "EC", stem_end, end));
                    i += 2;
                    continue;
                }
            }
}
i += 1;
}
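    // Likewise, a noun ending in 지 followed by the particle 만 can be a verb
    // stem plus the ending 지만 (e.g. 가지/NNG + 만/JX -> 가/VV + 지만/EC).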
let mut i = 0;
while i < tokens.len() {
if i + 1 < tokens.len()
&& tokens[i].pos == "NNG"
&& tokens[i + 1].surface == "만"
&& tokens[i + 1].pos == "JX"
{
            let verb_stems = ["가", "보", "하", "오", "서", "먹", "읽"];
            if let Some(stem) = tokens[i].surface.strip_suffix("지") {
                // Only resegment when the remainder is a known verb stem; the
                // empty stem is excluded because it is not in the list.
                if verb_stems.contains(&stem) {
                    let start = tokens[i].start_pos;
                    let end = tokens[i + 1].end_pos;
                    let stem_end = start + stem.len();
                    tokens[i] = SejongToken::new(stem, "VV", start, stem_end);
                    tokens[i + 1] = SejongToken::new("지만", "EC", stem_end, end);
                    i += 2;
                    continue;
                }
            }
}
i += 1;
}
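    // The remaining passes live in their own modules and run in this order,
    // so each later pass sees the corrections made by earlier ones.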
apply_particle_and_ending_corrections(tokens);
apply_verb_and_morpheme_corrections(tokens);
apply_compound_and_irregular_corrections(tokens);
apply_suffix_and_dependency_corrections(tokens);
apply_conjugation_corrections(tokens);
apply_post_conjugation_corrections(tokens);
apply_sentence_final_corrections(tokens);
}