use crate::sejong::types::SejongToken;
/// Post-processing pass over a Sejong-tagged token stream that repairs a set
/// of known mis-segmentations: it merges, splits, or retags specific surface
/// form / POS patterns in place.
///
/// All span arithmetic is done in characters via `chars().count()`; the
/// `start_pos`/`end_pos` fields are therefore assumed to be char offsets —
/// NOTE(review): confirm against `SejongToken`'s definition.
pub(super) fn apply_compound_noun_corrections(tokens: &mut Vec<SejongToken>) {
    merge_gaji_go(tokens);
    split_bam_nat(tokens);
    split_xpn_prefix_compounds(tokens);
    retag_jageun_jip(tokens);
    merge_known_compound_nouns(tokens);
    retag_va_plus_i(tokens);
    split_hida_after_verb(tokens);
}

/// Merges the three-token pattern 가/VV + 지/(NNB|VX) + 고/EC into
/// 가지/VV + 고/EC: the first two tokens are fused and the EC token is kept.
fn merge_gaji_go(tokens: &mut Vec<SejongToken>) {
    let mut i = 0;
    while i + 2 < tokens.len() {
        let is_pattern = tokens[i].surface == "가"
            && tokens[i].pos == "VV"
            && tokens[i + 1].surface == "지"
            && (tokens[i + 1].pos == "NNB" || tokens[i + 1].pos == "VX")
            && tokens[i + 2].surface == "고"
            && tokens[i + 2].pos == "EC";
        if is_pattern {
            tokens[i].surface = "가지".to_string();
            tokens[i].end_pos = tokens[i + 1].end_pos;
            tokens.remove(i + 1);
            // Skip past the (now adjacent) EC token.
            i += 2;
        } else {
            i += 1;
        }
    }
}

/// Splits 밤낮/NNG into two NNG tokens 밤 + 낮, preserving the overall span.
fn split_bam_nat(tokens: &mut Vec<SejongToken>) {
    let mut i = 0;
    while i < tokens.len() {
        if tokens[i].pos == "NNG" && tokens[i].surface == "밤낮" {
            let start = tokens[i].start_pos;
            let end = tokens[i].end_pos;
            let first_len = "밤".chars().count();
            tokens[i] = SejongToken::new("밤", "NNG", start, start + first_len);
            tokens.insert(i + 1, SejongToken::new("낮", "NNG", start + first_len, end));
            // Step over both halves of the split.
            i += 2;
        } else {
            i += 1;
        }
    }
}

/// Re-analyzes selected NNG tokens as an XPN prefix followed by an NNG noun
/// (e.g. 맨손 -> 맨/XPN + 손/NNG).
fn split_xpn_prefix_compounds(tokens: &mut Vec<SejongToken>) {
    let xpn_compounds: std::collections::HashMap<&str, (&str, &str)> = [
        ("맨손", ("맨", "손")),
        ("맨발", ("맨", "발")),
        ("맨몸", ("맨", "몸")),
        ("맨땅", ("맨", "땅")),
    ]
    .into_iter()
    .collect();
    let mut i = 0;
    while i < tokens.len() {
        if tokens[i].pos == "NNG" {
            if let Some((prefix, noun)) = xpn_compounds.get(tokens[i].surface.as_str()) {
                let start = tokens[i].start_pos;
                let end = tokens[i].end_pos;
                let prefix_len = prefix.chars().count();
                tokens[i] = SejongToken::new(prefix, "XPN", start, start + prefix_len);
                tokens.insert(
                    i + 1,
                    SejongToken::new(noun, "NNG", start + prefix_len, end),
                );
                i += 2;
                continue;
            }
        }
        i += 1;
    }
}

/// Retags 작/VA + 은/ETM as XPN + XPN when the next token's surface is 집
/// (only the POS labels change; surfaces and spans are untouched).
fn retag_jageun_jip(tokens: &mut Vec<SejongToken>) {
    // `0..len-2` guarantees `i + 2` is a valid index (empty range when len < 3).
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].surface == "작"
            && tokens[i].pos == "VA"
            && tokens[i + 1].surface == "은"
            && tokens[i + 1].pos == "ETM"
            && tokens[i + 2].surface == "집"
        {
            tokens[i].pos = "XPN".to_string();
            tokens[i + 1].pos = "XPN".to_string();
        }
    }
}

/// Merges whitelisted NNG+NNG pairs into a single NNG token spanning both.
fn merge_known_compound_nouns(tokens: &mut Vec<SejongToken>) {
    let compound_nouns: std::collections::HashSet<(&str, &str)> = [
        ("무역", "수지"),
        ("여론", "조사"),
        ("시민", "단체"),
        ("국민", "경제"),
        ("경제", "성장"),
        ("대통령", "선거"),
        ("정부", "정책"),
        ("환경", "보호"),
        ("인공", "지능"),
        ("형태소", "분석"),
    ]
    .into_iter()
    .collect();
    let mut i = 0;
    while i + 1 < tokens.len() {
        if tokens[i].pos == "NNG" && tokens[i + 1].pos == "NNG" {
            let pair = (tokens[i].surface.as_str(), tokens[i + 1].surface.as_str());
            if compound_nouns.contains(&pair) {
                let start = tokens[i].start_pos;
                let end = tokens[i + 1].end_pos;
                let merged = format!("{}{}", tokens[i].surface, tokens[i + 1].surface);
                tokens[i] = SejongToken::new(&merged, "NNG", start, end);
                tokens.remove(i + 1);
                // Stay on the merged token so a chained compound can merge again.
                continue;
            }
        }
        i += 1;
    }
}

/// Two related corrections for adjective stems mis-tagged as nouns:
/// 1. an NNG like 높이 is split into VA stem + 이/EC (e.g. 높/VA + 이/EC);
/// 2. an NNG stem (높/낮/깊/넓) followed by 이/JKS is retagged VA + EC in place.
fn retag_va_plus_i(tokens: &mut Vec<SejongToken>) {
    let va_ec_words: std::collections::HashMap<&str, &str> = [
        ("높이", "높"),
        ("낮이", "낮"),
        ("깊이", "깊"),
        ("넓이", "넓"),
    ]
    .into_iter()
    .collect();
    let va_stems: std::collections::HashSet<&str> =
        ["높", "낮", "깊", "넓"].into_iter().collect();
    let mut i = 0;
    while i < tokens.len() {
        // Rule 1: split a fused NNG into VA stem + 이/EC.
        if tokens[i].pos == "NNG" {
            if let Some(&stem) = va_ec_words.get(tokens[i].surface.as_str()) {
                let start = tokens[i].start_pos;
                let end = tokens[i].end_pos;
                let stem_len = stem.chars().count();
                tokens[i] = SejongToken::new(stem, "VA", start, start + stem_len);
                tokens.insert(i + 1, SejongToken::new("이", "EC", start + stem_len, end));
                i += 2;
                continue;
            }
        }
        // Rule 2: retag an already-split NNG stem + 이/JKS pair.
        let retag_pair = i + 1 < tokens.len()
            && tokens[i].pos == "NNG"
            && va_stems.contains(tokens[i].surface.as_str())
            && tokens[i + 1].surface == "이"
            && tokens[i + 1].pos == "JKS";
        if retag_pair {
            tokens[i].pos = "VA".to_string();
            tokens[i + 1].pos = "EC".to_string();
            i += 2;
        } else {
            i += 1;
        }
    }
}

/// Splits a 히다/NNP token that directly follows a VV into 히/VX + 다/EF.
fn split_hida_after_verb(tokens: &mut Vec<SejongToken>) {
    // Start at 1: the rule needs a preceding token to inspect.
    let mut i = 1;
    while i < tokens.len() {
        if tokens[i].surface == "히다"
            && tokens[i].pos == "NNP"
            && tokens[i - 1].pos == "VV"
        {
            let start = tokens[i].start_pos;
            let end = tokens[i].end_pos;
            // BUGFIX: the original used `"히".len()` (3 UTF-8 bytes). Every
            // other split in this pass measures positions in chars via
            // `chars().count()`, so the split point here must be 1, not 3.
            let split = "히".chars().count();
            tokens[i] = SejongToken::new("히", "VX", start, start + split);
            tokens.insert(i + 1, SejongToken::new("다", "EF", start + split, end));
            i += 2;
        } else {
            i += 1;
        }
    }
}