use crate::sejong::types::SejongToken;
// Fused compounds that should be re-split into a noun prefix (XPN) followed
// by a common noun (NNG). Keyed by the fused surface form:
// (fused_surface, (prefix, noun)).
static XPN_COMPOUNDS: &[(&str, (&str, &str))] = &[
("맨손", ("맨", "손")),
("맨발", ("맨", "발")),
("맨몸", ("맨", "몸")),
("맨땅", ("맨", "땅")),
];
// Whitelisted adjacent NNG + NNG pairs that should be merged back into a
// single compound NNG token (first_noun, second_noun).
static COMPOUND_NOUN_PAIRS: &[(&str, &str)] = &[
("무역", "수지"),
("여론", "조사"),
("시민", "단체"),
("국민", "경제"),
("경제", "성장"),
("대통령", "선거"),
("정부", "정책"),
("환경", "보호"),
("인공", "지능"),
("형태소", "분석"),
];
// Surface forms mistagged as single NNG tokens that should be re-split into
// an adjective stem (VA) plus the connective ending "이" (EC):
// (mistagged_surface, adjective_stem).
static VA_EC_WORDS: &[(&str, &str)] = &[
("높이", "높"),
("낮이", "낮"),
("깊이", "깊"),
("넓이", "넓"),
];
static VA_STEMS: &[&str] = &["높", "낮", "깊", "넓"];
/// Applies a sequence of hand-written post-processing corrections to the
/// token stream, fixing systematic tagger errors around compound nouns and
/// a few related splits/merges.
///
/// Passes run in order over `tokens`:
/// 1. Re-join the over-split "가/VV 지/NNB|VX 고/EC" into "가지/VV 고/EC".
/// 2. Split the fused "밤낮"/NNG into "밤"/NNG + "낮"/NNG.
/// 3. Split known prefix compounds (see `XPN_COMPOUNDS`) into XPN + NNG.
/// 4. Retag "작/VA 은/ETM" as XPN + XPN when immediately followed by "집".
/// 5. Merge whitelisted adjacent NNG + NNG pairs (`COMPOUND_NOUN_PAIRS`)
///    into a single compound NNG; merging can cascade left-to-right.
/// 6. Retag nominalized adjectives ("높이" etc., `VA_EC_WORDS`/`VA_STEMS`)
///    as VA stem + "이"/EC, both for fused tokens and for NNG + "이"/JKS.
/// 7. Re-split a spurious "히다"/NNP after a verb into "히"/VX + "다"/EF.
///
/// Token positions are maintained in *character* units throughout: every
/// split computes the boundary with `chars().count()`, never byte length.
pub(super) fn apply_compound_noun_corrections(tokens: &mut Vec<SejongToken>) {
// Pass 1: "가 + 지 + 고" → "가지 + 고". The surface of token i absorbs
// token i+1; POS stays VV and the span extends to cover both pieces.
let mut i = 0;
while i + 2 < tokens.len() {
if tokens[i].surface == "가"
&& tokens[i].pos == "VV"
&& tokens[i + 1].surface == "지"
&& (tokens[i + 1].pos == "NNB" || tokens[i + 1].pos == "VX")
&& tokens[i + 2].surface == "고"
&& tokens[i + 2].pos == "EC"
{
tokens[i].surface = "가지".to_string();
tokens[i].end_pos = tokens[i + 1].end_pos;
tokens.remove(i + 1);
// After the removal, "고" sits at i + 1; skip past the pair.
i += 2;
continue;
}
i += 1;
}
// Pass 2: split the fused "밤낮" into two NNG tokens sharing the span.
let mut i = 0;
while i < tokens.len() {
if tokens[i].pos == "NNG" && tokens[i].surface == "밤낮" {
let start = tokens[i].start_pos;
let end = tokens[i].end_pos;
let first_len = "밤".chars().count();
tokens[i] = SejongToken::new("밤", "NNG", start, start + first_len);
tokens.insert(i + 1, SejongToken::new("낮", "NNG", start + first_len, end));
i += 2;
continue;
}
i += 1;
}
// Pass 3: split prefix compounds ("맨손" → "맨"/XPN + "손"/NNG).
let mut i = 0;
while i < tokens.len() {
if tokens[i].pos == "NNG" {
if let Some(&(_, (prefix, noun))) = XPN_COMPOUNDS
.iter()
.find(|(k, _)| *k == tokens[i].surface.as_str())
{
let start = tokens[i].start_pos;
let end = tokens[i].end_pos;
let prefix_len = prefix.chars().count();
tokens[i] = SejongToken::new(prefix, "XPN", start, start + prefix_len);
tokens.insert(
i + 1,
SejongToken::new(noun, "NNG", start + prefix_len, end),
);
i += 2;
continue;
}
}
i += 1;
}
// Pass 4: "작/VA 은/ETM" directly before "집" is treated as a prefix; only
// the POS tags change, surfaces and spans are untouched.
for i in 0..tokens.len().saturating_sub(1) {
if tokens[i].surface == "작"
&& tokens[i].pos == "VA"
&& tokens[i + 1].surface == "은"
&& tokens[i + 1].pos == "ETM"
{
if i + 2 < tokens.len() && tokens[i + 2].surface == "집" {
tokens[i].pos = "XPN".to_string();
tokens[i + 1].pos = "XPN".to_string();
}
}
}
// Pass 5: merge whitelisted adjacent NNG + NNG pairs. On a merge we do NOT
// advance `i`, so the merged token may merge again with its new neighbor.
let mut i = 0;
while i + 1 < tokens.len() {
if tokens[i].pos == "NNG" && tokens[i + 1].pos == "NNG" {
let a = tokens[i].surface.as_str();
let b = tokens[i + 1].surface.as_str();
if COMPOUND_NOUN_PAIRS
.iter()
.any(|&(ka, kb)| ka == a && kb == b)
{
let start = tokens[i].start_pos;
let end = tokens[i + 1].end_pos;
let merged = format!("{}{}", tokens[i].surface, tokens[i + 1].surface);
tokens[i] = SejongToken::new(&merged, "NNG", start, end);
tokens.remove(i + 1);
continue;
}
}
i += 1;
}
// Pass 6: adjective re-analysis — first the fused forms ("높이"/NNG →
// "높"/VA + "이"/EC), then the already-split NNG + "이"/JKS variant, where
// only the POS tags need retagging.
let mut i = 0;
while i < tokens.len() {
if tokens[i].pos == "NNG" {
if let Some(stem) = VA_EC_WORDS
.iter()
.find(|(k, _)| *k == tokens[i].surface.as_str())
.map(|(_, v)| *v)
{
let start = tokens[i].start_pos;
let end = tokens[i].end_pos;
let stem_len = stem.chars().count();
tokens[i] = SejongToken::new(stem, "VA", start, start + stem_len);
tokens.insert(i + 1, SejongToken::new("이", "EC", start + stem_len, end));
i += 2;
continue;
}
}
if i + 1 < tokens.len()
&& tokens[i].pos == "NNG"
&& VA_STEMS.contains(&tokens[i].surface.as_str())
&& tokens[i + 1].surface == "이"
&& tokens[i + 1].pos == "JKS"
{
tokens[i].pos = "VA".to_string();
tokens[i + 1].pos = "EC".to_string();
i += 2;
continue;
}
i += 1;
}
// Pass 7: a "히다"/NNP right after a verb is re-analyzed as the auxiliary
// "히"/VX plus the final ending "다"/EF.
let mut i = 0;
while i < tokens.len() {
if i > 0
&& tokens[i].surface == "히다"
&& tokens[i].pos == "NNP"
&& tokens[i - 1].pos == "VV"
{
let start = tokens[i].start_pos;
let end = tokens[i].end_pos;
// Bug fix: use the character count (1) for the split offset, matching
// every other pass. The previous `"히".len()` was the UTF-8 *byte*
// length (3), which produced spans inconsistent with the rest of the
// character-based positions.
let split = "히".chars().count();
tokens[i] = SejongToken::new("히", "VX", start, start + split);
tokens.insert(i + 1, SejongToken::new("다", "EF", start + split, end));
i += 2;
continue;
}
i += 1;
}
}