use crate::sejong::types::SejongToken;
/// Applies a fixed sequence of heuristic post-processing passes that correct
/// systematic tokenizer errors around Korean verb morphology: splitting fused
/// verb+suffix tokens, merging over-split tokens, and retagging POS labels
/// that depend on neighboring tokens.
///
/// The passes run in their original order; several later passes consume tags
/// produced by earlier ones (e.g. honorific "시" handling), so the sequence
/// must not be reordered.
pub(super) fn apply_verb_splitting_corrections(tokens: &mut Vec<SejongToken>) {
    split_gi_nominalizations(tokens);
    split_hada_verbs(tokens);
    merge_gonaseo(tokens);
    split_honorific_si(tokens);
    merge_jeon_topic(tokens);
    retag_locative_e(tokens);
    retag_bnida_to_final(tokens);
    normalize_etm_jamo(tokens);
    select_bnida_allomorph(tokens);
    split_passive_suffix(tokens);
    retag_si_after_verb(tokens);
    split_causative_suffix(tokens);
    merge_i_m(tokens);
    split_contracted_endings(tokens);
    retag_final_ayo(tokens);
    retag_medial_go(tokens);
    split_hago_after_noun(tokens);
    retag_ha_myeonseo(tokens);
    split_mm_modifiers(tokens);
    split_single_syllable_vv(tokens);
    drop_duplicate_ha(tokens);
    split_jeone(tokens);
    retag_ha_ji(tokens);
    retag_itda_after_jks(tokens);
}

/// Shared splitter used by several passes: replaces `tokens[idx]` with a left
/// token tagged `left_pos` and a right token tagged `right_pos`, dividing the
/// original character span at the left part's character count. Matches are
/// collected in ascending index order and applied in reverse so insertions do
/// not shift pending indices.
fn apply_splits(
    tokens: &mut Vec<SejongToken>,
    matches: Vec<(usize, String, String)>,
    left_pos: &str,
    right_pos: &str,
) {
    for (idx, left, right) in matches.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        let left_len = left.chars().count();
        tokens[idx] = SejongToken::new(&left, left_pos, start, start + left_len);
        tokens.insert(
            idx + 1,
            SejongToken::new(&right, right_pos, start + left_len, end),
        );
    }
}

/// Splits "-기" nominalizations mistagged as plain nouns (NNG) into the verb
/// stem (VV) plus the nominalizing ending "기" (ETN).
fn split_gi_nominalizations(tokens: &mut Vec<SejongToken>) {
    let table: std::collections::HashMap<&str, &str> = [
        ("가기", "가"),
        ("오기", "오"),
        ("하기", "하"),
        ("먹기", "먹"),
        ("보기", "보"),
        ("듣기", "듣"),
        ("읽기", "읽"),
        ("쓰기", "쓰"),
        ("걷기", "걷"),
        ("달리기", "달리"),
        ("말하기", "말하"),
    ]
    .into_iter()
    .collect();
    let matches: Vec<(usize, String, String)> = tokens
        .iter()
        .enumerate()
        .filter(|(_, t)| t.pos == "NNG")
        .filter_map(|(i, t)| {
            table
                .get(t.surface.as_str())
                .map(|&stem| (i, stem.to_string(), "기".to_string()))
        })
        .collect();
    apply_splits(tokens, matches, "VV", "ETN");
}

/// Splits "<noun>하" forms mistagged as one verb (VV) into the noun (NNG)
/// plus the light verb "하" (VV).
fn split_hada_verbs(tokens: &mut Vec<SejongToken>) {
    let table: std::collections::HashMap<&str, &str> = [
        ("말씀하", "말씀"),
        ("공부하", "공부"),
        ("준비하", "준비"),
        ("사용하", "사용"),
        ("시작하", "시작"),
        ("운동하", "운동"),
        ("요리하", "요리"),
        ("청소하", "청소"),
        ("여행하", "여행"),
        ("산책하", "산책"),
        ("연습하", "연습"),
        ("설명하", "설명"),
    ]
    .into_iter()
    .collect();
    let matches: Vec<(usize, String, String)> = tokens
        .iter()
        .enumerate()
        .filter(|(_, t)| t.pos == "VV")
        .filter_map(|(i, t)| {
            table
                .get(t.surface.as_str())
                .map(|&noun| (i, noun.to_string(), "하".to_string()))
        })
        .collect();
    apply_splits(tokens, matches, "NNG", "VV");
}

/// Merges an over-split "고" (EC) + "나서" (VV) pair into the single
/// connective ending "고나서" (EC), and drops a stray trailing "어" (EC)
/// left behind by the bad split.
///
/// NOTE(review): the scan bound is `len - 2`, so a matching pair sitting at
/// the very end of the token list is never merged — confirm whether that is
/// intentional (e.g. the merged ending is assumed to precede more material).
fn merge_gonaseo(tokens: &mut Vec<SejongToken>) {
    let mut hits: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(2) {
        if tokens[i].surface == "고"
            && tokens[i].pos == "EC"
            && tokens[i + 1].surface == "나서"
            && tokens[i + 1].pos == "VV"
        {
            hits.push(i);
        }
    }
    for idx in hits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new("고나서", "EC", start, end);
        tokens.remove(idx + 1);
        // Discard a leftover "어" ending immediately after the merge.
        if idx + 1 < tokens.len()
            && tokens[idx + 1].surface == "어"
            && tokens[idx + 1].pos == "EC"
        {
            tokens.remove(idx + 1);
        }
    }
}

/// Splits honorific verb forms ending in "시" (e.g. "드시") into the bare
/// stem (original VV/VA tag kept) plus the honorific pre-final ending "시"
/// (EP). A spurious duplicate "시" (NNB) right after the split is removed.
fn split_honorific_si(tokens: &mut Vec<SejongToken>) {
    let honorifics: std::collections::HashSet<&str> = [
        "드시",
        "오시",
        "가시",
        "주시",
        "보시",
        "하시",
        "잡수시",
        "계시",
        "나오시",
        "들어오시",
    ]
    .into_iter()
    .collect();
    let hits: Vec<usize> = tokens
        .iter()
        .enumerate()
        .filter(|(_, t)| {
            (t.pos == "VV" || t.pos == "VA") && honorifics.contains(t.surface.as_str())
        })
        .map(|(i, _)| i)
        .collect();
    for idx in hits.into_iter().rev() {
        let surface = tokens[idx].surface.clone();
        let pos = tokens[idx].pos.clone();
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        if let Some(stem) = surface.strip_suffix("시") {
            if !stem.is_empty() {
                let stem_len = stem.chars().count();
                tokens[idx] = SejongToken::new(stem, &pos, start, start + stem_len);
                tokens.insert(idx + 1, SejongToken::new("시", "EP", start + stem_len, end));
                if idx + 2 < tokens.len()
                    && tokens[idx + 2].surface == "시"
                    && tokens[idx + 2].pos == "NNB"
                {
                    tokens.remove(idx + 2);
                }
            }
        }
    }
}

/// Merges pronoun "저" (NP) followed by a topic-marker jamo "ㄴ"/"ᆫ" (JX)
/// into a single token with surface "전".
///
/// NOTE(review): the merged token is tagged NNG ("전" the noun) rather than
/// a pronoun+particle contraction — confirm this tag is intended.
fn merge_jeon_topic(tokens: &mut Vec<SejongToken>) {
    let mut hits: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].surface == "저"
            && tokens[i].pos == "NP"
            && (tokens[i + 1].surface == "ᆫ" || tokens[i + 1].surface == "ㄴ")
            && tokens[i + 1].pos == "JX"
        {
            hits.push(i);
        }
    }
    for idx in hits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new("전", "NNG", start, end);
        tokens.remove(idx + 1);
    }
}

/// Retags "에" mislabeled sentence-final (EF) to the adverbial particle
/// (JKB) when it directly follows a known time/place noun.
fn retag_locative_e(tokens: &mut Vec<SejongToken>) {
    let nouns: std::collections::HashSet<&str> = [
        "전",
        "후",
        "동안",
        "사이",
        "때",
        "곳",
        "집",
        "학교",
        "회사",
        "시작",
        "끝",
        "처음",
        "마지막",
        "오늘",
        "내일",
        "어제",
    ]
    .into_iter()
    .collect();
    for i in 0..tokens.len().saturating_sub(1) {
        let head_ok = (tokens[i].pos == "NNG" || tokens[i].pos == "NNB")
            && nouns.contains(tokens[i].surface.as_str());
        if head_ok && tokens[i + 1].surface == "에" && tokens[i + 1].pos == "EF" {
            tokens[i + 1].pos = "JKB".to_string();
        }
    }
}

/// Retags "ㅂ니다"/"ㅂ니까" mislabeled connective (EC) as sentence-final
/// (EF), normalizing a leading choseong jamo "ᄇ" (U+1107) to the
/// compatibility form "ㅂ".
fn retag_bnida_to_final(tokens: &mut Vec<SejongToken>) {
    for token in tokens.iter_mut() {
        if token.pos != "EC" {
            continue;
        }
        if token.surface == "ᄇ니다" || token.surface == "ㅂ니다" {
            token.pos = "EF".to_string();
            token.surface = "ㅂ니다".to_string();
        } else if token.surface == "ᄇ니까" || token.surface == "ㅂ니까" {
            token.pos = "EF".to_string();
            token.surface = "ㅂ니까".to_string();
        }
    }
}

/// Rewrites jongseong jamo (U+11AB/U+11AF/U+11B7) in ETM surfaces to their
/// compatibility-jamo equivalents (ㄴ/ㄹ/ㅁ).
fn normalize_etm_jamo(tokens: &mut Vec<SejongToken>) {
    for token in tokens.iter_mut() {
        if token.pos == "ETM" {
            let normalized = token
                .surface
                .replace('ᆫ', "ㄴ")
                .replace('ᆯ', "ㄹ")
                .replace('ᆷ', "ㅁ");
            if normalized != token.surface {
                token.surface = normalized;
            }
        }
    }
}

/// Chooses between the "ㅂ니다/ㅂ니까" and "습니다/습니까" allomorphs of the
/// formal final ending based on the preceding token: after the pre-final
/// endings 었/았/였/겠 (EP) — or after a VCP — the 습- form is selected;
/// otherwise only the leading jamo is normalized.
///
/// NOTE(review): selecting "습니다" after VCP is unusual (the copula
/// normally takes the ㅂ- form, as in 입니다) — confirm that branch.
fn select_bnida_allomorph(tokens: &mut Vec<SejongToken>) {
    for i in 0..tokens.len() {
        if tokens[i].pos != "EF" {
            continue;
        }
        let surface = tokens[i].surface.clone();
        let is_bnida = surface == "ㅂ니다" || surface == "ᄇ니다";
        let is_bnikka = surface == "ㅂ니까" || surface == "ᄇ니까";
        if !is_bnida && !is_bnikka {
            continue;
        }
        let (prev_surface, prev_pos) = if i > 0 {
            (tokens[i - 1].surface.clone(), tokens[i - 1].pos.clone())
        } else {
            (String::new(), String::new())
        };
        let use_seup = (prev_pos == "EP"
            && matches!(prev_surface.as_str(), "었" | "겠" | "았" | "였"))
            || prev_pos == "VCP";
        if use_seup {
            tokens[i].surface = if is_bnida { "습니다" } else { "습니까" }.to_string();
        } else if surface == "ᄇ니다" {
            tokens[i].surface = "ㅂ니다".to_string();
        } else if surface == "ᄇ니까" {
            tokens[i].surface = "ㅂ니까".to_string();
        }
    }
}

/// Splits known passive verb forms (e.g. "열리") into the stem (VV) plus
/// the passive suffix, tagged VX per this pipeline's existing convention.
fn split_passive_suffix(tokens: &mut Vec<SejongToken>) {
    let table: std::collections::HashMap<&str, (&str, &str)> = [
        ("열리", ("열", "리")),
        ("걸리", ("걸", "리")),
        ("눌리", ("눌", "리")),
        ("밀리", ("밀", "리")),
        ("끌리", ("끌", "리")),
        ("뚫리", ("뚫", "리")),
        ("풀리", ("풀", "리")),
        ("팔리", ("팔", "리")),
        ("불리", ("불", "리")),
        ("보이", ("보", "이")),
        ("쓰이", ("쓰", "이")),
        ("덮이", ("덮", "이")),
        ("놓이", ("놓", "이")),
        ("쌓이", ("쌓", "이")),
        ("먹이", ("먹", "이")),
        ("잡히", ("잡", "히")),
        ("읽히", ("읽", "히")),
        ("막히", ("막", "히")),
        ("묻히", ("묻", "히")),
        ("닫히", ("닫", "히")),
        ("꽂히", ("꽂", "히")),
        ("안기", ("안", "기")),
        ("쫓기", ("쫓", "기")),
    ]
    .into_iter()
    .collect();
    let matches: Vec<(usize, String, String)> = tokens
        .iter()
        .enumerate()
        .filter(|(_, t)| t.pos == "VV")
        .filter_map(|(i, t)| {
            table
                .get(t.surface.as_str())
                .map(|&(stem, sfx)| (i, stem.to_string(), sfx.to_string()))
        })
        .collect();
    apply_splits(tokens, matches, "VV", "VX");
}

/// Retags a "시" token mislabeled NNB as the honorific pre-final ending
/// (EP) when it sits between a verb stem and an ending (EP/EF/EC).
fn retag_si_after_verb(tokens: &mut Vec<SejongToken>) {
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].pos != "VV"
            || tokens[i + 1].surface != "시"
            || tokens[i + 1].pos != "NNB"
        {
            continue;
        }
        let ending_follows = tokens
            .get(i + 2)
            .map(|t| t.pos == "EP" || t.pos == "EF" || t.pos == "EC")
            .unwrap_or(false);
        if ending_follows {
            tokens[i + 1].pos = "EP".to_string();
        }
    }
}

/// Splits known causative verb forms (e.g. "입히") into the stem (VV) plus
/// the causative suffix, tagged VX per this pipeline's existing convention.
fn split_causative_suffix(tokens: &mut Vec<SejongToken>) {
    let table: std::collections::HashMap<&str, (&str, &str)> = [
        ("입히", ("입", "히")),
        ("읽히", ("읽", "히")),
        ("익히", ("익", "히")),
        ("앉히", ("앉", "히")),
        ("눕히", ("눕", "히")),
        ("없히", ("없", "히")),
        ("묻히", ("묻", "히")),
        ("넓히", ("넓", "히")),
        ("죽이", ("죽", "이")),
        ("살리", ("살", "리")),
        ("올리", ("올", "리")),
        ("내리", ("내", "리")),
        ("돌리", ("돌", "리")),
        ("굴리", ("굴", "리")),
        ("울리", ("울", "리")),
        ("벗기", ("벗", "기")),
        ("숨기", ("숨", "기")),
        ("옮기", ("옮", "기")),
        ("알리", ("알", "리")),
        ("날리", ("날", "리")),
    ]
    .into_iter()
    .collect();
    let matches: Vec<(usize, String, String)> = tokens
        .iter()
        .enumerate()
        .filter(|(_, t)| t.pos == "VV")
        .filter_map(|(i, t)| {
            table
                .get(t.surface.as_str())
                .map(|&(stem, sfx)| (i, stem.to_string(), sfx.to_string()))
        })
        .collect();
    apply_splits(tokens, matches, "VV", "VX");
}

/// Merges "이" (VX) + "ㅁ" (ETN) into the contracted nominalization
/// "임" (ETN).
fn merge_i_m(tokens: &mut Vec<SejongToken>) {
    let mut hits: Vec<usize> = Vec::new();
    for i in 1..tokens.len() {
        if tokens[i - 1].surface == "이"
            && tokens[i - 1].pos == "VX"
            && tokens[i].surface == "ㅁ"
            && tokens[i].pos == "ETN"
        {
            hits.push(i - 1);
        }
    }
    for idx in hits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx + 1].end_pos;
        tokens[idx] = SejongToken::new("임", "ETN", start, end);
        tokens.remove(idx + 1);
    }
}

/// Splits contracted verb+ending tokens carrying a composite "EC+VX" tag
/// (e.g. "볼게요" → "보" VV + "ㄹ게요" EF).
///
/// Fix: the previous implementation broke out of the scan after the first
/// match, so at most one contraction per call was ever split. All matches
/// are now handled with the same collect-then-apply-in-reverse pattern used
/// by the sibling passes.
fn split_contracted_endings(tokens: &mut Vec<SejongToken>) {
    let table: std::collections::HashMap<&str, (&str, &str)> = [
        ("볼게요", ("보", "ㄹ게요")),
        ("할게요", ("하", "ㄹ게요")),
        ("갈게요", ("가", "ㄹ게요")),
        ("올게요", ("오", "ㄹ게요")),
        ("줄게요", ("주", "ㄹ게요")),
        ("볼게", ("보", "ㄹ게")),
        ("할게", ("하", "ㄹ게")),
        ("갈게", ("가", "ㄹ게")),
        ("올게", ("오", "ㄹ게")),
        ("줄게", ("주", "ㄹ게")),
        ("볼까요", ("보", "ㄹ까요")),
        ("할까요", ("하", "ㄹ까요")),
        ("갈까요", ("가", "ㄹ까요")),
        ("올까요", ("오", "ㄹ까요")),
        ("볼래요", ("보", "ㄹ래요")),
        ("할래요", ("하", "ㄹ래요")),
        ("갈래요", ("가", "ㄹ래요")),
    ]
    .into_iter()
    .collect();
    let matches: Vec<(usize, String, String)> = tokens
        .iter()
        .enumerate()
        .filter(|(_, t)| t.pos.contains("EC+VX"))
        .filter_map(|(i, t)| {
            table
                .get(t.surface.as_str())
                .map(|&(stem, ending)| (i, stem.to_string(), ending.to_string()))
        })
        .collect();
    apply_splits(tokens, matches, "VV", "EF");
}

/// Retags sentence-final "아요" from EC to EF when it follows a verbal
/// token and is either the last token or directly precedes punctuation (SF).
fn retag_final_ayo(tokens: &mut Vec<SejongToken>) {
    for i in 0..tokens.len() {
        if tokens[i].pos != "EC" || tokens[i].surface != "아요" {
            continue;
        }
        let at_sentence_end = i + 1 == tokens.len() || tokens[i + 1].pos == "SF";
        let after_verbal =
            i > 0 && matches!(tokens[i - 1].pos.as_str(), "XSV" | "VV" | "VA" | "VX");
        if at_sentence_end && after_verbal {
            tokens[i].pos = "EF".to_string();
        }
    }
}

/// Retags a non-final "고" from EF to EC when it follows a verbal token:
/// "고" in mid-sentence position is a connective, not a final ending.
fn retag_medial_go(tokens: &mut Vec<SejongToken>) {
    for i in 0..tokens.len() {
        if tokens[i].pos != "EF" || tokens[i].surface != "고" {
            continue;
        }
        let mid_sentence = i + 1 < tokens.len();
        let after_verbal =
            i > 0 && matches!(tokens[i - 1].pos.as_str(), "XSV" | "VV" | "VA" | "VX");
        if mid_sentence && after_verbal {
            tokens[i].pos = "EC".to_string();
        }
    }
}

/// Splits "하고" mistagged as a conjunctive particle (JC) into "하" (XSV) +
/// "고" (EC) when it sits between a common noun and an auxiliary verb.
fn split_hago_after_noun(tokens: &mut Vec<SejongToken>) {
    let mut hits: Vec<usize> = Vec::new();
    for i in 1..tokens.len().saturating_sub(1) {
        if tokens[i - 1].pos == "NNG"
            && tokens[i].surface == "하고"
            && tokens[i].pos == "JC"
            && tokens[i + 1].pos == "VX"
        {
            hits.push(i);
        }
    }
    for idx in hits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        // Both halves are single characters; the span is divided at start + 1.
        tokens[idx] = SejongToken::new("하", "XSV", start, start + 1);
        tokens.insert(idx + 1, SejongToken::new("고", "EC", start + 1, end));
    }
}

/// Retags a noun + "하" (IC) + "면서" (EF) sequence: "하" becomes the
/// verb-deriving suffix (XSV) and "면서" the connective ending (EC).
fn retag_ha_myeonseo(tokens: &mut Vec<SejongToken>) {
    for i in 1..tokens.len().saturating_sub(1) {
        if tokens[i - 1].pos == "NNG"
            && tokens[i].surface == "하"
            && tokens[i].pos == "IC"
            && tokens[i + 1].surface == "면서"
            && tokens[i + 1].pos == "EF"
        {
            tokens[i].pos = "XSV".to_string();
            tokens[i + 1].pos = "EC".to_string();
        }
    }
}

/// Splits modifier forms mistagged as determiners (MM), e.g. "오는", into
/// the verb stem (VV) plus the adnominal ending "는" (ETM).
fn split_mm_modifiers(tokens: &mut Vec<SejongToken>) {
    let table: std::collections::HashMap<&str, (&str, &str)> = [
        ("오는", ("오", "는")),
        ("가는", ("가", "는")),
        ("하는", ("하", "는")),
        ("되는", ("되", "는")),
        ("있는", ("있", "는")),
        ("없는", ("없", "는")),
        ("먹는", ("먹", "는")),
        ("보는", ("보", "는")),
        ("받는", ("받", "는")),
        ("주는", ("주", "는")),
    ]
    .into_iter()
    .collect();
    let matches: Vec<(usize, String, String)> = tokens
        .iter()
        .enumerate()
        .filter(|(_, t)| t.pos == "MM")
        .filter_map(|(i, t)| {
            table
                .get(t.surface.as_str())
                .map(|&(stem, ending)| (i, stem.to_string(), ending.to_string()))
        })
        .collect();
    apply_splits(tokens, matches, "VV", "ETM");
}

/// Splits a single-syllable contracted modifier mistagged VV (e.g. "간"
/// directly before a noun) into the stem (VV) plus a jamo adnominal ending
/// (ETM).
///
/// The stem keeps the full original span and the jamo ending is inserted
/// with a zero-width span at `end`, since the jamo has no surface position
/// of its own. NOTE(review): every other split divides the span instead —
/// confirm downstream consumers accept zero-width spans.
fn split_single_syllable_vv(tokens: &mut Vec<SejongToken>) {
    let table: std::collections::HashMap<&str, (&str, &str)> = [
        ("간", ("가", "ㄴ")),
        ("온", ("오", "ㄴ")),
        ("본", ("보", "ㄴ")),
        ("한", ("하", "ㄴ")),
        ("된", ("되", "ㄴ")),
        ("난", ("나", "ㄴ")),
        ("준", ("주", "ㄴ")),
        ("쓴", ("쓰", "ㄴ")),
        ("산", ("사", "ㄴ")),
        ("갈", ("가", "ㄹ")),
        ("올", ("오", "ㄹ")),
        ("볼", ("보", "ㄹ")),
        ("할", ("하", "ㄹ")),
        ("될", ("되", "ㄹ")),
        ("줄", ("주", "ㄹ")),
        ("쓸", ("쓰", "ㄹ")),
        ("살", ("사", "ㄹ")),
    ]
    .into_iter()
    .collect();
    let mut hits: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].pos == "VV"
            && tokens[i].surface.chars().count() == 1
            && table.contains_key(tokens[i].surface.as_str())
            && matches!(tokens[i + 1].pos.as_str(), "NNG" | "NNP" | "NNB")
        {
            hits.push(i);
        }
    }
    for idx in hits.into_iter().rev() {
        let surface = tokens[idx].surface.clone();
        if let Some(&(stem, etm)) = table.get(surface.as_str()) {
            let start = tokens[idx].start_pos;
            let end = tokens[idx].end_pos;
            tokens[idx] = SejongToken::new(stem, "VV", start, end);
            tokens.insert(idx + 1, SejongToken::new(etm, "ETM", end, end));
        }
    }
}

/// Removes a redundant "하" (VX) directly before a full "합니다" (EF)
/// token, which already carries the 하- stem.
fn drop_duplicate_ha(tokens: &mut Vec<SejongToken>) {
    let mut hits: Vec<usize> = Vec::new();
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].surface == "하"
            && tokens[i].pos == "VX"
            && tokens[i + 1].surface == "합니다"
            && tokens[i + 1].pos == "EF"
        {
            hits.push(i);
        }
    }
    for idx in hits.into_iter().rev() {
        tokens.remove(idx);
    }
}

/// Splits adverb-tagged "전에" (MAG) into the noun "전" (NNG) plus the
/// particle "에" (JKB), dividing the span at start + 1.
fn split_jeone(tokens: &mut Vec<SejongToken>) {
    let hits: Vec<usize> = tokens
        .iter()
        .enumerate()
        .filter(|(_, t)| t.pos == "MAG" && t.surface == "전에")
        .map(|(i, _)| i)
        .collect();
    for idx in hits.into_iter().rev() {
        let start = tokens[idx].start_pos;
        let end = tokens[idx].end_pos;
        tokens[idx] = SejongToken::new("전", "NNG", start, start + 1);
        tokens.insert(idx + 1, SejongToken::new("에", "JKB", start + 1, end));
    }
}

/// Retags "하" (IC) + "지" (VX) as verb "하" (VV) + connective "지" (EC).
fn retag_ha_ji(tokens: &mut Vec<SejongToken>) {
    for i in 0..tokens.len().saturating_sub(1) {
        if tokens[i].surface == "하"
            && tokens[i].pos == "IC"
            && tokens[i + 1].surface == "지"
            && tokens[i + 1].pos == "VX"
        {
            tokens[i].pos = "VV".to_string();
            tokens[i + 1].pos = "EC".to_string();
        }
    }
}

/// Retags "있" from auxiliary (VX) to main verb (VV) when it directly
/// follows a subject particle (JKS).
///
/// NOTE(review): the scan starts at index 2, so a JKS at index 0 is never
/// examined; a particle cannot open a sentence in practice, making this
/// harmless, but starting at 1 would be the natural bound — confirm.
fn retag_itda_after_jks(tokens: &mut Vec<SejongToken>) {
    for i in 2..tokens.len() {
        if tokens[i - 1].pos == "JKS"
            && tokens[i].surface == "있"
            && tokens[i].pos == "VX"
        {
            tokens[i].pos = "VV".to_string();
        }
    }
}