use std::borrow::Cow;
use crate::rules::token::{SpaceKind, Token, WordMeta, WordToken};
use crate::rules::token_rule::{TokenAction, TokenPhase, TokenRule};
/// Token rule that appends one pre-encoded zero value after the final word of
/// the token slice when that word ends with `*`.
///
/// NOTE(review): the variable naming in the rule suggests the zero value is a
/// blank/space cell — confirm against the encoder's output format.
pub struct AsteriskSpacingRule;
/// Token rule that splits a Korean word ending in one of `AUX_VERB_SUFFIXES`
/// into three tokens: the leading part, a regular space, and the suffix.
pub struct KoreanAuxiliaryVerbSpacingRule;
/// Word endings that `split_aux_verb` detaches from the preceding text.
///
/// Entries are tried in the order listed; the table currently holds `있다`
/// with and without a trailing period.
const AUX_VERB_SUFFIXES: &[&str] = &[
"있다.", "있다",
];
/// Splits `text` into `(stem, suffix)` using the first entry of
/// `AUX_VERB_SUFFIXES` for which all of the following hold:
///
/// * `text` ends with that entry,
/// * the remaining stem is not empty, and
/// * the stem contains at least one character accepted by
///   `crate::utils::is_korean_char`.
///
/// Returns `None` when no entry qualifies.
fn split_aux_verb(text: &str) -> Option<(&str, &str)> {
    AUX_VERB_SUFFIXES.iter().find_map(|&ending| {
        let stem = text.strip_suffix(ending)?;
        let stem_qualifies = !stem.is_empty() && stem.chars().any(crate::utils::is_korean_char);
        stem_qualifies.then_some((stem, ending))
    })
}
impl TokenRule for KoreanAuxiliaryVerbSpacingRule {
    /// This rule is registered for the normalization phase.
    fn phase(&self) -> TokenPhase {
        TokenPhase::Normalization
    }

    /// Priority value reported to the rule engine.
    fn priority(&self) -> u16 {
        50
    }

    /// Inserts a regular space before a trailing auxiliary-verb suffix.
    ///
    /// Returns `TokenAction::Noop` unless the token at `index` is a word whose
    /// metadata is flagged `has_korean` and `split_aux_verb` finds a split
    /// point. Otherwise returns `TokenAction::ReplaceMany` with three tokens:
    /// the stem word, `SpaceKind::Regular`, and the suffix word. Both new
    /// words own their text and get metadata recomputed from their own
    /// characters.
    ///
    /// This implementation never returns `Err`.
    fn apply<'a>(
        &self,
        tokens: &[Token<'a>],
        index: usize,
        _state: &mut crate::rules::context::EncoderState,
    ) -> Result<TokenAction<'a>, String> {
        let Some(Token::Word(word)) = tokens.get(index) else {
            return Ok(TokenAction::Noop);
        };
        if !word.meta.has_korean {
            return Ok(TokenAction::Noop);
        }
        let Some((prefix, suffix)) = split_aux_verb(word.text.as_ref()) else {
            return Ok(TokenAction::Noop);
        };

        // Builds an owned word token from one half of the split. The metadata
        // is derived before `chars` is moved into the token, so the character
        // vector is allocated exactly once per half.
        let owned_word = |part: &str| {
            let chars: Vec<char> = part.chars().collect();
            let meta = WordMeta::from_chars(&chars);
            Token::Word(WordToken {
                text: Cow::Owned(part.to_string()),
                chars,
                meta,
            })
        };

        Ok(TokenAction::ReplaceMany(vec![
            owned_word(prefix),
            Token::Space(SpaceKind::Regular),
            owned_word(suffix),
        ]))
    }
}
/// Reports whether no `Token::Word` appears after position `index` in
/// `tokens`.
///
/// Non-word tokens after `index` are ignored, and an `index` at or beyond the
/// end of the slice yields `true` because nothing follows it.
fn is_last_word_index(tokens: &[Token], index: usize) -> bool {
    tokens
        .iter()
        .skip(index + 1)
        .all(|later| !matches!(later, Token::Word(_)))
}
impl TokenRule for AsteriskSpacingRule {
    /// This rule is registered for the post-word phase.
    fn phase(&self) -> TokenPhase {
        TokenPhase::PostWord
    }

    /// Priority value reported to the rule engine.
    fn priority(&self) -> u16 {
        400
    }

    /// Appends a single pre-encoded zero value after a final word that ends
    /// with `*`.
    ///
    /// Returns `TokenAction::Noop` unless the token at `index` is a word, its
    /// text ends with `*` (a bare `"*"` included), and no other word token
    /// follows it. Otherwise returns `TokenAction::ReplaceMany` containing a
    /// clone of the word followed by `Token::PreEncoded` holding one `0`.
    ///
    /// This implementation never returns `Err`.
    ///
    /// NOTE(review): the rule does not look at the tokens after the word, so
    /// it cannot tell whether a `PreEncoded` value was already appended —
    /// confirm the engine does not re-run it on its own output.
    fn apply<'a>(
        &self,
        tokens: &[Token<'a>],
        index: usize,
        _state: &mut crate::rules::context::EncoderState,
    ) -> Result<TokenAction<'a>, String> {
        let Some(Token::Word(current)) = tokens.get(index) else {
            return Ok(TokenAction::Noop);
        };
        // The suffix test is constant-time while the last-word check walks the
        // remainder of the slice, so the cheap test goes first.
        if !current.text.ends_with('*') || !is_last_word_index(tokens, index) {
            return Ok(TokenAction::Noop);
        }
        Ok(TokenAction::ReplaceMany(vec![
            Token::Word(current.clone()),
            Token::PreEncoded(vec![0]),
        ]))
    }
}