use crate::rules::context::EncoderState;
use crate::rules::token::Token;
use crate::rules::token_rule::{TokenAction, TokenPhase, TokenRule};
use crate::unicode::decode_unicode;
/// Token rule that rewrites a word starting with the private-use code point
/// U+F000 when the next non-space token is a word starting with "은/는".
///
/// The rule carries no state of its own; all behaviour lives in its
/// [`TokenRule`] implementation.
pub struct Rule73AppendixPlaceholderRule;
impl TokenRule for Rule73AppendixPlaceholderRule {
    /// This rule runs in the normalization phase.
    fn phase(&self) -> TokenPhase {
        TokenPhase::Normalization
    }

    /// Priority value reported for this rule.
    fn priority(&self) -> u16 {
        5
    }

    /// Rewrites a U+F000 placeholder word that is followed by a "은/는" word.
    ///
    /// Returns [`TokenAction::Noop`] unless all of the following hold:
    /// - `tokens[index]` is a word whose first character is U+F000;
    /// - after skipping any run of space tokens, the next token is a word;
    /// - that word's text starts with "은/는".
    ///
    /// On a match, returns [`TokenAction::ReplaceRange`] covering the
    /// placeholder, the intervening spaces and the following word. The
    /// replacement consists of, in order:
    /// 1. a pre-encoded token holding the fixed nine-cell braille prefix;
    /// 2. a word made of whatever followed U+F000 in the placeholder word
    ///    (omitted when nothing followed it);
    /// 3. a clone of the following word.
    ///
    /// The intervening space tokens are not re-emitted. `_state` is unused,
    /// and this implementation never returns `Err`.
    fn apply<'a>(
        &self,
        tokens: &[Token<'a>],
        index: usize,
        _state: &mut EncoderState,
    ) -> Result<TokenAction<'a>, String> {
        // Braille cells emitted in place of the U+F000 marker.
        const PREFIX_CELLS: [char; 9] = ['⠸', '⠦', '⠦', '⠄', '⠫', '⠠', '⠴', '⠴', '⠇'];

        // Only a word led by the private-use marker is of interest.
        let placeholder = match tokens.get(index) {
            Some(Token::Word(candidate)) if candidate.chars.first() == Some(&'\u{F000}') => {
                candidate
            }
            _ => return Ok(TokenAction::Noop),
        };

        // `index` is in bounds (the lookup above succeeded), so this slice is valid.
        let following = &tokens[index + 1..];

        // Locate the first non-space token after the placeholder, if any.
        let Some(gap) = following
            .iter()
            .position(|tok| !matches!(tok, Token::Space(_)))
        else {
            return Ok(TokenAction::Noop);
        };
        let Token::Word(particle) = &following[gap] else {
            return Ok(TokenAction::Noop);
        };
        if !particle.text.starts_with("은/는") {
            return Ok(TokenAction::Noop);
        }

        let prefix = PREFIX_CELLS
            .iter()
            .map(|&cell| decode_unicode(cell))
            .collect();
        let mut replacement: Vec<Token<'a>> = Vec::with_capacity(3);
        replacement.push(Token::PreEncoded(prefix));

        // Anything that trailed the marker inside the same word is kept as its own word.
        let tail: Vec<char> = placeholder.chars[1..].to_vec();
        if !tail.is_empty() {
            let tail_text: String = tail.iter().collect();
            let tail_meta = crate::rules::token::WordMeta::from_chars(&tail);
            replacement.push(Token::Word(crate::rules::token::WordToken {
                text: std::borrow::Cow::Owned(tail_text),
                chars: tail,
                meta: tail_meta,
            }));
        }

        replacement.push(Token::Word(particle.clone()));

        // Placeholder + `gap` space tokens + the following word.
        let consumed = gap + 2;
        Ok(TokenAction::ReplaceRange(consumed, replacement))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rules::token::{SpaceKind, WordMeta, WordToken};
    use std::borrow::Cow;

    /// Builds a borrowed word token from `text`.
    fn word_tok(text: &str) -> Token<'_> {
        let chars = text.chars().collect::<Vec<char>>();
        let meta = WordMeta::from_chars(&chars);
        let word = WordToken {
            text: Cow::Borrowed(text),
            chars,
            meta,
        };
        Token::Word(word)
    }

    /// Applies the rule at index 0 with a fresh encoder state and unwraps the result.
    fn run_rule<'a>(tokens: &[Token<'a>]) -> TokenAction<'a> {
        let mut state = EncoderState::new(false);
        Rule73AppendixPlaceholderRule
            .apply(tokens, 0, &mut state)
            .expect("ok")
    }

    #[test]
    fn placeholder_followed_by_end_returns_noop() {
        // Only a space follows the placeholder, so there is no word to pair with.
        let tokens = vec![word_tok("\u{F000}"), Token::Space(SpaceKind::Regular)];
        assert!(matches!(run_rule(&tokens), TokenAction::Noop));
    }

    #[test]
    fn placeholder_with_extra_chars_pushes_rest() {
        let tokens = vec![
            word_tok("\u{F000}A"),
            Token::Space(SpaceKind::Regular),
            word_tok("은/는"),
        ];
        let replacement = match run_rule(&tokens) {
            TokenAction::ReplaceRange(_, replacement) => replacement,
            _ => panic!("expected ReplaceRange"),
        };
        assert!(replacement.len() >= 3);
        // The characters after the marker must survive as their own word.
        let has_rest_word = replacement
            .iter()
            .any(|t| matches!(t, Token::Word(w) if w.text == "A"));
        assert!(has_rest_word);
    }

    #[test]
    fn non_placeholder_word_returns_noop() {
        let tokens = vec![word_tok("hello")];
        assert!(matches!(run_rule(&tokens), TokenAction::Noop));
    }

    #[test]
    fn non_word_token_returns_noop() {
        let tokens = vec![Token::PreEncoded(vec![1, 2, 3])];
        assert!(matches!(run_rule(&tokens), TokenAction::Noop));
    }

    #[test]
    fn placeholder_next_word_not_eunneun_returns_noop() {
        let tokens = vec![
            word_tok("\u{F000}"),
            Token::Space(SpaceKind::Regular),
            word_tok("xyz"),
        ];
        assert!(matches!(run_rule(&tokens), TokenAction::Noop));
    }
}