use crate::char_struct::{CharType, KoreanChar};
use crate::english_logic;
use crate::fraction;
use crate::rules::context::{EncoderState, RuleContext};
use crate::rules::engine::RuleEngine;
use crate::rules::korean::rule_69::parse_numeric_ascii_unit_prefix;
use crate::rules::traits::Phase;
use super::token::{DocumentIR, ModeEvent, SpaceKind, Token, WordToken};
/// Byte sequence of the pre-encoded token that opens a Hangul wrap. `emit` switches
/// English mode off when it meets a `Token::PreEncoded` holding exactly these bytes.
pub(crate) const HANGUL_WRAP_START_BYTES: [u8; 2] = [56, 55];
/// Byte sequence of the pre-encoded token that closes a Hangul wrap. `emit` switches
/// English mode back on when it meets a `Token::PreEncoded` holding exactly these bytes.
pub(crate) const HANGUL_WRAP_END_BYTES: [u8; 2] = [56, 62];
/// Neighboring-word information handed to `emit_word` for a single word token.
struct WordContext<'a> {
/// Text of the word token immediately before the current one, or `""` when there is none.
prev_word: &'a str,
/// Texts of all word tokens that come after the current one, in stream order.
remaining_words: &'a [&'a str],
}
/// Returns `true` when `token` is a pre-encoded token whose bytes equal
/// `HANGUL_WRAP_START_BYTES`.
fn is_hangul_wrap_start(token: &Token<'_>) -> bool {
    match token {
        Token::PreEncoded(bytes) => bytes.as_slice() == HANGUL_WRAP_START_BYTES,
        _ => false,
    }
}
/// Returns `true` when `token` is a pre-encoded token whose bytes equal
/// `HANGUL_WRAP_END_BYTES`.
fn is_hangul_wrap_end(token: &Token<'_>) -> bool {
    match token {
        Token::PreEncoded(bytes) => bytes.as_slice() == HANGUL_WRAP_END_BYTES,
        _ => false,
    }
}
fn next_non_space_is_hangul_wrap_start<'a>(tokens: &'a [Token<'a>], after_index: usize) -> bool {
for token in tokens.iter().skip(after_index + 1) {
match token {
Token::Space(_) => continue,
t => return is_hangul_wrap_start(t),
}
}
false
}
fn prev_non_space_is_hangul_wrap_end<'a>(tokens: &'a [Token<'a>], before_index: usize) -> bool {
for token in tokens[..before_index].iter().rev() {
match token {
Token::Space(_) => continue,
t => return is_hangul_wrap_end(t),
}
}
false
}
/// Returns `true` for characters that mark a word as part of a math expression.
///
/// Accepted characters are ASCII letters, subscript digits (U+2080..=U+2089),
/// superscript characters (U+00B9, U+00B2, U+00B3 and U+2070..=U+2079), the symbols
/// `∇`, `∂`, `∞`, `∫`, and the Greek ranges `α..=ω` and `Α..=Ω`.
fn is_math_context_char(c: char) -> bool {
    c.is_ascii_alphabetic()
        || ('\u{2080}'..='\u{2089}').contains(&c)
        // Superscript 1, 2 and 3 live in Latin-1 Supplement, not in the U+2070 block,
        // so they have to be listed individually. U+00B9 was previously missing, which
        // made `¹` the only superscript digit that was not recognized.
        || matches!(c, '\u{00B9}' | '\u{00B2}' | '\u{00B3}')
        || ('\u{2070}'..='\u{2079}').contains(&c)
        || matches!(c, '∇' | '∂' | '∞' | '∫')
        || ('α'..='ω').contains(&c)
        || ('Α'..='Ω').contains(&c)
}
/// Decides whether `token` counts as a math operand.
///
/// * A word token qualifies when it contains no Korean (per its `meta`) and at least one
///   of its characters is a math-context character, `(`, `)` or `/`.
/// * Every pre-encoded token qualifies.
/// * Anything else, including `None`, does not.
fn token_is_math_word(token: Option<&Token<'_>>) -> bool {
    match token {
        Some(Token::Word(word)) => {
            if word.meta.has_korean {
                return false;
            }
            word.chars
                .iter()
                .any(|&ch| matches!(ch, '(' | ')' | '/') || is_math_context_char(ch))
        }
        Some(Token::PreEncoded(_)) => true,
        _ => false,
    }
}
/// Reports whether the space token at `space_idx` should be dropped because it sits
/// between a math operand and a relation operator (in either order).
///
/// The space must have a token on both sides; a space at the very start or very end of
/// `tokens` is never suppressed.
fn is_math_operator_space_suppression<'a>(tokens: &'a [Token<'a>], space_idx: usize) -> bool {
    /// A relation operator is either a word of at most two characters made up solely of
    /// `=`, `<`, `>`, `≠`, `≤`, `≥`, or a pre-encoded token with one of the listed byte
    /// sequences.
    fn is_relation_token(token: Option<&Token<'_>>) -> bool {
        let Some(tok) = token else {
            return false;
        };
        match tok {
            Token::Word(word) => {
                let only_relation_chars = word.chars.iter().all(|&ch| {
                    matches!(ch, '=' | '<' | '>' | '\u{2260}' | '\u{2264}' | '\u{2265}')
                });
                word.chars.len() <= 2 && only_relation_chars
            }
            Token::PreEncoded(bytes) => matches!(
                bytes.as_slice(),
                [18, 18] | [40, 18, 18] | [16, 16] | [16, 18] | [18, 16]
            ),
            _ => false,
        }
    }

    // Both directions need a neighbor on each side of the space.
    if space_idx == 0 || space_idx + 1 >= tokens.len() {
        return false;
    }
    let before = tokens.get(space_idx - 1);
    let after = tokens.get(space_idx + 1);

    let operand_then_operator = is_relation_token(after) && token_is_math_word(before);
    let operator_then_operand = is_relation_token(before) && token_is_math_word(after);
    operand_then_operator || operator_then_operand
}
/// Encodes every token of `ir` in order and returns the resulting byte stream.
///
/// Word tokens go through `emit_word`, spaces become a `0` byte unless they are suppressed
/// around a math relation operator, mode events and fractions are encoded by their
/// dedicated helpers, and pre-encoded tokens are copied through verbatim. `ir.state` is
/// updated as tokens are processed.
///
/// If `ir.state.triple_big_english` is still set after the last token, the bytes
/// `32, 4` (the same pair `ModeEvent::CapsPassageEnd` emits) are appended.
///
/// # Errors
///
/// Propagates errors from `emit_word` and from the fraction encoders.
pub fn emit(ir: &mut DocumentIR, char_engine: &mut RuleEngine) -> Result<Vec<u8>, String> {
    let mut out = Vec::new();

    // Word texts are gathered only when the stream holds more than one token; a lone
    // token is encoded with an empty context.
    let texts = match ir.tokens.len() {
        0 | 1 => Vec::new(),
        _ => collect_word_texts(&ir.tokens),
    };
    let mut words_seen = 0usize;

    for (position, token) in ir.tokens.iter().enumerate() {
        match token {
            Token::Word(word) => {
                let context = if texts.is_empty() {
                    WordContext {
                        prev_word: "",
                        remaining_words: &[],
                    }
                } else {
                    word_context(&texts, words_seen)
                };
                emit_word(
                    word,
                    position,
                    &mut ir.state,
                    char_engine,
                    &ir.tokens,
                    context,
                    &mut out,
                )?;
                words_seen += 1;
            }
            Token::Space(SpaceKind::Regular) => {
                if is_math_operator_space_suppression(&ir.tokens, position) {
                    continue;
                }
                out.push(0);
            }
            Token::Mode(event) => emit_mode_event(*event, &mut ir.state, &mut out),
            Token::Fraction(frac) => {
                match &frac.whole {
                    Some(w) => out.extend(fraction::encode_mixed_fraction(
                        w,
                        &frac.numerator,
                        &frac.denominator,
                    )?),
                    None => out.extend(fraction::encode_fraction(
                        &frac.numerator,
                        &frac.denominator,
                    )?),
                }
                ir.state.is_number = true;
            }
            Token::PreEncoded(bytes) => {
                let raw = bytes.as_slice();
                let state = &mut ir.state;
                // Hangul wrap markers toggle English mode around the wrapped text.
                if raw == HANGUL_WRAP_START_BYTES {
                    state.is_english = false;
                    state.needs_english_continuation = false;
                    state.roman_number_chain = false;
                } else if raw == HANGUL_WRAP_END_BYTES {
                    state.is_english = true;
                    state.needs_english_continuation = false;
                }
                out.extend_from_slice(raw);
            }
        }
    }

    // Safety net: close a capitals passage that was never explicitly ended.
    if ir.state.triple_big_english {
        out.extend_from_slice(&[32, 4]);
    }
    Ok(out)
}
/// Gathers the text of every `Token::Word` in `tokens`, preserving stream order.
///
/// Non-word tokens are skipped, so index `n` of the result is the `n`-th word of the
/// stream.
fn collect_word_texts<'tokens, 'source>(tokens: &'tokens [Token<'source>]) -> Vec<&'tokens str> {
    let mut texts: Vec<&'tokens str> = Vec::with_capacity(tokens.len().div_ceil(2));
    texts.extend(tokens.iter().filter_map(|token| match token {
        Token::Word(word) => Some(word.text.as_ref()),
        _ => None,
    }));
    texts
}
/// Builds the `WordContext` for the word at `word_index` within `word_texts`.
///
/// `prev_word` is the preceding entry (or `""` for the first word) and
/// `remaining_words` is everything after `word_index`.
///
/// # Panics
///
/// Panics if `word_index` is not a valid index into `word_texts`.
fn word_context<'a>(word_texts: &'a [&'a str], word_index: usize) -> WordContext<'a> {
    let prev_word = match word_index {
        0 => "",
        n => word_texts[n - 1],
    };
    WordContext {
        prev_word,
        remaining_words: &word_texts[word_index + 1..],
    }
}
/// Appends the indicator bytes for `event` to `result` and applies the matching change
/// to `state`.
///
/// | event                  | bytes        | state change                         |
/// |------------------------|--------------|--------------------------------------|
/// | `EnterEnglish`         | `52`         | English mode on, chain flags cleared |
/// | `EnterEnglishContinue` | `48`         | English mode on, chain flags cleared |
/// | `CapsWord`             | `32, 32`     | none                                 |
/// | `Grade1Indicator`      | `48`         | none                                 |
/// | `CapsPassageStart`     | `32, 32, 32` | `triple_big_english = true`          |
/// | `CapsPassageEnd`       | `32, 4`      | `triple_big_english = false`         |
fn emit_mode_event(event: ModeEvent, state: &mut EncoderState, result: &mut Vec<u8>) {
    let bytes: &[u8] = match event {
        ModeEvent::EnterEnglish => &[52],
        ModeEvent::EnterEnglishContinue | ModeEvent::Grade1Indicator => &[48],
        ModeEvent::CapsWord => &[32, 32],
        ModeEvent::CapsPassageStart => &[32, 32, 32],
        ModeEvent::CapsPassageEnd => &[32, 4],
    };
    result.extend_from_slice(bytes);

    match event {
        ModeEvent::EnterEnglish | ModeEvent::EnterEnglishContinue => {
            state.is_english = true;
            state.needs_english_continuation = false;
            state.roman_number_chain = false;
        }
        ModeEvent::CapsPassageStart => state.triple_big_english = true,
        ModeEvent::CapsPassageEnd => state.triple_big_english = false,
        ModeEvent::CapsWord | ModeEvent::Grade1Indicator => {}
    }
}
/// Runs the `Phase::CoreEncoding` rules of `engine` for the character at `index`.
///
/// All arguments are bundled into a `RuleContext`; rules may write to `result` and
/// mutate `state` and `skip_count` through it. The engine's result is returned as is.
// The argument list mirrors the fields of `RuleContext` one to one.
#[allow(clippy::too_many_arguments)]
fn apply_core_encoding_rules(
    engine: &mut RuleEngine,
    char_type: &CharType,
    word_chars: &[char],
    index: usize,
    is_all_uppercase: bool,
    has_korean_char: bool,
    ascii_starts_at_beginning: bool,
    state: &mut EncoderState,
    skip_count: &mut usize,
    remaining_words: &[&str],
    prev_word: &str,
    result: &mut Vec<u8>,
) -> Result<crate::rules::traits::RuleResult, String> {
    engine.apply_phase(
        Phase::CoreEncoding,
        &mut RuleContext {
            word_chars,
            index,
            char_type,
            prev_word,
            remaining_words,
            has_korean_char,
            is_all_uppercase,
            ascii_starts_at_beginning,
            skip_count,
            state,
            result,
        },
    )
}
/// Runs the `Phase::InterCharacter` rules of `engine` for the character at `index`.
///
/// All arguments are bundled into a `RuleContext`; rules may write to `result` and
/// mutate `state` and `skip_count` through it. The engine's success value is discarded;
/// only errors are reported to the caller.
// The argument list mirrors the fields of `RuleContext` one to one.
#[allow(clippy::too_many_arguments)]
fn apply_inter_character_rules(
    engine: &mut RuleEngine,
    char_type: &CharType,
    word_chars: &[char],
    index: usize,
    is_all_uppercase: bool,
    has_korean_char: bool,
    ascii_starts_at_beginning: bool,
    state: &mut EncoderState,
    skip_count: &mut usize,
    remaining_words: &[&str],
    prev_word: &str,
    result: &mut Vec<u8>,
) -> Result<(), String> {
    let mut rule_ctx = RuleContext {
        word_chars,
        index,
        char_type,
        prev_word,
        remaining_words,
        has_korean_char,
        is_all_uppercase,
        ascii_starts_at_beginning,
        skip_count,
        state,
        result,
    };
    engine
        .apply_phase(Phase::InterCharacter, &mut rule_ctx)
        .map(|_| ())
}
/// Turns English mode off and clears the roman-number chain.
///
/// `needs_continuation` is stored in `state.needs_english_continuation`; `enter_english`
/// reads it to pick the indicator byte the next time English mode is entered.
fn exit_english(state: &mut EncoderState, needs_continuation: bool) {
    state.roman_number_chain = false;
    state.needs_english_continuation = needs_continuation;
    state.is_english = false;
}
/// Emits an English indicator byte and turns English mode on.
///
/// Byte `48` is written when a continuation was requested by the previous exit,
/// otherwise byte `52`. The continuation request and the roman-number chain are cleared.
fn enter_english(state: &mut EncoderState, result: &mut Vec<u8>) {
    let indicator = if state.needs_english_continuation {
        48
    } else {
        52
    };
    result.push(indicator);
    state.roman_number_chain = false;
    state.needs_english_continuation = false;
    state.is_english = true;
}
fn exit_english_for_roman_number_chain(state: &mut EncoderState) {
exit_english(state, false);
state.roman_number_chain = true;
}
/// Switches English mode back on after a roman-number chain and clears both the chain
/// flag and any pending continuation request. No bytes are emitted here.
fn resume_english_from_roman_number_chain(state: &mut EncoderState) {
    state.roman_number_chain = false;
    state.needs_english_continuation = false;
    state.is_english = true;
}
/// Encodes one word token, appending its bytes to `result` and updating `state`.
///
/// The word is processed character by character: each character is classified with
/// `CharType::new`, English-mode transitions are applied, and the `CoreEncoding` phase of
/// `char_engine` is run for it (plus the `InterCharacter` phase after every Korean
/// syllable that is not the word's last character). After the loop the function decides
/// whether English mode stays open for the next word or is closed here.
///
/// * `word` - the token to encode; its `chars` and `meta` drive all decisions.
/// * `token_index` - position of `word` inside `all_tokens`, used to inspect the nearest
///   non-space neighbors.
/// * `state` - encoder state shared across words; mutated in place.
/// * `char_engine` - rule engine that performs the per-character encoding.
/// * `all_tokens` - the full token stream `word` belongs to.
/// * `context` - text of the previous word and texts of the words that follow.
/// * `result` - output buffer that receives the encoded bytes.
///
/// # Errors
///
/// Propagates errors from `crate::encode`, `CharType::new` and the rule engine phases.
fn emit_word(
word: &WordToken,
token_index: usize,
state: &mut EncoderState,
char_engine: &mut RuleEngine,
all_tokens: &[Token],
context: WordContext<'_>,
result: &mut Vec<u8>,
) -> Result<(), String> {
let prev_word = context.prev_word;
let remaining_words = context.remaining_words;
// Hangul wrap markers next to this word influence whether English mode is kept open.
let next_is_hangul_wrap = next_non_space_is_hangul_wrap_start(all_tokens, token_index);
let prev_is_hangul_wrap_end = prev_non_space_is_hangul_wrap_end(all_tokens, token_index);
let word_chars = word.chars.as_slice();
let word_len = word_chars.len();
// An empty word emits nothing here; only the end-of-word handling below runs for it.
if word_len > 0 {
let meta = word.meta;
let is_all_uppercase = meta.is_all_uppercase;
let has_korean_char = meta.has_korean;
let has_ascii_alphabetic = meta.has_ascii_alphabetic;
// Fast path: the word starts with a digit and `parse_numeric_ascii_unit_prefix`
// consumes it entirely. The numeric part goes through `crate::encode`, the unit bytes
// returned by the parser are appended, and the word is finished.
// NOTE(review): this early return skips the English-mode handling and the
// `has_processed_word` update at the end of the function — confirm that is intended.
if word_chars.first().is_some_and(|ch| ch.is_ascii_digit())
&& let Some((numeric, unit, consumed)) = parse_numeric_ascii_unit_prefix(word_chars)
&& consumed == word_chars.len()
{
let mut encoded = crate::encode(&numeric)?;
encoded.extend(unit);
result.extend(encoded);
return Ok(());
}
// The word starts with an ASCII letter while English mode is off: switch it on.
if state.english_indicator
&& !state.is_english
&& has_ascii_alphabetic
&& word_chars[0].is_ascii_alphabetic()
{
if state.roman_number_chain {
// Resuming after a roman-number chain emits no indicator byte.
resume_english_from_roman_number_chain(state);
} else if state.english_dominant_no_indicator {
// In this mode English is entered silently, without any indicator byte.
state.is_english = true;
state.needs_english_continuation = false;
state.roman_number_chain = false;
} else {
// Regular entry: `enter_english` pushes 48 or 52 depending on the pending
// continuation request.
enter_english(state, result);
}
}
let first_ascii_index = word_chars.iter().position(|c| c.is_ascii_alphabetic());
let ascii_starts_at_beginning = matches!(first_ascii_index, Some(0));
// Per-word number / capitals flags; they are copied into `state` before each engine
// call and read back afterwards.
let mut is_number = false;
let mut is_big_english = false;
// Number of upcoming characters to skip; handed to the rule engine by mutable
// reference (presumably raised by rules that consume look-ahead characters).
let mut skip_count = 0usize;
for (i, c) in word_chars.iter().enumerate() {
if skip_count > 0 {
skip_count -= 1;
continue;
}
let char_type = CharType::new(*c)?;
// While English mode is active, decide whether this character keeps it open.
if state.english_indicator && state.is_english {
match &char_type {
CharType::English(_) => {}
CharType::Number(_) => {
// A digit leaves English mode but marks a roman-number chain so that a
// following letter can resume it (handled just below).
exit_english_for_roman_number_chain(state);
}
CharType::Symbol(sym) => {
// Leading symbol right after a Hangul wrap end, followed by an ASCII
// letter or digit according to `english_logic`.
let prev_wrap_eng_continuation = i == 0
&& prev_is_hangul_wrap_end
&& matches!(*sym, '.' | '/' | '@' | '#' | '_' | ':' | '-')
&& english_logic::next_ascii_letter_or_digit(
word_chars,
i,
remaining_words,
);
// Trailing punctuation right before a Hangul wrap start.
let next_wrap_eng_continuation = i == word_chars.len() - 1
&& next_is_hangul_wrap
&& matches!(*sym, '.' | ',' | ':' | ';');
if prev_wrap_eng_continuation
|| next_wrap_eng_continuation
|| english_logic::should_render_symbol_as_english(
state.english_indicator,
state.is_english,
&state.parenthesis_stack,
*sym,
word_chars,
i,
remaining_words,
)
|| english_logic::should_keep_english_mode_for_symbol(
*sym,
word_chars,
i,
remaining_words,
)
{
// Intentionally empty: the symbol is handled inside English mode.
} else if english_logic::should_force_terminator_before_symbol(*sym)
|| !english_logic::should_skip_terminator_for_symbol(*sym)
{
// Close English mode explicitly: byte 50 is written before the symbol.
result.push(50);
exit_english(state, false);
} else {
// Close English mode without writing byte 50; a continuation may be
// requested for the next English run.
exit_english(state, english_logic::should_request_continuation(*sym));
}
}
_ => {
// Any other character type closes English mode with byte 50.
result.push(50);
exit_english(state, false);
}
}
}
// Roman-number chain: English was left because of a digit.
if state.roman_number_chain && !state.is_english {
match &char_type {
CharType::English(_) => {
// A letter after the digits re-enters English mode with byte 48.
result.push(48);
resume_english_from_roman_number_chain(state);
}
CharType::Number(_) => {}
_ => {
// Anything else ends the chain.
state.roman_number_chain = false;
}
}
}
// Korean text cancels a pending English continuation request.
match &char_type {
CharType::Korean(_) | CharType::KoreanPart(_) => {
state.needs_english_continuation = false;
}
CharType::Number(_) => {}
_ => {}
}
// Publish the per-word flags to the rules, run them, then read the flags back.
state.is_number = is_number;
state.is_big_english = is_big_english;
apply_core_encoding_rules(
char_engine,
&char_type,
word_chars,
i,
is_all_uppercase,
has_korean_char,
ascii_starts_at_beginning,
state,
&mut skip_count,
remaining_words,
prev_word,
result,
)?;
is_number = state.is_number;
is_big_english = state.is_big_english;
// Inter-character rules run only after a Korean syllable that is not the last
// character of the word.
if let CharType::Korean(ref korean) = char_type
&& i < word_len - 1
{
// Fresh `CharType::Korean` built from the same cho/jung/jong components.
let recon_type = CharType::Korean(KoreanChar {
cho: korean.cho,
jung: korean.jung,
jong: korean.jong,
});
state.is_number = is_number;
state.is_big_english = is_big_english;
apply_inter_character_rules(
char_engine,
&recon_type,
word_chars,
i,
is_all_uppercase,
has_korean_char,
ascii_starts_at_beginning,
state,
&mut skip_count,
remaining_words,
prev_word,
result,
)?;
is_number = state.is_number;
is_big_english = state.is_big_english;
}
// A non-numeric character ends the number run; a non-uppercase ASCII letter ends
// the capitals run.
if !c.is_numeric() {
is_number = false;
}
if c.is_ascii_alphabetic() && !c.is_uppercase() {
is_big_english = false;
}
}
}
// End of word: decide whether English mode survives into the next token.
if state.english_indicator && state.is_english && next_is_hangul_wrap {
// Intentionally empty: English mode stays open in front of a Hangul wrap start.
} else if state.english_dominant_no_indicator && state.english_indicator && state.is_english {
// Intentionally empty: no terminator is written in english-dominant mode.
} else if state.english_indicator && state.is_english {
if remaining_words.is_empty() {
// Last word of the stream: close English mode with byte 50.
result.push(50);
exit_english(state, false);
} else if let Some(next_word) = remaining_words.first() {
let ascii_letters = next_word
.chars()
.filter(|c| c.is_ascii_alphabetic())
.collect::<Vec<_>>();
// True when the next word holds a character that is none of: ASCII letter,
// English symbol, shortcut symbol, Korean character.
let has_invalid_symbol = next_word.chars().any(|ch| {
!(ch.is_ascii_alphabetic()
|| english_logic::is_english_symbol(ch)
|| crate::symbol_shortcut::is_symbol_char(ch)
|| crate::utils::is_korean_char(ch))
});
// Next word has exactly one ASCII letter, no ASCII digit and no invalid symbol.
let is_single_letter_word = ascii_letters.len() == 1
&& !next_word.chars().any(|ch| ch.is_ascii_digit())
&& !has_invalid_symbol;
if is_single_letter_word
&& english_logic::requires_single_letter_continuation(ascii_letters[0])
{
// Leave English mode silently and request a continuation indicator for the
// upcoming single letter.
exit_english(state, true);
} else if let Some(next_char) = next_word.chars().next() {
// Otherwise the first character of the next word decides.
if let Ok(next_type) = CharType::new(next_char) {
match next_type {
// Letters and digits keep English mode open across the space.
CharType::English(_) | CharType::Number(_) => {}
CharType::Symbol(sym) => {
if state.english_indicator
&& state.is_english
&& english_logic::is_english_symbol(sym)
{
// Intentionally empty: an English symbol keeps the mode open.
} else if english_logic::should_force_terminator_before_symbol(sym)
|| !english_logic::should_skip_terminator_for_symbol(sym)
{
result.push(50);
exit_english(state, false);
} else {
exit_english(
state,
english_logic::should_request_continuation(sym),
);
}
}
_ => {
result.push(50);
exit_english(state, false);
}
}
} else {
// The next character cannot be classified: close English mode with byte 50.
result.push(50);
exit_english(state, false);
}
}
}
}
// Record that at least one word has gone through this function.
if !state.has_processed_word {
state.has_processed_word = true;
}
Ok(())
}
// Unit tests for the token emitter. The round-trip cases compare `emit` (fed with a
// parsed and token-rule-processed `DocumentIR`) against the public `encode` entry point.
#[cfg(test)]
mod tests {
use std::borrow::Cow;
use crate::encode;
use crate::rules::korean::rule_1::Rule1;
use crate::utils;
use super::*;
// Returns true when any non-empty, space-separated word of `text` contains a Korean
// character; the result is passed to `DocumentIR::parse` as its second argument.
fn english_indicator(text: &str) -> bool {
text.split(' ')
.filter(|word| !word.is_empty())
.any(|word| word.chars().any(utils::is_korean_char))
}
// Builds the character-level rule engine used by the tests.
// NOTE(review): registration order is presumably significant and meant to mirror the
// engine used by `encode` — keep both in sync.
fn make_char_engine() -> RuleEngine {
let mut engine = RuleEngine::new();
engine.register(Box::new(crate::rules::korean::rule_53::Rule53));
engine.register(Box::new(crate::rules::korean::rule_18::Rule18));
engine.register(Box::new(crate::rules::korean::rule_29::Rule29));
engine.register(Box::new(crate::rules::korean::rule_44::Rule44));
engine.register(Box::new(crate::rules::korean::rule_16::Rule16));
engine.register(Box::new(crate::rules::korean::rule_14::Rule14));
engine.register(Box::new(crate::rules::korean::rule_13::Rule13));
engine.register(Box::new(crate::rules::korean::rule_korean::RuleKorean));
engine.register(Box::new(crate::rules::korean::rule_28::Rule28));
engine.register(Box::new(crate::rules::korean::rule_40::Rule40));
engine.register(Box::new(crate::rules::korean::rule_8::Rule8));
engine.register(Box::new(Rule1));
engine.register(Box::new(crate::rules::korean::rule_2::Rule2));
engine.register(Box::new(crate::rules::korean::rule_3::Rule3));
engine.register(Box::new(
crate::rules::korean::rule_english_symbol::RuleEnglishSymbol,
));
engine.register(Box::new(crate::rules::korean::rule_61::Rule61));
engine.register(Box::new(crate::rules::korean::rule_41::Rule41));
engine.register(Box::new(crate::rules::korean::rule_56::Rule56));
engine.register(Box::new(crate::rules::korean::rule_57::Rule57));
engine.register(Box::new(crate::rules::korean::rule_58::Rule58));
engine.register(Box::new(crate::rules::korean::rule_60::Rule60));
engine.register(Box::new(crate::rules::korean::rule_49::Rule49));
engine.register(Box::new(crate::rules::korean::rule_space::RuleSpace));
engine.register(Box::new(crate::rules::korean::rule_math::RuleMath));
engine.register(Box::new(crate::rules::korean::rule_fraction::RuleFraction));
engine.register(Box::new(crate::rules::korean::rule_11::Rule11));
engine.register(Box::new(crate::rules::korean::rule_12::Rule12));
engine
}
// Builds the token-level rule engine that rewrites the token stream before emission.
fn make_token_engine() -> crate::rules::token_engine::TokenRuleEngine {
let mut engine = crate::rules::token_engine::TokenRuleEngine::new();
engine.register(Box::new(
crate::rules::token_rules::normalize::NormalizeEllipsis,
));
engine.register(Box::new(
crate::rules::token_rules::emphasis_ring::EmphasisRingRule,
));
engine.register(Box::new(
crate::rules::token_rules::latex_fraction::LatexFractionRule,
));
engine.register(Box::new(
crate::rules::token_rules::inline_fraction::InlineFractionRule,
));
engine.register(Box::new(
crate::rules::token_rules::word_shortcut::WordShortcutRule,
));
engine.register(Box::new(
crate::rules::token_rules::uppercase_passage::UppercasePassageRule,
));
engine.register(Box::new(
crate::rules::token_rules::middle_dot_spacing::MiddleDotSpacingRule,
));
engine.register(Box::new(
crate::rules::token_rules::quote_attachment::QuoteAttachmentRule,
));
engine.register(Box::new(
crate::rules::token_rules::spacing::AsteriskSpacingRule,
));
engine
}
// Parses `text`, applies the token rules, emits the bytes and asserts that they equal
// what `encode(text)` produces.
fn assert_round_trip(text: &str) {
let mut ir = DocumentIR::parse(text, english_indicator(text));
let mut engine = make_char_engine();
let mut token_engine = make_token_engine();
// The token rules receive the state mutably; the state from before they ran is put
// back afterwards so that `emit` starts from the freshly parsed state.
let state_before_token_rules = ir.state.clone();
token_engine
.apply_all(&mut ir.tokens, &mut ir.state)
.unwrap();
ir.state = state_before_token_rules;
let emitted = emit(&mut ir, &mut engine).unwrap();
let expected = encode(text).unwrap();
assert_eq!(
emitted, expected,
"round-trip mismatch for {:?}\n emit: {:?}\n encode: {:?}",
text, emitted, expected
);
}
// Round-trip coverage over Korean, English, mixed-script, numeric and math inputs.
#[rstest::rstest]
#[case::korean_greeting("안녕하세요")]
#[case::english_words("hello world")]
#[case::triple_uppercase_passage("WELCOME TO KOREA")]
#[case::english_indicator_sns("SNS에서")]
#[case::english_indicator_atm("ATM 기기")]
#[case::english_indicator_bmi_paren("BMI(지수)")]
#[case::mixed_upper_atm("ATM")]
#[case::mixed_upper_capitalized("Contents")]
#[case::mixed_upper_title("Table of Contents")]
#[case::number_with_comma("1,000")]
#[case::number_decimal("0.48")]
#[case::multi_word_korean("상상이상의 ")]
#[case::korean_with_newline("안녕\n반가워")]
#[case::word_shortcut_geuraeseo("그래서")]
#[case::word_shortcut_geureona("그러나")]
#[case::latex_fraction_half("$\\frac{1}{2}$")]
#[case::math_symbols_korean_sentence("나루 + 배 = 나룻배")]
#[case::phone_number_range("02-2669-9775~6")]
#[case::parenthesized_english_bmi("지수(BMI)")]
#[case::parenthesized_english_chejilryang_bmi("체질량 지수(BMI)")]
#[case::standalone_jamo("삼각형 ㄱㄴㄷ")]
#[case::kg_parenthesized("(kg)")]
#[case::kg_bare("kg")]
#[case::roma_bracket("Roma [ㄹㄹ로마]")]
fn emit_round_trip(#[case] text: &str) {
assert_round_trip(text);
}
// Every `ModeEvent` variant produces its fixed indicator bytes. Because the passage is
// closed explicitly by `CapsPassageEnd`, no extra closing bytes are appended at the end.
#[test]
fn mode_events_emit_expected_bytes() {
let mut ir = DocumentIR {
tokens: vec![
Token::Mode(ModeEvent::EnterEnglish),
Token::Mode(ModeEvent::EnterEnglishContinue),
Token::Mode(ModeEvent::CapsWord),
Token::Mode(ModeEvent::CapsPassageStart),
Token::Mode(ModeEvent::CapsPassageEnd),
Token::Mode(ModeEvent::Grade1Indicator),
],
state: EncoderState::new(false),
};
let mut engine = make_char_engine();
let out = emit(&mut ir, &mut engine).unwrap();
assert_eq!(out, vec![52, 48, 32, 32, 32, 32, 32, 32, 4, 48]);
}
// Simple and mixed fraction tokens are delegated to the `fraction` encoders, with a
// `0` byte for the space between them.
#[test]
fn fraction_token_encodes() {
let mut ir = DocumentIR {
tokens: vec![
Token::Fraction(super::super::token::FractionToken {
whole: None,
numerator: "1".to_string(),
denominator: "2".to_string(),
}),
Token::Space(SpaceKind::Regular),
Token::Fraction(super::super::token::FractionToken {
whole: Some("3".to_string()),
numerator: "1".to_string(),
denominator: "4".to_string(),
}),
],
state: EncoderState::new(false),
};
let mut engine = make_char_engine();
let out = emit(&mut ir, &mut engine).unwrap();
let mut expected = fraction::encode_fraction("1", "2").unwrap();
expected.push(0);
expected.extend(fraction::encode_mixed_fraction("3", "1", "4").unwrap());
assert_eq!(out, expected);
}
// For the middle word of three, the context holds the first word as `prev_word` and
// only the last word in `remaining_words`.
#[test]
fn extract_context_uses_prev_and_remaining_words() {
let words = ["A", "B", "C"];
let tokens = words
.iter()
.map(|w| {
let chars: Vec<char> = w.chars().collect();
Token::Word(WordToken {
text: Cow::Borrowed(w),
chars: chars.clone(),
meta: super::super::token::WordMeta::from_chars(&chars),
})
})
.collect::<Vec<_>>();
let word_texts = collect_word_texts(&tokens);
let context = word_context(&word_texts, 1);
assert_eq!(context.prev_word, "A");
assert_eq!(context.remaining_words, ["C"]);
}
// `token_is_math_word` rejects `None`, spaces, mode events and Korean words, and
// accepts any pre-encoded token.
#[test]
fn token_is_math_word_returns_false_for_non_word_non_preencoded() {
use super::token_is_math_word;
use crate::rules::token::{ModeEvent, SpaceKind};
assert!(!token_is_math_word(None));
assert!(!token_is_math_word(Some(&Token::Space(SpaceKind::Regular))));
assert!(!token_is_math_word(Some(&Token::Mode(
ModeEvent::EnterEnglish
))));
let chars: Vec<char> = "한국".chars().collect();
let kw = Token::Word(crate::rules::token::WordToken {
text: std::borrow::Cow::Borrowed("한국"),
chars: chars.clone(),
meta: crate::rules::token::WordMeta::from_chars(&chars),
});
assert!(!token_is_math_word(Some(&kw)));
assert!(token_is_math_word(Some(&Token::PreEncoded(vec![1, 2, 3]))));
}
// With `triple_big_english` forced on and an empty input, `emit` output consists solely
// of the closing bytes 32, 4.
#[test]
fn emit_end_of_stream_triple_big_english_safety_net() {
use crate::rules::engine::RuleEngine;
use crate::rules::token::DocumentIR;
let mut ir = DocumentIR::parse("", false);
ir.state.triple_big_english = true;
let mut engine = RuleEngine::new();
let result = emit(&mut ir, &mut engine).unwrap();
assert_eq!(
result,
vec![32, 4],
"expected safety-net close bytes, got {result:?}"
);
}
}