use crate::char_struct::{CharType, KoreanChar};
use crate::english_logic;
use crate::fraction;
use crate::rules::context::{EncoderState, RuleContext};
use crate::rules::engine::RuleEngine;
use crate::rules::korean::rule_69::parse_numeric_ascii_unit_prefix;
use crate::rules::traits::Phase;
use super::token::{DocumentIR, ModeEvent, SpaceKind, Token, WordMeta, WordToken};
/// Serializes every token of `ir` into a single byte vector.
///
/// Per token kind:
/// - `Token::Word` is delegated to `emit_word`, which also receives the full
///   token list so it can look at neighbouring words.
/// - `Token::Space(SpaceKind::Regular)` becomes a single `0` byte.
/// - `Token::Mode` is delegated to `emit_mode_event`.
/// - `Token::Fraction` is encoded with `fraction::encode_mixed_fraction` when
///   a whole part is present, otherwise with `fraction::encode_fraction`;
///   afterwards `ir.state.is_number` is set.
/// - `Token::PreEncoded` bytes are appended unchanged.
///
/// If `ir.state.triple_big_english` is still set once all tokens have been
/// processed, the bytes `32, 4` are appended.
///
/// # Errors
///
/// Returns the first error produced by `emit_word` or by the fraction
/// encoders.
pub fn emit(ir: &mut DocumentIR, char_engine: &mut RuleEngine) -> Result<Vec<u8>, String> {
    let mut out = Vec::new();

    for tok in &ir.tokens {
        match tok {
            Token::Space(SpaceKind::Regular) => out.push(0),
            Token::PreEncoded(bytes) => out.extend(bytes),
            Token::Mode(event) => emit_mode_event(*event, &mut ir.state, &mut out),
            Token::Word(word) => {
                emit_word(word, &mut ir.state, char_engine, &ir.tokens, &mut out)?
            }
            Token::Fraction(frac) => {
                match &frac.whole {
                    Some(whole) => out.extend(fraction::encode_mixed_fraction(
                        whole,
                        &frac.numerator,
                        &frac.denominator,
                    )?),
                    None => out.extend(fraction::encode_fraction(
                        &frac.numerator,
                        &frac.denominator,
                    )?),
                }
                ir.state.is_number = true;
            }
        }
    }

    // An uppercase passage that was never explicitly closed is closed here.
    if ir.state.triple_big_english {
        out.extend([32, 4]);
    }

    Ok(out)
}
/// Appends the indicator bytes for a mode `event` to `result` and applies the
/// matching change to `state`.
///
/// | event                  | bytes        | state change                              |
/// |------------------------|--------------|-------------------------------------------|
/// | `EnterEnglish`         | `52`         | English on, pending continuation cleared  |
/// | `EnterEnglishContinue` | `48`         | English on, pending continuation cleared  |
/// | `CapsWord`             | `32, 32`     | none                                      |
/// | `CapsPassageStart`     | `32, 32, 32` | `triple_big_english = true`               |
/// | `CapsPassageEnd`       | `32, 4`      | `triple_big_english = false`              |
fn emit_mode_event(event: ModeEvent, state: &mut EncoderState, result: &mut Vec<u8>) {
    let indicator: &[u8] = match event {
        ModeEvent::EnterEnglish => &[52],
        ModeEvent::EnterEnglishContinue => &[48],
        ModeEvent::CapsWord => &[32, 32],
        ModeEvent::CapsPassageStart => &[32, 32, 32],
        ModeEvent::CapsPassageEnd => &[32, 4],
    };
    result.extend_from_slice(indicator);

    match event {
        ModeEvent::EnterEnglish | ModeEvent::EnterEnglishContinue => {
            state.is_english = true;
            state.needs_english_continuation = false;
        }
        ModeEvent::CapsWord => {}
        ModeEvent::CapsPassageStart => state.triple_big_english = true,
        ModeEvent::CapsPassageEnd => state.triple_big_english = false,
    }
}
// Every argument maps one-to-one onto a `RuleContext` field, hence the long
// parameter list.
#[allow(clippy::too_many_arguments)]
/// Runs the `Phase::CoreEncoding` rules for the character at `index`.
///
/// All arguments except `engine` are packed into a `RuleContext` that is
/// handed to `RuleEngine::apply_phase`; the engine's result is returned
/// unchanged.
fn apply_core_encoding_rules(
    engine: &mut RuleEngine,
    char_type: &CharType,
    word_chars: &[char],
    index: usize,
    is_all_uppercase: bool,
    has_korean_char: bool,
    ascii_starts_at_beginning: bool,
    state: &mut EncoderState,
    skip_count: &mut usize,
    remaining_words: &[&str],
    prev_word: &str,
    result: &mut Vec<u8>,
) -> Result<crate::rules::traits::RuleResult, String> {
    engine.apply_phase(
        Phase::CoreEncoding,
        &mut RuleContext {
            word_chars,
            index,
            char_type,
            prev_word,
            remaining_words,
            has_korean_char,
            is_all_uppercase,
            ascii_starts_at_beginning,
            skip_count,
            state,
            result,
        },
    )
}
// Every argument maps one-to-one onto a `RuleContext` field, hence the long
// parameter list.
#[allow(clippy::too_many_arguments)]
/// Runs the `Phase::InterCharacter` rules for the character at `index`.
///
/// All arguments except `engine` are packed into a `RuleContext` that is
/// handed to `RuleEngine::apply_phase`. Only success or failure is reported
/// back; the engine's success value is discarded.
fn apply_inter_character_rules(
    engine: &mut RuleEngine,
    char_type: &CharType,
    word_chars: &[char],
    index: usize,
    is_all_uppercase: bool,
    has_korean_char: bool,
    ascii_starts_at_beginning: bool,
    state: &mut EncoderState,
    skip_count: &mut usize,
    remaining_words: &[&str],
    prev_word: &str,
    result: &mut Vec<u8>,
) -> Result<(), String> {
    let mut rule_context = RuleContext {
        word_chars,
        index,
        char_type,
        prev_word,
        remaining_words,
        has_korean_char,
        is_all_uppercase,
        ascii_starts_at_beginning,
        skip_count,
        state,
        result,
    };
    engine
        .apply_phase(Phase::InterCharacter, &mut rule_context)
        .map(|_| ())
}
/// Marks English mode as inactive and records whether a later re-entry into
/// English should be treated as a continuation (`needs_continuation`).
///
/// No bytes are written here; callers emit any indicator themselves.
fn exit_english(state: &mut EncoderState, needs_continuation: bool) {
    state.needs_english_continuation = needs_continuation;
    state.is_english = false;
}
/// Switches the encoder into English mode and writes the entry indicator.
///
/// Byte `48` is written when a continuation was pending
/// (`state.needs_english_continuation`), otherwise byte `52`. The pending
/// continuation flag is cleared either way.
fn enter_english(state: &mut EncoderState, result: &mut Vec<u8>) {
    let indicator = if state.needs_english_continuation {
        48
    } else {
        52
    };
    result.push(indicator);
    state.needs_english_continuation = false;
    state.is_english = true;
}
/// Finds the textual neighbourhood of `word` inside `all_tokens`.
///
/// Returns `(prev_word, remaining_words)` where `prev_word` is the text of the
/// closest `Token::Word` before `word` (or `""` if there is none) and
/// `remaining_words` holds the texts of every `Token::Word` after it, in
/// order. Non-word tokens are ignored.
///
/// `word` is located by address (`std::ptr::eq`), not by content, so it must
/// be a reference into `all_tokens`. If it is not found, `prev_word` ends up
/// being the last word of the list and `remaining_words` is empty.
fn extract_word_context<'a>(
    word: &WordToken<'a>,
    all_tokens: &'a [Token<'a>],
) -> (&'a str, Vec<&'a str>) {
    let mut word_tokens = all_tokens.iter().filter_map(|token| {
        if let Token::Word(candidate) = token {
            Some(candidate)
        } else {
            None
        }
    });

    // Consume words up to and including the current one, remembering the
    // last word seen before it.
    let mut prev_word: &'a str = "";
    for candidate in word_tokens.by_ref() {
        if std::ptr::eq(candidate, word) {
            break;
        }
        prev_word = candidate.text.as_ref();
    }

    // Whatever is left in the iterator comes after the current word.
    let remaining_words: Vec<&'a str> = word_tokens
        .map(|candidate| candidate.text.as_ref())
        .collect();

    (prev_word, remaining_words)
}
/// Encodes a single word token into `result`, updating the shared encoder
/// `state` along the way.
///
/// The previous word and all following words are looked up in `all_tokens`
/// (via `extract_word_context`) so the rules can inspect them.
///
/// Steps, in order:
/// 1. A word that starts with an ASCII digit and is consumed completely by
///    `parse_numeric_ascii_unit_prefix` is emitted as `crate::encode(numeric)`
///    followed by the unit bytes, and the function returns immediately.
/// 2. If the English indicator is enabled, the encoder is not in English mode
///    and the word starts with an ASCII letter, English mode is entered.
/// 3. Every character (unless skipped through `skip_count`) goes through the
///    `CoreEncoding` phase; a `CharType::Korean` character that is not the
///    last one in the word additionally goes through the `InterCharacter`
///    phase.
/// 4. If English mode is still active after the word, the following word
///    decides whether byte `50` is emitted and whether English mode is left
///    with or without a pending continuation.
///
/// # Errors
///
/// Propagates errors from `crate::encode`, `CharType::new` (for characters of
/// this word) and the rule engine.
fn emit_word(
word: &WordToken,
state: &mut EncoderState,
char_engine: &mut RuleEngine,
all_tokens: &[Token],
result: &mut Vec<u8>,
) -> Result<(), String> {
// Text of the preceding word ("" if none) and of every following word.
let (prev_word, remaining_words_vec) = extract_word_context(word, all_tokens);
let remaining_words = remaining_words_vec.as_slice();
let word_text = word.text.as_ref();
let word_chars: Vec<char> = word_text.chars().collect();
let word_len = word_chars.len();
// An empty word skips character processing and goes straight to the
// end-of-word English handling below.
if word_len > 0 {
let meta = WordMeta::from_chars(&word_chars);
let is_all_uppercase = meta.is_all_uppercase;
let has_korean_char = meta.has_korean;
let has_ascii_alphabetic = meta.has_ascii_alphabetic;
// Fast path: the whole word is "<number><unit>" as recognised by
// `parse_numeric_ascii_unit_prefix`.
// NOTE(review): this early return bypasses the `has_processed_word`
// update and the English-mode handling at the end of the function —
// confirm that is intended.
if word_chars.first().is_some_and(|ch| ch.is_ascii_digit())
&& let Some((numeric, unit, consumed)) = parse_numeric_ascii_unit_prefix(&word_chars)
&& consumed == word_chars.len()
{
let mut encoded = crate::encode(&numeric)?;
encoded.extend(unit);
result.extend(encoded);
return Ok(());
}
// Enter English mode up front when the word begins with an ASCII letter.
if state.english_indicator
&& !state.is_english
&& has_ascii_alphabetic
&& word_chars[0].is_ascii_alphabetic()
{
enter_english(state, result);
}
let first_ascii_index = word_chars.iter().position(|c| c.is_ascii_alphabetic());
let ascii_starts_at_beginning = matches!(first_ascii_index, Some(0));
// Word-local flags. They are fed into the scratch states below instead of
// `state.is_number` / `state.is_big_english`, and are never written back
// to `state`.
let mut is_number = false;
let mut is_big_english = false;
// Number of upcoming characters to skip; handed to the rules as
// `&mut skip_count` (presumably set by rules that consume several
// characters at once).
let mut skip_count = 0usize;
for (i, c) in word_chars.iter().enumerate() {
if skip_count > 0 {
skip_count -= 1;
continue;
}
let char_type = CharType::new(*c)?;
// While in English mode, decide per character whether to stay in it.
if state.english_indicator && state.is_english {
match &char_type {
CharType::English(_) => {}
CharType::Number(_) => {
// A digit leaves English mode without emitting a byte and requests a
// continuation for the next entry into English.
exit_english(state, true);
}
CharType::Symbol(sym) => {
if english_logic::should_render_symbol_as_english(
state.english_indicator,
state.is_english,
&state.parenthesis_stack,
*sym,
&word_chars,
i,
remaining_words,
) || english_logic::should_keep_english_mode_for_symbol(
*sym,
&word_chars,
i,
remaining_words,
) {
// The symbol is handled inside English mode: nothing to do here.
} else if english_logic::should_force_terminator_before_symbol(*sym)
|| !english_logic::should_skip_terminator_for_symbol(*sym)
{
// Emit byte 50 (presumably the English terminator indicator — confirm)
// and leave English mode without a pending continuation.
result.push(50);
exit_english(state, false);
} else {
// Leave English mode silently; the symbol decides whether a continuation
// is requested.
exit_english(state, english_logic::should_request_continuation(*sym));
}
}
_ => {
// Any other character type ends English mode with byte 50.
result.push(50);
exit_english(state, false);
}
}
}
// A Korean character cancels any pending English continuation.
match &char_type {
CharType::Korean(_) | CharType::KoreanPart(_) => {
state.needs_english_continuation = false;
}
CharType::Number(_) => {}
_ => {}
}
// Scratch copy of the state for the core rules: identical to `state`
// except that `is_number` / `is_big_english` come from the word-local
// flags.
let mut core_state = EncoderState {
mode_stack: state.mode_stack.clone(),
is_english: state.is_english,
english_indicator: state.english_indicator,
triple_big_english: state.triple_big_english,
has_processed_word: state.has_processed_word,
needs_english_continuation: state.needs_english_continuation,
parenthesis_stack: state.parenthesis_stack.clone(),
is_number,
is_big_english,
};
apply_core_encoding_rules(
char_engine,
&char_type,
&word_chars,
i,
is_all_uppercase,
has_korean_char,
ascii_starts_at_beginning,
&mut core_state,
&mut skip_count,
remaining_words,
prev_word,
result,
)?;
// Copy the rule results back. `english_indicator` is not copied back, and
// the number / uppercase flags go to the word-local variables.
state.is_english = core_state.is_english;
state.triple_big_english = core_state.triple_big_english;
state.has_processed_word = core_state.has_processed_word;
state.needs_english_continuation = core_state.needs_english_continuation;
state.parenthesis_stack = core_state.parenthesis_stack;
state.mode_stack = core_state.mode_stack;
is_number = core_state.is_number;
is_big_english = core_state.is_big_english;
// Inter-character rules run only for a Korean character that is followed
// by at least one more character in the same word.
if let CharType::Korean(ref korean) = char_type
&& i < word_len - 1
{
// Rebuild an equivalent `CharType::Korean` from the same cho/jung/jong
// fields to pass to the inter-character rules.
let recon_type = CharType::Korean(KoreanChar {
cho: korean.cho,
jung: korean.jung,
jong: korean.jong,
});
// Same scratch-state approach as for the core rules above.
let mut inter_state = EncoderState {
mode_stack: state.mode_stack.clone(),
is_english: state.is_english,
english_indicator: state.english_indicator,
triple_big_english: state.triple_big_english,
has_processed_word: state.has_processed_word,
needs_english_continuation: state.needs_english_continuation,
parenthesis_stack: state.parenthesis_stack.clone(),
is_number,
is_big_english,
};
apply_inter_character_rules(
char_engine,
&recon_type,
&word_chars,
i,
is_all_uppercase,
has_korean_char,
ascii_starts_at_beginning,
&mut inter_state,
&mut skip_count,
remaining_words,
prev_word,
result,
)?;
state.is_english = inter_state.is_english;
state.triple_big_english = inter_state.triple_big_english;
state.has_processed_word = inter_state.has_processed_word;
state.needs_english_continuation = inter_state.needs_english_continuation;
state.parenthesis_stack = inter_state.parenthesis_stack;
state.mode_stack = inter_state.mode_stack;
is_number = inter_state.is_number;
is_big_english = inter_state.is_big_english;
}
// Regardless of what the rules set: a non-numeric character ends a number
// run, and an ASCII letter that is not uppercase ends an uppercase run.
if !c.is_numeric() {
is_number = false;
}
if c.is_ascii_alphabetic() && !c.is_uppercase() {
is_big_english = false;
}
}
}
// End-of-word handling while English mode is still active.
if state.english_indicator && state.is_english {
if remaining_words.is_empty() {
// Last word: emit byte 50 and leave English mode.
result.push(50);
exit_english(state, false);
} else if let Some(next_word) = remaining_words.first() {
// Check whether the next word consists of exactly one ASCII letter, has
// no ASCII digit, and otherwise contains only English symbols,
// shortcut symbols or Korean characters.
let ascii_letters = next_word
.chars()
.filter(|c| c.is_ascii_alphabetic())
.collect::<Vec<_>>();
let has_invalid_symbol = next_word.chars().any(|ch| {
!(ch.is_ascii_alphabetic()
|| english_logic::is_english_symbol(ch)
|| crate::symbol_shortcut::is_symbol_char(ch)
|| crate::utils::is_korean_char(ch))
});
let is_single_letter_word = ascii_letters.len() == 1
&& !next_word.chars().any(|ch| ch.is_ascii_digit())
&& !has_invalid_symbol;
if is_single_letter_word
&& english_logic::requires_single_letter_continuation(ascii_letters[0])
{
// Leave English mode silently and request a continuation for that letter.
exit_english(state, true);
} else if let Some(next_char) = next_word.chars().next() {
// Otherwise the first character of the next word decides.
if let Ok(next_type) = CharType::new(next_char) {
match next_type {
// English letters and digits: stay in English mode across the space.
CharType::English(_) | CharType::Number(_) => {}
CharType::Symbol(sym) => {
// NOTE(review): `english_indicator && is_english` was already checked by
// the enclosing `if` and has not changed since — looks redundant.
if state.english_indicator
&& state.is_english
&& english_logic::is_english_symbol(sym)
{
// English symbol: stay in English mode.
} else if english_logic::should_force_terminator_before_symbol(sym)
|| !english_logic::should_skip_terminator_for_symbol(sym)
{
result.push(50);
exit_english(state, false);
} else {
exit_english(
state,
english_logic::should_request_continuation(sym),
);
}
}
_ => {
result.push(50);
exit_english(state, false);
}
}
} else {
// The next character cannot be classified: emit byte 50 and leave
// English mode.
result.push(50);
exit_english(state, false);
}
}
}
}
// Record that at least one word has been emitted.
if !state.has_processed_word {
state.has_processed_word = true;
}
Ok(())
}
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use crate::encode;
    use crate::rules::korean::rule_1::Rule1;
    use crate::utils;

    use super::*;

    /// Returns `true` when any non-empty, space-separated word of `text`
    /// contains a Korean character. Used as the `english_indicator` argument
    /// of `DocumentIR::parse` in the round-trip helper.
    fn english_indicator(text: &str) -> bool {
        text.split(' ')
            .any(|word| !word.is_empty() && word.chars().any(utils::is_korean_char))
    }

    /// Builds the character-level rule engine used by these tests.
    /// The registration order is kept exactly as listed.
    fn make_char_engine() -> RuleEngine {
        use crate::rules::korean as kr;

        let mut engine = RuleEngine::new();
        engine.register(Box::new(kr::rule_53::Rule53));
        engine.register(Box::new(kr::rule_18::Rule18));
        engine.register(Box::new(kr::rule_29::Rule29));
        engine.register(Box::new(kr::rule_44::Rule44));
        engine.register(Box::new(kr::rule_16::Rule16));
        engine.register(Box::new(kr::rule_14::Rule14));
        engine.register(Box::new(kr::rule_13::Rule13));
        engine.register(Box::new(kr::rule_korean::RuleKorean));
        engine.register(Box::new(kr::rule_28::Rule28));
        engine.register(Box::new(kr::rule_40::Rule40));
        engine.register(Box::new(kr::rule_8::Rule8));
        engine.register(Box::new(Rule1));
        engine.register(Box::new(kr::rule_2::Rule2));
        engine.register(Box::new(kr::rule_3::Rule3));
        engine.register(Box::new(kr::rule_english_symbol::RuleEnglishSymbol));
        engine.register(Box::new(kr::rule_61::Rule61));
        engine.register(Box::new(kr::rule_41::Rule41));
        engine.register(Box::new(kr::rule_56::Rule56));
        engine.register(Box::new(kr::rule_57::Rule57));
        engine.register(Box::new(kr::rule_58::Rule58));
        engine.register(Box::new(kr::rule_60::Rule60));
        engine.register(Box::new(kr::rule_49::Rule49));
        engine.register(Box::new(kr::rule_space::RuleSpace));
        engine.register(Box::new(kr::rule_math::RuleMath));
        engine.register(Box::new(kr::rule_fraction::RuleFraction));
        engine.register(Box::new(kr::rule_11::Rule11));
        engine.register(Box::new(kr::rule_12::Rule12));
        engine
    }

    /// Builds the token-level rule engine used by these tests.
    /// The registration order is kept exactly as listed.
    fn make_token_engine() -> crate::rules::token_engine::TokenRuleEngine {
        use crate::rules::token_rules as tr;

        let mut engine = crate::rules::token_engine::TokenRuleEngine::new();
        engine.register(Box::new(tr::normalize::NormalizeEllipsis));
        engine.register(Box::new(tr::emphasis_ring::EmphasisRingRule));
        engine.register(Box::new(tr::latex_fraction::LatexFractionRule));
        engine.register(Box::new(tr::inline_fraction::InlineFractionRule));
        engine.register(Box::new(tr::word_shortcut::WordShortcutRule));
        engine.register(Box::new(tr::uppercase_passage::UppercasePassageRule));
        engine.register(Box::new(tr::middle_dot_spacing::MiddleDotSpacingRule));
        engine.register(Box::new(tr::quote_attachment::QuoteAttachmentRule));
        engine.register(Box::new(tr::spacing::AsteriskSpacingRule));
        engine
    }

    /// Parses `text`, applies the token rules, emits the IR and asserts that
    /// the bytes equal what `crate::encode` produces for the same text.
    ///
    /// The encoder state is snapshotted before the token rules run and
    /// restored afterwards, so `emit` starts from the freshly parsed state.
    fn assert_round_trip(text: &str) {
        let mut char_engine = make_char_engine();
        let mut token_engine = make_token_engine();

        let mut ir = DocumentIR::parse(text, english_indicator(text));
        let parsed_state = ir.state.clone();
        token_engine
            .apply_all(&mut ir.tokens, &mut ir.state)
            .unwrap();
        ir.state = parsed_state;

        let emitted = emit(&mut ir, &mut char_engine).unwrap();
        let expected = encode(text).unwrap();
        assert_eq!(
            emitted, expected,
            "round-trip mismatch for {:?}\n emit: {:?}\n encode: {:?}",
            text, emitted, expected
        );
    }

    #[test]
    fn emit_round_trip_korean() {
        assert_round_trip("안녕하세요");
    }

    #[test]
    fn emit_round_trip_english_words() {
        assert_round_trip("hello world");
    }

    #[test]
    fn mode_events_emit_expected_bytes() {
        let events = [
            ModeEvent::EnterEnglish,
            ModeEvent::EnterEnglishContinue,
            ModeEvent::CapsWord,
            ModeEvent::CapsPassageStart,
            ModeEvent::CapsPassageEnd,
        ];
        let mut ir = DocumentIR {
            tokens: events.into_iter().map(Token::Mode).collect(),
            state: EncoderState::new(false),
        };
        let mut engine = make_char_engine();

        let out = emit(&mut ir, &mut engine).unwrap();

        assert_eq!(out, vec![52, 48, 32, 32, 32, 32, 32, 32, 4]);
    }

    #[test]
    fn fraction_token_encodes() {
        use super::super::token::FractionToken;

        let simple = FractionToken {
            whole: None,
            numerator: String::from("1"),
            denominator: String::from("2"),
        };
        let mixed = FractionToken {
            whole: Some(String::from("3")),
            numerator: String::from("1"),
            denominator: String::from("4"),
        };
        let mut ir = DocumentIR {
            tokens: vec![
                Token::Fraction(simple),
                Token::Space(SpaceKind::Regular),
                Token::Fraction(mixed),
            ],
            state: EncoderState::new(false),
        };
        let mut engine = make_char_engine();

        let out = emit(&mut ir, &mut engine).unwrap();

        let mut expected = fraction::encode_fraction("1", "2").unwrap();
        expected.push(0);
        expected.extend(fraction::encode_mixed_fraction("3", "1", "4").unwrap());
        assert_eq!(out, expected);
    }

    #[test]
    fn extract_context_uses_prev_and_remaining_words() {
        let tokens: Vec<_> = ["A", "B", "C"]
            .into_iter()
            .map(|text| {
                let chars: Vec<char> = text.chars().collect();
                let meta = WordMeta::from_chars(&chars);
                Token::Word(WordToken {
                    text: Cow::Borrowed(text),
                    chars,
                    meta,
                })
            })
            .collect();
        let Token::Word(target) = &tokens[1] else {
            panic!("expected word");
        };

        let (prev, rem) = extract_word_context(target, &tokens);

        assert_eq!(prev, "A");
        assert_eq!(rem, vec!["C"]);
    }

    #[test]
    fn emit_round_trip_triple_uppercase() {
        assert_round_trip("WELCOME TO KOREA");
    }

    #[test]
    fn emit_round_trip_english_indicator_with_korean() {
        for text in ["SNS에서", "ATM 기기", "BMI(지수)"] {
            assert_round_trip(text);
        }
    }

    #[test]
    fn emit_round_trip_mixed_uppercase_word() {
        for text in ["ATM", "Contents", "Table of Contents"] {
            assert_round_trip(text);
        }
    }

    #[test]
    fn emit_round_trip_numbers() {
        for text in ["1,000", "0.48"] {
            assert_round_trip(text);
        }
    }

    #[test]
    fn emit_round_trip_multi_word_korean() {
        assert_round_trip("상상이상의 ");
    }

    #[test]
    fn emit_round_trip_korean_with_newline() {
        assert_round_trip("안녕\n반가워");
    }

    #[test]
    fn emit_round_trip_word_shortcut() {
        for text in ["그래서", "그러나"] {
            assert_round_trip(text);
        }
    }

    #[test]
    fn emit_round_trip_latex_fraction() {
        assert_round_trip("$\\frac{1}{2}$");
    }

    #[test]
    fn emit_round_trip_math_symbols() {
        assert_round_trip("나루 + 배 = 나룻배");
    }

    #[test]
    fn emit_round_trip_phone_number() {
        assert_round_trip("02-2669-9775~6");
    }

    #[test]
    fn emit_round_trip_parenthesized_english() {
        for text in ["지수(BMI)", "체질량 지수(BMI)"] {
            assert_round_trip(text);
        }
    }

    #[test]
    fn emit_round_trip_standalone_jamo() {
        assert_round_trip("삼각형 ㄱㄴㄷ");
    }

    #[test]
    fn emit_round_trip_kg_parenthesized() {
        for text in ["(kg)", "kg"] {
            assert_round_trip(text);
        }
    }

    #[test]
    fn emit_round_trip_roma_bracket() {
        assert_round_trip("Roma [ㄹㄹ로마]");
    }
}