use std::borrow::Cow;
use super::context::EncoderState;
pub struct DocumentIR<'a> {
pub tokens: Vec<Token<'a>>,
pub state: EncoderState,
}
#[derive(Debug, Clone)]
pub enum Token<'a> {
Word(WordToken<'a>),
Space(SpaceKind),
Fraction(FractionToken),
Mode(ModeEvent),
PreEncoded(Vec<u8>),
}
#[derive(Debug, Clone)]
pub struct WordToken<'a> {
pub text: Cow<'a, str>,
pub chars: Vec<char>,
pub meta: WordMeta,
}
#[derive(Debug, Clone, Copy)]
pub struct WordMeta {
pub has_korean: bool,
pub is_all_uppercase: bool,
pub starts_with_ascii: bool,
pub has_ascii_alphabetic: bool,
}
impl WordMeta {
pub fn from_chars(chars: &[char]) -> WordMeta {
let mut has_korean = false;
let mut has_ascii_alphabetic = false;
let mut ascii_letter_count = 0u16;
let mut uppercase_count = 0u16;
for ch in chars {
let code = *ch as u32;
if (0xAC00..=0xD7A3).contains(&code) {
has_korean = true;
}
if ch.is_ascii_alphabetic() {
has_ascii_alphabetic = true;
ascii_letter_count = ascii_letter_count.saturating_add(1);
if ch.is_ascii_uppercase() {
uppercase_count = uppercase_count.saturating_add(1);
}
}
}
let starts_with_ascii = chars.first().is_some_and(char::is_ascii_alphabetic);
let is_all_uppercase = ascii_letter_count >= 2 && ascii_letter_count == uppercase_count;
WordMeta {
has_korean,
is_all_uppercase,
starts_with_ascii,
has_ascii_alphabetic,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SpaceKind {
Regular,
}
#[derive(Debug, Clone)]
pub struct FractionToken {
pub whole: Option<String>,
pub numerator: String,
pub denominator: String,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModeEvent {
EnterEnglish,
EnterEnglishContinue,
CapsWord,
CapsPassageStart,
CapsPassageEnd,
Grade1Indicator,
}
impl<'a> DocumentIR<'a> {
pub fn parse(text: &'a str, english_indicator: bool) -> Self {
let raw_words: Vec<&str> = text.split(' ').filter(|w| !w.is_empty()).collect();
let mut tokens = Vec::new();
let mut i = 0usize;
while i < raw_words.len() {
let mut owned_merged: Option<String> = None;
let mut consumed = 1usize;
let first = raw_words[i];
let mut dollar_count = first.chars().filter(|c| *c == '$').count();
if dollar_count % 2 == 1 {
let mut merged = first.to_string();
let mut j = i + 1;
while j < raw_words.len() {
merged.push(' ');
merged.push_str(raw_words[j]);
dollar_count += raw_words[j].chars().filter(|c| *c == '$').count();
j += 1;
if dollar_count % 2 == 0 {
break;
}
}
consumed = j.saturating_sub(i);
owned_merged = Some(merged);
} else if let Some((merged, merged_count)) = merge_math_span(&raw_words, i) {
consumed = merged_count;
owned_merged = Some(merged);
}
let text_cow = match owned_merged {
Some(merged) => Cow::Owned(merged),
None => Cow::Borrowed(raw_words[i]),
};
let chars: Vec<char> = text_cow.chars().collect();
let meta = WordMeta::from_chars(&chars);
tokens.push(Token::Word(WordToken {
text: text_cow,
chars,
meta,
}));
i += consumed;
if i < raw_words.len() {
tokens.push(Token::Space(SpaceKind::Regular));
}
}
DocumentIR {
tokens,
state: EncoderState::new(english_indicator),
}
}
}
fn is_korean_char(c: char) -> bool {
let code = c as u32;
(0xAC00..=0xD7A3).contains(&code) || (0x3131..=0x3163).contains(&code)
}
fn is_math_span_char(c: char) -> bool {
is_korean_char(c)
|| c.is_ascii_alphanumeric()
|| matches!(
c,
'=' | '+'
| '-'
| '\u{2212}'
| '×'
| '÷'
| '/'
| '√'
| '('
| ')'
| '['
| ']'
| '{'
| '}'
| ','
| '.'
| '!'
| '<'
| '>'
| ':'
| ';'
| '\''
| '"'
)
}
fn has_math_trigger(text: &str) -> bool {
text.chars().any(|c| {
matches!(
c,
'=' | '×' | '÷' | '/' | '√' | '{' | '}' | '[' | ']' | '(' | ')'
)
}) || text.contains("...")
}
fn merge_math_span(raw_words: &[&str], start: usize) -> Option<(String, usize)> {
let mut merged = String::new();
let mut end = start;
let mut paren_balance = 0i32;
let mut square_balance = 0i32;
let mut curly_balance = 0i32;
let mut saw_korean = false;
let mut saw_trigger = false;
let mut best: Option<(String, usize)> = None;
while end < raw_words.len() {
let word = raw_words[end];
#[allow(clippy::redundant_closure)]
let all_span = word.chars().all(|c| is_math_span_char(c));
if !all_span {
break;
}
if !merged.is_empty() {
merged.push(' ');
}
merged.push_str(word);
for ch in word.chars() {
saw_korean |= is_korean_char(ch);
saw_trigger |= matches!(
ch,
'=' | '×' | '÷' | '/' | '√' | '{' | '}' | '[' | ']' | '(' | ')'
);
match ch {
'(' => paren_balance += 1,
')' => paren_balance -= 1,
'[' => square_balance += 1,
']' => square_balance -= 1,
'{' => curly_balance += 1,
'}' => curly_balance -= 1,
_ => {}
}
}
let balanced = paren_balance == 0 && square_balance == 0 && curly_balance == 0;
let multi_word = end > start;
let looks_like_span = saw_trigger || has_math_trigger(&merged);
let is_brace_math = merged.contains('=') && merged.contains('{') && merged.contains('}');
let is_mixed_korean_math = saw_korean
&& merged.contains('=')
&& (merged.contains('×') || merged.contains('√'))
&& !merged.chars().any(|c| c.is_ascii_alphabetic());
if multi_word && balanced && looks_like_span && (is_brace_math || is_mixed_korean_math) {
best = Some((merged.clone(), end + 1 - start));
}
end += 1;
}
best
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn word_meta_korean_only() {
let chars: Vec<char> = "안녕".chars().collect();
let meta = WordMeta::from_chars(&chars);
assert!(meta.has_korean);
assert!(!meta.has_ascii_alphabetic);
assert!(!meta.starts_with_ascii);
assert!(!meta.is_all_uppercase);
}
#[test]
fn word_meta_english_uppercase() {
let chars: Vec<char> = "ATM".chars().collect();
let meta = WordMeta::from_chars(&chars);
assert!(!meta.has_korean);
assert!(meta.has_ascii_alphabetic);
assert!(meta.starts_with_ascii);
assert!(meta.is_all_uppercase);
}
#[test]
fn word_meta_mixed() {
let chars: Vec<char> = "A한b".chars().collect();
let meta = WordMeta::from_chars(&chars);
assert!(meta.has_korean);
assert!(meta.has_ascii_alphabetic);
assert!(meta.starts_with_ascii);
assert!(!meta.is_all_uppercase);
}
#[test]
fn token_debug_clone_works() {
let token = Token::Word(WordToken {
text: Cow::Borrowed("hello"),
chars: vec!['h', 'e', 'l', 'l', 'o'],
meta: WordMeta::from_chars(&['h', 'e', 'l', 'l', 'o']),
});
let cloned = token.clone();
assert!(format!("{cloned:?}").contains("Word"));
}
#[test]
fn parse_simple_words() {
let ir = DocumentIR::parse("hello world", false);
assert_eq!(ir.tokens.len(), 3);
match &ir.tokens[0] {
Token::Word(w) => assert_eq!(w.text, "hello"),
_ => panic!("expected first token to be word"),
}
assert!(matches!(ir.tokens[1], Token::Space(SpaceKind::Regular)));
match &ir.tokens[2] {
Token::Word(w) => assert_eq!(w.text, "world"),
_ => panic!("expected third token to be word"),
}
}
#[test]
fn parse_empty() {
let ir = DocumentIR::parse("", false);
assert!(ir.tokens.is_empty());
}
#[test]
fn parse_sets_meta() {
let ir = DocumentIR::parse("ATM 한A", true);
match &ir.tokens[0] {
Token::Word(w) => {
assert!(w.meta.is_all_uppercase);
assert!(w.meta.starts_with_ascii);
}
_ => panic!("expected word"),
}
match &ir.tokens[2] {
Token::Word(w) => {
assert!(w.meta.has_korean);
assert!(w.meta.has_ascii_alphabetic);
assert!(!w.meta.is_all_uppercase);
}
_ => panic!("expected word"),
}
}
}