use crate::math_symbol_shortcut;
use crate::rules::context::EncoderState;
use crate::rules::math;
use crate::rules::token::{Token, WordMeta, WordToken};
use crate::rules::token_rule::{TokenAction, TokenPhase, TokenRule};
use std::borrow::Cow;
/// Token rule that recognises math-expression words in the token stream and
/// replaces them with encoded tokens (see the `TokenRule` impl in this file).
pub struct MathExpressionTokenRule;
/// Returns `true` when `c` is one of the superscript characters this rule
/// recognises: superscript digits, a few superscript signs/parentheses, and a
/// handful of superscript letters.
fn is_superscript(c: char) -> bool {
    matches!(
        c,
        // Latin-1 superscript digits: ², ³, ¹.
        '\u{00B2}' | '\u{00B3}' | '\u{00B9}'
            // Superscript zero.
            | '\u{2070}'
            // Superscript four..nine, plus sign, minus sign.
            | '\u{2074}'..='\u{207B}'
            // Superscript parentheses and superscript n (U+207C is skipped).
            | '\u{207D}'..='\u{207F}'
            // Modifier letters used as superscripts: ˣ, ᵏ, ᵐ, ᶜ.
            | '\u{02E3}'
            | '\u{1D4F}'
            | '\u{1D50}'
            | '\u{1D9C}'
    )
}
/// Returns `true` when `c` is one of the subscript characters this rule
/// recognises: subscript digits, a few subscript signs/parentheses, and the
/// subscript letters a, x, m and n.
fn is_subscript(c: char) -> bool {
    matches!(
        c,
        // Subscript zero..nine, plus sign, minus sign.
        '\u{2080}'..='\u{208B}'
            // Subscript parentheses (U+208C is skipped).
            | '\u{208D}'
            | '\u{208E}'
            // Subscript letters: ₐ, ₓ, ₘ, ₙ.
            | '\u{2090}'
            | '\u{2093}'
            | '\u{2098}'
            | '\u{2099}'
    )
}
/// Returns `true` when `c` is one of the combining marks this rule treats as
/// math decoration (U+0305, U+0307..=U+030A, U+0332).
fn is_combining_math_mark(c: char) -> bool {
    matches!(c, '\u{0305}' | '\u{0307}'..='\u{030A}' | '\u{0332}')
}
/// Returns `true` when `chars` contains exactly one middle dot (U+00B7 or
/// U+22C5) and every other character is an ASCII digit, `-`, or U+2212.
fn is_middle_dot_numeric_word(chars: &[char]) -> bool {
    let mut dot_total = 0usize;
    for &ch in chars {
        match ch {
            '\u{00B7}' | '\u{22C5}' => dot_total += 1,
            '0'..='9' | '-' | '\u{2212}' => {}
            // Any other character disqualifies the word outright.
            _ => return false,
        }
    }
    dot_total == 1
}
/// Looks at the nearest non-space token on each side of `tokens[index]`.
///
/// Returns `(prev_has_korean, next_has_korean)`. A side is `true` only when its
/// nearest non-space neighbour is a `Token::Word` whose `meta.has_korean` is
/// set; a missing neighbour or any other token kind yields `false`.
fn adjacent_korean_word_flags(tokens: &[Token<'_>], index: usize) -> (bool, bool) {
    // Walk backwards from the token just before `index`, skipping spaces.
    let prev_has_korean = tokens
        .get(..index)
        .and_then(|before| {
            before
                .iter()
                .rev()
                .find(|token| !matches!(token, Token::Space(_)))
        })
        .is_some_and(|token| matches!(token, Token::Word(w) if w.meta.has_korean));

    // Walk forwards from the token just after `index`, skipping spaces.
    let next_has_korean = tokens
        .get((index + 1)..)
        .and_then(|after| {
            after
                .iter()
                .find(|token| !matches!(token, Token::Space(_)))
        })
        .is_some_and(|token| matches!(token, Token::Word(w) if w.meta.has_korean));

    (prev_has_korean, next_has_korean)
}
/// Returns `true` when either flag reported by `adjacent_korean_word_flags`
/// for `tokens[index]` is set.
fn has_adjacent_korean_word(tokens: &[Token<'_>], index: usize) -> bool {
    matches!(
        adjacent_korean_word_flags(tokens, index),
        (true, _) | (_, true)
    )
}
/// Returns `true` when `c` lies in U+AC00..=U+D7A3 (Hangul syllables) or
/// U+3131..=U+3163 (Hangul compatibility jamo).
fn is_korean_char(c: char) -> bool {
    matches!(c, '\u{AC00}'..='\u{D7A3}' | '\u{3131}'..='\u{3163}')
}
/// Returns `true` for characters allowed in the trailing part of a mixed
/// math/Korean word: Korean characters (per `is_korean_char`), closing
/// brackets, and the punctuation `. , ! ?`.
fn is_korean_suffix_char(c: char) -> bool {
    const TRAILING_PUNCTUATION: [char; 7] = [')', ']', '}', '.', ',', '!', '?'];
    is_korean_char(c) || TRAILING_PUNCTUATION.contains(&c)
}
/// Builds an owned `Token::Word` from `text`.
///
/// The character vector and the word metadata are both derived from `text`.
/// The metadata is computed before the vector is moved into the token, so the
/// vector does not need to be cloned.
fn build_word_token(text: String) -> Token<'static> {
    let chars: Vec<char> = text.chars().collect();
    let meta = WordMeta::from_chars(&chars);
    Token::Word(WordToken {
        text: Cow::Owned(text),
        chars,
        meta,
    })
}
/// Returns `true` when a word of at least two characters carries an
/// unmistakable math marker.
///
/// The markers are: a leading function name (per
/// `math::function::starts_with_function`), a leading `√`, being enclosed in
/// `|…|`, or containing any superscript, subscript, or combining math mark.
fn is_strong_mixed_math_candidate(chars: &[char], text: &str) -> bool {
    if chars.len() < 2 {
        return false;
    }
    let has_script_or_mark = chars
        .iter()
        .any(|&c| is_superscript(c) || is_subscript(c) || is_combining_math_mark(c));
    let starts_with_function = math::function::starts_with_function(text);
    // Length is at least two here, so the two bars are distinct characters.
    let enclosed_in_bars = matches!(chars, ['|', .., '|']);

    starts_with_function || chars.starts_with(&['√']) || enclosed_in_bars || has_script_or_mark
}
/// Returns `true` for the compact notation shape: one ASCII uppercase letter
/// followed either by a single `-`, or by one or more characters drawn only
/// from `⁺`, `⁻` and the subscript digits `₀..=₉`.
fn is_rule_68_compact_notation(chars: &[char]) -> bool {
    let [head, tail @ ..] = chars else {
        return false;
    };
    if tail.is_empty() || !head.is_ascii_uppercase() {
        return false;
    }
    if matches!(tail, ['-']) {
        return true;
    }
    tail.iter().all(|&c| matches!(c, '⁺' | '⁻' | '₀'..='₉'))
        && tail.iter().any(|&c| is_superscript(c) || is_subscript(c))
}
/// Decides whether an encoded math word should be padded when it sits next to
/// a Korean word (the caller applies the padding).
///
/// Words of one character or fewer never wrap. Otherwise the word wraps when
/// it is a strong math candidate, or when it combines digits, letters,
/// operators, math symbols, brackets or scripts in one of the ways listed at
/// the bottom of the function.
fn should_wrap_math_sentence(chars: &[char], text: &str) -> bool {
    if chars.len() <= 1 {
        return false;
    }

    // Gather every per-character flag in a single pass.
    let mut letters = false;
    let mut digits = false;
    let mut math_symbol = false;
    let mut script_or_mark = false;
    let mut operator = false;
    let mut brackets = false;
    for &c in chars {
        letters |= c.is_ascii_alphabetic();
        digits |= c.is_ascii_digit();
        math_symbol |= math_symbol_shortcut::is_math_symbol_char(c);
        script_or_mark |= is_superscript(c) || is_subscript(c) || is_combining_math_mark(c);
        operator |= matches!(
            c,
            '+' | '=' | '>' | '<' | '.' | ',' | '-' | '\u{2212}' | '/' | '!'
        );
        brackets |= matches!(c, '(' | ')' | '[' | ']' | '{' | '}');
    }

    is_strong_mixed_math_candidate(chars, text)
        || (digits && (operator || math_symbol || brackets))
        || (letters && (digits || brackets || operator))
        || script_or_mark
}
fn try_encode_math_slice(chars: &[char]) -> Option<Vec<u8>> {
if chars.is_empty() || chars.iter().any(|c| is_korean_char(*c)) {
return None;
}
let text: String = chars.iter().collect();
if !is_strong_mixed_math_candidate(chars, &text) {
return None;
}
if !is_math_expression(chars, &text) {
return None;
}
math::encoder::encode_math_expression(&text).ok()
}
/// Encodes the math `prefix` of a mixed word whose remainder is `suffix`.
///
/// Returns `None` when `prefix` does not encode on its own. When `suffix` is
/// non-empty, consists only of Korean-suffix characters, contains at least one
/// Korean character, and `prefix` is a trig function name (per
/// `math::rule_46::is_trig_function`), the result is instead the encoding of
/// the prefix with an `x` appended, or `None` if that encoding fails.
fn try_encode_mixed_math_prefix(prefix: &[char], suffix: &[char]) -> Option<Vec<u8>> {
    let encoded_prefix = try_encode_math_slice(prefix)?;
    let text = prefix.iter().collect::<String>();

    let korean_tail = !suffix.is_empty()
        && suffix.iter().all(|&c| is_korean_suffix_char(c))
        && suffix.iter().any(|&c| is_korean_char(c));

    if korean_tail && math::rule_46::is_trig_function(&text) {
        math::encoder::encode_math_expression(&format!("{text}x")).ok()
    } else {
        Some(encoded_prefix)
    }
}
fn split_mixed_math_word(
word: &crate::rules::token::WordToken<'_>,
leading_delimiter_len: usize,
) -> Option<Vec<Token<'static>>> {
if !word.meta.has_korean || word.chars.iter().all(|c| is_korean_char(*c)) {
return None;
}
let chars = &word.chars;
let len = chars.len();
for end in (1..=len).rev() {
let Some(bytes) = try_encode_mixed_math_prefix(&chars[..end], &chars[end..]) else {
continue;
};
if end == len {
return None;
}
if !chars[end..].iter().all(|c| is_korean_suffix_char(*c))
|| !chars[end..].iter().any(|c| is_korean_char(*c))
{
continue;
}
let suffix: String = chars[end..].iter().collect();
return Some(vec![
Token::PreEncoded(vec![0; leading_delimiter_len]),
Token::PreEncoded(bytes),
Token::PreEncoded(vec![0, 0]),
build_word_token(suffix),
]);
}
None
}
/// Decides whether a single word should be treated as a math expression.
///
/// `chars` and `text` are two views of the same word (every caller in this
/// file derives one from the other). The body is an ordered list of early
/// returns: the first check that matches decides the result, so the order of
/// the checks is significant. Returns `false` when no check matches.
fn is_math_expression(chars: &[char], text: &str) -> bool {
// Compact notations recognised by `is_rule_68_compact_notation` are never math here.
if is_rule_68_compact_notation(chars) {
return false;
}
// A lone operator or relation sign counts as math.
if chars.len() == 1
&& matches!(
chars[0],
'+' | '=' | '−' | '×' | '÷' | '<' | '>' | '≠' | '≥' | '≤'
)
{
return true;
}
// A single Unicode fraction character (per `crate::fraction::is_unicode_fraction`).
if chars.len() == 1 && crate::fraction::is_unicode_fraction(chars[0]) {
return true;
}
// Negative infinity: `-` or U+2212 followed by U+221E.
if chars.len() == 2 && matches!(chars[0], '-' | '\u{2212}') && chars[1] == '\u{221E}' {
return true;
}
// Any Hangul syllable or compatibility jamo disqualifies the word.
// These are the same two ranges that `is_korean_char` tests.
for c in chars {
let code = *c as u32;
if (0xAC00..=0xD7A3).contains(&code) || (0x3131..=0x3163).contains(&code) {
return false;
}
}
// Per-word feature flags consumed by the heuristics below.
let has_letters = chars.iter().any(|c| c.is_ascii_alphabetic());
let has_digits = chars.iter().any(|c| c.is_ascii_digit());
let has_math_symbol = chars
.iter()
.any(|c| math_symbol_shortcut::is_math_symbol_char(*c));
// "Strong" symbols are math symbols other than the middle dots and `/`.
let has_strong_math_symbol = chars.iter().any(|c| {
math_symbol_shortcut::is_math_symbol_char(*c)
&& !matches!(*c, '\u{00B7}' | '\u{22C5}' | '/')
});
let has_superscript = chars.iter().any(|c| is_superscript(*c));
let has_subscript = chars.iter().any(|c| is_subscript(*c));
let has_combining_mark = chars.iter().any(|c| is_combining_math_mark(*c));
let has_function = math::function::starts_with_function(text);
let has_math_operator = chars.iter().any(|c| {
matches!(
c,
'+' | '=' | '>' | '<' | '.' | ',' | '-' | '\u{2212}' | '/' | '!'
)
});
let has_brackets = chars
.iter()
.any(|c| matches!(c, '(' | ')' | '[' | ']' | '{' | '}'));
let starts_with_math_symbol = chars
.first()
.is_some_and(|c| math_symbol_shortcut::is_math_symbol_char(*c));
// Starts with a digit and consists only of ASCII digits and subscript digits,
// with at least one subscript digit (e.g. `1010₂`).
if chars.first().is_some_and(|c| c.is_ascii_digit())
&& chars.iter().any(|c| matches!(*c, '\u{2080}'..='\u{2089}'))
&& chars
.iter()
.all(|c| c.is_ascii_digit() || matches!(*c, '\u{2080}'..='\u{2089}'))
{
return true;
}
// Words made only of digits and `- ~ ( ) ,` that do not start with a minus
// sign are rejected. An empty slice also lands here and returns `false`.
if !has_letters
&& chars
.iter()
.all(|c| c.is_ascii_digit() || matches!(c, '-' | '~' | '(' | ')' | ','))
&& !chars
.first()
.is_some_and(|c| matches!(*c, '-' | '\u{2212}'))
{
return false;
}
// Digits and slashes only: math only for exactly `d/d` with single-digit
// parts; every other digit/slash word is rejected.
if !has_letters && chars.contains(&'/') && chars.iter().all(|c| c.is_ascii_digit() || *c == '/')
{
let parts: Vec<&str> = text.split('/').collect();
if parts.len() == 2
&& parts
.iter()
.all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit()) && p.len() == 1)
{
return true;
}
return false;
}
// A recognised function name, either alone or followed by more characters.
// Note: `name.len()` is a byte length compared against a character count.
if has_function
&& let Some((name, _)) = math::function::match_function_prefix(text)
&& (chars.len() > name.len() || text == name)
{
return true;
}
// Inverse trig spelled with an `arc` prefix: `arcsin…`, `arccos…`, `arctan…`.
if let Some(rest) = text.strip_prefix("arc")
&& ["sin", "cos", "tan"]
.iter()
.any(|name| rest.starts_with(name))
{
return true;
}
// Three letters shaped lower-UPPER-lower (e.g. `aRb`).
if chars.len() == 3
&& chars[0].is_ascii_lowercase()
&& chars[1].is_ascii_uppercase()
&& chars[2].is_ascii_lowercase()
{
return true;
}
// Letters with at most `, . ' "` punctuation and no digits, strong symbols
// or scripts: rejected.
if has_letters
&& !has_digits
&& !has_strong_math_symbol
&& !has_superscript
&& !has_subscript
&& chars
.iter()
.all(|c| c.is_ascii_alphabetic() || matches!(*c, ',' | '.' | '\'' | '"'))
{
return false;
}
// A parenthesised single letter such as `(a)`, optionally followed by
// punctuation, is rejected. Only the fourth character is inspected; anything
// after it is not checked here.
if chars.len() >= 3
&& chars.first() == Some(&'(')
&& chars.get(1).is_some_and(|c| c.is_ascii_alphabetic())
&& chars.get(2) == Some(&')')
&& chars
.get(3)
.is_none_or(|c| matches!(*c, ',' | '.' | '\'' | '"'))
{
return false;
}
// Scripts or combining marks attached to letters or digits.
if (has_superscript || has_subscript) && (has_letters || has_digits) {
return true;
}
if has_combining_mark && (has_letters || has_digits) {
return true;
}
// Operator plus letters is math, except for a word of letters, apostrophes
// and slashes that ends with `/`.
if has_math_operator && has_letters {
let trailing_slash_word = chars.last() == Some(&'/')
&& chars
.iter()
.all(|c| c.is_ascii_alphabetic() || matches!(*c, '/' | '\''));
if trailing_slash_word {
return false;
}
return true;
}
if has_math_operator && has_digits {
return true;
}
// NOTE(review): this check looks subsumed by the next one, which has the
// same conditions minus the letter requirement.
if has_math_symbol && has_letters && has_digits {
return true;
}
if has_math_symbol && has_digits {
return true;
}
if has_strong_math_symbol && has_letters {
return true;
}
// NOTE(review): a strong symbol is also a math symbol, so this case and the
// `starts_with_math_symbol` case below appear to be covered already by the
// `has_math_symbol && has_digits` check above.
if has_strong_math_symbol && has_digits {
return true;
}
if starts_with_math_symbol && has_digits {
return true;
}
// A slash with a letter directly before it and a letter directly after it.
// NOTE(review): `/` is in the operator set, so any word with letters and `/`
// has already returned from the `has_math_operator && has_letters` block;
// this block looks unreachable.
if has_letters && chars.contains(&'/') {
let has_letter_before_slash = chars
.windows(2)
.any(|w| w[0].is_ascii_alphabetic() && w[1] == '/');
let has_letter_after_slash = chars
.windows(2)
.any(|w| w[0] == '/' && w[1].is_ascii_alphabetic());
if has_letter_before_slash && has_letter_after_slash {
return true;
}
}
// NOTE(review): both minus signs are in the operator set, so this appears to
// be covered by the `has_math_operator && has_digits` check above.
if has_digits && chars.iter().any(|c| matches!(*c, '-' | '\u{2212}')) {
return true;
}
if has_brackets && has_digits {
return true;
}
if has_brackets
&& has_letters
&& (has_math_operator || has_math_symbol || has_superscript || has_subscript)
{
return true;
}
// An ASCII letter immediately followed by `(`, like a call such as `f(x)`.
if has_brackets && has_letters {
let text_bytes = text.as_bytes();
let has_func_call = text_bytes
.windows(2)
.any(|w| (w[0] as char).is_ascii_alphabetic() && w[1] == b'(');
if has_func_call {
return true;
}
}
// Starts with a digit and has a lowercase letter somewhere after it (e.g. `3ab`).
if chars.len() >= 2 && chars[0].is_ascii_digit() {
let has_letter_after_digit = chars.iter().skip(1).any(|c| c.is_ascii_lowercase());
if has_letter_after_digit {
return true;
}
}
false
}
/// `TokenRule` implementation that rewrites math-looking words into
/// pre-encoded (or fraction) tokens.
impl TokenRule for MathExpressionTokenRule {
/// This rule runs in the `FractionDetection` phase.
fn phase(&self) -> TokenPhase {
TokenPhase::FractionDetection
}
/// Priority value reported for this rule within its phase.
fn priority(&self) -> u16 {
50 }
/// Inspects `tokens[index]` and, if it is a word this rule handles, returns
/// the replacement action.
///
/// Returns `TokenAction::Noop` when the token is not a `Token::Word`, when no
/// case below applies, or when the math encoder rejects the word. The cases
/// are tried in order and the first match returns.
///
/// # Errors
///
/// Propagates (via `?`) errors from
/// `math_symbol_shortcut::encode_char_math_symbol_shortcut` and
/// `crate::english::encode_english`.
fn apply<'a>(
&self,
tokens: &[Token<'a>],
index: usize,
_state: &mut EncoderState,
) -> Result<TokenAction<'a>, String> {
/// Finds the nearest word on each side of `index`, skipping `Token::Space`.
/// A side is `None` when the first non-space token there is not a word or
/// when the slice ends first.
fn prev_next_words<'a>(
tokens: &'a [Token<'a>],
index: usize,
) -> (
Option<&'a crate::rules::token::WordToken<'a>>,
Option<&'a crate::rules::token::WordToken<'a>>,
) {
let prev = index.checked_sub(1).and_then(|mut i| {
loop {
match tokens.get(i) {
Some(Token::Space(_)) => i = i.checked_sub(1)?,
Some(Token::Word(w)) => return Some(w),
_ => return None,
}
}
});
let next = {
let mut i = index + 1;
loop {
match tokens.get(i) {
Some(Token::Space(_)) => i += 1,
Some(Token::Word(w)) => break Some(w),
_ => break None,
}
}
};
(prev, next)
}
/// True when the word is exactly the single character `⊻`.
fn is_logic_symbol_word(word: &crate::rules::token::WordToken<'_>) -> bool {
word.chars
.first()
.is_some_and(|c| word.chars.len() == 1 && matches!(*c, '⊻'))
}
// Only word tokens are handled by this rule.
let Some(Token::Word(word)) = tokens.get(index) else {
return Ok(TokenAction::Noop);
};
let text = word.text.as_ref();
// A digit/middle-dot word next to a Korean word is left untouched.
if is_middle_dot_numeric_word(&word.chars) && has_adjacent_korean_word(tokens, index) {
return Ok(TokenAction::Noop);
}
// A lone `∴` or `∵` with content (a word or pre-encoded token) on both sides,
// ignoring spaces, is emitted as its shortcut encoding with a 0 byte before
// and after (presumably a blank cell — confirm against the encoder).
if matches!(word.chars.as_slice(), ['∴' | '∵']) {
let has_prev_content = index
.checked_sub(1)
.and_then(|mut i| {
loop {
match tokens.get(i) {
Some(Token::Space(_)) => i = i.checked_sub(1)?,
Some(Token::Word(_) | Token::PreEncoded(_)) => return Some(true),
_ => return None,
}
}
})
.unwrap_or(false);
let has_next_content = {
let mut i = index + 1;
loop {
match tokens.get(i) {
Some(Token::Space(_)) => i += 1,
Some(Token::Word(_) | Token::PreEncoded(_)) => break true,
_ => break false,
}
}
};
if has_prev_content && has_next_content {
let encoded =
math_symbol_shortcut::encode_char_math_symbol_shortcut(word.chars[0])?;
let mut out = vec![0];
out.extend_from_slice(encoded);
out.push(0);
return Ok(TokenAction::Replace(Token::PreEncoded(out)));
}
}
// A single uppercase letter whose nearest neighbouring word is a lone `⊻`
// is encoded as the corresponding lowercase English letter.
if word.chars.len() == 1 && word.chars[0].is_ascii_uppercase() {
let (prev, next) = prev_next_words(tokens, index);
if prev.is_some_and(is_logic_symbol_word) || next.is_some_and(is_logic_symbol_word) {
let code = crate::english::encode_english(word.chars[0].to_ascii_lowercase())?;
return Ok(TokenAction::Replace(Token::PreEncoded(vec![code])));
}
}
// Words starting with `$` are treated as inline LaTeX. This branch always
// returns, so such words never reach the generic math path below.
if let Some(stripped) = text.strip_prefix('$') {
// `$...$suffix`: `close_idx` is relative to `stripped`, so the closing `$`
// sits at byte `close_idx + 1` of `text`. The guard requires a non-empty
// suffix after it.
if let Some(close_idx) = stripped.find('$')
&& close_idx + 1 < stripped.len()
{
let latex = &text[..=close_idx + 1];
let suffix = &stripped[close_idx + 1..];
// A LaTeX fraction becomes a fraction token followed by the suffix word.
if let Some((whole, numerator, denominator)) =
crate::fraction::parse_latex_fraction(latex)
{
return Ok(TokenAction::ReplaceMany(vec![
Token::Fraction(crate::rules::token::FractionToken {
whole,
numerator,
denominator,
}),
build_word_token(suffix.to_string()),
]));
}
// Otherwise strip the LaTeX markup and try the math encoder on the result.
let inner = &latex[1..latex.len() - 1];
let math_text = crate::rules::token_rules::latex_math::strip_latex_to_math(inner);
if let Ok(bytes) = math::encoder::encode_math_expression(&math_text) {
return Ok(TokenAction::ReplaceMany(vec![
Token::PreEncoded(bytes),
build_word_token(suffix.to_string()),
]));
}
}
// The whole word as a LaTeX fraction.
if let Some((whole, numerator, denominator)) =
crate::fraction::parse_latex_fraction(text)
{
return Ok(TokenAction::Replace(Token::Fraction(
crate::rules::token::FractionToken {
whole,
numerator,
denominator,
},
)));
}
// The whole word as `$...$` with at least one byte between the delimiters.
if text.ends_with('$') && text.len() >= 3 {
let inner = &text[1..text.len() - 1];
let math_text = crate::rules::token_rules::latex_math::strip_latex_to_math(inner);
if let Ok(bytes) = math::encoder::encode_math_expression(&math_text) {
return Ok(TokenAction::Replace(Token::PreEncoded(bytes)));
}
}
return Ok(TokenAction::Noop);
}
// Not a math expression as a whole: try to split off a math prefix from a
// mixed math/Korean word.
if !is_math_expression(&word.chars, text) {
// One leading zero byte when the previous token is a space, otherwise two.
// At `index == 0`, `saturating_sub` yields the current (word) token, so
// the result is 2.
let leading_delimiter_len =
if matches!(tokens.get(index.saturating_sub(1)), Some(Token::Space(_))) {
1
} else {
2
};
if let Some(replacement) = split_mixed_math_word(word, leading_delimiter_len) {
return Ok(TokenAction::ReplaceMany(replacement));
}
return Ok(TokenAction::Noop);
}
// Generic path: encode the whole word as math.
match math::encoder::encode_math_expression(text) {
Ok(bytes) => {
let (prev_has_korean, next_has_korean) = adjacent_korean_word_flags(tokens, index);
let should_wrap = should_wrap_math_sentence(&word.chars, text);
// NOTE(review): this capacity hint does not include the two bytes pushed
// in the U+2206 special case below; that only affects reallocation.
let mut wrapped = Vec::with_capacity(
bytes.len()
+ usize::from(prev_has_korean && should_wrap)
+ usize::from(next_has_korean && should_wrap),
);
// Pad with a 0 byte on each side that borders a Korean word.
if prev_has_korean && should_wrap {
wrapped.push(0);
}
// Special case: no Korean word before, and the text contains U+2206, `=`
// and `)+(` — two leading 0 bytes are emitted.
if !prev_has_korean
&& text.contains('\u{2206}')
&& text.contains('=')
&& text.contains(")+(")
{
wrapped.push(0);
wrapped.push(0);
}
wrapped.extend_from_slice(&bytes);
if next_has_korean && should_wrap {
wrapped.push(0);
}
Ok(TokenAction::Replace(Token::PreEncoded(wrapped)))
}
// The encoder error is discarded on purpose here; the token is left as-is.
Err(_) => {
Ok(TokenAction::Noop)
}
}
}
}
/// Unit tests for the math-expression classifier and the mixed-word splitter.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rules::token::WordMeta;
    use std::borrow::Cow;

    /// Runs `is_math_expression` on `text`, deriving the char slice from it.
    fn classify(text: &str) -> bool {
        let chars: Vec<char> = text.chars().collect();
        is_math_expression(&chars, text)
    }

    /// Builds a borrowed word token for `text` with matching chars and metadata.
    fn word_for(text: &'static str) -> crate::rules::token::WordToken<'static> {
        let chars: Vec<char> = text.chars().collect();
        crate::rules::token::WordToken {
            text: Cow::Borrowed(text),
            chars: chars.clone(),
            meta: WordMeta::from_chars(&chars),
        }
    }

    #[test]
    fn test_is_math_with_operator() {
        assert!(classify("ax+b=0"));
    }

    #[test]
    fn test_is_math_with_function() {
        assert!(classify("sin3x"));
    }

    #[test]
    fn test_is_math_with_standalone_function_name() {
        assert!(classify("sin"));
    }

    #[test]
    fn test_is_not_math_korean() {
        assert!(!classify("안녕"));
    }

    #[test]
    fn test_is_not_math_plain_english() {
        assert!(!classify("hello"));
    }

    #[test]
    fn test_is_math_with_superscript() {
        assert!(classify("x²"));
    }

    #[test]
    fn test_is_math_digit_letter_with_operator() {
        assert!(classify("3a+b"));
    }

    #[test]
    fn test_is_math_digit_then_letter() {
        assert!(classify("3ab"));
    }

    #[test]
    fn test_is_not_math_letter_then_digit() {
        assert!(!classify("MP3"));
    }

    #[test]
    fn test_is_math_symbol_digit_combo() {
        assert!(classify("≠0"));
    }

    #[test]
    fn test_is_math_decimal_number() {
        assert!(classify("0.17"));
    }

    #[test]
    fn test_is_math_relation_shorthand() {
        assert!(classify("aRb"));
    }

    #[test]
    fn test_is_math_negative_infinity() {
        assert!(classify("-∞"));
    }

    #[test]
    fn test_is_math_unicode_fraction_char() {
        assert!(classify("⅔"));
    }

    #[test]
    fn test_is_math_base_notation() {
        assert!(classify("1010₂"));
    }

    #[test]
    fn split_mixed_math_word_extracts_math_prefix() {
        let word = word_for("tan의");
        let replacement = split_mixed_math_word(&word, 2).expect("expected split");
        // Leading delimiter, encoded prefix, two-byte separator, Korean remainder.
        assert!(matches!(&replacement[0], Token::PreEncoded(bytes) if bytes == &vec![0, 0]));
        assert!(matches!(&replacement[1], Token::PreEncoded(_)));
        assert!(matches!(&replacement[2], Token::PreEncoded(bytes) if bytes == &vec![0, 0]));
        assert!(matches!(&replacement[3], Token::Word(w) if w.text == "의"));
    }

    #[test]
    fn split_mixed_math_word_keeps_plain_mixed_english_korean() {
        let word = word_for("ATM에서");
        assert!(split_mixed_math_word(&word, 2).is_none());
    }
}