use crate::rules::context::EncoderState;
use crate::rules::math;
use crate::rules::math::math_token_rule::MathContext;
use crate::rules::token::{Token, WordMeta, WordToken};
use std::borrow::Cow;
use super::detect::is_math_expression;
/// Reports whether `c` belongs to the set of Unicode superscript characters
/// recognised by this module.
///
/// The set is a fixed whitelist: superscript digits, a handful of superscript
/// signs/letters from the "Superscripts and Subscripts" block, and selected
/// modifier letters. Anything not listed below yields `false`.
pub(super) fn is_superscript(c: char) -> bool {
    match c {
        // Superscript digits: ¹ ² ³ live in Latin-1, ⁰ and ⁴..⁹ in U+2070..U+2079.
        '\u{00B9}' | '\u{00B2}' | '\u{00B3}' | '\u{2070}' | '\u{2074}'..='\u{2079}' => true,
        // ⁺ ⁻ ⁽ ⁾ ⁿ ⁱ — note that U+207C is deliberately not matched here.
        '\u{207A}' | '\u{207B}' | '\u{207D}' | '\u{207E}' | '\u{207F}' | '\u{2071}' => true,
        // Selected spacing modifier letters.
        '\u{02B0}' | '\u{02B2}' | '\u{02B3}' | '\u{02B7}' | '\u{02B8}' => true,
        '\u{02E1}'..='\u{02E3}' => true,
        // Selected modifier letters from the phonetic extension blocks.
        '\u{1D43}'..='\u{1D58}' | '\u{1D5B}' | '\u{1D9C}' | '\u{1DA0}' | '\u{1DBB}' => true,
        _ => false,
    }
}
/// Reports whether `c` belongs to the set of Unicode subscript characters
/// recognised by this module.
///
/// Accepted code points: subscript digits, the subscript `+`, `-`, `(` and
/// `)`, the Latin subscript letters in U+2090..=U+209C, and the four subscript
/// letters in U+1D62..=U+1D65.
pub(super) fn is_subscript(c: char) -> bool {
    let is_digit = ('\u{2080}'..='\u{2089}').contains(&c);
    // U+208C sits between these and is not accepted.
    let is_sign_or_paren = matches!(c, '\u{208A}' | '\u{208B}' | '\u{208D}' | '\u{208E}');
    let is_latin_letter = ('\u{2090}'..='\u{209C}').contains(&c);
    let is_phonetic_letter = ('\u{1D62}'..='\u{1D65}').contains(&c);

    is_digit || is_sign_or_paren || is_latin_letter || is_phonetic_letter
}
/// Reports whether `c` is one of the combining diacritics this module treats
/// as a mathematical mark.
pub(super) fn is_combining_math_mark(c: char) -> bool {
    // Macron, overline, dot above, diaeresis, hook above, ring above, low line.
    const MATH_MARKS: [char; 7] = [
        '\u{0304}', '\u{0305}', '\u{0307}', '\u{0308}', '\u{0309}', '\u{030A}', '\u{0332}',
    ];
    MATH_MARKS.contains(&c)
}
/// Reports whether `chars` consists solely of ASCII digits, minus signs
/// (`-` or U+2212) and exactly one middle dot (U+00B7 or U+22C5).
///
/// Only the character inventory is checked: positions are not validated and
/// digits are not required, so a lone middle dot is accepted.
pub(super) fn is_middle_dot_numeric_word(chars: &[char]) -> bool {
    let mut dots_seen = 0usize;
    for &c in chars {
        match c {
            '\u{00B7}' | '\u{22C5}' => dots_seen += 1,
            '\u{2212}' | '-' => {}
            digit if digit.is_ascii_digit() => {}
            // Any other character disqualifies the word outright.
            _ => return false,
        }
    }
    dots_seen == 1
}
/// Looks at the nearest non-space neighbours of `tokens[index]` and reports
/// whether each one is a word flagged as containing Korean.
///
/// Returns `(previous, next)`. A side is `false` when there is no neighbour on
/// that side, or when the first non-space token found there is not a
/// `Token::Word`. Runs of `Token::Space` are skipped in both directions.
pub(super) fn adjacent_korean_word_flags(tokens: &[Token<'_>], index: usize) -> (bool, bool) {
    /// `true` only for a word token whose metadata says it contains Korean.
    fn is_korean_word(token: Option<&Token<'_>>) -> bool {
        match token {
            Some(Token::Word(word)) => word.meta.has_korean,
            _ => false,
        }
    }

    // Everything strictly before `index`, scanned backwards. `get` yields
    // `None` for an out-of-range index, which maps to `false` below.
    let nearest_before = tokens.get(..index).and_then(|head| {
        head.iter()
            .rev()
            .find(|token| !matches!(token, Token::Space(_)))
    });

    // Everything strictly after `index`, scanned forwards.
    let nearest_after = tokens
        .get(index + 1..)
        .and_then(|tail| tail.iter().find(|token| !matches!(token, Token::Space(_))));

    (is_korean_word(nearest_before), is_korean_word(nearest_after))
}
/// Reports whether the nearest non-space neighbour on either side of
/// `tokens[index]` is a word containing Korean.
///
/// Thin wrapper that collapses the pair returned by
/// `adjacent_korean_word_flags` into a single flag.
pub(super) fn has_adjacent_korean_word(tokens: &[Token<'_>], index: usize) -> bool {
    matches!(
        adjacent_korean_word_flags(tokens, index),
        (true, _) | (_, true)
    )
}
/// Reports whether `c` is a precomposed Hangul syllable (U+AC00..=U+D7A3) or a
/// Hangul compatibility jamo in U+3131..=U+3163.
pub(super) fn is_korean_char(c: char) -> bool {
    matches!(c, '\u{AC00}'..='\u{D7A3}' | '\u{3131}'..='\u{3163}')
}
/// Reports whether `c` may appear in a Korean suffix: either a Korean
/// character (per `is_korean_char`) or one of the closing brackets / sentence
/// punctuation marks `)`, `]`, `}`, `.`, `,`, `!`, `?`.
pub(super) fn is_korean_suffix_char(c: char) -> bool {
    match c {
        ')' | ']' | '}' | '.' | ',' | '!' | '?' => true,
        other => is_korean_char(other),
    }
}
/// Builds a `MathContext` by copying the matrix-context and math-mode flags
/// out of the encoder state.
pub(super) fn math_context_from_state(state: &EncoderState) -> MathContext {
    let matrix_context_active = state.matrix_context_active;
    let math_mode_active = state.math_mode_active;

    MathContext {
        matrix_context_active,
        math_mode_active,
    }
}
/// Decides, from the first character of `s` only, whether rule 44 requires a
/// space before the Korean text.
///
/// Returns `true` when the first character is the syllable `운`, or when it is
/// a precomposed Hangul syllable whose initial-consonant index is one of
/// 2, 3, 6, 15, 16, 17, 18 (ㄴ, ㄷ, ㅁ, ㅋ, ㅌ, ㅍ, ㅎ in Unicode's
/// initial-consonant order). Empty input and non-syllable first characters
/// yield `false`.
pub(super) fn rule_44_requires_space_before_korean(s: &str) -> bool {
    const FIRST_SYLLABLE: u32 = 0xAC00;
    const LAST_SYLLABLE: u32 = 0xD7A3;
    // Each initial consonant spans 21 vowels x 28 finals consecutive syllables.
    const SYLLABLES_PER_INITIAL: u32 = 21 * 28;

    match s.chars().next() {
        None => false,
        // Explicit single-syllable exception carried over from the rule table.
        Some('운') => true,
        Some(first) => {
            let code = u32::from(first);
            (FIRST_SYLLABLE..=LAST_SYLLABLE).contains(&code)
                && matches!(
                    (code - FIRST_SYLLABLE) / SYLLABLES_PER_INITIAL,
                    2 | 3 | 6 | 15 | 16 | 17 | 18
                )
        }
    }
}
/// Wraps an owned string in a `Token::Word`, deriving the character vector and
/// the word metadata from it.
///
/// The metadata is computed from a borrow of the character vector first, so
/// the vector can then be moved into the token without an extra clone.
pub(super) fn build_word_token(text: String) -> Token<'static> {
    let chars: Vec<char> = text.chars().collect();
    let meta = WordMeta::from_chars(&chars);
    Token::Word(WordToken {
        text: Cow::Owned(text),
        chars,
        meta,
    })
}
/// Heuristic pre-filter: reports whether a multi-character word shows at least
/// one strong sign of being a math expression.
///
/// Words of zero or one character are never candidates. Otherwise any one of
/// the following is sufficient:
/// * `text` starts with a known function name (`math::function::starts_with_function`);
/// * the word starts with `√`;
/// * the word both starts and ends with `|`;
/// * it contains a superscript, subscript or combining math mark;
/// * it contains `=`, an ASCII letter and one of `+ - × ÷ −`;
/// * it looks like a short call: ASCII letter, `(`, at least one more
///   character, and at most three ASCII letters overall.
pub(super) fn is_strong_mixed_math_candidate(chars: &[char], text: &str) -> bool {
    let (first, last) = match chars {
        [first, .., last] => (*first, *last),
        // Zero or one character.
        _ => return false,
    };

    // Evaluated unconditionally once the length guard has passed.
    let starts_with_function = math::function::starts_with_function(text);
    if starts_with_function || first == '√' || (first == '|' && last == '|') {
        return true;
    }

    let has_script_or_mark = chars
        .iter()
        .any(|&c| is_superscript(c) || is_subscript(c) || is_combining_math_mark(c));
    if has_script_or_mark {
        return true;
    }

    let letter_count = chars.iter().filter(|c| c.is_ascii_alphabetic()).count();

    let has_operator = || {
        chars
            .iter()
            .any(|&c| matches!(c, '+' | '-' | '×' | '÷' | '\u{2212}'))
    };
    if chars.contains(&'=') && letter_count > 0 && has_operator() {
        return true;
    }

    let opens_like_call = matches!(chars, [name, '(', _, ..] if name.is_ascii_alphabetic());
    opens_like_call && letter_count <= 3
}
/// Reports whether `chars` is an ASCII uppercase letter followed by a compact
/// suffix accepted by rule 68.
///
/// Accepted shapes:
/// * exactly two characters, the second being an ASCII `-`;
/// * one uppercase letter followed by one or more characters drawn only from
///   `⁺`, `⁻` and the subscript digits `₀`..`₉`.
pub(super) fn is_rule_68_compact_notation(chars: &[char]) -> bool {
    let tail = match chars {
        [head, tail @ ..] if !tail.is_empty() && head.is_ascii_uppercase() => tail,
        _ => return false,
    };

    if matches!(tail, ['-']) {
        return true;
    }

    let only_signs_and_indices = tail.iter().all(|&c| matches!(c, '⁺' | '⁻' | '₀'..='₉'));
    // Explicit guard that at least one script character is present.
    only_signs_and_indices && tail.iter().any(|&c| is_superscript(c) || is_subscript(c))
}
pub(super) fn try_encode_math_slice(chars: &[char], math_context: MathContext) -> Option<Vec<u8>> {
if chars.is_empty() || chars.iter().any(|c| is_korean_char(*c)) {
return None;
}
let text: String = chars.iter().collect();
if !is_strong_mixed_math_candidate(chars, &text) {
return None;
}
if !is_math_expression(chars, &text) {
return None;
}
math::encoder::encode_math_expression_with_context(&text, math_context).ok()
}
/// Scans `chars` left to right, tracking parenthesis depth, and reports whether
/// any Korean character occurs while at least one `(` is open.
fn has_korean_inside_parens(chars: &[char]) -> bool {
    let mut depth = 0i32;
    chars.iter().fold(false, |found, &c| match c {
        '(' => {
            depth += 1;
            found
        }
        ')' => {
            depth -= 1;
            found
        }
        _ => found || (depth > 0 && is_korean_char(c)),
    })
}

/// Reports whether a word mixes Korean text into a math expression.
///
/// The word must contain at least one Korean character and match one of:
/// * a `√` immediately followed by a Korean character;
/// * a fraction-like form — it has a parenthesis, one of `= + / × ÷`, the text
///   contains `/(` or `)/`, and some Korean character sits inside parentheses;
/// * two Korean characters separated by a single space, together with one of
///   `= + / × ÷` and no ASCII letter anywhere in the word.
pub(super) fn is_mixed_math_expression(chars: &[char], text: &str) -> bool {
    if !chars.iter().any(|&c| is_korean_char(c)) {
        return false;
    }

    // A root sign directly applied to Korean text needs no further evidence.
    let root_over_korean = chars
        .windows(2)
        .any(|pair| pair[0] == '√' && is_korean_char(pair[1]));
    if root_over_korean {
        return true;
    }

    // Both remaining forms require a math operator.
    let has_operator = chars
        .iter()
        .any(|&c| matches!(c, '=' | '+' | '/' | '×' | '÷'));
    if !has_operator {
        return false;
    }

    let has_paren = chars.iter().any(|&c| c == '(' || c == ')');
    let slash_touches_paren = text.contains("/(") || text.contains(")/");
    if has_paren && slash_touches_paren && has_korean_inside_parens(chars) {
        return true;
    }

    let has_spaced_korean_words = chars
        .windows(3)
        .any(|w| matches!(w, [a, ' ', b] if is_korean_char(*a) && is_korean_char(*b)));
    has_spaced_korean_words && !chars.iter().any(|c| c.is_ascii_alphabetic())
}
/// Attempts to encode `chars` as a math expression that contains Korean text.
///
/// Returns `None` for an empty slice, for input that
/// `is_mixed_math_expression` rejects, or when the math encoder fails.
pub(super) fn try_encode_mixed_math_slice(
    chars: &[char],
    math_context: MathContext,
) -> Option<Vec<u8>> {
    let text: String = match chars {
        [] => return None,
        _ => chars.iter().collect(),
    };

    if is_mixed_math_expression(chars, &text) {
        math::encoder::encode_math_expression_with_context(&text, math_context).ok()
    } else {
        None
    }
}
/// Attempts to encode `prefix` as math, taking the following `suffix` into
/// account.
///
/// Returns `None` when `prefix` is not encodable via `try_encode_math_slice`.
/// Otherwise the prefix encoding is returned, with one exception: when
/// `suffix` is a non-empty run of Korean-suffix characters containing at least
/// one Korean character and `prefix` is a trig function name
/// (`math::rule_46::is_trig_function`), the prefix is re-encoded with a literal
/// `x` appended, and the outcome of that second encoding is returned — `None`
/// if it fails.
pub(super) fn try_encode_mixed_math_prefix(
    prefix: &[char],
    suffix: &[char],
    math_context: MathContext,
) -> Option<Vec<u8>> {
    let bytes = try_encode_math_slice(prefix, math_context)?;

    let korean_suffix_follows = !suffix.is_empty()
        && suffix.iter().all(|&c| is_korean_suffix_char(c))
        && suffix.iter().any(|&c| is_korean_char(c));
    if !korean_suffix_follows {
        return Some(bytes);
    }

    let text: String = prefix.iter().collect();
    if !math::rule_46::is_trig_function(&text) {
        return Some(bytes);
    }

    // NOTE(review): the appended "x" presumably stands in for the function's
    // argument so the encoder sees a complete call — confirm against rule 46.
    let with_argument = format!("{text}x");
    math::encoder::encode_math_expression_with_context(&with_argument, math_context).ok()
}
/// Assembles the replacement tokens for a word split into a math prefix and a
/// Korean suffix.
///
/// The result is, in order: `leading_delimiter_len` zero bytes, the encoded
/// math `bytes`, two zero bytes, and the suffix as a word token.
/// NOTE(review): the zero bytes presumably represent blank cells — confirm
/// against the encoder's cell format.
fn build_math_prefix_replacement(
    leading_delimiter_len: usize,
    bytes: Vec<u8>,
    suffix: String,
) -> Vec<Token<'static>> {
    vec![
        Token::PreEncoded(vec![0; leading_delimiter_len]),
        Token::PreEncoded(bytes),
        Token::PreEncoded(vec![0, 0]),
        build_word_token(suffix),
    ]
}
/// Assembles the replacement tokens for a word split into a Korean prefix and
/// a math suffix: the prefix as a word token, two zero bytes, then the encoded
/// math `bytes`.
fn build_korean_prefix_math_suffix(prefix: String, bytes: Vec<u8>) -> Vec<Token<'static>> {
    vec![
        build_word_token(prefix),
        Token::PreEncoded(vec![0, 0]),
        Token::PreEncoded(bytes),
    ]
}
/// Splits a word that mixes Korean and non-Korean characters into separately
/// encoded math and Korean parts.
///
/// Returns `None` when the word is not flagged as containing Korean, when
/// every character is Korean, or when no split point works. Two layouts are
/// tried in order:
///
/// 1. **math prefix + Korean suffix** — split points are tried from the
///    longest prefix to the shortest; the suffix must consist only of
///    Korean-suffix characters with at least one Korean character, and the
///    prefix must encode via `try_encode_mixed_math_prefix`. The result is
///    built by `build_math_prefix_replacement` using `leading_delimiter_len`.
/// 2. **Korean prefix + math suffix** — split points are tried from the
///    shortest prefix upwards; the prefix must be entirely Korean, the suffix
///    must contain no Korean and be recognised as math, and it must encode
///    successfully. The result is built by `build_korean_prefix_math_suffix`.
pub(super) fn split_mixed_math_word(
    word: &crate::rules::token::WordToken<'_>,
    leading_delimiter_len: usize,
    math_context: MathContext,
) -> Option<Vec<Token<'static>>> {
    if !word.meta.has_korean || word.chars.iter().all(|c| is_korean_char(*c)) {
        return None;
    }
    let chars = &word.chars;
    let len = chars.len();

    // Layout 1: math prefix followed by a Korean suffix.
    let math_prefix_result = (1..len).rev().find_map(|end| {
        let (prefix_chars, suffix_chars) = chars.split_at(end);

        // Cheap structural check first: a split whose suffix is not Korean can
        // never be accepted, so there is no point running the encoder for it.
        // NOTE(review): this assumes the encoder has no side effects worth
        // preserving for rejected splits — confirm.
        let suffix_is_korean = suffix_chars.iter().all(|c| is_korean_suffix_char(*c))
            && suffix_chars.iter().any(|c| is_korean_char(*c));
        if !suffix_is_korean {
            return None;
        }

        let bytes = try_encode_mixed_math_prefix(prefix_chars, suffix_chars, math_context)?;
        Some(build_math_prefix_replacement(
            leading_delimiter_len,
            bytes,
            suffix_chars.iter().collect(),
        ))
    });
    if math_prefix_result.is_some() {
        return math_prefix_result;
    }

    // Layout 2: Korean prefix followed by a math suffix.
    (1..len).find_map(|start| {
        let (prefix_chars, suffix_chars) = chars.split_at(start);
        let prefix_all_korean = prefix_chars.iter().all(|c| is_korean_char(*c));
        let suffix_no_korean = !suffix_chars.iter().any(|c| is_korean_char(*c));
        if !prefix_all_korean || !suffix_no_korean {
            return None;
        }

        let suffix_text: String = suffix_chars.iter().collect();
        let suffix_is_math = is_mixed_math_expression(suffix_chars, &suffix_text)
            || is_math_expression(suffix_chars, &suffix_text);
        if !suffix_is_math {
            return None;
        }

        // An encoder failure rejects this split point; the search continues.
        let bytes =
            math::encoder::encode_math_expression_with_context(&suffix_text, math_context).ok()?;
        Some(build_korean_prefix_math_suffix(
            prefix_chars.iter().collect(),
            bytes,
        ))
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rules::math::math_token_rule::MathContext;
    use crate::rules::token::{WordMeta, WordToken};
    use std::borrow::Cow;

    /// Builds an owned `WordToken` for `text`, deriving `chars` and `meta` the
    /// same way for every test.
    fn word_from(text: &str) -> WordToken<'static> {
        let chars: Vec<char> = text.chars().collect();
        WordToken {
            text: Cow::Owned(text.to_string()),
            meta: WordMeta::from_chars(&chars),
            chars,
        }
    }

    // The `let _ =` tests below are smoke tests: they only check that the call
    // completes without panicking; the returned value is not inspected.

    #[test]
    fn try_encode_math_slice_fallback_to_regular_encode() {
        for input in ["f(~)", "2\u{0305}.3010"] {
            let chars: Vec<char> = input.chars().collect();
            let _ = try_encode_math_slice(&chars, MathContext::default());
        }
    }

    #[test]
    fn try_encode_mixed_math_slice_empty_returns_none() {
        assert!(try_encode_mixed_math_slice(&[], MathContext::default()).is_none());
    }

    #[test]
    fn split_mixed_math_word_whole_word_no_split() {
        let word = word_from("한x");
        let _ = split_mixed_math_word(&word, 0, MathContext::default());
    }

    #[test]
    fn split_mixed_math_word_non_korean_suffix_continues() {
        let word = word_from("x한a");
        let _ = split_mixed_math_word(&word, 0, MathContext::default());
    }

    #[test]
    fn split_mixed_math_word_korean_prefix_math_suffix_encode_fail() {
        let word = word_from("한국x~");
        let _ = split_mixed_math_word(&word, 0, MathContext::default());
    }
}