use std::collections::HashMap;
/// The semantic role a visually ambiguous glyph can play.
///
/// Each [`HomoglyphSet`] lists the meanings its glyphs may take; the
/// disambiguator picks one of them for a concrete occurrence.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum GlyphMeaning {
    /// Multiplication operator (listed for the `x`-like and dot-like sets).
    Multiplication,
    /// A variable; the payload is the variable's name, e.g. `"x"`.
    Variable(String),
    /// Binary minus.
    Subtraction,
    /// Unary (sign) minus.
    UnaryMinus,
    /// Addition operator.
    Addition,
    /// Equals sign.
    Equals,
    /// A digit; the payload is the digit's numeric value, e.g. `Digit(0)`.
    Digit(u8),
    /// A letter carrying the character itself.
    // NOTE(review): no standard confusion set in this file produces this variant.
    Letter(char),
    /// Prime mark (one reading of the apostrophe-like set).
    Prime,
    /// Apostrophe (the other reading of the apostrophe-like set).
    Apostrophe,
    /// Quotation mark.
    // NOTE(review): no standard confusion set in this file produces this variant.
    Quote,
    /// Division operator (slash-like set).
    Division,
    /// Ratio (colon-like set).
    Ratio,
    /// Decimal point (one reading of the dot-like set).
    DecimalPoint,
    /// End of a sentence (one reading of the dot-like set).
    SentenceEnd,
    /// List/argument separator (one reading of the comma-like set).
    Separator,
    /// Decimal separator (the other reading of the comma-like set).
    DecimalSeparator,
    /// Grouping.
    // NOTE(review): no standard confusion set in this file produces this variant.
    Grouping,
    /// Function application.
    // NOTE(review): no standard confusion set in this file produces this variant.
    FunctionApplication,
    /// Fallback used when a glyph has no registered confusion set.
    Unknown,
}
/// A group of glyphs that are easily mistaken for one another, together with
/// the meanings they may carry and the glyph used to represent them all.
#[derive(Clone, Debug)]
pub struct HomoglyphSet {
    /// Every glyph that belongs to the set.
    pub glyphs: Vec<char>,
    /// The candidate meanings a glyph from this set can take.
    pub meanings: Vec<GlyphMeaning>,
    /// The glyph that members of the set are normalized to.
    pub canonical: char,
}
impl HomoglyphSet {
pub fn new(glyphs: Vec<char>, meanings: Vec<GlyphMeaning>, canonical: char) -> Self {
Self {
glyphs,
meanings,
canonical,
}
}
pub fn contains(&self, glyph: char) -> bool {
self.glyphs.contains(&glyph)
}
}
/// Information about where a glyph occurs, used to score its candidate
/// meanings. The derived `Default` yields all flags `false`, depth `0` and
/// every optional field `None`.
#[derive(Clone, Debug, Default)]
pub struct MathContext {
    /// The glyph occurs inside math mode.
    pub in_math_mode: bool,
    /// The glyph occurs inside text mode.
    pub in_text_mode: bool,
    /// The token immediately before the glyph, if any.
    pub prev_token: Option<String>,
    /// The token immediately after the glyph, if any.
    pub next_token: Option<String>,
    /// Current parenthesis nesting depth.
    pub paren_depth: i32,
    /// The preceding token was an operator.
    pub prev_was_operator: bool,
    /// The preceding token was a number.
    pub prev_was_number: bool,
    /// Mathematical domain of the surrounding content, when known.
    pub domain: Option<MathDomain>,
}
/// Broad subject area of the mathematical content being processed.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MathDomain {
    /// No specific domain.
    General,
    /// Algebra.
    Algebra,
    /// Analysis.
    Analysis,
    /// Linear algebra.
    LinearAlgebra,
    /// Number theory.
    NumberTheory,
    /// Statistics.
    Statistics,
    /// Physics.
    Physics,
    /// Computer science.
    ComputerScience,
}
/// Resolves visually ambiguous glyphs to a meaning and normalizes them to a
/// canonical form.
pub struct HomoglyphDisambiguator {
    /// Lookup from each registered glyph to the confusion set it belongs to.
    confusion_sets: HashMap<char, HomoglyphSet>,
    /// Settings that control the disambiguator's behavior.
    config: DisambiguatorConfig,
}
/// Tunable settings for [`HomoglyphDisambiguator`].
#[derive(Clone, Debug)]
pub struct DisambiguatorConfig {
    /// Weight given to contextual evidence.
    // NOTE(review): not read by any code visible in this file.
    pub context_weight: f32,
    /// Weight given to frequency evidence.
    // NOTE(review): not read by any code visible in this file.
    pub frequency_weight: f32,
    /// When `false`, `HomoglyphDisambiguator::normalize` returns its input
    /// unchanged.
    pub normalize: bool,
}
impl Default for DisambiguatorConfig {
fn default() -> Self {
Self {
context_weight: 0.7,
frequency_weight: 0.3,
normalize: true,
}
}
}
impl Default for HomoglyphDisambiguator {
fn default() -> Self {
Self::new()
}
}
impl HomoglyphDisambiguator {
pub fn new() -> Self {
let mut disambiguator = Self {
confusion_sets: HashMap::new(),
config: DisambiguatorConfig::default(),
};
disambiguator.register_standard_confusions();
disambiguator
}
pub fn with_config(config: DisambiguatorConfig) -> Self {
let mut disambiguator = Self {
confusion_sets: HashMap::new(),
config,
};
disambiguator.register_standard_confusions();
disambiguator
}
fn register_standard_confusions(&mut self) {
self.register(HomoglyphSet::new(
vec!['x', 'X', '×', '✕', '✖', '⨯'],
vec![
GlyphMeaning::Variable("x".to_string()),
GlyphMeaning::Multiplication,
],
'x',
));
self.register(HomoglyphSet::new(
vec!['-', '−', '–', '—', '‐', '‑'],
vec![GlyphMeaning::Subtraction, GlyphMeaning::UnaryMinus],
'-',
));
self.register(HomoglyphSet::new(
vec!['0', 'O', 'o', 'Ο', 'ο', '০'],
vec![
GlyphMeaning::Digit(0),
GlyphMeaning::Variable("O".to_string()),
GlyphMeaning::Variable("o".to_string()),
],
'0',
));
self.register(HomoglyphSet::new(
vec!['1', 'l', 'I', '|', 'ǀ', 'ⅼ'],
vec![
GlyphMeaning::Digit(1),
GlyphMeaning::Variable("l".to_string()),
GlyphMeaning::Variable("I".to_string()),
],
'1',
));
self.register(HomoglyphSet::new(
vec!['2', 'Z', 'z', 'Ζ', 'ζ'],
vec![
GlyphMeaning::Digit(2),
GlyphMeaning::Variable("Z".to_string()),
],
'2',
));
self.register(HomoglyphSet::new(
vec!['5', 'S', 's', 'Ѕ', 'ѕ'],
vec![
GlyphMeaning::Digit(5),
GlyphMeaning::Variable("S".to_string()),
],
'5',
));
self.register(HomoglyphSet::new(
vec!['\'', '′', 'ʹ', 'ˈ', '\u{2019}'], vec![GlyphMeaning::Prime, GlyphMeaning::Apostrophe],
'\'',
));
self.register(HomoglyphSet::new(
vec!['/', '÷', '∕', '⁄'],
vec![GlyphMeaning::Division],
'/',
));
self.register(HomoglyphSet::new(
vec![':', '∶', '꞉'],
vec![GlyphMeaning::Ratio],
':',
));
self.register(HomoglyphSet::new(
vec!['.', '·', '⋅', '∙'],
vec![
GlyphMeaning::DecimalPoint,
GlyphMeaning::Multiplication,
GlyphMeaning::SentenceEnd,
],
'.',
));
self.register(HomoglyphSet::new(
vec![',', ','],
vec![GlyphMeaning::Separator, GlyphMeaning::DecimalSeparator],
',',
));
self.register(HomoglyphSet::new(
vec!['+', '+', '➕'],
vec![GlyphMeaning::Addition],
'+',
));
self.register(HomoglyphSet::new(
vec!['=', '=', '⩵', '₌'],
vec![GlyphMeaning::Equals],
'=',
));
self.register(HomoglyphSet::new(
vec!['A', 'Α'], vec![GlyphMeaning::Variable("A".to_string())],
'A',
));
self.register(HomoglyphSet::new(
vec!['B', 'Β', 'В'], vec![GlyphMeaning::Variable("B".to_string())],
'B',
));
self.register(HomoglyphSet::new(
vec!['E', 'Ε', 'Е'], vec![GlyphMeaning::Variable("E".to_string())],
'E',
));
self.register(HomoglyphSet::new(
vec!['H', 'Η', 'Н'], vec![GlyphMeaning::Variable("H".to_string())],
'H',
));
self.register(HomoglyphSet::new(
vec!['K', 'Κ', 'К'], vec![GlyphMeaning::Variable("K".to_string())],
'K',
));
self.register(HomoglyphSet::new(
vec!['M', 'Μ', 'М'], vec![GlyphMeaning::Variable("M".to_string())],
'M',
));
self.register(HomoglyphSet::new(
vec!['N', 'Ν'], vec![GlyphMeaning::Variable("N".to_string())],
'N',
));
self.register(HomoglyphSet::new(
vec!['P', 'Ρ', 'Р'], vec![GlyphMeaning::Variable("P".to_string())],
'P',
));
self.register(HomoglyphSet::new(
vec!['T', 'Τ', 'Т'], vec![GlyphMeaning::Variable("T".to_string())],
'T',
));
self.register(HomoglyphSet::new(
vec!['Y', 'Υ', 'У'], vec![GlyphMeaning::Variable("Y".to_string())],
'Y',
));
}
pub fn register(&mut self, set: HomoglyphSet) {
for &glyph in &set.glyphs {
self.confusion_sets.insert(glyph, set.clone());
}
}
pub fn is_ambiguous(&self, c: char) -> bool {
self.confusion_sets.contains_key(&c)
}
pub fn get_confusion_set(&self, c: char) -> Option<&HomoglyphSet> {
self.confusion_sets.get(&c)
}
pub fn disambiguate(&self, glyph: char, context: &MathContext) -> GlyphMeaning {
let Some(set) = self.confusion_sets.get(&glyph) else {
return GlyphMeaning::Unknown;
};
if set.meanings.len() == 1 {
return set.meanings[0].clone();
}
self.disambiguate_with_context(glyph, set, context)
}
fn disambiguate_with_context(
&self,
glyph: char,
set: &HomoglyphSet,
context: &MathContext,
) -> GlyphMeaning {
let mut best_meaning = set.meanings[0].clone();
let mut best_score = 0.0f32;
for meaning in &set.meanings {
let score = self.score_meaning(glyph, meaning, context);
if score > best_score {
best_score = score;
best_meaning = meaning.clone();
}
}
best_meaning
}
fn score_meaning(&self, glyph: char, meaning: &GlyphMeaning, context: &MathContext) -> f32 {
let mut score = 0.5;
match meaning {
GlyphMeaning::Multiplication => {
if context.prev_was_number {
score += 0.3;
}
if context.in_math_mode {
score += 0.2;
}
if glyph == '×' || glyph == '⋅' || glyph == '∙' {
score += 0.4; }
}
GlyphMeaning::Variable(_) => {
if context.prev_was_operator {
score += 0.3;
}
if glyph.is_alphabetic() {
score += 0.2;
}
}
GlyphMeaning::Subtraction => {
if context.prev_was_number {
score += 0.4;
}
}
GlyphMeaning::UnaryMinus => {
if context.prev_was_operator || context.prev_token.is_none() {
score += 0.4;
}
}
GlyphMeaning::Digit(_) => {
if let Some(ref prev) = context.prev_token {
if prev.chars().all(|c| c.is_ascii_digit() || c == '.') {
score += 0.4;
}
}
}
GlyphMeaning::DecimalPoint => {
if context.prev_was_number {
if let Some(ref next) = context.next_token {
if next
.chars()
.next()
.map(|c| c.is_ascii_digit())
.unwrap_or(false)
{
score += 0.5;
}
}
}
}
GlyphMeaning::Prime => {
if context.in_math_mode && !context.prev_was_number && !context.prev_was_operator {
score += 0.4;
}
}
_ => {}
}
score
}
pub fn normalize(&self, input: &str) -> String {
if !self.config.normalize {
return input.to_string();
}
input
.chars()
.map(|c| {
if let Some(set) = self.confusion_sets.get(&c) {
set.canonical
} else {
c
}
})
.collect()
}
pub fn get_confusables(&self, c: char) -> Vec<char> {
self.confusion_sets
.get(&c)
.map(|set| set.glyphs.clone())
.unwrap_or_default()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The standard confusion sets are registered on construction.
    #[test]
    fn test_disambiguator_creation() {
        let disambiguator = HomoglyphDisambiguator::new();
        for glyph in ['x', '×', '-', '0'] {
            assert!(disambiguator.is_ambiguous(glyph));
        }
    }

    /// Looking up a member glyph yields the whole set.
    #[test]
    fn test_get_confusion_set() {
        let disambiguator = HomoglyphDisambiguator::new();
        let set = disambiguator
            .get_confusion_set('x')
            .expect("layers/mathml/homoglyph.rs: required value was None/Err");
        assert!(set.contains('×'));
    }

    /// In math mode, an `x` directly after a number reads as multiplication.
    #[test]
    fn test_disambiguate_x_after_number() {
        let disambiguator = HomoglyphDisambiguator::new();
        let context = MathContext {
            in_math_mode: true,
            prev_was_number: true,
            ..Default::default()
        };
        let meaning = disambiguator.disambiguate('x', &context);
        // Pin the exact outcome: accepting either meaning would assert nothing.
        assert_eq!(meaning, GlyphMeaning::Multiplication);
    }

    /// An `x` directly after an operator reads as the variable.
    #[test]
    fn test_disambiguate_x_after_operator() {
        let disambiguator = HomoglyphDisambiguator::new();
        let context = MathContext {
            in_math_mode: true,
            prev_was_operator: true,
            ..Default::default()
        };
        let meaning = disambiguator.disambiguate('x', &context);
        assert_eq!(meaning, GlyphMeaning::Variable("x".to_string()));
    }

    /// `-` is binary after a number and unary after an operator.
    #[test]
    fn test_disambiguate_minus() {
        let disambiguator = HomoglyphDisambiguator::new();

        let after_number = MathContext {
            prev_was_number: true,
            ..Default::default()
        };
        assert_eq!(
            disambiguator.disambiguate('-', &after_number),
            GlyphMeaning::Subtraction
        );

        let after_operator = MathContext {
            prev_was_operator: true,
            ..Default::default()
        };
        assert_eq!(
            disambiguator.disambiguate('-', &after_operator),
            GlyphMeaning::UnaryMinus
        );
    }

    /// Lookalike glyphs are rewritten to the canonical glyph of their set.
    #[test]
    fn test_normalize() {
        let disambiguator = HomoglyphDisambiguator::new();
        assert_eq!(disambiguator.normalize("2×3"), "2x3");
        assert_eq!(disambiguator.normalize("a−b"), "a-b");
    }

    #[test]
    fn test_get_confusables() {
        let disambiguator = HomoglyphDisambiguator::new();
        let confusables = disambiguator.get_confusables('x');
        assert!(confusables.contains(&'×'));
        assert!(confusables.contains(&'X'));
    }

    #[test]
    fn test_non_ambiguous_char() {
        let disambiguator = HomoglyphDisambiguator::new();
        assert!(!disambiguator.is_ambiguous('q'));
        assert!(disambiguator.get_confusion_set('q').is_none());
    }

    /// Greek capital alpha shares a set with Latin `A`.
    #[test]
    fn test_greek_latin_confusion() {
        let disambiguator = HomoglyphDisambiguator::new();
        assert!(disambiguator.is_ambiguous('Α'));
        let set = disambiguator
            .get_confusion_set('Α')
            .expect("layers/mathml/homoglyph.rs: required value was None/Err");
        assert!(set.contains('A'));
    }

    #[test]
    fn test_zero_o_confusion() {
        let disambiguator = HomoglyphDisambiguator::new();
        let set = disambiguator
            .get_confusion_set('0')
            .expect("layers/mathml/homoglyph.rs: required value was None/Err");
        assert!(set.contains('O'));
        assert!(set.contains('o'));
    }

    #[test]
    fn test_homoglyph_set() {
        let set = HomoglyphSet::new(
            vec!['a', 'ɑ', 'α'],
            vec![GlyphMeaning::Variable("a".to_string())],
            'a',
        );
        assert!(set.contains('a'));
        assert!(set.contains('α'));
        assert!(!set.contains('b'));
        assert_eq!(set.canonical, 'a');
    }

    /// With normalization disabled the input is returned untouched.
    #[test]
    fn test_config() {
        let config = DisambiguatorConfig {
            normalize: false,
            ..Default::default()
        };
        let disambiguator = HomoglyphDisambiguator::with_config(config);
        assert_eq!(disambiguator.normalize("2×3"), "2×3");
    }
}