pub fn latex_to_unicode(input: &str) -> String {
let mut out = String::with_capacity(input.len());
let chars: Vec<char> = input.chars().collect();
let len = chars.len();
let mut i = 0;
while i < len {
match chars[i] {
'\\' => {
let start = i + 1;
let mut end = start;
while end < len && chars[end].is_ascii_alphabetic() {
end += 1;
}
if end == start {
if start < len {
match chars[start] {
'{' | '}' => { i = start + 1; continue; }
'\\' => { out.push('\n'); i = start + 1; continue; }
',' => { out.push('\u{2009}'); i = start + 1; continue; } ';' => { out.push(' '); i = start + 1; continue; }
'!' => { i = start + 1; continue; } _ => { out.push(chars[start]); i = start + 1; continue; }
}
}
i += 1;
continue;
}
let cmd: String = chars[start..end].iter().collect();
i = end;
if let Some(sym) = command_to_unicode(&cmd) {
out.push_str(sym);
} else if cmd == "frac" {
let num = extract_brace_group(&chars, &mut i);
let den = extract_brace_group(&chars, &mut i);
let num_u = latex_to_unicode(&num);
let den_u = latex_to_unicode(&den);
if num_u.len() <= 1 && den_u.len() <= 1 {
out.push_str(&format!("{num_u}/{den_u}"));
} else {
out.push_str(&format!("({num_u})/({den_u})"));
}
} else if cmd == "sqrt" {
let body = extract_brace_group(&chars, &mut i);
let body_u = latex_to_unicode(&body);
out.push_str(&format!("√({body_u})"));
} else if cmd == "text" || cmd == "mathrm" || cmd == "mathbf"
|| cmd == "mathit" || cmd == "mathbb" || cmd == "mathcal"
|| cmd == "operatorname"
{
let body = extract_brace_group(&chars, &mut i);
out.push_str(&latex_to_unicode(&body));
} else if cmd == "left" || cmd == "right" {
if i < len {
match chars[i] {
'(' | ')' | '[' | ']' | '|' => {
out.push(chars[i]);
i += 1;
}
'\\' if i + 1 < len => {
if chars[i + 1] == '{' { out.push('('); }
else if chars[i + 1] == '}' { out.push(')'); }
else if chars[i + 1] == '|' { out.push('‖'); }
i += 2;
}
'.' => { i += 1; } _ => {}
}
}
} else if cmd == "begin" || cmd == "end" {
let _env = extract_brace_group(&chars, &mut i);
} else {
out.push_str(&cmd);
}
}
'{' | '}' => {
i += 1;
}
'^' => {
i += 1;
let group = extract_script_arg(&chars, &mut i);
let converted = latex_to_unicode(&group);
for ch in converted.chars() {
out.push(to_superscript(ch));
}
}
'_' => {
i += 1;
let group = extract_script_arg(&chars, &mut i);
let converted = latex_to_unicode(&group);
for ch in converted.chars() {
out.push(to_subscript(ch));
}
}
'~' => {
out.push(' ');
i += 1;
}
_ => {
out.push(chars[i]);
i += 1;
}
}
}
out
}
/// Reads the argument of a `^` or `_` operator starting at `*pos`.
///
/// Leading spaces are skipped. The argument is then one of:
/// * a brace group, returned without its outer braces (via `extract_brace_group`);
/// * a backslash followed by any run of ASCII letters, returned including the backslash;
/// * a single character.
///
/// `*pos` is advanced past whatever was consumed. An empty string is returned when the input
/// is exhausted.
fn extract_script_arg(chars: &[char], pos: &mut usize) -> String {
    let blanks = chars
        .get(*pos..)
        .map_or(0, |rest| rest.iter().take_while(|&&c| c == ' ').count());
    *pos += blanks;

    match chars.get(*pos) {
        None => String::new(),
        Some('{') => extract_brace_group(chars, pos),
        Some('\\') => {
            let name_len = chars[*pos + 1..]
                .iter()
                .take_while(|c| c.is_ascii_alphabetic())
                .count();
            let token_end = *pos + 1 + name_len;
            let token: String = chars[*pos..token_end].iter().collect();
            *pos = token_end;
            token
        }
        Some(&single) => {
            *pos += 1;
            single.to_string()
        }
    }
}
/// Extracts the contents of the `{...}` group that starts at `*pos`.
///
/// Leading spaces are skipped (and stay consumed even when no group follows). If the next
/// character is not `{`, an empty string is returned. Otherwise the text between the opening
/// brace and its matching closing brace is returned, nested groups included verbatim, and
/// `*pos` is left just past the closing brace.
///
/// A backslash escapes the character after it, so `\{` and `\}` are literal braces that do
/// not take part in nesting. An unterminated group yields everything up to the end of input.
fn extract_brace_group(chars: &[char], pos: &mut usize) -> String {
    while chars.get(*pos) == Some(&' ') {
        *pos += 1;
    }
    if chars.get(*pos) != Some(&'{') {
        return String::new();
    }

    let start = *pos + 1;
    let mut cursor = start;
    let mut depth = 1usize;
    let mut close = None;

    while cursor < chars.len() {
        match chars[cursor] {
            // Skip the escaped character so `\{` / `\}` never change the depth.
            '\\' => cursor += 1,
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    close = Some(cursor);
                    break;
                }
            }
            _ => {}
        }
        cursor += 1;
    }

    match close {
        Some(end) => {
            *pos = end + 1;
            chars[start..end].iter().collect()
        }
        None => {
            *pos = chars.len();
            chars[start..].iter().collect()
        }
    }
}
/// Maps a LaTeX command name (without the leading backslash) to its Unicode rendering.
///
/// Lookup is case-sensitive. Returns `None` for names that have no entry in the table.
fn command_to_unicode(cmd: &str) -> Option<&'static str> {
    let symbol = match cmd {
        // Lowercase Greek.
        "alpha" => "α",
        "beta" => "β",
        "gamma" => "γ",
        "delta" => "δ",
        "epsilon" | "varepsilon" => "ε",
        "zeta" => "ζ",
        "eta" => "η",
        "theta" | "vartheta" => "θ",
        "iota" => "ι",
        "kappa" => "κ",
        "lambda" => "λ",
        "mu" => "μ",
        "nu" => "ν",
        "xi" => "ξ",
        "pi" => "π",
        "rho" | "varrho" => "ρ",
        "sigma" => "σ",
        "varsigma" => "ς",
        "tau" => "τ",
        "upsilon" => "υ",
        "phi" | "varphi" => "φ",
        "chi" => "χ",
        "psi" => "ψ",
        "omega" => "ω",

        // Uppercase Greek.
        "Gamma" => "Γ",
        "Delta" => "Δ",
        "Theta" => "Θ",
        "Lambda" => "Λ",
        "Xi" => "Ξ",
        "Pi" => "Π",
        "Sigma" => "Σ",
        "Upsilon" => "Υ",
        "Phi" => "Φ",
        "Psi" => "Ψ",
        "Omega" => "Ω",

        // Big operators.
        "sum" => "∑",
        "prod" => "∏",
        "int" => "∫",
        "iint" => "∬",
        "iiint" => "∭",
        "oint" => "∮",
        "coprod" => "∐",
        "bigcup" => "⋃",
        "bigcap" => "⋂",
        "bigoplus" => "⊕",
        "bigotimes" => "⊗",

        // Relations.
        "leq" | "le" => "≤",
        "geq" | "ge" => "≥",
        "neq" | "ne" => "≠",
        "approx" => "≈",
        "equiv" => "≡",
        "sim" => "∼",
        "simeq" => "≃",
        "cong" => "≅",
        "propto" => "∝",
        "ll" => "≪",
        "gg" => "≫",

        // Sets and quantifiers.
        "subset" => "⊂",
        "supset" => "⊃",
        "subseteq" => "⊆",
        "supseteq" => "⊇",
        "in" => "∈",
        "notin" => "∉",
        "ni" => "∋",
        "forall" => "∀",
        "exists" => "∃",
        "nexists" => "∄",
        "emptyset" | "varnothing" => "∅",

        // Arrows.
        "to" | "rightarrow" => "→",
        "leftarrow" => "←",
        "leftrightarrow" => "↔",
        "Rightarrow" => "⇒",
        "Leftarrow" => "⇐",
        "Leftrightarrow" => "⇔",
        "uparrow" => "↑",
        "downarrow" => "↓",
        "mapsto" => "↦",
        "hookrightarrow" => "↪",
        "longrightarrow" => "⟶",
        "longleftarrow" => "⟵",
        "Longrightarrow" => "⟹",
        "implies" => "⟹",
        "iff" => "⟺",

        // Calculus and binary operators.
        "infty" => "∞",
        "partial" => "∂",
        "nabla" => "∇",
        "pm" => "±",
        "mp" => "∓",
        "times" => "×",
        "div" => "÷",
        "cdot" => "·",
        "star" => "⋆",
        "ast" => "∗",
        "circ" => "∘",
        "bullet" => "•",
        "oplus" => "⊕",
        "otimes" => "⊗",
        "dagger" => "†",
        "ddagger" => "‡",

        // Logic.
        "neg" | "lnot" => "¬",
        "land" | "wedge" => "∧",
        "lor" | "vee" => "∨",
        "cap" => "∩",
        "cup" => "∪",

        // Dots.
        "ldots" | "dots" => "…",
        "cdots" => "⋯",
        "vdots" => "⋮",
        "ddots" => "⋱",

        // Geometry.
        "angle" => "∠",
        "measuredangle" => "∡",
        "perp" => "⊥",
        "parallel" => "∥",

        // Letter-like symbols.
        "hbar" => "ℏ",
        "ell" => "ℓ",
        "Re" => "ℜ",
        "Im" => "ℑ",
        "aleph" => "ℵ",
        "wp" => "℘",

        // Commands that render as nothing or as spacing.
        "mathbb" => "",
        "quad" => " ",
        "qquad" => " ",

        // Paired delimiters.
        "langle" => "⟨",
        "rangle" => "⟩",
        "lceil" => "⌈",
        "rceil" => "⌉",
        "lfloor" => "⌊",
        "rfloor" => "⌋",

        _ => return None,
    };
    Some(symbol)
}
/// Maps a character to its Unicode superscript form, if one is listed here.
///
/// Covers digits, `+`, `-` (ASCII hyphen or the minus sign), `=`, parentheses, the
/// lowercase Latin letters except `q`, and uppercase `T`. Any other character is
/// returned unchanged.
fn to_superscript(ch: char) -> char {
    match ch {
        '0' => '⁰',
        '1' => '¹',
        '2' => '²',
        '3' => '³',
        '4' => '⁴',
        '5' => '⁵',
        '6' => '⁶',
        '7' => '⁷',
        '8' => '⁸',
        '9' => '⁹',
        '+' => '⁺',
        '-' | '−' => '⁻',
        '=' => '⁼',
        '(' => '⁽',
        ')' => '⁾',
        'a' => 'ᵃ',
        'b' => 'ᵇ',
        'c' => 'ᶜ',
        'd' => 'ᵈ',
        'e' => 'ᵉ',
        'f' => 'ᶠ',
        'g' => 'ᵍ',
        'h' => 'ʰ',
        'i' => 'ⁱ',
        // U+02B2; previously missing even though `to_subscript` handles `j`.
        'j' => 'ʲ',
        'k' => 'ᵏ',
        'l' => 'ˡ',
        'm' => 'ᵐ',
        'n' => 'ⁿ',
        'o' => 'ᵒ',
        'p' => 'ᵖ',
        'r' => 'ʳ',
        's' => 'ˢ',
        't' => 'ᵗ',
        'u' => 'ᵘ',
        'v' => 'ᵛ',
        'w' => 'ʷ',
        'x' => 'ˣ',
        'y' => 'ʸ',
        'z' => 'ᶻ',
        'T' => 'ᵀ',
        _ => ch,
    }
}
/// Maps a character to its Unicode subscript form, if one is listed here.
///
/// Characters without an entry in the table are returned unchanged.
fn to_subscript(ch: char) -> char {
    // (baseline character, subscript character)
    const SUBSCRIPTS: &[(char, char)] = &[
        ('0', '₀'),
        ('1', '₁'),
        ('2', '₂'),
        ('3', '₃'),
        ('4', '₄'),
        ('5', '₅'),
        ('6', '₆'),
        ('7', '₇'),
        ('8', '₈'),
        ('9', '₉'),
        ('+', '₊'),
        ('-', '₋'),
        ('−', '₋'),
        ('=', '₌'),
        ('(', '₍'),
        (')', '₎'),
        ('a', 'ₐ'),
        ('e', 'ₑ'),
        ('h', 'ₕ'),
        ('i', 'ᵢ'),
        ('j', 'ⱼ'),
        ('k', 'ₖ'),
        ('l', 'ₗ'),
        ('m', 'ₘ'),
        ('n', 'ₙ'),
        ('o', 'ₒ'),
        ('p', 'ₚ'),
        ('r', 'ᵣ'),
        ('s', 'ₛ'),
        ('t', 'ₜ'),
        ('u', 'ᵤ'),
        ('v', 'ᵥ'),
        ('x', 'ₓ'),
    ];

    SUBSCRIPTS
        .iter()
        .find(|&&(plain, _)| plain == ch)
        .map_or(ch, |&(_, lowered)| lowered)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare digit after `^` becomes a superscript digit.
    #[test]
    fn simple_superscript() {
        let result = latex_to_unicode("E = mc^2");
        assert_eq!(result, "E = mc²");
    }

    /// A bare letter after `_` becomes a subscript letter.
    #[test]
    fn simple_subscript() {
        let result = latex_to_unicode("x_i");
        assert_eq!(result, "xᵢ");
    }

    /// Greek-letter commands are replaced; surrounding text is untouched.
    #[test]
    fn greek_letters() {
        let result = latex_to_unicode(r"\alpha + \beta");
        assert_eq!(result, "α + β");
    }

    /// Single-character operands stay bare; longer ones are parenthesised.
    #[test]
    fn fraction() {
        let cases = [
            (r"\frac{a}{b}", "a/b"),
            (r"\frac{x+1}{y-1}", "(x+1)/(y-1)"),
        ];
        for (latex, expected) in cases {
            assert_eq!(latex_to_unicode(latex), expected);
        }
    }

    /// The radicand is wrapped in parentheses after the root sign.
    #[test]
    fn sqrt() {
        let result = latex_to_unicode(r"\sqrt{x}");
        assert_eq!(result, "√(x)");
    }

    /// Limits on a sum keep the sum symbol and produce a scripted `n`.
    #[test]
    fn sum_with_limits() {
        let result = latex_to_unicode(r"\sum_{i=1}^{n} x_i");
        let has_scripted_n = result.chars().any(|c| matches!(c, 'ₙ' | 'ⁿ'));
        assert!(result.contains('∑'), "should contain sum symbol: {result}");
        assert!(has_scripted_n, "should have n: {result}");
    }

    /// Commands inside a braced exponent are converted as well.
    #[test]
    fn euler_identity() {
        let result = latex_to_unicode(r"e^{i\pi} + 1 = 0");
        assert!(result.contains('π'), "should contain pi: {result}");
        assert!(result.contains('ⁱ'), "should have superscript i: {result}");
    }

    /// A command may be used directly as a script argument without braces.
    #[test]
    fn integral() {
        let result = latex_to_unicode(r"\int_0^\infty e^{-x} dx");
        assert!(result.contains('∫'), "should contain integral: {result}");
        assert!(result.contains('∞'), "should contain infinity: {result}");
    }

    /// Symbols, font commands and fractions can be mixed in one expression.
    #[test]
    fn nabla_and_partial() {
        let result = latex_to_unicode(r"\nabla \cdot \mathbf{E} = \frac{\rho}{\varepsilon_0}");
        assert!(result.contains('∇'), "should contain nabla: {result}");
        assert!(result.contains('·'), "should contain cdot: {result}");
    }

    /// Unrecognised commands keep their name in the output.
    #[test]
    fn unknown_command_passes_through() {
        let result = latex_to_unicode(r"\unknowncmd{x}");
        assert!(result.contains("unknowncmd"), "unknown command should pass through: {result}");
    }

    /// Empty input yields empty output.
    #[test]
    fn empty_input() {
        let result = latex_to_unicode("");
        assert_eq!(result, "");
    }
}