#[allow(clippy::too_many_lines)]
pub fn latex_to_unicode(input: &str) -> String {
use std::fmt::Write as _;
let mut out = String::with_capacity(input.len());
let chars: Vec<char> = input.chars().collect();
let len = chars.len();
let mut i = 0;
while i < len {
match chars[i] {
'\\' => {
let start = i + 1;
let mut end = start;
while end < len && chars[end].is_ascii_alphabetic() {
end += 1;
}
if end == start {
if start < len {
match chars[start] {
'{' | '}' | '!' => {
i = start + 1;
continue;
}
'\\' => {
out.push('\n');
i = start + 1;
continue;
}
',' => {
out.push('\u{2009}');
i = start + 1;
continue;
}
';' => {
out.push(' ');
i = start + 1;
continue;
}
_ => {
out.push(chars[start]);
i = start + 1;
continue;
}
}
}
i += 1;
continue;
}
let cmd: String = chars[start..end].iter().collect();
i = end;
if let Some(sym) = command_to_unicode(&cmd) {
out.push_str(sym);
} else if cmd == "frac" {
let num = extract_brace_group(&chars, &mut i);
let den = extract_brace_group(&chars, &mut i);
let num_u = latex_to_unicode(&num);
let den_u = latex_to_unicode(&den);
if num_u.len() <= 1 && den_u.len() <= 1 {
let _ = write!(out, "{num_u}/{den_u}");
} else {
let _ = write!(out, "({num_u})/({den_u})");
}
} else if cmd == "sqrt" {
let body = extract_brace_group(&chars, &mut i);
let body_u = latex_to_unicode(&body);
let _ = write!(out, "√({body_u})");
} else if cmd == "text"
|| cmd == "mathrm"
|| cmd == "mathbf"
|| cmd == "mathit"
|| cmd == "mathbb"
|| cmd == "mathcal"
|| cmd == "operatorname"
{
let body = extract_brace_group(&chars, &mut i);
out.push_str(&latex_to_unicode(&body));
} else if cmd == "left" || cmd == "right" {
if i < len {
match chars[i] {
'(' | ')' | '[' | ']' | '|' => {
out.push(chars[i]);
i += 1;
}
'\\' if i + 1 < len => {
if chars[i + 1] == '{' {
out.push('(');
} else if chars[i + 1] == '}' {
out.push(')');
} else if chars[i + 1] == '|' {
out.push('‖');
}
i += 2;
}
'.' => {
i += 1;
} _ => {}
}
}
} else if cmd == "begin" || cmd == "end" {
let _env = extract_brace_group(&chars, &mut i);
} else {
out.push_str(&cmd);
}
}
'{' | '}' => {
i += 1;
}
'^' => {
i += 1;
let group = extract_script_arg(&chars, &mut i);
let converted = latex_to_unicode(&group);
for ch in converted.chars() {
out.push(to_superscript(ch));
}
}
'_' => {
i += 1;
let group = extract_script_arg(&chars, &mut i);
let converted = latex_to_unicode(&group);
for ch in converted.chars() {
out.push(to_subscript(ch));
}
}
'~' => {
out.push(' ');
i += 1;
}
_ => {
out.push(chars[i]);
i += 1;
}
}
}
out
}
/// Reads the argument of a `^` or `_` starting at `*pos` and advances `*pos`
/// past it.
///
/// Leading spaces are skipped first. The argument is then one of:
/// - a brace group, returned without its outer braces (via `extract_brace_group`);
/// - a backslash followed by a (possibly empty) run of ASCII letters, returned
///   verbatim including the backslash;
/// - any other single character.
///
/// Returns an empty string when only spaces (or nothing) remain.
fn extract_script_arg(chars: &[char], pos: &mut usize) -> String {
    // Skip the blanks that may separate the script marker from its argument.
    *pos += chars
        .iter()
        .skip(*pos)
        .take_while(|&&c| c == ' ')
        .count();

    match chars.get(*pos).copied() {
        None => String::new(),
        Some('{') => extract_brace_group(chars, pos),
        Some('\\') => {
            let name_len = chars[*pos + 1..]
                .iter()
                .take_while(|c| c.is_ascii_alphabetic())
                .count();
            // The token spans the backslash plus the letters that follow it.
            let token: String = chars[*pos..=*pos + name_len].iter().collect();
            *pos += name_len + 1;
            token
        }
        Some(single) => {
            *pos += 1;
            single.to_string()
        }
    }
}
/// Extracts the contents of the `{…}` group starting at `*pos`.
///
/// Leading spaces are skipped. If the next character is not `{` (or the input
/// is exhausted) an empty string is returned and `*pos` is left just after the
/// skipped spaces. Otherwise the text between the opening brace and its
/// matching closing brace is returned, nested groups included verbatim, and
/// `*pos` is moved past the closing brace. An unterminated group yields
/// everything up to the end of the input.
///
/// A backslash escapes the character after it, so `\{` and `\}` inside the
/// group do not take part in brace matching.
fn extract_brace_group(chars: &[char], pos: &mut usize) -> String {
    while chars.get(*pos) == Some(&' ') {
        *pos += 1;
    }
    if chars.get(*pos) != Some(&'{') {
        return String::new();
    }

    *pos += 1; // step over the opening brace
    let start = *pos;
    let mut depth = 1_usize;

    while *pos < chars.len() {
        match chars[*pos] {
            '\\' => {
                // Skip the escaped character as well, so an escaped brace such
                // as the one in `\left\{` cannot unbalance the group. `min`
                // guards against a backslash at the very end of the input.
                *pos = (*pos + 2).min(chars.len());
                continue;
            }
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    // Leave `*pos` on the matching brace; it marks the end of
                    // the contents and is stepped over below.
                    break;
                }
            }
            _ => {}
        }
        *pos += 1;
    }

    let end = *pos;
    if *pos < chars.len() {
        *pos += 1; // step over the closing brace
    }
    chars[start..end].iter().collect()
}
/// Looks up the Unicode replacement for a LaTeX command name (given without
/// the leading backslash).
///
/// Returns `None` when the name is not in the table. Names are matched
/// exactly and case-sensitively. Several aliases map to the same symbol, and
/// `mathbb` maps to the empty string.
#[allow(clippy::too_many_lines)]
fn command_to_unicode(cmd: &str) -> Option<&'static str> {
    /// `(command name, replacement)` pairs; every name appears exactly once.
    const SYMBOLS: &[(&str, &str)] = &[
        // Lowercase Greek.
        ("alpha", "α"),
        ("beta", "β"),
        ("gamma", "γ"),
        ("delta", "δ"),
        ("epsilon", "ε"),
        ("varepsilon", "ε"),
        ("zeta", "ζ"),
        ("eta", "η"),
        ("theta", "θ"),
        ("vartheta", "θ"),
        ("iota", "ι"),
        ("kappa", "κ"),
        ("lambda", "λ"),
        ("mu", "μ"),
        ("nu", "ν"),
        ("xi", "ξ"),
        ("pi", "π"),
        ("rho", "ρ"),
        ("varrho", "ρ"),
        ("sigma", "σ"),
        ("varsigma", "ς"),
        ("tau", "τ"),
        ("upsilon", "υ"),
        ("phi", "φ"),
        ("varphi", "φ"),
        ("chi", "χ"),
        ("psi", "ψ"),
        ("omega", "ω"),
        // Uppercase Greek.
        ("Gamma", "Γ"),
        ("Delta", "Δ"),
        ("Theta", "Θ"),
        ("Lambda", "Λ"),
        ("Xi", "Ξ"),
        ("Pi", "Π"),
        ("Sigma", "Σ"),
        ("Upsilon", "Υ"),
        ("Phi", "Φ"),
        ("Psi", "Ψ"),
        ("Omega", "Ω"),
        // Large operators.
        ("sum", "∑"),
        ("prod", "∏"),
        ("int", "∫"),
        ("iint", "∬"),
        ("iiint", "∭"),
        ("oint", "∮"),
        ("coprod", "∐"),
        ("bigcup", "⋃"),
        ("bigcap", "⋂"),
        ("bigoplus", "⊕"),
        ("oplus", "⊕"),
        ("bigotimes", "⊗"),
        ("otimes", "⊗"),
        // Relations.
        ("leq", "≤"),
        ("le", "≤"),
        ("geq", "≥"),
        ("ge", "≥"),
        ("neq", "≠"),
        ("ne", "≠"),
        ("approx", "≈"),
        ("equiv", "≡"),
        ("sim", "∼"),
        ("simeq", "≃"),
        ("cong", "≅"),
        ("propto", "∝"),
        ("ll", "≪"),
        ("gg", "≫"),
        // Sets and quantifiers.
        ("subset", "⊂"),
        ("supset", "⊃"),
        ("subseteq", "⊆"),
        ("supseteq", "⊇"),
        ("in", "∈"),
        ("notin", "∉"),
        ("ni", "∋"),
        ("forall", "∀"),
        ("exists", "∃"),
        ("nexists", "∄"),
        ("emptyset", "∅"),
        ("varnothing", "∅"),
        // Arrows.
        ("to", "→"),
        ("rightarrow", "→"),
        ("leftarrow", "←"),
        ("leftrightarrow", "↔"),
        ("Rightarrow", "⇒"),
        ("Leftarrow", "⇐"),
        ("Leftrightarrow", "⇔"),
        ("uparrow", "↑"),
        ("downarrow", "↓"),
        ("mapsto", "↦"),
        ("hookrightarrow", "↪"),
        ("longrightarrow", "⟶"),
        ("longleftarrow", "⟵"),
        ("Longrightarrow", "⟹"),
        ("implies", "⟹"),
        ("iff", "⟺"),
        // Calculus, arithmetic and logic.
        ("infty", "∞"),
        ("partial", "∂"),
        ("nabla", "∇"),
        ("pm", "±"),
        ("mp", "∓"),
        ("times", "×"),
        ("div", "÷"),
        ("cdot", "·"),
        ("star", "⋆"),
        ("ast", "∗"),
        ("circ", "∘"),
        ("bullet", "•"),
        ("dagger", "†"),
        ("ddagger", "‡"),
        ("neg", "¬"),
        ("lnot", "¬"),
        ("land", "∧"),
        ("wedge", "∧"),
        ("lor", "∨"),
        ("vee", "∨"),
        ("cap", "∩"),
        ("cup", "∪"),
        // Ellipses.
        ("ldots", "…"),
        ("dots", "…"),
        ("cdots", "⋯"),
        ("vdots", "⋮"),
        ("ddots", "⋱"),
        // Geometry and letter-like symbols.
        ("angle", "∠"),
        ("measuredangle", "∡"),
        ("perp", "⊥"),
        ("parallel", "∥"),
        ("hbar", "ℏ"),
        ("ell", "ℓ"),
        ("Re", "ℜ"),
        ("Im", "ℑ"),
        ("aleph", "ℵ"),
        ("wp", "℘"),
        // Font switch that contributes no text of its own, and spacing.
        ("mathbb", ""),
        ("quad", " "),
        ("qquad", " "),
        // Delimiters.
        ("langle", "⟨"),
        ("rangle", "⟩"),
        ("lceil", "⌈"),
        ("rceil", "⌉"),
        ("lfloor", "⌊"),
        ("rfloor", "⌋"),
    ];

    SYMBOLS
        .iter()
        .find(|(name, _)| *name == cmd)
        .map(|&(_, symbol)| symbol)
}
/// Maps a character to its Unicode superscript form.
///
/// Covers the digits, `+`, `-` (ASCII hyphen and the minus sign), `=`, the
/// parentheses, the lowercase Latin letters listed below and uppercase `T`.
/// Any character without an entry is returned unchanged.
fn to_superscript(ch: char) -> char {
    match ch {
        '0' => '⁰',
        '1' => '¹',
        '2' => '²',
        '3' => '³',
        '4' => '⁴',
        '5' => '⁵',
        '6' => '⁶',
        '7' => '⁷',
        '8' => '⁸',
        '9' => '⁹',
        '+' => '⁺',
        '-' | '−' => '⁻',
        '=' => '⁼',
        '(' => '⁽',
        ')' => '⁾',
        'n' => 'ⁿ',
        'i' => 'ⁱ',
        // `j` is a common index letter and is handled by `to_subscript`, so it
        // is mapped here too (U+02B2) instead of falling through unchanged.
        'j' => 'ʲ',
        'x' => 'ˣ',
        'y' => 'ʸ',
        'a' => 'ᵃ',
        'b' => 'ᵇ',
        'c' => 'ᶜ',
        'd' => 'ᵈ',
        'e' => 'ᵉ',
        'f' => 'ᶠ',
        'g' => 'ᵍ',
        'h' => 'ʰ',
        'k' => 'ᵏ',
        'l' => 'ˡ',
        'm' => 'ᵐ',
        'o' => 'ᵒ',
        'p' => 'ᵖ',
        'r' => 'ʳ',
        's' => 'ˢ',
        't' => 'ᵗ',
        'u' => 'ᵘ',
        'v' => 'ᵛ',
        'w' => 'ʷ',
        'z' => 'ᶻ',
        'T' => 'ᵀ',
        // No entry: keep the character as is.
        _ => ch,
    }
}
/// Maps a character to its Unicode subscript form.
///
/// Covers the digits, `+`, `-` (ASCII hyphen and the minus sign), `=`, the
/// parentheses and the lowercase Latin letters listed in the table. Any
/// character without an entry is returned unchanged.
fn to_subscript(ch: char) -> char {
    /// `(plain, subscript)` pairs.
    const SUBSCRIPTS: &[(char, char)] = &[
        ('0', '₀'),
        ('1', '₁'),
        ('2', '₂'),
        ('3', '₃'),
        ('4', '₄'),
        ('5', '₅'),
        ('6', '₆'),
        ('7', '₇'),
        ('8', '₈'),
        ('9', '₉'),
        ('+', '₊'),
        ('-', '₋'),
        ('−', '₋'),
        ('=', '₌'),
        ('(', '₍'),
        (')', '₎'),
        ('a', 'ₐ'),
        ('e', 'ₑ'),
        ('h', 'ₕ'),
        ('i', 'ᵢ'),
        ('j', 'ⱼ'),
        ('k', 'ₖ'),
        ('l', 'ₗ'),
        ('m', 'ₘ'),
        ('n', 'ₙ'),
        ('o', 'ₒ'),
        ('p', 'ₚ'),
        ('r', 'ᵣ'),
        ('s', 'ₛ'),
        ('t', 'ₜ'),
        ('u', 'ᵤ'),
        ('v', 'ᵥ'),
        ('x', 'ₓ'),
    ];

    SUBSCRIPTS
        .iter()
        .find(|&&(plain, _)| plain == ch)
        .map_or(ch, |&(_, lowered)| lowered)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fails with `"{what}: {result}"` unless `result` contains `needle`.
    #[track_caller]
    fn assert_has(result: &str, needle: char, what: &str) {
        assert!(result.contains(needle), "{what}: {result}");
    }

    /// A single-character exponent becomes a superscript glyph.
    #[test]
    fn simple_superscript() {
        let rendered = latex_to_unicode("E = mc^2");
        assert_eq!(rendered, "E = mc²");
    }

    /// A single-character index becomes a subscript glyph.
    #[test]
    fn simple_subscript() {
        let rendered = latex_to_unicode("x_i");
        assert_eq!(rendered, "xᵢ");
    }

    /// Named Greek letters are replaced; surrounding text is untouched.
    #[test]
    fn greek_letters() {
        let rendered = latex_to_unicode(r"\alpha + \beta");
        assert_eq!(rendered, "α + β");
    }

    /// Fractions are inlined, with parentheses for multi-character sides.
    #[test]
    fn fraction() {
        let cases = [
            (r"\frac{a}{b}", "a/b"),
            (r"\frac{x+1}{y-1}", "(x+1)/(y-1)"),
        ];
        for (source, expected) in cases {
            assert_eq!(latex_to_unicode(source), expected);
        }
    }

    /// Square roots wrap their argument in parentheses.
    #[test]
    fn sqrt() {
        let rendered = latex_to_unicode(r"\sqrt{x}");
        assert_eq!(rendered, "√(x)");
    }

    /// The summation sign and its upper limit both survive conversion.
    #[test]
    fn sum_with_limits() {
        let result = latex_to_unicode(r"\sum_{i=1}^{n} x_i");
        assert_has(&result, '∑', "should contain sum symbol");
        assert!(
            result.chars().any(|c| matches!(c, 'ₙ' | 'ⁿ')),
            "should have n: {result}"
        );
    }

    /// Commands inside a braced exponent are converted as well.
    #[test]
    fn euler_identity() {
        let result = latex_to_unicode(r"e^{i\pi} + 1 = 0");
        assert_has(&result, 'π', "should contain pi");
        assert_has(&result, 'ⁱ', "should have superscript i");
    }

    /// Limits given as a bare digit and as a command are both accepted.
    #[test]
    fn integral() {
        let result = latex_to_unicode(r"\int_0^\infty e^{-x} dx");
        assert_has(&result, '∫', "should contain integral");
        assert_has(&result, '∞', "should contain infinity");
    }

    /// Operators mixed with font commands and a fraction.
    #[test]
    fn nabla_and_partial() {
        let result = latex_to_unicode(r"\nabla \cdot \mathbf{E} = \frac{\rho}{\varepsilon_0}");
        assert_has(&result, '∇', "should contain nabla");
        assert_has(&result, '·', "should contain cdot");
    }

    /// Commands without a mapping keep their name in the output.
    #[test]
    fn unknown_command_passes_through() {
        let result = latex_to_unicode(r"\unknowncmd{x}");
        assert!(
            result.contains("unknowncmd"),
            "unknown command should pass through: {result}"
        );
    }

    /// Empty input yields empty output.
    #[test]
    fn empty_input() {
        let rendered = latex_to_unicode("");
        assert_eq!(rendered, "");
    }
}