// markdown-tui-explorer 1.5.0
//
// A terminal-based markdown file browser and viewer with search, syntax highlighting, and live reload.
//! Best-effort LaTeX math → Unicode text conversion.
//!
//! Converts common LaTeX math commands to their Unicode equivalents so
//! display and inline math render readably in a terminal without a full
//! typesetting engine.  The output is plain text — no rasterisation, no
//! external dependencies.
//!
//! Coverage: Greek letters, common operators and relations, arrows,
//! `\frac`, `\sqrt`, super/subscripts (digits + a few letters),
//! delimiter sizing commands (`\left`, `\right`), and ellipses
//! (`\cdots`, etc.).  Unknown commands pass through stripped of their
//! backslash so the result is always at least as readable as raw LaTeX.

/// Convert a LaTeX math expression to approximate Unicode text.
///
/// # Examples
///
/// ```ignore
/// assert_eq!(latex_to_unicode(r"E = mc^2"), "E = mc²");
/// assert_eq!(latex_to_unicode(r"\sum_{i=1}^{n} x_i"), "∑ᵢ₌₁ⁿ xᵢ");
/// ```
pub fn latex_to_unicode(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let chars: Vec<char> = input.chars().collect();
    let len = chars.len();
    let mut i = 0;

    while i < len {
        match chars[i] {
            '\\' => {
                // Collect the command name (letters only).
                let start = i + 1;
                let mut end = start;
                while end < len && chars[end].is_ascii_alphabetic() {
                    end += 1;
                }
                if end == start {
                    // Single non-alpha char after backslash: \{, \}, \\, etc.
                    if start < len {
                        match chars[start] {
                            '{' | '}' => { i = start + 1; continue; }
                            '\\' => { out.push('\n'); i = start + 1; continue; }
                            ',' => { out.push('\u{2009}'); i = start + 1; continue; } // thin space
                            ';' => { out.push(' '); i = start + 1; continue; }
                            '!' => { i = start + 1; continue; } // negative thin space → skip
                            _ => { out.push(chars[start]); i = start + 1; continue; }
                        }
                    }
                    i += 1;
                    continue;
                }
                let cmd: String = chars[start..end].iter().collect();
                i = end;

                // Try to map the command.
                if let Some(sym) = command_to_unicode(&cmd) {
                    out.push_str(sym);
                } else if cmd == "frac" {
                    let num = extract_brace_group(&chars, &mut i);
                    let den = extract_brace_group(&chars, &mut i);
                    let num_u = latex_to_unicode(&num);
                    let den_u = latex_to_unicode(&den);
                    if num_u.len() <= 1 && den_u.len() <= 1 {
                        out.push_str(&format!("{num_u}/{den_u}"));
                    } else {
                        out.push_str(&format!("({num_u})/({den_u})"));
                    }
                } else if cmd == "sqrt" {
                    let body = extract_brace_group(&chars, &mut i);
                    let body_u = latex_to_unicode(&body);
                    out.push_str(&format!("√({body_u})"));
                } else if cmd == "text" || cmd == "mathrm" || cmd == "mathbf"
                    || cmd == "mathit" || cmd == "mathbb" || cmd == "mathcal"
                    || cmd == "operatorname"
                {
                    let body = extract_brace_group(&chars, &mut i);
                    out.push_str(&latex_to_unicode(&body));
                } else if cmd == "left" || cmd == "right" {
                    // Consume the delimiter character that follows.
                    if i < len {
                        match chars[i] {
                            '(' | ')' | '[' | ']' | '|' => {
                                out.push(chars[i]);
                                i += 1;
                            }
                            '\\' if i + 1 < len => {
                                // \left\{ or \right\}
                                if chars[i + 1] == '{' { out.push('('); }
                                else if chars[i + 1] == '}' { out.push(')'); }
                                else if chars[i + 1] == '|' { out.push(''); }
                                i += 2;
                            }
                            '.' => { i += 1; } // \left. or \right. = invisible delimiter
                            _ => {}
                        }
                    }
                } else if cmd == "begin" || cmd == "end" {
                    // Skip \begin{env} / \end{env}.
                    let _env = extract_brace_group(&chars, &mut i);
                } else {
                    // Unknown command — emit the name without the backslash.
                    out.push_str(&cmd);
                }
            }
            '{' | '}' => {
                // Bare braces used for grouping — skip.
                i += 1;
            }
            '^' => {
                i += 1;
                let group = extract_script_arg(&chars, &mut i);
                let converted = latex_to_unicode(&group);
                for ch in converted.chars() {
                    out.push(to_superscript(ch));
                }
            }
            '_' => {
                i += 1;
                let group = extract_script_arg(&chars, &mut i);
                let converted = latex_to_unicode(&group);
                for ch in converted.chars() {
                    out.push(to_subscript(ch));
                }
            }
            '~' => {
                out.push(' ');
                i += 1;
            }
            _ => {
                out.push(chars[i]);
                i += 1;
            }
        }
    }
    out
}

/// Extract the argument of a `^` or `_` script operator.
///
/// Handles three forms:
/// - `{…}` brace group
/// - `\command` (a backslash followed by alphabetic chars)
/// - A single character
fn extract_script_arg(chars: &[char], pos: &mut usize) -> String {
    // Skip whitespace.
    while *pos < chars.len() && chars[*pos] == ' ' {
        *pos += 1;
    }
    if *pos >= chars.len() {
        return String::new();
    }
    if chars[*pos] == '{' {
        extract_brace_group(chars, pos)
    } else if chars[*pos] == '\\' {
        // Capture the entire \command as the script argument.
        let start = *pos;
        *pos += 1; // skip '\'
        while *pos < chars.len() && chars[*pos].is_ascii_alphabetic() {
            *pos += 1;
        }
        chars[start..*pos].iter().collect()
    } else {
        let c = chars[*pos];
        *pos += 1;
        c.to_string()
    }
}

/// Extract a `{…}` brace-delimited group starting at `chars[*pos]`.
///
/// Leading spaces are skipped first (and stay consumed even when no group
/// follows).  If the next character is not `{`, an empty string is returned.
/// Otherwise the text between the opening brace and its matching closing
/// brace is returned, with nested groups kept verbatim, and `*pos` is
/// advanced past the closing `}`.
///
/// A backslash-escaped character (`\{`, `\}`, `\\`, …) is copied as-is and
/// never counted as a group delimiter.  If the group is not closed, the
/// rest of the input is returned and `*pos` ends at `chars.len()`.
fn extract_brace_group(chars: &[char], pos: &mut usize) -> String {
    // Skip whitespace before the brace.
    while chars.get(*pos) == Some(&' ') {
        *pos += 1;
    }
    if chars.get(*pos) != Some(&'{') {
        return String::new();
    }

    let start = *pos + 1; // first character after '{'
    let mut cursor = start;
    let mut depth: usize = 1;
    // Unterminated group: the body runs to the end of the input.
    let mut end = chars.len();

    while cursor < chars.len() {
        match chars[cursor] {
            // Step over the escaped character (together with the increment
            // below) so that `\{` and `\}` do not affect the nesting depth.
            '\\' => cursor += 1,
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    end = cursor;
                    break;
                }
            }
            _ => {}
        }
        cursor += 1;
    }

    // Resume after the closing brace, or at the end if there was none.
    *pos = if end < chars.len() { end + 1 } else { chars.len() };
    chars[start..end].iter().collect()
}

/// Map a LaTeX command name (without backslash) to a Unicode string.
///
/// Returns `None` for names that are not in the table.  Only commands that
/// stand for a fixed symbol or fixed spacing belong here; commands that
/// take arguments (`frac`, `sqrt`, `mathbb`, …) are deliberately absent so
/// that an entry in this table cannot shadow the caller's own handling.
fn command_to_unicode(cmd: &str) -> Option<&'static str> {
    Some(match cmd {
        // Greek lowercase
        "alpha" => "α", "beta" => "β", "gamma" => "γ", "delta" => "δ",
        "epsilon" | "varepsilon" => "ε", "zeta" => "ζ", "eta" => "η",
        "theta" | "vartheta" => "θ", "iota" => "ι", "kappa" => "κ",
        "lambda" => "λ", "mu" => "μ", "nu" => "ν", "xi" => "ξ",
        "pi" => "π", "rho" | "varrho" => "ρ", "sigma" => "σ",
        "varsigma" => "ς", "tau" => "τ", "upsilon" => "υ",
        "phi" | "varphi" => "φ", "chi" => "χ", "psi" => "ψ", "omega" => "ω",
        // Greek uppercase
        "Gamma" => "Γ", "Delta" => "Δ", "Theta" => "Θ", "Lambda" => "Λ",
        "Xi" => "Ξ", "Pi" => "Π", "Sigma" => "Σ", "Upsilon" => "Υ",
        "Phi" => "Φ", "Psi" => "Ψ", "Omega" => "Ω",
        // Operators
        "sum" => "∑", "prod" => "∏", "int" => "∫", "iint" => "∬",
        "iiint" => "∭", "oint" => "∮", "coprod" => "∐",
        "bigcup" => "⋃", "bigcap" => "⋂", "bigoplus" => "⨁",
        "bigotimes" => "⨂",
        // Relations
        "leq" | "le" => "≤", "geq" | "ge" => "≥", "neq" | "ne" => "≠",
        "approx" => "≈", "equiv" => "≡", "sim" => "∼", "simeq" => "≃",
        "cong" => "≅", "propto" => "∝", "ll" => "≪", "gg" => "≫",
        "subset" => "⊂", "supset" => "⊃", "subseteq" => "⊆",
        "supseteq" => "⊇", "in" => "∈", "notin" => "∉",
        "ni" => "∋", "forall" => "∀", "exists" => "∃",
        "nexists" => "∄", "emptyset" | "varnothing" => "∅",
        // Arrows
        "to" | "rightarrow" => "→", "leftarrow" => "←",
        "leftrightarrow" => "↔", "Rightarrow" => "⇒",
        "Leftarrow" => "⇐", "Leftrightarrow" => "⇔",
        "uparrow" => "↑", "downarrow" => "↓",
        "mapsto" => "↦", "hookrightarrow" => "↪",
        "longrightarrow" => "⟶", "longleftarrow" => "⟵",
        "Longrightarrow" => "⟹", "implies" => "⟹", "iff" => "⟺",
        // Miscellaneous
        "infty" => "∞", "partial" => "∂", "nabla" => "∇",
        "pm" => "±", "mp" => "∓", "times" => "×", "div" => "÷",
        "cdot" => "·", "star" => "⋆", "ast" => "∗",
        "circ" => "∘", "bullet" => "•", "oplus" => "⊕",
        "otimes" => "⊗", "dagger" => "†", "ddagger" => "‡",
        "neg" | "lnot" => "¬", "land" | "wedge" => "∧",
        "lor" | "vee" => "∨",
        "cap" => "∩", "cup" => "∪",
        "ldots" | "dots" => "…", "cdots" => "⋯", "vdots" => "⋮", "ddots" => "⋱",
        "angle" => "∠", "measuredangle" => "∡",
        "perp" => "⊥", "parallel" => "∥",
        "hbar" => "ℏ", "ell" => "ℓ", "Re" => "ℜ", "Im" => "ℑ",
        "aleph" => "ℵ", "wp" => "℘",
        // Spacing
        "quad" => "  ", "qquad" => "    ",
        // Paired delimiters
        "langle" => "⟨", "rangle" => "⟩",
        "lceil" => "⌈", "rceil" => "⌉",
        "lfloor" => "⌊", "rfloor" => "⌋",
        _ => return None,
    })
}

/// Best-effort mapping of a character to its Unicode superscript form.
///
/// Covers the digits, `+ - = ( )`, the lowercase Latin letters for which a
/// superscript glyph is listed below, and capital `T` (for transposes).
/// Both the ASCII hyphen and the Unicode minus sign (U+2212) map to the
/// superscript minus.  Characters without an entry are returned unchanged.
fn to_superscript(ch: char) -> char {
    match ch {
        '0' => '⁰', '1' => '¹', '2' => '²', '3' => '³', '4' => '⁴',
        '5' => '⁵', '6' => '⁶', '7' => '⁷', '8' => '⁸', '9' => '⁹',
        '+' => '⁺', '-' | '\u{2212}' => '⁻', '=' => '⁼', '(' => '⁽', ')' => '⁾',
        'n' => 'ⁿ', 'i' => 'ⁱ', 'j' => 'ʲ', 'x' => 'ˣ', 'y' => 'ʸ',
        'a' => 'ᵃ', 'b' => 'ᵇ', 'c' => 'ᶜ', 'd' => 'ᵈ', 'e' => 'ᵉ',
        'f' => 'ᶠ', 'g' => 'ᵍ', 'h' => 'ʰ', 'k' => 'ᵏ', 'l' => 'ˡ',
        'm' => 'ᵐ', 'o' => 'ᵒ', 'p' => 'ᵖ', 'r' => 'ʳ', 's' => 'ˢ',
        't' => 'ᵗ', 'u' => 'ᵘ', 'v' => 'ᵛ', 'w' => 'ʷ', 'z' => 'ᶻ',
        'T' => 'ᵀ',
        _ => ch, // no superscript form available — pass through
    }
}

/// Best-effort mapping of a character to its Unicode subscript form.
///
/// Covers the digits, `+ - = ( )` and the lowercase Latin letters for which
/// a subscript glyph is listed below.  Both the ASCII hyphen and the
/// Unicode minus sign (U+2212) map to the subscript minus.  Characters
/// without an entry are returned unchanged.
fn to_subscript(ch: char) -> char {
    match ch {
        '0' => '₀', '1' => '₁', '2' => '₂', '3' => '₃', '4' => '₄',
        '5' => '₅', '6' => '₆', '7' => '₇', '8' => '₈', '9' => '₉',
        '+' => '₊', '-' | '\u{2212}' => '₋', '=' => '₌', '(' => '₍', ')' => '₎',
        'a' => 'ₐ', 'e' => 'ₑ', 'h' => 'ₕ', 'i' => 'ᵢ', 'j' => 'ⱼ',
        'k' => 'ₖ', 'l' => 'ₗ', 'm' => 'ₘ', 'n' => 'ₙ', 'o' => 'ₒ',
        'p' => 'ₚ', 'r' => 'ᵣ', 's' => 'ₛ', 't' => 'ₜ', 'u' => 'ᵤ',
        'v' => 'ᵥ', 'x' => 'ₓ',
        _ => ch, // no subscript form available — pass through
    }
}

// Unit tests for the public conversion entry point.  Exact-match
// assertions are used where the output is fully determined; the remaining
// tests only check that the key symbols appear in the result.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn simple_superscript() {
        assert_eq!(latex_to_unicode("E = mc^2"), "E = mc²");
    }

    #[test]
    fn simple_subscript() {
        assert_eq!(latex_to_unicode("x_i"), "xᵢ");
    }

    #[test]
    fn greek_letters() {
        assert_eq!(latex_to_unicode(r"\alpha + \beta"), "α + β");
    }

    #[test]
    fn fraction() {
        assert_eq!(latex_to_unicode(r"\frac{a}{b}"), "a/b");
        assert_eq!(latex_to_unicode(r"\frac{x+1}{y-1}"), "(x+1)/(y-1)");
    }

    #[test]
    fn sqrt() {
        assert_eq!(latex_to_unicode(r"\sqrt{x}"), "√(x)");
    }

    #[test]
    fn sum_with_limits() {
        let result = latex_to_unicode(r"\sum_{i=1}^{n} x_i");
        assert!(result.contains('∑'), "should contain sum symbol: {result}");
        assert!(result.contains('ⁿ') || result.contains('ₙ'), "should have n: {result}");
    }

    #[test]
    fn euler_identity() {
        let result = latex_to_unicode(r"e^{i\pi} + 1 = 0");
        assert!(result.contains('π'), "should contain pi: {result}");
        assert!(result.contains('ⁱ'), "should have superscript i: {result}");
    }

    #[test]
    fn integral() {
        let result = latex_to_unicode(r"\int_0^\infty e^{-x} dx");
        assert!(result.contains('∫'), "should contain integral: {result}");
        assert!(result.contains('∞'), "should contain infinity: {result}");
    }

    #[test]
    fn nabla_and_partial() {
        let result = latex_to_unicode(r"\nabla \cdot \mathbf{E} = \frac{\rho}{\varepsilon_0}");
        assert!(result.contains('∇'), "should contain nabla: {result}");
        assert!(result.contains('·'), "should contain cdot: {result}");
    }

    #[test]
    fn unknown_command_passes_through() {
        let result = latex_to_unicode(r"\unknowncmd{x}");
        assert!(result.contains("unknowncmd"), "unknown command should pass through: {result}");
    }

    #[test]
    fn empty_input() {
        assert_eq!(latex_to_unicode(""), "");
    }
}