Skip to main content

streamdown_plugin/
latex.rs

//! LaTeX to Unicode conversion plugin.
//!
//! Converts LaTeX math expressions to Unicode, both in `$$ ... $$` blocks
//! (which may span several lines) and in inline `$ ... $` spans.
//!
//! # Supported conversions
//!
//! - Greek letters: `\alpha` → α, `\beta` → β, etc.
//! - Operators: `\sum` → Σ, `\prod` → Π, `\int` → ∫
//! - Relations: `\leq` → ≤, `\geq` → ≥, `\neq` → ≠
//! - Subscripts: `x_1` → x₁, `x_{10}` → x₁₀
//! - Superscripts: `x^2` → x², `x^{10}` → x¹⁰
//! - Fractions: `\frac{a}{b}` → (a/b)
//! - Common symbols: `\infty` → ∞, `\pm` → ±, etc.
14
15use crate::{Plugin, ProcessResult};
16use regex::Regex;
17use std::collections::HashMap;
18use std::sync::LazyLock;
19use streamdown_config::ComputedStyle;
20use streamdown_core::state::ParseState;
21
/// LaTeX plugin for converting math to Unicode.
///
/// The plugin is stateful: while a `$$` block is open it accumulates the
/// block's text across lines and only converts it once the closing `$$`
/// arrives.
#[derive(Debug)]
pub struct LatexPlugin {
    /// Whether we're currently inside an unterminated `$$` block.
    in_block: bool,
    /// Text collected so far for the open multi-line expression.
    buffer: String,
}
29
30impl LatexPlugin {
31    /// Create a new LaTeX plugin.
32    pub fn new() -> Self {
33        Self {
34            in_block: false,
35            buffer: String::new(),
36        }
37    }
38}
39
40impl Default for LatexPlugin {
41    fn default() -> Self {
42        Self::new()
43    }
44}
45
46impl Plugin for LatexPlugin {
47    fn name(&self) -> &str {
48        "latex"
49    }
50
51    fn process_line(
52        &mut self,
53        line: &str,
54        _state: &ParseState,
55        _style: &ComputedStyle,
56    ) -> Option<ProcessResult> {
57        // Handle inline $...$ first (single line)
58        if !self.in_block && line.contains('$') && !line.contains("$$") {
59            // Check for inline math
60            let converted = convert_inline_math(line);
61            if converted != line {
62                return Some(ProcessResult::Lines(vec![converted]));
63            }
64        }
65
66        // Check for $$ delimiters
67        if !self.in_block {
68            if let Some(idx) = line.find("$$") {
69                self.in_block = true;
70                self.buffer.clear();
71
72                // Get content after opening $$
73                let after = &line[idx + 2..];
74
75                // Check if closing $$ is on same line
76                if let Some(end_idx) = after.find("$$") {
77                    // Single line expression
78                    self.in_block = false;
79                    let expr = &after[..end_idx];
80                    let converted = latex_to_unicode(expr);
81                    return Some(ProcessResult::Lines(vec![converted]));
82                }
83
84                // Multi-line: start buffering
85                self.buffer.push_str(after);
86                return Some(ProcessResult::Continue);
87            }
88            return None;
89        }
90
91        // We're in a block, looking for closing $$
92        if let Some(idx) = line.find("$$") {
93            // Found closing delimiter
94            self.in_block = false;
95            self.buffer.push_str(&line[..idx]);
96
97            let converted = latex_to_unicode(&self.buffer);
98            self.buffer.clear();
99
100            return Some(ProcessResult::Lines(vec![converted]));
101        }
102
103        // Continue buffering
104        if !self.buffer.is_empty() {
105            self.buffer.push(' ');
106        }
107        self.buffer.push_str(line);
108        Some(ProcessResult::Continue)
109    }
110
111    fn flush(&mut self) -> Option<Vec<String>> {
112        if self.buffer.is_empty() {
113            return None;
114        }
115
116        // Return unconverted buffer if stream ended mid-block
117        let result = std::mem::take(&mut self.buffer);
118        self.in_block = false;
119        Some(vec![format!("$$ {} (incomplete)", result)])
120    }
121
122    fn reset(&mut self) {
123        self.in_block = false;
124        self.buffer.clear();
125    }
126
127    fn is_active(&self) -> bool {
128        self.in_block
129    }
130
131    fn priority(&self) -> i32 {
132        10 // Lower priority than most plugins
133    }
134}
135
136/// Convert inline math ($...$) in a line.
137fn convert_inline_math(line: &str) -> String {
138    static INLINE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\$([^$]+)\$").unwrap());
139
140    INLINE_RE
141        .replace_all(line, |caps: &regex::Captures| latex_to_unicode(&caps[1]))
142        .to_string()
143}
144
145/// Convert LaTeX expression to Unicode.
146pub fn latex_to_unicode(latex: &str) -> String {
147    let mut result = latex.to_string();
148
149    // Apply conversions in order
150    result = convert_commands(&result);
151    result = convert_fractions(&result);
152    result = convert_subscripts(&result);
153    result = convert_superscripts(&result);
154    result = cleanup(&result);
155
156    result
157}
158
/// Lookup table from Greek-letter command names (without the leading
/// backslash) to the corresponding Unicode character.
static GREEK_LETTERS: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        // Lowercase (including the `var*` variants)
        ("alpha", "α"),
        ("beta", "β"),
        ("gamma", "γ"),
        ("delta", "δ"),
        ("epsilon", "ε"),
        ("varepsilon", "ε"),
        ("zeta", "ζ"),
        ("eta", "η"),
        ("theta", "θ"),
        ("vartheta", "ϑ"),
        ("iota", "ι"),
        ("kappa", "κ"),
        ("lambda", "λ"),
        ("mu", "μ"),
        ("nu", "ν"),
        ("xi", "ξ"),
        ("omicron", "ο"),
        ("pi", "π"),
        ("varpi", "ϖ"),
        ("rho", "ρ"),
        ("varrho", "ϱ"),
        ("sigma", "σ"),
        ("varsigma", "ς"),
        ("tau", "τ"),
        ("upsilon", "υ"),
        ("phi", "φ"),
        ("varphi", "ϕ"),
        ("chi", "χ"),
        ("psi", "ψ"),
        ("omega", "ω"),
        // Uppercase
        ("Gamma", "Γ"),
        ("Delta", "Δ"),
        ("Theta", "Θ"),
        ("Lambda", "Λ"),
        ("Xi", "Ξ"),
        ("Pi", "Π"),
        ("Sigma", "Σ"),
        ("Upsilon", "Υ"),
        ("Phi", "Φ"),
        ("Psi", "Ψ"),
        ("Omega", "Ω"),
    ])
});
207
/// Lookup table from operator command names (without the leading backslash)
/// to the corresponding Unicode character.
static OPERATORS: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        // Big operators and calculus
        ("sum", "Σ"),
        ("prod", "Π"),
        ("int", "∫"),
        ("iint", "∬"),
        ("iiint", "∭"),
        ("oint", "∮"),
        ("partial", "∂"),
        ("nabla", "∇"),
        // Roots
        ("sqrt", "√"),
        ("cbrt", "∛"),
        // Binary operators
        ("times", "×"),
        ("div", "÷"),
        ("cdot", "·"),
        ("ast", "∗"),
        ("star", "⋆"),
        ("circ", "∘"),
        ("bullet", "•"),
        // Circled operators
        ("oplus", "⊕"),
        ("ominus", "⊖"),
        ("otimes", "⊗"),
        ("oslash", "⊘"),
        ("odot", "⊙"),
    ])
});
235
/// Lookup table from relation command names (without the leading backslash)
/// to the corresponding Unicode character. Set-membership symbols and
/// quantifiers live here as well.
static RELATIONS: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        // Ordering and equality
        ("leq", "≤"),
        ("le", "≤"),
        ("geq", "≥"),
        ("ge", "≥"),
        ("neq", "≠"),
        ("ne", "≠"),
        ("approx", "≈"),
        ("equiv", "≡"),
        ("sim", "∼"),
        ("simeq", "≃"),
        ("cong", "≅"),
        ("propto", "∝"),
        ("ll", "≪"),
        ("gg", "≫"),
        // Sets
        ("subset", "⊂"),
        ("supset", "⊃"),
        ("subseteq", "⊆"),
        ("supseteq", "⊇"),
        ("in", "∈"),
        ("notin", "∉"),
        ("ni", "∋"),
        // Quantifiers
        ("forall", "∀"),
        ("exists", "∃"),
        ("nexists", "∄"),
    ])
});
265
/// Lookup table from miscellaneous symbol command names (without the leading
/// backslash) to the corresponding Unicode character.
static SYMBOLS: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    HashMap::from([
        ("infty", "∞"),
        ("pm", "±"),
        ("mp", "∓"),
        // Arrows
        ("to", "→"),
        ("rightarrow", "→"),
        ("leftarrow", "←"),
        ("leftrightarrow", "↔"),
        ("Rightarrow", "⇒"),
        ("Leftarrow", "⇐"),
        ("Leftrightarrow", "⇔"),
        ("uparrow", "↑"),
        ("downarrow", "↓"),
        ("mapsto", "↦"),
        // Dots
        ("ldots", "…"),
        ("cdots", "⋯"),
        ("vdots", "⋮"),
        ("ddots", "⋱"),
        ("therefore", "∴"),
        ("because", "∵"),
        // Geometry
        ("angle", "∠"),
        ("perp", "⊥"),
        ("parallel", "∥"),
        ("triangle", "△"),
        ("square", "□"),
        ("diamond", "◇"),
        // Sets and logic
        ("emptyset", "∅"),
        ("varnothing", "∅"),
        ("neg", "¬"),
        ("lnot", "¬"),
        ("land", "∧"),
        ("wedge", "∧"),
        ("lor", "∨"),
        ("vee", "∨"),
        ("cap", "∩"),
        ("cup", "∪"),
        ("setminus", "∖"),
        // Letter-like symbols
        ("aleph", "ℵ"),
        ("hbar", "ℏ"),
        ("ell", "ℓ"),
        ("Re", "ℜ"),
        ("Im", "ℑ"),
        ("wp", "℘"),
        ("prime", "′"),
        ("degree", "°"),
    ])
});
315
/// Lookup table from a plain character to its Unicode subscript form.
///
/// Despite the name it covers more than digits: `+ - = ( )` and the Latin
/// lowercase letters listed below are included too. Characters that are not
/// listed have no entry.
static SUBSCRIPT_DIGITS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    HashMap::from([
        // Digits
        ('0', '₀'),
        ('1', '₁'),
        ('2', '₂'),
        ('3', '₃'),
        ('4', '₄'),
        ('5', '₅'),
        ('6', '₆'),
        ('7', '₇'),
        ('8', '₈'),
        ('9', '₉'),
        // Signs and parentheses
        ('+', '₊'),
        ('-', '₋'),
        ('=', '₌'),
        ('(', '₍'),
        (')', '₎'),
        // Letters
        ('a', 'ₐ'),
        ('e', 'ₑ'),
        ('h', 'ₕ'),
        ('i', 'ᵢ'),
        ('j', 'ⱼ'),
        ('k', 'ₖ'),
        ('l', 'ₗ'),
        ('m', 'ₘ'),
        ('n', 'ₙ'),
        ('o', 'ₒ'),
        ('p', 'ₚ'),
        ('r', 'ᵣ'),
        ('s', 'ₛ'),
        ('t', 'ₜ'),
        ('u', 'ᵤ'),
        ('v', 'ᵥ'),
        ('x', 'ₓ'),
    ])
});
353
/// Lookup table from a plain character to its Unicode superscript form.
///
/// Covers the digits, `+ - = ( )`, and the Latin lowercase letters except
/// `q`. Characters that are not listed have no entry.
static SUPERSCRIPT_CHARS: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
    HashMap::from([
        // Digits
        ('0', '⁰'),
        ('1', '¹'),
        ('2', '²'),
        ('3', '³'),
        ('4', '⁴'),
        ('5', '⁵'),
        ('6', '⁶'),
        ('7', '⁷'),
        ('8', '⁸'),
        ('9', '⁹'),
        // Signs and parentheses
        ('+', '⁺'),
        ('-', '⁻'),
        ('=', '⁼'),
        ('(', '⁽'),
        (')', '⁾'),
        // Letters
        ('a', 'ᵃ'),
        ('b', 'ᵇ'),
        ('c', 'ᶜ'),
        ('d', 'ᵈ'),
        ('e', 'ᵉ'),
        ('f', 'ᶠ'),
        ('g', 'ᵍ'),
        ('h', 'ʰ'),
        ('i', 'ⁱ'),
        ('j', 'ʲ'),
        ('k', 'ᵏ'),
        ('l', 'ˡ'),
        ('m', 'ᵐ'),
        ('n', 'ⁿ'),
        ('o', 'ᵒ'),
        ('p', 'ᵖ'),
        ('r', 'ʳ'),
        ('s', 'ˢ'),
        ('t', 'ᵗ'),
        ('u', 'ᵘ'),
        ('v', 'ᵛ'),
        ('w', 'ʷ'),
        ('x', 'ˣ'),
        ('y', 'ʸ'),
        ('z', 'ᶻ'),
    ])
});
399
400/// Convert LaTeX commands (\alpha, \sum, etc.).
401fn convert_commands(input: &str) -> String {
402    static CMD_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\\([a-zA-Z]+)").unwrap());
403
404    CMD_RE
405        .replace_all(input, |caps: &regex::Captures| {
406            let cmd = &caps[1];
407
408            // Check each mapping
409            if let Some(s) = GREEK_LETTERS.get(cmd) {
410                return (*s).to_string();
411            }
412            if let Some(s) = OPERATORS.get(cmd) {
413                return (*s).to_string();
414            }
415            if let Some(s) = RELATIONS.get(cmd) {
416                return (*s).to_string();
417            }
418            if let Some(s) = SYMBOLS.get(cmd) {
419                return (*s).to_string();
420            }
421
422            // Unknown command, keep original
423            format!("\\{}", cmd)
424        })
425        .to_string()
426}
427
428/// Convert fractions \frac{a}{b} → a/b.
429fn convert_fractions(input: &str) -> String {
430    static FRAC_RE: LazyLock<Regex> =
431        LazyLock::new(|| Regex::new(r"\\frac\{([^}]*)\}\{([^}]*)\}").unwrap());
432
433    FRAC_RE
434        .replace_all(input, |caps: &regex::Captures| {
435            let num = &caps[1];
436            let den = &caps[2];
437            format!("({}/{})", num, den)
438        })
439        .to_string()
440}
441
442/// Convert subscripts x_1 → x₁, x_{10} → x₁₀.
443fn convert_subscripts(input: &str) -> String {
444    // First handle braced subscripts: x_{abc}
445    static BRACED_SUB_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"_\{([^}]+)\}").unwrap());
446
447    let result = BRACED_SUB_RE
448        .replace_all(input, |caps: &regex::Captures| {
449            let content = &caps[1];
450            to_subscript(content)
451        })
452        .to_string();
453
454    // Then handle single-char subscripts: x_1
455    static SINGLE_SUB_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"_([0-9a-z])").unwrap());
456
457    SINGLE_SUB_RE
458        .replace_all(&result, |caps: &regex::Captures| {
459            let c = caps[1].chars().next().unwrap();
460            SUBSCRIPT_DIGITS
461                .get(&c)
462                .map(|&s| s.to_string())
463                .unwrap_or_else(|| format!("_{}", c))
464        })
465        .to_string()
466}
467
468/// Convert superscripts x^2 → x², x^{10} → x¹⁰.
469fn convert_superscripts(input: &str) -> String {
470    // First handle braced superscripts: x^{abc}
471    static BRACED_SUP_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\^\{([^}]+)\}").unwrap());
472
473    let result = BRACED_SUP_RE
474        .replace_all(input, |caps: &regex::Captures| {
475            let content = &caps[1];
476            to_superscript(content)
477        })
478        .to_string();
479
480    // Then handle single-char superscripts: x^2
481    static SINGLE_SUP_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\^([0-9a-z])").unwrap());
482
483    SINGLE_SUP_RE
484        .replace_all(&result, |caps: &regex::Captures| {
485            let c = caps[1].chars().next().unwrap();
486            SUPERSCRIPT_CHARS
487                .get(&c)
488                .map(|&s| s.to_string())
489                .unwrap_or_else(|| format!("^{}", c))
490        })
491        .to_string()
492}
493
494/// Convert string to subscript.
495fn to_subscript(s: &str) -> String {
496    s.chars()
497        .map(|c| SUBSCRIPT_DIGITS.get(&c).copied().unwrap_or(c))
498        .collect()
499}
500
501/// Convert string to superscript.
502fn to_superscript(s: &str) -> String {
503    s.chars()
504        .map(|c| SUPERSCRIPT_CHARS.get(&c).copied().unwrap_or(c))
505        .collect()
506}
507
/// Final tidy-up pass over a converted expression.
///
/// Removes, one after another, every `"{ "`, then every `" }"`, then every
/// `"{}"`, and finally trims leading and trailing whitespace. Braces that
/// directly hug their content (e.g. `{x}`) are not touched.
fn cleanup(input: &str) -> String {
    // Applied sequentially: each removal sees the result of the previous one.
    const LEFTOVERS: [&str; 3] = ["{ ", " }", "{}"];

    let stripped = LEFTOVERS
        .iter()
        .fold(input.to_string(), |text, pat| text.replace(*pat, ""));

    stripped.trim().to_string()
}
518
#[cfg(test)]
mod tests {
    //! Unit tests for the conversion functions and the plugin state machine.

    use super::*;

    /// Unwrap a `ProcessResult::Lines`, failing the test for anything else.
    fn expect_lines(result: Option<ProcessResult>) -> Vec<String> {
        match result {
            Some(ProcessResult::Lines(lines)) => lines,
            _ => panic!("expected Some(ProcessResult::Lines(_))"),
        }
    }

    #[test]
    fn test_greek_letters() {
        assert_eq!(latex_to_unicode(r"\alpha + \beta"), "α + β");
        assert_eq!(latex_to_unicode(r"\Gamma\Delta"), "ΓΔ");
        assert_eq!(latex_to_unicode(r"\pi r^2"), "π r²");
    }

    #[test]
    fn test_operators() {
        assert_eq!(latex_to_unicode(r"\sum x"), "Σ x");
        assert_eq!(latex_to_unicode(r"\int f(x) dx"), "∫ f(x) dx");
        // The braced subscript is converted character by character.
        assert_eq!(latex_to_unicode(r"\prod_{i=1}"), "Πᵢ₌₁");
    }

    #[test]
    fn test_relations() {
        assert_eq!(latex_to_unicode(r"x \leq y"), "x ≤ y");
        assert_eq!(latex_to_unicode(r"a \neq b"), "a ≠ b");
        assert_eq!(latex_to_unicode(r"A \subset B"), "A ⊂ B");
    }

    #[test]
    fn test_symbols() {
        assert_eq!(latex_to_unicode(r"\infty"), "∞");
        assert_eq!(latex_to_unicode(r"\pm 1"), "± 1");
        assert_eq!(latex_to_unicode(r"x \to y"), "x → y");
    }

    #[test]
    fn test_unknown_command_is_kept() {
        assert_eq!(latex_to_unicode(r"\foo"), r"\foo");
    }

    #[test]
    fn test_subscripts() {
        assert_eq!(latex_to_unicode("x_1"), "x₁");
        assert_eq!(latex_to_unicode("x_{12}"), "x₁₂");
        assert_eq!(latex_to_unicode("a_n"), "aₙ");
    }

    #[test]
    fn test_superscripts() {
        assert_eq!(latex_to_unicode("x^2"), "x²");
        assert_eq!(latex_to_unicode("x^{10}"), "x¹⁰");
        assert_eq!(latex_to_unicode("e^x"), "eˣ");
    }

    #[test]
    fn test_fractions() {
        assert_eq!(latex_to_unicode(r"\frac{a}{b}"), "(a/b)");
        assert_eq!(latex_to_unicode(r"\frac{1}{2}"), "(1/2)");
    }

    #[test]
    fn test_complex_expression() {
        assert_eq!(latex_to_unicode(r"E = mc^2"), "E = mc²");

        // Command, braced subscript, single superscript and single subscript
        // all in one expression.
        assert_eq!(latex_to_unicode(r"\sum_{i=1}^n x_i"), "Σᵢ₌₁ⁿ xᵢ");
    }

    #[test]
    fn test_inline_math() {
        assert_eq!(convert_inline_math("The value $x^2$ is"), "The value x² is");
        assert_eq!(
            convert_inline_math("We have $\\alpha$ and $\\beta$"),
            "We have α and β"
        );
    }

    #[test]
    fn test_latex_plugin_single_line() {
        let mut plugin = LatexPlugin::new();
        let state = ParseState::new();
        let style = ComputedStyle::default();

        let lines = expect_lines(plugin.process_line("$$E = mc^2$$", &state, &style));
        assert_eq!(lines, vec!["E = mc²".to_string()]);
        assert!(!plugin.is_active());
    }

    #[test]
    fn test_latex_plugin_multiline() {
        let mut plugin = LatexPlugin::new();
        let state = ParseState::new();
        let style = ComputedStyle::default();

        // Start block
        let result = plugin.process_line("$$\\sum_{i=1}^n", &state, &style);
        assert!(matches!(result, Some(ProcessResult::Continue)));
        assert!(plugin.is_active());

        // Close block
        let lines = expect_lines(plugin.process_line("x_i$$", &state, &style));
        assert_eq!(lines.len(), 1);
        // Whitespace at the line join is intentionally not pinned here.
        assert!(lines[0].starts_with("Σᵢ₌₁ⁿ"));
        assert!(lines[0].ends_with("xᵢ"));
        assert!(!plugin.is_active());
    }

    #[test]
    fn test_latex_plugin_inline() {
        let mut plugin = LatexPlugin::new();
        let state = ParseState::new();
        let style = ComputedStyle::default();

        let lines = expect_lines(plugin.process_line(
            "The value $x^2$ is important",
            &state,
            &style,
        ));
        assert_eq!(lines, vec!["The value x² is important".to_string()]);
    }

    #[test]
    fn test_latex_plugin_no_match() {
        let mut plugin = LatexPlugin::new();
        let state = ParseState::new();
        let style = ComputedStyle::default();

        let result = plugin.process_line("Normal text without math", &state, &style);
        assert!(result.is_none());
    }

    #[test]
    fn test_latex_plugin_flush() {
        let mut plugin = LatexPlugin::new();
        let state = ParseState::new();
        let style = ComputedStyle::default();

        // Start block without closing
        plugin.process_line("$$x^2 + y^2", &state, &style);
        assert!(plugin.is_active());

        // Flush returns the raw, unconverted content and leaves block mode.
        let result = plugin.flush();
        assert_eq!(result, Some(vec!["$$ x^2 + y^2 (incomplete)".to_string()]));
        assert!(!plugin.is_active());

        // Nothing left to flush afterwards.
        assert!(plugin.flush().is_none());
    }

    #[test]
    fn test_latex_plugin_reset() {
        let mut plugin = LatexPlugin::new();
        let state = ParseState::new();
        let style = ComputedStyle::default();

        plugin.process_line("$$x^2", &state, &style);
        assert!(plugin.is_active());

        plugin.reset();
        assert!(!plugin.is_active());
        // The buffered text is gone too, so there is nothing to flush.
        assert!(plugin.flush().is_none());
    }
}