html_generator/math.rs
1// Copyright © 2023 - 2026 HTML Generator. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Server-side LaTeX → MathML and Mermaid diagram passthrough.
5//!
6//! Two post-processing steps that run against the HTML emitted by
7//! the Markdown pipeline:
8//!
9//! * [`convert_math`] (gated behind the `math` feature) walks the
10//! text of an HTML fragment, finds `$$..$$` and `$..$` spans, and
11//! replaces each one with a `<math>...</math>` element rendered via
12//! `pulldown-latex`. No client-side JavaScript is required —
13//! browsers render MathML natively.
14//!
15//! * [`rewrite_mermaid_blocks`] rewrites `<pre><code class="language-mermaid">…</code></pre>`
16//! blocks (the form `comrak`/`mdx-gen` emits for `\u{60}\u{60}\u{60}mermaid` fenced
17//! code) into `<pre class="mermaid">…</pre>` so the standard
18//! client-side mermaid.js bundle picks them up.
19//!
//! Both functions take a `&str` and return a fresh `String`. Each
//! has a fast-path: if the input contains no `$` (math) or no
//! `language-mermaid` substring (diagrams), the regex scan is skipped
//! and the input is copied unchanged into the returned `String`.
24//!
25//! # Examples
26//!
27//! Mermaid passthrough is always available:
28//!
29//! ```
30//! use html_generator::math::rewrite_mermaid_blocks;
31//!
32//! let html = r#"<pre><code class="language-mermaid">graph TD; A-->B</code></pre>"#;
33//! let out = rewrite_mermaid_blocks(html);
34//! // The block body is preserved verbatim — only the wrapping tag
35//! // changes from `<pre><code class="language-mermaid">` to
36//! // `<pre class="mermaid">` so client-side mermaid.js picks it up.
37//! assert!(out.contains(r#"<pre class="mermaid">graph TD; A-->B</pre>"#));
38//! ```
39//!
40//! Math is feature-gated. With the default `math` feature on:
41//!
42//! ```
43//! # #[cfg(feature = "math")]
44//! # {
45//! use html_generator::math::convert_math;
46//!
47//! let html = "<p>Energy: $$E = mc^2$$.</p>";
48//! let out = convert_math(html);
49//! assert!(out.contains("<math"));
50//! assert!(out.contains("display=\"block\""));
51//! # }
52//! ```
53//!
54//! # Error reporting
55//!
56//! Both functions are infallible. `pulldown-latex` reports parse
57//! errors *inline* via a `<merror style="border-color:#b22222">…</merror>`
58//! element rather than failing the whole render — invalid LaTeX
59//! shows up visibly in the page, not as a 500 from the build,
60//! which is the right UX for content tooling.
61
62use once_cell::sync::Lazy;
63use regex::Regex;
64
65// ─── Mermaid: rewrite the comrak/mdx-gen fenced output ───────────
66
67/// Matches a `<pre><code class="language-mermaid">…</code></pre>`
68/// block as emitted by comrak/mdx-gen. Captures the diagram source.
69/// `(?s)` enables `.` to match newlines so multi-line graphs work.
70static MERMAID_BLOCK_REGEX: Lazy<Regex> = Lazy::new(|| {
71 Regex::new(
72 r#"(?s)<pre><code class="language-mermaid">(.*?)</code></pre>"#,
73 )
74 .expect("static MERMAID_BLOCK_REGEX must compile")
75});
76
77/// Rewrite mermaid fenced blocks for client-side rendering.
78///
79/// The CommonMark engine emits `\u{60}\u{60}\u{60}mermaid` fenced blocks as
80/// `<pre><code class="language-mermaid">…</code></pre>`. The mermaid.js
81/// bundle, however, looks for `<pre class="mermaid">…</pre>` (or
82/// `<div class="mermaid">`). This function rewrites every such block
83/// so a page that includes `<script type="module">import mermaid
84/// from "https://…/mermaid.esm.mjs"; mermaid.initialize({startOnLoad:true});</script>`
85/// renders the diagrams without further work.
86///
87/// The diagram body is passed through verbatim (HTML-escaped — the
88/// markdown engine has already done that). mermaid.js handles the
89/// unescaping for the diagram parser.
90///
91/// Fast-path: returns immediately when the input contains no
92/// `language-mermaid` substring (SIMD-backed `str::contains`).
93///
94/// # Examples
95///
96/// ```
97/// use html_generator::math::rewrite_mermaid_blocks;
98///
99/// let input = r#"<p>Diagram below:</p>
100/// <pre><code class="language-mermaid">graph LR
101/// A --> B</code></pre>"#;
102/// let out = rewrite_mermaid_blocks(input);
103/// assert!(out.contains(r#"<pre class="mermaid">"#));
104/// assert!(!out.contains("<code class=\"language-mermaid\""));
105/// ```
106#[must_use]
107pub fn rewrite_mermaid_blocks(html: &str) -> String {
108 if !html.contains("language-mermaid") {
109 return html.to_string();
110 }
111 MERMAID_BLOCK_REGEX
112 .replace_all(html, r#"<pre class="mermaid">$1</pre>"#)
113 .into_owned()
114}
115
116// ─── Math: $..$ inline and $$..$$ display → MathML ───────────────
117
/// Lazily compiled pattern for display math, `$$ … $$`.
///
/// Capture group 1 is the LaTeX source between the delimiters. The
/// body must start with a non-`$` character, so an empty `$$$$` is not
/// a match. `(?s)` lets the body span lines, and the lazy `.*?` ends
/// the span at the first following `$$` instead of the last one.
#[cfg(feature = "math")]
static DISPLAY_MATH_REGEX: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(?s)\$\$([^$].*?)\$\$";
    Regex::new(pattern).expect("static DISPLAY_MATH_REGEX must compile")
});
126
/// Inline math: `$ … $`.
///
/// Capture group 1 is the LaTeX source. The rules are:
///
/// * the character right after the opening `$` must not be whitespace;
/// * the character right before the closing `$` must not be whitespace
///   and must not be `\` (so a TeX-escaped `\$` never closes a span);
/// * the body contains no `$` at all;
/// * the closing `$` must be followed by a non-digit character or the
///   end of the input, which keeps prose such as `$5/$6` literal.
///
/// The first alternative of group 1 accepts a one-character body so
/// that single-symbol math like `$x$` is recognised; the second
/// alternative handles bodies of two or more characters.
///
/// Note that the `(?:[^0-9]|$)` tail is part of the overall match
/// (group 0): callers must re-emit that trailing character. Run AFTER
/// display math so the `$$..$$` form is consumed first.
#[cfg(feature = "math")]
static INLINE_MATH_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\$([^\s\\$]|[^\s$][^$]*?[^\s\\$])\$(?:[^0-9]|$)")
        .expect("static INLINE_MATH_REGEX must compile")
});
137
/// Convert LaTeX math spans inside an HTML fragment to MathML.
///
/// Two delimiter styles are recognised:
///
/// * `$$...$$` → `<math display="block">…</math>`
/// * `$...$`   → `<math>…</math>` (inline)
///
/// Display spans are located first. Inline spans are then searched for
/// only in the text *between* display spans, so MathML that has already
/// been emitted is never re-scanned and an inline span can never
/// straddle a display block.
///
/// The inline matcher is deliberately conservative: the characters
/// directly inside the two `$` delimiters must be non-whitespace, and a
/// closing `$` that is immediately followed by a digit is rejected (so
/// prices such as `$5` in prose stay literal). Unbalanced `$` is left
/// as-is. Whatever character follows the closing `$` — including
/// multi-byte characters and `$` itself — is copied through unchanged.
///
/// Fast-path: when the input contains no `$` character it is copied
/// into the returned `String` without consulting any regex.
///
/// # Examples
///
/// Block math:
///
/// ```
/// use html_generator::math::convert_math;
///
/// let out = convert_math("<p>$$x + y$$</p>");
/// assert!(out.contains(r#"display="block""#));
/// assert!(out.contains("<math"));
/// ```
///
/// Inline math:
///
/// ```
/// use html_generator::math::convert_math;
///
/// let out = convert_math(r"<p>Pythagoras: $a^2 + b^2 = c^2$.</p>");
/// assert!(out.contains("<math"));
/// // Inline form is the default; no `display="block"`:
/// assert!(!out.contains(r#"display="block""#));
/// ```
///
/// Plain prose with `$` is not touched:
///
/// ```
/// use html_generator::math::convert_math;
///
/// let out = convert_math("<p>That cost $5.</p>");
/// assert_eq!(out, "<p>That cost $5.</p>");
/// ```
#[cfg(feature = "math")]
#[must_use]
pub fn convert_math(html: &str) -> String {
    if !html.contains('$') {
        return html.to_string();
    }

    let mut out = String::with_capacity(html.len());
    let mut last = 0usize;
    for caps in DISPLAY_MATH_REGEX.captures_iter(html) {
        let whole = caps.get(0).expect("regex match has group 0");
        let latex = caps
            .get(1)
            .expect("DISPLAY_MATH_REGEX has capture group 1")
            .as_str();
        // Text before this display span may still hold inline math.
        push_inline_math(&mut out, &html[last..whole.start()]);
        out.push_str(&render_latex(latex, true));
        last = whole.end();
    }
    // Remainder after the final display span (or the whole input when
    // there was none).
    push_inline_math(&mut out, &html[last..]);

    out
}

/// Append `segment` to `out`, replacing each inline `$..$` span with
/// its MathML rendering and copying everything else verbatim.
#[cfg(feature = "math")]
fn push_inline_math(out: &mut String, segment: &str) {
    if !segment.contains('$') {
        out.push_str(segment);
        return;
    }

    let mut last = 0usize;
    for caps in INLINE_MATH_REGEX.captures_iter(segment) {
        let whole = caps.get(0).expect("regex match has group 0");
        let latex = caps
            .get(1)
            .expect("INLINE_MATH_REGEX has capture group 1");
        out.push_str(&segment[last..whole.start()]);
        out.push_str(&render_latex(latex.as_str(), false));
        // Group 1 is immediately followed by the one-byte closing `$`.
        // The overall match may additionally include one trailing
        // character (the "not a digit" check). Resuming right after the
        // closing `$` leaves that character — whatever its byte width —
        // in the text copied by the next iteration or the final push.
        debug_assert!(segment[latex.end()..].starts_with('$'));
        last = latex.end() + 1;
    }
    out.push_str(&segment[last..]);
}
238
/// Render one LaTeX span to a MathML string.
///
/// `display` selects `DisplayMode::Block` (`true`) or
/// `DisplayMode::Inline` (`false`); every other render option is left
/// at its default.
///
/// The function never reports failure. Per this module's error-handling
/// notes, `pulldown-latex` encodes LaTeX parse errors inline as
/// `<merror>` elements, and the writer here is an in-memory `String`,
/// so the result of `push_mathml` is intentionally discarded and
/// whatever was written is returned.
#[cfg(feature = "math")]
fn render_latex(src: &str, display: bool) -> String {
    use pulldown_latex::config::{DisplayMode, RenderConfig};
    use pulldown_latex::mathml::push_mathml;
    use pulldown_latex::{Parser, Storage};

    let display_mode = if display {
        DisplayMode::Block
    } else {
        DisplayMode::Inline
    };
    let config = RenderConfig {
        display_mode,
        ..Default::default()
    };

    // `storage` must outlive the parser that borrows it.
    let storage = Storage::new();
    let mut mathml = String::new();
    // Deliberately ignore the returned result: see the doc comment.
    let _ = push_mathml(&mut mathml, Parser::new(src, &storage), config);
    mathml
}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// Input without any mermaid marker comes back unchanged.
    #[test]
    fn mermaid_fast_path_is_pass_through() {
        let plain = "<p>no diagrams here</p>";
        let result = rewrite_mermaid_blocks(plain);
        assert_eq!(result, plain);
    }

    /// A single fenced block loses its `<code>` wrapper.
    #[test]
    fn mermaid_block_is_rewritten() {
        let expected = r#"<pre class="mermaid">graph TD;A-->B</pre>"#;
        let result = rewrite_mermaid_blocks(
            r#"<pre><code class="language-mermaid">graph TD;A-->B</code></pre>"#,
        );
        assert_eq!(result, expected);
    }

    /// Two blocks in one fragment are rewritten independently and the
    /// text between them survives.
    #[test]
    fn mermaid_multiple_blocks_each_rewritten() {
        let source = r#"<pre><code class="language-mermaid">a-->b</code></pre>between<pre><code class="language-mermaid">c-->d</code></pre>"#;
        let expected = r#"<pre class="mermaid">a-->b</pre>between<pre class="mermaid">c-->d</pre>"#;

        let result = rewrite_mermaid_blocks(source);

        assert_eq!(result, expected);
        let rewritten_blocks = result.matches(r#"<pre class="mermaid">"#).count();
        assert_eq!(rewritten_blocks, 2);
        assert!(!result.contains("language-mermaid"));
    }

    /// Input without `$` comes back unchanged.
    #[cfg(feature = "math")]
    #[test]
    fn math_fast_path_no_dollar_passes_through() {
        let plain = "<p>no math here</p>";
        let result = convert_math(plain);
        assert_eq!(result, plain);
    }

    /// `$..$` yields a `<math>` element without the block attribute.
    #[cfg(feature = "math")]
    #[test]
    fn math_inline_is_rendered() {
        let result = convert_math("<p>$x+y$</p>");
        assert!(result.contains("<math"));
        assert!(result.contains("</math>"));
        // Inline math must NOT carry display="block".
        assert!(!result.contains(r#"display="block""#));
    }

    /// `$$..$$` yields a `<math>` element with `display="block"`.
    #[cfg(feature = "math")]
    #[test]
    fn math_display_uses_block_attribute() {
        let result = convert_math("<p>$$E=mc^2$$</p>");
        assert!(result.contains("<math"));
        assert!(result.contains(r#"display="block""#));
    }

    /// A currency amount is not mistaken for math.
    #[cfg(feature = "math")]
    #[test]
    fn math_dollar_followed_by_digit_left_alone() {
        let prose = "<p>That cost $5 yesterday.</p>";
        // `$5` is currency, not math, so nothing may change.
        assert_eq!(convert_math(prose), "<p>That cost $5 yesterday.</p>");
    }

    /// A lone `$` has no closing delimiter and is left in place.
    #[cfg(feature = "math")]
    #[test]
    fn math_unbalanced_dollar_left_alone() {
        let prose = "<p>only one $ here</p>";
        assert_eq!(convert_math(prose), "<p>only one $ here</p>");
    }

    /// Invalid LaTeX surfaces as an inline `<merror>` marker.
    #[cfg(feature = "math")]
    #[test]
    fn math_invalid_latex_emits_inline_merror_marker() {
        // A double subscript (`a_b_c`) is used here as the invalid
        // LaTeX sample. The renderer is expected to encode the failure
        // as an inline `<merror>` element rather than return an error,
        // and the wrapper must preserve that marker.
        let result = convert_math("<p>$a_b_c$</p>");
        let has_marker = result.contains("<merror");
        assert!(
            has_marker,
            "expected inline <merror> marker, got: {result}"
        );
    }

    /// Inline and display spans in one fragment are both converted.
    #[cfg(feature = "math")]
    #[test]
    fn math_block_and_inline_in_same_input() {
        let result = convert_math("<p>see $a+b$ and $$c+d$$.</p>");
        // Two MathML elements emitted, one of them in block mode.
        let math_elements = result.matches("<math").count();
        assert_eq!(math_elements, 2);
        assert!(result.contains(r#"display="block""#));
    }
}