1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
use std::borrow::Cow;
/// Decodes Mermaid's `encodeEntities` placeholders and shorthand `#...;` sequences into Unicode.
///
/// Upstream Mermaid runs `encodeEntities(text)` before parsing, and later uses `decodeEntities`
/// + browser `entityDecode(...)` to turn placeholders into actual characters.
///
/// In `merman` we decode these into Unicode as part of headless parsing so that:
/// - layout measurements operate on the same final text
/// - SVG output matches upstream DOM output
pub fn decode_mermaid_entities_to_unicode(input: &str) -> Cow<'_, str> {
// Fast path: nothing to decode.
if !input.contains('#') && !input.contains('&') && !input.contains('fl') && !input.contains('¶')
{
return Cow::Borrowed(input);
}
// Step 1: Mermaid placeholders -> `&...;` / `&#...;`
let mut s = input.to_string();
if s.contains('fl') || s.contains('¶') {
s = s.replace("fl°°", "&#").replace("fl°", "&").replace("¶ß", ";");
}
// Step 2 (shorthand): `#...;` -> `&...;` / `&#...;`
//
// This is primarily for older headless code paths / fixtures that bypass upstream-like
// preprocessing. It is intentionally conservative and only rewrites `#\w+;` patterns.
if s.contains('#') {
let mut out = String::with_capacity(s.len());
let mut it = s.chars().peekable();
let mut prev: Option<char> = None;
while let Some(ch) = it.next() {
if ch != '#' {
out.push(ch);
prev = Some(ch);
continue;
}
// Do not treat `&#...;` as Mermaid shorthand `#...;`.
if prev == Some('&') {
out.push('#');
prev = Some('#');
continue;
}
let mut entity = String::new();
let mut ok = false;
for _ in 0..64 {
match it.peek().copied() {
Some(';') => {
it.next();
ok = true;
break;
}
Some(c) if c.is_ascii_alphanumeric() || c == '_' || c == '+' => {
entity.push(c);
it.next();
}
_ => break,
}
}
if !ok {
out.push('#');
out.push_str(&entity);
continue;
}
let is_int = entity.chars().all(|c| c.is_ascii_digit() || c == '+')
&& entity.chars().any(|c| c.is_ascii_digit());
if is_int {
out.push('&');
out.push('#');
out.push_str(&entity);
out.push(';');
} else {
out.push('&');
out.push_str(&entity);
out.push(';');
}
prev = Some(';');
}
s = out;
}
// Step 3: HTML entity decode (` `, `♥`, `∞`, ...)
//
// Use a standards-based entity decoder so named entities match browser behavior.
Cow::Owned(htmlize::unescape(&s).into_owned())
}