Skip to main content

merman_core/preprocess/
mod.rs

1use crate::{DetectorRegistry, Error, MermaidConfig, Result};
2use regex::Regex;
3use serde_json::Value;
4use std::borrow::Cow;
5use std::sync::OnceLock;
6
/// Generates a private accessor `fn $getter() -> &'static Regex`.
///
/// The pattern is compiled the first time the accessor is called and the compiled
/// `Regex` is kept in a function-local `OnceLock`, so later calls reuse it.
/// A pattern that fails to compile panics with a fixed message; patterns are
/// literals, so this indicates a programming error rather than bad input.
macro_rules! cached_regex {
    ($getter:ident, $pattern:literal) => {
        fn $getter() -> &'static Regex {
            static COMPILED: OnceLock<Regex> = OnceLock::new();
            COMPILED.get_or_init(|| {
                Regex::new($pattern).expect("preprocess regex must compile")
            })
        }
    };
}
15
// `\r\n` or a lone `\r`; `cleanup_text` replaces each match with `\n`.
cached_regex!(re_crlf, r"\r\n?");
// A tag-like span `<name ...>`: group 1 is the tag name, group 2 everything up to `>`.
cached_regex!(re_tag, r"<(\w+)([^>]*)>");
// A double-quoted attribute value `="..."`: group 1 is the text between the quotes.
cached_regex!(re_attr_eq_double_quoted, "=\"([^\"]*)\"");
// A `style ... :<non-space>#...;` span on one line (`.` does not cross newlines here).
cached_regex!(re_style_hex, r"style.*:\S*#.*;");
// Same shape as `re_style_hex`, but introduced by `classDef`.
cached_regex!(re_classdef_hex, r"classDef.*:\S*#.*;");
// An entity-like sequence: `#`, one or more word characters, then `;`.
cached_regex!(re_entity, r"#\w+;");
// A whole string consisting of an optional `+` followed by digits.
cached_regex!(re_int, r"^\+?\d+$");
// A frontmatter block at the very start of the text: a `---` line, a lazily matched
// body (group 1; `(?s)` lets `.` span newlines), a closing `---` line and the line
// breaks that follow it.
cached_regex!(
    re_frontmatter,
    r"(?s)^-{3}\s*[\n\r](.*?)[\n\r]-{3}\s*[\n\r]+"
);
27
/// Output of [`preprocess_diagram`] / [`preprocess_diagram_with_known_type`].
#[derive(Debug, Clone)]
pub struct PreprocessResult {
    /// Diagram text left after text cleanup, frontmatter stripping, directive
    /// removal and `%%` comment-line removal.
    pub code: String,
    /// The frontmatter `title` entry, when present and a string.
    pub title: Option<String>,
    /// Config built from the frontmatter with the directive-derived config
    /// deep-merged into it.
    pub config: MermaidConfig,
}
34
35pub fn preprocess_diagram(input: &str, registry: &DetectorRegistry) -> Result<PreprocessResult> {
36    preprocess_diagram_with_known_type(input, registry, None)
37}
38
39pub fn preprocess_diagram_with_known_type(
40    input: &str,
41    registry: &DetectorRegistry,
42    diagram_type: Option<&str>,
43) -> Result<PreprocessResult> {
44    let cleaned = cleanup_text(input);
45    let (without_frontmatter, title, mut frontmatter_config) =
46        process_frontmatter(cleaned.as_ref())?;
47    let (without_directives, directive_config) =
48        process_directives(without_frontmatter, registry, diagram_type)?;
49
50    frontmatter_config.deep_merge(directive_config.as_value());
51
52    let code = cleanup_comments(without_directives.as_ref());
53    Ok(PreprocessResult {
54        code: code.into_owned(),
55        title,
56        config: frontmatter_config,
57    })
58}
59
60fn cleanup_text(input: &str) -> Cow<'_, str> {
61    let mut s: Cow<'_, str> = if input.contains('\r') {
62        Cow::Owned(re_crlf().replace_all(input, "\n").into_owned())
63    } else {
64        Cow::Borrowed(input)
65    };
66
67    // Mermaid encodes `#quot;`-style sequences before parsing (`encodeEntities(...)`).
68    // This is required because `#` and `;` are significant in several grammars (comments and
69    // statement separators), and the encoded placeholders are later decoded by the renderer.
70    //
71    // Source of truth: `packages/mermaid/src/utils.ts::encodeEntities` at Mermaid@11.12.2.
72    if s.contains('#') {
73        s = Cow::Owned(encode_mermaid_entities_like_upstream(s.as_ref()));
74    }
75
76    // Mermaid performs this HTML attribute rewrite as part of preprocessing.
77    if s.contains('<') && s.contains("=\"") {
78        s = Cow::Owned(
79            re_tag()
80                .replace_all(s.as_ref(), |caps: &regex::Captures| {
81                    let tag = &caps[1];
82                    let attrs = &caps[2];
83                    let attrs = re_attr_eq_double_quoted().replace_all(attrs, "='$1'");
84                    format!("<{tag}{attrs}>")
85                })
86                .into_owned(),
87        );
88    }
89
90    s
91}
92
93fn encode_mermaid_entities_like_upstream(text: &str) -> String {
94    if !text.contains('#') {
95        return text.to_string();
96    }
97
98    // Mirrors Mermaid `encodeEntities` (Mermaid@11.12.2):
99    //
100    // 1) Protect `style...:#...;` and `classDef...:#...;` so color hex fragments are not mistaken
101    //    as entities by the `/#\\w+;/g` pass.
102    // 2) Encode `#<name>;` and `#<number>;` sequences into placeholders that do not contain `#`/`;`.
103    let mut txt = text.to_string();
104
105    if txt.contains("style") && txt.contains(';') {
106        txt = re_style_hex()
107            .replace_all(&txt, |caps: &regex::Captures| {
108                let s = caps.get(0).map(|m| m.as_str()).unwrap_or_default();
109                s.strip_suffix(';').unwrap_or(s).to_string()
110            })
111            .to_string();
112    }
113
114    if txt.contains("classDef") && txt.contains(';') {
115        txt = re_classdef_hex()
116            .replace_all(&txt, |caps: &regex::Captures| {
117                let s = caps.get(0).map(|m| m.as_str()).unwrap_or_default();
118                s.strip_suffix(';').unwrap_or(s).to_string()
119            })
120            .to_string();
121    }
122
123    if txt.contains(';') {
124        txt = re_entity()
125            .replace_all(&txt, |caps: &regex::Captures| {
126                let s = caps.get(0).map(|m| m.as_str()).unwrap_or_default();
127                let inner = s
128                    .strip_prefix('#')
129                    .and_then(|s| s.strip_suffix(';'))
130                    .unwrap_or("");
131                let is_int = re_int().is_match(inner);
132                if is_int {
133                    format!("fl°°{inner}¶ß")
134                } else {
135                    format!("fl°{inner}¶ß")
136                }
137            })
138            .to_string();
139    }
140
141    txt
142}
143
/// Removes `%%` comment lines and leading whitespace from `input`.
///
/// A line is dropped when, ignoring its leading whitespace, it starts with `%%`
/// but not with `%%{` (the directive opener). A `%%` later in a line is left alone.
/// The result borrows from `input` when it contains no `%%` at all.
fn cleanup_comments(input: &str) -> Cow<'_, str> {
    if !input.contains("%%") {
        // Nothing to strip: only the leading-whitespace trim applies.
        return Cow::Borrowed(input.trim_start());
    }

    // `split_inclusive` keeps each line's `\n`, so kept lines concatenate unchanged.
    let kept: String = input
        .split_inclusive('\n')
        .filter(|line| {
            let body = line.trim_start();
            !body.starts_with("%%") || body.starts_with("%%{")
        })
        .collect();

    Cow::Owned(kept.trim_start().to_string())
}
158
159fn process_frontmatter(input: &str) -> Result<(&str, Option<String>, MermaidConfig)> {
160    if !input.trim_start().starts_with("---") {
161        return Ok((input, None, MermaidConfig::empty_object()));
162    }
163
164    let Some(caps) = re_frontmatter().captures(input) else {
165        return Ok((input, None, MermaidConfig::empty_object()));
166    };
167
168    let yaml_body = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
169    let raw_yaml: serde_yaml::Value =
170        serde_yaml::from_str(yaml_body).map_err(|e| Error::InvalidFrontMatterYaml {
171            message: e.to_string(),
172        })?;
173
174    let parsed = serde_json::to_value(raw_yaml).unwrap_or(Value::Null);
175    let parsed_obj = parsed.as_object().cloned().unwrap_or_default();
176
177    let mut title = None;
178    let mut config_value = Value::Object(Default::default());
179    let mut display_mode = None;
180
181    if let Some(Value::String(t)) = parsed_obj.get("title") {
182        title = Some(t.clone());
183    }
184    if let Some(v) = parsed_obj.get("config") {
185        config_value = v.clone();
186    }
187    if let Some(Value::String(dm)) = parsed_obj.get("displayMode") {
188        display_mode = Some(dm.clone());
189    }
190
191    let mut config = MermaidConfig::empty_object();
192    config.deep_merge(&config_value);
193    if let Some(dm) = display_mode {
194        config.set_value("gantt.displayMode", Value::String(dm));
195    }
196
197    let stripped = &input[caps.get(0).unwrap().end()..];
198    Ok((stripped, title, config))
199}
200
201fn process_directives<'a>(
202    input: &'a str,
203    registry: &DetectorRegistry,
204    diagram_type: Option<&str>,
205) -> Result<(Cow<'a, str>, MermaidConfig)> {
206    let directives = detect_directives(input)?;
207    if directives.is_empty() {
208        return Ok((Cow::Borrowed(input), MermaidConfig::empty_object()));
209    }
210    let init = detect_init(&directives, input, registry, diagram_type)?;
211    let wrap = directives.iter().any(|d| d.ty == "wrap");
212
213    let mut merged = init;
214    if wrap {
215        merged.set_value("wrap", Value::Bool(true));
216    }
217
218    Ok((Cow::Owned(remove_directives(input)), merged))
219}
220
221fn detect_init(
222    directives: &[Directive],
223    input: &str,
224    registry: &DetectorRegistry,
225    diagram_type: Option<&str>,
226) -> Result<MermaidConfig> {
227    let mut merged = MermaidConfig::empty_object();
228    let mut config_for_detect = MermaidConfig::empty_object();
229
230    for d in directives {
231        if d.ty != "init" && d.ty != "initialize" {
232            continue;
233        }
234
235        let mut args = match &d.args {
236            Some(v) => v.clone(),
237            None => Value::Object(Default::default()),
238        };
239
240        sanitize_directive(&mut args);
241
242        // Mermaid moves a top-level `config` directive field into the diagram-type-specific config.
243        if let Some(diagram_specific) = args.get("config").cloned() {
244            let detected = diagram_type.map(|t| t.to_string()).or_else(|| {
245                registry
246                    .detect_type(input, &mut config_for_detect)
247                    .ok()
248                    .map(ToString::to_string)
249            });
250
251            if let Some(mut ty) = detected {
252                if ty == "flowchart-v2" {
253                    ty = "flowchart".to_string();
254                }
255                if let Value::Object(obj) = &mut args {
256                    obj.insert(ty, diagram_specific);
257                    obj.remove("config");
258                }
259            }
260        }
261
262        merged.deep_merge(&args);
263    }
264
265    Ok(merged)
266}
267
/// One parsed `%%{ ... }%%` directive.
#[derive(Debug, Clone)]
struct Directive {
    /// Directive name: the leading run of ASCII alphanumerics / underscores
    /// (e.g. `init`, `initialize`, `wrap`).
    ty: String,
    /// Text after the `:` separator: a JSON5-parsed value when it starts with `{`
    /// or `[`, otherwise the raw string. `None` when there is no `:` or nothing
    /// follows it.
    args: Option<Value>,
}
273
274fn detect_directives(input: &str) -> Result<Vec<Directive>> {
275    let mut out = Vec::new();
276    let mut pos = 0;
277    let trimmed = input.trim();
278    if !trimmed.contains("%%{") {
279        return Ok(out);
280    }
281
282    // Mermaid's directive parser effectively treats single quotes as double quotes for JSON-like
283    // directive bodies. Keep this behavior, but only pay the allocation when directives exist.
284    let text = trimmed.replace('\'', "\"");
285
286    while let Some(rel) = text[pos..].find("%%{") {
287        let start = pos + rel;
288        let content_start = start + 3;
289        let Some(rel_end) = text[content_start..].find("}%%") else {
290            break;
291        };
292        let content_end = content_start + rel_end;
293        let raw = text[content_start..content_end].trim();
294
295        if let Some(d) = parse_directive(raw)? {
296            out.push(d);
297        }
298
299        pos = content_end + 3;
300    }
301
302    Ok(out)
303}
304
305fn sanitize_directive(value: &mut Value) {
306    match value {
307        Value::Object(map) => {
308            map.remove("secure");
309            map.retain(|k, _| !k.starts_with("__"));
310            for (_, v) in map.iter_mut() {
311                sanitize_directive(v);
312            }
313        }
314        Value::Array(arr) => {
315            for v in arr {
316                sanitize_directive(v);
317            }
318        }
319        Value::String(s) => {
320            let blocked = s.contains('<') || s.contains('>') || s.contains("url(data:");
321            if blocked {
322                *s = String::new();
323            }
324        }
325        _ => {}
326    }
327}
328
/// Returns `text` with every `%%{ ... }%%` span removed.
///
/// Text outside directives is copied through unchanged. If a `%%{` opener has no
/// matching `}%%`, everything from that opener to the end of `text` is dropped.
fn remove_directives(text: &str) -> String {
    const OPEN: &str = "%%{";
    const CLOSE: &str = "}%%";

    let mut kept = String::with_capacity(text.len());
    // `rest` is the tail of `text` that has not been copied or skipped yet.
    let mut rest = text;

    loop {
        let Some(open_at) = rest.find(OPEN) else {
            // No further directive: keep the remainder verbatim.
            kept.push_str(rest);
            return kept;
        };
        kept.push_str(&rest[..open_at]);

        let body = &rest[open_at + OPEN.len()..];
        match body.find(CLOSE) {
            Some(close_at) => rest = &body[close_at + CLOSE.len()..],
            // Unterminated directive: the tail is discarded.
            None => return kept,
        }
    }
}
346
347fn parse_directive(raw: &str) -> Result<Option<Directive>> {
348    let raw = raw.trim();
349    if raw.is_empty() {
350        return Ok(None);
351    }
352
353    let mut chars = raw.chars().peekable();
354    let mut ty = String::new();
355    while let Some(&c) = chars.peek() {
356        if c.is_ascii_alphanumeric() || c == '_' {
357            ty.push(c);
358            chars.next();
359            continue;
360        }
361        break;
362    }
363    if ty.is_empty() {
364        return Ok(None);
365    }
366
367    while matches!(chars.peek(), Some(c) if c.is_whitespace()) {
368        chars.next();
369    }
370
371    let args = if matches!(chars.peek(), Some(':')) {
372        chars.next();
373        while matches!(chars.peek(), Some(c) if c.is_whitespace()) {
374            chars.next();
375        }
376        let rest: String = chars.collect();
377        let rest = rest.trim();
378        if rest.is_empty() {
379            None
380        } else if rest.starts_with('{') || rest.starts_with('[') {
381            Some(
382                json5::from_str::<Value>(rest).map_err(|e| Error::InvalidDirectiveJson {
383                    message: e.to_string(),
384                })?,
385            )
386        } else {
387            Some(Value::String(rest.to_string()))
388        }
389    } else {
390        None
391    };
392
393    Ok(Some(Directive { ty, args }))
394}