merman_core/preprocess/
mod.rs1use crate::{DetectorRegistry, Error, MermaidConfig, Result};
2use regex::Regex;
3use serde_json::Value;
4use std::borrow::Cow;
5use std::sync::OnceLock;
6
/// Defines a zero-argument function named `$name` that returns a lazily
/// compiled, shared [`Regex`] built from the literal `$pattern`.
///
/// The compiled regex lives in a function-local `OnceLock`, so the pattern is
/// compiled on the first call only and every later call returns the same
/// `&'static Regex`.
///
/// # Panics
///
/// The generated function panics on its first call if `$pattern` is not a
/// valid regular expression.
macro_rules! cached_regex {
    ($name:ident, $pattern:literal) => {
        fn $name() -> &'static Regex {
            static COMPILED: OnceLock<Regex> = OnceLock::new();
            COMPILED.get_or_init(|| {
                Regex::new($pattern).expect("preprocess regex must compile")
            })
        }
    };
}
15
16cached_regex!(re_crlf, r"\r\n?");
17cached_regex!(re_tag, r"<(\w+)([^>]*)>");
18cached_regex!(re_attr_eq_double_quoted, "=\"([^\"]*)\"");
19cached_regex!(re_style_hex, r"style.*:\S*#.*;");
20cached_regex!(re_classdef_hex, r"classDef.*:\S*#.*;");
21cached_regex!(re_entity, r"#\w+;");
22cached_regex!(re_int, r"^\+?\d+$");
23cached_regex!(
24 re_frontmatter,
25 r"(?s)^-{3}\s*[\n\r](.*?)[\n\r]-{3}\s*[\n\r]+"
26);
27
28#[derive(Debug, Clone)]
29pub struct PreprocessResult {
30 pub code: String,
31 pub title: Option<String>,
32 pub config: MermaidConfig,
33}
34
35pub fn preprocess_diagram(input: &str, registry: &DetectorRegistry) -> Result<PreprocessResult> {
36 preprocess_diagram_with_known_type(input, registry, None)
37}
38
39pub fn preprocess_diagram_with_known_type(
40 input: &str,
41 registry: &DetectorRegistry,
42 diagram_type: Option<&str>,
43) -> Result<PreprocessResult> {
44 let cleaned = cleanup_text(input);
45 let (without_frontmatter, title, mut frontmatter_config) =
46 process_frontmatter(cleaned.as_ref())?;
47 let (without_directives, directive_config) =
48 process_directives(without_frontmatter, registry, diagram_type)?;
49
50 frontmatter_config.deep_merge(directive_config.as_value());
51
52 let code = cleanup_comments(without_directives.as_ref());
53 Ok(PreprocessResult {
54 code: code.into_owned(),
55 title,
56 config: frontmatter_config,
57 })
58}
59
60fn cleanup_text(input: &str) -> Cow<'_, str> {
61 let mut s: Cow<'_, str> = if input.contains('\r') {
62 Cow::Owned(re_crlf().replace_all(input, "\n").into_owned())
63 } else {
64 Cow::Borrowed(input)
65 };
66
67 if s.contains('#') {
73 s = Cow::Owned(encode_mermaid_entities_like_upstream(s.as_ref()));
74 }
75
76 if s.contains('<') && s.contains("=\"") {
78 s = Cow::Owned(
79 re_tag()
80 .replace_all(s.as_ref(), |caps: ®ex::Captures| {
81 let tag = &caps[1];
82 let attrs = &caps[2];
83 let attrs = re_attr_eq_double_quoted().replace_all(attrs, "='$1'");
84 format!("<{tag}{attrs}>")
85 })
86 .into_owned(),
87 );
88 }
89
90 s
91}
92
93fn encode_mermaid_entities_like_upstream(text: &str) -> String {
94 if !text.contains('#') {
95 return text.to_string();
96 }
97
98 let mut txt = text.to_string();
104
105 if txt.contains("style") && txt.contains(';') {
106 txt = re_style_hex()
107 .replace_all(&txt, |caps: ®ex::Captures| {
108 let s = caps.get(0).map(|m| m.as_str()).unwrap_or_default();
109 s.strip_suffix(';').unwrap_or(s).to_string()
110 })
111 .to_string();
112 }
113
114 if txt.contains("classDef") && txt.contains(';') {
115 txt = re_classdef_hex()
116 .replace_all(&txt, |caps: ®ex::Captures| {
117 let s = caps.get(0).map(|m| m.as_str()).unwrap_or_default();
118 s.strip_suffix(';').unwrap_or(s).to_string()
119 })
120 .to_string();
121 }
122
123 if txt.contains(';') {
124 txt = re_entity()
125 .replace_all(&txt, |caps: ®ex::Captures| {
126 let s = caps.get(0).map(|m| m.as_str()).unwrap_or_default();
127 let inner = s
128 .strip_prefix('#')
129 .and_then(|s| s.strip_suffix(';'))
130 .unwrap_or("");
131 let is_int = re_int().is_match(inner);
132 if is_int {
133 format!("fl°°{inner}¶ß")
134 } else {
135 format!("fl°{inner}¶ß")
136 }
137 })
138 .to_string();
139 }
140
141 txt
142}
143
/// Removes `%%` comment lines and leading whitespace from `input`.
///
/// A line is dropped when, ignoring its leading whitespace, it starts with
/// `%%` but not with `%%{` (the latter opens a directive and is kept). The
/// line's own trailing newline is dropped with it. Leading whitespace of the
/// overall result is trimmed.
///
/// Returns a borrowed slice when `input` contains no `%%` at all.
fn cleanup_comments(input: &str) -> Cow<'_, str> {
    if !input.contains("%%") {
        return Cow::Borrowed(input.trim_start());
    }

    let is_plain_comment = |line: &str| {
        let body = line.trim_start();
        body.starts_with("%%") && !body.starts_with("%%{")
    };

    // `split_inclusive` keeps each line's newline attached, so surviving
    // lines are re-joined exactly as they appeared.
    let kept: String = input
        .split_inclusive('\n')
        .filter(|line| !is_plain_comment(line))
        .collect();

    Cow::Owned(kept.trim_start().to_owned())
}
158
159fn process_frontmatter(input: &str) -> Result<(&str, Option<String>, MermaidConfig)> {
160 if !input.trim_start().starts_with("---") {
161 return Ok((input, None, MermaidConfig::empty_object()));
162 }
163
164 let Some(caps) = re_frontmatter().captures(input) else {
165 return Ok((input, None, MermaidConfig::empty_object()));
166 };
167
168 let yaml_body = caps.get(1).map(|m| m.as_str()).unwrap_or_default();
169 let raw_yaml: serde_yaml::Value =
170 serde_yaml::from_str(yaml_body).map_err(|e| Error::InvalidFrontMatterYaml {
171 message: e.to_string(),
172 })?;
173
174 let parsed = serde_json::to_value(raw_yaml).unwrap_or(Value::Null);
175 let parsed_obj = parsed.as_object().cloned().unwrap_or_default();
176
177 let mut title = None;
178 let mut config_value = Value::Object(Default::default());
179 let mut display_mode = None;
180
181 if let Some(Value::String(t)) = parsed_obj.get("title") {
182 title = Some(t.clone());
183 }
184 if let Some(v) = parsed_obj.get("config") {
185 config_value = v.clone();
186 }
187 if let Some(Value::String(dm)) = parsed_obj.get("displayMode") {
188 display_mode = Some(dm.clone());
189 }
190
191 let mut config = MermaidConfig::empty_object();
192 config.deep_merge(&config_value);
193 if let Some(dm) = display_mode {
194 config.set_value("gantt.displayMode", Value::String(dm));
195 }
196
197 let stripped = &input[caps.get(0).unwrap().end()..];
198 Ok((stripped, title, config))
199}
200
201fn process_directives<'a>(
202 input: &'a str,
203 registry: &DetectorRegistry,
204 diagram_type: Option<&str>,
205) -> Result<(Cow<'a, str>, MermaidConfig)> {
206 let directives = detect_directives(input)?;
207 if directives.is_empty() {
208 return Ok((Cow::Borrowed(input), MermaidConfig::empty_object()));
209 }
210 let init = detect_init(&directives, input, registry, diagram_type)?;
211 let wrap = directives.iter().any(|d| d.ty == "wrap");
212
213 let mut merged = init;
214 if wrap {
215 merged.set_value("wrap", Value::Bool(true));
216 }
217
218 Ok((Cow::Owned(remove_directives(input)), merged))
219}
220
221fn detect_init(
222 directives: &[Directive],
223 input: &str,
224 registry: &DetectorRegistry,
225 diagram_type: Option<&str>,
226) -> Result<MermaidConfig> {
227 let mut merged = MermaidConfig::empty_object();
228 let mut config_for_detect = MermaidConfig::empty_object();
229
230 for d in directives {
231 if d.ty != "init" && d.ty != "initialize" {
232 continue;
233 }
234
235 let mut args = match &d.args {
236 Some(v) => v.clone(),
237 None => Value::Object(Default::default()),
238 };
239
240 sanitize_directive(&mut args);
241
242 if let Some(diagram_specific) = args.get("config").cloned() {
244 let detected = diagram_type.map(|t| t.to_string()).or_else(|| {
245 registry
246 .detect_type(input, &mut config_for_detect)
247 .ok()
248 .map(ToString::to_string)
249 });
250
251 if let Some(mut ty) = detected {
252 if ty == "flowchart-v2" {
253 ty = "flowchart".to_string();
254 }
255 if let Value::Object(obj) = &mut args {
256 obj.insert(ty, diagram_specific);
257 obj.remove("config");
258 }
259 }
260 }
261
262 merged.deep_merge(&args);
263 }
264
265 Ok(merged)
266}
267
268#[derive(Debug, Clone)]
269struct Directive {
270 ty: String,
271 args: Option<Value>,
272}
273
274fn detect_directives(input: &str) -> Result<Vec<Directive>> {
275 let mut out = Vec::new();
276 let mut pos = 0;
277 let trimmed = input.trim();
278 if !trimmed.contains("%%{") {
279 return Ok(out);
280 }
281
282 let text = trimmed.replace('\'', "\"");
285
286 while let Some(rel) = text[pos..].find("%%{") {
287 let start = pos + rel;
288 let content_start = start + 3;
289 let Some(rel_end) = text[content_start..].find("}%%") else {
290 break;
291 };
292 let content_end = content_start + rel_end;
293 let raw = text[content_start..content_end].trim();
294
295 if let Some(d) = parse_directive(raw)? {
296 out.push(d);
297 }
298
299 pos = content_end + 3;
300 }
301
302 Ok(out)
303}
304
305fn sanitize_directive(value: &mut Value) {
306 match value {
307 Value::Object(map) => {
308 map.remove("secure");
309 map.retain(|k, _| !k.starts_with("__"));
310 for (_, v) in map.iter_mut() {
311 sanitize_directive(v);
312 }
313 }
314 Value::Array(arr) => {
315 for v in arr {
316 sanitize_directive(v);
317 }
318 }
319 Value::String(s) => {
320 let blocked = s.contains('<') || s.contains('>') || s.contains("url(data:");
321 if blocked {
322 *s = String::new();
323 }
324 }
325 _ => {}
326 }
327}
328
/// Returns `text` with every `%%{ ... }%%` span removed.
///
/// Only the span from `%%{` through the next `}%%` is cut; surrounding text,
/// including any newline after the directive, is kept. If a `%%{` has no
/// closing `}%%`, everything from that `%%{` to the end of `text` is dropped.
fn remove_directives(text: &str) -> String {
    let mut kept = String::with_capacity(text.len());
    // `rest` is always the not-yet-copied tail of `text`.
    let mut rest = text;

    loop {
        let Some(open) = rest.find("%%{") else {
            kept.push_str(rest);
            return kept;
        };
        kept.push_str(&rest[..open]);

        let after_open = &rest[open + 3..];
        match after_open.find("}%%") {
            Some(close) => rest = &after_open[close + 3..],
            // Unterminated directive: discard the remainder.
            None => return kept,
        }
    }
}
346
347fn parse_directive(raw: &str) -> Result<Option<Directive>> {
348 let raw = raw.trim();
349 if raw.is_empty() {
350 return Ok(None);
351 }
352
353 let mut chars = raw.chars().peekable();
354 let mut ty = String::new();
355 while let Some(&c) = chars.peek() {
356 if c.is_ascii_alphanumeric() || c == '_' {
357 ty.push(c);
358 chars.next();
359 continue;
360 }
361 break;
362 }
363 if ty.is_empty() {
364 return Ok(None);
365 }
366
367 while matches!(chars.peek(), Some(c) if c.is_whitespace()) {
368 chars.next();
369 }
370
371 let args = if matches!(chars.peek(), Some(':')) {
372 chars.next();
373 while matches!(chars.peek(), Some(c) if c.is_whitespace()) {
374 chars.next();
375 }
376 let rest: String = chars.collect();
377 let rest = rest.trim();
378 if rest.is_empty() {
379 None
380 } else if rest.starts_with('{') || rest.starts_with('[') {
381 Some(
382 json5::from_str::<Value>(rest).map_err(|e| Error::InvalidDirectiveJson {
383 message: e.to_string(),
384 })?,
385 )
386 } else {
387 Some(Value::String(rest.to_string()))
388 }
389 } else {
390 None
391 };
392
393 Ok(Some(Directive { ty, args }))
394}