Skip to main content

dmc_codegen/
html.rs

1use crate::{
2  NodeSink, WalkCtx, Walker,
3  escape::{escape_attr, escape_text, escape_url},
4};
5use dmc_diagnostic::Code;
6use dmc_parser::ast::*;
7use duck_diagnostic::{DiagnosticEngine, diag};
8
/// Rendering switches consumed by `HtmlEmitter`. Every option is off in
/// the `Default` value (the derived default of `bool` is `false`).
#[derive(Debug, Clone, Copy, Default)]
pub struct RenderOptions {
  /// GFM disallowed raw HTML extension. When enabled, a fixed tag-name
  /// set gets its leading `<` escaped in raw HTML output.
  pub gfm_disallowed_raw_html: bool,
}
15
/// Emits static HTML by reacting to walker enter/leave events. Container
/// nodes split into `open_tag` / `close_tag` halves; leaves write their
/// markup once on enter. Tables are rendered up-front in `enter Table`
/// (rows + cells aren't `Node` variants the walker can surface) and
/// `in_table_depth` suppresses subsequent walker events on cell content.
///
/// Owns its own `DiagnosticEngine` during the walk; merge into the
/// caller's engine via `into_parts` after the walk completes.
pub struct HtmlEmitter {
  /// Accumulated HTML output; every emitter method appends to it.
  out: String,
  /// Diagnostics raised while rendering (dropped expressions, malformed
  /// JSX tag names).
  diag_engine: DiagnosticEngine<Code>,
  /// Non-zero while the walker is inside a table that was already
  /// rendered up-front; walker events are ignored during that time.
  in_table_depth: usize,
  /// Rendering switches fixed at construction time.
  options: RenderOptions,
}
30
31impl NodeSink for HtmlEmitter {
32  fn enter(&mut self, node: &Node, ctx: &WalkCtx) {
33    if self.in_table_depth > 0 {
34      return;
35    }
36    self.maybe_separate_list_item_block_child(node, ctx);
37    match node {
38      Node::Text(t) => self.out.push_str(&escape_text(&t.value)),
39      Node::InlineCode(c) => {
40        self.out.push_str("<code>");
41        self.out.push_str(&escape_text(&c.value));
42        self.out.push_str("</code>");
43      },
44      Node::CodeBlock(cb) => self.code_block(cb),
45      Node::Image(i) => self.image(i),
46      Node::HorizontalRule(_) => self.out.push_str("<hr />\n"),
47      Node::HardBreak(_) => self.out.push_str("<br />\n"),
48      // Raw HTML: at block level we add a trailing `\n` to match the
49      // CM reference layout (each block sits on its own line). Inside
50      // a paragraph the same Html node represents an inline raw HTML
51      // span and must NOT inject a newline before `</p>`.
52      Node::Html(h) => {
53        let value =
54          if self.options.gfm_disallowed_raw_html { escape_disallowed_raw_html_tag(&h.value) } else { h.value.clone() };
55        self.out.push_str(&value);
56        let inline_context = matches!(ctx.parent, Some(Node::Paragraph(_)) | Some(Node::Heading(_)));
57        if !inline_context && !value.ends_with('\n') {
58          self.out.push('\n');
59        }
60      },
61      Node::SoftBreak(_) => self.out.push('\n'),
62      Node::JsxSelfClosing(s) => self.jsx_self_closing(s),
63      Node::JsxExpression(e) => {
64        // Trivial string-literal expressions (`{' '}`, `{"x"}`, `` {`y`} ``)
65        // are idiomatic MDX for inline whitespace / inserted text. They
66        // need no JS runtime, so render them as escaped text instead of
67        // dropping + warning. Only genuinely dynamic expressions
68        // (`{count}`, `{foo()}`) hit the GW002 path.
69        if let Some(text) = string_literal_expression(&e.value) {
70          self.out.push_str(&escape_text(&text));
71        } else {
72          self.diag(Code::HtmlExpressionDropped, format!("html: raw `{{...}}` expression dropped: {}", e.value.trim()));
73        }
74      },
75      Node::Table(t) => {
76        self.in_table_depth += 1;
77        self.inline_table(t);
78      },
79      Node::Frontmatter(_) | Node::Import(_) | Node::Export(_) => {},
80      _ => self.open_tag(node),
81    }
82  }
83
84  fn leave(&mut self, node: &Node, _ctx: &WalkCtx) {
85    if let Node::Table(_) = node {
86      self.in_table_depth = self.in_table_depth.saturating_sub(1);
87      return;
88    }
89    if self.in_table_depth > 0 {
90      return;
91    }
92    self.close_tag(node);
93  }
94}
95
96impl Default for HtmlEmitter {
97  fn default() -> Self {
98    Self::new()
99  }
100}
101
102impl HtmlEmitter {
103  pub fn new() -> Self {
104    Self::new_with_options(RenderOptions::default())
105  }
106
107  pub fn new_with_options(options: RenderOptions) -> Self {
108    Self { out: String::new(), diag_engine: DiagnosticEngine::new(), in_table_depth: 0, options }
109  }
110
111  pub fn into_string(self) -> String {
112    self.out
113  }
114
115  /// Take both buffers: the rendered HTML and the per-emitter diagnostic
116  /// engine. Caller merges the diags into a shared engine via
117  /// `outer.extend(diag)`.
118  pub fn into_parts(self) -> (String, DiagnosticEngine<Code>) {
119    (self.out, self.diag_engine)
120  }
121
122  /// Drive the walker; return `(html, diag)`. Use when no other sink
123  /// shares the walk.
124  pub fn render(doc: &Document) -> (String, DiagnosticEngine<Code>) {
125    let mut e = Self::new();
126    Walker::new(doc).walk(&mut [&mut e]);
127    e.into_parts()
128  }
129
130  pub fn render_with(doc: &Document, options: RenderOptions) -> (String, DiagnosticEngine<Code>) {
131    let mut e = Self::new_with_options(options);
132    Walker::new(doc).walk(&mut [&mut e]);
133    e.into_parts()
134  }
135
136  fn diag(&mut self, code: Code, message: impl Into<String>) {
137    self.diag_engine.emit(diag!(code, message.into()));
138  }
139
140  fn is_block_node(node: &Node) -> bool {
141    matches!(
142      node,
143      Node::Paragraph(_)
144        | Node::List(_)
145        | Node::Blockquote(_)
146        | Node::CodeBlock(_)
147        | Node::Heading(_)
148        | Node::HorizontalRule(_)
149        | Node::Table(_)
150        | Node::Html(_)
151    )
152  }
153
154  fn maybe_separate_list_item_block_child(&mut self, node: &Node, ctx: &WalkCtx) {
155    let Some(parent) = ctx.parent else {
156      return;
157    };
158    if !matches!(parent, Node::ListItem(_) | Node::TaskListItem(_)) || ctx.index == 0 || !Self::is_block_node(node) {
159      return;
160    }
161    let prev = Node::children_of(parent).get(ctx.index - 1);
162    if prev.is_some_and(|n| !Self::is_block_node(n)) && !self.out.ends_with('\n') {
163      self.out.push('\n');
164    }
165  }
166
167  // container open / close (walker fills the children in between)
168
169  /// Write the opening tag for a container node.
170  fn open_tag(&mut self, node: &Node) {
171    match node {
172      Node::Heading(h) => match &h.id {
173        Some(id) => self.out.push_str(&format!("<h{} id=\"{}\">", h.level, escape_attr(id))),
174        None => self.out.push_str(&format!("<h{}>", h.level)),
175      },
176      Node::Paragraph(_) => self.out.push_str("<p>"),
177      Node::Bold(_) => self.out.push_str("<strong>"),
178      Node::Italic(_) => self.out.push_str("<em>"),
179      Node::Strikethrough(_) => self.out.push_str("<del>"),
180      Node::Blockquote(_) => self.out.push_str("<blockquote>\n"),
181      Node::List(l) => {
182        let tag = if l.ordered { "ol" } else { "ul" };
183        self.out.push('<');
184        self.out.push_str(tag);
185        // remark-gfm tags any list with a `TaskListItem` child as
186        // `class="contains-task-list"` on the parent `<ul>` / `<ol>`.
187        if l.children.iter().any(|c| matches!(c, Node::TaskListItem(_))) {
188          self.out.push_str(" class=\"contains-task-list\"");
189        }
190        if l.ordered
191          && let Some(s) = l.start
192          && s != 1
193        {
194          self.out.push_str(&format!(" start=\"{}\"", s));
195        }
196        self.out.push_str(">\n");
197      },
198      // CM emits `<li>\n` when the item has block children (loose
199      // list / contains a paragraph). Tight items hug the inline
200      // content directly after `<li>`.
201      Node::ListItem(li) => {
202        let has_block_child = li.children.first().is_some_and(|c| {
203          matches!(
204            c,
205            Node::Paragraph(_)
206              | Node::List(_)
207              | Node::Blockquote(_)
208              | Node::CodeBlock(_)
209              | Node::Heading(_)
210              | Node::HorizontalRule(_)
211              | Node::Table(_)
212              | Node::Html(_)
213          )
214        });
215        if has_block_child {
216          self.out.push_str("<li>\n");
217        } else {
218          self.out.push_str("<li>");
219        }
220      },
221      Node::TaskListItem(t) => {
222        // HTML5 self-closes void elements implicitly - match remark-gfm's
223        // emitted markup which writes `<input type="checkbox" ...>` (no `/>`)
224        // and follows it with a literal space before the item content.
225        let checked = if t.checked { " checked" } else { "" };
226        self.out.push_str(&format!("<li class=\"task-list-item\"><input type=\"checkbox\"{} disabled> ", checked));
227      },
228      Node::Link(l) => {
229        self.out.push_str(&format!("<a href=\"{}\"", escape_attr(&escape_url(&l.href))));
230        // CM 6.3 / 4.7: link title becomes the `title` attribute on
231        // the anchor. (The autolink-headings transformer's tooltip
232        // currently borrows the same field; if it ever needs distinct
233        // semantics, route through a separate AST field.)
234        if let Some(title) = &l.title {
235          self.out.push_str(&format!(" title=\"{}\"", escape_attr(title)));
236        }
237        self.out.push('>');
238      },
239      Node::JsxElement(e) => {
240        if e.name.is_empty() {
241          self.diag(Code::MalformedJsxTagName, "html: JSX element has empty name; skipped".to_string());
242          return;
243        }
244        // GFM Disallowed Raw HTML extension: a fixed set of tag names
245        // get their leading `<` escaped (the tag wouldn't render
246        // safely in a browser). Affects open + close tags.
247        if self.options.gfm_disallowed_raw_html && is_disallowed_raw_html(&e.name) {
248          self.out.push_str("&lt;");
249        } else {
250          self.out.push('<');
251        }
252        self.out.push_str(&e.name);
253        for a in &e.attrs {
254          self.jsx_attr(a);
255        }
256        self.out.push('>');
257      },
258      Node::JsxFragment(_) => {},
259      _ => {},
260    }
261  }
262
263  /// Write the closing tag for a container node opened by `open_tag`.
264  /// Block-level closes get a trailing `\n` so the output matches the
265  /// CommonMark reference renderer's line-per-block layout.
266  fn close_tag(&mut self, node: &Node) {
267    match node {
268      Node::Heading(h) => self.out.push_str(&format!("</h{}>\n", h.level)),
269      Node::Paragraph(_) => self.out.push_str("</p>\n"),
270      Node::Bold(_) => self.out.push_str("</strong>"),
271      Node::Italic(_) => self.out.push_str("</em>"),
272      Node::Strikethrough(_) => self.out.push_str("</del>"),
273      Node::Blockquote(_) => self.out.push_str("</blockquote>\n"),
274      Node::List(l) => {
275        let tag = if l.ordered { "ol" } else { "ul" };
276        self.out.push_str(&format!("</{}>\n", tag));
277      },
278      Node::ListItem(_) | Node::TaskListItem(_) => self.out.push_str("</li>\n"),
279      Node::Link(_) => self.out.push_str("</a>"),
280      Node::JsxElement(e) if !e.name.is_empty() => {
281        if self.options.gfm_disallowed_raw_html && is_disallowed_raw_html(&e.name) {
282          self.out.push_str(&format!("&lt;/{}>", e.name));
283        } else {
284          self.out.push_str(&format!("</{}>", e.name));
285        }
286      },
287      Node::JsxFragment(_) => {},
288      _ => {},
289    }
290  }
291
292  // leaf-shaped emitters
293
294  fn code_block(&mut self, cb: &CodeBlock) {
295    self.out.push_str("<pre><code");
296    if let Some(lang) = &cb.lang {
297      // CM reference output uses the bare `language-{lang}` class.
298      self.out.push_str(&format!(" class=\"language-{}\"", escape_attr(lang)));
299    }
300    self.out.push('>');
301    self.out.push_str(&escape_text(&cb.value));
302    self.out.push_str("</code></pre>\n");
303  }
304
305  fn image(&mut self, i: &Image) {
306    self.out.push_str(&format!("<img src=\"{}\" alt=\"{}\"", escape_attr(&escape_url(&i.src)), escape_attr(&i.alt)));
307    if let Some(title) = &i.title {
308      self.out.push_str(&format!(" title=\"{}\"", escape_attr(title)));
309    }
310    // CM reference output uses the XHTML self-closing slash on `<img>`
311    // (matches `<hr />` / `<br />` style). Browsers treat both forms
312    // identically.
313    self.out.push_str(" />");
314  }
315
316  fn jsx_self_closing(&mut self, s: &JsxSelfClosing) {
317    if s.name.is_empty() {
318      self.diag(Code::MalformedJsxTagName, "html: self-closing JSX has empty name; skipped".to_string());
319      return;
320    }
321    match s.name.as_str() {
322      "MermaidSvg" => {
323        if let Some(attr) = s.attrs.iter().find(|a| a.name == "svg")
324          && let JsxAttrValue::String(svg) = &attr.value
325        {
326          self.out.push_str(svg);
327        }
328      },
329      "MathMl" => {
330        if let Some(attr) = s.attrs.iter().find(|a| a.name == "mathml")
331          && let JsxAttrValue::String(mathml) = &attr.value
332        {
333          // Reverse the JSX-attribute escape applied by Math::preprocess_source
334          // (`"` -> `&quot;`, `&` -> `&amp;`) before emitting raw HTML.
335          let unescaped = mathml.replace("&quot;", "\"").replace("&amp;", "&");
336          self.out.push_str(&unescaped);
337        }
338      },
339      "PackageManagerTabs" => {
340        self.out.push_str("<div class=\"gentledmc-pm-tabs\">");
341        for pm in ["npm", "yarn", "pnpm", "bun"] {
342          if let Some(attr) = s.attrs.iter().find(|a| a.name == pm)
343            && let JsxAttrValue::String(cmd) = &attr.value
344          {
345            self.out.push_str(&format!(
346              "<pre><code class=\"gentledmc-language-bash\" data-pm=\"{}\">{}</code></pre>",
347              pm,
348              escape_text(cmd)
349            ));
350          }
351        }
352        self.out.push_str("</div>");
353      },
354      _ => {
355        self.out.push('<');
356        self.out.push_str(&s.name);
357        for a in &s.attrs {
358          self.jsx_attr(a);
359        }
360        self.out.push_str(" />");
361      },
362    }
363  }
364
365  fn jsx_attr(&mut self, a: &JsxAttr) {
366    self.out.push(' ');
367    self.out.push_str(&a.name);
368    match &a.value {
369      // Match the rehype/shiki HTML output: boolean JSX attrs serialize
370      // as empty-string attributes (`data-rehype-pretty-code-figure=""`).
371      // It is semantically identical for the browser and keeps consumer
372      // selectors that key off `[attr=""]` working.
373      JsxAttrValue::Boolean => self.out.push_str("=\"\""),
374      JsxAttrValue::String(s) => self.out.push_str(&format!("=\"{}\"", escape_attr(s))),
375      JsxAttrValue::Expression(e) => self.out.push_str(&format!("={{{}}}", e)),
376      // Spread attributes have no HTML representation; drop them. The
377      // leading space pushed before the (empty) name comes back when
378      // we pop it.
379      JsxAttrValue::Spread(_) => {
380        self.out.pop();
381      },
382    }
383  }
384
385  // table inline path (walker can't surface row/cell events)
386
387  /// Render the entire `<table>...</table>` up-front. Cell content uses
388  /// `inline_node` recursion since the walker is suppressed inside.
389  fn inline_table(&mut self, t: &Table) {
390    self.out.push_str("<table>\n");
391    if let Some(header) = t.children.first() {
392      self.out.push_str("<thead>\n<tr>\n");
393      for (i, cell) in header.cells.iter().enumerate() {
394        self.inline_cell("th", cell, t.align.get(i).copied().unwrap_or(TableAlign::None));
395      }
396      self.out.push_str("</tr>\n</thead>\n");
397    }
398    if t.children.len() > 1 {
399      self.out.push_str("<tbody>\n");
400      for row in &t.children[1..] {
401        self.out.push_str("<tr>\n");
402        for (i, cell) in row.cells.iter().enumerate() {
403          self.inline_cell("td", cell, t.align.get(i).copied().unwrap_or(TableAlign::None));
404        }
405        self.out.push_str("</tr>\n");
406      }
407      self.out.push_str("</tbody>\n");
408    }
409    self.out.push_str("</table>\n");
410  }
411
412  fn inline_cell(&mut self, tag: &str, cell: &TableCell, align: TableAlign) {
413    self.out.push('<');
414    self.out.push_str(tag);
415    let align_str = match align {
416      TableAlign::Left => Some("left"),
417      TableAlign::Right => Some("right"),
418      TableAlign::Center => Some("center"),
419      TableAlign::None => None,
420    };
421    if let Some(a) = align_str {
422      self.out.push_str(&format!(" align=\"{}\"", a));
423    }
424    self.out.push('>');
425    for c in &cell.children {
426      self.inline_node(c);
427    }
428    self.out.push_str("</");
429    self.out.push_str(tag);
430    self.out.push_str(">\n");
431  }
432
433  /// Self-recursive render used only inside the table inline path. The
434  /// walker is suppressed via `in_table_depth`, so cell content doesn't
435  /// get a second pass.
436  fn inline_node(&mut self, node: &Node) {
437    match node {
438      Node::Text(t) => self.out.push_str(&escape_text(&t.value)),
439      Node::Bold(i) => self.wrap_tag("strong", &i.children),
440      Node::Italic(i) => self.wrap_tag("em", &i.children),
441      Node::Strikethrough(i) => self.wrap_tag("del", &i.children),
442      Node::InlineCode(c) => {
443        self.out.push_str("<code>");
444        self.out.push_str(&escape_text(&c.value));
445        self.out.push_str("</code>");
446      },
447      Node::Link(l) => {
448        self.out.push_str(&format!("<a href=\"{}\"", escape_attr(&escape_url(&l.href))));
449        if let Some(label) = &l.title {
450          self.out.push_str(&format!(" aria-label=\"{}\"", escape_attr(label)));
451        }
452        self.out.push('>');
453        for c in &l.children {
454          self.inline_node(c);
455        }
456        self.out.push_str("</a>");
457      },
458      Node::Image(i) => self.image(i),
459      Node::HardBreak(_) => self.out.push_str("<br />\n"),
460      Node::SoftBreak(_) => self.out.push('\n'),
461      Node::CodeBlock(cb) => self.code_block(cb),
462      _ => {
463        self.open_tag(node);
464        for kid in Node::children_of(node) {
465          self.inline_node(kid);
466        }
467        self.close_tag(node);
468      },
469    }
470  }
471
472  fn wrap_tag(&mut self, tag: &str, children: &[Node]) {
473    self.out.push('<');
474    self.out.push_str(tag);
475    self.out.push('>');
476    for c in children {
477      self.inline_node(c);
478    }
479    self.out.push_str("</");
480    self.out.push_str(tag);
481    self.out.push('>');
482  }
483}
484
/// GFM Disallowed Raw HTML extension: these tag names get their `<`
/// escaped to `&lt;` so they don't render in the browser. Comparison
/// is ASCII case-insensitive (`<XMP>` and `<xmp>` both match).
fn is_disallowed_raw_html(name: &str) -> bool {
  const DISALLOWED: [&str; 9] =
    ["title", "textarea", "style", "xmp", "iframe", "noembed", "noframes", "script", "plaintext"];
  // `eq_ignore_ascii_case` compares in place; no lowercase copy of `name`.
  DISALLOWED.iter().any(|tag| name.eq_ignore_ascii_case(tag))
}

/// Return `raw` with the `<` of every disallowed opening or closing tag
/// replaced by `&lt;`. All other text, including non-ASCII characters,
/// is copied through unchanged.
///
/// A tag name is the run of ASCII alphanumerics / `-` that follows `<`
/// or `</`; the whole run must equal a disallowed name, so `<title-x>`
/// is left alone.
fn escape_disallowed_raw_html_tag(raw: &str) -> String {
  let bytes = raw.as_bytes();
  let mut out = String::with_capacity(raw.len());
  // Start of the text that has not been copied to `out` yet. Text is
  // copied as `&str` slices so multi-byte UTF-8 sequences stay intact
  // (pushing single bytes as `char` would re-encode each byte).
  let mut pending = 0;
  for (lt, _) in raw.match_indices('<') {
    let mut end = lt + 1;
    if bytes.get(end) == Some(&b'/') {
      end += 1;
    }
    let name_start = end;
    while end < bytes.len() && (bytes[end].is_ascii_alphanumeric() || bytes[end] == b'-') {
      end += 1;
    }
    // `name_start` and `end` only ever step over ASCII bytes, so both
    // are char boundaries and the slice below cannot panic.
    if end > name_start && is_disallowed_raw_html(&raw[name_start..end]) {
      out.push_str(&raw[pending..lt]);
      out.push_str("&lt;");
      pending = lt + 1;
    }
  }
  out.push_str(&raw[pending..]);
  out
}
521
522pub fn render_html(doc: &Document) -> String {
523  let mut e = HtmlEmitter::new();
524  Walker::new(doc).walk(&mut [&mut e]);
525  e.into_string()
526}
527
528pub fn render_html_with(doc: &Document, options: RenderOptions) -> String {
529  let mut e = HtmlEmitter::new_with_options(options);
530  Walker::new(doc).walk(&mut [&mut e]);
531  e.into_string()
532}
533
/// Recognise a JSX expression whose entire body is a single string
/// literal (single-quoted, double-quoted, or backtick template with no
/// `${...}` interpolation). MDX authors use these as inline whitespace /
/// inserted text (`{' '}`, `{"x"}`, `` {`y`} ``); they need no JS
/// runtime, so the HTML emitter can lower them to plain text instead
/// of dropping + warning. Genuinely dynamic expressions (`{count}`,
/// `{foo()}`) return `None` and still trip GW002.
///
/// Returns the decoded text, or `None` when `raw` (after trimming) is
/// not exactly one literal. That includes:
/// - an unescaped delimiter inside the body, e.g. `'a' + 'b'`, which is
///   two literals joined by an operator;
/// - a trailing backslash, e.g. `'\'`, where the final quote is escaped
///   and the literal is therefore unterminated;
/// - an unescaped `${` inside a backtick template.
fn string_literal_expression(raw: &str) -> Option<String> {
  let s = raw.trim();
  let quote = s.chars().next().filter(|&c| matches!(c, '\'' | '"' | '`'))?;
  // Needs an opening and a closing delimiter of the same kind.
  if s.len() < 2 || !s.ends_with(quote) {
    return None;
  }
  // Both delimiters are one-byte ASCII, so these offsets are char boundaries.
  let inner = &s[1..s.len() - 1];

  // Decode the common JS escapes we expect to see in MDX prose:
  // `\n`, `\t`, `\r`, `\\`, `\'`, `\"`, `` \` ``. Anything else is
  // passed through verbatim - no need for full ECMA-262 escape
  // semantics here, the result is going straight into HTML text.
  let mut out = String::with_capacity(inner.len());
  let mut chars = inner.chars().peekable();
  while let Some(c) = chars.next() {
    match c {
      // `?`: a backslash with nothing after it escapes the closing quote.
      '\\' => match chars.next()? {
        'n' => out.push('\n'),
        't' => out.push('\t'),
        'r' => out.push('\r'),
        '\\' => out.push('\\'),
        '\'' => out.push('\''),
        '"' => out.push('"'),
        '`' => out.push('`'),
        other => {
          out.push('\\');
          out.push(other);
        },
      },
      // The literal ended before the end of the expression.
      c if c == quote => return None,
      // Template interpolation needs JS to evaluate. An escaped `\${`
      // never reaches this arm because the backslash arm consumes the `$`.
      '$' if quote == '`' && chars.peek() == Some(&'{') => return None,
      c => out.push(c),
    }
  }
  Some(out)
}
595
#[cfg(test)]
mod tests {
  use super::string_literal_expression as literal;

  /// Each quote style yields the text between the delimiters.
  #[test]
  fn recognises_simple_quoted_strings() {
    for (source, expected) in [("' '", " "), ("\"x\"", "x"), ("`y`", "y")] {
      assert_eq!(literal(source).as_deref(), Some(expected));
    }
  }

  /// A template with `${...}` needs a JS runtime and is not lowered.
  #[test]
  fn rejects_template_with_interpolation() {
    assert_eq!(literal("`hi ${name}`"), None);
  }

  /// Identifiers, calls and operators are not string literals.
  #[test]
  fn rejects_dynamic_expression() {
    for source in ["count", "foo()", "a + b"] {
      assert_eq!(literal(source), None);
    }
  }

  /// `\n` and `\\` inside a literal are decoded to the real characters.
  #[test]
  fn decodes_common_escapes() {
    for (source, expected) in [("'\\n'", "\n"), ("'\\\\'", "\\")] {
      assert_eq!(literal(source).as_deref(), Some(expected));
    }
  }
}