Skip to main content

dmc_codegen/
html.rs

1use crate::{
2  NodeSink, WalkCtx, Walker,
3  escape::{escape_attr, escape_text, escape_url, sanitize_url},
4};
5use dmc_diagnostic::Code;
6use dmc_parser::ast::*;
7use duck_diagnostic::{DiagnosticEngine, diag};
8
/// Switches that decide how raw, author-supplied HTML is treated when
/// rendering. Every flag defaults to `false`.
#[derive(Debug, Clone, Copy, Default)]
pub struct RenderOptions {
  /// Turns on the GFM "Disallowed Raw HTML" extension: for a fixed set
  /// of tag names the leading `<` is escaped in raw HTML output.
  pub gfm_disallowed_raw_html: bool,
  /// Opts into verbatim passthrough of raw embedded HTML (CommonMark
  /// "unsafe" mode).
  ///
  /// With the default (`false`) raw HTML is never emitted verbatim:
  /// block-level raw HTML is left out and inline raw HTML is escaped so
  /// it shows up as visible text. That keeps attacker-supplied
  /// `<script>` / `<iframe>` / `on*=` markup out of the output. When
  /// set to `true` the caller owns the XSS risk.
  pub allow_dangerous_html: bool,
}
22
23/// Static HTML emitter driven by walker enter/leave events. Tables are
24/// rendered up-front on `enter Table` (rows/cells aren't `Node` variants)
25/// and `in_table_depth` suppresses walker events on cell content.
26pub struct HtmlEmitter {
27  out: String,
28  diag_engine: DiagnosticEngine<Code>,
29  in_table_depth: usize,
30  options: RenderOptions,
31}
32
33impl NodeSink for HtmlEmitter {
34  fn enter(&mut self, node: &Node, ctx: &WalkCtx) {
35    if self.in_table_depth > 0 {
36      return;
37    }
38    self.maybe_separate_list_item_block_child(node, ctx);
39    match node {
40      Node::Text(t) => self.out.push_str(&escape_text(&t.value)),
41      Node::InlineCode(c) => {
42        self.out.push_str("<code>");
43        self.out.push_str(&escape_text(&c.value));
44        self.out.push_str("</code>");
45      },
46      Node::CodeBlock(cb) => self.code_block(cb),
47      Node::Image(i) => self.image(i),
48      Node::HorizontalRule(_) => self.out.push_str("<hr />\n"),
49      Node::HardBreak(_) => self.out.push_str("<br />\n"),
50      // Block-level raw HTML gets a trailing `\n` (CM line-per-block);
51      // inline raw HTML inside a paragraph/heading must not.
52      //
53      // SEC-002: raw HTML passthrough is gated behind `allow_dangerous_html`.
54      // When off (default, CommonMark "safe" mode): block-level raw HTML is
55      // omitted entirely; inline raw HTML is escaped to visible text.
56      Node::Html(h) => {
57        let inline_context = matches!(ctx.parent, Some(Node::Paragraph(_)) | Some(Node::Heading(_)));
58        if !self.options.allow_dangerous_html {
59          if inline_context {
60            self.out.push_str(&escape_text(&h.value));
61          }
62          // Block-level raw HTML: omitted entirely in safe mode.
63          return;
64        }
65        let value =
66          if self.options.gfm_disallowed_raw_html { escape_disallowed_raw_html_tag(&h.value) } else { h.value.clone() };
67        self.out.push_str(&value);
68        if !inline_context && !value.ends_with('\n') {
69          self.out.push('\n');
70        }
71      },
72      Node::SoftBreak(_) => self.out.push('\n'),
73      Node::JsxSelfClosing(s) => self.jsx_self_closing(s),
74      Node::JsxExpression(e) => {
75        // Lower trivial string-literal expressions (`{' '}`, `{"x"}`,
76        // `` {`y`} ``) to plain text; dynamic expressions still trip GW002.
77        if let Some(text) = string_literal_expression(&e.value) {
78          self.out.push_str(&escape_text(&text));
79        } else {
80          self.diag(Code::HtmlExpressionDropped, format!("html: raw `{{...}}` expression dropped: {}", e.value.trim()));
81        }
82      },
83      Node::Table(t) => {
84        self.in_table_depth += 1;
85        self.inline_table(t);
86      },
87      Node::Frontmatter(_) | Node::Import(_) | Node::Export(_) => {},
88      _ => self.open_tag(node),
89    }
90  }
91
92  fn leave(&mut self, node: &Node, _ctx: &WalkCtx) {
93    if let Node::Table(_) = node {
94      self.in_table_depth = self.in_table_depth.saturating_sub(1);
95      return;
96    }
97    if self.in_table_depth > 0 {
98      return;
99    }
100    self.close_tag(node);
101  }
102}
103
104impl Default for HtmlEmitter {
105  fn default() -> Self {
106    Self::new()
107  }
108}
109
110impl HtmlEmitter {
111  pub fn new() -> Self {
112    Self::new_with_options(RenderOptions::default())
113  }
114
115  pub fn new_with_options(options: RenderOptions) -> Self {
116    Self { out: String::new(), diag_engine: DiagnosticEngine::new(), in_table_depth: 0, options }
117  }
118
119  pub fn into_string(self) -> String {
120    self.out
121  }
122
123  /// Returned `DiagnosticEngine` is per-emitter; merge into a shared
124  /// engine via `outer.extend(diag)`.
125  pub fn into_parts(self) -> (String, DiagnosticEngine<Code>) {
126    (self.out, self.diag_engine)
127  }
128
129  /// Drive the walker; use when no other sink shares the walk.
130  pub fn render(doc: &Document) -> (String, DiagnosticEngine<Code>) {
131    let mut e = Self::new();
132    Walker::new(doc).walk(&mut [&mut e]);
133    e.into_parts()
134  }
135
136  pub fn render_with(doc: &Document, options: RenderOptions) -> (String, DiagnosticEngine<Code>) {
137    let mut e = Self::new_with_options(options);
138    Walker::new(doc).walk(&mut [&mut e]);
139    e.into_parts()
140  }
141
142  fn diag(&mut self, code: Code, message: impl Into<String>) {
143    self.diag_engine.emit(diag!(code, message.into()));
144  }
145
146  fn is_block_node(node: &Node) -> bool {
147    matches!(
148      node,
149      Node::Paragraph(_)
150        | Node::List(_)
151        | Node::Blockquote(_)
152        | Node::CodeBlock(_)
153        | Node::Heading(_)
154        | Node::HorizontalRule(_)
155        | Node::Table(_)
156        | Node::Html(_)
157    )
158  }
159
160  fn maybe_separate_list_item_block_child(&mut self, node: &Node, ctx: &WalkCtx) {
161    let Some(parent) = ctx.parent else {
162      return;
163    };
164    if !matches!(parent, Node::ListItem(_) | Node::TaskListItem(_)) || ctx.index == 0 || !Self::is_block_node(node) {
165      return;
166    }
167    let prev = Node::children_of(parent).get(ctx.index - 1);
168    if prev.is_some_and(|n| !Self::is_block_node(n)) && !self.out.ends_with('\n') {
169      self.out.push('\n');
170    }
171  }
172
173  fn open_tag(&mut self, node: &Node) {
174    match node {
175      Node::Heading(h) => match &h.id {
176        Some(id) => self.out.push_str(&format!("<h{} id=\"{}\">", h.level, escape_attr(id))),
177        None => self.out.push_str(&format!("<h{}>", h.level)),
178      },
179      Node::Paragraph(_) => self.out.push_str("<p>"),
180      Node::Bold(_) => self.out.push_str("<strong>"),
181      Node::Italic(_) => self.out.push_str("<em>"),
182      Node::Strikethrough(_) => self.out.push_str("<del>"),
183      Node::Blockquote(_) => self.out.push_str("<blockquote>\n"),
184      Node::List(l) => {
185        let tag = if l.ordered { "ol" } else { "ul" };
186        self.out.push('<');
187        self.out.push_str(tag);
188        // Match remark-gfm: parent gets `class="contains-task-list"`.
189        if l.children.iter().any(|c| matches!(c, Node::TaskListItem(_))) {
190          self.out.push_str(" class=\"contains-task-list\"");
191        }
192        if l.ordered
193          && let Some(s) = l.start
194          && s != 1
195        {
196          self.out.push_str(&format!(" start=\"{}\"", s));
197        }
198        self.out.push_str(">\n");
199      },
200      // CM: `<li>\n` for items with block children; tight items hug
201      // inline content.
202      Node::ListItem(li) => {
203        let has_block_child = li.children.first().is_some_and(|c| {
204          matches!(
205            c,
206            Node::Paragraph(_)
207              | Node::List(_)
208              | Node::Blockquote(_)
209              | Node::CodeBlock(_)
210              | Node::Heading(_)
211              | Node::HorizontalRule(_)
212              | Node::Table(_)
213              | Node::Html(_)
214          )
215        });
216        if has_block_child {
217          self.out.push_str("<li>\n");
218        } else {
219          self.out.push_str("<li>");
220        }
221      },
222      Node::TaskListItem(t) => {
223        // remark-gfm shape: `<input type="checkbox" ...>` (no `/>`) plus
224        // a literal trailing space before item content.
225        let checked = if t.checked { " checked" } else { "" };
226        self.out.push_str(&format!("<li class=\"task-list-item\"><input type=\"checkbox\"{} disabled> ", checked));
227      },
228      Node::Link(l) => {
229        self.out.push_str(&format!("<a href=\"{}\"", escape_attr(&escape_url(&sanitize_url(&l.href)))));
230        // CM 6.3 / 4.7: link title -> anchor `title` attribute.
231        // (autolink-headings tooltip borrows this same field.)
232        if let Some(title) = &l.title {
233          self.out.push_str(&format!(" title=\"{}\"", escape_attr(title)));
234        }
235        self.out.push('>');
236      },
237      Node::JsxElement(e) => {
238        if e.name.is_empty() {
239          self.diag(Code::MalformedJsxTagName, "html: JSX element has empty name; skipped".to_string());
240          return;
241        }
242        // GFM Disallowed Raw HTML: escape `<` on the fixed tag-name set.
243        if self.options.gfm_disallowed_raw_html && is_disallowed_raw_html(&e.name) {
244          self.out.push_str("&lt;");
245        } else {
246          self.out.push('<');
247        }
248        self.out.push_str(&e.name);
249        for a in &e.attrs {
250          self.jsx_attr(a);
251        }
252        self.out.push('>');
253      },
254      Node::JsxFragment(_) => {},
255      _ => {},
256    }
257  }
258
259  /// Block-level closes get a trailing `\n` to match CM's line-per-block
260  /// layout.
261  fn close_tag(&mut self, node: &Node) {
262    match node {
263      Node::Heading(h) => self.out.push_str(&format!("</h{}>\n", h.level)),
264      Node::Paragraph(_) => self.out.push_str("</p>\n"),
265      Node::Bold(_) => self.out.push_str("</strong>"),
266      Node::Italic(_) => self.out.push_str("</em>"),
267      Node::Strikethrough(_) => self.out.push_str("</del>"),
268      Node::Blockquote(_) => self.out.push_str("</blockquote>\n"),
269      Node::List(l) => {
270        let tag = if l.ordered { "ol" } else { "ul" };
271        self.out.push_str(&format!("</{}>\n", tag));
272      },
273      Node::ListItem(_) | Node::TaskListItem(_) => self.out.push_str("</li>\n"),
274      Node::Link(_) => self.out.push_str("</a>"),
275      Node::JsxElement(e) if !e.name.is_empty() => {
276        if self.options.gfm_disallowed_raw_html && is_disallowed_raw_html(&e.name) {
277          self.out.push_str(&format!("&lt;/{}>", e.name));
278        } else {
279          self.out.push_str(&format!("</{}>", e.name));
280        }
281      },
282      Node::JsxFragment(_) => {},
283      _ => {},
284    }
285  }
286
287  fn code_block(&mut self, cb: &CodeBlock) {
288    self.out.push_str("<pre><code");
289    if let Some(lang) = &cb.lang {
290      self.out.push_str(&format!(" class=\"language-{}\"", escape_attr(lang)));
291    }
292    self.out.push('>');
293    self.out.push_str(&escape_text(&cb.value));
294    self.out.push_str("</code></pre>\n");
295  }
296
297  fn image(&mut self, i: &Image) {
298    self.out.push_str(&format!(
299      "<img src=\"{}\" alt=\"{}\"",
300      escape_attr(&escape_url(&sanitize_url(&i.src))),
301      escape_attr(&i.alt)
302    ));
303    if let Some(title) = &i.title {
304      self.out.push_str(&format!(" title=\"{}\"", escape_attr(title)));
305    }
306    // CM reference uses XHTML self-closing on `<img>`.
307    self.out.push_str(" />");
308  }
309
310  fn jsx_self_closing(&mut self, s: &JsxSelfClosing) {
311    if s.name.is_empty() {
312      self.diag(Code::MalformedJsxTagName, "html: self-closing JSX has empty name; skipped".to_string());
313      return;
314    }
315    match s.name.as_str() {
316      // SEC-002: `MermaidSvg` / `MathMl` emit renderer-produced markup
317      // (an `<svg>` / `<math>` document) verbatim. Both are derived from
318      // attacker-influenced source (chart text / math source), so the
319      // raw passthrough is gated behind `allow_dangerous_html`. In safe
320      // mode the markup is dropped rather than escaped — an escaped SVG
321      // document is meaningless as visible text.
322      "MermaidSvg" => {
323        if self.options.allow_dangerous_html
324          && let Some(attr) = s.attrs.iter().find(|a| a.name == "svg")
325          && let JsxAttrValue::String(svg) = &attr.value
326        {
327          self.out.push_str(svg);
328        }
329      },
330      "MathMl" => {
331        if self.options.allow_dangerous_html
332          && let Some(attr) = s.attrs.iter().find(|a| a.name == "mathml")
333          && let JsxAttrValue::String(mathml) = &attr.value
334        {
335          // Reverse the JSX-attr escape from Math::preprocess_source.
336          let unescaped = mathml.replace("&quot;", "\"").replace("&amp;", "&");
337          self.out.push_str(&unescaped);
338        }
339      },
340      "PackageManagerTabs" => {
341        self.out.push_str("<div class=\"gentledmc-pm-tabs\">");
342        for pm in ["npm", "yarn", "pnpm", "bun"] {
343          if let Some(attr) = s.attrs.iter().find(|a| a.name == pm)
344            && let JsxAttrValue::String(cmd) = &attr.value
345          {
346            self.out.push_str(&format!(
347              "<pre><code class=\"gentledmc-language-bash\" data-pm=\"{}\">{}</code></pre>",
348              pm,
349              escape_text(cmd)
350            ));
351          }
352        }
353        self.out.push_str("</div>");
354      },
355      _ => {
356        self.out.push('<');
357        self.out.push_str(&s.name);
358        for a in &s.attrs {
359          self.jsx_attr(a);
360        }
361        self.out.push_str(" />");
362      },
363    }
364  }
365
366  fn jsx_attr(&mut self, a: &JsxAttr) {
367    self.out.push(' ');
368    self.out.push_str(&a.name);
369    match &a.value {
370      // Match rehype/shiki: boolean JSX attrs serialize as empty-string
371      // (`attr=""`) so consumer selectors keying off `[attr=""]` work.
372      JsxAttrValue::Boolean => self.out.push_str("=\"\""),
373      JsxAttrValue::String(s) => self.out.push_str(&format!("=\"{}\"", escape_attr(s))),
374      JsxAttrValue::Expression(e) => self.out.push_str(&format!("={{{}}}", e)),
375      // Spread has no HTML form; drop, and pop the leading space.
376      JsxAttrValue::Spread(_) => {
377        self.out.pop();
378      },
379    }
380  }
381
382  /// Render the entire `<table>...</table>` up-front; cell content uses
383  /// `inline_node` recursion since the walker is suppressed inside.
384  fn inline_table(&mut self, t: &Table) {
385    self.out.push_str("<table>\n");
386    if let Some(header) = t.children.first() {
387      self.out.push_str("<thead>\n<tr>\n");
388      for (i, cell) in header.cells.iter().enumerate() {
389        self.inline_cell("th", cell, t.align.get(i).copied().unwrap_or(TableAlign::None));
390      }
391      self.out.push_str("</tr>\n</thead>\n");
392    }
393    if t.children.len() > 1 {
394      self.out.push_str("<tbody>\n");
395      for row in &t.children[1..] {
396        self.out.push_str("<tr>\n");
397        for (i, cell) in row.cells.iter().enumerate() {
398          self.inline_cell("td", cell, t.align.get(i).copied().unwrap_or(TableAlign::None));
399        }
400        self.out.push_str("</tr>\n");
401      }
402      self.out.push_str("</tbody>\n");
403    }
404    self.out.push_str("</table>\n");
405  }
406
407  fn inline_cell(&mut self, tag: &str, cell: &TableCell, align: TableAlign) {
408    self.out.push('<');
409    self.out.push_str(tag);
410    let align_str = match align {
411      TableAlign::Left => Some("left"),
412      TableAlign::Right => Some("right"),
413      TableAlign::Center => Some("center"),
414      TableAlign::None => None,
415    };
416    if let Some(a) = align_str {
417      self.out.push_str(&format!(" align=\"{}\"", a));
418    }
419    self.out.push('>');
420    for c in &cell.children {
421      self.inline_node(c);
422    }
423    self.out.push_str("</");
424    self.out.push_str(tag);
425    self.out.push_str(">\n");
426  }
427
428  /// Self-recursive render for the table inline path (walker is
429  /// suppressed via `in_table_depth`).
430  fn inline_node(&mut self, node: &Node) {
431    match node {
432      Node::Text(t) => self.out.push_str(&escape_text(&t.value)),
433      Node::Bold(i) => self.wrap_tag("strong", &i.children),
434      Node::Italic(i) => self.wrap_tag("em", &i.children),
435      Node::Strikethrough(i) => self.wrap_tag("del", &i.children),
436      Node::InlineCode(c) => {
437        self.out.push_str("<code>");
438        self.out.push_str(&escape_text(&c.value));
439        self.out.push_str("</code>");
440      },
441      Node::Link(l) => {
442        self.out.push_str(&format!("<a href=\"{}\"", escape_attr(&escape_url(&sanitize_url(&l.href)))));
443        if let Some(label) = &l.title {
444          self.out.push_str(&format!(" aria-label=\"{}\"", escape_attr(label)));
445        }
446        self.out.push('>');
447        for c in &l.children {
448          self.inline_node(c);
449        }
450        self.out.push_str("</a>");
451      },
452      Node::Image(i) => self.image(i),
453      Node::HardBreak(_) => self.out.push_str("<br />\n"),
454      Node::SoftBreak(_) => self.out.push('\n'),
455      Node::CodeBlock(cb) => self.code_block(cb),
456      _ => {
457        self.open_tag(node);
458        for kid in Node::children_of(node) {
459          self.inline_node(kid);
460        }
461        self.close_tag(node);
462      },
463    }
464  }
465
466  fn wrap_tag(&mut self, tag: &str, children: &[Node]) {
467    self.out.push('<');
468    self.out.push_str(tag);
469    self.out.push('>');
470    for c in children {
471      self.inline_node(c);
472    }
473    self.out.push_str("</");
474    self.out.push_str(tag);
475    self.out.push('>');
476  }
477}
478
/// GFM Disallowed Raw HTML tag set. ASCII case-insensitive.
fn is_disallowed_raw_html(name: &str) -> bool {
  const DISALLOWED: [&str; 9] =
    ["title", "textarea", "style", "xmp", "iframe", "noembed", "noframes", "script", "plaintext"];
  // Compare in place rather than allocating a lowercased copy per call.
  DISALLOWED.iter().any(|tag| name.eq_ignore_ascii_case(tag))
}

/// Applies the GFM Disallowed Raw HTML filter to a chunk of raw HTML.
///
/// Every `<` that starts an opening or closing tag (`<name` / `</name`)
/// whose name is in the disallowed set is replaced with `&lt;`. All other
/// text — including non-ASCII characters — is copied through unchanged.
fn escape_disallowed_raw_html_tag(raw: &str) -> String {
  let mut out = String::with_capacity(raw.len());
  let mut rest = raw;
  // Copy whole `&str` slices between `<` characters. Pushing individual
  // bytes as `char`s would reinterpret every UTF-8 byte as a Latin-1 code
  // point and corrupt multi-byte characters.
  while let Some(lt) = rest.find('<') {
    out.push_str(&rest[..lt]);
    let after_lt = &rest[lt + 1..];
    // An optional `/` (closing tag) may precede the tag name.
    let candidate = after_lt.strip_prefix('/').unwrap_or(after_lt);
    // Tag-name characters are ASCII, so the byte count is a valid char
    // boundary for slicing.
    let name_len = candidate.bytes().take_while(|b| b.is_ascii_alphanumeric() || *b == b'-').count();
    if name_len > 0 && is_disallowed_raw_html(&candidate[..name_len]) {
      out.push_str("&lt;");
    } else {
      out.push('<');
    }
    // Only the `<` is consumed; the rest of the tag is ordinary text.
    rest = after_lt;
  }
  out.push_str(rest);
  out
}
512
513pub fn render_html(doc: &Document) -> String {
514  let mut e = HtmlEmitter::new();
515  Walker::new(doc).walk(&mut [&mut e]);
516  e.into_string()
517}
518
519pub fn render_html_with(doc: &Document, options: RenderOptions) -> String {
520  let mut e = HtmlEmitter::new_with_options(options);
521  Walker::new(doc).walk(&mut [&mut e]);
522  e.into_string()
523}
524
/// Match a JSX expression whose entire body is a single string literal
/// (single/double-quoted, or backtick template without `${...}`) and
/// return its decoded text. Used to lower idiomatic `{' '}` / `{"x"}` to
/// plain text.
///
/// Returns `None` for anything that is not exactly one literal:
///
/// * the trimmed input does not start and end with the same quote;
/// * the body contains an unescaped occurrence of that quote, e.g.
///   `'a' + 'b'`, which is an expression rather than a literal;
/// * a template literal contains an unescaped `${`, which would need JS
///   to evaluate.
///
/// Common escapes (`\n`, `\t`, `\r`, `\\`, `\'`, `\"`, `` \` ``) are
/// decoded; unknown escapes pass through verbatim, since full ECMA-262
/// escape semantics are not needed here.
fn string_literal_expression(raw: &str) -> Option<String> {
  let s = raw.trim();
  let quote = s.chars().next().filter(|c| matches!(c, '\'' | '"' | '`'))?;
  // Need at least an opening and a closing quote. Both are single-byte
  // ASCII, so slicing them off below is char-boundary safe.
  if s.len() < 2 || !s.ends_with(quote) {
    return None;
  }
  let inner = &s[1..s.len() - 1];

  let mut out = String::with_capacity(inner.len());
  let mut chars = inner.chars().peekable();
  while let Some(c) = chars.next() {
    match c {
      '\\' => match chars.next() {
        Some('n') => out.push('\n'),
        Some('t') => out.push('\t'),
        Some('r') => out.push('\r'),
        Some('\\') => out.push('\\'),
        Some('\'') => out.push('\''),
        Some('"') => out.push('"'),
        Some('`') => out.push('`'),
        Some(other) => {
          out.push('\\');
          out.push(other);
        },
        // A lone trailing backslash is kept as-is.
        None => out.push('\\'),
      },
      // An unescaped delimiter ends the literal early, so the input is
      // more than one literal (`'a' + 'b'`).
      _ if c == quote => return None,
      // Reject unescaped `${` in templates - those need JS to evaluate.
      '$' if quote == '`' && chars.peek() == Some(&'{') => return None,
      _ => out.push(c),
    }
  }
  Some(out)
}
579
#[cfg(test)]
mod tests {
  use super::string_literal_expression;

  /// Plain quoted strings and interpolation-free templates are lowered
  /// to their contents.
  #[test]
  fn recognises_simple_quoted_strings() {
    let cases = [("' '", " "), ("\"x\"", "x"), ("`y`", "y")];
    for (source, expected) in cases {
      assert_eq!(string_literal_expression(source).as_deref(), Some(expected), "input: {source}");
    }
  }

  /// A template with `${...}` needs JS evaluation and is not lowered.
  #[test]
  fn rejects_template_with_interpolation() {
    assert_eq!(string_literal_expression("`hi ${name}`"), None);
  }

  /// Identifiers, calls and operators are not string literals.
  #[test]
  fn rejects_dynamic_expression() {
    for source in ["count", "foo()", "a + b"] {
      assert_eq!(string_literal_expression(source), None, "input: {source}");
    }
  }

  /// Backslash escapes are decoded to the characters they stand for.
  #[test]
  fn decodes_common_escapes() {
    let cases = [("'\\n'", "\n"), ("'\\\\'", "\\")];
    for (source, expected) in cases {
      assert_eq!(string_literal_expression(source).as_deref(), Some(expected), "input: {source}");
    }
  }
}