Skip to main content

dmc_codegen/
html.rs

1use crate::{
2  NodeSink, WalkCtx, Walker,
3  escape::{escape_attr, escape_text, escape_url},
4};
5use dmc_diagnostic::Code;
6use dmc_parser::ast::*;
7use duck_diagnostic::{DiagnosticEngine, diag};
8
/// Rendering switches for [`HtmlEmitter`]. The `Default` value leaves every
/// extension off.
#[derive(Debug, Clone, Copy, Default)]
pub struct RenderOptions {
  /// GFM disallowed raw HTML extension. When enabled, a fixed tag-name
  /// set gets its leading `<` escaped in raw HTML output.
  ///
  /// The emitter consults this flag both for `Node::Html` values and for
  /// `Node::JsxElement` open/close tags.
  pub gfm_disallowed_raw_html: bool,
}
15
/// Static HTML emitter driven by walker enter/leave events. Tables are
/// rendered up-front on `enter Table` (rows/cells aren't `Node` variants)
/// and `in_table_depth` suppresses walker events on cell content.
pub struct HtmlEmitter {
  // HTML accumulated so far; handed back by `into_string` / `into_parts`.
  out: String,
  // Diagnostics raised while rendering (e.g. dropped JSX expressions).
  diag_engine: DiagnosticEngine<Code>,
  // Incremented on `enter Table`, decremented on `leave Table`; while it is
  // non-zero all other walker events are ignored.
  in_table_depth: usize,
  // Feature switches supplied at construction time.
  options: RenderOptions,
}
25
26impl NodeSink for HtmlEmitter {
27  fn enter(&mut self, node: &Node, ctx: &WalkCtx) {
28    if self.in_table_depth > 0 {
29      return;
30    }
31    self.maybe_separate_list_item_block_child(node, ctx);
32    match node {
33      Node::Text(t) => self.out.push_str(&escape_text(&t.value)),
34      Node::InlineCode(c) => {
35        self.out.push_str("<code>");
36        self.out.push_str(&escape_text(&c.value));
37        self.out.push_str("</code>");
38      },
39      Node::CodeBlock(cb) => self.code_block(cb),
40      Node::Image(i) => self.image(i),
41      Node::HorizontalRule(_) => self.out.push_str("<hr />\n"),
42      Node::HardBreak(_) => self.out.push_str("<br />\n"),
43      // Block-level raw HTML gets a trailing `\n` (CM line-per-block);
44      // inline raw HTML inside a paragraph/heading must not.
45      Node::Html(h) => {
46        let value =
47          if self.options.gfm_disallowed_raw_html { escape_disallowed_raw_html_tag(&h.value) } else { h.value.clone() };
48        self.out.push_str(&value);
49        let inline_context = matches!(ctx.parent, Some(Node::Paragraph(_)) | Some(Node::Heading(_)));
50        if !inline_context && !value.ends_with('\n') {
51          self.out.push('\n');
52        }
53      },
54      Node::SoftBreak(_) => self.out.push('\n'),
55      Node::JsxSelfClosing(s) => self.jsx_self_closing(s),
56      Node::JsxExpression(e) => {
57        // Lower trivial string-literal expressions (`{' '}`, `{"x"}`,
58        // `` {`y`} ``) to plain text; dynamic expressions still trip GW002.
59        if let Some(text) = string_literal_expression(&e.value) {
60          self.out.push_str(&escape_text(&text));
61        } else {
62          self.diag(Code::HtmlExpressionDropped, format!("html: raw `{{...}}` expression dropped: {}", e.value.trim()));
63        }
64      },
65      Node::Table(t) => {
66        self.in_table_depth += 1;
67        self.inline_table(t);
68      },
69      Node::Frontmatter(_) | Node::Import(_) | Node::Export(_) => {},
70      _ => self.open_tag(node),
71    }
72  }
73
74  fn leave(&mut self, node: &Node, _ctx: &WalkCtx) {
75    if let Node::Table(_) = node {
76      self.in_table_depth = self.in_table_depth.saturating_sub(1);
77      return;
78    }
79    if self.in_table_depth > 0 {
80      return;
81    }
82    self.close_tag(node);
83  }
84}
85
86impl Default for HtmlEmitter {
87  fn default() -> Self {
88    Self::new()
89  }
90}
91
92impl HtmlEmitter {
93  pub fn new() -> Self {
94    Self::new_with_options(RenderOptions::default())
95  }
96
97  pub fn new_with_options(options: RenderOptions) -> Self {
98    Self { out: String::new(), diag_engine: DiagnosticEngine::new(), in_table_depth: 0, options }
99  }
100
101  pub fn into_string(self) -> String {
102    self.out
103  }
104
105  /// Returned `DiagnosticEngine` is per-emitter; merge into a shared
106  /// engine via `outer.extend(diag)`.
107  pub fn into_parts(self) -> (String, DiagnosticEngine<Code>) {
108    (self.out, self.diag_engine)
109  }
110
111  /// Drive the walker; use when no other sink shares the walk.
112  pub fn render(doc: &Document) -> (String, DiagnosticEngine<Code>) {
113    let mut e = Self::new();
114    Walker::new(doc).walk(&mut [&mut e]);
115    e.into_parts()
116  }
117
118  pub fn render_with(doc: &Document, options: RenderOptions) -> (String, DiagnosticEngine<Code>) {
119    let mut e = Self::new_with_options(options);
120    Walker::new(doc).walk(&mut [&mut e]);
121    e.into_parts()
122  }
123
124  fn diag(&mut self, code: Code, message: impl Into<String>) {
125    self.diag_engine.emit(diag!(code, message.into()));
126  }
127
128  fn is_block_node(node: &Node) -> bool {
129    matches!(
130      node,
131      Node::Paragraph(_)
132        | Node::List(_)
133        | Node::Blockquote(_)
134        | Node::CodeBlock(_)
135        | Node::Heading(_)
136        | Node::HorizontalRule(_)
137        | Node::Table(_)
138        | Node::Html(_)
139    )
140  }
141
142  fn maybe_separate_list_item_block_child(&mut self, node: &Node, ctx: &WalkCtx) {
143    let Some(parent) = ctx.parent else {
144      return;
145    };
146    if !matches!(parent, Node::ListItem(_) | Node::TaskListItem(_)) || ctx.index == 0 || !Self::is_block_node(node) {
147      return;
148    }
149    let prev = Node::children_of(parent).get(ctx.index - 1);
150    if prev.is_some_and(|n| !Self::is_block_node(n)) && !self.out.ends_with('\n') {
151      self.out.push('\n');
152    }
153  }
154
155  fn open_tag(&mut self, node: &Node) {
156    match node {
157      Node::Heading(h) => match &h.id {
158        Some(id) => self.out.push_str(&format!("<h{} id=\"{}\">", h.level, escape_attr(id))),
159        None => self.out.push_str(&format!("<h{}>", h.level)),
160      },
161      Node::Paragraph(_) => self.out.push_str("<p>"),
162      Node::Bold(_) => self.out.push_str("<strong>"),
163      Node::Italic(_) => self.out.push_str("<em>"),
164      Node::Strikethrough(_) => self.out.push_str("<del>"),
165      Node::Blockquote(_) => self.out.push_str("<blockquote>\n"),
166      Node::List(l) => {
167        let tag = if l.ordered { "ol" } else { "ul" };
168        self.out.push('<');
169        self.out.push_str(tag);
170        // Match remark-gfm: parent gets `class="contains-task-list"`.
171        if l.children.iter().any(|c| matches!(c, Node::TaskListItem(_))) {
172          self.out.push_str(" class=\"contains-task-list\"");
173        }
174        if l.ordered
175          && let Some(s) = l.start
176          && s != 1
177        {
178          self.out.push_str(&format!(" start=\"{}\"", s));
179        }
180        self.out.push_str(">\n");
181      },
182      // CM: `<li>\n` for items with block children; tight items hug
183      // inline content.
184      Node::ListItem(li) => {
185        let has_block_child = li.children.first().is_some_and(|c| {
186          matches!(
187            c,
188            Node::Paragraph(_)
189              | Node::List(_)
190              | Node::Blockquote(_)
191              | Node::CodeBlock(_)
192              | Node::Heading(_)
193              | Node::HorizontalRule(_)
194              | Node::Table(_)
195              | Node::Html(_)
196          )
197        });
198        if has_block_child {
199          self.out.push_str("<li>\n");
200        } else {
201          self.out.push_str("<li>");
202        }
203      },
204      Node::TaskListItem(t) => {
205        // remark-gfm shape: `<input type="checkbox" ...>` (no `/>`) plus
206        // a literal trailing space before item content.
207        let checked = if t.checked { " checked" } else { "" };
208        self.out.push_str(&format!("<li class=\"task-list-item\"><input type=\"checkbox\"{} disabled> ", checked));
209      },
210      Node::Link(l) => {
211        self.out.push_str(&format!("<a href=\"{}\"", escape_attr(&escape_url(&l.href))));
212        // CM 6.3 / 4.7: link title -> anchor `title` attribute.
213        // (autolink-headings tooltip borrows this same field.)
214        if let Some(title) = &l.title {
215          self.out.push_str(&format!(" title=\"{}\"", escape_attr(title)));
216        }
217        self.out.push('>');
218      },
219      Node::JsxElement(e) => {
220        if e.name.is_empty() {
221          self.diag(Code::MalformedJsxTagName, "html: JSX element has empty name; skipped".to_string());
222          return;
223        }
224        // GFM Disallowed Raw HTML: escape `<` on the fixed tag-name set.
225        if self.options.gfm_disallowed_raw_html && is_disallowed_raw_html(&e.name) {
226          self.out.push_str("&lt;");
227        } else {
228          self.out.push('<');
229        }
230        self.out.push_str(&e.name);
231        for a in &e.attrs {
232          self.jsx_attr(a);
233        }
234        self.out.push('>');
235      },
236      Node::JsxFragment(_) => {},
237      _ => {},
238    }
239  }
240
241  /// Block-level closes get a trailing `\n` to match CM's line-per-block
242  /// layout.
243  fn close_tag(&mut self, node: &Node) {
244    match node {
245      Node::Heading(h) => self.out.push_str(&format!("</h{}>\n", h.level)),
246      Node::Paragraph(_) => self.out.push_str("</p>\n"),
247      Node::Bold(_) => self.out.push_str("</strong>"),
248      Node::Italic(_) => self.out.push_str("</em>"),
249      Node::Strikethrough(_) => self.out.push_str("</del>"),
250      Node::Blockquote(_) => self.out.push_str("</blockquote>\n"),
251      Node::List(l) => {
252        let tag = if l.ordered { "ol" } else { "ul" };
253        self.out.push_str(&format!("</{}>\n", tag));
254      },
255      Node::ListItem(_) | Node::TaskListItem(_) => self.out.push_str("</li>\n"),
256      Node::Link(_) => self.out.push_str("</a>"),
257      Node::JsxElement(e) if !e.name.is_empty() => {
258        if self.options.gfm_disallowed_raw_html && is_disallowed_raw_html(&e.name) {
259          self.out.push_str(&format!("&lt;/{}>", e.name));
260        } else {
261          self.out.push_str(&format!("</{}>", e.name));
262        }
263      },
264      Node::JsxFragment(_) => {},
265      _ => {},
266    }
267  }
268
269  fn code_block(&mut self, cb: &CodeBlock) {
270    self.out.push_str("<pre><code");
271    if let Some(lang) = &cb.lang {
272      self.out.push_str(&format!(" class=\"language-{}\"", escape_attr(lang)));
273    }
274    self.out.push('>');
275    self.out.push_str(&escape_text(&cb.value));
276    self.out.push_str("</code></pre>\n");
277  }
278
279  fn image(&mut self, i: &Image) {
280    self.out.push_str(&format!("<img src=\"{}\" alt=\"{}\"", escape_attr(&escape_url(&i.src)), escape_attr(&i.alt)));
281    if let Some(title) = &i.title {
282      self.out.push_str(&format!(" title=\"{}\"", escape_attr(title)));
283    }
284    // CM reference uses XHTML self-closing on `<img>`.
285    self.out.push_str(" />");
286  }
287
288  fn jsx_self_closing(&mut self, s: &JsxSelfClosing) {
289    if s.name.is_empty() {
290      self.diag(Code::MalformedJsxTagName, "html: self-closing JSX has empty name; skipped".to_string());
291      return;
292    }
293    match s.name.as_str() {
294      "MermaidSvg" => {
295        if let Some(attr) = s.attrs.iter().find(|a| a.name == "svg")
296          && let JsxAttrValue::String(svg) = &attr.value
297        {
298          self.out.push_str(svg);
299        }
300      },
301      "MathMl" => {
302        if let Some(attr) = s.attrs.iter().find(|a| a.name == "mathml")
303          && let JsxAttrValue::String(mathml) = &attr.value
304        {
305          // Reverse the JSX-attr escape from Math::preprocess_source.
306          let unescaped = mathml.replace("&quot;", "\"").replace("&amp;", "&");
307          self.out.push_str(&unescaped);
308        }
309      },
310      "PackageManagerTabs" => {
311        self.out.push_str("<div class=\"gentledmc-pm-tabs\">");
312        for pm in ["npm", "yarn", "pnpm", "bun"] {
313          if let Some(attr) = s.attrs.iter().find(|a| a.name == pm)
314            && let JsxAttrValue::String(cmd) = &attr.value
315          {
316            self.out.push_str(&format!(
317              "<pre><code class=\"gentledmc-language-bash\" data-pm=\"{}\">{}</code></pre>",
318              pm,
319              escape_text(cmd)
320            ));
321          }
322        }
323        self.out.push_str("</div>");
324      },
325      _ => {
326        self.out.push('<');
327        self.out.push_str(&s.name);
328        for a in &s.attrs {
329          self.jsx_attr(a);
330        }
331        self.out.push_str(" />");
332      },
333    }
334  }
335
336  fn jsx_attr(&mut self, a: &JsxAttr) {
337    self.out.push(' ');
338    self.out.push_str(&a.name);
339    match &a.value {
340      // Match rehype/shiki: boolean JSX attrs serialize as empty-string
341      // (`attr=""`) so consumer selectors keying off `[attr=""]` work.
342      JsxAttrValue::Boolean => self.out.push_str("=\"\""),
343      JsxAttrValue::String(s) => self.out.push_str(&format!("=\"{}\"", escape_attr(s))),
344      JsxAttrValue::Expression(e) => self.out.push_str(&format!("={{{}}}", e)),
345      // Spread has no HTML form; drop, and pop the leading space.
346      JsxAttrValue::Spread(_) => {
347        self.out.pop();
348      },
349    }
350  }
351
352  /// Render the entire `<table>...</table>` up-front; cell content uses
353  /// `inline_node` recursion since the walker is suppressed inside.
354  fn inline_table(&mut self, t: &Table) {
355    self.out.push_str("<table>\n");
356    if let Some(header) = t.children.first() {
357      self.out.push_str("<thead>\n<tr>\n");
358      for (i, cell) in header.cells.iter().enumerate() {
359        self.inline_cell("th", cell, t.align.get(i).copied().unwrap_or(TableAlign::None));
360      }
361      self.out.push_str("</tr>\n</thead>\n");
362    }
363    if t.children.len() > 1 {
364      self.out.push_str("<tbody>\n");
365      for row in &t.children[1..] {
366        self.out.push_str("<tr>\n");
367        for (i, cell) in row.cells.iter().enumerate() {
368          self.inline_cell("td", cell, t.align.get(i).copied().unwrap_or(TableAlign::None));
369        }
370        self.out.push_str("</tr>\n");
371      }
372      self.out.push_str("</tbody>\n");
373    }
374    self.out.push_str("</table>\n");
375  }
376
377  fn inline_cell(&mut self, tag: &str, cell: &TableCell, align: TableAlign) {
378    self.out.push('<');
379    self.out.push_str(tag);
380    let align_str = match align {
381      TableAlign::Left => Some("left"),
382      TableAlign::Right => Some("right"),
383      TableAlign::Center => Some("center"),
384      TableAlign::None => None,
385    };
386    if let Some(a) = align_str {
387      self.out.push_str(&format!(" align=\"{}\"", a));
388    }
389    self.out.push('>');
390    for c in &cell.children {
391      self.inline_node(c);
392    }
393    self.out.push_str("</");
394    self.out.push_str(tag);
395    self.out.push_str(">\n");
396  }
397
398  /// Self-recursive render for the table inline path (walker is
399  /// suppressed via `in_table_depth`).
400  fn inline_node(&mut self, node: &Node) {
401    match node {
402      Node::Text(t) => self.out.push_str(&escape_text(&t.value)),
403      Node::Bold(i) => self.wrap_tag("strong", &i.children),
404      Node::Italic(i) => self.wrap_tag("em", &i.children),
405      Node::Strikethrough(i) => self.wrap_tag("del", &i.children),
406      Node::InlineCode(c) => {
407        self.out.push_str("<code>");
408        self.out.push_str(&escape_text(&c.value));
409        self.out.push_str("</code>");
410      },
411      Node::Link(l) => {
412        self.out.push_str(&format!("<a href=\"{}\"", escape_attr(&escape_url(&l.href))));
413        if let Some(label) = &l.title {
414          self.out.push_str(&format!(" aria-label=\"{}\"", escape_attr(label)));
415        }
416        self.out.push('>');
417        for c in &l.children {
418          self.inline_node(c);
419        }
420        self.out.push_str("</a>");
421      },
422      Node::Image(i) => self.image(i),
423      Node::HardBreak(_) => self.out.push_str("<br />\n"),
424      Node::SoftBreak(_) => self.out.push('\n'),
425      Node::CodeBlock(cb) => self.code_block(cb),
426      _ => {
427        self.open_tag(node);
428        for kid in Node::children_of(node) {
429          self.inline_node(kid);
430        }
431        self.close_tag(node);
432      },
433    }
434  }
435
436  fn wrap_tag(&mut self, tag: &str, children: &[Node]) {
437    self.out.push('<');
438    self.out.push_str(tag);
439    self.out.push('>');
440    for c in children {
441      self.inline_node(c);
442    }
443    self.out.push_str("</");
444    self.out.push_str(tag);
445    self.out.push('>');
446  }
447}
448
/// GFM Disallowed Raw HTML tag set. ASCII case-insensitive.
///
/// Returns `true` when `name` (a bare tag name, without `<` or `/`) is one
/// of the nine tag names the extension filters.
fn is_disallowed_raw_html(name: &str) -> bool {
  const DISALLOWED: [&str; 9] =
    ["title", "textarea", "style", "xmp", "iframe", "noembed", "noframes", "script", "plaintext"];
  // Compare in place; no lowercase copy of `name` is needed.
  DISALLOWED.iter().any(|tag| tag.eq_ignore_ascii_case(name))
}

/// Replace the `<` of every disallowed opening or closing tag in `raw`
/// with `&lt;`, leaving all other text untouched.
///
/// A tag name is the run of ASCII alphanumerics and `-` directly after `<`
/// or `</`; it must match a disallowed name in full, so `<scripts>` is not
/// affected.
///
/// The input is copied in whole slices between matches, so multi-byte
/// UTF-8 text is preserved exactly. Every slice boundary falls on an ASCII
/// `<`, which is always a character boundary.
fn escape_disallowed_raw_html_tag(raw: &str) -> String {
  let bytes = raw.as_bytes();
  let mut out = String::with_capacity(raw.len());
  // Start of the input that has not been copied to `out` yet.
  let mut pending = 0;
  for (i, &byte) in bytes.iter().enumerate() {
    if byte != b'<' {
      continue;
    }
    // Skip the optional `/` of a closing tag.
    let mut name_start = i + 1;
    if bytes.get(name_start) == Some(&b'/') {
      name_start += 1;
    }
    let name_len = bytes[name_start..].iter().take_while(|b| b.is_ascii_alphanumeric() || **b == b'-').count();
    if name_len > 0 && is_disallowed_raw_html(&raw[name_start..name_start + name_len]) {
      out.push_str(&raw[pending..i]);
      out.push_str("&lt;");
      pending = i + 1;
    }
  }
  out.push_str(&raw[pending..]);
  out
}
482
483pub fn render_html(doc: &Document) -> String {
484  let mut e = HtmlEmitter::new();
485  Walker::new(doc).walk(&mut [&mut e]);
486  e.into_string()
487}
488
489pub fn render_html_with(doc: &Document, options: RenderOptions) -> String {
490  let mut e = HtmlEmitter::new_with_options(options);
491  Walker::new(doc).walk(&mut [&mut e]);
492  e.into_string()
493}
494
/// Match a JSX expression whose entire body is a single string literal
/// (single/double-quoted, or backtick template without `${...}`). Used
/// to lower idiomatic `{' '}` / `{"x"}` to plain text; dynamic
/// expressions return `None` and still trip GW002.
///
/// Returns the decoded text of the literal, or `None` when `raw` (after
/// trimming) is not exactly one literal. In particular these are rejected:
/// * anything not wrapped in a matching pair of `'`, `"` or `` ` ``;
/// * an unescaped delimiter inside the body, e.g. `'a' + 'b'`, which is two
///   literals joined by an operator rather than one literal;
/// * a body ending in an unescaped backslash, e.g. `'abc\'`, where the
///   final quote is escaped and the literal is therefore unterminated;
/// * a template containing an unescaped `${`, which needs JS to evaluate.
fn string_literal_expression(raw: &str) -> Option<String> {
  let s = raw.trim();
  let quote = s.chars().next().filter(|&c| matches!(c, '\'' | '"' | '`'))?;
  // Stripping both delimiters fails for a lone quote character, so the
  // minimum-length requirement is covered here as well.
  let inner = s.strip_prefix(quote)?.strip_suffix(quote)?;

  // Decode common JS escapes; unknown ones pass through verbatim
  // (full ECMA-262 escape semantics not needed here).
  let mut out = String::with_capacity(inner.len());
  let mut chars = inner.chars().peekable();
  while let Some(c) = chars.next() {
    match c {
      '\\' => match chars.next() {
        Some('n') => out.push('\n'),
        Some('t') => out.push('\t'),
        Some('r') => out.push('\r'),
        Some('\\') => out.push('\\'),
        Some('\'') => out.push('\''),
        Some('"') => out.push('"'),
        Some('`') => out.push('`'),
        Some(other) => {
          out.push('\\');
          out.push(other);
        },
        // The backslash escapes the closing delimiter: not a full literal.
        None => return None,
      },
      // An unescaped delimiter ends the literal early; whatever follows is
      // code, so the expression as a whole is not a plain string.
      ch if ch == quote => return None,
      // Reject unescaped `${` in templates - those need JS to evaluate.
      '$' if quote == '`' && chars.peek() == Some(&'{') => return None,
      other => out.push(other),
    }
  }
  Some(out)
}
549
#[cfg(test)]
mod tests {
  use super::string_literal_expression;

  /// Each quote style lowers to the text between the delimiters.
  #[test]
  fn recognises_simple_quoted_strings() {
    for (source, expected) in [("' '", " "), ("\"x\"", "x"), ("`y`", "y")] {
      assert_eq!(string_literal_expression(source).as_deref(), Some(expected));
    }
  }

  /// A template with `${...}` needs evaluation and must not be lowered.
  #[test]
  fn rejects_template_with_interpolation() {
    assert_eq!(string_literal_expression("`hi ${name}`"), None);
  }

  /// Identifiers, calls and operators are not string literals.
  #[test]
  fn rejects_dynamic_expression() {
    for source in ["count", "foo()", "a + b"] {
      assert_eq!(string_literal_expression(source), None);
    }
  }

  /// Backslash escapes are decoded to the characters they stand for.
  #[test]
  fn decodes_common_escapes() {
    for (source, expected) in [("'\\n'", "\n"), ("'\\\\'", "\\")] {
      assert_eq!(string_literal_expression(source).as_deref(), Some(expected));
    }
  }
}