1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
use super::{HTML_BLOCK_TYPE1_TAGS, HTML_BLOCK_TYPE6_TAGS, HtmlBlockMode};
use crate::ast::*;
use crate::parser::Parser;
use dmc_diagnostic::Code;
use dmc_lexer::token::TokenKind;
impl<'eng, 'tokens> Parser<'eng, 'tokens> {
/// Reports whether the cursor sits on a JSX open-tag start whose tag
/// passes `is_plain_html_jsx_tag` -- i.e. a lowercase HTML-ish tag, so
/// the surrounding paragraph wraps it as inline raw HTML. Takes
/// `&self` only and never advances the cursor.
pub(super) fn is_lowercase_jsx_tag(&self) -> bool {
    let at_open_tag = matches!(self.peek_kind(), Some(TokenKind::JsxOpenTagStart));
    // Only consult the tag-name check once we know a tag actually starts here.
    at_open_tag && self.is_plain_html_jsx_tag()
}
/// CM 4.6 raw-HTML block detection, keyed off a JSX-style tag start
/// (open or close) within the first four columns of its line.
///
/// Returns the block mode selected by the upcoming tag, or `None` when
/// the tag should not open a raw HTML block:
/// - `Type1(name)` when the lowercased name is in `HTML_BLOCK_TYPE1_TAGS`;
/// - `Type6` when it is in `HTML_BLOCK_TYPE6_TAGS`;
/// - `Type7` for any other tag that passes the plain-HTML checks (or,
///   with `cm_strict_html_blocks`, the HTML-ish checks) and is followed
///   only by whitespace on its line.
///
/// Takes `&self` only; the cursor is left untouched.
pub(super) fn jsx_html_block_mode(&self) -> Option<HtmlBlockMode> {
    let opener = self.tokens.get(self.pos)?;
    // Span column is 1-based; accept 1-4 (col 0-3 in 0-based) per CM
    // 4.6: up to three leading spaces are allowed before any block.
    if opener.span.column > 4 {
        return None;
    }
    let starts_tag = matches!(opener.kind, TokenKind::JsxOpenTagStart | TokenKind::JsxCloseTagStart);
    if !starts_tag {
        return None;
    }

    // The token right after `<` / `</` must be the tag name.
    let name_tok = self.tokens.get(self.pos + 1).filter(|t| matches!(t.kind, TokenKind::JsxTagName))?;
    let tag = name_tok.raw.to_ascii_lowercase();

    if HTML_BLOCK_TYPE1_TAGS.contains(&tag.as_str()) {
        return Some(HtmlBlockMode::Type1(tag));
    }
    if HTML_BLOCK_TYPE6_TAGS.contains(&tag.as_str()) {
        return Some(HtmlBlockMode::Type6);
    }

    // CM 4.6 Type-7: any tag at col 0 closes on next blank line --
    // BUT the start line itself must contain only the tag plus
    // whitespace (no inline content after the closing `>`).
    // Restricted to lowercase / kebab-case names so MDX components
    // like `<MyComponent>` and namespaces like `<svg:circle>` stay
    // on the JSX path and compile to component invocations.
    let plain_type7 =
        self.is_plain_html_jsx_tag() && self.jsx_raw_html_tag_is_valid() && self.line_after_tag_is_blank();
    if plain_type7 {
        return Some(HtmlBlockMode::Type7);
    }

    // CM-strict spec runner: also treat uppercase HTML-ish names
    // (like `<Warning>`) as Type-7 raw HTML blocks. MDX mode keeps
    // these on the JSX path so the component compiles correctly.
    let strict_type7 = self.options.cm_strict_html_blocks
        && self.is_htmlish_jsx_tag()
        && self.jsx_raw_html_tag_is_valid_htmlish()
        && self.line_after_tag_is_blank();
    strict_type7.then_some(HtmlBlockMode::Type7)
}
/// After the upcoming JSX tag's `>` / `/>`, is the rest of the line
/// whitespace-only? Required for the CM 4.6 Type-7 trigger.
///
/// Returns `false` when any token of the tag contains a newline or
/// sits on a different line than the tag start, or when a
/// non-whitespace token follows the tag before the line ends. Returns
/// `true` when the line ends (break, blank line, EOF) or the token
/// stream simply runs out.
fn line_after_tag_is_blank(&self) -> bool {
    let tag_line = self.tokens.get(self.pos).map(|t| t.span.line);
    let mut cursor = self.pos;
    let mut open_tags = 0i32;

    // Phase 1: walk the tag's own tokens up to and including the end
    // token that balances the first tag start. CM 4.6 type-7 requires
    // the tag to be a single complete tag on the start line, so bail
    // out as soon as any token involves a newline.
    while let Some(tok) = self.tokens.get(cursor) {
        let on_other_line = tag_line.is_some_and(|line| tok.span.line != line);
        if tok.raw.contains('\n') || on_other_line {
            return false;
        }
        // Step past this token up front; the balancing end token is
        // therefore already consumed when we break below.
        cursor += 1;
        match tok.kind {
            TokenKind::JsxOpenTagStart | TokenKind::JsxCloseTagStart => open_tags += 1,
            TokenKind::JsxOpenTagEnd | TokenKind::JsxCloseTagEnd | TokenKind::JsxSelfClosingEnd => {
                open_tags -= 1;
                if open_tags == 0 {
                    break;
                }
            },
            _ => {},
        }
    }

    // Phase 2: scan to end-of-line; tolerate whitespace, reject anything else.
    loop {
        match self.tokens.get(cursor).map(|tok| &tok.kind) {
            None => return true,
            Some(TokenKind::Whitespace(_)) => cursor += 1,
            Some(TokenKind::SoftBreak | TokenKind::HardBreak | TokenKind::BlankLine | TokenKind::Eof) => {
                return true;
            },
            Some(_) => return false,
        }
    }
}
/// Reconstruct a raw HTML block from a JSX-tokenized stream.
///
/// `mode` decides where the block ends:
/// - `Type1(tag)`: at the end of the line holding the first close tag
///   whose lowercased name equals `tag`, or at EOF if none is found;
/// - `Type6` / `Type7`: just before the next blank line, or at EOF.
///
/// The node's value is the verbatim source for the consumed token
/// range (via `raw_source_for_token_range`), so internal whitespace
/// and attribute formatting survive intact (the lexer's JSX path
/// normalizes whitespace, so per-token concat alone would drop it).
/// Leading `>` blockquote markers are then stripped from continuation
/// lines.
///
/// When the block starts on a close tag, a `MismatchedJsxCloseTag`
/// diagnostic is emitted and the tag is still kept as raw HTML.
pub(super) fn parse_html_block_from_jsx(&mut self, mode: HtmlBlockMode) -> Node {
    let span = self.current_span();
    let first_token = self.pos;

    // A block that opens on `</name>` has no opener inside it: report
    // it, but keep going so the text is preserved.
    let opens_on_close_tag = matches!(self.peek_kind(), Some(TokenKind::JsxCloseTagStart));
    if opens_on_close_tag {
        let close_name = self
            .tokens
            .get(self.pos + 1)
            .filter(|t| matches!(t.kind, TokenKind::JsxTagName))
            .map(|t| t.raw)
            .unwrap_or_default();
        let diagnostic = duck_diagnostic::diag!(
            Code::MismatchedJsxCloseTag,
            span.clone(),
            format!("orphan close tag `</{close_name}>` has no matching opener in this block; preserving it as raw HTML")
        )
        .with_help(
            "add the matching opening tag earlier in the block, or escape the leading `<` if this should render as text",
        );
        self.emit_diagnostic(diagnostic);
    }

    match mode {
        HtmlBlockMode::Type1(tag) => loop {
            // Skip forward until a close-tag start (or the end of input).
            match self.peek_kind() {
                Some(TokenKind::Eof) | None => break,
                Some(TokenKind::JsxCloseTagStart) => {},
                _ => {
                    self.advance();
                    continue;
                },
            }

            // Consume `</`, then the optional name and the optional `>`.
            self.advance();
            let closer = self
                .peek()
                .filter(|t| matches!(t.kind, TokenKind::JsxTagName))
                .map(|t| t.raw.to_ascii_lowercase());
            if closer.is_some() {
                self.advance();
            }
            if matches!(self.peek_kind(), Some(TokenKind::JsxCloseTagEnd)) {
                self.advance();
            }

            // A missing name compares as the empty string.
            if closer.unwrap_or_default() != tag {
                continue;
            }

            // CM 4.6 type-1: the block extends to the end of the line
            // that contains the matching close tag (everything after
            // `</tag>` on that line stays inside the block).
            while self.peek().is_some_and(|t| {
                !matches!(
                    t.kind,
                    TokenKind::SoftBreak | TokenKind::HardBreak | TokenKind::BlankLine | TokenKind::Eof
                )
            }) {
                self.advance();
            }
            break;
        },
        HtmlBlockMode::Type6 | HtmlBlockMode::Type7 => {
            // Runs up to (not including) the next blank line or EOF.
            while !matches!(self.peek_kind(), Some(TokenKind::BlankLine | TokenKind::Eof) | None) {
                self.advance();
            }
        },
    }

    let raw = self.raw_source_for_token_range(first_token, self.pos);
    // CM 5.1: when an HTML block lives inside a blockquote, each
    // continuation line carries its own `>` marker(s). Strip them so
    // the rendered raw HTML matches the spec output.
    let value = if raw.contains("\n>") {
        let mut cleaned = String::with_capacity(raw.len());
        for (idx, line) in raw.split_inclusive('\n').enumerate() {
            let mut rest = line;
            // The first line is kept as-is; only continuation lines
            // lose their leading `>` markers (with one optional space each).
            if idx > 0 {
                while let Some(after_marker) = rest.strip_prefix('>') {
                    rest = after_marker.strip_prefix(' ').unwrap_or(after_marker);
                }
            }
            cleaned.push_str(rest);
        }
        cleaned
    } else {
        raw
    };

    Node::Html(Html { value, span })
}
/// CM 4.6 type-2: HTML comment as a block (cursor on
/// `HtmlCommentOpen` at col 0). Slurps tokens through the matching
/// `HtmlCommentClose` and emits a single `Html` node built from the
/// tokens' raw text.
///
/// Once the close has been seen, the block ends at the next soft/hard
/// break. While the comment is still open, each soft/hard break is
/// recorded as a single `\n` and scanning continues on the next line.
/// A blank line or EOF always ends the block, closed or not.
pub(super) fn parse_html_comment_block(&mut self) -> Node {
    let span = self.current_span();
    let mut value = String::new();

    // The token under the cursor opens the comment.
    if let Some(opener) = self.peek() {
        value.push_str(opener.raw);
    }
    self.advance();

    let mut closed = false;
    loop {
        match self.peek_kind() {
            Some(TokenKind::BlankLine) | Some(TokenKind::Eof) | None => break,
            Some(TokenKind::SoftBreak) | Some(TokenKind::HardBreak) => {
                // CM 4.6 type-2: block ends at the end of the line that
                // contains `-->`. Stop here so the next line opens a
                // fresh block.
                if closed {
                    break;
                }
                // Comment still open: absorb the newline and keep
                // slurping into the next line.
                value.push('\n');
            },
            Some(TokenKind::HtmlCommentClose) => {
                if let Some(tok) = self.peek() {
                    value.push_str(tok.raw);
                }
                closed = true;
            },
            _ => {
                if let Some(tok) = self.peek() {
                    value.push_str(tok.raw);
                }
            },
        }
        // Every arm that did not break has handled the current token.
        self.advance();
    }

    Node::Html(Html { value, span })
}
/// Raw HTML block whose end is marked by an `HtmlBlockClose` token.
///
/// Concatenates the raw text of every token from the one under the
/// cursor through the first `HtmlBlockClose` (inclusive), or through
/// the last token before EOF when no close token appears, and returns
/// it as a single `Html` node.
pub(super) fn parse_html_block(&mut self) -> Node {
    let span = self.current_span();
    let mut value = String::new();

    // The token under the cursor is the block opener; take it as-is.
    if let Some(opener) = self.peek() {
        value.push_str(opener.raw);
    }
    self.advance();

    loop {
        let at_close = match self.peek_kind() {
            Some(TokenKind::Eof) | None => break,
            Some(TokenKind::HtmlBlockClose) => true,
            _ => false,
        };
        if let Some(tok) = self.peek() {
            value.push_str(tok.raw);
        }
        self.advance();
        // The close token itself belongs to the block; stop right after it.
        if at_close {
            break;
        }
    }

    Node::Html(Html { value, span })
}
/// Top-level lowercase HTML close tags like `</a></b>` are inline raw
/// HTML, not JSX terminators. This collects a paragraph whose inline
/// run does not stop on `JsxCloseTagStart`.
///
/// Collection stops at a blank line, soft break, EOF, heading,
/// frontmatter start, `import` or `export`. One soft or hard break
/// directly after the collected run is consumed as well.
pub(super) fn parse_plain_html_close_paragraph(&mut self) -> Node {
    let span = self.current_span();
    let children = self.collect_inline(&|kind| {
        matches!(
            kind,
            TokenKind::Eof
                | TokenKind::BlankLine
                | TokenKind::SoftBreak
                | TokenKind::Heading(_)
                | TokenKind::FrontmatterStart(_)
                | TokenKind::Import
                | TokenKind::Export
        )
    });

    // Swallow the line break that ended the paragraph, if there is one.
    let ends_on_break = matches!(self.peek_kind(), Some(TokenKind::SoftBreak | TokenKind::HardBreak));
    if ends_on_break {
        self.advance();
    }

    Node::Paragraph(Paragraph { children, span })
}
}