Skip to main content

fhp_tokenizer/
state_machine.rs

1//! Branchless state machine for HTML token extraction.
2//!
3//! The core idea: a 2D lookup table `STATE_TABLE[state][byte_class]` maps
4//! every (state, input-byte-class) pair to a `(new_state, action)` without
5//! any conditional branches. This eliminates branch misprediction costs
6//! that dominate traditional HTML tokenizers.
7
/// Tokenizer states — models the HTML5 tokenizer states relevant for
/// our structural-index-driven approach.
///
/// The enum is `#[repr(u8)]` and its discriminants are used directly as row
/// indices into the transition table, so variants are numbered densely from
/// `0` in declaration order.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum State {
    /// Outside any tag — consuming text content.
    Data = 0,
    /// Saw `<` — deciding if open tag, close tag, comment, or doctype.
    TagOpen,
    /// Inside an open tag name (e.g. reading `div` in `<div`).
    TagName,
    /// Saw `</` — expecting a close tag name.
    EndTagOpen,
    /// Inside a close tag name.
    EndTagName,
    /// After tag name, before attribute name or `>`.
    BeforeAttrName,
    /// Inside an attribute name.
    AttrName,
    /// After attribute name, before `=` or next attribute.
    AfterAttrName,
    /// Saw `=` after attribute name — expecting value.
    BeforeAttrValue,
    /// Inside a quoted attribute value.
    AttrValueQuoted,
    /// Inside an unquoted attribute value.
    AttrValueUnquoted,
    /// Saw `/` inside a tag — expecting `>` for self-closing.
    SelfClosingStartTag,
    /// Inside `<!` — detecting comment vs doctype vs CDATA.
    MarkupDecl,
    /// Inside `<!--` comment body.
    Comment,
    /// Saw first `-` at end of comment (`-`).
    CommentEndDash,
    /// Saw `--` at end of comment.
    CommentEnd,
    /// Inside `<!DOCTYPE` content.
    Doctype,
    /// Inside `<![CDATA[` content.
    CData,
    /// Inside raw text elements (`<script>`, `<style>`).
    ///
    /// Must remain the last variant: [`STATE_COUNT`] is derived from it.
    RawText,
}

/// Number of states — used for table dimensions.
///
/// Computed from the final variant instead of being a hand-maintained
/// literal, so the table dimension follows the enum as long as
/// [`State::RawText`] stays last.
pub const STATE_COUNT: usize = State::RawText as usize + 1;
55
/// Byte classification — maps raw bytes to a small enum for table indexing.
///
/// The enum is `#[repr(u8)]` and its discriminants are used directly as
/// column indices into the transition table, so variants are numbered
/// densely from `0` in declaration order.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum ByteClass {
    /// `<`
    Lt = 0,
    /// `>`
    Gt,
    /// `/`
    Slash,
    /// `=`
    Eq,
    /// `"` or `'`
    Quot,
    /// `&`
    Amp,
    /// `!`
    Bang,
    /// `-`
    Dash,
    /// `a-z`, `A-Z`
    Alpha,
    /// Space, tab, newline, form feed, carriage return
    Whitespace,
    /// Everything else
    ///
    /// Must remain the last variant: [`BYTE_CLASS_COUNT`] is derived from it.
    Other,
}

/// Number of byte classes — used for table dimensions.
///
/// Computed from the final variant instead of being a hand-maintained
/// literal, so the table dimension follows the enum as long as
/// [`ByteClass::Other`] stays last.
pub const BYTE_CLASS_COUNT: usize = ByteClass::Other as usize + 1;

impl ByteClass {
    /// Classify a raw byte into its [`ByteClass`].
    ///
    /// Any byte without a dedicated class (digits, `[`, non-ASCII bytes, …)
    /// is reported as [`ByteClass::Other`]. The function is `const`, so it can
    /// also be evaluated at compile time.
    #[inline(always)]
    pub const fn from_byte(b: u8) -> Self {
        match b {
            b'<' => ByteClass::Lt,
            b'>' => ByteClass::Gt,
            b'/' => ByteClass::Slash,
            b'=' => ByteClass::Eq,
            b'"' | b'\'' => ByteClass::Quot,
            b'&' => ByteClass::Amp,
            b'!' => ByteClass::Bang,
            b'-' => ByteClass::Dash,
            b'a'..=b'z' | b'A'..=b'Z' => ByteClass::Alpha,
            // HTML's ASCII whitespace set includes form feed (0x0C); leaving
            // it out would glue it onto tag and attribute names as `Other`.
            b' ' | b'\t' | b'\n' | b'\x0C' | b'\r' => ByteClass::Whitespace,
            _ => ByteClass::Other,
        }
    }
}
106
/// Side effects attached to a state transition.
///
/// Discriminants are spelled out explicitly; they are dense and start at `0`.
#[repr(u8)]
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Action {
    /// No side effect.
    None = 0,
    /// Flush the text accumulated so far as a Text token.
    FlushText = 1,
    /// Start recording the name of an open tag.
    StartTag = 2,
    /// Start recording the name of a close tag.
    StartEndTag = 3,
    /// The open tag name is complete — emit it.
    EmitTagName = 4,
    /// Emit the close tag name.
    EmitEndTagName = 5,
    /// Start recording an attribute name.
    StartAttrName = 6,
    /// The attribute name is complete.
    EmitAttrName = 7,
    /// Start recording an attribute value.
    StartAttrValue = 8,
    /// Emit the attribute value (quoted attribute complete).
    EmitAttrValue = 9,
    /// Emit a self-closing tag.
    EmitSelfClose = 10,
    /// Start recording a comment.
    StartComment = 11,
    /// Emit the comment token.
    EmitComment = 12,
    /// Start recording a doctype.
    StartDoctype = 13,
    /// Emit the doctype token.
    EmitDoctype = 14,
    /// Start recording CDATA.
    StartCData = 15,
    /// Emit the CDATA token.
    EmitCData = 16,
    /// Switch to raw text mode (script/style).
    EnterRawText = 17,
    /// Emit the open tag and close it (for `>`).
    EmitOpenTagClose = 18,
}
150
151/// A state transition entry: new state + action to perform.
152#[derive(Clone, Copy, Debug)]
153pub struct Transition {
154    /// The next state.
155    pub state: State,
156    /// The action to perform.
157    pub action: Action,
158}
159
160impl Transition {
161    /// No-op transition: stay in current state, do nothing.
162    const fn noop(state: State) -> Self {
163        Self {
164            state,
165            action: Action::None,
166        }
167    }
168
169    /// Transition to a new state with an action.
170    const fn new(state: State, action: Action) -> Self {
171        Self { state, action }
172    }
173}
174
175/// The master state transition table.
176///
177/// `STATE_TABLE[state][byte_class]` yields the `Transition` to apply.
178/// Const-initialized at compile time — no runtime cost.
179pub static STATE_TABLE: [[Transition; BYTE_CLASS_COUNT]; STATE_COUNT] = build_state_table();
180
181/// Build the state transition table at compile time.
182const fn build_state_table() -> [[Transition; BYTE_CLASS_COUNT]; STATE_COUNT] {
183    // Default: stay in same state, do nothing. We'll fill per-state below.
184    // Can't use Default in const, so manually init.
185    let noop = Transition::noop(State::Data);
186    let mut table = [[noop; BYTE_CLASS_COUNT]; STATE_COUNT];
187
188    // ----- Data state -----
189    // Default: stay in Data
190    table[State::Data as usize][ByteClass::Lt as usize] =
191        Transition::new(State::TagOpen, Action::FlushText);
192    table[State::Data as usize][ByteClass::Gt as usize] = Transition::noop(State::Data);
193    table[State::Data as usize][ByteClass::Slash as usize] = Transition::noop(State::Data);
194    table[State::Data as usize][ByteClass::Eq as usize] = Transition::noop(State::Data);
195    table[State::Data as usize][ByteClass::Quot as usize] = Transition::noop(State::Data);
196    table[State::Data as usize][ByteClass::Amp as usize] = Transition::noop(State::Data);
197    table[State::Data as usize][ByteClass::Bang as usize] = Transition::noop(State::Data);
198    table[State::Data as usize][ByteClass::Dash as usize] = Transition::noop(State::Data);
199    table[State::Data as usize][ByteClass::Alpha as usize] = Transition::noop(State::Data);
200    table[State::Data as usize][ByteClass::Whitespace as usize] = Transition::noop(State::Data);
201    table[State::Data as usize][ByteClass::Other as usize] = Transition::noop(State::Data);
202
203    // ----- TagOpen state (saw '<') -----
204    table[State::TagOpen as usize][ByteClass::Alpha as usize] =
205        Transition::new(State::TagName, Action::StartTag);
206    table[State::TagOpen as usize][ByteClass::Slash as usize] =
207        Transition::new(State::EndTagOpen, Action::None);
208    table[State::TagOpen as usize][ByteClass::Bang as usize] =
209        Transition::new(State::MarkupDecl, Action::None);
210    // Malformed: '<' followed by non-alpha — treat as text
211    table[State::TagOpen as usize][ByteClass::Lt as usize] =
212        Transition::new(State::TagOpen, Action::FlushText);
213    table[State::TagOpen as usize][ByteClass::Gt as usize] = Transition::noop(State::Data);
214    table[State::TagOpen as usize][ByteClass::Other as usize] = Transition::noop(State::Data);
215    table[State::TagOpen as usize][ByteClass::Whitespace as usize] = Transition::noop(State::Data);
216    table[State::TagOpen as usize][ByteClass::Eq as usize] = Transition::noop(State::Data);
217    table[State::TagOpen as usize][ByteClass::Quot as usize] = Transition::noop(State::Data);
218    table[State::TagOpen as usize][ByteClass::Amp as usize] = Transition::noop(State::Data);
219    table[State::TagOpen as usize][ByteClass::Dash as usize] = Transition::noop(State::Data);
220
221    // ----- TagName state -----
222    table[State::TagName as usize][ByteClass::Alpha as usize] = Transition::noop(State::TagName);
223    table[State::TagName as usize][ByteClass::Other as usize] = Transition::noop(State::TagName);
224    table[State::TagName as usize][ByteClass::Dash as usize] = Transition::noop(State::TagName);
225    table[State::TagName as usize][ByteClass::Whitespace as usize] =
226        Transition::new(State::BeforeAttrName, Action::EmitTagName);
227    table[State::TagName as usize][ByteClass::Gt as usize] =
228        Transition::new(State::Data, Action::EmitOpenTagClose);
229    table[State::TagName as usize][ByteClass::Slash as usize] =
230        Transition::new(State::SelfClosingStartTag, Action::EmitTagName);
231    table[State::TagName as usize][ByteClass::Lt as usize] =
232        Transition::new(State::TagOpen, Action::EmitOpenTagClose);
233    table[State::TagName as usize][ByteClass::Eq as usize] = Transition::noop(State::TagName);
234    table[State::TagName as usize][ByteClass::Quot as usize] = Transition::noop(State::TagName);
235    table[State::TagName as usize][ByteClass::Amp as usize] = Transition::noop(State::TagName);
236    table[State::TagName as usize][ByteClass::Bang as usize] = Transition::noop(State::TagName);
237
238    // ----- EndTagOpen state (saw '</') -----
239    table[State::EndTagOpen as usize][ByteClass::Alpha as usize] =
240        Transition::new(State::EndTagName, Action::StartEndTag);
241    table[State::EndTagOpen as usize][ByteClass::Gt as usize] = Transition::noop(State::Data); // </> — malformed, ignore
242    table[State::EndTagOpen as usize][ByteClass::Lt as usize] =
243        Transition::new(State::TagOpen, Action::FlushText);
244    table[State::EndTagOpen as usize][ByteClass::Other as usize] = Transition::noop(State::Data);
245    table[State::EndTagOpen as usize][ByteClass::Slash as usize] = Transition::noop(State::Data);
246    table[State::EndTagOpen as usize][ByteClass::Eq as usize] = Transition::noop(State::Data);
247    table[State::EndTagOpen as usize][ByteClass::Quot as usize] = Transition::noop(State::Data);
248    table[State::EndTagOpen as usize][ByteClass::Amp as usize] = Transition::noop(State::Data);
249    table[State::EndTagOpen as usize][ByteClass::Bang as usize] = Transition::noop(State::Data);
250    table[State::EndTagOpen as usize][ByteClass::Dash as usize] = Transition::noop(State::Data);
251    table[State::EndTagOpen as usize][ByteClass::Whitespace as usize] =
252        Transition::noop(State::Data);
253
254    // ----- EndTagName state -----
255    table[State::EndTagName as usize][ByteClass::Alpha as usize] =
256        Transition::noop(State::EndTagName);
257    table[State::EndTagName as usize][ByteClass::Other as usize] =
258        Transition::noop(State::EndTagName);
259    table[State::EndTagName as usize][ByteClass::Dash as usize] =
260        Transition::noop(State::EndTagName);
261    table[State::EndTagName as usize][ByteClass::Gt as usize] =
262        Transition::new(State::Data, Action::EmitEndTagName);
263    table[State::EndTagName as usize][ByteClass::Whitespace as usize] =
264        Transition::new(State::EndTagName, Action::EmitEndTagName);
265    table[State::EndTagName as usize][ByteClass::Lt as usize] =
266        Transition::new(State::TagOpen, Action::EmitEndTagName);
267    table[State::EndTagName as usize][ByteClass::Slash as usize] =
268        Transition::noop(State::EndTagName);
269    table[State::EndTagName as usize][ByteClass::Eq as usize] = Transition::noop(State::EndTagName);
270    table[State::EndTagName as usize][ByteClass::Quot as usize] =
271        Transition::noop(State::EndTagName);
272    table[State::EndTagName as usize][ByteClass::Amp as usize] =
273        Transition::noop(State::EndTagName);
274    table[State::EndTagName as usize][ByteClass::Bang as usize] =
275        Transition::noop(State::EndTagName);
276
277    // ----- BeforeAttrName state (after tag name whitespace) -----
278    table[State::BeforeAttrName as usize][ByteClass::Alpha as usize] =
279        Transition::new(State::AttrName, Action::StartAttrName);
280    table[State::BeforeAttrName as usize][ByteClass::Gt as usize] =
281        Transition::new(State::Data, Action::EmitOpenTagClose);
282    table[State::BeforeAttrName as usize][ByteClass::Slash as usize] =
283        Transition::new(State::SelfClosingStartTag, Action::None);
284    table[State::BeforeAttrName as usize][ByteClass::Whitespace as usize] =
285        Transition::noop(State::BeforeAttrName);
286    table[State::BeforeAttrName as usize][ByteClass::Other as usize] =
287        Transition::new(State::AttrName, Action::StartAttrName);
288    table[State::BeforeAttrName as usize][ByteClass::Dash as usize] =
289        Transition::new(State::AttrName, Action::StartAttrName);
290    table[State::BeforeAttrName as usize][ByteClass::Lt as usize] =
291        Transition::new(State::TagOpen, Action::EmitOpenTagClose);
292    table[State::BeforeAttrName as usize][ByteClass::Eq as usize] =
293        Transition::new(State::AttrName, Action::StartAttrName);
294    table[State::BeforeAttrName as usize][ByteClass::Quot as usize] =
295        Transition::new(State::AttrName, Action::StartAttrName);
296    table[State::BeforeAttrName as usize][ByteClass::Amp as usize] =
297        Transition::new(State::AttrName, Action::StartAttrName);
298    table[State::BeforeAttrName as usize][ByteClass::Bang as usize] =
299        Transition::new(State::AttrName, Action::StartAttrName);
300
301    // ----- AttrName state -----
302    table[State::AttrName as usize][ByteClass::Alpha as usize] = Transition::noop(State::AttrName);
303    table[State::AttrName as usize][ByteClass::Other as usize] = Transition::noop(State::AttrName);
304    table[State::AttrName as usize][ByteClass::Dash as usize] = Transition::noop(State::AttrName);
305    table[State::AttrName as usize][ByteClass::Eq as usize] =
306        Transition::new(State::BeforeAttrValue, Action::EmitAttrName);
307    table[State::AttrName as usize][ByteClass::Whitespace as usize] =
308        Transition::new(State::AfterAttrName, Action::EmitAttrName);
309    table[State::AttrName as usize][ByteClass::Gt as usize] =
310        Transition::new(State::Data, Action::EmitOpenTagClose);
311    table[State::AttrName as usize][ByteClass::Slash as usize] =
312        Transition::new(State::SelfClosingStartTag, Action::EmitAttrName);
313    table[State::AttrName as usize][ByteClass::Lt as usize] =
314        Transition::new(State::TagOpen, Action::EmitOpenTagClose);
315    table[State::AttrName as usize][ByteClass::Quot as usize] = Transition::noop(State::AttrName);
316    table[State::AttrName as usize][ByteClass::Amp as usize] = Transition::noop(State::AttrName);
317    table[State::AttrName as usize][ByteClass::Bang as usize] = Transition::noop(State::AttrName);
318
319    // ----- AfterAttrName state (after attr name, looking for = or next attr) -----
320    table[State::AfterAttrName as usize][ByteClass::Eq as usize] =
321        Transition::new(State::BeforeAttrValue, Action::None);
322    table[State::AfterAttrName as usize][ByteClass::Whitespace as usize] =
323        Transition::noop(State::AfterAttrName);
324    table[State::AfterAttrName as usize][ByteClass::Alpha as usize] =
325        Transition::new(State::AttrName, Action::StartAttrName);
326    table[State::AfterAttrName as usize][ByteClass::Gt as usize] =
327        Transition::new(State::Data, Action::EmitOpenTagClose);
328    table[State::AfterAttrName as usize][ByteClass::Slash as usize] =
329        Transition::new(State::SelfClosingStartTag, Action::None);
330    table[State::AfterAttrName as usize][ByteClass::Lt as usize] =
331        Transition::new(State::TagOpen, Action::EmitOpenTagClose);
332    table[State::AfterAttrName as usize][ByteClass::Other as usize] =
333        Transition::new(State::AttrName, Action::StartAttrName);
334    table[State::AfterAttrName as usize][ByteClass::Dash as usize] =
335        Transition::new(State::AttrName, Action::StartAttrName);
336    table[State::AfterAttrName as usize][ByteClass::Quot as usize] =
337        Transition::new(State::AttrName, Action::StartAttrName);
338    table[State::AfterAttrName as usize][ByteClass::Amp as usize] =
339        Transition::new(State::AttrName, Action::StartAttrName);
340    table[State::AfterAttrName as usize][ByteClass::Bang as usize] =
341        Transition::new(State::AttrName, Action::StartAttrName);
342
343    // ----- BeforeAttrValue state (saw '=', expecting quote or unquoted) -----
344    table[State::BeforeAttrValue as usize][ByteClass::Quot as usize] =
345        Transition::new(State::AttrValueQuoted, Action::StartAttrValue);
346    table[State::BeforeAttrValue as usize][ByteClass::Whitespace as usize] =
347        Transition::noop(State::BeforeAttrValue);
348    table[State::BeforeAttrValue as usize][ByteClass::Alpha as usize] =
349        Transition::new(State::AttrValueUnquoted, Action::StartAttrValue);
350    table[State::BeforeAttrValue as usize][ByteClass::Other as usize] =
351        Transition::new(State::AttrValueUnquoted, Action::StartAttrValue);
352    table[State::BeforeAttrValue as usize][ByteClass::Gt as usize] =
353        Transition::new(State::Data, Action::EmitOpenTagClose);
354    table[State::BeforeAttrValue as usize][ByteClass::Dash as usize] =
355        Transition::new(State::AttrValueUnquoted, Action::StartAttrValue);
356    table[State::BeforeAttrValue as usize][ByteClass::Lt as usize] =
357        Transition::new(State::TagOpen, Action::EmitOpenTagClose);
358    table[State::BeforeAttrValue as usize][ByteClass::Slash as usize] =
359        Transition::new(State::AttrValueUnquoted, Action::StartAttrValue);
360    table[State::BeforeAttrValue as usize][ByteClass::Eq as usize] =
361        Transition::new(State::AttrValueUnquoted, Action::StartAttrValue);
362    table[State::BeforeAttrValue as usize][ByteClass::Amp as usize] =
363        Transition::new(State::AttrValueUnquoted, Action::StartAttrValue);
364    table[State::BeforeAttrValue as usize][ByteClass::Bang as usize] =
365        Transition::new(State::AttrValueUnquoted, Action::StartAttrValue);
366
367    // ----- AttrValueQuoted state -----
368    // Quotes end the value; everything else stays in quoted value.
369    table[State::AttrValueQuoted as usize][ByteClass::Quot as usize] =
370        Transition::new(State::BeforeAttrName, Action::EmitAttrValue);
371    table[State::AttrValueQuoted as usize][ByteClass::Lt as usize] =
372        Transition::noop(State::AttrValueQuoted);
373    table[State::AttrValueQuoted as usize][ByteClass::Gt as usize] =
374        Transition::noop(State::AttrValueQuoted);
375    table[State::AttrValueQuoted as usize][ByteClass::Slash as usize] =
376        Transition::noop(State::AttrValueQuoted);
377    table[State::AttrValueQuoted as usize][ByteClass::Eq as usize] =
378        Transition::noop(State::AttrValueQuoted);
379    table[State::AttrValueQuoted as usize][ByteClass::Amp as usize] =
380        Transition::noop(State::AttrValueQuoted);
381    table[State::AttrValueQuoted as usize][ByteClass::Bang as usize] =
382        Transition::noop(State::AttrValueQuoted);
383    table[State::AttrValueQuoted as usize][ByteClass::Dash as usize] =
384        Transition::noop(State::AttrValueQuoted);
385    table[State::AttrValueQuoted as usize][ByteClass::Alpha as usize] =
386        Transition::noop(State::AttrValueQuoted);
387    table[State::AttrValueQuoted as usize][ByteClass::Whitespace as usize] =
388        Transition::noop(State::AttrValueQuoted);
389    table[State::AttrValueQuoted as usize][ByteClass::Other as usize] =
390        Transition::noop(State::AttrValueQuoted);
391
392    // ----- AttrValueUnquoted state -----
393    table[State::AttrValueUnquoted as usize][ByteClass::Whitespace as usize] =
394        Transition::new(State::BeforeAttrName, Action::EmitAttrValue);
395    table[State::AttrValueUnquoted as usize][ByteClass::Gt as usize] =
396        Transition::new(State::Data, Action::EmitOpenTagClose);
397    table[State::AttrValueUnquoted as usize][ByteClass::Lt as usize] =
398        Transition::new(State::TagOpen, Action::EmitOpenTagClose);
399    table[State::AttrValueUnquoted as usize][ByteClass::Alpha as usize] =
400        Transition::noop(State::AttrValueUnquoted);
401    table[State::AttrValueUnquoted as usize][ByteClass::Other as usize] =
402        Transition::noop(State::AttrValueUnquoted);
403    table[State::AttrValueUnquoted as usize][ByteClass::Dash as usize] =
404        Transition::noop(State::AttrValueUnquoted);
405    table[State::AttrValueUnquoted as usize][ByteClass::Slash as usize] =
406        Transition::noop(State::AttrValueUnquoted);
407    table[State::AttrValueUnquoted as usize][ByteClass::Eq as usize] =
408        Transition::noop(State::AttrValueUnquoted);
409    table[State::AttrValueUnquoted as usize][ByteClass::Quot as usize] =
410        Transition::noop(State::AttrValueUnquoted);
411    table[State::AttrValueUnquoted as usize][ByteClass::Amp as usize] =
412        Transition::noop(State::AttrValueUnquoted);
413    table[State::AttrValueUnquoted as usize][ByteClass::Bang as usize] =
414        Transition::noop(State::AttrValueUnquoted);
415
416    // ----- SelfClosingStartTag state (saw '/' inside tag) -----
417    table[State::SelfClosingStartTag as usize][ByteClass::Gt as usize] =
418        Transition::new(State::Data, Action::EmitSelfClose);
419    // Not a self-close — treat '/' as ignored, go back to before-attr
420    table[State::SelfClosingStartTag as usize][ByteClass::Alpha as usize] =
421        Transition::new(State::AttrName, Action::StartAttrName);
422    table[State::SelfClosingStartTag as usize][ByteClass::Whitespace as usize] =
423        Transition::noop(State::BeforeAttrName);
424    table[State::SelfClosingStartTag as usize][ByteClass::Lt as usize] =
425        Transition::new(State::TagOpen, Action::EmitSelfClose);
426    table[State::SelfClosingStartTag as usize][ByteClass::Other as usize] =
427        Transition::noop(State::BeforeAttrName);
428    table[State::SelfClosingStartTag as usize][ByteClass::Slash as usize] =
429        Transition::noop(State::SelfClosingStartTag);
430    table[State::SelfClosingStartTag as usize][ByteClass::Eq as usize] =
431        Transition::noop(State::BeforeAttrName);
432    table[State::SelfClosingStartTag as usize][ByteClass::Quot as usize] =
433        Transition::noop(State::BeforeAttrName);
434    table[State::SelfClosingStartTag as usize][ByteClass::Amp as usize] =
435        Transition::noop(State::BeforeAttrName);
436    table[State::SelfClosingStartTag as usize][ByteClass::Bang as usize] =
437        Transition::noop(State::BeforeAttrName);
438    table[State::SelfClosingStartTag as usize][ByteClass::Dash as usize] =
439        Transition::noop(State::BeforeAttrName);
440
441    // ----- MarkupDecl state (saw '<!') -----
442    table[State::MarkupDecl as usize][ByteClass::Dash as usize] =
443        Transition::new(State::Comment, Action::StartComment);
444    table[State::MarkupDecl as usize][ByteClass::Alpha as usize] =
445        Transition::new(State::Doctype, Action::StartDoctype);
446    // '[' for CDATA — classified as Other
447    table[State::MarkupDecl as usize][ByteClass::Other as usize] =
448        Transition::new(State::CData, Action::StartCData);
449    table[State::MarkupDecl as usize][ByteClass::Gt as usize] = Transition::noop(State::Data);
450    table[State::MarkupDecl as usize][ByteClass::Lt as usize] =
451        Transition::new(State::TagOpen, Action::FlushText);
452    table[State::MarkupDecl as usize][ByteClass::Slash as usize] = Transition::noop(State::Data);
453    table[State::MarkupDecl as usize][ByteClass::Eq as usize] = Transition::noop(State::Data);
454    table[State::MarkupDecl as usize][ByteClass::Quot as usize] = Transition::noop(State::Data);
455    table[State::MarkupDecl as usize][ByteClass::Amp as usize] = Transition::noop(State::Data);
456    table[State::MarkupDecl as usize][ByteClass::Bang as usize] = Transition::noop(State::Data);
457    table[State::MarkupDecl as usize][ByteClass::Whitespace as usize] =
458        Transition::noop(State::Data);
459
460    // ----- Comment state -----
461    table[State::Comment as usize][ByteClass::Dash as usize] =
462        Transition::noop(State::CommentEndDash);
463    table[State::Comment as usize][ByteClass::Lt as usize] = Transition::noop(State::Comment);
464    table[State::Comment as usize][ByteClass::Gt as usize] = Transition::noop(State::Comment);
465    table[State::Comment as usize][ByteClass::Slash as usize] = Transition::noop(State::Comment);
466    table[State::Comment as usize][ByteClass::Eq as usize] = Transition::noop(State::Comment);
467    table[State::Comment as usize][ByteClass::Quot as usize] = Transition::noop(State::Comment);
468    table[State::Comment as usize][ByteClass::Amp as usize] = Transition::noop(State::Comment);
469    table[State::Comment as usize][ByteClass::Bang as usize] = Transition::noop(State::Comment);
470    table[State::Comment as usize][ByteClass::Alpha as usize] = Transition::noop(State::Comment);
471    table[State::Comment as usize][ByteClass::Whitespace as usize] =
472        Transition::noop(State::Comment);
473    table[State::Comment as usize][ByteClass::Other as usize] = Transition::noop(State::Comment);
474
475    // ----- CommentEndDash state (saw '-' in comment) -----
476    table[State::CommentEndDash as usize][ByteClass::Dash as usize] =
477        Transition::noop(State::CommentEnd);
478    // Not end of comment — back to Comment
479    table[State::CommentEndDash as usize][ByteClass::Lt as usize] =
480        Transition::noop(State::Comment);
481    table[State::CommentEndDash as usize][ByteClass::Gt as usize] =
482        Transition::noop(State::Comment);
483    table[State::CommentEndDash as usize][ByteClass::Slash as usize] =
484        Transition::noop(State::Comment);
485    table[State::CommentEndDash as usize][ByteClass::Eq as usize] =
486        Transition::noop(State::Comment);
487    table[State::CommentEndDash as usize][ByteClass::Quot as usize] =
488        Transition::noop(State::Comment);
489    table[State::CommentEndDash as usize][ByteClass::Amp as usize] =
490        Transition::noop(State::Comment);
491    table[State::CommentEndDash as usize][ByteClass::Bang as usize] =
492        Transition::noop(State::Comment);
493    table[State::CommentEndDash as usize][ByteClass::Alpha as usize] =
494        Transition::noop(State::Comment);
495    table[State::CommentEndDash as usize][ByteClass::Whitespace as usize] =
496        Transition::noop(State::Comment);
497    table[State::CommentEndDash as usize][ByteClass::Other as usize] =
498        Transition::noop(State::Comment);
499
500    // ----- CommentEnd state (saw '--' in comment) -----
501    table[State::CommentEnd as usize][ByteClass::Gt as usize] =
502        Transition::new(State::Data, Action::EmitComment);
503    // Not closing yet — back to Comment
504    table[State::CommentEnd as usize][ByteClass::Dash as usize] =
505        Transition::noop(State::CommentEnd);
506    table[State::CommentEnd as usize][ByteClass::Lt as usize] = Transition::noop(State::Comment);
507    table[State::CommentEnd as usize][ByteClass::Slash as usize] = Transition::noop(State::Comment);
508    table[State::CommentEnd as usize][ByteClass::Eq as usize] = Transition::noop(State::Comment);
509    table[State::CommentEnd as usize][ByteClass::Quot as usize] = Transition::noop(State::Comment);
510    table[State::CommentEnd as usize][ByteClass::Amp as usize] = Transition::noop(State::Comment);
511    table[State::CommentEnd as usize][ByteClass::Bang as usize] = Transition::noop(State::Comment);
512    table[State::CommentEnd as usize][ByteClass::Alpha as usize] = Transition::noop(State::Comment);
513    table[State::CommentEnd as usize][ByteClass::Whitespace as usize] =
514        Transition::noop(State::Comment);
515    table[State::CommentEnd as usize][ByteClass::Other as usize] = Transition::noop(State::Comment);
516
517    // ----- Doctype state -----
518    table[State::Doctype as usize][ByteClass::Gt as usize] =
519        Transition::new(State::Data, Action::EmitDoctype);
520    table[State::Doctype as usize][ByteClass::Lt as usize] = Transition::noop(State::Doctype);
521    table[State::Doctype as usize][ByteClass::Slash as usize] = Transition::noop(State::Doctype);
522    table[State::Doctype as usize][ByteClass::Eq as usize] = Transition::noop(State::Doctype);
523    table[State::Doctype as usize][ByteClass::Quot as usize] = Transition::noop(State::Doctype);
524    table[State::Doctype as usize][ByteClass::Amp as usize] = Transition::noop(State::Doctype);
525    table[State::Doctype as usize][ByteClass::Bang as usize] = Transition::noop(State::Doctype);
526    table[State::Doctype as usize][ByteClass::Dash as usize] = Transition::noop(State::Doctype);
527    table[State::Doctype as usize][ByteClass::Alpha as usize] = Transition::noop(State::Doctype);
528    table[State::Doctype as usize][ByteClass::Whitespace as usize] =
529        Transition::noop(State::Doctype);
530    table[State::Doctype as usize][ByteClass::Other as usize] = Transition::noop(State::Doctype);
531
532    // ----- CData state -----
533    // CDATA ends with `]]>` — we detect `>` and check context in extraction.
534    table[State::CData as usize][ByteClass::Gt as usize] =
535        Transition::new(State::Data, Action::EmitCData);
536    table[State::CData as usize][ByteClass::Lt as usize] = Transition::noop(State::CData);
537    table[State::CData as usize][ByteClass::Slash as usize] = Transition::noop(State::CData);
538    table[State::CData as usize][ByteClass::Eq as usize] = Transition::noop(State::CData);
539    table[State::CData as usize][ByteClass::Quot as usize] = Transition::noop(State::CData);
540    table[State::CData as usize][ByteClass::Amp as usize] = Transition::noop(State::CData);
541    table[State::CData as usize][ByteClass::Bang as usize] = Transition::noop(State::CData);
542    table[State::CData as usize][ByteClass::Dash as usize] = Transition::noop(State::CData);
543    table[State::CData as usize][ByteClass::Alpha as usize] = Transition::noop(State::CData);
544    table[State::CData as usize][ByteClass::Whitespace as usize] = Transition::noop(State::CData);
545    table[State::CData as usize][ByteClass::Other as usize] = Transition::noop(State::CData);
546
547    // ----- RawText state (script/style content) -----
548    // Everything stays in RawText until we see '<' (potential end tag).
549    table[State::RawText as usize][ByteClass::Lt as usize] =
550        Transition::new(State::TagOpen, Action::FlushText);
551    table[State::RawText as usize][ByteClass::Gt as usize] = Transition::noop(State::RawText);
552    table[State::RawText as usize][ByteClass::Slash as usize] = Transition::noop(State::RawText);
553    table[State::RawText as usize][ByteClass::Eq as usize] = Transition::noop(State::RawText);
554    table[State::RawText as usize][ByteClass::Quot as usize] = Transition::noop(State::RawText);
555    table[State::RawText as usize][ByteClass::Amp as usize] = Transition::noop(State::RawText);
556    table[State::RawText as usize][ByteClass::Bang as usize] = Transition::noop(State::RawText);
557    table[State::RawText as usize][ByteClass::Dash as usize] = Transition::noop(State::RawText);
558    table[State::RawText as usize][ByteClass::Alpha as usize] = Transition::noop(State::RawText);
559    table[State::RawText as usize][ByteClass::Whitespace as usize] =
560        Transition::noop(State::RawText);
561    table[State::RawText as usize][ByteClass::Other as usize] = Transition::noop(State::RawText);
562
563    table
564}
565
#[cfg(test)]
mod tests {
    use super::*;

    /// Fetches the table entry for `state` when the input byte is in `class`.
    fn cell(state: State, class: ByteClass) -> Transition {
        STATE_TABLE[state as usize][class as usize]
    }

    /// Asserts that every `(byte, class)` pair classifies as listed.
    fn assert_classes(cases: &[(u8, ByteClass)]) {
        for &(byte, class) in cases {
            assert_eq!(ByteClass::from_byte(byte), class);
        }
    }

    #[test]
    fn byte_class_delimiters() {
        assert_classes(&[
            (b'<', ByteClass::Lt),
            (b'>', ByteClass::Gt),
            (b'/', ByteClass::Slash),
            (b'=', ByteClass::Eq),
            (b'"', ByteClass::Quot),
            (b'\'', ByteClass::Quot),
            (b'&', ByteClass::Amp),
            (b'!', ByteClass::Bang),
            (b'-', ByteClass::Dash),
        ]);
    }

    #[test]
    fn byte_class_alpha() {
        // Both ends of the lowercase and uppercase ranges.
        for &byte in b"azAZ" {
            assert_eq!(ByteClass::from_byte(byte), ByteClass::Alpha);
        }
    }

    #[test]
    fn byte_class_whitespace() {
        for &byte in b" \t\n\r" {
            assert_eq!(ByteClass::from_byte(byte), ByteClass::Whitespace);
        }
    }

    #[test]
    fn byte_class_other() {
        for &byte in &[b'0', b'[', 0xFF_u8] {
            assert_eq!(ByteClass::from_byte(byte), ByteClass::Other);
        }
    }

    #[test]
    fn data_lt_transitions_to_tag_open() {
        let entry = cell(State::Data, ByteClass::Lt);
        assert_eq!(entry.state, State::TagOpen);
        assert_eq!(entry.action, Action::FlushText);
    }

    #[test]
    fn tag_open_alpha_starts_tag() {
        let entry = cell(State::TagOpen, ByteClass::Alpha);
        assert_eq!(entry.state, State::TagName);
        assert_eq!(entry.action, Action::StartTag);
    }

    #[test]
    fn tag_open_slash_to_end_tag() {
        assert_eq!(cell(State::TagOpen, ByteClass::Slash).state, State::EndTagOpen);
    }

    #[test]
    fn tag_name_gt_emits_tag() {
        let entry = cell(State::TagName, ByteClass::Gt);
        assert_eq!(entry.state, State::Data);
        assert_eq!(entry.action, Action::EmitOpenTagClose);
    }

    #[test]
    fn self_closing_gt_emits() {
        let entry = cell(State::SelfClosingStartTag, ByteClass::Gt);
        assert_eq!(entry.state, State::Data);
        assert_eq!(entry.action, Action::EmitSelfClose);
    }

    #[test]
    fn comment_dash_dash_gt_emits() {
        // Walk the `-->` terminator one byte class at a time.
        let after_first_dash = cell(State::Comment, ByteClass::Dash);
        assert_eq!(after_first_dash.state, State::CommentEndDash);

        let after_second_dash = cell(State::CommentEndDash, ByteClass::Dash);
        assert_eq!(after_second_dash.state, State::CommentEnd);

        let closing = cell(State::CommentEnd, ByteClass::Gt);
        assert_eq!(closing.state, State::Data);
        assert_eq!(closing.action, Action::EmitComment);
    }
}