Skip to main content

lol_html/parser/state_machine/
mod.rs

1#[macro_use]
2mod syntax_dsl;
3
4#[macro_use]
5mod syntax;
6
7use crate::html::{LocalNameHash, TextType};
8use crate::parser::{ParserDirective, ParsingAmbiguityError, TreeBuilderFeedback};
9use crate::rewriter::RewritingError;
10use std::fmt::{self, Debug};
11use std::mem;
12
/// Directive stored in a `StateMachineBookmark` and passed to
/// `StateMachine::adjust_to_bookmark` when parsing is resumed from that bookmark.
pub(crate) enum FeedbackDirective {
    /// Carries a `TreeBuilderFeedback` value.
    // NOTE(review): presumably feedback that still has to be applied once parsing
    // resumes — confirm against the `adjust_to_bookmark` implementations.
    ApplyUnhandledFeedback(TreeBuilderFeedback),
    // NOTE(review): exact meaning is defined by the `adjust_to_bookmark`
    // implementations, which are not visible in this file.
    Skip,
    /// No directive. This is also the value left behind by `FeedbackDirective::take`.
    None,
}
18
19impl FeedbackDirective {
20    #[inline]
21    pub fn take(&mut self) -> Self {
22        mem::replace(self, Self::None)
23    }
24}
25
26impl Debug for FeedbackDirective {
27    #[cold]
28    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
29        write!(
30            f,
31            "{}",
32            match self {
33                Self::ApplyUnhandledFeedback(_) => "ApplyPendingFeedback",
34                Self::Skip => "Skip",
35                Self::None => "None",
36            }
37        )
38    }
39}
40
/// Snapshot of state machine state taken by `StateMachine::create_bookmark`
/// and restored by `StateMachine::continue_from_bookmark`.
#[derive(Debug)]
pub(crate) struct StateMachineBookmark {
    // Value of `cdata_allowed()` at the time the bookmark was created.
    cdata_allowed: bool,
    // Value of `last_text_type()` at the time the bookmark was created; on resume
    // it is handed to `switch_text_type`, which also selects the parsing state.
    text_type: TextType,
    // Value of `last_start_tag_name_hash()` at the time the bookmark was created.
    last_start_tag_name_hash: LocalNameHash,
    // Input position supplied by the bookmark's creator; on resume it is passed
    // to both `adjust_to_bookmark` and `set_pos`.
    // NOTE: pub because it's used by trace!.
    pub pos: usize,
    // Directive passed to `adjust_to_bookmark` on resume.
    feedback_directive: FeedbackDirective,
}
50
/// Reasons for which a state machine action or state returns early.
///
/// Values travel boxed inside `ActionResult` (`Result<T, Box<ActionError>>`).
pub(crate) enum ActionError {
    /// Wraps a `RewritingError`; produced by the `From<RewritingError>` and
    /// `From<ParsingAmbiguityError>` conversions into `Box<ActionError>`.
    RewritingError(RewritingError),
    /// Produced by `StateMachine::change_parser_directive`: carries the requested
    /// directive together with a bookmark of the current state machine state.
    ParserDirectiveChangeRequired(ParserDirective, StateMachineBookmark),
    /// Produced by `StateMachine::break_on_end_of_input`; `consumed_byte_count`
    /// is the value reported by `get_consumed_byte_count` for the current input.
    EndOfInput { consumed_byte_count: usize },
    /// Produced by `ActionError::internal` with a static description.
    Internal(&'static str),
}
57
58impl ActionError {
59    #[cold]
60    #[cfg_attr(debug_assertions, track_caller)]
61    #[allow(clippy::unnecessary_box_returns)]
62    pub(crate) fn internal(error: &'static str) -> Box<Self> {
63        debug_assert!(false, "{error}");
64        Box::new(Self::Internal(error))
65    }
66}
67
68impl From<ParsingAmbiguityError> for Box<ActionError> {
69    #[cold]
70    fn from(err: ParsingAmbiguityError) -> Self {
71        Self::new(ActionError::RewritingError(
72            RewritingError::ParsingAmbiguity(err),
73        ))
74    }
75}
76
77impl From<RewritingError> for Box<ActionError> {
78    #[cold]
79    fn from(err: RewritingError) -> Self {
80        Self::new(ActionError::RewritingError(err))
81    }
82}
83
/// Uninhabited type: it has no variants, so no value of it can be constructed.
/// Used as the `Ok` type of `ParseResult`.
// TODO: use the `!` type once it becomes stable.
pub enum Never {}
86
/// Result of a state machine action; the error is boxed and `T` defaults to `()`.
pub type ActionResult<T = ()> = Result<T, Box<ActionError>>;
/// Result of running a single state function.
pub type StateResult = ActionResult<()>;
/// Result of the parsing loop. The `Ok` type is uninhabited, so a value of this
/// type is always an `Err`.
pub type ParseResult = ActionResult<Never>;
90
/// Actions that a `StateMachine` implementor must provide.
///
/// Every action receives the implementor-defined `Context` and the input chunk
/// currently being parsed. Actions returning `ActionResult` may fail with a boxed
/// `ActionError`; the remaining actions are infallible.
///
// NOTE(review): the actions are presumably invoked from the state definitions
// expanded by the `*_states_group!` macros; their exact semantics are defined by
// the implementors, which are not visible in this file. The group headings below
// are derived from the method names only.
pub(crate) trait StateMachineActions {
    /// Implementor-defined value threaded through every action call.
    type Context;

    // Emission actions (fallible).
    fn emit_text_and_eof(&mut self, context: &mut Self::Context, input: &[u8]) -> ActionResult;
    fn emit_text(&mut self, context: &mut Self::Context, input: &[u8]) -> ActionResult;
    fn emit_current_token(&mut self, context: &mut Self::Context, input: &[u8]) -> ActionResult;
    fn emit_tag(&mut self, context: &mut Self::Context, input: &[u8]) -> ActionResult;
    fn emit_current_token_and_eof(
        &mut self,
        context: &mut Self::Context,
        input: &[u8],
    ) -> ActionResult;
    fn emit_raw_without_token(&mut self, context: &mut Self::Context, input: &[u8])
    -> ActionResult;
    fn emit_raw_without_token_and_eof(
        &mut self,
        context: &mut Self::Context,
        input: &[u8],
    ) -> ActionResult;

    // Token creation.
    fn create_start_tag(&mut self, context: &mut Self::Context, input: &[u8]);
    fn create_end_tag(&mut self, context: &mut Self::Context, input: &[u8]);
    fn create_doctype(&mut self, context: &mut Self::Context, input: &[u8]);
    fn create_comment(&mut self, context: &mut Self::Context, input: &[u8]);

    fn start_token_part(&mut self, context: &mut Self::Context, input: &[u8]);

    // Comment text boundaries.
    fn mark_comment_text_end(&mut self, context: &mut Self::Context, input: &[u8]);
    fn shift_comment_text_end_by(
        &mut self,
        context: &mut Self::Context,
        input: &[u8],
        offset: usize,
    );

    // Doctype parts.
    fn set_force_quirks(&mut self, context: &mut Self::Context, input: &[u8]);
    fn finish_doctype_name(&mut self, context: &mut Self::Context, input: &[u8]);
    fn finish_doctype_public_id(&mut self, context: &mut Self::Context, input: &[u8]);
    fn finish_doctype_system_id(&mut self, context: &mut Self::Context, input: &[u8]);

    // Tag name handling; `finish_tag_name` is the only fallible action here.
    fn finish_tag_name(&mut self, context: &mut Self::Context, input: &[u8]) -> ActionResult;
    fn update_tag_name_hash(&mut self, context: &mut Self::Context, input: &[u8]);
    fn mark_as_self_closing(&mut self, context: &mut Self::Context, input: &[u8]);

    // Attribute parts.
    fn start_attr(&mut self, context: &mut Self::Context, input: &[u8]);
    fn finish_attr_name(&mut self, context: &mut Self::Context, input: &[u8]);
    fn finish_attr_value(&mut self, context: &mut Self::Context, input: &[u8]);
    fn finish_attr(&mut self, context: &mut Self::Context, input: &[u8]);

    // Closing quote selection; the chosen byte is read back through
    // `StateMachine::closing_quote`.
    fn set_closing_quote_to_double(&mut self, context: &mut Self::Context, input: &[u8]);
    fn set_closing_quote_to_single(&mut self, context: &mut Self::Context, input: &[u8]);

    // Tag start marker.
    fn mark_tag_start(&mut self, context: &mut Self::Context, input: &[u8]);
    fn unmark_tag_start(&mut self, context: &mut Self::Context, input: &[u8]);

    // CDATA section entry/exit.
    fn enter_cdata(&mut self, context: &mut Self::Context, input: &[u8]);
    fn leave_cdata(&mut self, context: &mut Self::Context, input: &[u8]);
}
149
/// Read-only predicates that a `StateMachine` implementor must provide.
pub(crate) trait StateMachineConditions {
    // NOTE(review): presumably compares the current end tag with the last start
    // tag name hash — confirm against the implementors.
    fn is_appropriate_end_tag(&self) -> bool;
    /// Current "CDATA allowed" flag; captured into bookmarks by
    /// `StateMachine::create_bookmark`.
    fn cdata_allowed(&self) -> bool;
}
154
155pub(crate) trait StateMachine: StateMachineActions + StateMachineConditions {
156    cdata_section_states_group!();
157    data_states_group!();
158    plaintext_states_group!();
159    rawtext_states_group!();
160    rcdata_states_group!();
161    script_data_states_group!();
162    script_data_escaped_states_group!();
163    script_data_double_escaped_states_group!();
164    tag_states_group!();
165    attributes_states_group!();
166    comment_states_group!();
167    doctype_states_group!();
168
169    fn state(&self) -> fn(&mut Self, context: &mut Self::Context, &[u8]) -> StateResult;
170    fn set_state(
171        &mut self,
172        state: fn(&mut Self, context: &mut Self::Context, &[u8]) -> StateResult,
173    );
174
175    fn last_start_tag_name_hash(&self) -> LocalNameHash;
176    fn set_last_start_tag_name_hash(&mut self, name_hash: LocalNameHash);
177
178    fn set_last_text_type(&mut self, text_type: TextType);
179    fn last_text_type(&self) -> TextType;
180
181    fn set_cdata_allowed(&mut self, cdata_allowed: bool);
182
183    fn closing_quote(&self) -> u8;
184
185    fn adjust_for_next_input(&mut self);
186    fn adjust_to_bookmark(&mut self, pos: usize, feedback_directive: FeedbackDirective);
187    fn enter_ch_sequence_matching(&mut self);
188    fn leave_ch_sequence_matching(&mut self);
189    fn get_consumed_byte_count(&self, input: &[u8]) -> usize;
190
191    fn consume_ch(&mut self, input: &[u8]) -> Option<u8>;
192    /// true if it matched (`consume_ch` would return the `needle`), false if reached end of input
193    fn consume_until(&mut self, needle: u8, input: &[u8]) -> bool;
194    fn unconsume_ch(&mut self);
195    fn consume_several(&mut self, count: usize);
196    fn lookahead(&self, input: &[u8], offset: usize) -> Option<u8>;
197    fn pos(&self) -> usize;
198    fn set_pos(&mut self, pos: usize);
199    fn is_last_input(&self) -> bool;
200    fn set_is_last_input(&mut self, last: bool);
201
202    fn run_parsing_loop(
203        &mut self,
204        context: &mut Self::Context,
205        input: &[u8],
206        last: bool,
207    ) -> ParseResult {
208        self.set_is_last_input(last);
209
210        loop {
211            self.state()(self, context, input)?;
212        }
213    }
214
215    fn continue_from_bookmark(
216        &mut self,
217        context: &mut Self::Context,
218        input: &[u8],
219        last: bool,
220        bookmark: StateMachineBookmark,
221    ) -> ParseResult {
222        self.set_cdata_allowed(bookmark.cdata_allowed);
223        self.switch_text_type(bookmark.text_type);
224        self.set_last_start_tag_name_hash(bookmark.last_start_tag_name_hash);
225        self.adjust_to_bookmark(bookmark.pos, bookmark.feedback_directive);
226        self.set_pos(bookmark.pos);
227
228        self.run_parsing_loop(context, input, last)
229    }
230
231    #[cold]
232    fn break_on_end_of_input(&mut self, input: &[u8]) -> StateResult {
233        let consumed_byte_count = self.get_consumed_byte_count(input);
234
235        if !self.is_last_input() {
236            self.adjust_for_next_input();
237        }
238
239        self.set_pos(self.pos() - consumed_byte_count);
240
241        Err(Box::new(ActionError::EndOfInput {
242            consumed_byte_count,
243        }))
244    }
245
246    #[inline]
247    fn create_bookmark(
248        &self,
249        pos: usize,
250        feedback_directive: FeedbackDirective,
251    ) -> StateMachineBookmark {
252        StateMachineBookmark {
253            cdata_allowed: self.cdata_allowed(),
254            text_type: self.last_text_type(),
255            last_start_tag_name_hash: self.last_start_tag_name_hash(),
256            pos,
257            feedback_directive,
258        }
259    }
260
261    #[inline]
262    fn change_parser_directive(
263        &self,
264        pos: usize,
265        new_parser_directive: ParserDirective,
266        feedback_directive: FeedbackDirective,
267    ) -> ActionResult {
268        Err(Box::new(ActionError::ParserDirectiveChangeRequired(
269            new_parser_directive,
270            self.create_bookmark(pos, feedback_directive),
271        )))
272    }
273
274    #[inline]
275    fn switch_text_type(&mut self, text_type: TextType) {
276        self.set_last_text_type(text_type);
277        self.set_state(self.next_text_parsing_state());
278    }
279
280    #[inline]
281    fn next_text_parsing_state(&self) -> fn(&mut Self, &mut Self::Context, &[u8]) -> StateResult {
282        match self.last_text_type() {
283            TextType::Data => Self::data_state,
284            TextType::PlainText => Self::plaintext_state,
285            TextType::RCData => Self::rcdata_state,
286            TextType::RawText => Self::rawtext_state,
287            TextType::ScriptData => Self::script_data_state,
288            TextType::CDataSection => Self::cdata_section_state,
289        }
290    }
291}
292
// Expands to the field-backed `StateMachine` accessors. The implementing type
// must have `last_text_type`, `last_start_tag_name_hash`, `closing_quote` and
// `cdata_allowed` fields.
macro_rules! impl_common_sm_accessors {
    () => {
        // Last text type: getter and setter.
        #[inline]
        fn last_text_type(&self) -> TextType {
            self.last_text_type
        }

        #[inline]
        fn set_last_text_type(&mut self, text_type: TextType) {
            self.last_text_type = text_type;
        }

        // Last start tag name hash: getter and setter.
        #[inline]
        fn last_start_tag_name_hash(&self) -> LocalNameHash {
            self.last_start_tag_name_hash
        }

        #[inline]
        fn set_last_start_tag_name_hash(&mut self, name_hash: LocalNameHash) {
            self.last_start_tag_name_hash = name_hash;
        }

        // Closing quote byte (read-only here).
        #[inline]
        fn closing_quote(&self) -> u8 {
            self.closing_quote
        }

        // CDATA flag (write-only here).
        #[inline]
        fn set_cdata_allowed(&mut self, cdata_allowed: bool) {
            self.cdata_allowed = cdata_allowed;
        }
    };
}
326
// Expands to the `StateMachineActions` methods whose implementation only needs
// the `closing_quote` field and `set_last_text_type`. None of them use the
// context or the input.
macro_rules! impl_common_sm_actions {
    () => {
        // Closing quote selection.
        #[inline]
        fn set_closing_quote_to_single(&mut self, _context: &mut Self::Context, _input: &[u8]) {
            self.closing_quote = b'\'';
        }

        #[inline]
        fn set_closing_quote_to_double(&mut self, _context: &mut Self::Context, _input: &[u8]) {
            self.closing_quote = b'"';
        }

        // CDATA entry/exit: only the last text type is updated.
        #[inline]
        fn enter_cdata(&mut self, _context: &mut Self::Context, _input: &[u8]) {
            self.set_last_text_type(TextType::CDataSection);
        }

        #[inline]
        fn leave_cdata(&mut self, _context: &mut Self::Context, _input: &[u8]) {
            self.set_last_text_type(TextType::Data);
        }
    };
}
350
// Expands to the `StateMachine` input cursor methods. The implementing type must
// have a `next_pos: usize` field (index of the next byte to read) and an
// `is_last_input: bool` field.
macro_rules! impl_common_input_cursor_methods {
    () => {
        // Reads the byte at `next_pos` (`None` when out of bounds) and advances the
        // cursor by one in either case.
        #[inline]
        #[allow(clippy::let_and_return)]
        fn consume_ch(&mut self, input: &[u8]) -> Option<u8> {
            let read_at = self.next_pos;

            self.next_pos = read_at + 1;

            let ch = input.get(read_at).copied();

            trace!(@chars "consume", ch);

            ch
        }

        // Searches for `needle` from `next_pos` onwards. The cursor ends up one past
        // the needle when it is found, or one past the end of the remaining input
        // when it is not.
        #[inline]
        fn consume_until(&mut self, needle: u8, input: &[u8]) -> bool {
            let rest = input.get(self.next_pos..).unwrap_or(&[]);
            let found_at = memchr::memchr(needle, rest);

            self.next_pos += 1 + found_at.unwrap_or(rest.len());

            found_at.is_some()
        }

        // Steps the cursor back by one byte.
        #[inline]
        fn unconsume_ch(&mut self) {
            self.next_pos -= 1;

            trace!(@chars "unconsume");
        }

        // Advances the cursor by `count` bytes without reading them.
        #[inline]
        fn consume_several(&mut self, count: usize) {
            self.next_pos += count;

            trace!(@chars "consume several");
        }

        // Peeks at the byte at `next_pos + offset - 1` without moving the cursor.
        #[inline]
        #[allow(clippy::let_and_return)]
        fn lookahead(&self, input: &[u8], offset: usize) -> Option<u8> {
            let peek_at = self.next_pos + offset - 1;
            let ch = input.get(peek_at).copied();

            trace!(@chars "lookahead", ch);

            ch
        }

        // Reported position is one behind `next_pos`.
        #[inline]
        fn pos(&self) -> usize {
            self.next_pos - 1
        }

        // Sets `next_pos` directly to `pos`.
        #[inline]
        fn set_pos(&mut self, pos: usize) {
            self.next_pos = pos;
        }

        #[inline]
        fn is_last_input(&self) -> bool {
            self.is_last_input
        }

        #[inline]
        fn set_is_last_input(&mut self, last: bool) {
            self.is_last_input = last;
        }
    };
}
426
// Expands, for every listed name, to an infallible action that ignores its
// arguments and only emits a `trace!(@noop)`.
macro_rules! noop_action {
    ($($action:ident),*) => {
        $(
            #[inline]
            fn $action(&mut self, _context: &mut Self::Context, _input: &[u8]) {
                trace!(@noop);
            }
        )*
    };
}
437
// Expands, for every listed name, to a fallible action that ignores its
// arguments, emits a `trace!(@noop)` and always succeeds.
macro_rules! noop_action_with_result {
    ($($action:ident),*) => {
        $(
            #[inline]
            fn $action(&mut self, _context: &mut Self::Context, _input: &[u8]) -> ActionResult {
                trace!(@noop);

                Ok(())
            }
        )*
    };
}
449}