Skip to main content

lol_html/parser/
mod.rs

1#[macro_use]
2mod state_machine;
3
4mod lexer;
5mod tag_scanner;
6mod tree_builder_simulator;
7
8use self::lexer::Lexer;
9pub(crate) use self::lexer::{
10    AttributeBuffer, AttributeOutline, Lexeme, LexemeSink, NonTagContentLexeme,
11    NonTagContentTokenOutline, TagLexeme, TagTokenOutline,
12};
13use self::state_machine::StateMachine;
14pub(crate) use self::state_machine::{ActionError, ActionResult};
15pub(crate) use self::tag_scanner::TagHintSink;
16use self::tag_scanner::TagScanner;
17pub use self::tree_builder_simulator::ParsingAmbiguityError;
18use self::tree_builder_simulator::{TreeBuilderFeedback, TreeBuilderSimulator};
19use crate::rewriter::RewritingError;
20use cfg_if::cfg_if;
21
/// Selects which of the parser's two state machines processes the input.
///
/// NOTE: the tag scanner can implicitly force the parser to switch to the
/// lexer mode if it fails to get tree builder feedback. It is then up to the
/// consumer to switch the parser back to the tag scan mode in the tag handler.
#[derive(Debug, Clone, Copy)]
pub(crate) enum ParserDirective {
    /// Input is handed to the tag scanner.
    WherePossibleScanForTagsOnly,
    /// Input is handed to the lexer.
    Lex,
}
31
32pub(crate) struct ParserContext<S> {
33    output_sink: S,
34    tree_builder_simulator: TreeBuilderSimulator,
35    /// Amount of bytes consumed by previous calls to `parse()`,
36    /// i.e. number of bytes from the start of the document until the start of the current input slice
37    previously_consumed_byte_count: usize,
38}
39
40pub(crate) trait ParserOutputSink: LexemeSink + TagHintSink {}
41
42// Pub only for integration tests
43pub struct Parser<S> {
44    lexer: Lexer<S>,
45    tag_scanner: TagScanner<S>,
46    current_directive: ParserDirective,
47    context: ParserContext<S>,
48}
49
50// public only for integration tests
51#[allow(private_bounds, private_interfaces)]
52impl<S: ParserOutputSink> Parser<S> {
53    #[must_use]
54    #[inline(never)]
55    pub fn new(output_sink: S, initial_directive: ParserDirective, strict: bool) -> Self {
56        let context = ParserContext {
57            output_sink,
58            previously_consumed_byte_count: 0,
59            tree_builder_simulator: TreeBuilderSimulator::new(strict),
60        };
61
62        Self {
63            lexer: Lexer::new(),
64            tag_scanner: TagScanner::new(),
65            current_directive: initial_directive,
66            context,
67        }
68    }
69
70    // generic methods tend to be inlined, but this one is called from a couple of places,
71    // and has cheap-to-pass non-constants args, so it won't benefit from being merged into its callers.
72    // It's better to outline it, and let its callers be inlined.
73    #[inline(never)]
74    pub fn parse(&mut self, input: &[u8], last: bool) -> Result<usize, RewritingError> {
75        let mut parse_result = match self.current_directive {
76            ParserDirective::WherePossibleScanForTagsOnly => {
77                self.tag_scanner
78                    .run_parsing_loop(&mut self.context, input, last)
79            }
80            ParserDirective::Lex => self.lexer.run_parsing_loop(&mut self.context, input, last),
81        };
82
83        loop {
84            let unboxed = match parse_result {
85                Ok(unreachable) => match unreachable {},
86                Err(boxed) => *boxed,
87            };
88            match unboxed {
89                ActionError::EndOfInput {
90                    consumed_byte_count,
91                } => {
92                    self.context.previously_consumed_byte_count += consumed_byte_count;
93                    return Ok(consumed_byte_count);
94                }
95                ActionError::ParserDirectiveChangeRequired(new_directive, sm_bookmark) => {
96                    self.current_directive = new_directive;
97
98                    trace!(@continue_from_bookmark sm_bookmark, self.current_directive, input);
99
100                    parse_result = match self.current_directive {
101                        ParserDirective::WherePossibleScanForTagsOnly => self
102                            .tag_scanner
103                            .continue_from_bookmark(&mut self.context, input, last, sm_bookmark),
104                        ParserDirective::Lex => self.lexer.continue_from_bookmark(
105                            &mut self.context,
106                            input,
107                            last,
108                            sm_bookmark,
109                        ),
110                    };
111                }
112                ActionError::RewritingError(err) => return Err(err),
113                ActionError::Internal(err) => {
114                    return Err(RewritingError::ContentHandlerError(err.into()));
115                }
116            }
117        }
118    }
119
120    pub fn get_dispatcher(&mut self) -> &mut S {
121        &mut self.context.output_sink
122    }
123}
124
125cfg_if! {
126    if #[cfg(feature = "integration_test")] {
127        use crate::html::{LocalNameHash, TextType};
128
129        #[allow(private_bounds)]
130        impl<S: ParserOutputSink> Parser<S> {
131            pub fn switch_text_type(&mut self, text_type: TextType) {
132                match self.current_directive {
133                    ParserDirective::WherePossibleScanForTagsOnly => {
134                        self.tag_scanner.switch_text_type(text_type);
135                    }
136                    ParserDirective::Lex => self.lexer.switch_text_type(text_type),
137                }
138            }
139
140            pub fn set_last_start_tag_name_hash(&mut self, name_hash: LocalNameHash) {
141                match self.current_directive {
142                    ParserDirective::WherePossibleScanForTagsOnly => {
143                        self.tag_scanner.set_last_start_tag_name_hash(name_hash);
144                    }
145                    ParserDirective::Lex => self.lexer.set_last_start_tag_name_hash(name_hash),
146                }
147            }
148        }
149    }
150}