lol_html/parser/tag_scanner/
mod.rs

1#[macro_use]
2mod actions;
3mod conditions;
4
5use crate::base::{Align, Bytes, Range};
6use crate::html::{LocalName, LocalNameHash, Namespace, TextType};
7use crate::parser::state_machine::{FeedbackDirective, StateMachine, StateResult};
8use crate::parser::{ParserContext, ParserDirective, ParsingAmbiguityError, TreeBuilderFeedback};
9use crate::rewriter::RewritingError;
10use std::cmp::min;
11
12pub(crate) trait TagHintSink {
13    fn handle_start_tag_hint(
14        &mut self,
15        name: LocalName<'_>,
16        ns: Namespace,
17    ) -> Result<ParserDirective, RewritingError>;
18    fn handle_end_tag_hint(
19        &mut self,
20        name: LocalName<'_>,
21    ) -> Result<ParserDirective, RewritingError>;
22}
23
24pub(crate) type State<S> =
25    fn(&mut TagScanner<S>, context: &mut ParserContext<S>, &[u8]) -> StateResult;
26
27/// Tag scanner skips the majority of lexer operations and, thus,
28/// is faster. It also has much less requirements for buffering which makes it more
29/// prone to bailouts caused by buffer exhaustion (actually it buffers only tag names).
30///
31/// Tag scanner produces tag previews as an output which serve as a hint for
32/// the matcher which can then switch to the lexer if required.
33///
34/// It's not guaranteed that tag preview will actually produce the token in the end
35/// of the input (e.g. `<div` will produce a tag preview, but not tag token). However,
36/// it's not a concern for our use case as no content will be erroneously captured
37/// in this case.
38pub(crate) struct TagScanner<S> {
39    next_pos: usize,
40    is_last_input: bool,
41    tag_start: Option<usize>,
42    ch_sequence_matching_start: Option<usize>,
43    tag_name_start: usize,
44    is_in_end_tag: bool,
45    tag_name_hash: LocalNameHash,
46    last_start_tag_name_hash: LocalNameHash,
47    cdata_allowed: bool,
48    state: State<S>,
49    closing_quote: u8,
50    pending_text_type_change: Option<TextType>,
51    last_text_type: TextType,
52}
53
54impl<S: TagHintSink> TagScanner<S> {
55    pub fn new() -> Self {
56        Self {
57            next_pos: 0,
58            is_last_input: false,
59            tag_start: None,
60            ch_sequence_matching_start: None,
61            tag_name_start: 0,
62            is_in_end_tag: false,
63            tag_name_hash: LocalNameHash::default(),
64            last_start_tag_name_hash: LocalNameHash::default(),
65            cdata_allowed: false,
66            state: Self::data_state,
67            closing_quote: b'"',
68            pending_text_type_change: None,
69            last_text_type: TextType::Data,
70        }
71    }
72
73    fn emit_tag_hint(
74        &mut self,
75        context: &mut ParserContext<S>,
76        input: &[u8],
77        is_in_end_tag: bool,
78    ) -> Result<ParserDirective, RewritingError> {
79        let name_range = Range {
80            start: self.tag_name_start,
81            end: self.pos(),
82        };
83
84        let input_bytes = Bytes::new(input);
85        let name = LocalName::new(&input_bytes, name_range, self.tag_name_hash);
86
87        trace!(@output name);
88
89        if is_in_end_tag {
90            context.output_sink.handle_end_tag_hint(name)
91        } else {
92            self.last_start_tag_name_hash = self.tag_name_hash;
93
94            let ns = context.tree_builder_simulator.current_ns();
95
96            context.output_sink.handle_start_tag_hint(name, ns)
97        }
98    }
99
100    #[inline]
101    fn try_apply_tree_builder_feedback(
102        &mut self,
103        context: &mut ParserContext<S>,
104    ) -> Result<Option<TreeBuilderFeedback>, ParsingAmbiguityError> {
105        let feedback = if self.is_in_end_tag {
106            context
107                .tree_builder_simulator
108                .get_feedback_for_end_tag(self.tag_name_hash)
109        } else {
110            context
111                .tree_builder_simulator
112                .get_feedback_for_start_tag(self.tag_name_hash)?
113        };
114
115        Ok(match feedback {
116            TreeBuilderFeedback::SwitchTextType(text_type) => {
117                // NOTE: we can't switch type immediately as we are in the middle of tag parsing.
118                // So, we need to switch later on the `emit_tag` action.
119                self.pending_text_type_change = Some(text_type);
120                None
121            }
122            TreeBuilderFeedback::SetAllowCdata(cdata_allowed) => {
123                self.cdata_allowed = cdata_allowed;
124                None
125            }
126            TreeBuilderFeedback::RequestLexeme(_) => Some(feedback),
127            TreeBuilderFeedback::None => None,
128        })
129    }
130
131    #[inline]
132    fn take_feedback_directive(&mut self) -> FeedbackDirective {
133        self.pending_text_type_change
134            .take()
135            .map_or(FeedbackDirective::Skip, |text_type| {
136                FeedbackDirective::ApplyUnhandledFeedback(TreeBuilderFeedback::SwitchTextType(
137                    text_type,
138                ))
139            })
140    }
141}
142
143impl<S: TagHintSink> StateMachine for TagScanner<S> {
144    impl_common_sm_accessors!();
145    impl_common_input_cursor_methods!();
146
147    #[inline]
148    fn set_state(&mut self, state: State<S>) {
149        self.state = state;
150    }
151
152    #[inline]
153    fn state(&self) -> State<S> {
154        self.state
155    }
156
157    #[inline]
158    fn get_consumed_byte_count(&self, input: &[u8]) -> usize {
159        // NOTE: if we are in character sequence matching we need
160        // to block from the position where matching starts. We don't
161        // need to do that manually in the lexer because it
162        // always blocks all bytes starting from lexeme start and it's
163        // guaranteed that character sequence matching occurs withih
164        // lexeme boundaries.
165        match (self.tag_start, self.ch_sequence_matching_start) {
166            (Some(tag_start), Some(ch_sequence_matching_start)) => {
167                min(tag_start, ch_sequence_matching_start)
168            }
169            (Some(tag_start), None) => tag_start,
170            (None, Some(ch_sequence_matching_start)) => ch_sequence_matching_start,
171            (None, None) => input.len(),
172        }
173    }
174
175    fn adjust_for_next_input(&mut self) {
176        if let Some(tag_start) = self.tag_start {
177            self.tag_name_start.align(tag_start);
178            self.tag_start = Some(0);
179        }
180    }
181
182    #[inline]
183    fn adjust_to_bookmark(&mut self, _pos: usize, _feedback_directive: FeedbackDirective) {
184        trace!(@noop);
185    }
186
187    #[inline]
188    fn enter_ch_sequence_matching(&mut self) {
189        self.ch_sequence_matching_start = Some(self.pos());
190    }
191
192    #[inline]
193    fn leave_ch_sequence_matching(&mut self) {
194        self.ch_sequence_matching_start = None;
195    }
196}