html5tokenizer/
trace.rs

1//! Provides the [`Trace`] type (byte offsets and syntax information about tokens).
2
3use std::{
4    num::NonZeroUsize,
5    ops::{Index, Range},
6};
7
8use crate::let_else::assume;
9use crate::token::AttributeTraceIdx;
10
11/// Provides byte offsets and syntax information about a [`Token`].
12///
13/// [`Token`]: crate::token::Token
14#[allow(missing_docs)]
15#[derive(Eq, PartialEq, Debug)]
16pub enum Trace {
17    Char(Range<usize>),
18    StartTag(StartTagTrace),
19    EndTag(EndTagTrace),
20    Comment(CommentTrace),
21    Doctype(DoctypeTrace),
22    EndOfFile(usize),
23}
24
25/// Provides byte offsets and syntax information for a [`StartTag`] token.
26///
27/// [`StartTag`]: crate::token::StartTag
28#[derive(Eq, PartialEq, Debug)]
29pub struct StartTagTrace {
30    /// The span of the tag.
31    pub span: Range<usize>,
32
33    /// The span of the tag name.
34    pub name_span: Range<usize>,
35
36    /// List of [`AttributeTrace`]s for the attributes that were present in the source.
37    pub attribute_traces: AttributeTraceList,
38}
39
/// Byte offsets recorded for an [`EndTag`] token.
///
/// [`EndTag`]: crate::token::EndTag
#[derive(Debug, PartialEq, Eq)]
pub struct EndTagTrace {
    /// Byte span of the tag.
    pub span: Range<usize>,

    /// Byte span of the tag name.
    pub name_span: Range<usize>,
}
51
/// Byte offsets recorded for a [`Token::Comment`].
///
/// [`Token::Comment`]: crate::token::Token::Comment
#[derive(Debug, PartialEq, Eq)]
pub struct CommentTrace {
    /// Byte span of the comment data.
    pub data_span: Range<usize>,
}
60
/// Byte offsets recorded for a [`Doctype`] token.
///
/// The spans are exposed through accessor methods, which convert the
/// internally stored offsets to plain `usize` ranges.
///
/// [`Doctype`]: crate::token::Doctype
#[derive(Debug, PartialEq, Eq)]
pub struct DoctypeTrace {
    pub(crate) span: Range<usize>,
    // The optional spans use `NonZeroUsize` offsets to optimize the size of the struct.
    name_span: Option<Range<NonZeroUsize>>,
    public_id_span: Option<Range<NonZeroUsize>>,
    system_id_span: Option<Range<NonZeroUsize>>,
}

impl DoctypeTrace {
    /// Converts an internally stored span to a plain `usize` range.
    fn to_usize_span(stored: Option<&Range<NonZeroUsize>>) -> Option<Range<usize>> {
        let stored = stored?;
        Some(stored.start.get()..stored.end.get())
    }

    /// Returns the span of the DOCTYPE.
    pub fn span(&self) -> Range<usize> {
        self.span.start..self.span.end
    }

    /// Returns the span of the name, or `None` if no name span was recorded.
    pub fn name_span(&self) -> Option<Range<usize>> {
        Self::to_usize_span(self.name_span.as_ref())
    }

    /// Returns the span of the public identifier, or `None` if none was recorded.
    pub fn public_id_span(&self) -> Option<Range<usize>> {
        Self::to_usize_span(self.public_id_span.as_ref())
    }

    /// Returns the span of the system identifier, or `None` if none was recorded.
    pub fn system_id_span(&self) -> Option<Range<usize>> {
        Self::to_usize_span(self.system_id_span.as_ref())
    }
}
100
101/// Internal [`DoctypeTrace`] methods.
102///
103/// Note that even though it stands to reason that the offsets provided to the `set_`
104/// methods can never be zero, we intentionally don't use `new_unchecked` since
105/// actually verifying that the offsets provided to the respective Emitter methods can
106/// never be zero would non-trivial (since the tokenizer state machine has 80 states).
107impl DoctypeTrace {
108    #[inline]
109    pub(crate) fn new(span_start: usize) -> Self {
110        Self {
111            span: span_start..0,
112            name_span: None,
113            public_id_span: None,
114            system_id_span: None,
115        }
116    }
117
118    #[inline]
119    pub(crate) fn set_name_start(&mut self, start: usize) {
120        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero");
121        self.name_span = Some(start..start);
122    }
123
124    #[inline]
125    pub(crate) fn set_public_id_start(&mut self, start: usize) {
126        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero");
127        self.public_id_span = Some(start..start);
128    }
129
130    #[inline]
131    pub(crate) fn set_system_id_start(&mut self, start: usize) {
132        let start = NonZeroUsize::new(start).expect("expected offset to be non-zero");
133        self.system_id_span = Some(start..start);
134    }
135
136    #[inline]
137    pub(crate) fn set_name_end(&mut self, end: usize) {
138        assume!(Some(span), &mut self.name_span);
139        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero");
140    }
141
142    #[inline]
143    pub(crate) fn set_public_id_end(&mut self, end: usize) {
144        assume!(Some(span), &mut self.public_id_span);
145        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero");
146    }
147
148    #[inline]
149    pub(crate) fn set_system_id_end(&mut self, end: usize) {
150        assume!(Some(span), &mut self.system_id_span);
151        span.end = NonZeroUsize::new(end).expect("expected offset to be non-zero");
152    }
153}
154
/// The syntax that was used to write an attribute value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AttrValueSyntax {
    /// The value was written without quotes, e.g. `id=foo`.
    Unquoted,
    /// The value was written in single quotes, e.g. `id='foo'`.
    SingleQuoted,
    /// The value was written in double quotes, e.g. `id="foo"`.
    DoubleQuoted,
}
165
166/// Provides byte offsets and the [`AttrValueSyntax`] for an attribute that was present in the source.
167#[derive(Eq, PartialEq, Debug)]
168pub struct AttributeTrace {
169    pub(crate) value_syntax: Option<AttrValueSyntax>,
170    pub(crate) name_span: Range<usize>,
171    /// We intentionally don't use `Option<Range<O>>` here to spare us a byte (and padding) per attribute.
172    /// For the empty attribute syntax this is just `O::default()..O::default()`.
173    pub(crate) value_span: Range<usize>,
174}
175
176impl AttributeTrace {
177    /// [`AttributeTrace`] intentionally doesn't implement Default
178    /// (since it's part of the public API and it wouldn't make sense semantically).
179    pub(crate) fn new() -> Self {
180        Self {
181            value_syntax: None,
182            name_span: Default::default(),
183            value_span: Default::default(),
184        }
185    }
186
187    /// Returns the span of the attribute name.
188    pub fn name_span(&self) -> Range<usize> {
189        self.name_span.clone()
190    }
191
192    /// For explicitly defined values returns the span of the attribute value.
193    ///
194    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
195    pub fn value_span(&self) -> Option<Range<usize>> {
196        if self.value_syntax.is_none() {
197            return None;
198        }
199        Some(self.value_span.clone())
200    }
201
202    /// Returns the attribute value syntax in case the value is explicitly defined.
203    ///
204    /// Returns `None` for attributes using the empty attribute syntax (e.g. `disabled` in `<input disabled>`).
205    pub fn value_syntax(&self) -> Option<AttrValueSyntax> {
206        self.value_syntax
207    }
208}
209
210/// List of [`AttributeTrace`]s for the attributes that were present in the source.
211#[derive(Eq, PartialEq, Debug)]
212pub struct AttributeTraceList {
213    /// We don't use `HashMap<String, AttributeTrace>` since this would require
214    /// the attribute names to be cloned (which would be less efficient).
215    traces: Vec<AttributeTrace>,
216}
217
218impl Index<AttributeTraceIdx> for AttributeTraceList {
219    type Output = AttributeTrace;
220
221    fn index(&self, index: AttributeTraceIdx) -> &Self::Output {
222        &self.traces[index.0.get() - 1]
223    }
224}
225
226impl AttributeTraceList {
227    pub(crate) fn new() -> Self {
228        Self {
229            traces: Default::default(),
230        }
231    }
232
233    pub(crate) fn insert(&mut self, trace: AttributeTrace) -> AttributeTraceIdx {
234        self.traces.push(trace);
235        let len = self.traces.len();
236        AttributeTraceIdx(
237            // SAFETY: len cannot be zero because we push before calling Vec::len.
238            unsafe { std::num::NonZeroUsize::new_unchecked(len) },
239        )
240    }
241}