//! Provides the [`Token`] type and the tag, doctype and attribute types it is built from.
use std::collections::{btree_map, BTreeMap};
use std::fmt::Debug;
use std::iter::FromIterator;
use std::ops::Index;
/// A type for the tokens emitted by a WHATWG-compliant HTML tokenizer.
///
/// Equality compares semantics only: the trace indices stored alongside the
/// attributes of a [`StartTag`] do not take part in the comparison.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Token {
/// A literal character, a resolved character reference,
/// or part of a resolved character reference (since some
/// character references resolve to two `char`s).
Char(char),
/// An HTML start tag.
StartTag(StartTag),
/// An HTML end tag.
EndTag(EndTag),
/// An HTML comment.
Comment(String),
/// An HTML doctype declaration.
Doctype(Doctype),
/// An end-of-file token. This is the only variant without a payload.
EndOfFile,
}
/// An HTML start tag, such as `<p>` or `<a>`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct StartTag {
/// The tag name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
/// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
/// expected.
pub self_closing: bool,
/// A mapping for any HTML attributes this start tag may have.
///
/// Duplicate attributes are ignored after the first one as per WHATWG spec.
///
/// Attributes are looked up by name; iterating the map yields them
/// ordered by name rather than in source order (see [`AttributeMap`]).
pub attributes: AttributeMap,
}
/// An HTML end/close tag, such as `</p>` or `</a>`.
///
/// Unlike a [`StartTag`], an end tag only carries its name.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct EndTag {
/// The tag name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
}
/// A doctype. Some examples:
///
/// * `<!DOCTYPE {name}>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}'>`
/// * `<!DOCTYPE {name} SYSTEM '{system_id}'>`
/// * `<!DOCTYPE {name} PUBLIC '{public_id}' '{system_id}'>`
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Doctype {
/// The [force-quirks flag].
///
/// [force-quirks flag]: https://html.spec.whatwg.org/multipage/parsing.html#force-quirks-flag
pub force_quirks: bool,
// NOTE(review): for the three optional fields below, `None` presumably means
// the part was absent from the doctype (as opposed to present but empty) —
// confirm against the tokenizer that fills them in.
/// The doctype's name. Uppercase ASCII characters (A-Z) have been
/// converted to lowercase. For HTML documents this should be "html".
pub name: Option<String>,
/// The doctype's public identifier.
pub public_id: Option<String>,
/// The doctype's system identifier.
pub system_id: Option<String>,
}
/// A map of HTML attributes.
///
/// Does not preserve the order of attributes.
/// Iterating always yields attributes in order by name.
///
/// Values can be looked up with [`AttributeMap::get`] or by indexing the
/// map with a `&str`, as the example below does.
///
/// # Example
///
/// ```
/// # use html5tokenizer::attr::AttributeMap;
/// let attrs: AttributeMap = vec![("href".into(), "http://example.com".into())]
///     .into_iter()
///     .collect();
/// assert_eq!(&attrs["href"], "http://example.com");
/// ```
#[derive(Clone, Default, PartialEq, Eq)] // Debug has a custom impl below
pub struct AttributeMap {
// Keyed by attribute name. Being a `BTreeMap` is what makes iteration
// ordered by name instead of by insertion.
pub(crate) inner: BTreeMap<String, AttrInternal>,
}
/// The value type internally used by the [`AttributeMap`].
/// Not part of the public API.
///
/// Two values compare equal when their `value` fields are equal;
/// `trace_idx` is ignored by the hand-written `PartialEq` impl.
#[derive(Clone, Default, Eq)] // Debug and PartialEq have custom impls below
pub(crate) struct AttrInternal {
/// The attribute value.
pub value: String,
/// The index of the trace belonging to this attribute, if there is one.
pub trace_idx: Option<AttributeTraceIdx>,
}
/// The index of an [`AttributeTrace`] within an [`AttributeTraceList`].
///
/// The wrapped value is a [`NonZeroUsize`](std::num::NonZeroUsize)
/// and can therefore never be zero.
///
/// [`AttributeTrace`]: crate::trace::AttributeTrace
/// [`AttributeTraceList`]: crate::trace::AttributeTraceList
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub struct AttributeTraceIdx(
// Using NonZeroUsize so that `Option<AttributeTraceIdx>`
// has the same size as `AttributeTraceIdx`.
pub std::num::NonZeroUsize,
);
impl PartialEq for AttrInternal {
    /// Compares two map entries by their `value` alone.
    fn eq(&self, other: &Self) -> bool {
        // `trace_idx` is left out on purpose: comparing `Token`s should
        // only compare their semantics.
        let Self { value: lhs, .. } = self;
        let Self { value: rhs, .. } = other;
        lhs == rhs
    }
}
/// An HTML attribute borrowed from an [`AttributeMap`].
///
/// The parts are available through [`Attribute::name`],
/// [`Attribute::value`] and [`Attribute::trace_idx`].
#[derive(Eq, PartialEq)] // Debug has a custom impl below
pub struct Attribute<'a> {
// The attribute name, borrowed from the key of the map entry.
name: &'a str,
// The map entry itself, holding the value and the optional trace index.
map_val: &'a AttrInternal,
}
/// An owned HTML attribute.
///
/// This is the item type produced when an [`AttributeMap`] is consumed
/// by value.
#[derive(Debug, PartialEq, Eq)]
pub struct AttributeOwned {
/// The attribute name.
/// Uppercase ASCII characters (A-Z) have been converted to lowercase.
pub name: String,
/// The attribute value. Character references have been resolved.
pub value: String,
/// The index of the corresponding [`AttributeTrace`] in the
/// `attribute_traces` field of [`StartTagTrace`], in case this attribute
/// was present in the source and the [`Emitter`] has tracked this.
///
/// [`AttributeTrace`]: super::trace::AttributeTrace
/// [`StartTagTrace`]: super::trace::StartTagTrace
/// [`Emitter`]: super::Emitter
pub trace_idx: Option<AttributeTraceIdx>,
}
impl AttributeMap {
/// Returns the value for the given attribute name.
///
/// The name must not contain any uppercase ASCII character (A-Z)
/// or the method will always return `None`.
pub fn get(&self, name: &str) -> Option<&str> {
self.inner.get(name).map(|map_val| map_val.value.as_str())
}
/// Returns the value and trace index for a given attribute name.
///
/// The name must not contain any uppercase ASCII character (A-Z)
/// or the method will always return `None`.
pub fn value_and_trace_idx(&self, name: &str) -> Option<(&str, Option<AttributeTraceIdx>)> {
self.inner
.get(name)
.map(|map_val| (map_val.value.as_str(), map_val.trace_idx))
}
}
impl<'a> Attribute<'a> {
    /// Returns the attribute name.
    /// Uppercase ASCII characters (A-Z) have been converted to lowercase.
    pub fn name(&self) -> &'a str {
        self.name
    }

    /// Returns the attribute value. Character references have been resolved.
    pub fn value(&self) -> &'a str {
        self.map_val.value.as_str()
    }

    /// Returns the index of the corresponding [`AttributeTrace`] in the
    /// `attribute_traces` field of [`StartTagTrace`], in case this attribute
    /// was present in the source and the [`Emitter`] has tracked that.
    ///
    /// [`AttributeTrace`]: super::trace::AttributeTrace
    /// [`StartTagTrace`]: super::trace::StartTagTrace
    /// [`Emitter`]: super::Emitter
    pub fn trace_idx(&self) -> Option<AttributeTraceIdx> {
        self.map_val.trace_idx
    }
}
// `Index<Output = Attribute>` is not possible here: `Index::index` has to
// return a reference to its `Output`, and an `Attribute` built inside
// `index` would be a temporary that cannot be referenced after returning.
impl Index<&str> for AttributeMap {
    type Output = str;

    /// Returns the value of the attribute with the given name.
    ///
    /// # Panics
    ///
    /// Panics if the map holds no attribute with that name.
    /// The name must not contain any uppercase ASCII character (A-Z)
    /// or the method will always panic.
    fn index(&self, name: &str) -> &Self::Output {
        let entry = &self.inner[name];
        entry.value.as_str()
    }
}
impl IntoIterator for AttributeMap {
    type Item = AttributeOwned;
    type IntoIter = AttrIntoIter;

    /// Consumes the map and returns an iterator over its attributes as
    /// owned values, backed by the consuming iterator of the inner map.
    fn into_iter(self) -> Self::IntoIter {
        let Self { inner } = self;
        AttrIntoIter(inner.into_iter())
    }
}
/// A consuming iterator over the attributes of an [`AttributeMap`].
///
/// Created by calling `into_iter` on an owned [`AttributeMap`].
pub struct AttrIntoIter(btree_map::IntoIter<String, AttrInternal>);

impl Iterator for AttrIntoIter {
    type Item = AttributeOwned;

    /// Takes the next entry out of the underlying map iterator and turns
    /// it into an [`AttributeOwned`].
    fn next(&mut self) -> Option<Self::Item> {
        let (name, map_val) = self.0.next()?;
        Some(AttributeOwned {
            name,
            value: map_val.value,
            trace_idx: map_val.trace_idx,
        })
    }

    /// Forwards the size hint of the underlying map iterator.
    ///
    /// Every map entry becomes exactly one item, so the inner hint is
    /// also correct here. Without this override the default `(0, None)`
    /// would be reported and consumers such as `collect` could not
    /// reserve space up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
impl<'a> IntoIterator for &'a AttributeMap {
    type Item = Attribute<'a>;
    type IntoIter = AttrIter<'a>;

    /// Returns an iterator that borrows the attributes of the map,
    /// backed by the borrowing iterator of the inner map.
    fn into_iter(self) -> Self::IntoIter {
        let entries = self.inner.iter();
        AttrIter(entries)
    }
}
/// A borrowed iterator over the attributes of an [`AttributeMap`].
///
/// Created by calling `into_iter` on a reference to an [`AttributeMap`].
pub struct AttrIter<'a>(btree_map::Iter<'a, String, AttrInternal>);

impl<'a> Iterator for AttrIter<'a> {
    type Item = Attribute<'a>;

    /// Borrows the next entry of the underlying map iterator and wraps
    /// it in an [`Attribute`].
    fn next(&mut self) -> Option<Self::Item> {
        let (name, map_val) = self.0.next()?;
        Some(Attribute { name, map_val })
    }

    /// Forwards the size hint of the underlying map iterator.
    ///
    /// Every map entry becomes exactly one item, so the inner hint is
    /// also correct here. Without this override the default `(0, None)`
    /// would be reported and consumers such as `collect` could not
    /// reserve space up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
impl FromIterator<(String, String)> for AttributeMap {
    /// Builds an [`AttributeMap`] from `(name, value)` pairs.
    ///
    /// Attributes created this way never carry a trace index.
    fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
        // NOTE(review): when a name occurs more than once, which value
        // survives is decided by `BTreeMap`'s own `FromIterator` — confirm
        // whether that agrees with the first-one-wins rule documented for
        // `StartTag::attributes`.
        let untraced = |(name, value): (String, String)| {
            let entry = AttrInternal {
                value,
                trace_idx: None,
            };
            (name, entry)
        };
        let inner = iter.into_iter().map(untraced).collect();
        Self { inner }
    }
}
impl Debug for AttributeMap {
    /// Formats the map exactly like its inner `BTreeMap`, without any
    /// wrapping struct name.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        Debug::fmt(&self.inner, f)
    }
}
impl Debug for AttrInternal {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self.value)?;
if let Some(idx) = self.trace_idx {
write!(f, " (trace #{})", idx.0)?;
}
Ok(())
}
}
impl Debug for Attribute<'_> {
    /// Formats the attribute as a struct named `Attribute` with the fields
    /// `name`, `value` and `trace_idx`, the latter shown as the bare
    /// number rather than wrapped in `AttributeTraceIdx`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let value = self.value();
        let trace_idx = self.trace_idx().map(|idx| idx.0);

        let mut out = f.debug_struct("Attribute");
        out.field("name", &self.name);
        out.field("value", &value);
        out.field("trace_idx", &trace_idx);
        out.finish()
    }
}