1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
pub(crate) mod machine;
use crate::offset::{Offset, Position};
use crate::reader::{IntoReader, Reader};
use crate::Emitter;
use machine::ControlToken;
#[cfg(feature = "integration-tests")]
pub use machine::State as InternalState;
/// An HTML tokenizer.
///
/// # Warning
///
/// Iterating over the tokenizer directly without calling [`Tokenizer::set_state`]
/// results in wrong state transitions. In the following example the `<b>` after
/// `<script>` is yielded as a start tag because nothing ever changed the state:
///
/// ```
/// # use html5tokenizer::{BasicEmitter, Event, Tokenizer, Token};
/// let emitter = BasicEmitter::default();
/// let html = "<script><b>";
/// let mut tokens = Tokenizer::new(html, emitter).flatten();
/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
/// assert!(matches!(tokens.next(), Some(Event::Token(Token::StartTag(_)))));
/// ```
///
/// Instead use the [`NaiveParser`] (in the future this crate will also provide
/// a proper implementation of [tree construction]).
///
/// [`NaiveParser`]: crate::NaiveParser
/// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
pub struct Tokenizer<R, O, E> {
/// The underlying state machine; it owns the emitter and the current state.
machine: machine::Machine<R, O, E>,
/// Set once the state machine has reported end of input. From then on the
/// iterator only drains the emitter and afterwards yields `None`.
eof: bool,
}
impl<R: Reader + Position<O>, O: Offset, E: Emitter<O>> Tokenizer<R, O, E> {
    /// Builds a tokenizer that reads from `reader` and reports to `emitter`.
    ///
    /// The tokenizer on its own does not parse HTML correctly: you additionally
    /// have to implement [tree construction] and call [`Tokenizer::set_state`]
    /// whenever it requires a state switch.
    ///
    /// [tree construction]: https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
    pub fn new<'a>(reader: impl IntoReader<'a, Reader = R>, emitter: E) -> Self {
        let state_machine: machine::Machine<R, O, E> =
            machine::Machine::new(reader.into_reader(), emitter);

        Self {
            machine: state_machine,
            eof: false,
        }
    }

    /// Must be called whenever the iterator yields [`Event::CdataOpen`].
    ///
    /// To parse in a spec-compliant way pass `true` if there is an
    /// _adjusted current node_ and it is not an element in the HTML namespace,
    /// and `false` in every other case (this is the third condition listed
    /// under [Markup declaration open state]).
    ///
    /// [Markup declaration open state]: https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
    pub fn handle_cdata_open(
        &mut self,
        adjusted_current_node_present_and_not_in_html_namespace: bool,
    ) {
        let state_machine = &mut self.machine;
        machine::handle_cdata_open(
            state_machine,
            adjusted_current_node_present_and_not_in_html_namespace,
        );
    }

    /// Gives mutable access to the emitter owned by the tokenizer.
    pub fn emitter_mut(&mut self) -> &mut E {
        let state_machine = &mut self.machine;
        &mut state_machine.emitter
    }
}
/// An event yielded by the [`Iterator`] implementation for the [`Tokenizer`].
///
/// When produced by that iterator, `T` is the item type of the emitter.
#[derive(Clone, Debug)]
pub enum Event<T> {
/// A token emitted by the [`Emitter`].
Token(T),
/// The state machine encountered `<![CDATA[`. You must call
/// [`Tokenizer::handle_cdata_open`] before advancing the tokenizer iterator again.
CdataOpen,
}
/// The states you can set the tokenizer to.
///
/// These are the states that can be passed to [`Tokenizer::set_state`]; each
/// variant is converted to the state machine's internal state of the same name.
///
/// The type is a plain tag without payload, so it is cheap to copy and can be
/// compared for equality.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum State {
    /// The [data state].
    ///
    /// [data state]: https://html.spec.whatwg.org/multipage/parsing.html#data-state
    Data,
    /// The [PLAINTEXT state].
    ///
    /// [PLAINTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
    Plaintext,
    /// The [RCDATA state].
    ///
    /// [RCDATA state]: https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
    Rcdata,
    /// The [RAWTEXT state].
    ///
    /// [RAWTEXT state]: https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
    Rawtext,
    /// The [script data state].
    ///
    /// [script data state]: https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
    ScriptData,
}
/// Maps each public [`State`] onto the internal state machine state of the
/// same name.
impl From<State> for machine::State {
    fn from(state: State) -> Self {
        match state {
            State::Data => Self::Data,
            State::Plaintext => Self::Plaintext,
            State::Rcdata => Self::Rcdata,
            State::Rawtext => Self::Rawtext,
            State::ScriptData => Self::ScriptData,
        }
    }
}
impl<R, O, E> Tokenizer<R, O, E> {
/// Test-internal function to override internal state.
///
/// Only available with the `integration-tests` feature which is not public API.
#[cfg(feature = "integration-tests")]
pub fn set_internal_state(&mut self, state: InternalState) {
self.machine.state = state;
}
/// Set the statemachine to start/continue in the given state.
pub fn set_state(&mut self, state: State) {
self.machine.state = state.into();
}
}
impl<O, R, E> Iterator for Tokenizer<R, O, E>
where
    O: Offset,
    R: Reader + Position<O>,
    E: Emitter<O> + Iterator,
{
    type Item = Result<Event<E::Item>, R::Error>;

    /// Yields the next event.
    ///
    /// Tokens already queued in the emitter are handed out first. Only when
    /// the emitter has nothing left is the state machine driven further, until
    /// it either makes the emitter produce a token, fails, encounters
    /// `<![CDATA[` or reaches the end of the input.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.machine.emitter.next() {
                Some(token) => return Some(Ok(Event::Token(token))),
                // The emitter is drained and the input is exhausted: we are done.
                None if self.eof => return None,
                None => {}
            }

            let control = match machine::consume(&mut self.machine) {
                Ok(control) => control,
                Err(error) => return Some(Err(error)),
            };

            match control {
                ControlToken::Continue => {}
                ControlToken::CdataOpen => return Some(Ok(Event::CdataOpen)),
                ControlToken::Eof => {
                    // Remember the end of input, then loop once more so that
                    // whatever the emitter queues for EOF is still yielded.
                    self.eof = true;
                    let position = self.machine.reader_position();
                    self.machine.emitter.emit_eof(position);
                }
            }
        }
    }
}
impl<R, O, E> Tokenizer<R, O, E> {
/// Sets the state machine's `naively_switch_state` flag.
///
// NOTE(review): presumably this makes the state machine switch states on its
// own instead of relying on `set_state` calls; confirm in the `machine` module.
pub(crate) fn enable_naive_state_switching(&mut self) {
self.machine.naively_switch_state = true;
}
/// Test-internal function to override the last start tag name remembered by
/// the state machine: the stored name is replaced with `last_start_tag`.
///
/// Only available with the `integration-tests` feature which is not public API.
#[cfg(feature = "integration-tests")]
pub fn set_last_start_tag(&mut self, last_start_tag: &str) {
// Clear and refill the existing buffer instead of assigning a new value.
self.machine.last_start_tag_name.clear();
self.machine.last_start_tag_name.push_str(last_start_tag);
}
}