1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
use std::convert::Infallible;
use crate::char_validator::CharValidator;
use crate::machine_helper::{ControlToken, MachineHelper};
use crate::read_helper::ReadHelper;
use crate::State;
use crate::{DefaultEmitter, Emitter, Readable, Reader};
/// A HTML tokenizer. See crate-level docs for basic usage.
#[derive(Debug)]
pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter<()>> {
    // Set once the state machine yields `ControlToken::Eof`; after that, `next()` only
    // drains remaining tokens from the emitter and then returns `None`.
    eof: bool,
    // Validates incoming bytes/characters; flushed for a final error check at EOF.
    pub(crate) validator: CharValidator,
    // Token sink. Also tracks the current input position via `move_position`.
    pub(crate) emitter: E,
    // Buffered access to the underlying reader, with single-byte unread support.
    pub(crate) reader: ReadHelper<R>,
    // Holds the current state-machine state (`state.function` is invoked per step).
    pub(crate) machine_helper: MachineHelper<R, E>,
}
impl<R: Reader> Tokenizer<R, DefaultEmitter<()>> {
    /// Create a new tokenizer from some input.
    ///
    /// `input` can be `&String` or `&str` at the moment, as those are the types for which
    /// [`crate::Readable`] is implemented, but you can implement that trait on your own types.
    ///
    /// Patches are welcome for providing an efficient implementation over async streams,
    /// iterators, files, etc, as long as any dependencies come behind featureflags.
    pub fn new<'a, S: Readable<'a, Reader = R>>(input: S) -> Self {
        // Delegate to the general constructor; the emitter type is fixed by `Self`,
        // so plain `default()` resolves to `DefaultEmitter::<()>::default()`.
        Self::new_with_emitter(input, DefaultEmitter::default())
    }
}
impl<R: Reader, E: Emitter> Tokenizer<R, E> {
    /// Construct a new tokenizer from some input and a custom emitter.
    ///
    /// Use this method over [`Tokenizer::new`] when you want to have more control over string
    /// allocation for tokens.
    pub fn new_with_emitter<'a, S: Readable<'a, Reader = R>>(input: S, emitter: E) -> Self {
        Tokenizer {
            eof: false,
            validator: CharValidator::default(),
            emitter,
            reader: ReadHelper::new(input.to_reader()),
            machine_helper: MachineHelper::default(),
        }
    }

    /// Override internal state. Necessary for parsing partial documents ("fragment parsing").
    pub fn set_state(&mut self, state: State) {
        self.machine_helper.state = state.into();
    }

    /// Test-internal function to override internal state.
    #[cfg(debug_assertions)]
    #[doc(hidden)]
    pub fn set_last_start_tag(&mut self, last_start_tag: Option<&str>) {
        self.emitter
            .set_last_start_tag(last_start_tag.map(str::as_bytes));
    }

    /// Try to consume the literal string `s` from the input.
    ///
    /// Returns `Ok(true)` and advances the emitter's position by `s.len()` on a match;
    /// returns `Ok(false)` without consuming anything on a miss. Reader errors are
    /// propagated unchanged.
    #[inline(always)]
    pub(crate) fn try_read_string(
        &mut self,
        s: &str,
        case_sensitive: bool,
    ) -> Result<bool, R::Error> {
        // `?` propagation replaces a manual match whose `Ok(false)` and `Err(e)` arms
        // only re-wrapped the value they destructured.
        let matched = self
            .reader
            .try_read_string(&mut self.validator, s, case_sensitive)?;
        if matched {
            self.emitter.move_position(s.len() as isize);
        }
        Ok(matched)
    }

    /// Read a single byte, running it through the validator.
    ///
    /// Returns `Ok(None)` at end of input. On a successful read the emitter's position
    /// is advanced by one.
    #[inline(always)]
    pub(crate) fn read_byte(&mut self) -> Result<Option<u8>, R::Error> {
        let res = self
            .reader
            .read_byte(&mut self.validator, &mut self.emitter)?;
        crate::utils::trace_log!("read_byte = {:?}", res.map(|x| x as char));
        if res.is_some() {
            self.emitter.move_position(1);
        }
        Ok(res)
    }

    /// Push a byte back onto the input; the inverse of [`Tokenizer::read_byte`].
    ///
    /// `None` is accepted (and forwarded) so callers can unconditionally unread whatever
    /// `read_byte` returned; the position only moves back for an actual byte.
    #[inline(always)]
    pub(crate) fn unread_byte(&mut self, c: Option<u8>) {
        if c.is_some() {
            self.emitter.move_position(-1);
        }
        self.reader.unread_byte(c);
    }

    /// Read a chunk of input up to (but not including) any byte in `needle`.
    ///
    /// Takes the reader, validator and emitter as separate arguments (rather than `&mut self`)
    /// so callers can hold disjoint borrows of the tokenizer's fields. `char_buf` is scratch
    /// space for decoding a single multi-byte character. Advances the emitter's position by
    /// the length of the returned slice, if any.
    #[inline(always)]
    pub(crate) fn read_until<'b>(
        reader: &'b mut ReadHelper<R>,
        needle: &[u8],
        char_validator: &mut CharValidator,
        emitter: &mut E,
        char_buf: &'b mut [u8; 4],
    ) -> Result<Option<&'b [u8]>, R::Error> {
        let res = reader.read_until(needle, char_validator, emitter, char_buf)?;
        crate::utils::trace_log!("read_until = {:?}", res.map(String::from_utf8_lossy));
        if let Some(res) = &res {
            emitter.move_position(res.len() as isize);
        }
        Ok(res)
    }
}
impl<R: Reader, E: Emitter<Token = Infallible>> Tokenizer<R, E> {
    /// Some emitters don't ever produce any tokens and instead have other side effects. In those
    /// cases, you will find yourself writing code like this to handle errors:
    ///
    /// ```
    /// use std::convert::Infallible;
    ///
    /// use html5gum::{Span, Tokenizer};
    /// use html5gum::emitters::callback::{CallbackEvent, CallbackEmitter};
    ///
    /// let emitter = CallbackEmitter::new(move |event: CallbackEvent<'_>, span: Span<()>| -> Option<Infallible> {
    ///     if let CallbackEvent::String { value } = event {
    ///         println!("{}", String::from_utf8_lossy(value));
    ///     }
    ///
    ///     // We may choose to return any Option<T> (such as errors, or our own tokens), but since
    ///     // we do all the real work in the callback itself, we choose to use Option<Infallible>.
    ///     None
    /// });
    ///
    /// let tokenizer = Tokenizer::new_with_emitter("hello <div><div><div> world!", emitter);
    ///
    /// // this is a bit silly
    /// // for _ in tokenizer {
    /// //     result.unwrap();
    /// // }
    ///
    /// // much better:
    /// tokenizer.finish();
    /// ```
    pub fn finish(mut self) -> Result<(), R::Error> {
        // Drive the iterator to completion, short-circuiting on the first reader error.
        // The `Ok` payload is `Infallible`, so mapping it away can never lose a token.
        self.try_for_each(|result| result.map(|_| ()))
    }
}
impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {
    type Item = Result<E::Token, R::Error>;

    /// Pump the state machine until the emitter has a token to hand out, an error
    /// occurs, or the input (and the emitter's queue) is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Drain any token the emitter already queued before doing more work.
            if let Some(token) = self.emitter.pop_token() {
                return Some(Ok(token));
            }

            // Past EOF with an empty queue: the stream is done.
            if self.eof {
                return None;
            }

            // Advance the state machine by one step.
            match (self.machine_helper.state.function)(self) {
                Ok(ControlToken::Continue) => {}
                Ok(ControlToken::SwitchTo(next_state)) => {
                    self.machine_helper.switch_to(next_state);
                }
                Ok(ControlToken::Eof) => {
                    // Surface any pending validation error, then emit the final EOF
                    // token; subsequent iterations only drain the emitter.
                    self.validator.flush_character_error(&mut self.emitter);
                    self.eof = true;
                    self.emitter.emit_eof();
                }
                Err(e) => return Some(Err(e)),
            }
        }
    }
}