1mod element_info;
2
3use crate::element_info::{
4 is_foreign_context_elements, is_html_integration_elements, is_void_elements, open_implies_close,
5};
6use lazy_static::lazy_static;
7use regex::Regex;
8use rs_html_parser_tokenizer::{Tokenizer, TokenizerOptions};
9use rs_html_parser_tokenizer_tokens::{QuoteType, TokenizerToken, TokenizerTokenLocation};
10use rs_html_parser_tokens::{Token, TokenKind};
11use std::borrow::Cow;
12use std::collections::VecDeque;
13use std::mem::take;
14use std::str;
15use unicase_collections::unicase_btree_map::UniCaseBTreeMap;
16
/// Configuration for [`Parser`].
pub struct ParserOptions {
    /// Parse as XML: disables HTML-specific behavior such as void elements,
    /// implied close tags, and foreign-content (SVG/MathML) handling.
    pub xml_mode: bool,

    /// Options forwarded verbatim to the underlying tokenizer.
    pub tokenizer_options: TokenizerOptions,
}
30
31lazy_static! {
32 static ref RE_NAME_END: Regex = Regex::new(r"/\s|\//").unwrap();
33}
34
/// Streaming HTML/XML parser: wraps the tokenizer and yields [`Token`]s via
/// the `Iterator` impl, inserting implied open/close tags where HTML requires.
pub struct Parser<'a> {
    /// False in XML mode; gates all HTML-specific handling.
    html_mode: bool,

    /// Raw bytes of the input string; tokenizer start/end offsets index here.
    buffer: &'a [u8],

    tokenizer: Tokenizer<'a>,
    /// Name of the open tag currently being assembled.
    tag_name: &'a str,
    /// Tokens produced but not yet handed out by the iterator.
    next_nodes: VecDeque<Token>,
    /// Open-element stack; the innermost open element is at the front.
    stack: VecDeque<Box<str>>,
    /// Front entry tells whether we are currently in foreign content
    /// (XML mode or inside SVG/MathML), where self-closing tags are honored.
    foreign_context: VecDeque<bool>,
    /// Attributes collected for the current open tag (case-insensitive keys).
    attribs: UniCaseBTreeMap<Option<(Box<str>, QuoteType)>>,
    /// Value of the attribute currently being assembled, if any.
    attrib_value: Option<String>,
    /// Name of the attribute currently being assembled.
    attrib_name: &'a str,
}
49
50fn get_instruction_name(value: &str) -> Cow<str> {
51 if let Some(index) = RE_NAME_END.find(value) {
53 let name = value[..index.start()].to_string();
55
56 return Cow::Owned(name);
57 }
58
59 Cow::Borrowed(value)
60}
61
62impl<'i> Parser<'i> {
63 pub fn new<'a>(html: &'a str, options: &'a ParserOptions) -> Parser<'a> {
64 let bytes = html.as_bytes();
65
66 Parser {
67 buffer: bytes,
68 html_mode: !options.xml_mode,
69 tokenizer: Tokenizer::new(&bytes, &options.tokenizer_options),
70 tag_name: "".into(),
71 next_nodes: Default::default(),
72 stack: Default::default(),
73 foreign_context: VecDeque::from([options.xml_mode]),
74 attribs: Default::default(),
75 attrib_value: None,
76 attrib_name: Default::default(),
77 }
78 }
79
80 unsafe fn on_text(&mut self, tokenizer_token: TokenizerToken) {
81 self.next_nodes.push_back(Token {
82 data: String::from_utf8_unchecked(
83 self.buffer[tokenizer_token.start..tokenizer_token.end].to_owned(),
84 ).into_boxed_str(),
85 attrs: None,
86 kind: TokenKind::Text,
87 is_implied: false,
88 });
89 }
90
91 fn on_text_entity(&mut self, tokenizer_token: TokenizerToken) {
92 let data_string = char::from_u32(tokenizer_token.code).unwrap();
93
94 self.next_nodes.push_back(Token {
95 data: data_string.to_string().into_boxed_str(),
96 attrs: None,
97 kind: TokenKind::Text,
98 is_implied: false,
99 });
100 }
101
102 fn is_void_element(&self, name: &str) -> bool {
103 self.html_mode && is_void_elements(name)
104 }
105
106 unsafe fn on_open_tag_name(&mut self, tokenizer_token: TokenizerToken) {
107 let name = str::from_utf8_unchecked(
108 &self.buffer[tokenizer_token.start..tokenizer_token.end],
109 );
110
111 self.emit_open_tag(name);
112 }
113
114 fn emit_open_tag(&mut self, name: &'i str) {
115 self.tag_name = name;
116
117 let open_implies_close_option: Option<fn(tag_name: &str) -> bool> =
118 open_implies_close(&self.tag_name);
119
120 if let Some(open_implies_close_fn) = open_implies_close_option {
121 while !self.stack.is_empty() && open_implies_close_fn(&self.stack[0]) {
122 let element = self.stack.pop_front().unwrap();
123
124 self.next_nodes.push_back(Token {
125 data: element,
126 attrs: None,
127 kind: TokenKind::CloseTag,
128 is_implied: true,
129 });
130 }
131 }
132 if !self.is_void_element(&self.tag_name) {
133 self.stack.push_front(self.tag_name.to_string().into_boxed_str());
134
135 if self.html_mode {
136 if is_foreign_context_elements(&self.tag_name) {
137 self.foreign_context.push_front(true);
138 } else if is_html_integration_elements(&self.tag_name) {
139 self.foreign_context.push_front(false);
140 }
141 }
142 }
143 }
144
145 fn end_open_tag(&mut self, is_implied: bool) {
146 let is_void = self.is_void_element(&self.tag_name);
147
148 let close_node_option = if is_void {
149 Some(Token {
150 data: self.tag_name.to_string().into_boxed_str(),
151 attrs: None,
152 kind: TokenKind::CloseTag,
153 is_implied: true,
154 })
155 } else {
156 None
157 };
158
159 self.next_nodes.push_back(Token {
160 data: self.tag_name.to_string().into_boxed_str(),
161 attrs: if self.attribs.is_empty() {
162 None
163 } else {
164 Some(take(&mut self.attribs))
165 },
166 kind: TokenKind::OpenTag,
167 is_implied,
168 });
169
170 if let Some(close_node) = close_node_option {
171 self.next_nodes.push_back(close_node);
172 }
173 }
174
175 fn on_open_tag_end(&mut self) {
176 self.end_open_tag(false);
177 }
178
179 unsafe fn on_close_tag(&mut self, tokenizer_token: TokenizerToken) {
180 let name: &str =
181 str::from_utf8_unchecked(&self.buffer[tokenizer_token.start..tokenizer_token.end]);
182
183 if is_foreign_context_elements(name) || is_html_integration_elements(name) {
184 self.foreign_context.pop_front();
185 }
186
187 if !self.is_void_element(name) {
188 let pos = self.stack.iter().position(|n| &**n == name);
189 if let Some(index) = pos {
190 for i in 0..index + 1 {
191 let tag = self.stack.pop_front().unwrap();
192 self.next_nodes.push_back(Token {
193 data: tag,
194 attrs: None,
195 kind: TokenKind::CloseTag,
196 is_implied: i != index,
197 });
198 }
199 } else if self.html_mode && name == "p" {
200 self.emit_open_tag("p");
202 self.close_current_tag(true);
203 }
204 } else if self.html_mode && name == "br" {
205 self.next_nodes.push_back(Token {
207 data: "br".to_string().into_boxed_str(),
208 attrs: None,
209 kind: TokenKind::OpenTag,
210 is_implied: false,
211 });
212 self.next_nodes.push_back(Token {
213 data: "br".to_string().into_boxed_str(),
214 attrs: None,
215 kind: TokenKind::CloseTag,
216 is_implied: false,
217 });
218 }
219 }
220
221 fn on_self_closing_tag(&mut self) {
222 if self.foreign_context[0] {
223 self.close_current_tag(false);
224 } else {
225 self.on_open_tag_end();
227 }
228 }
229
230 fn close_current_tag(&mut self, is_open_implied: bool) {
231 self.end_open_tag(is_open_implied);
232
233 if &*self.stack[0] == self.tag_name {
235 self.next_nodes.push_back(Token {
237 data: self.tag_name.to_string().into_boxed_str(),
238 attrs: None,
239 kind: TokenKind::CloseTag,
240 is_implied: !is_open_implied,
241 });
242 self.stack.pop_front();
243 }
244 }
245
246 unsafe fn on_attrib_name(&mut self, tokenizer_token: TokenizerToken) {
247 let name: &str =
248 str::from_utf8_unchecked(&self.buffer[tokenizer_token.start..tokenizer_token.end]);
249
250 self.attrib_name = name;
251 }
252
253 unsafe fn on_attrib_data(&mut self, tokenizer_token: TokenizerToken) {
254 let new_attrib = match self.attrib_value.take() {
255 None => Some(String::from_utf8_unchecked(
256 self.buffer[tokenizer_token.start..tokenizer_token.end].to_owned(),
257 )),
258 Some(existing_value) => {
259 let mut modified_cow = existing_value;
260
261 modified_cow.push_str(str::from_utf8_unchecked(
262 &self.buffer[tokenizer_token.start..tokenizer_token.end],
263 ));
264
265 Some(modified_cow)
266 }
267 };
268
269 self.attrib_value = new_attrib;
270 }
271
272 fn on_attrib_entity(&mut self, tokenizer_token: TokenizerToken) {
273 let c = char::from_u32(tokenizer_token.code).unwrap();
274
275 let new_attrib = match self.attrib_value.take() {
276 None => Some(c.to_string()),
277 Some(existing_value) => {
278 let mut owned_value = existing_value;
279 owned_value.push(c);
280
281 Some(owned_value)
282 }
283 };
284
285 self.attrib_value = new_attrib;
286 }
287
288 fn on_attrib_end(&mut self, tokenizer_token: TokenizerToken) {
289 if !self.attribs.contains_key(self.attrib_name) {
290 let new_attribute: Option<(Box<str>, QuoteType)> = self
291 .attrib_value
292 .as_mut()
293 .map(|attrib_value| (attrib_value.clone().into_boxed_str(), tokenizer_token.quote));
294
295 self.attribs.insert(self.attrib_name.to_owned(), new_attribute);
296 }
297 self.attrib_value = None;
298 }
299
300 unsafe fn on_declaration(&mut self, tokenizer_token: TokenizerToken) {
301 let value: &str =
302 str::from_utf8_unchecked(&self.buffer[tokenizer_token.start..tokenizer_token.end]);
303 let name = get_instruction_name(&value);
304
305 self.next_nodes.push_back(Token {
306 data: name.to_string().into_boxed_str(),
307 attrs: None,
308 kind: TokenKind::ProcessingInstruction,
309 is_implied: false,
310 });
311 }
312
313 unsafe fn on_processing_instruction(&mut self, tokenizer_token: TokenizerToken) {
314 let value: &str =
315 str::from_utf8_unchecked(&self.buffer[tokenizer_token.start..tokenizer_token.end]);
316 let name = get_instruction_name(value);
317
318 self.next_nodes.push_back(Token {
319 data: name.to_string().into_boxed_str(),
320 attrs: None,
321 kind: TokenKind::ProcessingInstruction,
322 is_implied: false,
323 });
324 }
325
326 unsafe fn on_comment(&mut self, tokenizer_token: TokenizerToken) {
327 self.next_nodes.push_back(Token {
328 data: String::from_utf8_unchecked(
329 self.buffer[tokenizer_token.start..tokenizer_token.end].to_owned(),
330 ).into_boxed_str(),
331 attrs: None,
332 kind: TokenKind::Comment,
333 is_implied: false,
334 });
335 self.next_nodes.push_back(Token {
336 data: "".into(),
337 attrs: None,
338 kind: TokenKind::CommentEnd,
339 is_implied: false,
340 });
341 }
342
343 unsafe fn on_cdata(&mut self, tokenizer_token: TokenizerToken) {
344 self.on_comment(tokenizer_token);
345 }
346
347 fn onend(&mut self) {
348 let stack_iter = self.stack.iter();
350 for item in stack_iter {
351 self.next_nodes.push_back(Token {
352 data: item.to_owned(),
353 attrs: None,
354 kind: TokenKind::CloseTag,
355 is_implied: true,
356 })
357 }
358
359 self.stack.clear();
360 }
361 unsafe fn parser_next(&mut self) -> Option<Token> {
362 loop {
363 if let Some(existing_node) = self.next_nodes.pop_front() {
364 return Some(existing_node);
365 }
366
367 let possible_token = self.tokenizer.next();
368
369 match possible_token {
370 None => return None,
371 Some(tokenizer_token) => match tokenizer_token.location {
372 TokenizerTokenLocation::AttrData => self.on_attrib_data(tokenizer_token),
373 TokenizerTokenLocation::AttrEntity => self.on_attrib_entity(tokenizer_token),
374 TokenizerTokenLocation::AttrEnd => self.on_attrib_end(tokenizer_token),
375 TokenizerTokenLocation::AttrName => self.on_attrib_name(tokenizer_token),
376 TokenizerTokenLocation::CData => self.on_cdata(tokenizer_token),
377 TokenizerTokenLocation::CloseTag => self.on_close_tag(tokenizer_token),
378 TokenizerTokenLocation::Comment => self.on_comment(tokenizer_token),
379 TokenizerTokenLocation::Declaration => self.on_declaration(tokenizer_token),
380 TokenizerTokenLocation::OpenTagEnd => self.on_open_tag_end(),
381 TokenizerTokenLocation::OpenTagName => self.on_open_tag_name(tokenizer_token),
382 TokenizerTokenLocation::ProcessingInstruction => {
383 self.on_processing_instruction(tokenizer_token)
384 }
385 TokenizerTokenLocation::SelfClosingTag => self.on_self_closing_tag(),
386 TokenizerTokenLocation::Text => self.on_text(tokenizer_token),
387 TokenizerTokenLocation::TextEntity => self.on_text_entity(tokenizer_token),
388 TokenizerTokenLocation::End => self.onend(),
389 },
390 }
391 }
392 }
393}
394
395impl <'a> Iterator for Parser<'a> {
396 type Item = Token;
397
398 fn next(&mut self) -> Option<Token> {
399 unsafe { self.parser_next() }
400 }
401}