rs_html_parser/
lib.rs

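//! Iterator-based HTML/XML parser built on top of `rs_html_parser_tokenizer`.
//!
//! [`Parser`] consumes tokenizer tokens and turns them into a flat stream of
//! [`Token`]s (open/close tags, text, comments, processing instructions),
//! inserting implied open and close tags where HTML requires them.
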
mod element_info;

use crate::element_info::{
    is_foreign_context_elements, is_html_integration_elements, is_void_elements, open_implies_close,
};
use lazy_static::lazy_static;
use regex::Regex;
use rs_html_parser_tokenizer::{Tokenizer, TokenizerOptions};
use rs_html_parser_tokenizer_tokens::{QuoteType, TokenizerToken, TokenizerTokenLocation};
use rs_html_parser_tokens::{Token, TokenKind};
use std::borrow::Cow;
use std::collections::VecDeque;
use std::mem::take;
use std::str;
use unicase_collections::unicase_btree_map::UniCaseBTreeMap;

pub struct ParserOptions {
    /**
     * Indicates whether special tags (`<script>`, `<style>`, `<textarea>` and `<title>`) should get special treatment
     * and if "empty" tags (e.g. `<br>`) can have children. If `false`, the content of special tags
     * will be text only. For feeds and other XML content (documents that don't consist of HTML),
     * set this to `true`.
     *
     * @default false
     */
    pub xml_mode: bool,

    pub tokenizer_options: TokenizerOptions,
}

lazy_static! {
    // Matches the first whitespace or `/` that ends a declaration or
    // processing-instruction name (e.g. the space in `!DOCTYPE html`).
    static ref RE_NAME_END: Regex = Regex::new(r"[\s/]").unwrap();
}

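/// Streaming HTML/XML parser that wraps [`Tokenizer`] and yields [`Token`]s
/// through its [`Iterator`] implementation.
///
/// A minimal usage sketch (marked `ignore` because the exact shape of
/// `TokenizerOptions` is defined in the tokenizer crate; the
/// `Default::default()` below is an assumption made for illustration only):
///
/// ```ignore
/// let options = ParserOptions {
///     xml_mode: false,
///     tokenizer_options: Default::default(), // assumption: a Default impl exists
/// };
///
/// for token in Parser::new("<p>Hello<br></p>", &options) {
///     // assumption: Token derives Debug
///     println!("{:?}", token);
/// }
/// ```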
pub struct Parser<'a> {
    html_mode: bool,

    buffer: &'a [u8],

    tokenizer: Tokenizer<'a>,
    tag_name: &'a str,
    next_nodes: VecDeque<Token>,
    stack: VecDeque<Box<str>>,
    foreign_context: VecDeque<bool>,
    attribs: UniCaseBTreeMap<Option<(Box<str>, QuoteType)>>,
    attrib_value: Option<String>,
    attrib_name: &'a str,
}

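/// Returns the name portion of a declaration or processing instruction:
/// everything up to the first whitespace or `/`, or the whole string if
/// neither occurs.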
fn get_instruction_name(value: &str) -> Cow<str> {
    // Use the regex search method to find the index
    if let Some(index) = RE_NAME_END.find(value) {
        // Borrow the substring up to the match index
        return Cow::Borrowed(&value[..index.start()]);
    }

    Cow::Borrowed(value)
}

impl<'i> Parser<'i> {
    pub fn new<'a>(html: &'a str, options: &'a ParserOptions) -> Parser<'a> {
        let bytes = html.as_bytes();

        Parser {
            buffer: bytes,
            html_mode: !options.xml_mode,
            tokenizer: Tokenizer::new(&bytes, &options.tokenizer_options),
            tag_name: "",
            next_nodes: Default::default(),
            stack: Default::default(),
            foreign_context: VecDeque::from([options.xml_mode]),
            attribs: Default::default(),
            attrib_value: None,
            attrib_name: Default::default(),
        }
    }

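    /// Queues a text token for the given byte range of the input.
    ///
    /// Safety: the buffer comes from the `&str` passed to [`Parser::new`] and
    /// the tokenizer is assumed to emit ranges on UTF-8 boundaries, so decoding
    /// without validation is taken to be sound here.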
    unsafe fn on_text(&mut self, tokenizer_token: TokenizerToken) {
        self.next_nodes.push_back(Token {
            data: String::from_utf8_unchecked(
                self.buffer[tokenizer_token.start..tokenizer_token.end].to_owned(),
            ).into_boxed_str(),
            attrs: None,
            kind: TokenKind::Text,
            is_implied: false,
        });
    }

    fn on_text_entity(&mut self, tokenizer_token: TokenizerToken) {
        let data_string = char::from_u32(tokenizer_token.code).unwrap();

        self.next_nodes.push_back(Token {
            data: data_string.to_string().into_boxed_str(),
            attrs: None,
            kind: TokenKind::Text,
            is_implied: false,
        });
    }

    fn is_void_element(&self, name: &str) -> bool {
        self.html_mode && is_void_elements(name)
    }

    unsafe fn on_open_tag_name(&mut self, tokenizer_token: TokenizerToken) {
        let name = str::from_utf8_unchecked(
            &self.buffer[tokenizer_token.start..tokenizer_token.end],
        );

        self.emit_open_tag(name);
    }

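    /// Registers `name` as the current open tag. Elements that the new tag
    /// implicitly closes (per `open_implies_close` in `element_info`) are
    /// popped off the stack first and emitted as implied close tags; non-void
    /// tags are then pushed onto the stack, and the foreign-context state is
    /// updated for elements that switch in or out of HTML parsing rules.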
    fn emit_open_tag(&mut self, name: &'i str) {
        self.tag_name = name;

        let open_implies_close_option: Option<fn(tag_name: &str) -> bool> =
            open_implies_close(&self.tag_name);

        if let Some(open_implies_close_fn) = open_implies_close_option {
            while !self.stack.is_empty() && open_implies_close_fn(&self.stack[0]) {
                let element = self.stack.pop_front().unwrap();

                self.next_nodes.push_back(Token {
                    data: element,
                    attrs: None,
                    kind: TokenKind::CloseTag,
                    is_implied: true,
                });
            }
        }
        if !self.is_void_element(&self.tag_name) {
            self.stack.push_front(self.tag_name.to_string().into_boxed_str());

            if self.html_mode {
                if is_foreign_context_elements(&self.tag_name) {
                    self.foreign_context.push_front(true);
                } else if is_html_integration_elements(&self.tag_name) {
                    self.foreign_context.push_front(false);
                }
            }
        }
    }

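    /// Flushes the current open tag: emits an `OpenTag` token carrying any
    /// collected attributes, and for void elements immediately follows it with
    /// an implied `CloseTag`.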
    fn end_open_tag(&mut self, is_implied: bool) {
        let is_void = self.is_void_element(&self.tag_name);

        let close_node_option = if is_void {
            Some(Token {
                data: self.tag_name.to_string().into_boxed_str(),
                attrs: None,
                kind: TokenKind::CloseTag,
                is_implied: true,
            })
        } else {
            None
        };

        self.next_nodes.push_back(Token {
            data: self.tag_name.to_string().into_boxed_str(),
            attrs: if self.attribs.is_empty() {
                None
            } else {
                Some(take(&mut self.attribs))
            },
            kind: TokenKind::OpenTag,
            is_implied,
        });

        if let Some(close_node) = close_node_option {
            self.next_nodes.push_back(close_node);
        }
    }

    fn on_open_tag_end(&mut self) {
        self.end_open_tag(false);
    }

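    /// Handles an explicit closing tag. The stack is unwound down to the
    /// matching open tag, emitting implied close tokens for any elements that
    /// were left open in between. In HTML mode, stray `</p>` and `</br>` get
    /// special handling to mirror browser behaviour.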
    unsafe fn on_close_tag(&mut self, tokenizer_token: TokenizerToken) {
        let name: &str =
            str::from_utf8_unchecked(&self.buffer[tokenizer_token.start..tokenizer_token.end]);

        if is_foreign_context_elements(name) || is_html_integration_elements(name) {
            self.foreign_context.pop_front();
        }

        if !self.is_void_element(name) {
            let pos = self.stack.iter().position(|n| &**n == name);
            if let Some(index) = pos {
                for i in 0..=index {
                    let tag = self.stack.pop_front().unwrap();
                    self.next_nodes.push_back(Token {
                        data: tag,
                        attrs: None,
                        kind: TokenKind::CloseTag,
                        is_implied: i != index,
                    });
                }
            } else if self.html_mode && name == "p" {
                // Implicit open before close
                self.emit_open_tag("p");
                self.close_current_tag(true);
            }
        } else if self.html_mode && name == "br" {
            // We can't use `emit_open_tag` for implicit open, as `br` would be implicitly closed.
            self.next_nodes.push_back(Token {
                data: "br".to_string().into_boxed_str(),
                attrs: None,
                kind: TokenKind::OpenTag,
                is_implied: false,
            });
            self.next_nodes.push_back(Token {
                data: "br".to_string().into_boxed_str(),
                attrs: None,
                kind: TokenKind::CloseTag,
                is_implied: false,
            });
        }
    }

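    /// A self-closing tag (`<tag/>`) only actually closes the element in XML
    /// mode or inside a foreign context; in plain HTML content the trailing
    /// slash is ignored and the tag is treated as a normal open tag.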
    fn on_self_closing_tag(&mut self) {
        if self.foreign_context[0] {
            self.close_current_tag(false);
        } else {
            // Ignore the fact that the tag is self-closing.
            self.on_open_tag_end();
        }
    }

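    /// Emits the open token for the current tag and, if it is still on top of
    /// the stack, immediately emits the matching close token as well.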
    fn close_current_tag(&mut self, is_open_implied: bool) {
        self.end_open_tag(is_open_implied);

        // Self-closing tags will be on the top of the stack
        if &*self.stack[0] == self.tag_name {
            // If the opening tag isn't implied, the closing tag has to be implied.
            self.next_nodes.push_back(Token {
                data: self.tag_name.to_string().into_boxed_str(),
                attrs: None,
                kind: TokenKind::CloseTag,
                is_implied: !is_open_implied,
            });
            self.stack.pop_front();
        }
    }

    unsafe fn on_attrib_name(&mut self, tokenizer_token: TokenizerToken) {
        let name: &str =
            str::from_utf8_unchecked(&self.buffer[tokenizer_token.start..tokenizer_token.end]);

        self.attrib_name = name;
    }

    unsafe fn on_attrib_data(&mut self, tokenizer_token: TokenizerToken) {
        let new_attrib = match self.attrib_value.take() {
            None => Some(String::from_utf8_unchecked(
                self.buffer[tokenizer_token.start..tokenizer_token.end].to_owned(),
            )),
            Some(existing_value) => {
                let mut modified_value = existing_value;

                modified_value.push_str(str::from_utf8_unchecked(
                    &self.buffer[tokenizer_token.start..tokenizer_token.end],
                ));

                Some(modified_value)
            }
        };

        self.attrib_value = new_attrib;
    }

    fn on_attrib_entity(&mut self, tokenizer_token: TokenizerToken) {
        let c = char::from_u32(tokenizer_token.code).unwrap();

        let new_attrib = match self.attrib_value.take() {
            None => Some(c.to_string()),
            Some(existing_value) => {
                let mut owned_value = existing_value;
                owned_value.push(c);

                Some(owned_value)
            }
        };

        self.attrib_value = new_attrib;
    }

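    /// Finalises the attribute currently being collected: the buffered value
    /// (if any) is stored under the attribute name together with its quote
    /// type. Later duplicates of the same name are ignored; the first
    /// occurrence of an attribute wins.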
    fn on_attrib_end(&mut self, tokenizer_token: TokenizerToken) {
        if !self.attribs.contains_key(self.attrib_name) {
            let new_attribute: Option<(Box<str>, QuoteType)> = self
                .attrib_value
                .as_mut()
                .map(|attrib_value| (attrib_value.clone().into_boxed_str(), tokenizer_token.quote));

            self.attribs.insert(self.attrib_name.to_owned(), new_attribute);
        }
        self.attrib_value = None;
    }

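    /// Declarations (`<!...>`) are surfaced as `ProcessingInstruction` tokens;
    /// the token data is the instruction name as extracted by
    /// [`get_instruction_name`].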
    unsafe fn on_declaration(&mut self, tokenizer_token: TokenizerToken) {
        let value: &str =
            str::from_utf8_unchecked(&self.buffer[tokenizer_token.start..tokenizer_token.end]);
        let name = get_instruction_name(value);

        self.next_nodes.push_back(Token {
            data: name.to_string().into_boxed_str(),
            attrs: None,
            kind: TokenKind::ProcessingInstruction,
            is_implied: false,
        });
    }

    unsafe fn on_processing_instruction(&mut self, tokenizer_token: TokenizerToken) {
        let value: &str =
            str::from_utf8_unchecked(&self.buffer[tokenizer_token.start..tokenizer_token.end]);
        let name = get_instruction_name(value);

        self.next_nodes.push_back(Token {
            data: name.to_string().into_boxed_str(),
            attrs: None,
            kind: TokenKind::ProcessingInstruction,
            is_implied: false,
        });
    }

    unsafe fn on_comment(&mut self, tokenizer_token: TokenizerToken) {
        self.next_nodes.push_back(Token {
            data: String::from_utf8_unchecked(
                self.buffer[tokenizer_token.start..tokenizer_token.end].to_owned(),
            ).into_boxed_str(),
            attrs: None,
            kind: TokenKind::Comment,
            is_implied: false,
        });
        self.next_nodes.push_back(Token {
            data: "".into(),
            attrs: None,
            kind: TokenKind::CommentEnd,
            is_implied: false,
        });
    }

    unsafe fn on_cdata(&mut self, tokenizer_token: TokenizerToken) {
        self.on_comment(tokenizer_token);
    }

    fn onend(&mut self) {
        // Emit implied close tags for any elements still open at end of input
        let stack_iter = self.stack.iter();
        for item in stack_iter {
            self.next_nodes.push_back(Token {
                data: item.to_owned(),
                attrs: None,
                kind: TokenKind::CloseTag,
                is_implied: true,
            })
        }

        self.stack.clear();
    }
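
    /// Core pull loop: queued tokens are drained first; otherwise tokenizer
    /// tokens are consumed and dispatched to the handlers above until at least
    /// one parser token is produced or the input is exhausted.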
    unsafe fn parser_next(&mut self) -> Option<Token> {
        loop {
            if let Some(existing_node) = self.next_nodes.pop_front() {
                return Some(existing_node);
            }

            let possible_token = self.tokenizer.next();

            match possible_token {
                None => return None,
                Some(tokenizer_token) => match tokenizer_token.location {
                    TokenizerTokenLocation::AttrData => self.on_attrib_data(tokenizer_token),
                    TokenizerTokenLocation::AttrEntity => self.on_attrib_entity(tokenizer_token),
                    TokenizerTokenLocation::AttrEnd => self.on_attrib_end(tokenizer_token),
                    TokenizerTokenLocation::AttrName => self.on_attrib_name(tokenizer_token),
                    TokenizerTokenLocation::CData => self.on_cdata(tokenizer_token),
                    TokenizerTokenLocation::CloseTag => self.on_close_tag(tokenizer_token),
                    TokenizerTokenLocation::Comment => self.on_comment(tokenizer_token),
                    TokenizerTokenLocation::Declaration => self.on_declaration(tokenizer_token),
                    TokenizerTokenLocation::OpenTagEnd => self.on_open_tag_end(),
                    TokenizerTokenLocation::OpenTagName => self.on_open_tag_name(tokenizer_token),
                    TokenizerTokenLocation::ProcessingInstruction => {
                        self.on_processing_instruction(tokenizer_token)
                    }
                    TokenizerTokenLocation::SelfClosingTag => self.on_self_closing_tag(),
                    TokenizerTokenLocation::Text => self.on_text(tokenizer_token),
                    TokenizerTokenLocation::TextEntity => self.on_text_entity(tokenizer_token),
                    TokenizerTokenLocation::End => self.onend(),
                },
            }
        }
    }
}

impl<'a> Iterator for Parser<'a> {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        unsafe { self.parser_next() }
    }
}