tl/
vdom.rs

1use crate::errors::ParseError;
2use crate::parser::HTMLVersion;
3use crate::parser::NodeHandle;
4use crate::queryselector;
5use crate::queryselector::QuerySelectorIterator;
6use crate::Bytes;
7use crate::InnerNodeHandle;
8use crate::ParserOptions;
9use crate::{Node, Parser};
10use std::marker::PhantomData;
11
/// VDom represents a [Document Object Model](https://developer.mozilla.org/en/docs/Web/API/Document_Object_Model).
///
/// It is the result of parsing an HTML document.
/// Internally it is only a wrapper around the [`Parser`] struct, in which all of the HTML tags are stored.
/// Many functions of the public API take a reference to a [`Parser`] as a parameter to resolve
/// [`NodeHandle`]s to [`Node`]s; use [`VDom::parser`] or [`VDom::parser_mut`] to obtain one.
#[derive(Debug)]
pub struct VDom<'a> {
    /// Internal parser; the single field this wrapper owns.
    parser: Parser<'a>,
}
22
23impl<'a> From<Parser<'a>> for VDom<'a> {
24    fn from(parser: Parser<'a>) -> Self {
25        Self { parser }
26    }
27}
28
29impl<'a> VDom<'a> {
30    /// Returns a reference to the underlying parser
31    #[inline]
32    pub fn parser(&self) -> &Parser<'a> {
33        &self.parser
34    }
35
36    /// Returns a mutable reference to the underlying parser
37    #[inline]
38    pub fn parser_mut(&mut self) -> &mut Parser<'a> {
39        &mut self.parser
40    }
41
42    /// Finds an element by its `id` attribute.
43    pub fn get_element_by_id<'b, S>(&'b self, id: S) -> Option<NodeHandle>
44    where
45        S: Into<Bytes<'a>>,
46    {
47        let bytes: Bytes = id.into();
48        let parser = self.parser();
49
50        if parser.options.is_tracking_ids() {
51            parser.ids.get(&bytes).copied()
52        } else {
53            self.nodes()
54                .iter()
55                .enumerate()
56                .find(|(_, node)| {
57                    node.as_tag().is_some_and(|tag| {
58                        tag._attributes.id.as_ref().is_some_and(|x| x.eq(&bytes))
59                    })
60                })
61                .map(|(id, _)| NodeHandle::new(id as InnerNodeHandle))
62        }
63    }
64
65    /// Returns a list of elements that match a given class name.
66    pub fn get_elements_by_class_name<'b>(
67        &'b self,
68        id: &'b str,
69    ) -> Box<dyn Iterator<Item = NodeHandle> + 'b> {
70        let parser = self.parser();
71
72        if parser.options.is_tracking_classes() {
73            parser
74                .classes
75                .get(&Bytes::from(id.as_bytes()))
76                .map(|x| Box::new(x.iter().cloned()) as Box<dyn Iterator<Item = NodeHandle>>)
77                .unwrap_or_else(|| Box::new(std::iter::empty()))
78        } else {
79            let member = id;
80
81            let iter = self
82                .nodes()
83                .iter()
84                .enumerate()
85                .filter_map(move |(id, node)| {
86                    node.as_tag().and_then(|tag| {
87                        tag._attributes
88                            .is_class_member(member)
89                            .then(|| NodeHandle::new(id as InnerNodeHandle))
90                    })
91                });
92
93            Box::new(iter)
94        }
95    }
96
97    /// Returns a slice of *all* the elements in the HTML document
98    ///
99    /// The difference between `children()` and `nodes()` is that children only returns the immediate children of the root node,
100    /// while `nodes()` returns all nodes, including nested tags.
101    ///
102    /// # Order
103    /// The order of the returned nodes is the same as the order of the nodes in the HTML document.
104    pub fn nodes(&self) -> &[Node<'a>] {
105        &self.parser.tags
106    }
107
108    /// Returns a mutable slice of *all* the elements in the HTML document
109    ///
110    /// The difference between `children()` and `nodes()` is that children only returns the immediate children of the root node,
111    /// while `nodes()` returns all nodes, including nested tags.
112    pub fn nodes_mut(&mut self) -> &mut [Node<'a>] {
113        &mut self.parser.tags
114    }
115
116    /// Returns the topmost subnodes ("children") of this DOM
117    pub fn children(&self) -> &[NodeHandle] {
118        &self.parser.ast
119    }
120
121    /// Returns a mutable reference to the topmost subnodes ("children") of this DOM
122    pub fn children_mut(&mut self) -> &mut [NodeHandle] {
123        &mut self.parser.ast
124    }
125
126    /// Returns the HTML version.
127    /// This is determined by the `<!DOCTYPE>` tag
128    pub fn version(&self) -> Option<HTMLVersion> {
129        self.parser.version
130    }
131
132    /// Returns the contained markup of all of the elements in this DOM.
133    ///
134    /// Equivalent to [Element#outerHTML](https://developer.mozilla.org/en-US/docs/Web/API/Element/outerHTML) in browsers)
135    ///
136    /// # Example
137    /// ```
138    /// let html = r#"<div><p href="/about" id="find-me">Hello world</p></div>"#;
139    /// let mut dom = tl::parse(html, Default::default()).unwrap();
140    ///
141    /// let element = dom.get_element_by_id("find-me")
142    ///     .unwrap()
143    ///     .get_mut(dom.parser_mut())
144    ///     .unwrap()
145    ///     .as_tag_mut()
146    ///     .unwrap();
147    ///
148    /// element.attributes_mut().get_mut("href").flatten().unwrap().set("/");
149    ///
150    /// assert_eq!(dom.outer_html(), r#"<div><p href="/" id="find-me">Hello world</p></div>"#);
151    /// ```
152    pub fn outer_html(&self) -> String {
153        let mut inner_html = String::with_capacity(self.parser.stream.len());
154
155        for node in self.children() {
156            let node = node.get(&self.parser).unwrap();
157            inner_html.push_str(&node.outer_html(&self.parser));
158        }
159
160        inner_html
161    }
162
163    /// Tries to parse the query selector and returns an iterator over elements that match the given query selector.
164    ///
165    /// # Example
166    /// ```
167    /// let dom = tl::parse("<div><p class=\"foo\">bar</div>", tl::ParserOptions::default()).unwrap();
168    /// let handle = dom.query_selector("p.foo").and_then(|mut iter| iter.next()).unwrap();
169    /// let node = handle.get(dom.parser()).unwrap();
170    /// assert_eq!(node.inner_text(dom.parser()), "bar");
171    /// ```
172    pub fn query_selector<'b>(
173        &'b self,
174        selector: &'b str,
175    ) -> Option<QuerySelectorIterator<'a, 'b, Self>> {
176        let selector = crate::parse_query_selector(selector)?;
177        let iter = queryselector::QuerySelectorIterator::new(selector, self.parser(), self);
178        Some(iter)
179    }
180}
181
/// A RAII guarded version of VDom
///
/// The input string is freed once this struct goes out of scope.
/// The only way to construct this is by calling `parse_owned()`.
///
/// Field order matters: struct fields are dropped in declaration order, so `dom` is dropped
/// before `_s`, i.e. before the string it references is deallocated.
#[derive(Debug)]
pub struct VDomGuard {
    /// Wrapped VDom instance; its `'static` lifetime is only valid while `_s` is alive.
    dom: VDom<'static>,
    /// The leaked input string that is referenced by self.dom
    _s: RawString,
    /// PhantomData for self.dom
    _phantom: PhantomData<&'static str>,
}
195
// `RawString` holds a `*mut str`, and raw pointers are neither `Send` nor `Sync`, so these
// traits are not derived automatically for `VDomGuard` and are asserted manually here.
// NOTE(review): soundness also relies on `VDom<'static>` (and therefore `Parser`) being safe to
// move and share across threads; that cannot be verified from this file alone - TODO confirm.
unsafe impl Send for VDomGuard {}
unsafe impl Sync for VDomGuard {}
198
199impl VDomGuard {
200    /// Parses the input string
201    pub(crate) fn parse(input: String, options: ParserOptions) -> Result<VDomGuard, ParseError> {
202        let input = RawString::new(input);
203
204        let ptr = input.as_ptr();
205
206        let input_ref: &'static str = unsafe { &*ptr };
207
208        // Parsing will either:
209        // a) succeed, and we return a VDom instance
210        //    that, when dropped, will free the input string
211        // b) fail, and we return a ParseError
212        //    and `RawString`s destructor will run and deallocate the string properly
213        let mut parser = Parser::new(input_ref, options);
214        parser.parse()?;
215
216        Ok(Self {
217            _s: input,
218            dom: VDom::from(parser),
219            _phantom: PhantomData,
220        })
221    }
222}
223
224impl VDomGuard {
225    /// Returns a reference to the inner DOM.
226    ///
227    /// The lifetime of the returned `VDom` is bound to self so that elements cannot outlive this `VDomGuard` struct.
228    pub fn get_ref<'a>(&'a self) -> &'a VDom<'a> {
229        &self.dom
230    }
231
232    /// Returns a mutable reference to the inner DOM.
233    ///
234    /// The lifetime of the returned `VDom` is bound to self so that elements cannot outlive this `VDomGuard` struct.
235    pub fn get_mut_ref<'a, 'b: 'a>(&'b mut self) -> &'b VDom<'a> {
236        &mut self.dom
237    }
238}
239
/// Owner of a heap-allocated `str` that is held as a raw pointer.
///
/// The allocation is created from a `String` in [`RawString::new`] and released again when the
/// value is dropped.
#[derive(Debug)]
struct RawString(*mut str);

impl RawString {
    /// Takes ownership of `s` and converts its buffer into a raw `str` pointer.
    pub fn new(s: String) -> Self {
        let boxed: Box<str> = s.into_boxed_str();
        RawString(Box::into_raw(boxed))
    }

    /// Returns the raw pointer to the owned string data.
    ///
    /// The pointer stays valid until this `RawString` is dropped.
    pub fn as_ptr(&self) -> *mut str {
        let RawString(ptr) = self;
        *ptr
    }
}

impl Drop for RawString {
    /// Rebuilds the `Box<str>` from the raw pointer and drops it, freeing the allocation.
    fn drop(&mut self) {
        // SAFETY: the pointer is always valid because `RawString` can only be constructed through `RawString::new()`,
        // which obtains it from `Box::into_raw`.
        let reclaimed: Box<str> = unsafe { Box::from_raw(self.0) };
        drop(reclaimed);
    }
}
260}