tl/vdom.rs
1use crate::errors::ParseError;
2use crate::parser::HTMLVersion;
3use crate::parser::NodeHandle;
4use crate::queryselector;
5use crate::queryselector::QuerySelectorIterator;
6use crate::Bytes;
7use crate::InnerNodeHandle;
8use crate::ParserOptions;
9use crate::{Node, Parser};
10use std::marker::PhantomData;
11
12/// VDom represents a [Document Object Model](https://developer.mozilla.org/en/docs/Web/API/Document_Object_Model)
13///
14/// It is the result of parsing an HTML document.
15/// Internally it is only a wrapper around the [`Parser`] struct, in which all of the HTML tags are stored.
16/// Many functions of the public API take a reference to a [`Parser`] as a parameter to resolve [`NodeHandle`]s to [`Node`]s.
17#[derive(Debug)]
18pub struct VDom<'a> {
19 /// Internal parser
20 parser: Parser<'a>,
21}
22
23impl<'a> From<Parser<'a>> for VDom<'a> {
24 fn from(parser: Parser<'a>) -> Self {
25 Self { parser }
26 }
27}
28
29impl<'a> VDom<'a> {
30 /// Returns a reference to the underlying parser
31 #[inline]
32 pub fn parser(&self) -> &Parser<'a> {
33 &self.parser
34 }
35
36 /// Returns a mutable reference to the underlying parser
37 #[inline]
38 pub fn parser_mut(&mut self) -> &mut Parser<'a> {
39 &mut self.parser
40 }
41
42 /// Finds an element by its `id` attribute.
43 pub fn get_element_by_id<'b, S>(&'b self, id: S) -> Option<NodeHandle>
44 where
45 S: Into<Bytes<'a>>,
46 {
47 let bytes: Bytes = id.into();
48 let parser = self.parser();
49
50 if parser.options.is_tracking_ids() {
51 parser.ids.get(&bytes).copied()
52 } else {
53 self.nodes()
54 .iter()
55 .enumerate()
56 .find(|(_, node)| {
57 node.as_tag().is_some_and(|tag| {
58 tag._attributes.id.as_ref().is_some_and(|x| x.eq(&bytes))
59 })
60 })
61 .map(|(id, _)| NodeHandle::new(id as InnerNodeHandle))
62 }
63 }
64
65 /// Returns a list of elements that match a given class name.
66 pub fn get_elements_by_class_name<'b>(
67 &'b self,
68 id: &'b str,
69 ) -> Box<dyn Iterator<Item = NodeHandle> + 'b> {
70 let parser = self.parser();
71
72 if parser.options.is_tracking_classes() {
73 parser
74 .classes
75 .get(&Bytes::from(id.as_bytes()))
76 .map(|x| Box::new(x.iter().cloned()) as Box<dyn Iterator<Item = NodeHandle>>)
77 .unwrap_or_else(|| Box::new(std::iter::empty()))
78 } else {
79 let member = id;
80
81 let iter = self
82 .nodes()
83 .iter()
84 .enumerate()
85 .filter_map(move |(id, node)| {
86 node.as_tag().and_then(|tag| {
87 tag._attributes
88 .is_class_member(member)
89 .then(|| NodeHandle::new(id as InnerNodeHandle))
90 })
91 });
92
93 Box::new(iter)
94 }
95 }
96
97 /// Returns a slice of *all* the elements in the HTML document
98 ///
99 /// The difference between `children()` and `nodes()` is that children only returns the immediate children of the root node,
100 /// while `nodes()` returns all nodes, including nested tags.
101 ///
102 /// # Order
103 /// The order of the returned nodes is the same as the order of the nodes in the HTML document.
104 pub fn nodes(&self) -> &[Node<'a>] {
105 &self.parser.tags
106 }
107
108 /// Returns a mutable slice of *all* the elements in the HTML document
109 ///
110 /// The difference between `children()` and `nodes()` is that children only returns the immediate children of the root node,
111 /// while `nodes()` returns all nodes, including nested tags.
112 pub fn nodes_mut(&mut self) -> &mut [Node<'a>] {
113 &mut self.parser.tags
114 }
115
116 /// Returns the topmost subnodes ("children") of this DOM
117 pub fn children(&self) -> &[NodeHandle] {
118 &self.parser.ast
119 }
120
121 /// Returns a mutable reference to the topmost subnodes ("children") of this DOM
122 pub fn children_mut(&mut self) -> &mut [NodeHandle] {
123 &mut self.parser.ast
124 }
125
126 /// Returns the HTML version.
127 /// This is determined by the `<!DOCTYPE>` tag
128 pub fn version(&self) -> Option<HTMLVersion> {
129 self.parser.version
130 }
131
132 /// Returns the contained markup of all of the elements in this DOM.
133 ///
134 /// Equivalent to [Element#outerHTML](https://developer.mozilla.org/en-US/docs/Web/API/Element/outerHTML) in browsers)
135 ///
136 /// # Example
137 /// ```
138 /// let html = r#"<div><p href="/about" id="find-me">Hello world</p></div>"#;
139 /// let mut dom = tl::parse(html, Default::default()).unwrap();
140 ///
141 /// let element = dom.get_element_by_id("find-me")
142 /// .unwrap()
143 /// .get_mut(dom.parser_mut())
144 /// .unwrap()
145 /// .as_tag_mut()
146 /// .unwrap();
147 ///
148 /// element.attributes_mut().get_mut("href").flatten().unwrap().set("/");
149 ///
150 /// assert_eq!(dom.outer_html(), r#"<div><p href="/" id="find-me">Hello world</p></div>"#);
151 /// ```
152 pub fn outer_html(&self) -> String {
153 let mut inner_html = String::with_capacity(self.parser.stream.len());
154
155 for node in self.children() {
156 let node = node.get(&self.parser).unwrap();
157 inner_html.push_str(&node.outer_html(&self.parser));
158 }
159
160 inner_html
161 }
162
163 /// Tries to parse the query selector and returns an iterator over elements that match the given query selector.
164 ///
165 /// # Example
166 /// ```
167 /// let dom = tl::parse("<div><p class=\"foo\">bar</div>", tl::ParserOptions::default()).unwrap();
168 /// let handle = dom.query_selector("p.foo").and_then(|mut iter| iter.next()).unwrap();
169 /// let node = handle.get(dom.parser()).unwrap();
170 /// assert_eq!(node.inner_text(dom.parser()), "bar");
171 /// ```
172 pub fn query_selector<'b>(
173 &'b self,
174 selector: &'b str,
175 ) -> Option<QuerySelectorIterator<'a, 'b, Self>> {
176 let selector = crate::parse_query_selector(selector)?;
177 let iter = queryselector::QuerySelectorIterator::new(selector, self.parser(), self);
178 Some(iter)
179 }
180}
181
182/// A RAII guarded version of VDom
183///
184/// The input string is freed once this struct goes out of scope.
185/// The only way to construct this is by calling `parse_owned()`.
186#[derive(Debug)]
187pub struct VDomGuard {
188 /// Wrapped VDom instance
189 dom: VDom<'static>,
190 /// The leaked input string that is referenced by self.dom
191 _s: RawString,
192 /// PhantomData for self.dom
193 _phantom: PhantomData<&'static str>,
194}
195
196unsafe impl Send for VDomGuard {}
197unsafe impl Sync for VDomGuard {}
198
199impl VDomGuard {
200 /// Parses the input string
201 pub(crate) fn parse(input: String, options: ParserOptions) -> Result<VDomGuard, ParseError> {
202 let input = RawString::new(input);
203
204 let ptr = input.as_ptr();
205
206 let input_ref: &'static str = unsafe { &*ptr };
207
208 // Parsing will either:
209 // a) succeed, and we return a VDom instance
210 // that, when dropped, will free the input string
211 // b) fail, and we return a ParseError
212 // and `RawString`s destructor will run and deallocate the string properly
213 let mut parser = Parser::new(input_ref, options);
214 parser.parse()?;
215
216 Ok(Self {
217 _s: input,
218 dom: VDom::from(parser),
219 _phantom: PhantomData,
220 })
221 }
222}
223
224impl VDomGuard {
225 /// Returns a reference to the inner DOM.
226 ///
227 /// The lifetime of the returned `VDom` is bound to self so that elements cannot outlive this `VDomGuard` struct.
228 pub fn get_ref<'a>(&'a self) -> &'a VDom<'a> {
229 &self.dom
230 }
231
232 /// Returns a mutable reference to the inner DOM.
233 ///
234 /// The lifetime of the returned `VDom` is bound to self so that elements cannot outlive this `VDomGuard` struct.
235 pub fn get_mut_ref<'a, 'b: 'a>(&'b mut self) -> &'b VDom<'a> {
236 &mut self.dom
237 }
238}
239
/// Owning raw pointer to a heap-allocated `str`.
///
/// The pointer is produced by [`RawString::new`] and the allocation is released again in the
/// `Drop` implementation.
#[derive(Debug)]
struct RawString(*mut str);

impl RawString {
    /// Converts `s` into a boxed `str` and takes over the raw pointer to it.
    pub fn new(s: String) -> Self {
        let boxed: Box<str> = s.into_boxed_str();
        RawString(Box::into_raw(boxed))
    }

    /// Returns the raw pointer to the owned string data.
    pub fn as_ptr(&self) -> *mut str {
        self.0
    }
}

impl Drop for RawString {
    fn drop(&mut self) {
        // SAFETY: the pointer is always valid because `RawString` can only be constructed
        // through `RawString::new()`, which obtains it from `Box::into_raw`. Rebuilding the box
        // hands the allocation back so that it is freed exactly once, here.
        let reclaimed: Box<str> = unsafe { Box::from_raw(self.0) };
        drop(reclaimed);
    }
}
260}