html_streaming_editor/html/
mod.rs1use rctree::{Children, Node};
2use snafu::{Backtrace, Snafu};
3use std::collections::BTreeMap;
4
5use crate::CssSelector;
6use tl::{HTMLTag, HTMLVersion, NodeHandle, Parser, VDom};
7
8#[cfg(test)]
9mod tests;
10
11#[derive(Debug, Snafu)]
12#[snafu(visibility(pub(crate)))]
13pub enum HtmlDomError {
14 #[snafu(display("Nothing Imported from tl"))]
15 NothingImported { backtrace: Backtrace },
16 #[snafu(display("Node not resolved by Parser"))]
17 InvalidParserState { backtrace: Backtrace },
18 #[snafu(display("HTML Document has invalid structure: {}", message))]
19 InvalidHtmlDocument {
20 message: &'static str,
21 backtrace: Backtrace,
22 },
23}
24
/// Names of the elements that are rendered without a closing tag.
const HTML_VOID_ELEMENTS: [&str; 16] = [
    "area",
    "base",
    "br",
    "col",
    "command",
    "embed",
    "hr",
    "img",
    "input",
    "keygen",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
];
29
30#[derive(Debug, PartialEq, Clone)]
31pub(crate) struct HtmlDocument {
32 pub doctype: Option<HTMLVersion>,
33}
34
/// An HTML element: its tag name plus its attributes.
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct HtmlTag {
    /// Element name, e.g. `div`.
    pub name: String,
    /// Attribute name to attribute value; iteration (and therefore rendering)
    /// follows the map's key order.
    pub attributes: BTreeMap<String, String>,
}
40
41impl HtmlTag {
42 pub(crate) fn of_name(name: impl Into<String>) -> Self {
43 HtmlTag {
44 name: name.into(),
45 attributes: BTreeMap::<String, String>::new(),
46 }
47 }
48
49 pub(crate) fn build_start_tag(&self, mut add_string: impl FnMut(String)) {
50 add_string(format!("<{}", self.name));
51 self.attributes
52 .iter()
53 .for_each(|(key, value)| add_string(format!(r#" {}="{}""#, key, value)));
54 add_string(String::from(">"));
55 }
56
57 pub(crate) fn build_end_tag(&self, mut add_string: impl FnMut(String)) {
58 if HTML_VOID_ELEMENTS.contains(&self.name.as_ref()) {
59 return;
60 }
61
62 add_string(format!("</{}>", self.name));
63 }
64
65 fn matches_selector(&self, selector: &CssSelector) -> bool {
66 if let Some(element) = selector.element {
67 if element.as_bytes() != self.name.as_bytes() {
68 return false;
69 }
70 }
71
72 if let Some(id) = selector.id {
73 if let Some(tag_id) = self.attributes.get(&String::from("id")) {
74 if id.as_bytes() != tag_id.as_bytes() {
75 return false;
76 }
77 } else {
78 return false;
79 }
80 }
81
82 for class in &selector.classes {
83 if !self.is_class_member(class) {
84 return false;
85 }
86 }
87
88 for _pseudo_class in &selector.pseudo_classes {
89 todo!("Implement pseudo-class support")
90 }
91
92 for attribute in &selector.attributes {
93 if let Some(attribute_value) = self.attributes.get(&String::from(attribute.attribute)) {
94 if !attribute.matches(attribute_value) {
95 return false;
96 }
97 } else {
98 return false;
99 }
100 }
101
102 true
103 }
104
105 fn is_class_member(&self, class: &str) -> bool {
106 if let Some(classes) = self.attributes.get(&String::from("class")) {
107 classes.split(' ').any(|c| c == class)
108 } else {
109 false
110 }
111 }
112}
113
114#[derive(Debug, PartialEq, Clone)]
115pub(crate) enum HtmlContent {
116 Document(HtmlDocument),
117 Tag(HtmlTag),
118 Text(String),
119 Comment(String),
120}
121
122impl HtmlContent {
123 pub(crate) fn is_tag(&self) -> bool {
124 matches!(self, HtmlContent::Tag(_))
125 }
126
127 pub(crate) fn import(dom: VDom) -> Result<Node<HtmlContent>, HtmlDomError> {
128 let (root_tag, root_tag_name) = Self::find_root_tag(&dom)?;
129
130 if root_tag_name == *"html" {
131 let document = Node::new(HtmlContent::Document(HtmlDocument {
132 doctype: dom.version(),
133 }));
134 document.append(root_tag);
135
136 Ok(document)
137 } else {
138 Ok(root_tag)
139 }
140 }
141
142 fn find_root_tag(dom: &VDom) -> Result<(Node<HtmlContent>, String), HtmlDomError> {
143 let parser = dom.parser();
144
145 for child in dom.children() {
146 if let Some(node) = child.get(parser) {
147 if let Some(tag) = node.as_tag() {
148 let name = String::from(tag.name().as_utf8_str());
149 let converted = Self::convert_tag(tag, parser)?;
150 return Ok((converted, name));
151 }
152 }
153 }
154
155 NothingImportedSnafu {}.fail()
156 }
157
158 fn convert_tag(tag: &HTMLTag, parser: &Parser) -> Result<Node<HtmlContent>, HtmlDomError> {
159 let name = String::from(tag.name().as_utf8_str());
160 let mut attributes = BTreeMap::new();
161
162 for (key, value) in tag.attributes().iter() {
163 let value_string = if let Some(value_content) = value {
164 String::from(value_content)
165 } else {
166 String::new()
167 };
168
169 attributes.insert(String::from(key), value_string);
170 }
171
172 let converted = Node::<HtmlContent>::new(HtmlContent::Tag(HtmlTag { name, attributes }));
173
174 for child in tag.children().top().iter() {
175 converted.append(Self::convert_node(child, parser)?)
176 }
177
178 Ok(converted)
179 }
180
181 fn convert_node(
182 node_handle: &NodeHandle,
183 parser: &Parser,
184 ) -> Result<Node<HtmlContent>, HtmlDomError> {
185 if let Some(node) = node_handle.get(parser) {
186 return match node {
187 tl::Node::Tag(tag) => Self::convert_tag(tag, parser),
188 tl::Node::Raw(text) => Self::convert_text(text.as_utf8_str()),
189 tl::Node::Comment(comment) => Self::convert_comment(comment.as_utf8_str()),
190 };
191 }
192
193 InvalidParserStateSnafu {}.fail()
194 }
195
196 fn convert_text(text: impl Into<String>) -> Result<Node<HtmlContent>, HtmlDomError> {
197 Ok(Node::new(HtmlContent::Text(text.into())))
198 }
199
200 fn convert_comment(comment: impl Into<String>) -> Result<Node<HtmlContent>, HtmlDomError> {
201 let comment = comment.into();
202 let comment = comment.trim_start_matches("<!--");
203 let comment = comment.trim_end_matches("-->");
204 let comment = comment.trim();
205 Ok(Node::new(HtmlContent::Comment(comment.into())))
206 }
207
208 fn inner_html(&self, children: Children<HtmlContent>) -> String {
209 match self {
210 HtmlContent::Comment(_) => String::new(),
211 HtmlContent::Text(s) => s.clone(),
212 HtmlContent::Document(d) => {
213 let mut inner_content = children
214 .into_iter()
215 .map(|c| c.outer_html())
216 .collect::<Vec<_>>();
217 if let Some(doctype) = &d.doctype {
218 inner_content.insert(0, doctype.outer_html());
219 inner_content.insert(1, String::from('\n'));
220 }
221
222 inner_content.join("")
223 }
224 HtmlContent::Tag(_t) => children
225 .into_iter()
226 .map(|c| c.outer_html())
227 .collect::<Vec<_>>()
228 .join(""),
229 }
230 }
231
232 fn outer_html(&self, children: Children<HtmlContent>) -> String {
233 match self {
234 HtmlContent::Comment(s) => format!("<!-- {} -->", s),
235 HtmlContent::Text(s) => s.clone(),
236 HtmlContent::Document(_) => self.inner_html(children),
237 HtmlContent::Tag(t) => {
238 let mut parts = Vec::<String>::new();
239 t.build_start_tag(|content| parts.push(content));
240
241 for child in children {
242 parts.push(child.outer_html());
243 }
244
245 t.build_end_tag(|content| parts.push(content));
246 parts.join("")
247 }
248 }
249 }
250
251 fn text_content(&self, children: Children<HtmlContent>) -> String {
252 match self {
253 HtmlContent::Comment(_) => String::new(),
254 HtmlContent::Text(s) => s.clone(),
255 HtmlContent::Tag(_) | HtmlContent::Document(_) => children
256 .into_iter()
257 .filter_map(|c| {
258 let child_render = c.text_content();
259
260 if child_render.is_empty() {
261 None
262 } else {
263 Some(child_render)
264 }
265 })
266 .collect::<Vec<_>>()
267 .join(" "),
268 }
269 }
270
271 fn matches_selector(&self, selector: &CssSelector) -> bool {
272 match self {
273 HtmlContent::Comment(_) | HtmlContent::Text(_) | HtmlContent::Document(_) => false,
274 HtmlContent::Tag(t) => t.matches_selector(selector),
275 }
276 }
277
278 pub(crate) fn clear_attribute(&mut self, attribute: &String) {
279 match self {
280 HtmlContent::Comment(_) | HtmlContent::Text(_) | HtmlContent::Document(_) => (),
281 HtmlContent::Tag(tag) => {
282 tag.attributes.remove(attribute);
283 }
284 }
285 }
286
287 pub(crate) fn set_attribute(&mut self, attribute: impl Into<String>, value: impl Into<String>) {
288 match self {
289 HtmlContent::Comment(_) | HtmlContent::Text(_) | HtmlContent::Document(_) => (),
290 HtmlContent::Tag(tag) => {
291 tag.attributes.insert(attribute.into(), value.into());
292 }
293 }
294 }
295
296 pub(crate) fn get_attribute(&self, attribute: &String) -> Option<String> {
297 match self {
298 HtmlContent::Comment(_) | HtmlContent::Text(_) | HtmlContent::Document(_) => None,
299 HtmlContent::Tag(tag) => tag.attributes.get(attribute).cloned(),
300 }
301 }
302}
303
/// Rendering of a value as HTML markup or as plain text.
pub trait HtmlRenderable {
    /// Markup of the value's content, without the value's own enclosing markup.
    fn inner_html(&self) -> String;

    /// Markup of the value including its own enclosing markup.
    fn outer_html(&self) -> String;

    /// Plain text of the value with all markup left out.
    fn text_content(&self) -> String;
}
329
330impl HtmlRenderable for Node<HtmlContent> {
331 fn inner_html(&self) -> String {
332 let children = self.children();
333 let inner = self.borrow();
334
335 inner.inner_html(children)
336 }
337
338 fn outer_html(&self) -> String {
339 let children = self.children();
340 let inner = self.borrow();
341
342 inner.outer_html(children)
343 }
344
345 fn text_content(&self) -> String {
346 let children = self.children();
347 let inner = self.borrow();
348
349 inner.text_content(children)
350 }
351}
352
353impl HtmlRenderable for HTMLVersion {
354 fn inner_html(&self) -> String {
355 String::new()
356 }
357
358 fn outer_html(&self) -> String {
359 match self {
360 HTMLVersion::HTML5 => String::from("<!DOCTYPE html>"),
361 HTMLVersion::StrictHTML401 => String::from(
362 r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">"#,
363 ),
364 HTMLVersion::TransitionalHTML401 => String::from(
365 r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">"#,
366 ),
367 HTMLVersion::FramesetHTML401 => String::from(
368 r#"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/frameset.dtd">"#,
369 ),
370 }
371 }
372
373 fn text_content(&self) -> String {
374 String::new()
375 }
376}
377
378pub(crate) trait HtmlQueryable {
379 fn matches_selector(&self, selector: &CssSelector) -> bool;
380}
381
382impl HtmlQueryable for Node<HtmlContent> {
383 fn matches_selector(&self, selector: &CssSelector) -> bool {
384 let inner = self.borrow();
385 inner.matches_selector(selector)
386 }
387}