lightml/
lib.rs

1#![allow(clippy::result_unit_err)]
2#![doc = include_str!("../README.md")]
3
4pub use lexer::Lexer;
5use std::borrow::Cow;
6
7mod lexer;
8pub mod matching;
9pub mod operations;
10
11#[derive(Debug, Clone, PartialEq)]
12pub struct Element {
13    /// Name of the element (TODO or reference to element)
14    pub tag_name: String,
15    pub attributes: Vec<Attribute>,
16    pub children: ElementChildren,
17}
18
19pub type Children = Vec<Node>;
20
21#[derive(Debug, Clone, PartialEq)]
22pub enum ElementChildren {
23    Children(Children),
24    /// For script + style elements
25    Literal(String),
26    /// For `img` elements etc
27    SelfClosing,
28}
29
30impl From<Element> for Node {
31    fn from(value: Element) -> Node {
32        Node::Element(value)
33    }
34}
35
36#[derive(Debug, Clone, PartialEq)]
37pub struct Document {
38    pub html_element: Element,
39}
40
41impl Document {
42    pub fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
43        // TODO temp
44        let _ = reader.is_operator_advance("<!DOCTYPE html>");
45        Element::from_reader(reader).map(|html_element| Document { html_element })
46    }
47}
48
49impl Element {
50    pub fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
51        reader.expect_start('<')?;
52        let tag_name = reader.parse_identifier("Element name", false)?.to_owned();
53        let mut attributes = Vec::new();
54        // TODO spread attributes
55        // Kind of weird / not clear conditions for breaking out of while loop
56        loop {
57            reader.skip();
58            if reader.is_operator_advance(">") {
59                break;
60            } else if reader.is_operator_advance("/>") {
61                // TODO check set closing
62                // Early return if self closing
63                return Ok(Element {
64                    tag_name,
65                    attributes,
66                    children: ElementChildren::SelfClosing,
67                });
68            } else {
69                // TODO extras here @ etc
70                // let start = reader.get_start();
71                let key = reader
72                    .parse_identifier("Element attribute", false)?
73                    .to_owned();
74                let attribute = if reader.is_operator_advance("=") {
75                    // let start = reader.get_start();
76                    if reader.starts_with_string_delimeter() {
77                        // TODO _quoted
78                        let (content, _quoted) = reader.parse_string_literal()?;
79                        // let position = start.with_length(content.len() + 2);
80                        Attribute {
81                            key,
82                            value: content.to_owned(),
83                        }
84                    } else {
85                        return Err(());
86                        // let error_position = start.with_length(
87                        // 	crate::lexer::utilities::next_empty_occurance(reader.get_current()),
88                        // );
89                        // return Err(ParseError::new(
90                        // 	ParseErrors::ExpectedAttribute,
91                        // 	error_position,
92                        // ));
93                    }
94                } else {
95                    // Boolean attribute
96                    Attribute {
97                        key,
98                        value: Default::default(),
99                    }
100                };
101                attributes.push(attribute);
102            }
103        }
104
105        if html_tag_is_self_closing(&tag_name) {
106            return Ok(Element {
107                tag_name,
108                attributes,
109                children: ElementChildren::SelfClosing,
110            });
111        } else if html_tag_contains_literal_content(&tag_name) {
112            // TODO could embedded parser?
113            let (content, _) = reader
114                .parse_until("</")
115                .map_err(|()| {
116                    // TODO might be a problem
117                    // let position = reader.get_start().with_length(reader.get_current().len());
118                    // ParseError::new(crate::ParseErrors::UnexpectedEnd, position)
119                })?
120                .to_owned();
121
122            reader.advance("</".len() as u32);
123
124            let content = content.to_owned();
125
126            let closing_tag_name = reader.parse_identifier("Closing tag", false)?;
127            if tag_name != closing_tag_name {
128                return Err(());
129                // return Err(ParseError::new(
130                // 	crate::ParseErrors::ClosingTagDoesNotMatch {
131                // 		tag_name: &tag_name,
132                // 		closing_tag_name,
133                // 	},
134                // 	start.with_length(closing_tag_name.len() + 2),
135                // ));
136            }
137            reader.expect('>')?;
138            let children = ElementChildren::Literal(content);
139            return Ok(Element {
140                tag_name,
141                attributes,
142                children,
143            });
144        }
145
146        let children = children_from_reader(reader)?;
147        if reader.is_operator_advance("</") {
148            let closing_tag_name = reader.parse_identifier("closing tag", false)?;
149            reader.expect('>')?;
150            if closing_tag_name != tag_name {
151                return Err(());
152                // return Err(ParseError::new(
153                // 	crate::ParseErrors::ClosingTagDoesNotMatch {
154                // 		tag_name: &tag_name,
155                // 		closing_tag_name,
156                // 	},
157                // 	start.with_length(closing_tag_name.len() + 2),
158                // ));
159            }
160            Ok(Element {
161                tag_name,
162                attributes,
163                children: ElementChildren::Children(children),
164            })
165        } else {
166            Err(())
167        }
168    }
169
170    // fn to_string_from_buffer<T: source_map::ToString>(
171    // 	&self,
172    // 	buf: &mut T,
173    // 	options: &crate::ToStringOptions,
174    // 	local: crate::LocalToStringInformation,
175    // ) {
176    // 	buf.push('<');
177    // 	buf.push_str(&self.tag_name);
178    // 	for attribute in &self.attributes {
179    // 		buf.push(' ');
180    // 		attribute.to_string_from_buffer(buf, options, local);
181    // 	}
182    // 	buf.push('>');
183
184    // 	match self.children {
185    // 		ElementChildren::Children(ref children) => {
186    // 			_children_to_string(children, buf, options, local);
187    // 			buf.push_str("</");
188    // 			buf.push_str(&self.tag_name);
189    // 			buf.push('>');
190    // 		}
191    // 		ElementChildren::SelfClosing => {}
192    // 		ElementChildren::Literal(ref content) => {
193    // 			buf.push_str(content);
194    // 			buf.push_str("</");
195    // 			buf.push_str(&self.tag_name);
196    // 			buf.push('>');
197    // 		}
198    // 	}
199    // }
200}
201
/// A `key` or `key="value"` pair written on an element's opening tag.
#[derive(Debug, Clone, PartialEq)]
pub struct Attribute {
    /// Attribute name
    key: String,
    /// Attribute value; empty for boolean attributes (those written without `=`)
    value: String,
}
207
208impl Attribute {
209    // fn get_position(&self) -> Span {
210    // 	match self {
211    // 		Attribute::Static(_, _, pos)
212    // 		| Attribute::Dynamic(_, _, pos)
213    // 		| Attribute::Boolean(_, pos) => *pos,
214    // 		Attribute::Spread(_, spread_pos) => *spread_pos,
215    // 		Attribute::Shorthand(expr) => expr.get_position(),
216    // 	}
217    // }
218
219    fn _from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
220        // let start = reader.get_start();
221        let key = reader
222            .parse_identifier("Element attribute", false)?
223            .to_owned();
224        if reader.is_operator_advance("=") {
225            if reader.starts_with_string_delimeter() {
226                let (content, _quoted) = reader.parse_string_literal()?;
227                Ok(Attribute {
228                    key,
229                    value: content.to_owned(),
230                })
231            } else {
232                // let error_position = start.with_length(
233                // 	crate::lexer::utilities::next_empty_occurance(reader.get_current()),
234                // );
235                Err(())
236                // Err(ParseError::new(ParseErrors::ExpectedAttribute, error_position))
237            }
238        } else {
239            Ok(Attribute {
240                key,
241                value: Default::default(),
242            })
243        }
244    }
245
246    // fn to_string_from_buffer<T: source_map::ToString>(
247    // 	&self,
248    // 	buf: &mut T,
249    // 	options: &crate::ToStringOptions,
250    // 	local: crate::LocalToStringInformation,
251    // ) {
252    // 	match self {
253    // 		Attribute::Static(key, expression, _) => {
254    // 			buf.push_str(key.as_str());
255    // 			buf.push('=');
256    // 			buf.push('"');
257    // 			buf.push_str(expression.as_str());
258    // 			buf.push('"');
259    // 		}
260    // 		Attribute::Dynamic(key, expression, _) => {
261    // 			buf.push_str(key.as_str());
262    // 			buf.push('=');
263    // 			buf.push('{');
264    // 			expression.to_string_from_buffer(buf, options, local);
265    // 			buf.push('}');
266    // 		}
267    // 		Attribute::Boolean(key, _) => {
268    // 			buf.push_str(key.as_str());
269    // 		}
270    // 		Attribute::Spread(expr, _) => {
271    // 			buf.push_str("...");
272    // 			expr.to_string_from_buffer(buf, options, local);
273    // 		}
274    // 		Attribute::Shorthand(expr) => {
275    // 			expr.to_string_from_buffer(buf, options, local);
276    // 		}
277    // 	}
278    // }
279}
280
/// Result of a parsing step; failures currently carry no detail.
type ParseResult<T> = Result<T, ()>;
282
283fn children_from_reader(reader: &mut crate::Lexer) -> ParseResult<Vec<Node>> {
284    let mut children = Vec::new();
285    // TODO count new lines etc
286    loop {
287        reader.skip();
288        // for _ in 0..reader.last_was_from_new_line() {
289        // 	children.push(Node::LineBreak);
290        // }
291        if reader.starts_with_str("</") {
292            return Ok(children);
293        }
294        children.push(Node::from_reader(reader)?);
295    }
296}
297
298// fn children_to_string<T: source_map::ToString>(
299// 	children: &[Node],
300// 	buf: &mut T,
301// 	options: &crate::ToStringOptions,
302// 	local: crate::LocalToStringInformation,
303// ) {
304// 	let element_or_line_break_in_children =
305// 		children.iter().any(|node| matches!(node, Node::Element(..) | Node::LineBreak));
306
307// 	let mut previous_was_element_or_line_break = true;
308
309// 	for node in children {
310// 		if element_or_line_break_in_children
311// 			&& !matches!(node, Node::LineBreak)
312// 			&& previous_was_element_or_line_break
313// 		{
314// 			options.add_indent(local.depth + 1, buf);
315// 		}
316// 		node.to_string_from_buffer(buf, options, local);
317// 		previous_was_element_or_line_break =
318// 			matches!(node, Node::Element(..) | Node::LineBreak);
319// 	}
320
321// 	if options.pretty && local.depth > 0 && previous_was_element_or_line_break {
322// 		options.add_indent(local.depth, buf);
323// 	}
324// }
325
326#[derive(Debug, Clone, PartialEq)]
327pub enum Node {
328    Element(Element),
329    TextNode(String),
330    Comment(String),
331}
332
333impl Node {
334    // fn get_position(&self) -> Span {
335    // 	match self {
336    // 		Node::TextNode(_, pos)
337    // 		| Node::Comment(_, pos) => *pos,
338    // 		Node::Element(element) => element.get_position(),
339    // 		Node::LineBreak => source_map::Nullable::NULL,
340    // 	}
341    // }
342
343    fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
344        reader.skip();
345        // let start = reader.get_start();
346        // if reader.is_operator_advance("{") {
347        // 	let expression = FunctionArgument::from_reader(reader)?;
348        // 	let end = reader.expect('}')?;
349        // 	// let position = start.union(end);
350        // 	let position = ();
351        // 	Ok(Node::InterpolatedExpression(Box::new(expression), position))
352        // } else
353        if reader.starts_with_str("<!--") {
354            reader.advance("<!--".len() as u32);
355            // .map_err(|()| {
356            // 	// TODO might be a problem
357            // 	let position = reader.get_start().with_length(reader.get_current().len());
358            // 	ParseError::new(crate::ParseErrors::UnexpectedEnd, position)
359            // })?
360            let (content, _) = reader.parse_until("-->")?.to_owned();
361            Ok(Node::Comment(content.to_owned()))
362        } else if reader.starts_with_str("<") {
363            let element = Element::from_reader(reader)?;
364            Ok(Node::Element(element))
365        } else {
366            let (content, _) = reader.parse_until("<")?;
367            // .map_err(|()| {
368            // 	// TODO might be a problem
369            // 	let position = reader.get_start().with_length(reader.get_current().len());
370            // 	ParseError::new(crate::ParseErrors::UnexpectedEnd, position)
371            // })?;
372            Ok(Node::TextNode(content.trim_start().into()))
373        }
374    }
375
376    // fn to_string_from_buffer<T: source_map::ToString>(
377    // 	&self,
378    // 	buf: &mut T,
379    // 	options: &crate::ToStringOptions,
380    // 	local: crate::LocalToStringInformation,
381    // ) {
382    // 	match self {
383    // 		Node::Element(element) => {
384    // 			element.to_string_from_buffer(buf, options, local.next_level());
385    // 		}
386    // 		Node::TextNode(text, _) => buf.push_str(text),
387    // 		Node::InterpolatedExpression(expression, _) => {
388    // 			buf.push('{');
389    // 			expression.to_string_from_buffer(buf, options, local.next_level());
390    // 			buf.push('}');
391    // 		}
392    // 		Node::LineBreak => {
393    // 			if options.pretty {
394    // 				buf.push_new_line();
395    // 			}
396    // 		}
397    // 		Node::Comment(comment, _) => {
398    // 			if options.pretty {
399    // 				buf.push_str("<!--");
400    // 				buf.push_str(comment);
401    // 				buf.push_str("-->");
402    // 			}
403    // 		}
404    // 	}
405    // }
406}
407
/// Whether the contents of `tag_name` are raw text rather than nested markup.
///
/// True for `script` and `style`. The comparison ignores ASCII case, as HTML
/// tag names are case-insensitive (`<SCRIPT>` is the same element as
/// `<script>`).
///
/// Used for lexing
#[must_use]
pub fn html_tag_contains_literal_content(tag_name: &str) -> bool {
    const LITERAL_CONTENT_ELEMENTS: [&str; 2] = ["script", "style"];

    LITERAL_CONTENT_ELEMENTS
        .iter()
        .any(|name| tag_name.eq_ignore_ascii_case(name))
}
413
/// Whether `tag_name` is an element that never has contents or a closing tag
/// (for example `br` or `img`).
///
/// The comparison ignores ASCII case, as HTML tag names are case-insensitive
/// (`<BR>` is the same element as `<br>`).
///
/// Used for lexing
#[must_use]
pub fn html_tag_is_self_closing(tag_name: &str) -> bool {
    const SELF_CLOSING_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];

    SELF_CLOSING_ELEMENTS
        .iter()
        .any(|name| tag_name.eq_ignore_ascii_case(name))
}
435
436#[cfg_attr(target_family = "wasm", wasm_bindgen::prelude::wasm_bindgen)]
437pub fn retrieve(content: String, query: String) -> String {
438    use crate::{
439        matching::{query_selector, query_selector_all, Selector},
440        operations::inner_text,
441    };
442    let result = Document::from_reader(&mut Lexer::new(&content));
443    let document = result.unwrap();
444    let mut current: Vec<&Element> = vec![&document.html_element];
445    for query in query.split('\0') {
446        if let Some(selector) = query.strip_prefix("single ") {
447            let selector = Selector::from_string(selector.trim());
448            current = current
449                .into_iter()
450                .flat_map(|element| query_selector(element, &selector))
451                .collect();
452        } else if let Some(selector) = query.strip_prefix("all ") {
453            let selector = Selector::from_string(selector.trim());
454            current = current
455                .into_iter()
456                .flat_map(|element| query_selector_all(element, &selector))
457                .collect();
458        } else if let Some(expected_key) = query.strip_prefix("attribute ") {
459            let mut buf = String::new();
460            for element in current {
461                let value = element
462                    .attributes
463                    .iter()
464                    .find_map(|Attribute { key, value }| (key == expected_key).then_some(value));
465                if let Some(value) = value {
466                    if !buf.is_empty() {
467                        buf.push('\0');
468                    }
469                    buf.push_str(value);
470                }
471            }
472            return buf;
473        } else if let "text" = query {
474            let mut buf = String::new();
475            for element in current {
476                if !buf.is_empty() {
477                    buf.push('\0');
478                }
479                buf.push_str(&inner_text(element));
480            }
481            return buf;
482        }
483    }
484
485    panic!("no end query")
486}