lightml/lib.rs
1#![allow(clippy::result_unit_err)]
2#![doc = include_str!("../README.md")]
3
4pub use lexer::Lexer;
5use std::borrow::Cow;
6
7mod lexer;
8pub mod matching;
9pub mod operations;
10
11#[derive(Debug, Clone, PartialEq)]
12pub struct Element {
13 /// Name of the element (TODO or reference to element)
14 pub tag_name: String,
15 pub attributes: Vec<Attribute>,
16 pub children: ElementChildren,
17}
18
19pub type Children = Vec<Node>;
20
21#[derive(Debug, Clone, PartialEq)]
22pub enum ElementChildren {
23 Children(Children),
24 /// For script + style elements
25 Literal(String),
26 /// For `img` elements etc
27 SelfClosing,
28}
29
30impl From<Element> for Node {
31 fn from(value: Element) -> Node {
32 Node::Element(value)
33 }
34}
35
36#[derive(Debug, Clone, PartialEq)]
37pub struct Document {
38 pub html_element: Element,
39}
40
41impl Document {
42 pub fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
43 // TODO temp
44 let _ = reader.is_operator_advance("<!DOCTYPE html>");
45 Element::from_reader(reader).map(|html_element| Document { html_element })
46 }
47}
48
49impl Element {
50 pub fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
51 reader.expect_start('<')?;
52 let tag_name = reader.parse_identifier("Element name", false)?.to_owned();
53 let mut attributes = Vec::new();
54 // TODO spread attributes
55 // Kind of weird / not clear conditions for breaking out of while loop
56 loop {
57 reader.skip();
58 if reader.is_operator_advance(">") {
59 break;
60 } else if reader.is_operator_advance("/>") {
61 // TODO check set closing
62 // Early return if self closing
63 return Ok(Element {
64 tag_name,
65 attributes,
66 children: ElementChildren::SelfClosing,
67 });
68 } else {
69 // TODO extras here @ etc
70 // let start = reader.get_start();
71 let key = reader
72 .parse_identifier("Element attribute", false)?
73 .to_owned();
74 let attribute = if reader.is_operator_advance("=") {
75 // let start = reader.get_start();
76 if reader.starts_with_string_delimeter() {
77 // TODO _quoted
78 let (content, _quoted) = reader.parse_string_literal()?;
79 // let position = start.with_length(content.len() + 2);
80 Attribute {
81 key,
82 value: content.to_owned(),
83 }
84 } else {
85 return Err(());
86 // let error_position = start.with_length(
87 // crate::lexer::utilities::next_empty_occurance(reader.get_current()),
88 // );
89 // return Err(ParseError::new(
90 // ParseErrors::ExpectedAttribute,
91 // error_position,
92 // ));
93 }
94 } else {
95 // Boolean attribute
96 Attribute {
97 key,
98 value: Default::default(),
99 }
100 };
101 attributes.push(attribute);
102 }
103 }
104
105 if html_tag_is_self_closing(&tag_name) {
106 return Ok(Element {
107 tag_name,
108 attributes,
109 children: ElementChildren::SelfClosing,
110 });
111 } else if html_tag_contains_literal_content(&tag_name) {
112 // TODO could embedded parser?
113 let (content, _) = reader
114 .parse_until("</")
115 .map_err(|()| {
116 // TODO might be a problem
117 // let position = reader.get_start().with_length(reader.get_current().len());
118 // ParseError::new(crate::ParseErrors::UnexpectedEnd, position)
119 })?
120 .to_owned();
121
122 reader.advance("</".len() as u32);
123
124 let content = content.to_owned();
125
126 let closing_tag_name = reader.parse_identifier("Closing tag", false)?;
127 if tag_name != closing_tag_name {
128 return Err(());
129 // return Err(ParseError::new(
130 // crate::ParseErrors::ClosingTagDoesNotMatch {
131 // tag_name: &tag_name,
132 // closing_tag_name,
133 // },
134 // start.with_length(closing_tag_name.len() + 2),
135 // ));
136 }
137 reader.expect('>')?;
138 let children = ElementChildren::Literal(content);
139 return Ok(Element {
140 tag_name,
141 attributes,
142 children,
143 });
144 }
145
146 let children = children_from_reader(reader)?;
147 if reader.is_operator_advance("</") {
148 let closing_tag_name = reader.parse_identifier("closing tag", false)?;
149 reader.expect('>')?;
150 if closing_tag_name != tag_name {
151 return Err(());
152 // return Err(ParseError::new(
153 // crate::ParseErrors::ClosingTagDoesNotMatch {
154 // tag_name: &tag_name,
155 // closing_tag_name,
156 // },
157 // start.with_length(closing_tag_name.len() + 2),
158 // ));
159 }
160 Ok(Element {
161 tag_name,
162 attributes,
163 children: ElementChildren::Children(children),
164 })
165 } else {
166 Err(())
167 }
168 }
169
170 // fn to_string_from_buffer<T: source_map::ToString>(
171 // &self,
172 // buf: &mut T,
173 // options: &crate::ToStringOptions,
174 // local: crate::LocalToStringInformation,
175 // ) {
176 // buf.push('<');
177 // buf.push_str(&self.tag_name);
178 // for attribute in &self.attributes {
179 // buf.push(' ');
180 // attribute.to_string_from_buffer(buf, options, local);
181 // }
182 // buf.push('>');
183
184 // match self.children {
185 // ElementChildren::Children(ref children) => {
186 // _children_to_string(children, buf, options, local);
187 // buf.push_str("</");
188 // buf.push_str(&self.tag_name);
189 // buf.push('>');
190 // }
191 // ElementChildren::SelfClosing => {}
192 // ElementChildren::Literal(ref content) => {
193 // buf.push_str(content);
194 // buf.push_str("</");
195 // buf.push_str(&self.tag_name);
196 // buf.push('>');
197 // }
198 // }
199 // }
200}
201
/// A single `key="value"` (or bare `key`) attribute from an element's opening tag.
///
/// The fields are public, like those of [`Element`], so that code outside this crate can
/// read the attributes exposed through `Element::attributes` and build elements by hand.
#[derive(Debug, Clone, PartialEq)]
pub struct Attribute {
    /// Attribute name
    pub key: String,
    /// Attribute value without its quotes. Empty for a boolean attribute written without
    /// `=`
    pub value: String,
}
207
208impl Attribute {
209 // fn get_position(&self) -> Span {
210 // match self {
211 // Attribute::Static(_, _, pos)
212 // | Attribute::Dynamic(_, _, pos)
213 // | Attribute::Boolean(_, pos) => *pos,
214 // Attribute::Spread(_, spread_pos) => *spread_pos,
215 // Attribute::Shorthand(expr) => expr.get_position(),
216 // }
217 // }
218
219 fn _from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
220 // let start = reader.get_start();
221 let key = reader
222 .parse_identifier("Element attribute", false)?
223 .to_owned();
224 if reader.is_operator_advance("=") {
225 if reader.starts_with_string_delimeter() {
226 let (content, _quoted) = reader.parse_string_literal()?;
227 Ok(Attribute {
228 key,
229 value: content.to_owned(),
230 })
231 } else {
232 // let error_position = start.with_length(
233 // crate::lexer::utilities::next_empty_occurance(reader.get_current()),
234 // );
235 Err(())
236 // Err(ParseError::new(ParseErrors::ExpectedAttribute, error_position))
237 }
238 } else {
239 Ok(Attribute {
240 key,
241 value: Default::default(),
242 })
243 }
244 }
245
246 // fn to_string_from_buffer<T: source_map::ToString>(
247 // &self,
248 // buf: &mut T,
249 // options: &crate::ToStringOptions,
250 // local: crate::LocalToStringInformation,
251 // ) {
252 // match self {
253 // Attribute::Static(key, expression, _) => {
254 // buf.push_str(key.as_str());
255 // buf.push('=');
256 // buf.push('"');
257 // buf.push_str(expression.as_str());
258 // buf.push('"');
259 // }
260 // Attribute::Dynamic(key, expression, _) => {
261 // buf.push_str(key.as_str());
262 // buf.push('=');
263 // buf.push('{');
264 // expression.to_string_from_buffer(buf, options, local);
265 // buf.push('}');
266 // }
267 // Attribute::Boolean(key, _) => {
268 // buf.push_str(key.as_str());
269 // }
270 // Attribute::Spread(expr, _) => {
271 // buf.push_str("...");
272 // expr.to_string_from_buffer(buf, options, local);
273 // }
274 // Attribute::Shorthand(expr) => {
275 // expr.to_string_from_buffer(buf, options, local);
276 // }
277 // }
278 // }
279}
280
/// Result of a parsing step. Failures carry no detail yet, hence the `()` error.
type ParseResult<T> = Result<T, ()>;
282
283fn children_from_reader(reader: &mut crate::Lexer) -> ParseResult<Vec<Node>> {
284 let mut children = Vec::new();
285 // TODO count new lines etc
286 loop {
287 reader.skip();
288 // for _ in 0..reader.last_was_from_new_line() {
289 // children.push(Node::LineBreak);
290 // }
291 if reader.starts_with_str("</") {
292 return Ok(children);
293 }
294 children.push(Node::from_reader(reader)?);
295 }
296}
297
298// fn children_to_string<T: source_map::ToString>(
299// children: &[Node],
300// buf: &mut T,
301// options: &crate::ToStringOptions,
302// local: crate::LocalToStringInformation,
303// ) {
304// let element_or_line_break_in_children =
305// children.iter().any(|node| matches!(node, Node::Element(..) | Node::LineBreak));
306
307// let mut previous_was_element_or_line_break = true;
308
309// for node in children {
310// if element_or_line_break_in_children
311// && !matches!(node, Node::LineBreak)
312// && previous_was_element_or_line_break
313// {
314// options.add_indent(local.depth + 1, buf);
315// }
316// node.to_string_from_buffer(buf, options, local);
317// previous_was_element_or_line_break =
318// matches!(node, Node::Element(..) | Node::LineBreak);
319// }
320
321// if options.pretty && local.depth > 0 && previous_was_element_or_line_break {
322// options.add_indent(local.depth, buf);
323// }
324// }
325
326#[derive(Debug, Clone, PartialEq)]
327pub enum Node {
328 Element(Element),
329 TextNode(String),
330 Comment(String),
331}
332
333impl Node {
334 // fn get_position(&self) -> Span {
335 // match self {
336 // Node::TextNode(_, pos)
337 // | Node::Comment(_, pos) => *pos,
338 // Node::Element(element) => element.get_position(),
339 // Node::LineBreak => source_map::Nullable::NULL,
340 // }
341 // }
342
343 fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
344 reader.skip();
345 // let start = reader.get_start();
346 // if reader.is_operator_advance("{") {
347 // let expression = FunctionArgument::from_reader(reader)?;
348 // let end = reader.expect('}')?;
349 // // let position = start.union(end);
350 // let position = ();
351 // Ok(Node::InterpolatedExpression(Box::new(expression), position))
352 // } else
353 if reader.starts_with_str("<!--") {
354 reader.advance("<!--".len() as u32);
355 // .map_err(|()| {
356 // // TODO might be a problem
357 // let position = reader.get_start().with_length(reader.get_current().len());
358 // ParseError::new(crate::ParseErrors::UnexpectedEnd, position)
359 // })?
360 let (content, _) = reader.parse_until("-->")?.to_owned();
361 Ok(Node::Comment(content.to_owned()))
362 } else if reader.starts_with_str("<") {
363 let element = Element::from_reader(reader)?;
364 Ok(Node::Element(element))
365 } else {
366 let (content, _) = reader.parse_until("<")?;
367 // .map_err(|()| {
368 // // TODO might be a problem
369 // let position = reader.get_start().with_length(reader.get_current().len());
370 // ParseError::new(crate::ParseErrors::UnexpectedEnd, position)
371 // })?;
372 Ok(Node::TextNode(content.trim_start().into()))
373 }
374 }
375
376 // fn to_string_from_buffer<T: source_map::ToString>(
377 // &self,
378 // buf: &mut T,
379 // options: &crate::ToStringOptions,
380 // local: crate::LocalToStringInformation,
381 // ) {
382 // match self {
383 // Node::Element(element) => {
384 // element.to_string_from_buffer(buf, options, local.next_level());
385 // }
386 // Node::TextNode(text, _) => buf.push_str(text),
387 // Node::InterpolatedExpression(expression, _) => {
388 // buf.push('{');
389 // expression.to_string_from_buffer(buf, options, local.next_level());
390 // buf.push('}');
391 // }
392 // Node::LineBreak => {
393 // if options.pretty {
394 // buf.push_new_line();
395 // }
396 // }
397 // Node::Comment(comment, _) => {
398 // if options.pretty {
399 // buf.push_str("<!--");
400 // buf.push_str(comment);
401 // buf.push_str("-->");
402 // }
403 // }
404 // }
405 // }
406}
407
/// Whether the body of `tag_name` is kept as raw text rather than parsed as nodes.
///
/// True for exactly `script` and `style`; the comparison is case-sensitive.
#[must_use]
pub fn html_tag_contains_literal_content(tag_name: &str) -> bool {
    tag_name == "script" || tag_name == "style"
}
413
/// Whether `tag_name` is an element that never has a body or a closing tag (`<br>`,
/// `<img>`, ...).
///
/// The comparison is case-sensitive.
#[must_use]
pub fn html_tag_is_self_closing(tag_name: &str) -> bool {
    const SELF_CLOSING_TAGS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    SELF_CLOSING_TAGS.contains(&tag_name)
}
435
436#[cfg_attr(target_family = "wasm", wasm_bindgen::prelude::wasm_bindgen)]
437pub fn retrieve(content: String, query: String) -> String {
438 use crate::{
439 matching::{query_selector, query_selector_all, Selector},
440 operations::inner_text,
441 };
442 let result = Document::from_reader(&mut Lexer::new(&content));
443 let document = result.unwrap();
444 let mut current: Vec<&Element> = vec![&document.html_element];
445 for query in query.split('\0') {
446 if let Some(selector) = query.strip_prefix("single ") {
447 let selector = Selector::from_string(selector.trim());
448 current = current
449 .into_iter()
450 .flat_map(|element| query_selector(element, &selector))
451 .collect();
452 } else if let Some(selector) = query.strip_prefix("all ") {
453 let selector = Selector::from_string(selector.trim());
454 current = current
455 .into_iter()
456 .flat_map(|element| query_selector_all(element, &selector))
457 .collect();
458 } else if let Some(expected_key) = query.strip_prefix("attribute ") {
459 let mut buf = String::new();
460 for element in current {
461 let value = element
462 .attributes
463 .iter()
464 .find_map(|Attribute { key, value }| (key == expected_key).then_some(value));
465 if let Some(value) = value {
466 if !buf.is_empty() {
467 buf.push('\0');
468 }
469 buf.push_str(value);
470 }
471 }
472 return buf;
473 } else if let "text" = query {
474 let mut buf = String::new();
475 for element in current {
476 if !buf.is_empty() {
477 buf.push('\0');
478 }
479 buf.push_str(&inner_text(element));
480 }
481 return buf;
482 }
483 }
484
485 panic!("no end query")
486}