#![allow(clippy::result_unit_err)]
#![doc = include_str!("../README.md")]
pub use lexer::Lexer;
use std::borrow::Cow;
mod lexer;
pub mod matching;
pub mod operations;
/// A parsed HTML element: its tag name, its attributes and its content.
#[derive(Debug, Clone, PartialEq)]
pub struct Element {
/// Identifier read directly after the opening `<`.
pub tag_name: String,
/// Attributes in the order they were parsed from the opening tag.
pub attributes: Vec<Attribute>,
/// What the element contains; see [`ElementChildren`].
pub children: ElementChildren,
}
/// A list of child nodes, as stored in [`ElementChildren::Children`].
pub type Children = Vec<Node>;
/// The content of an [`Element`].
#[derive(Debug, Clone, PartialEq)]
pub enum ElementChildren {
/// Child nodes parsed between the opening and the closing tag.
Children(Children),
/// Raw, unparsed text; produced for tags where
/// [`html_tag_contains_literal_content`] is true.
Literal(String),
/// No content: the tag ended with `/>` or [`html_tag_is_self_closing`] is
/// true for its name.
SelfClosing,
}
impl From<Element> for Node {
fn from(value: Element) -> Node {
Node::Element(value)
}
}
/// A parsed document, represented by its single root element.
#[derive(Debug, Clone, PartialEq)]
pub struct Document {
/// The element parsed after the optional `<!DOCTYPE html>` prefix.
pub html_element: Element,
}
impl Document {
    /// Parses a document from `reader`.
    ///
    /// A leading `<!DOCTYPE html>` is consumed when present; the root element
    /// is then parsed with [`Element::from_reader`].
    ///
    /// # Errors
    ///
    /// Returns `Err(())` when the root element fails to parse.
    pub fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
        // The doctype is optional, so whether it matched is deliberately ignored.
        let _ = reader.is_operator_advance("<!DOCTYPE html>");
        let html_element = Element::from_reader(reader)?;
        Ok(Self { html_element })
    }
}
impl Element {
    /// Parses one element, starting at its opening `<`.
    ///
    /// The opening tag is read first (name, then attributes until `>` or `/>`).
    /// The content is then decided as follows:
    ///
    /// * `/>` or a name accepted by [`html_tag_is_self_closing`] gives
    ///   [`ElementChildren::SelfClosing`] and no closing tag is read;
    /// * a name accepted by [`html_tag_contains_literal_content`] gives
    ///   [`ElementChildren::Literal`] holding everything up to the next `</`;
    /// * anything else gives [`ElementChildren::Children`].
    ///
    /// # Errors
    ///
    /// Returns `Err(())` when a lexer call fails, when `=` is not followed by
    /// a string delimiter, when the closing tag is missing, or when the
    /// closing tag name differs from the opening one.
    pub fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
        reader.expect_start('<')?;
        let tag_name = reader.parse_identifier("Element name", false)?.to_owned();

        // Read attributes until the opening tag ends; the loop yields whether
        // it ended with an explicit `/>`.
        let mut attributes = Vec::new();
        let explicitly_closed = loop {
            reader.skip();
            if reader.is_operator_advance(">") {
                break false;
            }
            if reader.is_operator_advance("/>") {
                break true;
            }

            let key = reader
                .parse_identifier("Element attribute", false)?
                .to_owned();
            let value: String = if reader.is_operator_advance("=") {
                // A value must be a delimited string literal.
                if !reader.starts_with_string_delimeter() {
                    return Err(());
                }
                let (content, _quoted) = reader.parse_string_literal()?;
                content.to_owned()
            } else {
                // Attribute without `=`: the value is the empty string.
                String::new()
            };
            attributes.push(Attribute { key, value });
        };

        let children = if explicitly_closed || html_tag_is_self_closing(&tag_name) {
            ElementChildren::SelfClosing
        } else if html_tag_contains_literal_content(&tag_name) {
            // Take the body verbatim; `parse_until` leaves the reader in front
            // of `</`, so it is skipped by hand.
            let (raw, _) = reader.parse_until("</")?;
            reader.advance("</".len() as u32);
            let closing_tag_name = reader.parse_identifier("Closing tag", false)?;
            if tag_name != closing_tag_name {
                return Err(());
            }
            reader.expect('>')?;
            ElementChildren::Literal(raw.to_owned())
        } else {
            let nodes = children_from_reader(reader)?;
            if !reader.is_operator_advance("</") {
                return Err(());
            }
            let closing_tag_name = reader.parse_identifier("closing tag", false)?;
            reader.expect('>')?;
            if closing_tag_name != tag_name {
                return Err(());
            }
            ElementChildren::Children(nodes)
        };

        Ok(Element {
            tag_name,
            attributes,
            children,
        })
    }
}
/// A single `key` or `key="value"` attribute of an [`Element`].
#[derive(Debug, Clone, PartialEq)]
pub struct Attribute {
/// Attribute name.
key: String,
/// Attribute value; the empty string when the attribute was written without `=`.
value: String,
}
impl Attribute {
fn _from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
let key = reader
.parse_identifier("Element attribute", false)?
.to_owned();
if reader.is_operator_advance("=") {
if reader.starts_with_string_delimeter() {
let (content, _quoted) = reader.parse_string_literal()?;
Ok(Attribute {
key,
value: content.to_owned(),
})
} else {
Err(())
}
} else {
Ok(Attribute {
key,
value: Default::default(),
})
}
}
}
/// Result of a parsing helper; failures carry no payload.
type ParseResult<T> = Result<T, ()>;
/// Parses sibling nodes until the reader is positioned at a `</`.
///
/// The `</` itself is left for the caller to consume.
///
/// # Errors
///
/// Returns `Err(())` as soon as any child fails to parse.
fn children_from_reader(reader: &mut crate::Lexer) -> ParseResult<Vec<Node>> {
    let mut nodes = Vec::new();
    reader.skip();
    while !reader.starts_with_str("</") {
        nodes.push(Node::from_reader(reader)?);
        reader.skip();
    }
    Ok(nodes)
}
/// One item of an element's content.
#[derive(Debug, Clone, PartialEq)]
pub enum Node {
/// A nested element.
Element(Element),
/// Text up to the next `<`, with leading whitespace trimmed.
TextNode(String),
/// The text found between `<!--` and `-->`.
Comment(String),
}
impl Node {
    /// Parses the next node: a comment, an element or a run of text.
    ///
    /// * `<!-- … -->` becomes [`Node::Comment`] holding the text between the
    ///   markers; the closing `-->` is consumed.
    /// * Anything else starting with `<` is parsed by
    ///   [`Element::from_reader`].
    /// * Otherwise everything up to the next `<` becomes [`Node::TextNode`].
    ///
    /// # Errors
    ///
    /// Returns `Err(())` when the terminating delimiter is not found or when
    /// the nested element fails to parse.
    fn from_reader(reader: &mut crate::Lexer) -> Result<Self, ()> {
        reader.skip();
        if reader.starts_with_str("<!--") {
            reader.advance("<!--".len() as u32);
            let (content, _) = reader.parse_until("-->")?;
            let comment = content.to_owned();
            // `parse_until` stops in front of its delimiter (the literal-content
            // branch of `Element::from_reader` skips `</` by hand for the same
            // reason). Without this the leftover `-->` would be read as the
            // start of a text node by the next call.
            reader.advance("-->".len() as u32);
            Ok(Node::Comment(comment))
        } else if reader.starts_with_str("<") {
            Element::from_reader(reader).map(Node::Element)
        } else {
            let (content, _) = reader.parse_until("<")?;
            // NOTE(review): if `skip` above already consumes leading whitespace
            // this trim is a no-op and trailing whitespace is kept — confirm
            // that this is intended.
            Ok(Node::TextNode(content.trim_start().into()))
        }
    }
}
/// Returns `true` for tags whose body is kept as raw text instead of being
/// parsed into nodes: `script` and `style`.
///
/// The comparison is exact and case-sensitive.
#[must_use]
pub fn html_tag_contains_literal_content(tag_name: &str) -> bool {
    tag_name == "script" || tag_name == "style"
}
/// Returns `true` for tags that never have a closing tag, so the parser
/// treats them as self-closing even when written with a plain `>`.
///
/// The comparison is exact and case-sensitive.
#[must_use]
pub fn html_tag_is_self_closing(tag_name: &str) -> bool {
    const SELF_CLOSING_TAGS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    SELF_CLOSING_TAGS.contains(&tag_name)
}
/// Parses `content` as a document and evaluates `query` against it.
///
/// `query` is a sequence of steps separated by `'\0'`, applied to a working
/// set that starts as the root element:
///
/// * `single <selector>` — replace each element by what `query_selector`
///   yields for it;
/// * `all <selector>` — replace each element by what `query_selector_all`
///   yields for it;
/// * `attribute <key>` — **terminal**: return the values of attribute `key`
///   for the elements that have it, joined by `'\0'`;
/// * `text` — **terminal**: return the `inner_text` of every element, joined
///   by `'\0'`.
///
/// Steps that match none of the forms above are ignored.
///
/// Every returned value is separated from the previous one by `'\0'`, even
/// when a value is the empty string, so the output splits back into one entry
/// per value.
///
/// # Panics
///
/// Panics when `content` fails to parse, or when `query` contains no terminal
/// step.
#[cfg_attr(target_family = "wasm", wasm_bindgen::prelude::wasm_bindgen)]
pub fn retrieve(content: String, query: String) -> String {
    use crate::{
        matching::{query_selector, query_selector_all, Selector},
        operations::inner_text,
    };

    let document = Document::from_reader(&mut Lexer::new(&content))
        .expect("failed to parse HTML document");

    let mut current: Vec<&Element> = vec![&document.html_element];
    for step in query.split('\0') {
        if let Some(selector) = step.strip_prefix("single ") {
            let selector = Selector::from_string(selector.trim());
            current = current
                .into_iter()
                .flat_map(|element| query_selector(element, &selector))
                .collect();
        } else if let Some(selector) = step.strip_prefix("all ") {
            let selector = Selector::from_string(selector.trim());
            current = current
                .into_iter()
                .flat_map(|element| query_selector_all(element, &selector))
                .collect();
        } else if let Some(expected_key) = step.strip_prefix("attribute ") {
            let mut buf = String::new();
            // Tracked separately from `buf.is_empty()`: an empty first value
            // (e.g. a boolean attribute) must still be followed by a separator.
            let mut wrote_value = false;
            for element in current {
                let value = element
                    .attributes
                    .iter()
                    .find_map(|Attribute { key, value }| (key == expected_key).then_some(value));
                if let Some(value) = value {
                    if wrote_value {
                        buf.push('\0');
                    }
                    buf.push_str(value);
                    wrote_value = true;
                }
            }
            return buf;
        } else if step == "text" {
            let mut buf = String::new();
            for (index, element) in current.into_iter().enumerate() {
                // Separate by position, not by buffer content, so empty texts
                // keep their slot in the output.
                if index > 0 {
                    buf.push('\0');
                }
                buf.push_str(&inner_text(element));
            }
            return buf;
        }
    }
    panic!("no end query")
}