extern crate quick_xml;
extern crate memchr;
use quick_xml::events::{Event, BytesEnd, BytesText, BytesStart};
use quick_xml::{Error, Reader};
use std::collections::LinkedList;
use memchr::{memchr_iter};
use std::sync::{Arc};
use std::ops::{Deref, DerefMut};
/// Atomically reference-counted handle to a `Node`.
type SharedNode = Arc<Node>;
/// Ordered list of a node's children, stored as a `Vec<NodeAccess>`.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct Children(Vec<NodeAccess>);
/// A child node that is stored either by value or behind an `Arc`.
#[derive(Debug, Clone)]
pub enum NodeAccess {
/// The node is stored inline, by value.
Owned(Node),
/// The node is stored behind a reference-counted `SharedNode`.
Sharable(SharedNode),
}
/// Selects which `NodeAccess` variant is used when children are created.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ChildrenType {
/// Children are wrapped as `NodeAccess::Owned`.
Owned,
/// Children are wrapped as `NodeAccess::Sharable`.
Sharable,
}
/// A node of the parsed tree: an element, a piece of text, or a tag-less
/// container that only holds children.
#[derive(Clone, Debug, PartialEq, Default)]
pub struct Node {
// Opening tag; `None` for text nodes and for tag-less container nodes.
start: Option<OpeningTag>,
// Text stored directly on this node.
text: Option<String>,
// Name of the closing tag, if one was recorded for this node.
end: Option<String>,
// Child nodes, in document order.
children: Children,
}
/// The opening tag of an element: its name, attributes and whether it is
/// self-closing.
#[derive(Clone, Debug, PartialEq)]
pub struct OpeningTag {
// `true` when the tag is self-closing (serialized with a trailing `/`).
empty: bool,
// Tag name.
name: String,
// Attributes in the order they were stored.
attrs: Vec<Attribute>,
}
/// A tag attribute whose value is kept as a list of whitespace-separated
/// tokens.
#[derive(Clone, Debug, PartialEq)]
pub struct Attribute {
// Attribute name (key).
name: String,
// Individual value tokens; they are joined with single spaces when the
// attribute is written out.
values: Vec<String>,
}
/// Options that control how HTML is turned into a `Node` tree.
#[derive(Clone, PartialEq, Debug)]
pub struct LoadSettings {
// When `true`, text that directly follows an opening tag is always stored
// as a separate text child instead of in the element's own `text` field.
all_text_separately: bool,
// Which `NodeAccess` variant newly created children are wrapped in.
children_type: ChildrenType,
}
/// Builder describing a search over the descendants of a node.
#[derive(Clone, Copy, Debug)]
pub struct ChildrenFetch<'a> {
// Node whose descendants are searched.
node: &'a Node,
// Required tag name, if any.
tag: Option<&'a str>,
// Required attribute name, if any.
key: Option<&'a str>,
// Required full attribute value (compared against the space-joined tokens).
value: Option<&'a str>,
// Required single value token; only consulted when `value` is not set.
value_part: Option<&'a str>,
}
/// Variant of `ChildrenFetch` that can also hand out mutable references to
/// the nodes it finds.
#[derive(Clone, Copy, Debug)]
pub struct ChildrenFetchMut<'a> {
// The shared-reference search this wrapper delegates to.
inner: ChildrenFetch<'a>,
}
// Consuming iteration over the children, in stored order.
impl IntoIterator for Children {
type Item = NodeAccess;
type IntoIter = std::vec::IntoIter<Self::Item>;
/// Consumes the collection and yields each `NodeAccess` by value.
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}
// Gives read access to the underlying vector (`len`, `get`, `iter`, ...).
impl Deref for Children {
type Target = Vec<NodeAccess>;
/// Returns the wrapped `Vec<NodeAccess>`.
fn deref(&self) -> &Self::Target {
&self.0
}
}
// Gives mutable access to the underlying vector (`push`, `get_mut`, ...).
impl DerefMut for Children {
/// Returns the wrapped `Vec<NodeAccess>` mutably.
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl Children {
fn iter_to_owned<T: IntoIterator<Item = Node>>(iter: T, capacity: usize) -> Children {
let mut arr = Vec::with_capacity(capacity);
for child in iter {
arr.push(NodeAccess::new_owned(child));
}
Children(arr)
}
fn iter_to_shared<T: IntoIterator<Item = Node>>(iter: T, capacity: usize) -> Children {
let mut arr = Vec::with_capacity(capacity);
for child in iter {
arr.push(NodeAccess::new_shared(child));
}
Children(arr)
}
fn iter_to<T: IntoIterator<Item = Node>>(children_type: &ChildrenType, iter: T, capacity: usize)
-> Children {
use ChildrenType::*;
match children_type {
Owned => Children::iter_to_owned(iter, capacity),
Sharable => Children::iter_to_shared(iter, capacity),
}
}
pub fn to_all_sharable(&self) -> Self {
let children = &self.0;
let mut vec = Vec::with_capacity(children.len());
for child in children {
let mut child = child.to_owned();
let children = child.children.to_all_sharable();
*child.children = children.0;
let child = NodeAccess::new_shared(child);
vec.push(child);
}
Children(vec)
}
pub fn to_all_owned(&self) -> Self {
let children = &self.0;
let mut vec = Vec::with_capacity(children.len());
for child in children {
let mut child = child.to_owned();
let children = child.children.to_all_owned();
*child.children = children.0;
vec.push(child.into());
}
Children(vec)
}
}
impl PartialEq for NodeAccess {
fn eq(&self, other: &NodeAccess) -> bool {
use std::mem::discriminant;
if discriminant(self) != discriminant(other) {
return false;
}
use NodeAccess::*;
match self {
Owned(node) => {
if let Owned(other) = other {
node == other
} else {
unreachable!()
}
},
Sharable(node) => {
if let Sharable(other) = other {
Arc::ptr_eq(node, other)
} else {
unreachable!()
}
},
}
}
}
impl Deref for NodeAccess {
type Target = Node;
fn deref(&self) -> &Node {
use NodeAccess::*;
match self {
Owned(n) => n,
Sharable(n) => n
}
}
}
impl NodeAccess {
fn new_owned(node: Node) -> NodeAccess {
NodeAccess::Owned(node)
}
fn new_shared(node: Node) -> NodeAccess {
let arc = Arc::new(node);
NodeAccess::Sharable(arc)
}
pub fn try_mut(&mut self) -> Option<&mut Node> {
if let NodeAccess::Owned(n) = self {
Some(n)
} else if let NodeAccess::Sharable(n) = self {
Arc::get_mut(n)
} else {
unreachable!()
}
}
pub fn to_sharable(&self) -> SharedNode {
use NodeAccess::*;
match self {
Owned(n) => Arc::new(n.clone()),
Sharable(n) => n.clone()
}
}
pub fn to_owned(&self) -> Node {
use NodeAccess::*;
match self {
Owned(n) => n.clone(),
Sharable(n) => n.as_ref().clone(),
}
}
pub fn wrap_to_root(self) -> Result<Self, Self> {
use NodeAccess::*;
if self.is_root() {
return Err(self);
}
match self {
Owned(n) => Ok(Owned(Node::wrap_to_root(n).unwrap())),
Sharable(n) => Ok(Sharable(
Arc::new(Node::wrap_to_root(n.as_ref().to_owned()).unwrap())
))
}
}
}
// A plain node converts into an owned access.
impl From<Node> for NodeAccess {
/// Wraps `node` as `NodeAccess::Owned`.
fn from(node: Node) -> Self {
NodeAccess::Owned(node)
}
}
// A shared handle converts into a sharable access.
impl From<SharedNode> for NodeAccess {
/// Wraps `sn` as `NodeAccess::Sharable` without cloning the node.
fn from(sn: SharedNode) -> Self {
NodeAccess::Sharable(sn)
}
}
impl Node {
/// Creates an empty node: no opening tag, no text, no closing tag and no
/// children.
pub fn new() -> Self {
Default::default()
}
pub fn from_html(html: &str, settings: &LoadSettings) -> Result<Option<Node>, Error> {
let events = Self::collect_events(html);
let children = {
let mut nodes = LinkedList::new();
let mut iter = events.iter();
loop {
let node = Self::next_node(&mut iter, settings);
if node.is_none() {
break;
}
nodes.push_back(node.unwrap());
}
let len = nodes.len();
Children::iter_to(&settings.children_type, nodes.into_iter(), len)
};
if children.is_empty() {
Ok(None)
} else {
Ok(Some(Node {
children,
start: None,
end: None,
text: None,
}))
}
}
fn collect_events(html: &str) -> LinkedList<Event> {
use Event::*;
let mut reader = Reader::from_str(html);
let mut buf = Vec::new();
let mut list = LinkedList::new();
reader.check_end_names(false);
loop {
let event
= Self::process_next_event(reader.read_event(&mut buf));
if event.is_err() {
break;
}
let event = event.unwrap();
if event.is_some() {
list.push_back(event.unwrap());
}
}
let fixed_list = {
let trim_start = |s: String| {
if s.is_empty() {
return s;
}
let mut iter = s.chars();
let first = iter.next().unwrap();
if first == '\n' {
String::from(s.trim_start())
} else if first == '\t' || first == ' ' {
while let Some(ch) = iter.next() {
if ch != '\t' && ch != ' ' && ch != '\n' {
return s;
}
}
String::from(s.trim_start())
} else {
s
}
};
let trim_end = |s: String| {
let bytes = s.as_bytes();
let mut memchr = memchr_iter('\n' as _, bytes);
if let Some(_) = memchr.next() {
String::from(s.trim_end())
} else {
s
}
};
let mut fixed_list = LinkedList::new();
for i in list {
if let Text(e) = i {
let text = std::str::from_utf8(e.escaped()).unwrap();
let text = String::from(text);
let s = trim_start(text);
let s = trim_end(s);
if !s.is_empty() {
let content = Vec::from(s.as_bytes());
let new = Text(BytesText::from_plain(&content)).into_owned();
fixed_list.push_back(new);
}
} else {
fixed_list.push_back(i);
}
}
fixed_list
};
fixed_list
}
fn process_next_event(event: quick_xml::Result<Event>) -> Result<Option<Event<'static>>, ()> {
use Event::*;
if event.is_err() {
return Err(());
}
let event: Event = event.unwrap();
match event {
Start(e) => {
let vec = e.to_vec();
let e = BytesStart::borrowed(
&vec, e.name().len()
).into_owned();
Ok(Some(Start(e)))
},
End(e) => {
let vec = e.to_vec();
let e = BytesEnd::borrowed(&vec).into_owned();
Ok(Some(End(e)))
},
Empty(e) => {
let vec = e.to_vec();
let e = BytesStart::borrowed(
&vec, e.name().len()
).into_owned();
Ok(Some(Empty(e)))
},
Text(e) => {
let vec = e.to_vec();
let e = BytesText::from_plain(&vec).into_owned();
Ok(Some(Text(e)))
},
DocType(_) => Ok(None),
Eof => Err(()),
_ => Err(()),
}
}
#[allow(unused_assignments)]
fn next_node(
iter: &mut std::collections::linked_list::Iter<Event>,
settings: &LoadSettings) -> Option<Node> {
use Event::*;
let mut biter = iter.clone();
let peek = biter.next();
if peek.is_none() {
return None;
}
let peek = peek.unwrap();
match peek {
Start(e) => {
iter.next();
let start = Some({
let name = String::from(unsafe {
std::str::from_utf8_unchecked(
&*e.name()).split_whitespace().next().unwrap()
});
let mut attrs = LinkedList::new();
for attr in e.attributes() {
if let Err(_) = attr {
continue;
}
let attr = attr.unwrap();
let name = String::from(unsafe {
std::str::from_utf8_unchecked(attr.key)
});
let attr = Attribute::from_name_and_str_values(
name,
unsafe { std::str::from_utf8_unchecked(&*attr.value) }
);
attrs.push_back(attr);
}
let mut attrsvec = Vec::with_capacity(attrs.len());
for attr in attrs {
attrsvec.push(attr);
}
OpeningTag {
empty: false,
name,
attrs: attrsvec
}
});
let mut text = {
let peek = biter.next();
if let Some(peek) = peek {
match peek {
Text(e) => {
iter.next();
let s = unsafe { std::str::from_utf8_unchecked(e) };
Some(String::from(s))
}
_ => {
biter = iter.clone();
None
}
}
} else {
biter = iter.clone();
None
}
};
let children = {
let mut children = LinkedList::new();
loop {
let child = Self::next_node(iter, settings);
if let Some(child) = child {
children.push_back(child);
} else {
break;
}
}
biter = iter.clone();
if text.is_some() {
if !children.is_empty() || settings.all_text_separately {
children.push_front(Node {
start: None,
end: None,
text,
children: Default::default(),
});
text = None;
}
}
let len = children.len();
Children::iter_to(
&settings.children_type,
children,
len
)
};
let end = {
if start.is_some() {
let peek = biter.next();
if peek.is_none() {
None
} else {
match peek.unwrap() {
End(e) => {
if e.name() == start.as_ref().unwrap().name().as_bytes() {
iter.next();
let s = unsafe {
std::str::from_utf8_unchecked(e.name())
};
Some(String::from(s))
} else {
biter = iter.clone();
None
}
},
_ => {
biter = iter.clone();
None
}
}
}
} else {
None
}
};
let e = Some(Node {
start,
end,
text,
children,
});
e
},
Text(e) => {
iter.next();
Some(Node {
start: None,
end: None,
children: Default::default(),
text: Some(
String::from(unsafe { std::str::from_utf8_unchecked(&*e) })
),
})
},
Empty(e) => {
iter.next();
let start = Some({
let name = e.name();
let name = String::from(unsafe {
std::str::from_utf8_unchecked(&*name)
.split_whitespace().next().unwrap()
});
OpeningTag {
empty: true,
name,
attrs: Default::default(),
}
});
Some(Node {
start,
end: None,
text: None,
children: Default::default(),
})
},
_ => None
}
}
/// Parses `html` and returns only the first node that can be built from it,
/// or `None` when there is none.
pub fn from_html_first(html: &str, settings: &LoadSettings) -> Option<Self> {
    let events = Self::collect_events(html);
    let mut cursor = events.iter();
    let first = Self::next_node(&mut cursor, settings);
    first
}
/// Returns the opening tag, or `None` when the node has none.
pub fn start(&self) -> &Option<OpeningTag> {
&self.start
}
pub fn end(&self) -> Option<&str> {
if let Some(ref end) = self.end {
Some(end)
} else {
None
}
}
pub fn text(&self) -> Option<&str> {
if let Some(ref s) = self.text {
Some(s)
} else {
None
}
}
/// Returns the node's children.
pub fn children(&self) -> &Children {
&self.children
}
pub fn tag_name(&self) -> Option<&str> {
if let Some(ref start) = self.start {
Some(&start.name)
} else {
None
}
}
pub fn attributes(&self) -> Option<&Vec<Attribute>> {
if let Some(ref start) = self.start {
Some(&start.attrs)
} else {
None
}
}
pub fn attribute_by_name(&self, key: &str) -> Option<&Attribute> {
if let Some(ref start) = self.start {
for attr in start.attributes() {
if attr.name() == key {
return Some(attr);
}
}
}
None
}
/// Adds `attr` unless an attribute with the same name already exists, in
/// which case `attr` is handed back as `Err`.
///
/// NOTE(review): for a node without an opening tag this returns `Ok(())`
/// although `overwrite_attribute` stores nothing in that case - confirm
/// whether callers rely on that.
pub fn put_attribute(&mut self, attr: Attribute) -> Result<(), Attribute> {
    let already_present = self.attribute_by_name(&attr.name).is_some();
    if already_present {
        return Err(attr);
    }
    self.overwrite_attribute(attr);
    Ok(())
}
/// Stores `attr` on the opening tag.
///
/// If an attribute with the same name exists, the values of the first such
/// attribute are replaced; otherwise `attr` is appended. Does nothing when
/// the node has no opening tag.
pub fn overwrite_attribute(&mut self, attr: Attribute) {
    let start = match self.start.as_mut() {
        Some(start) => start,
        None => return,
    };
    match start.attrs.iter().position(|existing| existing.name == attr.name) {
        Some(index) => start.attrs[index].values = attr.values,
        None => start.attrs.push(attr),
    }
}
/// Starts a search over this node's descendants with no criteria set.
pub fn children_fetch(&self) -> ChildrenFetch {
ChildrenFetch::for_node(self)
}
/// Starts a search over this node's descendants whose results can be
/// obtained as mutable references.
pub fn children_fetch_mut(&mut self) -> ChildrenFetchMut {
ChildrenFetchMut::for_node(self)
}
pub fn to_string(&self) -> String {
let mut s = String::new();
if let Some(name) = self.tag_name() {
s += "<";
s += &name;
let attrs = &self.start.as_ref().unwrap().attrs;
for attr in attrs {
s += " ";
s += &attr.name;
s += "=\"";
s += &attr.values_to_string();
s += "\"";
}
if self.start.as_ref().unwrap().is_self_closing() {
s += "/";
}
s += ">";
}
if let Some(ref text) = self.text {
s += text;
}
for child in self.children.iter() {
s += &child.to_string();
}
if let Some(ref end) = self.end {
s += "</";
s += end;
s += ">";
}
s.shrink_to_fit();
s
}
/// Renames both the opening and the closing tag to `name`; each one is only
/// changed if it is present.
pub fn change_name(&mut self, name: &str) {
self.change_opening_name(name);
self.change_closing_name(name);
}
pub fn change_opening_name(&mut self, name: &str) {
if let Some(ref mut start) = self.start {
start.name = String::from(name);
}
}
/// Renames the closing tag to `name`; does nothing when the node has no
/// closing tag.
pub fn change_closing_name(&mut self, name: &str) {
    if let Some(end) = self.end.as_mut() {
        *end = name.to_owned();
    }
}
/// Returns the node's children for modification.
pub fn children_mut(&mut self) -> &mut Children {
&mut self.children
}
pub fn clone_without_children(&self) -> Self {
Node {
start: self.start.clone(),
end: self.end.clone(),
text: self.text.clone(),
children: Default::default(),
}
}
pub fn wrap_to_root(self) -> Result<Self, Self> {
if self.start.is_none() && self.text.is_none() {
return Err(self);
}
let mut root = Node::new();
root.children = Children(vec![NodeAccess::Owned(self)]);
Ok(root)
}
/// Returns `true` when the node has neither an opening tag nor text, i.e.
/// it only serves as a container for its children.
///
/// This is the exact condition under which `wrap_to_root` refuses to wrap a
/// node. (The text check used to be written twice; the duplicate is gone,
/// the result is unchanged.)
pub fn is_root(&self) -> bool {
    self.start.is_none() && self.text.is_none()
}
}
impl<'a> ChildrenFetch<'a> {
pub fn for_node(node: &'a Node) -> Self {
ChildrenFetch {
node,
tag: None,
key: None,
value: None,
value_part: None,
}
}
pub fn same_for_node(&self, node: &'a Node) -> Self {
let mut new = self.clone();
new.node = node;
new
}
pub fn tag(mut self, tag: &'a str) -> Self {
self.tag = Some(tag);
self
}
pub fn set_tag(&mut self, tag: &'a str) {
self.tag = Some(tag);
}
pub fn key(mut self, key: &'a str) -> Self {
self.key = Some(key);
self
}
pub fn set_key(&mut self, key: &'a str) {
self.key = Some(key);
}
pub fn value(mut self, value: &'a str) -> Self {
self.value = Some(value);
self
}
pub fn set_value(&mut self, value: &'a str) {
self.value = Some(value);
}
pub fn value_part(mut self, part: &'a str) -> Self {
self.value_part = Some(part);
self
}
pub fn set_value_part(&mut self, part: &'a str) {
self.value_part = Some(part);
}
pub fn fetch(self) -> LinkedList<&'a NodeAccess> {
fn sub(criteria: ChildrenFetch) -> LinkedList<&NodeAccess> {
let mut list = LinkedList::new();
for child in criteria.node.children.iter() {
if let Some(tag) = criteria.tag {
if child.tag_name().unwrap_or("") != tag {
continue;
}
}
let mut check_value_criteria = |attr: &Attribute| {
if let Some(value) = criteria.value {
if attr.values_to_string() == value {
list.push_back(child);
}
} else if let Some(part) = criteria.value_part {
let iter = attr.values().iter();
for i in iter {
if i == part {
list.push_back(child);
break;
}
}
} else {
list.push_back(child);
}
};
if let Some(key) = criteria.key {
if let Some(attr) = child.attribute_by_name(key) {
check_value_criteria(attr)
}
} else {
if let Some(attrs) = child.attributes() {
for attr in attrs {
check_value_criteria(attr)
}
}
}
let new_fetch = criteria.same_for_node(&child);
let mut nodes = sub(new_fetch);
list.append(&mut nodes);
}
list
}
sub(self)
}
}
impl<'a> ChildrenFetchMut<'a> {
pub fn for_node(node: &'a Node) -> Self {
let inner = ChildrenFetch {
node,
tag: None,
key: None,
value: None,
value_part: None,
};
ChildrenFetchMut { inner }
}
pub fn fetch_mut(self) -> LinkedList<&'a mut NodeAccess> {
let fetch = self.fetch();
let mut result = LinkedList::new();
for i in fetch {
let a = i as *const NodeAccess as *mut NodeAccess;
let a = unsafe { &mut *a };
result.push_back(a);
}
result
}
pub fn fetch(self) -> LinkedList<&'a NodeAccess> {
self.inner.fetch()
}
pub fn same_for_node(&self, node: &'a Node) -> Self {
ChildrenFetchMut { inner: self.inner.same_for_node(node) }
}
pub fn key(self, key: &'a str) -> Self {
let inner = self.inner.key(key);
ChildrenFetchMut { inner }
}
pub fn value(self, value: &'a str) -> Self {
let inner = self.inner.value(value);
ChildrenFetchMut { inner }
}
pub fn value_part(self, part: &'a str) -> Self {
let inner = self.inner.value_part(part);
ChildrenFetchMut { inner }
}
}
impl OpeningTag {
/// Returns the tag name.
pub fn name(&self) -> &str {
&self.name
}
/// Returns the tag's attributes.
pub fn attributes(&self) -> &Vec<Attribute> {
&self.attrs
}
/// Returns `true` when the tag is self-closing.
pub fn is_self_closing(&self) -> bool {
self.empty
}
}
impl Attribute {
pub fn from_name_and_str_values(name: String, values: &str) -> Self {
let values = {
let mut list = LinkedList::new();
for val in values.split_whitespace() {
list.push_back(String::from(val));
}
let mut vec = Vec::with_capacity(list.len());
for val in list {
vec.push(val);
}
vec
};
Attribute {
name,
values
}
}
pub fn from_name_and_values(name: String, values: Vec<String>) -> Option<Self> {
Some(Attribute {
name,
values
})
}
pub fn name(&self) -> &str {
&self.name
}
pub fn values(&self) -> &Vec<String> {
&self.values
}
pub fn values_to_string(&self) -> String {
let len = {
let mut l = 0;
for val in &self.values {
l += val.len() + 1;
}
if l == 0 {
return String::new();
}
l - 1
};
let mut s = String::with_capacity(len);
let mut i = 0;
while i < self.values.len() {
s += self.values.get(i).unwrap();
i += 1;
if i < self.values.len() {
s += " ";
}
}
s
}
pub fn first_value(&self) -> &String {
self.values.get(0).unwrap()
}
pub fn set_name(&mut self, name: String) {
self.name = name;
}
pub fn set_values(&mut self, values: Vec<String>) -> Result<(), ()> {
for s in &values {
if s.split_whitespace().count() > 1 {
return Err(());
}
}
self.values = values;
Ok(())
}
pub fn set_values_from_str(&mut self, values: &str) -> Result<(), ()> {
let split = values.split_whitespace();
let vec: Vec<&str> = split.collect();
let mut new_vec = Vec::with_capacity(vec.len());
for i in vec {
new_vec.push(i.to_string());
}
self.set_values(new_vec)
}
}
impl Default for LoadSettings {
/// Default settings: text is always stored as separate text children and
/// children are created as `NodeAccess::Owned`.
fn default() -> Self {
LoadSettings {
all_text_separately: true,
children_type: ChildrenType::Owned,
}
}
}
impl LoadSettings {
pub fn new() -> Self {
Default::default()
}
pub fn all_text_separately(mut self, b: bool) -> Self {
self.set_all_text_separately(b);
self
}
pub fn set_all_text_separately(&mut self, b: bool) {
self.all_text_separately = b;
}
pub fn owned_children(mut self) -> Self {
self.children_type = ChildrenType::Owned;
self
}
pub fn sharable_children(mut self) -> Self {
self.children_type = ChildrenType::Sharable;
self
}
}
// Unit tests covering HTML loading, serialization and attribute editing.
#[cfg(test)]
mod tests {
use super::*;
// Loads a small document and checks the shape of the resulting tree:
// a <p> with a text child and an <img> child, an <a> with text, and a <br>.
#[test]
fn from_html() {
let html = r#"
<p>Some text
<img src="a">
</p>
<a>Link</a>
<br />
"#;
let result = Node::from_html(html, &Default::default());
let result = result.unwrap();
let root = result.unwrap();
let node = root.children().get(0).unwrap();
let start = node.start().as_ref();
let name = start.unwrap().name();
assert_eq!("p", name);
let text = root.children().get(0).unwrap().children();
let text = text.get(0).unwrap().text();
assert_eq!("Some text", text.unwrap());
let child = root.children().get(0).unwrap().children().get(1).unwrap();
let child_name = child.tag_name();
assert_eq!("img", child_name.unwrap());
let child = root.children().get(1).unwrap();
assert_eq!(child.tag_name().unwrap(), "a");
assert_eq!("Link", child.children().get(0).unwrap().text().unwrap());
let node = root.children().get(2).unwrap();
assert_eq!("br", node.tag_name().unwrap());
}
// With `all_text_separately(true)` the text of <p> is stored as a child node.
#[test]
fn from_html_separate_text() {
let html = r#"
<p>Text</p>
"#;
let load = Node::from_html(html, &LoadSettings::new()
.all_text_separately(true));
let load = load.unwrap().unwrap();
let child = load.children().get(0).unwrap().children().get(0).unwrap();
assert_eq!(child.text().unwrap(), "Text");
}
// Whitespace-only input produces no tree at all.
#[test]
fn from_html_empty() {
let html = " ";
let result = Node::from_html(html, &Default::default());
assert!(result.unwrap().is_none());
}
// Leading whitespace after a newline is trimmed; trailing spaces on a text
// without a newline are kept.
#[test]
fn from_html_with_spaces() {
let html = " <p>\n Some </p>";
let result = Node::from_html(html, &Default::default());
let result = result.unwrap().unwrap();
let first = result.children().get(0).unwrap();
assert_eq!(first.tag_name().unwrap(), "p");
assert_eq!("Some ", first.children().get(0).unwrap().text().unwrap());
}
// Serializing a loaded tree reproduces the original markup.
#[test]
fn node_to_html() {
let html = "<p><i>Text</i><br></p>";
let result = Node::from_html(html, &Default::default());
let result = result.unwrap().unwrap();
let new_html = result.to_string();
assert_eq!(html, &new_html);
}
// Replacing the values of an existing attribute shows up in the output.
#[test]
fn overwrite_attribute() {
let html = "<a href='a'>";
let result = Node::from_html(html, &Default::default());
let mut result = result.unwrap().unwrap();
let node = result.children_mut().get_mut(0).unwrap();
let mut attr = node.attribute_by_name("href").unwrap().clone();
attr.set_values(vec![String::from("b")]).unwrap();
node.try_mut().unwrap().overwrite_attribute(attr);
let html = result.to_string();
assert_eq!("<a href=\"b\">", &html);
}
}