htmldom_read/
lib.rs

1//!
2//! # Examples
3//!
4//! To load nodes from HTML.
5//! ```
6//! # use htmldom_read::Node;
7//! let html = r#"
8//!     <div><p>Text</p></div>
9//! "#;
10//! // Load with default settings.
11//! let nodes = Node::from_html(html, &Default::default()).unwrap().unwrap();
12//! let first_node = nodes.children().get(0).unwrap();
13//! // First node is <div>
14//! assert_eq!("div", first_node.tag_name().unwrap());
15//!
16//! let children = first_node.children();
17//!
18//! // First child of <div> is <p>
19//! let first_child = children.get(0).unwrap();
20//! assert_eq!("p", first_child.tag_name().unwrap());
//! // The child of <p> is Text
22//! assert_eq!("Text", first_child.children().get(0).unwrap().text().unwrap());
23//! ```
24//!
//! Load node with text mixed with children. Text that is not mixed with children is stored inside
//! the parent node and not as a separate child.
27//! ```
28//! # use htmldom_read::{Node, LoadSettings};
29//! let html = r#"
30//!     <p>Text <sup>child</sup> more text</p>
31//! "#;
32//! let settings = LoadSettings::new().all_text_separately(false);
33//!
34//! let from = Node::from_html(html, &settings).unwrap().unwrap();
35//! let node = from.children().get(0).unwrap();
36//! let children = node.children();
37//!
38//! let first_text = children.get(0).unwrap();
39//! assert_eq!("Text ", first_text.text().unwrap());
40//!
41//! let sup = children.get(1).unwrap();
42//! assert_eq!("child", sup.text().unwrap());
43//!
44//! let last_text = children.get(2).unwrap();
45//! assert_eq!(" more text", last_text.text().unwrap());
46//! ```
47
48extern crate quick_xml;
49extern crate memchr;
50
51use quick_xml::events::{Event, BytesEnd, BytesText, BytesStart};
52use quick_xml::{Error, Reader};
53use std::collections::LinkedList;
54use memchr::{memchr_iter};
55use std::sync::{Arc};
56use std::ops::{Deref, DerefMut};
57
/// Reference-counted handle to a `Node`; the payload of `NodeAccess::Sharable`.
type SharedNode = Arc<Node>;
59
/// Children of the node. All tags that are inside of the parent node are listed in this struct.
///
/// This is a newtype around `Vec<NodeAccess>`; each element is either an owned or a sharable
/// node.
#[derive(Default, Clone, Debug, PartialEq)]
pub struct Children(Vec<NodeAccess>);
63
/// How a node is stored and accessed.
#[derive(Debug, Clone)]
pub enum NodeAccess {
    /// The node is stored by value.
    Owned(Node),
    /// The node is stored behind an `Arc`.
    Sharable(SharedNode),
}
70
/// How children are stored in the node.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ChildrenType {
    /// Children are wrapped in `NodeAccess::Owned`.
    Owned,
    /// Children are wrapped in `NodeAccess::Sharable`.
    Sharable,
}
77
/// Contains information about opening and corresponding closing tags. It also can
/// contain the value of the text between opening and closing tags if there are no children.
/// Otherwise, if there are children mixed with text then each text chunk is separated in
/// its own node with other children in the order they appear in the code.
#[derive(Clone, Debug, PartialEq, Default)]
pub struct Node {
    /// Start of the tag if any. It is `None` for text-only nodes and for the root node.
    start: Option<OpeningTag>,

    /// Text value if there is a text between opening and closing tags.
    text: Option<String>,

    /// Name of the closing tag if any.
    end: Option<String>,

    /// Direct children of this node. Does not include children of children nodes.
    children: Children,
}
97
/// Information carried in the opening tag.
#[derive(Clone, Debug, PartialEq)]
pub struct OpeningTag {
    /// Whether this tag is self-closing.
    empty: bool, // Whether this tag is self-closing.
    /// Name of the tag.
    name: String,
    /// Attributes listed in the tag.
    attrs: Vec<Attribute>,
}
105
/// Attribute of the tag.
#[derive(Clone, Debug, PartialEq)]
pub struct Attribute {
    /// Name (key) of the attribute.
    name: String,
    /// Values of the attribute.
    // NOTE(review): presumably the whitespace-separated parts of the attribute value — confirm
    // against `from_name_and_str_values`.
    values: Vec<String>,
}
112
/// Settings that provide different options of how to parse HTML.
#[derive(Clone, PartialEq, Debug)]
pub struct LoadSettings {

    /// When `true`, text that directly follows an opening tag is always stored as a separate
    /// text-only child node. When `false`, it is kept in the parent's text unless the tag also
    /// has other children.
    all_text_separately: bool,
    /// Storage mode used for the children of every loaded node.
    children_type: ChildrenType,
}
120
/// Settings to fetch children nodes that match the given criteria.
///
/// # Examples
/// ```
/// # use htmldom_read::{ChildrenFetch, Node};
/// let html = r#"
/// <div id="mydiv">
///     <p class="someclass">Text</p>
/// </div>
/// <a class="someclass else">link</a>
/// "#;
///
/// // Create node tree for HTML code.
/// let node = Node::from_html(html, &Default::default()).unwrap().unwrap();
///
/// // Create criteria. Find all `div` nodes with `id='mydiv'`.
/// let fetch = node.children_fetch()
///         .tag("div")
///         .key("id")
///         .value("mydiv");
///
/// // Search for all children that apply to criteria.
/// let result = fetch.fetch();
/// // Returns the first node: `<div id='mydiv'>`.
/// assert_eq!(result.iter().nth(0).unwrap(), &node.children().get(0).unwrap());
///
/// // Search for all with class='someclass' allowing it to contain other classes too.
/// let fetch = node.children_fetch()
///         .key("class")
///         .value_part("someclass");
/// let result = fetch.fetch();
/// // Returns the nodes <p> and <a>.
/// assert_eq!(result.iter().nth(0).unwrap(),
///         &node.children().get(0).unwrap().children().get(0).unwrap());
/// assert_eq!(result.iter().nth(1).unwrap(), &node.children().get(1).unwrap());
/// ```
#[derive(Clone, Copy, Debug)]
pub struct ChildrenFetch<'a> {
    /// Node to search in.
    node: &'a Node,

    /// Tag name to search for, if any.
    tag: Option<&'a str>,

    /// Attribute key to search for, if any.
    key: Option<&'a str>,

    /// Exact attribute value to search for, if any.
    value: Option<&'a str>,

    /// If the exact value is not set then this defines a part of the value, separated with
    /// whitespace, to be found.
    value_part: Option<&'a str>,
}
175
/// Mutable `ChildrenFetch`. Allows getting mutable access to the returned nodes.
#[derive(Clone, Copy, Debug)]
pub struct ChildrenFetchMut<'a> {
    /// The search criteria and the node to search in.
    inner: ChildrenFetch<'a>,
}
181
182impl IntoIterator for Children {
183
184    type Item = NodeAccess;
185    type IntoIter = std::vec::IntoIter<Self::Item>;
186
187    fn into_iter(self) -> Self::IntoIter {
188        self.0.into_iter()
189    }
190}
191
192impl Deref for Children {
193
194    type Target = Vec<NodeAccess>;
195
196    fn deref(&self) -> &Self::Target {
197        &self.0
198    }
199}
200
201impl DerefMut for Children {
202
203    fn deref_mut(&mut self) -> &mut Self::Target {
204        &mut self.0
205    }
206}
207
208impl Children {
209
210    fn iter_to_owned<T: IntoIterator<Item = Node>>(iter: T, capacity: usize) -> Children {
211        let mut arr = Vec::with_capacity(capacity);
212        for child in iter {
213            arr.push(NodeAccess::new_owned(child));
214        }
215
216        Children(arr)
217    }
218
219    fn iter_to_shared<T: IntoIterator<Item = Node>>(iter: T, capacity: usize) -> Children {
220        let mut arr = Vec::with_capacity(capacity);
221        for child in iter {
222            arr.push(NodeAccess::new_shared(child));
223        }
224
225        Children(arr)
226    }
227
228     fn iter_to<T: IntoIterator<Item = Node>>(children_type: &ChildrenType, iter: T, capacity: usize)
229            -> Children {
230        use ChildrenType::*;
231        match children_type {
232            Owned       => Children::iter_to_owned(iter, capacity),
233            Sharable => Children::iter_to_shared(iter, capacity),
234        }
235    }
236
237    /// Get sharable children by cloning data. All children and their children will get
238    /// sharable.
239    pub fn to_all_sharable(&self) -> Self {
240        let children = &self.0;
241        let mut vec = Vec::with_capacity(children.len());
242        for child in children {
243            let mut child = child.to_owned();
244            let children = child.children.to_all_sharable();
245            *child.children = children.0;
246
247            let child = NodeAccess::new_shared(child);
248            vec.push(child);
249        }
250
251        Children(vec)
252    }
253
254    /// Get owned children by cloning data. All children and their children will get
255    /// owned.
256    pub fn to_all_owned(&self) -> Self {
257        let children = &self.0;
258        let mut vec = Vec::with_capacity(children.len());
259        for child in children {
260            let mut child = child.to_owned();
261            let children = child.children.to_all_owned();
262            *child.children = children.0;
263
264            vec.push(child.into());
265        }
266
267        Children(vec)
268    }
269}
270
271impl PartialEq for NodeAccess {
272
273    fn eq(&self, other: &NodeAccess) -> bool {
274        use std::mem::discriminant;
275        if discriminant(self) != discriminant(other) {
276            return false;
277        }
278
279        use NodeAccess::*;
280        match self {
281            Owned(node) => {
282                if let Owned(other) = other {
283                    node == other
284                } else {
285                    unreachable!()
286                }
287            },
288            Sharable(node) => {
289                if let Sharable(other) = other {
290                    Arc::ptr_eq(node, other)
291                } else {
292                    unreachable!()
293                }
294            },
295        }
296    }
297}
298
299impl Deref for NodeAccess {
300
301    type Target = Node;
302
303    fn deref(&self) -> &Node {
304        use NodeAccess::*;
305        match self {
306            Owned(n) => n,
307            Sharable(n) => n
308        }
309    }
310}
311
312impl NodeAccess {
313
314    fn new_owned(node: Node) -> NodeAccess {
315        NodeAccess::Owned(node)
316    }
317
318    fn new_shared(node: Node) -> NodeAccess {
319        let arc = Arc::new(node);
320        NodeAccess::Sharable(arc)
321    }
322
323    /// Try to access node mutably. If this node is owned then this is possible. For sharable nodes
324    /// they can be accessed mutable only if they still were not shared.
325    pub fn try_mut(&mut self) -> Option<&mut Node> {
326        if let NodeAccess::Owned(n) = self {
327            Some(n)
328        } else if let NodeAccess::Sharable(n) = self {
329            Arc::get_mut(n)
330        } else {
331            unreachable!()
332        }
333    }
334
335    /// Convert this node to a sharable by cloning.
336    pub fn to_sharable(&self) -> SharedNode {
337        use NodeAccess::*;
338        match self {
339            Owned(n) => Arc::new(n.clone()),
340            Sharable(n) => n.clone()
341        }
342    }
343
344    /// Convert this node to an owned by cloning.
345    pub fn to_owned(&self) -> Node {
346        use NodeAccess::*;
347        match self {
348            Owned(n) => n.clone(),
349            Sharable(n) => n.as_ref().clone(),
350        }
351    }
352
353    /// Wrap this leaf node into root node. See `wrap_to_root` from `Node` for details.
354    pub fn wrap_to_root(self) -> Result<Self, Self> {
355        use NodeAccess::*;
356
357        if self.is_root() {
358            return Err(self);
359        }
360
361        match self {
362            Owned(n) => Ok(Owned(Node::wrap_to_root(n).unwrap())),
363            Sharable(n) => Ok(Sharable(
364                    Arc::new(Node::wrap_to_root(n.as_ref().to_owned()).unwrap())
365                ))
366        }
367    }
368}
369
370impl From<Node> for NodeAccess {
371
372    fn from(node: Node) -> Self {
373        NodeAccess::Owned(node)
374    }
375}
376
377impl From<SharedNode> for NodeAccess {
378
379    fn from(sn: SharedNode) -> Self {
380        NodeAccess::Sharable(sn)
381    }
382}
383
384impl Node {
385
386    /// Create new empty node with no children nor tags.
387    pub fn new() -> Self {
388        Default::default()
389    }
390
391    /// Load node tree from HTML string.
392    ///
393    /// The root node has no start, end or text elements. It does have only children in it.
394    /// When passing empty code, None will be returned.
395    /// If there is an error parsing the HTML, then this function will fail and return the error
396    /// type that occurred.
397    pub fn from_html(html: &str, settings: &LoadSettings) -> Result<Option<Node>, Error> {
398        let events = Self::collect_events(html);
399        let children = {
400            let mut nodes = LinkedList::new();
401            let mut iter = events.iter();
402            loop {
403                let node = Self::next_node(&mut iter, settings);
404                if node.is_none() {
405                    break;
406                }
407                nodes.push_back(node.unwrap());
408            }
409
410            let len = nodes.len();
411            Children::iter_to(&settings.children_type, nodes.into_iter(), len)
412        };
413
414        if children.is_empty() {
415            Ok(None)
416        } else {
417            Ok(Some(Node {
418                children,
419                start: None,
420                end: None,
421                text: None,
422            }))
423        }
424    }
425
426    fn collect_events(html: &str) -> LinkedList<Event> {
427        use Event::*;
428
429        let mut reader = Reader::from_str(html);
430        let mut buf = Vec::new();
431        let mut list = LinkedList::new();
432        reader.check_end_names(false);
433        loop {
434            let event
435                = Self::process_next_event(reader.read_event(&mut buf));
436            if event.is_err() {
437                break;
438            }
439
440            let event = event.unwrap();
441            if event.is_some() {
442                list.push_back(event.unwrap());
443            }
444        }
445
446        // Remove trailing empty text on newlines.
447        let fixed_list = {
448            let trim_start = |s: String| {
449                if s.is_empty() {
450                    return s;
451                }
452
453                let mut iter = s.chars();
454                let first = iter.next().unwrap();
455                if first == '\n' {
456                    String::from(s.trim_start())
457                } else if first == '\t' || first == ' ' {
458                    while let Some(ch) = iter.next() {
459                        if ch != '\t' && ch != ' ' && ch != '\n' {
460                            return s;
461                        }
462                    }
463                    String::from(s.trim_start())
464                } else {
465                    s
466                }
467            };
468            let trim_end = |s: String| {
469                let bytes = s.as_bytes();
470                let mut memchr = memchr_iter('\n' as _, bytes);
471                if let Some(_) = memchr.next() {
472                    String::from(s.trim_end())
473                } else {
474                    s
475                }
476            };
477
478            let mut fixed_list = LinkedList::new();
479            for i in list {
480                if let Text(e) = i {
481                    let text = std::str::from_utf8(e.escaped()).unwrap();
482                    let text = String::from(text);
483                    let s = trim_start(text);
484                    let s = trim_end(s);
485                    if !s.is_empty() {
486                        let content = Vec::from(s.as_bytes());
487                        let new = Text(BytesText::from_plain(&content)).into_owned();
488                        fixed_list.push_back(new);
489                    }
490                } else {
491                    fixed_list.push_back(i);
492                }
493            }
494            fixed_list
495        };
496
497        fixed_list
498    }
499
500    fn process_next_event(event: quick_xml::Result<Event>) -> Result<Option<Event<'static>>, ()> {
501        use Event::*;
502
503        if event.is_err() {
504            return Err(());
505        }
506        let event: Event = event.unwrap();
507
508        match event {
509            Start(e) => {
510                let vec = e.to_vec();
511                let e = BytesStart::borrowed(
512                    &vec, e.name().len()
513                ).into_owned();
514                Ok(Some(Start(e)))
515            },
516            End(e) => {
517                let vec = e.to_vec();
518                let e = BytesEnd::borrowed(&vec).into_owned();
519                Ok(Some(End(e)))
520            },
521            Empty(e) => {
522                let vec = e.to_vec();
523                let e = BytesStart::borrowed(
524                    &vec, e.name().len()
525                ).into_owned();
526                Ok(Some(Empty(e)))
527            },
528            Text(e) => {
529                let vec = e.to_vec();
530                let e = BytesText::from_plain(&vec).into_owned();
531                Ok(Some(Text(e)))
532            },
533            DocType(_) => Ok(None),
534            Eof => Err(()),
535            _ => Err(()),
536        }
537    }
538
539    /// Function to read next node and it's children from event iterator.
540    #[allow(unused_assignments)]
541    fn next_node(
542            iter: &mut std::collections::linked_list::Iter<Event>,
543            settings: &LoadSettings) -> Option<Node> {
544        use Event::*;
545
546        let mut biter = iter.clone();
547        let peek = biter.next();
548        if peek.is_none() {
549            return None;
550        }
551        let peek = peek.unwrap();
552        match peek {
553            Start(e) => {
554                iter.next(); // Confirm reading this event.
555
556                let start = Some({
557                    let name = String::from(unsafe {
558                        std::str::from_utf8_unchecked(
559                            &*e.name()).split_whitespace().next().unwrap()
560                    });
561
562                    let mut attrs = LinkedList::new();
563                    for attr in e.attributes() {
564                        if let Err(_) = attr {
565                            continue;
566                        }
567                        let attr = attr.unwrap();
568
569                        let name = String::from(unsafe {
570                            std::str::from_utf8_unchecked(attr.key)
571                        });
572                        let attr = Attribute::from_name_and_str_values(
573                            name,
574                            unsafe { std::str::from_utf8_unchecked(&*attr.value) }
575                        );
576                        attrs.push_back(attr);
577                    }
578                    let mut attrsvec = Vec::with_capacity(attrs.len());
579                    for attr in attrs {
580                        attrsvec.push(attr);
581                    }
582
583                    OpeningTag {
584                        empty: false,
585                        name,
586                        attrs: attrsvec
587                    }
588                });
589                let mut text = {
590                    let peek = biter.next();
591                    if let Some(peek) = peek {
592                        match peek {
593                            Text(e) => {
594                                iter.next(); // Confirm reading event.
595                                let s = unsafe { std::str::from_utf8_unchecked(e) };
596                                Some(String::from(s))
597                            }
598                            _ => {
599                                biter = iter.clone(); // Revert read.
600                                None
601                            }
602                        }
603                    } else {
604                        biter = iter.clone(); // Revert read.
605                        None
606                    }
607                };
608                let children = {
609                    let mut children = LinkedList::new();
610                    loop {
611                        let child = Self::next_node(iter, settings);
612                        if let Some(child) = child {
613                            children.push_back(child);
614                        } else {
615                            break;
616                        }
617                    }
618                    biter = iter.clone(); // Apply changes of iter.
619
620                    // Check whether to store text in separate node or in the same node.
621                    // Text cannot be mixed with children as this will loose information about
622                    // order of occurrences of children tags and the text values. So
623                    // in this case all texts are saved as nodes on their own in children array.
624                    // We only need to check already read text field as if it is read then it
625                    // precedes any children nodes. All other texts are already on their own
626                    // children nodes because of recursive call of this function.
627                    if text.is_some() {
628                        if !children.is_empty() || settings.all_text_separately {
629                            // Store as separate node as first child as it actually is the first
630                            // thing that was read.
631                            children.push_front(Node {
632                                start: None,
633                                end: None,
634                                text,
635                                children: Default::default(),
636                            });
637                            text = None;
638                        }
639                    }
640
641                    let len = children.len();
642                    Children::iter_to(
643                        &settings.children_type,
644                        children,
645                        len
646                    )
647                };
648                let end = {
649                    if start.is_some() { // Only opening tag can have a closing tag.
650                        let peek = biter.next();
651                        if peek.is_none() {
652                            None
653                        } else {
654                            match peek.unwrap() {
655                                End(e) => {
656                                    // Check if names are same. If not - discard and return None.
657                                    if e.name() == start.as_ref().unwrap().name().as_bytes() {
658                                        iter.next(); // Confirm reading end tag.
659                                        let s = unsafe {
660                                            std::str::from_utf8_unchecked(e.name())
661                                        };
662                                        Some(String::from(s))
663                                    } else {
664                                        biter = iter.clone();
665                                        None
666                                    }
667                                },
668                                _ => {
669                                    biter = iter.clone();
670                                    None
671                                }
672                            }
673                        }
674                    } else {
675                        None
676                    }
677                };
678
679                let e = Some(Node {
680                    start,
681                    end,
682                    text,
683                    children,
684                });
685                e
686            },
687            Text(e) => {
688                iter.next();
689
690                Some(Node {
691                    start: None,
692                    end: None,
693                    children: Default::default(),
694
695                    text: Some(
696                        String::from(unsafe { std::str::from_utf8_unchecked(&*e) })
697                    ),
698                })
699            },
700            Empty(e) => {
701                iter.next();
702
703                let start = Some({
704                    let name = e.name();
705                    let name = String::from(unsafe {
706                        std::str::from_utf8_unchecked(&*name)
707                            .split_whitespace().next().unwrap()
708                    });
709
710                    OpeningTag {
711                        empty: true,
712                        name,
713                        attrs: Default::default(),
714                    }
715                });
716
717                Some(Node {
718                    start,
719                    end: None,
720                    text: None,
721                    children: Default::default(),
722                })
723            },
724            _ => None
725        }
726    }
727
728    /// Load the first node from HTML string without wrapping node to the tree with root (empty
729    /// first node). Just return the exact single node.
730    ///
731    /// # Failures
732    /// None is returned if string does not contain any node (is empty).
733    pub fn from_html_first(html: &str, settings: &LoadSettings) -> Option<Self> {
734        let events = Self::collect_events(html);
735        let mut iter = events.iter();
736        let node = {
737            let mut result;
738            loop {
739                let node = Self::next_node(&mut iter, settings);
740                if node.is_none() {
741                    result = None;
742                    break;
743                } else {
744                    result = node;
745                    break;
746                }
747            }
748            result
749        };
750
751        node
752    }
753
    /// Start tag information, or `None` if this node has no opening tag.
    pub fn start(&self) -> &Option<OpeningTag> {
        &self.start
    }
758
759    /// End tag information.
760    pub fn end(&self) -> Option<&str> {
761        if let Some(ref end) = self.end {
762            Some(end)
763        } else {
764            None
765        }
766    }
767
768    /// Text that appears between opening and closing tags.
769    pub fn text(&self) -> Option<&str> {
770        if let Some(ref s) = self.text {
771            Some(s)
772        } else {
773            None
774        }
775    }
776
    /// Direct children of this node.
    pub fn children(&self) -> &Children {
        &self.children
    }
781
782    /// The name of the tag that is represented by the node.
783    pub fn tag_name(&self) -> Option<&str> {
784        if let Some(ref start) = self.start {
785            Some(&start.name)
786        } else {
787            None
788        }
789    }
790
791    /// Start tag attributes.
792    pub fn attributes(&self) -> Option<&Vec<Attribute>> {
793        if let Some(ref start) = self.start {
794            Some(&start.attrs)
795        } else {
796            None
797        }
798    }
799
800    /// Find attribute by it's name.
801    pub fn attribute_by_name(&self, key: &str) -> Option<&Attribute> {
802        if let Some(ref start) = self.start {
803            for attr in start.attributes() {
804                if attr.name() == key {
805                    return Some(attr);
806                }
807            }
808        }
809        None
810    }
811
812    /// Try saving given attribute in this node.
813    ///
814    /// # Failure
815    /// If this attribute is already present then this function will not change it.
816    /// If you need to overwrite the attribute anyway use [`overwrite_attribute`].
817    pub fn put_attribute(&mut self, attr: Attribute) -> Result<(), Attribute> {
818        if self.attribute_by_name(&attr.name).is_some() {
819            Err(attr)
820        } else {
821            self.overwrite_attribute(attr);
822            Ok(())
823        }
824    }
825
826    /// Save this attribute in the node. If it is already present then overwrite it.
827    pub fn overwrite_attribute(&mut self, attr: Attribute) {
828        if self.start.is_none() {
829            return;
830        }
831
832        // Find the attribute if it is present.
833        let mut i = 0;
834        let attrs = &mut self.start.as_mut().unwrap().attrs;
835        while i < attrs.len() {
836            let this = attrs.get_mut(i).unwrap();
837            if attr.name == this.name {
838                // Found. Overwrite.
839                this.values = attr.values;
840                return;
841            }
842            i += 1;
843        }
844
845        // Attribute was not found. Append new.
846        attrs.push(attr);
847    }
848
    /// Get a children fetcher for this node to find children that match some criteria.
    pub fn children_fetch(&self) -> ChildrenFetch {
        ChildrenFetch::for_node(self)
    }
853
    /// Get a mutable children fetcher (`ChildrenFetchMut`) for this node.
    pub fn children_fetch_mut(&mut self) -> ChildrenFetchMut {
        ChildrenFetchMut::for_node(self)
    }
857
858    /// Convert this node and all it's children into HTML string.
859    pub fn to_string(&self) -> String {
860        let mut s = String::new();
861        if let Some(name) = self.tag_name() {
862            s += "<";
863            s += &name;
864
865            let attrs = &self.start.as_ref().unwrap().attrs;
866            for attr in attrs {
867                s += " ";
868                s += &attr.name;
869                s += "=\"";
870                s += &attr.values_to_string();
871                s += "\"";
872            }
873
874            if self.start.as_ref().unwrap().is_self_closing() {
875                s += "/";
876            }
877
878            s += ">";
879        }
880        if let Some(ref text) = self.text {
881            s += text;
882        }
883
884        for child in self.children.iter() {
885            s += &child.to_string();
886        }
887
888        if let Some(ref end) = self.end {
889            s += "</";
890            s += end;
891            s += ">";
892        }
893
894        s.shrink_to_fit();
895        s
896    }
897
    /// Change the name of the opening and closing tags. A tag that the node does not have is
    /// left absent.
    pub fn change_name(&mut self, name: &str) {
        self.change_opening_name(name);
        self.change_closing_name(name);
    }
903
904    /// Change the name of only opening tag if it exists.
905    pub fn change_opening_name(&mut self, name: &str) {
906        if let Some(ref mut start) = self.start {
907            start.name = String::from(name);
908        }
909    }
910
911    /// Change the name of only closing tag if it exists.
912    pub fn change_closing_name(&mut self, name: &str) {
913        if let Some(ref mut end) = self.end {
914            *end = String::from(name);
915        }
916    }
917
    /// Mutable access to the direct children of this node.
    pub fn children_mut(&mut self) -> &mut Children {
        &mut self.children
    }
922
    /// Clone this node without cloning its children. The opening tag, closing tag and text are
    /// cloned; the new node gets an empty children list.
    pub fn clone_without_children(&self) -> Self {
        Node {
            start: self.start.clone(),
            end: self.end.clone(),
            text: self.text.clone(),
            children: Default::default(),
        }
    }
932
933    /// Try wrapping this node into root. This makes it possible to use this node as individual
934    /// tree.
935    ///
936    /// It is required for trees to start with empty node only with children. Many functions
937    /// rely on this rule. For example, Fetch functions filter data only in children ignoring
938    /// parent attributes and data.
939    ///
940    /// # Failures
941    /// If this node already is root it is returned back in Err.
942    pub fn wrap_to_root(self) -> Result<Self, Self> {
943        if self.start.is_none() && self.text.is_none() {
944            return Err(self);
945        }
946
947        let mut root = Node::new();
948        root.children = Children(vec![NodeAccess::Owned(self)]);
949        Ok(root)
950    }
951
952    /// Check whether this node is the root of the tree.
953    pub fn is_root(&self) -> bool {
954        self.text.is_none() && self.start.is_none() && self.text.is_none()
955    }
956}
957
958impl<'a> ChildrenFetch<'a> {
959
960    /// Get children fetcher for given node to find children that apply to some criteria.
961    pub fn for_node(node: &'a Node) -> Self {
962        ChildrenFetch {
963            node,
964            tag:        None,
965            key:        None,
966            value:      None,
967            value_part: None,
968        }
969    }
970
971    /// Clone the fetcher with already set criteria but for given different node.
972    pub fn same_for_node(&self, node: &'a Node) -> Self {
973        let mut new = self.clone();
974        new.node = node;
975        new
976    }
977
978    /// Tag to search for.
979    pub fn tag(mut self, tag: &'a str) -> Self {
980        self.tag = Some(tag);
981        self
982    }
983
984    pub fn set_tag(&mut self, tag: &'a str) {
985        self.tag = Some(tag);
986    }
987
988    /// Key to search for.
989    pub fn key(mut self, key: &'a str) -> Self {
990        self.key = Some(key);
991        self
992    }
993
994    pub fn set_key(&mut self, key: &'a str) {
995        self.key = Some(key);
996    }
997
998    /// Exact value to search for.
999    pub fn value(mut self, value: &'a str) -> Self {
1000        self.value = Some(value);
1001        self
1002    }
1003
1004    pub fn set_value(&mut self, value: &'a str) {
1005        self.value = Some(value);
1006    }
1007
1008    /// If exact value is not set then this defines a part of the value separated with whitespaces
1009    /// to be found. If `value` is, however, set then this field is ignored entirely.
1010    pub fn value_part(mut self, part: &'a str) -> Self {
1011        self.value_part = Some(part);
1012        self
1013    }
1014
1015    pub fn set_value_part(&mut self, part: &'a str) {
1016        self.value_part = Some(part);
1017    }
1018
1019    /// Get all children and their children that apply to the criteria.
1020    /// This function does not check the parent node!
1021    pub fn fetch(self) -> LinkedList<&'a NodeAccess> {
1022        fn sub(criteria: ChildrenFetch) -> LinkedList<&NodeAccess> {
1023            let mut list = LinkedList::new();
1024
1025            for child in criteria.node.children.iter() {
1026                // Filter on tag if present.
1027                if let Some(tag) = criteria.tag {
1028                    if child.tag_name().unwrap_or("") != tag {
1029                        continue;
1030                    }
1031                }
1032                // Filter value and value_part by criteria. Append filtered values to list.
1033                let mut check_value_criteria = |attr: &Attribute| {
1034                    if let Some(value) = criteria.value {
1035                        if attr.values_to_string() == value {
1036                            list.push_back(child);
1037                        }
1038                    } else if let Some(part) = criteria.value_part {
1039                        let iter = attr.values().iter();
1040                        for i in iter {
1041                            if i == part {
1042                                list.push_back(child);
1043                                break;
1044                            }
1045                        }
1046                    } else {
1047                        // No value expected and finding of a key is enough.
1048                        list.push_back(child);
1049                    }
1050                };
1051
1052                if let Some(key) = criteria.key {
1053                    if let Some(attr) = child.attribute_by_name(key) {
1054                        check_value_criteria(attr)
1055                    }
1056                } else {
1057                    if let Some(attrs) = child.attributes() {
1058                        for attr in attrs {
1059                            check_value_criteria(attr)
1060                        }
1061                    }
1062                }
1063
1064                let new_fetch = criteria.same_for_node(&child);
1065                let mut nodes = sub(new_fetch);
1066                list.append(&mut nodes);
1067            }
1068
1069            list
1070        }
1071
1072        sub(self)
1073    }
1074}
1075
1076impl<'a> ChildrenFetchMut<'a> {
1077
1078    /// Get children fetcher for given node to find children that apply to some criteria.
1079    pub fn for_node(node: &'a Node) -> Self {
1080        let inner = ChildrenFetch {
1081            node,
1082            tag:        None,
1083            key:        None,
1084            value:      None,
1085            value_part: None,
1086        };
1087        ChildrenFetchMut { inner }
1088    }
1089
1090    /// Get all children and their children that apply to the criteria.
1091    pub fn fetch_mut(self) -> LinkedList<&'a mut NodeAccess> {
1092        let fetch = self.fetch();
1093        let mut result = LinkedList::new();
1094        for i in fetch {
1095            let a = i as *const NodeAccess as *mut NodeAccess;
1096            let a = unsafe { &mut *a };
1097            result.push_back(a);
1098        }
1099        result
1100    }
1101
1102    pub fn fetch(self) -> LinkedList<&'a NodeAccess> {
1103        self.inner.fetch()
1104    }
1105
1106    /// Clone the fetcher with already set criteria but for given different node.
1107    pub fn same_for_node(&self, node: &'a Node) -> Self {
1108        ChildrenFetchMut { inner: self.inner.same_for_node(node) }
1109    }
1110
1111    /// Key to search for.
1112    pub fn key(self, key: &'a str) -> Self {
1113        let inner = self.inner.key(key);
1114        ChildrenFetchMut { inner }
1115    }
1116
1117    /// Exact value to search for.
1118    pub fn value(self, value: &'a str) -> Self {
1119        let inner = self.inner.value(value);
1120        ChildrenFetchMut { inner }
1121    }
1122
1123    /// If exact value is not set then this defines a part of the value separated with whitespaces
1124    /// to be found. If `value` is, however, set then this field is ignored entirely.
1125    pub fn value_part(self, part: &'a str) -> Self {
1126        let inner = self.inner.value_part(part);
1127        ChildrenFetchMut { inner }
1128    }
1129}
1130
1131impl OpeningTag {
1132
1133    /// Name of this tag.
1134    pub fn name(&self) -> &str {
1135        &self.name
1136    }
1137
1138    /// Attributes of tag.
1139    pub fn attributes(&self) -> &Vec<Attribute> {
1140        &self.attrs
1141    }
1142
1143    pub fn is_self_closing(&self) -> bool {
1144        self.empty
1145    }
1146}
1147
1148impl Attribute {
1149
1150    /// Create from a name and values passed as single string that are separated by whitespaces.
1151    pub fn from_name_and_str_values(name: String, values: &str) -> Self {
1152        let values = {
1153            let mut list = LinkedList::new();
1154            for val in values.split_whitespace() {
1155                list.push_back(String::from(val));
1156            }
1157
1158            let mut vec = Vec::with_capacity(list.len());
1159            for val in list {
1160                vec.push(val);
1161            }
1162
1163            vec
1164        };
1165
1166        Attribute {
1167            name,
1168            values
1169        }
1170    }
1171
1172    /// Create from a name and values passed as array of strings.
1173    /// They should not contain whitespaces and invalid characters for attributes or names.
1174    pub fn from_name_and_values(name: String, values: Vec<String>) -> Option<Self> {
1175        // TODO check on whitespaces.
1176        Some(Attribute {
1177            name,
1178            values
1179        })
1180    }
1181
1182    /// The name of the attribute.
1183    pub fn name(&self) -> &str {
1184        &self.name
1185    }
1186
1187    /// All values stored in the attribute. Each value separated with whitespace is
1188    /// located in another string in the array. To get values as single string, use
1189    /// [`values_to_string`]
1190    pub fn values(&self) -> &Vec<String> {
1191        &self.values
1192    }
1193
1194    /// Store all values in a string separated with spaces.
1195    pub fn values_to_string(&self) -> String {
1196        // Calculate the length of the string to allocate.
1197        let len = {
1198            let mut l = 0;
1199            for val in &self.values {
1200                l += val.len() + 1; // For space at the end.
1201            }
1202            if l == 0 {
1203                // There are no values - empty string.
1204                return String::new();
1205            }
1206            l - 1 // Remove trailing last space.
1207        };
1208
1209        let mut s = String::with_capacity(len);
1210
1211        let mut i = 0;
1212        while i < self.values.len() {
1213            s += self.values.get(i).unwrap();
1214
1215            i += 1;
1216            // Do not add last (trailing) space.
1217            if i < self.values.len() {
1218                s += " ";
1219            }
1220        }
1221
1222        s
1223    }
1224
1225    /// Get first value of the attribute if any.
1226    ///
1227    /// Usually, when attribute is known to contain single value this function makes it easier
1228    /// to obtain this value. It does not construct new string as `values_to_string` and
1229    /// is shorter than calling `values` and gettings first value manually (but is equivalent).
1230    ///
1231    /// # Panics
1232    /// This function will panic if there are no attribute values.
1233    pub fn first_value(&self) -> &String {
1234        self.values.get(0).unwrap()
1235    }
1236
1237    /// Set new name for attribute.
1238    pub fn set_name(&mut self, name: String) {
1239        self.name = name;
1240    }
1241
1242    /// Set new values for attribute. If any of passed strings have whitespaces then this
1243    /// function will fail.
1244    pub fn set_values(&mut self, values: Vec<String>) -> Result<(), ()> {
1245        // Check strings
1246        for s in &values {
1247            if s.split_whitespace().count() > 1 {
1248                return Err(());
1249            }
1250        }
1251
1252        self.values = values;
1253
1254        Ok(())
1255    }
1256
1257    /// Set values from string.
1258    pub fn set_values_from_str(&mut self, values: &str) -> Result<(), ()> {
1259        let split = values.split_whitespace();
1260        let vec: Vec<&str> = split.collect();
1261        let mut new_vec = Vec::with_capacity(vec.len());
1262        for i in vec {
1263            new_vec.push(i.to_string());
1264        }
1265
1266        self.set_values(new_vec)
1267    }
1268}
1269
1270impl Default for LoadSettings {
1271
1272    fn default() -> Self {
1273        LoadSettings {
1274            all_text_separately: true,
1275            children_type: ChildrenType::Owned,
1276        }
1277    }
1278}
1279
1280impl LoadSettings {
1281
1282    pub fn new() -> Self {
1283        Default::default()
1284    }
1285
1286    /// Store all text values in separate children nodes. Even those text which is alone
1287    /// in tag body without other children.
1288    ///
1289    /// True by default.
1290    pub fn all_text_separately(mut self, b: bool) -> Self {
1291        self.set_all_text_separately(b);
1292        self
1293    }
1294
1295    /// See [`all_text_separately`].
1296    pub fn set_all_text_separately(&mut self, b: bool) {
1297        self.all_text_separately = b;
1298    }
1299
1300    /// Node owns all of its children. This is a default value.
1301    pub fn owned_children(mut self) -> Self {
1302        self.children_type = ChildrenType::Owned;
1303        self
1304    }
1305
1306    /// Node can share its children. Opposite to `owned_children`.
1307    pub fn sharable_children(mut self) -> Self {
1308        self.children_type = ChildrenType::Sharable;
1309        self
1310    }
1311}
1312
#[cfg(test)]
mod tests {
    use super::*;

    /// Tag names, text and nesting of a small document with several top-level nodes.
    #[test]
    fn from_html() {
        let html = r#"
        <p>Some text
            <img src="a">
        </p>
        <a>Link</a>
        <br />
        "#;

        let root = Node::from_html(html, &Default::default()).unwrap().unwrap();

        // First top-level node: <p> with text followed by <img>.
        let paragraph = root.children().get(0).unwrap();
        assert_eq!("p", paragraph.start().as_ref().unwrap().name());

        let paragraph_text = paragraph.children().get(0).unwrap().text();
        assert_eq!("Some text", paragraph_text.unwrap());

        let image = paragraph.children().get(1).unwrap();
        assert_eq!("img", image.tag_name().unwrap());

        // Second top-level node: <a> holding the text "Link".
        let link = root.children().get(1).unwrap();
        assert_eq!(link.tag_name().unwrap(), "a");
        assert_eq!("Link", link.children().get(0).unwrap().text().unwrap());

        // Third top-level node: self-closing <br />.
        let line_break = root.children().get(2).unwrap();
        assert_eq!("br", line_break.tag_name().unwrap());
    }

    /// Text that is alone in a tag is loaded as a child node when requested.
    #[test]
    fn from_html_separate_text() {
        let html = r#"
        <p>Text</p>
        "#;
        let settings = LoadSettings::new().all_text_separately(true);
        let root = Node::from_html(html, &settings).unwrap().unwrap();

        let text_node = root.children().get(0).unwrap().children().get(0).unwrap();
        assert_eq!(text_node.text().unwrap(), "Text");
    }

    /// Input consisting of whitespace only gives no node at all.
    #[test]
    fn from_html_empty() {
        let html = "   ";

        let loaded = Node::from_html(html, &Default::default()).unwrap();
        assert!(loaded.is_none());
    }

    /// Whitespace around the tag and before the text is dropped, trailing spaces are kept.
    #[test]
    fn from_html_with_spaces() {
        let html = "   <p>\n  Some  </p>";

        let root = Node::from_html(html, &Default::default()).unwrap().unwrap();

        let paragraph = root.children().get(0).unwrap();
        assert_eq!(paragraph.tag_name().unwrap(), "p");
        assert_eq!("Some  ", paragraph.children().get(0).unwrap().text().unwrap());
    }

    /// Converting loaded nodes back to a string reproduces the source.
    #[test]
    fn node_to_html() {
        let html = "<p><i>Text</i><br></p>";

        let root = Node::from_html(html, &Default::default()).unwrap().unwrap();
        let rendered = root.to_string();

        assert_eq!(html, &rendered);
    }

    /// Replacing an attribute is visible in the serialized output.
    #[test]
    fn overwrite_attribute() {
        let html = "<a href='a'>";
        let mut root = Node::from_html(html, &Default::default()).unwrap().unwrap();
        let anchor = root.children_mut().get_mut(0).unwrap();

        // Work on a copy of the attribute, then write it back over the original.
        let mut href = anchor.attribute_by_name("href").unwrap().clone();
        href.set_values(vec![String::from("b")]).unwrap();
        anchor.try_mut().unwrap().overwrite_attribute(href);

        let rendered = root.to_string();
        assert_eq!("<a href=\"b\">", &rendered);
    }
}