Crate parsercher
Parses and searches Tag documents. (e.g. HTML, XML)
parsercher parses documents written in tags such as HTML and XML.
- Create a tree of Dom structures from the tag document.
- Search for tags and text in the tree of Dom structures.
Usage
Add this to your Cargo.toml:
[dependencies]
parsercher = "1.0.0"
Examples
Example of getting text from HTML.
Create a tree of Dom structures from HTML and get the text of each li tag whose class attribute has the value target.
use std::collections::HashMap; use parsercher; use parsercher::dom::Tag; let html = r#" <!DOCTYPE html> <html> <head> <meta charset="UTF-8"> <title>sample html</title> </head> <body> <ol> <li class="target">first</li> <li>second</li> <li class="target">therd</li> </ol> </body> </html> "#; if let Ok(root_dom) = parsercher::parse(&html) { let mut needle = Tag::new("li".to_string()); let mut attr = HashMap::new(); attr.insert("class".to_string(), "target".to_string()); needle.set_attr(attr); if let Some(texts) = parsercher::search_text_from_tag_children(&root_dom, &needle) { assert_eq!(texts.len(), 2); assert_eq!(texts[0], "first".to_string()); assert_eq!(texts[1], "therd".to_string()); } }
More complex examples of Dom structure tree
use parsercher; let html = r#" <!DOCTYPE html> <html> <head> <meta charset="UTF-8"> <title>sample html</title> </head> <body> <h1>Hello, world!</h1> <div id="content"></div> <ol> <li>first</li> <li>second</li> <li>therd</li> </ol> <!-- All script code becomes one text --> <script> let content = document.getElementById('content'); content.textContent = 'content'; </script> </body> </html> "#; if let Ok(dom) = parsercher::parse(&html) { println!("{:#?}", dom); }
output:
Dom { dom_type: Tag, tag: Some( Tag { name: "root", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "!DOCTYPE", attr: Some( { "html": "", }, ), terminated: false, terminator: false, }, ), text: None, comment: None, children: None, }, Dom { dom_type: Tag, tag: Some( Tag { name: "html", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "head", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "meta", attr: Some( { "charset": "UTF-8", }, ), terminated: false, terminator: false, }, ), text: None, comment: None, children: None, }, Dom { dom_type: Tag, tag: Some( Tag { name: "title", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "sample html", }, ), comment: None, children: None, }, ], ), }, ], ), }, Dom { dom_type: Tag, tag: Some( Tag { name: "body", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "h1", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "Hello, world!", }, ), comment: None, children: None, }, ], ), }, Dom { dom_type: Tag, tag: Some( Tag { name: "div", attr: Some( { "id": "content", }, ), terminated: false, terminator: false, }, ), text: None, comment: None, children: None, }, Dom { dom_type: Tag, tag: Some( Tag { name: "ol", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Tag, tag: Some( Tag { name: "li", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, 
children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "first", }, ), comment: None, children: None, }, ], ), }, Dom { dom_type: Tag, tag: Some( Tag { name: "li", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "second", }, ), comment: None, children: None, }, ], ), }, Dom { dom_type: Tag, tag: Some( Tag { name: "li", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "therd", }, ), comment: None, children: None, }, ], ), }, ], ), }, Dom { dom_type: Comment, tag: None, text: None, comment: Some( Comment { comment: " All script code becomes one text ", }, ), children: None, }, Dom { dom_type: Tag, tag: Some( Tag { name: "script", attr: None, terminated: false, terminator: false, }, ), text: None, comment: None, children: Some( [ Dom { dom_type: Text, tag: None, text: Some( Text { text: "\n let content = document.getElementById(\'content\');\n content.textContent = \'content\';\n", }, ), comment: None, children: None, }, ], ), }, ], ), }, ], ), }, ], ), }
Modules
dom | Module for representing a tree of Dom structures. |
Functions
parse | Parses the tag document and returns a Dom structure tree. |
print_dom_tree | Output the Dom structure in a human readable format. |
satisfy_sufficient_condition | Returns true if p is a sufficient condition for q. |
search_tag | Returns the Tag structures in the Dom structure tree for which the needle is a sufficient condition. |
search_tag_from_name | Returns the Tag structures in the Dom structure tree whose tag name is equal to name. |
search_text_from_tag_children | Returns the texts of the children of the Tag structures in the Dom structure tree for which needle is a sufficient condition. |