stork_http
This is a stork implementation for the HTTP protocol, specifically
HTML-based web scraping. Given an initial page to scrape, stork_http
will find all indexable links on that page and yield them back to you,
ready to be scraped again in an instant or stored to come back to at
another time, all using futures to allow for parallel processing.
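For a first taste, here is a minimal sketch, built only from the calls
shown in the full example below, that prints every link found on a seed
page as the stream yields it:
# use stork_http::HttpStorkable;
# use futures::StreamExt;
#
# #[tokio::main]
# async fn main() -> failure::Fallible<()> {
// exec() returns a stream of links; each yielded link is itself an
// HttpStorkable, ready to be exec()'d in turn or stored for later.
let stream = HttpStorkable::new("https://example.com/".parse()?).exec();
# futures::pin_mut!(stream);
while let Some(link) = stream.next().await {
    println!("found: {}", link?.val().url().as_str());
}
# Ok(())
# }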
At this time rel="nofollow" is strictly enforced and cannot be turned
off, although this will become configurable in time as more filters are added.
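For illustration, given a page containing the following two anchors
(hypothetical markup, not from a real page), only the first would ever
be yielded by a stream; the second is dropped because of its rel attribute:
<a href="https://example.com/about">followed and yielded</a>
<a href="https://example.com/private" rel="nofollow">always skipped</a>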
Example usage:
# use stork::FilterSet;
# use stork_http::{HttpStorkable, filters::*};
# use futures::StreamExt;
#
# #[tokio::main]
# async fn main() -> failure::Fallible<()> {
// Start from example.com, following only https links into www.iana.org.
let stream = HttpStorkable::new("https://example.com/".parse()?)
    .with_filters(
        FilterSet::default()
            .add_filter(DomainFilter::new("www.iana.org"))
            .add_filter(SchemeFilter::new("https"))
    )
    .exec();
# futures::pin_mut!(stream);
// Only one link on the page passes the filters; its parent() chain
// leads back to the page it was scraped from.
let first_link_on_example: HttpStorkable = match stream.next().await {
    Some(Ok(link)) => {
        assert_eq!(link.val().text(), Some("More information...".to_string()));
        assert_eq!(link.val().url().as_str(), "https://www.iana.org/domains/example");
        assert_eq!(link.parent().unwrap().val().url().as_str(), "https://example.com/");
        link
    },
    _ => panic!("failed to get links from page")
};

// Reuse the filters from the first scrape, narrow them to the homepage,
// and scrape the link we just found.
let filters = first_link_on_example.filters().clone()
    .add_filter(PathFilter::new(FilterType::Equals, "/"));
let stream = first_link_on_example
    .with_filters(filters)
    .exec();
# futures::pin_mut!(stream);
match stream.next().await {
    Some(Ok(link)) => {
        // parent() values chain all the way back to the original seed page.
        assert_eq!(link.val().url().as_str(), "https://www.iana.org/");
        assert_eq!(link.parent().unwrap().val().url().as_str(), "https://www.iana.org/domains/example");
        assert_eq!(link.parent().unwrap().parent().unwrap().val().url().as_str(), "https://example.com/")
    },
    _ => panic!("failed to get links from page")
}

assert!(stream.next().await.is_none(), "should've been only one homepage link on the page!");
# Ok(())
# }