Crabler - Web crawler for Crabs
Asynchronous web scraper engine written in Rust.
Features:
- fully based on async-std
- derive macro based api
- struct based api
- stateful scraper (structs can hold state)
- ability to download files
- ability to schedule navigation jobs in an async manner
Example
extern crate crabler;

use std::path::Path;

use crabler::*;
// Scraper state lives in this struct; the derive macro wires its methods up
// as callbacks: `response_handler` runs for every fetched response, and
// `walk_handler` runs for every element matching the `a[href]` selector.
#[derive(WebScraper)]
#[on_response(response_handler)]
#[on_html("a[href]", walk_handler)]
struct Scraper {}
impl Scraper {
    /// Invoked for every completed response; logs successfully
    /// downloaded .jpg files (status 200 only).
    async fn response_handler(&self, response: Response) -> Result<()> {
        if response.url.ends_with(".jpg") && response.status == 200 {
            println!("Finished downloading {} -> {}", response.url, response.download_destination);
        }
        Ok(())
    }

    /// Invoked for every `a[href]` element: schedules .jpg links for
    /// download and navigates to every other link.
    async fn walk_handler(&self, response: Response, a: Element) -> Result<()> {
        if let Some(href) = a.attr("href") {
            if href.ends_with(".jpg") {
                // Derive the local file name from the link itself so distinct
                // images don't all collide on a single hard-coded path (which
                // would make the `exists()` guard skip every image but the first).
                let file_name = href.rsplit('/').next().unwrap_or("image.jpg");
                let p = Path::new("/tmp").join(file_name);
                let destination = p.to_string_lossy().to_string();
                if !p.exists() {
                    println!("Downloading {}", destination);
                    response.download_file(href, destination).await?;
                } else {
                    println!("Skipping existing file {}", destination);
                }
            } else {
                // Not an image — schedule a navigation job for this link.
                response.navigate(href).await?;
            }
        }
        Ok(())
    }
}
#[async_std::main]
async fn main() -> Result<()> {
    // Seed the crawler with one URL and run it on 20 worker threads.
    let opts = Opts::new()
        .with_urls(vec!["https://www.rust-lang.org/"])
        .with_threads(20);
    Scraper {}.run(opts).await
}
Sample project
Gonzih/apod-nasa-scraper-rs