use kumo::prelude::*;
use std::time::Duration;
/// Spider that fetches a fixed list of URLs and reports what each response
/// echoes back about the request.
struct HttpbinSpider {
    /// URLs handed to the crawler as its starting set.
    urls: Vec<String>,
}
#[async_trait::async_trait]
impl Spider for HttpbinSpider {
    /// Each scraped item is the raw JSON document returned by the endpoint.
    type Item = serde_json::Value;

    /// Identifier of this spider.
    fn name(&self) -> &str {
        "httpbin"
    }

    /// Returns a copy of the configured URL list.
    fn start_urls(&self) -> Vec<String> {
        self.urls.clone()
    }

    /// Parses a response body as JSON, prints the echoed origin and
    /// `User-Agent`, and emits the whole document as a single item.
    ///
    /// A body that cannot be decoded as JSON is not treated as an error: the
    /// method returns an empty [`Output`] instead.
    async fn parse(&self, response: &Response) -> Result<Output<Self::Item>, KumoError> {
        match response.json::<serde_json::Value>() {
            Ok(body) => {
                // Either lookup falls back to "?" when the value is not a string.
                let user_agent = body["headers"]["User-Agent"].as_str().unwrap_or("?");
                let client_ip = body["origin"].as_str().unwrap_or("?");
                println!("IP: {client_ip} | UA: {user_agent}");
                Ok(Output::new().item(body))
            }
            // Deliberately ignore the decode failure and yield nothing.
            Err(_) => Ok(Output::new()),
        }
    }
}
/// Entry point: configures logging, builds the crawl engine, runs
/// [`HttpbinSpider`] against a repeated URL, and prints a short summary.
///
/// # Errors
///
/// Propagates any error returned by the crawl run.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    tracing_subscriber::fmt()
        .with_env_filter("kumo=info")
        .init();

    // The same endpoint five times over, so the per-request output can be compared.
    let start_urls = vec![String::from("https://httpbin.org/anything"); 5];

    // Left empty here; add proxy URLs to enable the proxy-rotation middleware below.
    let proxy_list: Vec<String> = Vec::new();

    let base_builder = CrawlEngine::builder()
        .concurrency(2)
        .middleware(UserAgentRotator::common_browsers())
        .middleware(RateLimiter::per_second(2.0));

    // Proxy rotation is only attached when at least one proxy is configured.
    let builder = if proxy_list.is_empty() {
        base_builder
    } else {
        base_builder.middleware(ProxyRotator::new(proxy_list))
    };

    let stats = builder
        .crawl_delay(Duration::from_millis(500))
        .store(StdoutStore)
        .run(HttpbinSpider { urls: start_urls })
        .await?;

    let elapsed_secs = stats.duration.as_secs_f64();
    println!(
        "\nCrawled {} page(s) in {:.2}s",
        stats.pages_crawled,
        elapsed_secs
    );

    Ok(())
}