use std::env;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use crawlex::hooks::{HookDecision, HookRegistry};
use crawlex::queue::FetchMethod;
use crawlex::{Config, Crawler, Result};
/// Example entry point: runs a crawl with four hooks registered.
///
/// The seed URL is taken from the first command-line argument and falls back
/// to `https://example.com` when none is given.
///
/// Registered hooks:
/// - before each request: skip URLs whose path contains `/private/` and count them;
/// - after first byte: ask for a retry on HTTP 429 / 503;
/// - discovery: capture `/sitemap.xml` on the origin of the current URL;
/// - response body: tag the context's `user_data` with a marker entry.
///
/// # Errors
///
/// Propagates any error returned while building the config, constructing the
/// crawler, seeding it, or running it. An unparsable seed URL is reported on
/// stderr and terminates the process with exit status 2.
#[tokio::main]
async fn main() -> Result<()> {
    let seed_arg = env::args()
        .nth(1)
        .unwrap_or_else(|| "https://example.com".into());
    // The seed is user input: report a bad value cleanly instead of panicking.
    let seed = match url::Url::parse(&seed_arg) {
        Ok(parsed) => parsed,
        Err(err) => {
            eprintln!("invalid seed url {seed_arg:?}: {err}");
            std::process::exit(2);
        }
    };

    let hooks = HookRegistry::new();

    // Shared counter: `skipped` moves into the hook closure, while
    // `skipped_count` stays here so the total can be read after the crawl.
    let skipped = Arc::new(AtomicUsize::new(0));
    let skipped_count = Arc::clone(&skipped);

    hooks.on_before_each_request(move |ctx| {
        // Each invocation needs its own handle because the future takes ownership.
        let skipped = Arc::clone(&skipped);
        Box::pin(async move {
            if ctx.url.path().contains("/private/") {
                skipped.fetch_add(1, Ordering::SeqCst);
                return Ok(HookDecision::Skip);
            }
            Ok(HookDecision::Continue)
        })
    });

    hooks.on_after_first_byte(|ctx| {
        // Copy the status out before building the future so it does not borrow `ctx`.
        let status = ctx.response_status;
        Box::pin(async move {
            // NOTE(review): retry limits/backoff are presumably enforced by the
            // crawler itself — confirm before relying on this in production.
            Ok(match status {
                Some(429 | 503) => HookDecision::Retry,
                _ => HookDecision::Continue,
            })
        })
    });

    hooks.on_discovery(|ctx| {
        Box::pin(async move {
            // Resolve against the current URL so the scheme and any explicit
            // port are preserved; only URLs that actually have a host qualify.
            if ctx.url.has_host() {
                if let Ok(sitemap) = ctx.url.join("/sitemap.xml") {
                    ctx.captured_urls.push(sitemap);
                }
            }
            Ok(HookDecision::Continue)
        })
    });

    hooks.on_response_body(|ctx| {
        Box::pin(async move {
            ctx.user_data.insert(
                "tagged_by_example".into(),
                serde_json::Value::String("embedded_with_hooks".into()),
            );
            Ok(HookDecision::Continue)
        })
    });

    let config = Config::builder().max_concurrent_http(4).build()?;
    let crawler = Crawler::new(config)?.with_hooks(hooks);
    crawler
        .seed_with(vec![seed], FetchMethod::HttpSpoof)
        .await?;
    crawler.run().await?;

    eprintln!(
        "embedded_with_hooks done — skipped {} private URLs",
        skipped_count.load(Ordering::SeqCst)
    );
    Ok(())
}