netiquette/
lib.rs

1//! Polite behavior for web crawlers.
2//!
3//! A web crawler can use this crate's [`Limiter`] to honor `robots.txt` files on servers. This
4//! helps your spider be a good Internet citizen and avoid making a nuisance of itself and getting
5//! rate-limited.
6//!
7//! # Usage
8//!
9//! ```
10//! # const MY_USER_AGENT: &str = "ExampleCodeSomeoneCopiedAndPasted/0.1";
11//! # fn handle_web_page(_response: reqwest::Result<reqwest::Response>) {}
12//! # async fn crawl_urls(urls: Vec<netiquette::Url>) {
13//! use netiquette::Limiter;
14//!
15//! // Create a reqwest::Client to fetch web content.
16//! let client = reqwest::Client::builder()
17//!     .user_agent(MY_USER_AGENT)
18//!     .build()
19//!     .unwrap();
20//!
21//! let limiter = Limiter::new(client.clone(), MY_USER_AGENT.to_string());
22//! for url in urls {
23//!     match limiter.acquire(&url).await {
24//!         Ok(_permit) => handle_web_page(client.get(url).send().await),
25//!         Err(err) => eprintln!("can't crawl {url} - {err}"),
26//!     }
27//! }
28//! # }
29//! ```
30//!
31//! Of course, in a real spider, many tasks can fetch and process web pages concurrently. There can
32//! be thousands of HTTP requests in flight at a time. The purpose of `Limiter` is to slow down
33//! requests that would hit the same host concurrently or too frequently.
34#![deny(missing_docs)]
35
36mod error;
37mod limiter;
38
39pub use error::{Error, Result};
40pub use limiter::{Limiter, Permit};
41pub use reqwest::Url;