robotxt 0.2.0 - Docs.rs

#![forbid(unsafe_code)]

//! The implementation of the robots.txt protocol (or URL exclusion protocol)
//! with the support of `crawl-delay`, `sitemap`, and universal `*` match
//! extensions (according to the RFC specification).
//!
//! **Also check out other `xwde` projects [here](https://github.com/xwde).**
//!
//! ## Examples
//!
//! - parse the `user-agent` in the provided `robots.txt` file (See [Robots]):
//!
//! ```rust
//! use robotxt::Robots;
//!
//! let txt = r#"
//!     User-Agent: foobot
//!     Disallow: *
//!     Allow: /example/
//!     Disallow: /example/nope.txt
//! "#.as_bytes();
//!
//! let r = Robots::from_bytes(txt, "foobot");
//! assert!(r.is_allowed("/example/yeah.txt"));
//! assert!(!r.is_allowed("/example/nope.txt"));
//! assert!(!r.is_allowed("/invalid/path.txt"));
//! ```
//!
//! - build the new `robots.txt` file from provided directives (See [Factory]):
//!
//! ```rust
//! use url::Url;
//! use robotxt::Factory;
//!
//! let txt = Factory::default()
//!     .header("Robots.txt Header")
//!     .group(["foobot"], |u| {
//!         u.crawl_delay(5)
//!             .header("Rules for Foobot: Start")
//!             .allow("/example/yeah.txt")
//!             .disallow("/example/nope.txt")
//!             .footer("Rules for Foobot: End")
//!     })
//!     .group(["barbot", "nombot"], |u| {
//!         u.crawl_delay(2)
//!             .disallow("/example/yeah.txt")
//!             .disallow("/example/nope.txt")
//!     })
//!     .sitemap("https://example.com/sitemap_1.xml").unwrap()
//!     .sitemap("https://example.com/sitemap_2.xml").unwrap()
//!     .footer("Robots.txt Footer");
//!
//! println!("{}", txt.to_string());
//! ```
//!
//! ## Links
//!
//! - [Request for Comments: 9309](https://www.rfc-editor.org/rfc/rfc9309.txt) on
//!   RFC-Editor.com
//! - [Introduction to Robots.txt](https://developers.google.com/search/docs/crawling-indexing/robots/intro)
//!   on Google.com
//! - [How Google interprets Robots.txt](https://developers.google.com/search/docs/crawling-indexing/robots/robots_txt)
//!   on Google.com
//! - [What is Robots.txt file](https://moz.com/learn/seo/robotstxt) on Moz.com
//!
//! ## Notes
//!
//! - The parser is based on
//! [Smerity/texting_robots](https://github.com/Smerity/texting_robots)
//! - The `Host` directive is not supported
//!

mod build;
pub use build::*;

mod parse;
pub use parse::*;