mq_crawler/
lib.rs

//! Web crawler for collecting markdown content from websites.
//!
//! This crate provides functionality to crawl websites and extract markdown content.
//! It respects robots.txt, handles concurrent requests, and converts HTML to markdown
//! for batch processing with mq.
//!
//! # Features
//!
//! - Asynchronous web crawling with configurable concurrency
//! - robots.txt compliance
//! - HTML to markdown conversion
//! - Link discovery and following
//! - Crawl statistics and result tracking
//! - Support for custom HTTP headers and user agents
//! - Rate limiting and politeness delays
//!
//! # Usage
//!
//! ```rust,ignore
//! use mq_crawler::crawler::Crawler;
//! use url::Url;
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let start_url = Url::parse("https://your-target-site.com")?;
//!     let crawler = Crawler::new(start_url, None, 10);
//!     let result = crawler.crawl().await?;
//!     println!("Crawled {} pages", result.pages_crawled);
//!     Ok(())
//! }
//! ```
//!
//! # Crawling Behavior
//!
//! The crawler:
//! - Starts from a specified URL
//! - Follows links found on each page
//! - Respects robots.txt directives
//! - Limits depth and breadth of crawling
//! - Converts HTML pages to markdown
//! - Tracks statistics about the crawl
//!
43pub mod crawler;
44pub mod http_client;
45pub mod robots;
mq_crawler/lib.rs

mq_crawler/
lib.rs