halldyll_core/lib.rs
1//! # Halldyll Core
2//!
3//! High-performance async web scraping engine designed for AI data collection.
4//!
5//! ## Features
6//!
7//! - **Async HTTP Fetching**: Connection pooling, compression, retries with exponential backoff
8//! - **Crawl Management**: URL normalization (RFC 3986), frontier scheduling, deduplication
9//! - **Politeness**: robots.txt (RFC 9309), adaptive rate limiting per domain
10//! - **Content Extraction**: Text, links, images, videos, structured data (JSON-LD, OpenGraph)
11//! - **Security**: SSRF protection, domain allowlists, resource limits
12//! - **Storage**: WARC (ISO 28500), snapshots with content hashing
13//! - **Observability**: Structured logging, metrics, distributed tracing
14//!
15//! ## Example
16//!
17//! ```rust,no_run
18//! use halldyll_core::{Orchestrator, Config};
19//! use url::Url;
20//!
21//! #[tokio::main]
22//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
23//!     let config = Config::default();
24//!     let orchestrator = Orchestrator::new(config)?;
25//!
26//!     let url = Url::parse("https://example.com")?;
27//!     let result = orchestrator.scrape(&url).await?;
28//!
29//!     println!("Title: {:?}", result.document.title);
30//!     println!("Text length: {}", result.document.main_text.len());
31//!     Ok(())
32//! }
33//! ```
34
35#![warn(missing_docs)]
36#![warn(clippy::all)]
37#![deny(unsafe_code)]
38
// Public module tree of the crate. The notes below are plain comments (not
// rustdoc) meant to orient readers of this file; where a module's role is not
// visible from this file it is marked as an assumption to be confirmed.

// Source of `Document`, `Assets`, `Provenance`, `Error`, `Config` and
// `error::Result`, all re-exported at the crate root below.
39pub mod types;
// Source of `CircuitBreaker` / `CircuitBreakerConfig` (re-exported below).
// Presumably also the "Async HTTP Fetching" feature from the crate docs — TODO confirm.
40pub mod fetch;
// Presumably the "Crawl Management" feature (URL normalization, frontier,
// deduplication) listed in the crate docs — TODO confirm.
41pub mod crawl;
// Presumably the "Politeness" feature (robots.txt, per-domain rate limiting)
// listed in the crate docs — TODO confirm.
42pub mod politeness;
// Presumably the "Content Extraction" feature listed in the crate docs — TODO confirm.
43pub mod parse;
// NOTE(review): not mentioned in the crate-level feature list; purpose is not
// visible from this file.
44pub mod render;
// Presumably the "Storage" feature (WARC, snapshots) listed in the crate docs — TODO confirm.
45pub mod storage;
// Presumably the "Security" feature (SSRF protection, allowlists, resource
// limits) listed in the crate docs — TODO confirm.
46pub mod security;
// Source of the health, metrics, Prometheus-export and shutdown types
// (`HealthChecker`, `MetricsCollector`, `PrometheusExporter`,
// `GracefulShutdown`, ...) re-exported below.
47pub mod observe;
// NOTE(review): not mentioned in the crate-level feature list; purpose is not
// visible from this file.
48pub mod sitemap;
// Source of `Orchestrator` (re-exported below), the type the crate-level
// example constructs with `Orchestrator::new(config)` and calls `.scrape(&url)` on.
49pub mod orchestrator;
50
51// Re-exports for convenience
52pub use types::{Document, Assets, Provenance, Error, Config};
53pub use types::error::Result;
54pub use orchestrator::Orchestrator;
55
56// Production-ready utilities
57pub use fetch::{CircuitBreaker, CircuitBreakerConfig};
58pub use observe::{
59 HealthChecker, HealthResponse, HealthStatus, HealthMetrics,
60 PrometheusExporter, MetricsCollector, MetricsSnapshot,
61 GracefulShutdown, ShutdownResult,
62};