spider_core/state/mod.rs
1//! # State Module
2//!
3//! Provides state tracking primitives for the spider-lib framework.
4//!
5//! ## Overview
6//!
7//! This module offers two categories of state management:
8//!
9//! 1. **Crawler Internal State**: [`CrawlerState`] for tracking operational metrics
10//! 2. **Thread-Safe Primitives**: Ready-to-use types for building custom Spider state
11//!
12//! ## Thread-Safe Primitives
13//!
14//! The following types are designed for building custom Spider state structures
15//! with safe concurrent access:
16//!
17//! - [`Counter`]: Thread-safe atomic counter
18//! - [`Counter64`]: 64-bit thread-safe counter for large counts
19//! - [`Flag`]: Thread-safe boolean flag
20//! - [`VisitedUrls`]: Thread-safe URL tracking with DashMap
21//! - [`ConcurrentMap<K, V>`]: Thread-safe key-value map
22//! - [`ConcurrentVec<T>`]: Thread-safe dynamic vector
23//! - [`StateAccessMetrics`]: Metrics for tracking state access patterns
24//!
25//! ## Example
26//!
27//! ```rust
28//! use spider_core::{Counter, VisitedUrls, CrawlerState};
29//! use std::sync::Arc;
30//!
31//! #[derive(Clone, Default)]
//! struct MySpiderState {
//!     page_count: Counter,
//!     visited_urls: VisitedUrls,
//! }
//!
//! impl MySpiderState {
//!     fn increment_page_count(&self) {
//!         self.page_count.inc();
//!     }
//!
//!     fn mark_url_visited(&self, url: String) {
//!         self.visited_urls.mark(url);
//!     }
//! }
46//! ```
47
48mod primitives;
49
50pub use primitives::{
51 ConcurrentMap, ConcurrentVec, Counter, Counter64, Flag, StateAccessMetrics, VisitedUrls,
52};
53
54// ============================================================================
55// Crawler Internal State
56// ============================================================================
57
58use std::sync::Arc;
59use std::sync::atomic::{AtomicUsize, Ordering};
60
/// Shared activity counters for the crawler's actors.
///
/// Each field is an atomic gauge for one stage of the crawl, so the struct can
/// be updated through a shared reference without additional locking:
///
/// - requests that are being downloaded,
/// - responses that are being parsed by spiders,
/// - scraped items that are being processed by pipelines.
///
/// The counters are intended to answer "is any work still outstanding?", which
/// is what graceful shutdown and checkpoint triggering are based on.
///
/// The derived [`Default`] starts every counter at zero.
#[derive(Default, Debug)]
pub struct CrawlerState {
    /// Count of requests whose download is currently in progress.
    pub in_flight_requests: AtomicUsize,
    /// Count of responses that are currently being parsed.
    pub parsing_responses: AtomicUsize,
    /// Count of items that pipelines are currently processing.
    pub processing_items: AtomicUsize,
}
80
81impl CrawlerState {
82 /// Creates a new, atomically reference-counted `CrawlerState`.
83 pub fn new() -> Arc<Self> {
84 Arc::new(Self::default())
85 }
86
87 /// Checks if all crawler activities are idle.
88 pub fn is_idle(&self) -> bool {
89 self.in_flight_requests.load(Ordering::Acquire) == 0
90 && self.parsing_responses.load(Ordering::Acquire) == 0
91 && self.processing_items.load(Ordering::Acquire) == 0
92 }
93}