Skip to main content

spider_core/state/
mod.rs

1//! # State Module
2//!
3//! Provides state tracking primitives for the spider-lib framework.
4//!
5//! ## Overview
6//!
7//! This module offers two categories of state management:
8//!
9//! 1. **Crawler Internal State**: [`CrawlerState`] for tracking operational metrics
10//! 2. **Thread-Safe Primitives**: Ready-to-use types for building custom Spider state
11//!
12//! ## Thread-Safe Primitives
13//!
14//! The following types are designed for building custom Spider state structures
15//! with safe concurrent access:
16//!
17//! - [`Counter`]: Thread-safe atomic counter
18//! - [`Counter64`]: 64-bit thread-safe counter for large counts
19//! - [`Flag`]: Thread-safe boolean flag
20//! - [`VisitedUrls`]: Thread-safe URL tracking with DashMap
21//! - [`ConcurrentMap<K, V>`]: Thread-safe key-value map
22//! - [`ConcurrentVec<T>`]: Thread-safe dynamic vector
23//! - [`StateAccessMetrics`]: Metrics for tracking state access patterns
24//!
25//! ## Example
26//!
27//! ```rust
28//! use spider_core::{Counter, VisitedUrls, CrawlerState};
29//! use std::sync::Arc;
30//!
31//! #[derive(Clone, Default)]
32//! struct MySpiderState {
33//!     page_count: Counter,
34//!     visited_urls: VisitedUrls,
35//! }
36//!
37//! impl MySpiderState {
38//!     fn increment_page_count(&self) {
39//!         self.page_count.inc();
40//!     }
41//!
42//!     fn mark_url_visited(&self, url: String) {
43//!         self.visited_urls.mark(url);
44//!     }
45//! }
46//! ```
47
48mod primitives;
49
50pub use primitives::{
51    ConcurrentMap, ConcurrentVec, Counter, Counter64, Flag, StateAccessMetrics, VisitedUrls,
52};
53
54// ============================================================================
55// Crawler Internal State
56// ============================================================================
57
58use std::sync::Arc;
59use std::sync::atomic::{AtomicUsize, Ordering};
60
/// Live activity gauges shared between the crawler's actors.
///
/// Each field is an atomic counter describing one stage of the crawl:
/// - requests whose download is still in progress,
/// - responses that spiders are currently parsing,
/// - scraped items that pipelines are currently processing.
///
/// The counters are meant to answer "is there any outstanding work?", which
/// is what idle detection for graceful shutdown and checkpoint triggering
/// relies on.
#[derive(Debug, Default)]
pub struct CrawlerState {
    /// How many requests are being downloaded right now.
    pub in_flight_requests: AtomicUsize,
    /// How many responses are being parsed right now.
    pub parsing_responses: AtomicUsize,
    /// How many items are being processed by pipelines right now.
    pub processing_items: AtomicUsize,
}

impl CrawlerState {
    /// Builds a `CrawlerState` with every counter at zero and hands it back
    /// inside an [`Arc`] so it can be shared across actors.
    pub fn new() -> Arc<Self> {
        // `Arc<T>: Default` allocates `T::default()`, i.e. all counters zeroed.
        Arc::default()
    }

    /// Returns `true` when all three activity counters read zero.
    ///
    /// The counters are loaded one after another (downloads, then parsing,
    /// then item processing), each with `Ordering::Acquire`, and evaluation
    /// stops at the first non-zero value. The result is therefore a
    /// point-in-time observation, not an atomic snapshot of all three.
    pub fn is_idle(&self) -> bool {
        let gauges = [
            &self.in_flight_requests,
            &self.parsing_responses,
            &self.processing_items,
        ];
        gauges
            .iter()
            .all(|gauge| gauge.load(Ordering::Acquire) == 0)
    }
}