Skip to main content

spider_lib/
state.rs

//! Module for tracking the operational state of the crawler.
//!
//! This module defines the `CrawlerState` struct, which provides a centralized
//! mechanism for monitoring the real-time activity of the web crawler. It
//! uses atomic counters to keep track of:
//! - The number of HTTP requests currently in flight (being downloaded).
//! - The number of responses actively being parsed by spiders.
//! - The number of scraped items currently being processed by pipelines.
//!
//! This state information is crucial for determining when the crawler is idle
//! and can be gracefully shut down, or when to trigger checkpointing.
12
13use std::sync::Arc;
14use std::sync::atomic::{AtomicUsize, Ordering};
15
/// Shared activity counters for the crawler's actors.
///
/// Each field is an independent [`AtomicUsize`], so the counters can be
/// updated through a shared reference without additional locking. The derived
/// [`Default`] implementation starts every counter at zero.
#[derive(Default, Debug)]
pub struct CrawlerState {
    /// Count of requests that are currently being downloaded.
    pub in_flight_requests: AtomicUsize,
    /// Count of responses that are currently being parsed.
    pub parsing_responses: AtomicUsize,
    /// Count of items that are currently being processed by pipelines.
    pub processing_items: AtomicUsize,
}
26
27impl CrawlerState {
28    /// Creates a new, atomically reference-counted `CrawlerState`.
29    pub fn new() -> Arc<Self> {
30        Arc::new(Self::default())
31    }
32
33    /// Checks if all crawler activities are idle.
34    pub fn is_idle(&self) -> bool {
35        self.in_flight_requests.load(Ordering::SeqCst) == 0
36            && self.parsing_responses.load(Ordering::SeqCst) == 0
37            && self.processing_items.load(Ordering::SeqCst) == 0
38    }
39}