spider_lib/state.rs
1//! Module for tracking the operational state of the crawler.
2//!
3//! This module defines the `CrawlerState` struct, which provides a centralized
4//! mechanism for monitoring the real-time activity of the web crawler. It
5//! utilizes atomic counters to keep track of:
6//! - The number of HTTP requests currently in flight (being downloaded).
7//! - The number of responses actively being parsed by spiders.
8//! - The number of scraped items currently being processed by pipelines.
9//!
10//! This state information is crucial for determining when the crawler is idle
11//! and can be gracefully shut down, or when to trigger checkpointing.
12
13use std::sync::Arc;
14use std::sync::atomic::{AtomicUsize, Ordering};
15
/// Shared activity counters for the crawler's actors.
///
/// Each field is an independent [`AtomicUsize`], so the counters can be read
/// and updated through a shared reference without additional locking. All
/// counters start at zero when the value is built via [`Default`].
#[derive(Default, Debug)]
pub struct CrawlerState {
    /// Count of requests that are currently being downloaded.
    pub in_flight_requests: AtomicUsize,
    /// Count of responses that are currently being parsed.
    pub parsing_responses: AtomicUsize,
    /// Count of items that pipelines are currently processing.
    pub processing_items: AtomicUsize,
}
26
27impl CrawlerState {
28 /// Creates a new, atomically reference-counted `CrawlerState`.
29 pub fn new() -> Arc<Self> {
30 Arc::new(Self::default())
31 }
32
33 /// Checks if all crawler activities are idle.
34 pub fn is_idle(&self) -> bool {
35 self.in_flight_requests.load(Ordering::SeqCst) == 0
36 && self.parsing_responses.load(Ordering::SeqCst) == 0
37 && self.processing_items.load(Ordering::SeqCst) == 0
38 }
39}