Skip to main content

spider_core/state/
mod.rs

1//! Runtime state helpers.
2//!
3//! This module exposes the internal crawler state plus a small set of
4//! thread-safe primitives that are useful in user-defined spider state.
5//!
6//! ## Example
7//!
8//! ```rust,ignore
9//! use spider_core::{Counter, VisitedUrls};
10//! use spider_core::state::CrawlerState;
11//! use std::sync::Arc;
12//!
13//! #[derive(Clone, Default)]
14//! struct MySpiderState {
15//!     page_count: Counter,
16//!     visited_urls: VisitedUrls,
17//! }
18//!
19//! impl MySpiderState {
20//!     fn increment_page_count(&self) {
21//!         self.page_count.inc();
22//!     }
23//!
24//!     fn mark_url_visited(&self, url: String) {
25//!         self.visited_urls.mark(url);
26//!     }
27//! }
28//! ```
29
30mod primitives;
31
32pub use primitives::{
33    ConcurrentMap, ConcurrentVec, Counter, Counter64, Flag, StateAccessMetrics, VisitedUrls,
34};
35
36// ============================================================================
37// Crawler Internal State
38// ============================================================================
39
40use std::sync::Arc;
41use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
42
/// Shared bookkeeping for the crawler runtime.
///
/// Every field is an atomic, so a single instance can be updated through a
/// shared reference. [`CrawlerState::new`] hands the value out already
/// wrapped in an [`Arc`].
#[derive(Debug, Default)]
pub struct CrawlerState {
    /// Count of requests whose download is currently in progress.
    pub in_flight_requests: AtomicUsize,
    /// Count of responses that are currently being parsed.
    pub parsing_responses: AtomicUsize,
    /// Count of items that pipelines are currently processing.
    pub processing_items: AtomicUsize,
    /// Count of scraped items that were admitted into the processing pipeline.
    pub admitted_items: AtomicUsize,
    /// Set when the crawl is shutting down because the item limit was reached.
    pub item_limit_reached: AtomicBool,
    /// Count of follow-up requests skipped while item-limit shutdown was in progress.
    pub shutdown_skipped_requests: AtomicUsize,
    /// Count of scraped items dropped while item-limit shutdown was in progress.
    pub shutdown_dropped_items: AtomicUsize,
    /// Count of visited-mark updates skipped while item-limit shutdown was in progress.
    pub shutdown_skipped_visited_marks: AtomicUsize,
}

impl CrawlerState {
    /// Builds a default-initialised state (all counters zero, flag unset)
    /// and returns it behind an [`Arc`] so it can be shared.
    pub fn new() -> Arc<Self> {
        // `Arc<T>: Default` allocates an `Arc` around `T::default()`.
        Arc::default()
    }

    /// Reports whether the crawler currently has no active work.
    ///
    /// Returns `true` only when the download, parsing and item-processing
    /// counters all read zero. The remaining fields (admitted items, the
    /// item-limit flag and the shutdown counters) are not consulted.
    ///
    /// The counters are loaded one after another with `Ordering::Acquire`;
    /// they are not read as a single atomic snapshot, so the answer can be
    /// out of date if other threads are updating them concurrently.
    pub fn is_idle(&self) -> bool {
        let activity_counters = [
            &self.in_flight_requests,
            &self.parsing_responses,
            &self.processing_items,
        ];

        // `all` stops at the first non-zero counter, in declaration order.
        activity_counters
            .iter()
            .all(|counter| counter.load(Ordering::Acquire) == 0)
    }
}