Skip to main content

spider_core/state/
mod.rs

//! Runtime state helpers.
//!
//! This module exposes the internal crawler state plus a small set of
//! thread-safe primitives that are useful in user-defined spider state.
//!
//! ## Example
//!
//! ```rust,ignore
//! use spider_core::{Counter, VisitedUrls};
//! use spider_core::state::CrawlerState;
//! use std::sync::Arc;
//!
//! #[derive(Clone, Default)]
//! struct MySpiderState {
//!     page_count: Counter,
//!     visited_urls: VisitedUrls,
//! }
//!
//! impl MySpiderState {
//!     fn increment_page_count(&self) {
//!         self.page_count.inc();
//!     }
//!
//!     fn mark_url_visited(&self, url: String) {
//!         self.visited_urls.mark(url);
//!     }
//! }
//! ```
29
30mod primitives;
31
32pub use primitives::{
33    ConcurrentMap, ConcurrentVec, Counter, Counter64, Flag, StateAccessMetrics, VisitedUrls,
34};
35
36// ============================================================================
37// Crawler Internal State
38// ============================================================================
39
40use std::sync::Arc;
41use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
42
/// Internal shared state used by the runtime.
///
/// Every field is an atomic, so the counters and the flag can be read and
/// updated through a shared reference (for example via the [`Arc`] returned
/// by [`CrawlerState::new`]). The derived [`Default`] starts every counter at
/// zero and the flag at `false`.
#[derive(Debug, Default)]
pub struct CrawlerState {
    /// The number of requests currently being downloaded.
    pub in_flight_requests: AtomicUsize,
    /// The number of responses currently being parsed.
    pub parsing_responses: AtomicUsize,
    /// The number of items currently being processed by pipelines.
    pub processing_items: AtomicUsize,
    /// The number of scraped items admitted into the processing pipeline.
    pub admitted_items: AtomicUsize,
    /// Indicates that the crawl is shutting down because the item limit was reached.
    pub item_limit_reached: AtomicBool,
}

impl CrawlerState {
    /// Creates a new, atomically reference-counted `CrawlerState` with all
    /// counters at zero and the item-limit flag cleared.
    #[must_use]
    pub fn new() -> Arc<Self> {
        Arc::new(Self::default())
    }

    /// Checks if all crawler activities are idle.
    ///
    /// Returns `true` when `in_flight_requests`, `parsing_responses` and
    /// `processing_items` are all zero. `admitted_items` and
    /// `item_limit_reached` are not consulted.
    ///
    /// The three counters are loaded one after another, not as a single
    /// atomic snapshot, so the result can already be stale when it is
    /// returned if other threads are updating the counters concurrently.
    #[must_use]
    pub fn is_idle(&self) -> bool {
        // Checked in the same order as the fields are declared; `all`
        // short-circuits on the first non-zero counter.
        [
            &self.in_flight_requests,
            &self.parsing_responses,
            &self.processing_items,
        ]
        .iter()
        .all(|counter| counter.load(Ordering::Acquire) == 0)
    }
}