spider_core/state/mod.rs
//! Runtime state helpers.
//!
//! This module exposes the internal crawler state plus a small set of
//! thread-safe primitives that are useful in user-defined spider state.
//!
//! ## Example
//!
//! ```rust,ignore
//! use spider_core::{Counter, VisitedUrls};
//! use spider_core::state::CrawlerState;
//! use std::sync::Arc;
//!
//! #[derive(Clone, Default)]
//! struct MySpiderState {
//!     page_count: Counter,
//!     visited_urls: VisitedUrls,
//! }
//!
//! impl MySpiderState {
//!     fn increment_page_count(&self) {
//!         self.page_count.inc();
//!     }
//!
//!     fn mark_url_visited(&self, url: String) {
//!         self.visited_urls.mark(url);
//!     }
//! }
//! ```
29
30mod primitives;
31
32pub use primitives::{
33 ConcurrentMap, ConcurrentVec, Counter, Counter64, Flag, StateAccessMetrics, VisitedUrls,
34};
35
36// ============================================================================
37// Crawler Internal State
38// ============================================================================
39
40use std::sync::Arc;
41use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
42
/// Internal shared state used by the runtime.
///
/// Every field is an atomic, so the state can be updated through a shared
/// reference; [`CrawlerState::new`] hands the value out behind an [`Arc`].
#[derive(Debug, Default)]
pub struct CrawlerState {
    /// The number of requests currently being downloaded.
    pub in_flight_requests: AtomicUsize,
    /// The number of responses currently being parsed.
    pub parsing_responses: AtomicUsize,
    /// The number of items currently being processed by pipelines.
    pub processing_items: AtomicUsize,
    /// The number of scraped items admitted into the processing pipeline.
    pub admitted_items: AtomicUsize,
    /// Indicates that the crawl is shutting down because the item limit was reached.
    pub item_limit_reached: AtomicBool,
}

impl CrawlerState {
    /// Creates a new, atomically reference-counted `CrawlerState`.
    ///
    /// The value is built with the derived `Default`, so every counter starts
    /// at zero and `item_limit_reached` starts as `false`.
    #[must_use]
    pub fn new() -> Arc<Self> {
        Arc::new(Self::default())
    }

    /// Checks if all crawler activities are idle.
    ///
    /// Returns `true` when `in_flight_requests`, `parsing_responses` and
    /// `processing_items` are all zero. `admitted_items` and
    /// `item_limit_reached` are not consulted.
    ///
    /// The three counters are read with separate `Acquire` loads, one after
    /// another, so the result is not a single atomic snapshot: a counter may
    /// change between the loads when other threads update the state.
    #[must_use]
    pub fn is_idle(&self) -> bool {
        self.in_flight_requests.load(Ordering::Acquire) == 0
            && self.parsing_responses.load(Ordering::Acquire) == 0
            && self.processing_items.load(Ordering::Acquire) == 0
    }
}
70}