spider_core/state/mod.rs
1//! Runtime state helpers.
2//!
3//! This module exposes the internal crawler state plus a small set of
4//! thread-safe primitives that are useful in user-defined spider state.
5//!
6//! ## Example
7//!
8//! ```rust,ignore
9//! use spider_core::{Counter, VisitedUrls};
10//! use spider_core::state::CrawlerState;
11//! use std::sync::Arc;
12//!
13//! #[derive(Clone, Default)]
//! struct MySpiderState {
//!     page_count: Counter,
//!     visited_urls: VisitedUrls,
//! }
//!
//! impl MySpiderState {
//!     fn increment_page_count(&self) {
//!         self.page_count.inc();
//!     }
//!
//!     fn mark_url_visited(&self, url: String) {
//!         self.visited_urls.mark(url);
//!     }
//! }
28//! ```
29
30mod primitives;
31
32pub use primitives::{
33 ConcurrentMap, ConcurrentVec, Counter, Counter64, Flag, StateAccessMetrics, VisitedUrls,
34};
35
36// ============================================================================
37// Crawler Internal State
38// ============================================================================
39
40use std::sync::Arc;
41use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
42
/// Bookkeeping shared across the crawler runtime.
///
/// Every field is an atomic, so values can be read and updated through a
/// shared reference. The derived `Default` yields zeroed counters and a
/// cleared `item_limit_reached` flag.
#[derive(Default, Debug)]
pub struct CrawlerState {
    /// Count of requests whose download is currently underway.
    pub in_flight_requests: AtomicUsize,
    /// Count of responses that are currently being parsed.
    pub parsing_responses: AtomicUsize,
    /// Count of items that pipelines are currently processing.
    pub processing_items: AtomicUsize,
    /// Count of scraped items that were admitted into the processing pipeline.
    pub admitted_items: AtomicUsize,
    /// Set when the crawl is shutting down because the item limit was reached.
    pub item_limit_reached: AtomicBool,
    /// Follow-up requests skipped while item-limit shutdown was in progress.
    pub shutdown_skipped_requests: AtomicUsize,
    /// Scraped items dropped while item-limit shutdown was in progress.
    pub shutdown_dropped_items: AtomicUsize,
    /// Visited-mark updates skipped while item-limit shutdown was in progress.
    pub shutdown_skipped_visited_marks: AtomicUsize,
}
63
64impl CrawlerState {
65 /// Creates a new, atomically reference-counted `CrawlerState`.
66 pub fn new() -> Arc<Self> {
67 Arc::new(Self::default())
68 }
69
70 /// Checks if all crawler activities are idle.
71 pub fn is_idle(&self) -> bool {
72 self.in_flight_requests.load(Ordering::Acquire) == 0
73 && self.parsing_responses.load(Ordering::Acquire) == 0
74 && self.processing_items.load(Ordering::Acquire) == 0
75 }
76}