Skip to main content

scrapling_spider/
result.rs

//! Crawl output types: statistics, scraped items, and the final result.
//!
//! After a crawl completes (or is paused), the engine produces a [`CrawlResult`]
//! containing:
//!
//! - [`CrawlStats`] -- counters for requests, responses, cache hits, blocked
//!   retries, bytes transferred, and more. Stats are also broken down per domain
//!   and per session.
//! - [`ItemList`] -- the collected scraped items as JSON values, with convenience
//!   methods for serializing to `.json` or `.jsonl` files.
//! - A `paused` flag indicating whether the crawl was interrupted by a pause
//!   signal rather than running to natural completion.
//!
//! `CrawlStats` is `Serialize`/`Deserialize` so you can persist it alongside
//! your scraped data for post-crawl analysis.
16
17use std::collections::HashMap;
18use std::path::Path;
19
20use serde::{Deserialize, Serialize};
21
22/// Aggregate statistics collected during a crawl run.
23///
24/// The crawler engine populates this struct as it processes requests. After the
25/// crawl finishes, you can inspect it via [`CrawlerEngine::stats`](crate::spider::CrawlerEngine::stats)
26/// or from the returned [`CrawlResult`]. All counters start at zero and are
27/// incremented atomically during the crawl loop.
28#[derive(Debug, Clone, Default, Serialize, Deserialize)]
29pub struct CrawlStats {
30    /// Total number of requests dispatched.
31    pub requests_count: u64,
32    /// Maximum number of concurrent requests allowed.
33    pub concurrent_requests: u32,
34    /// Maximum number of concurrent requests per domain.
35    pub concurrent_requests_per_domain: u32,
36    /// Number of requests that failed with an error.
37    pub failed_requests_count: u64,
38    /// Number of requests rejected because their domain was not allowed.
39    pub offsite_requests_count: u64,
40    /// Number of requests blocked by robots.txt rules.
41    pub robots_disallowed_count: u64,
42    /// Number of responses served from the cache.
43    pub cache_hits: u64,
44    /// Number of responses that were not found in the cache.
45    pub cache_misses: u64,
46    /// Total bytes received across all responses.
47    pub response_bytes: u64,
48    /// Number of items successfully scraped.
49    pub items_scraped: u64,
50    /// Number of items dropped by the item pipeline.
51    pub items_dropped: u64,
52    /// Unix timestamp when the crawl started.
53    pub start_time: f64,
54    /// Unix timestamp when the crawl ended.
55    pub end_time: f64,
56    /// Configured delay in seconds between consecutive requests.
57    pub download_delay: f64,
58    /// Number of requests that received a blocked status code.
59    pub blocked_requests_count: u64,
60    /// User-defined custom statistics.
61    pub custom_stats: HashMap<String, serde_json::Value>,
62    /// Count of responses grouped by HTTP status code.
63    pub response_status_count: HashMap<String, u64>,
64    /// Total bytes received grouped by domain.
65    pub domains_response_bytes: HashMap<String, u64>,
66    /// Number of requests dispatched per session.
67    pub sessions_requests_count: HashMap<String, u64>,
68    /// List of proxy addresses used during the crawl.
69    pub proxies: Vec<String>,
70    /// Count of log messages grouped by level.
71    pub log_levels_counter: HashMap<String, u64>,
72}
73
74impl CrawlStats {
75    /// Returns the wall-clock duration of the crawl in seconds, computed as
76    /// `end_time - start_time`. Both timestamps are Unix epoch seconds recorded
77    /// at the start and end of [`CrawlerEngine::crawl`](crate::spider::CrawlerEngine::crawl).
78    pub fn elapsed_seconds(&self) -> f64 {
79        self.end_time - self.start_time
80    }
81
82    /// Returns the average number of requests completed per second over the
83    /// entire crawl. Returns 0.0 if the crawl duration was zero (e.g., an
84    /// instant crawl with no network calls).
85    pub fn requests_per_second(&self) -> f64 {
86        let elapsed = self.elapsed_seconds();
87        if elapsed == 0.0 {
88            0.0
89        } else {
90            self.requests_count as f64 / elapsed
91        }
92    }
93
94    /// Increments the counter for the given HTTP status code. Status codes are
95    /// stored under keys like `"status_200"` or `"status_404"` in the
96    /// `response_status_count` map, making it easy to spot error patterns.
97    pub fn increment_status(&mut self, status: u16) {
98        let key = format!("status_{status}");
99        *self.response_status_count.entry(key).or_insert(0) += 1;
100    }
101
102    /// Adds `count` bytes to both the global `response_bytes` total and the
103    /// per-domain counter in `domains_response_bytes`. This is called by the
104    /// engine after every successful fetch so you can identify bandwidth-heavy
105    /// domains.
106    pub fn increment_response_bytes(&mut self, domain: &str, count: u64) {
107        self.response_bytes += count;
108        *self
109            .domains_response_bytes
110            .entry(domain.to_owned())
111            .or_insert(0) += count;
112    }
113
114    /// Increments the total `requests_count` and the per-session counter in
115    /// `sessions_requests_count`. The engine calls this before every fetch so
116    /// you can see how load is distributed across sessions.
117    pub fn increment_requests_count(&mut self, sid: &str) {
118        self.requests_count += 1;
119        *self
120            .sessions_requests_count
121            .entry(sid.to_owned())
122            .or_insert(0) += 1;
123    }
124}
125
126/// A collection of scraped JSON items with serialization helpers.
127///
128/// `ItemList` wraps a `Vec<serde_json::Value>` and adds convenience methods for
129/// writing the collected data to disk as JSON or JSON Lines. It implements
130/// `IntoIterator`, `Index`, and the standard `len` / `is_empty` API so you can
131/// treat it like a regular collection.
132#[derive(Debug, Default)]
133pub struct ItemList(Vec<serde_json::Value>);
134
135impl ItemList {
136    /// Creates an empty item list. This is equivalent to `ItemList::default()`
137    /// and is what the crawler engine uses at the start of every crawl run.
138    pub fn new() -> Self {
139        Self(Vec::new())
140    }
141
142    /// Appends a JSON item to the list. The engine calls this for every item
143    /// that passes through [`Spider::on_scraped_item`](crate::spider::Spider::on_scraped_item)
144    /// without being dropped.
145    pub fn push(&mut self, item: serde_json::Value) {
146        self.0.push(item);
147    }
148
149    /// Returns the number of items in the list.
150    pub fn len(&self) -> usize {
151        self.0.len()
152    }
153
154    /// Returns `true` if the list contains no items.
155    pub fn is_empty(&self) -> bool {
156        self.0.is_empty()
157    }
158
159    /// Returns an iterator over the items.
160    pub fn iter(&self) -> std::slice::Iter<'_, serde_json::Value> {
161        self.0.iter()
162    }
163
164    /// Writes all items to a JSON file at `path`, optionally pretty-printed.
165    /// Parent directories are created automatically if they do not exist. Pass
166    /// `indent: true` for human-readable output or `false` for compact output.
167    pub fn to_json(&self, path: &Path, indent: bool) -> std::io::Result<()> {
168        path.parent().map(std::fs::create_dir_all).transpose()?;
169        let data = match indent {
170            true => serde_json::to_vec_pretty(&self.0),
171            false => serde_json::to_vec(&self.0),
172        }
173        .unwrap_or_default();
174        std::fs::write(path, data)
175    }
176
177    /// Writes all items to a JSON Lines file (one JSON object per line).
178    /// This format is convenient for streaming ingestion into data pipelines
179    /// because each line is a self-contained JSON document. Parent directories
180    /// are created automatically.
181    pub fn to_jsonl(&self, path: &Path) -> std::io::Result<()> {
182        path.parent().map(std::fs::create_dir_all).transpose()?;
183        let content = self
184            .0
185            .iter()
186            .map(|item| serde_json::to_string(item).unwrap_or_default())
187            .collect::<Vec<_>>()
188            .join("\n");
189        std::fs::write(path, content)
190    }
191}
192
193impl IntoIterator for ItemList {
194    type Item = serde_json::Value;
195    type IntoIter = std::vec::IntoIter<serde_json::Value>;
196
197    fn into_iter(self) -> Self::IntoIter {
198        self.0.into_iter()
199    }
200}
201
202impl<'a> IntoIterator for &'a ItemList {
203    type Item = &'a serde_json::Value;
204    type IntoIter = std::slice::Iter<'a, serde_json::Value>;
205
206    fn into_iter(self) -> Self::IntoIter {
207        self.0.iter()
208    }
209}
210
211impl std::ops::Index<usize> for ItemList {
212    type Output = serde_json::Value;
213
214    fn index(&self, idx: usize) -> &Self::Output {
215        &self.0[idx]
216    }
217}
218
219/// The final output of a crawl run, bundling together statistics, scraped items,
220/// and a flag indicating whether the crawl ran to completion or was paused.
221///
222/// You obtain a `CrawlResult` by calling [`CrawlerEngine::crawl`](crate::spider::CrawlerEngine::crawl).
223/// If `paused` is `true`, the engine saved a checkpoint to disk and you can
224/// resume later by creating a new engine pointed at the same `crawldir`.
225pub struct CrawlResult {
226    /// The aggregate crawl statistics for the entire run, including request
227    /// counts, byte totals, cache hit/miss ratios, and per-domain breakdowns.
228    pub stats: CrawlStats,
229    /// The collected scraped items. Use [`ItemList::to_json`] or
230    /// [`ItemList::to_jsonl`] to persist them to disk.
231    pub items: ItemList,
232    /// Whether the crawl was paused (via [`CrawlerEngine::request_pause`](crate::spider::CrawlerEngine::request_pause))
233    /// rather than completing naturally. When `true`, a checkpoint was saved and
234    /// the crawl can be resumed.
235    pub paused: bool,
236}
237
238impl CrawlResult {
239    /// Returns `true` if the crawl ran to completion (was not paused). This is
240    /// the inverse of `self.paused` and exists as a convenience for readability
241    /// in conditional checks.
242    pub fn completed(&self) -> bool {
243        !self.paused
244    }
245
246    /// Returns the number of scraped items.
247    pub fn len(&self) -> usize {
248        self.items.len()
249    }
250
251    /// Returns `true` if no items were scraped.
252    pub fn is_empty(&self) -> bool {
253        self.items.is_empty()
254    }
255}