scrapling_spider/result.rs
1//! Crawl output types: statistics, scraped items, and the final result.
2//!
3//! After a crawl completes (or is paused), the engine produces a [`CrawlResult`]
4//! containing:
5//!
6//! - [`CrawlStats`] -- counters for requests, responses, cache hits, blocked
7//! retries, bytes transferred, and more. Stats are also broken down per domain
8//! and per session.
9//! - [`ItemList`] -- the collected scraped items as JSON values, with convenience
10//! methods for serializing to `.json` or `.jsonl` files.
11//! - A `paused` flag indicating whether the crawl was interrupted by a pause
12//! signal rather than running to natural completion.
13//!
14//! `CrawlStats` is `Serialize`/`Deserialize` so you can persist it alongside
15//! your scraped data for post-crawl analysis.
16
17use std::collections::HashMap;
18use std::path::Path;
19
20use serde::{Deserialize, Serialize};
21
22/// Aggregate statistics collected during a crawl run.
23///
24/// The crawler engine populates this struct as it processes requests. After the
25/// crawl finishes, you can inspect it via [`CrawlerEngine::stats`](crate::spider::CrawlerEngine::stats)
26/// or from the returned [`CrawlResult`]. All counters start at zero and are
27/// incremented atomically during the crawl loop.
28#[derive(Debug, Clone, Default, Serialize, Deserialize)]
29pub struct CrawlStats {
30 /// Total number of requests dispatched.
31 pub requests_count: u64,
32 /// Maximum number of concurrent requests allowed.
33 pub concurrent_requests: u32,
34 /// Maximum number of concurrent requests per domain.
35 pub concurrent_requests_per_domain: u32,
36 /// Number of requests that failed with an error.
37 pub failed_requests_count: u64,
38 /// Number of requests rejected because their domain was not allowed.
39 pub offsite_requests_count: u64,
40 /// Number of requests blocked by robots.txt rules.
41 pub robots_disallowed_count: u64,
42 /// Number of responses served from the cache.
43 pub cache_hits: u64,
44 /// Number of responses that were not found in the cache.
45 pub cache_misses: u64,
46 /// Total bytes received across all responses.
47 pub response_bytes: u64,
48 /// Number of items successfully scraped.
49 pub items_scraped: u64,
50 /// Number of items dropped by the item pipeline.
51 pub items_dropped: u64,
52 /// Unix timestamp when the crawl started.
53 pub start_time: f64,
54 /// Unix timestamp when the crawl ended.
55 pub end_time: f64,
56 /// Configured delay in seconds between consecutive requests.
57 pub download_delay: f64,
58 /// Number of requests that received a blocked status code.
59 pub blocked_requests_count: u64,
60 /// User-defined custom statistics.
61 pub custom_stats: HashMap<String, serde_json::Value>,
62 /// Count of responses grouped by HTTP status code.
63 pub response_status_count: HashMap<String, u64>,
64 /// Total bytes received grouped by domain.
65 pub domains_response_bytes: HashMap<String, u64>,
66 /// Number of requests dispatched per session.
67 pub sessions_requests_count: HashMap<String, u64>,
68 /// List of proxy addresses used during the crawl.
69 pub proxies: Vec<String>,
70 /// Count of log messages grouped by level.
71 pub log_levels_counter: HashMap<String, u64>,
72}
73
74impl CrawlStats {
75 /// Returns the wall-clock duration of the crawl in seconds, computed as
76 /// `end_time - start_time`. Both timestamps are Unix epoch seconds recorded
77 /// at the start and end of [`CrawlerEngine::crawl`](crate::spider::CrawlerEngine::crawl).
78 pub fn elapsed_seconds(&self) -> f64 {
79 self.end_time - self.start_time
80 }
81
82 /// Returns the average number of requests completed per second over the
83 /// entire crawl. Returns 0.0 if the crawl duration was zero (e.g., an
84 /// instant crawl with no network calls).
85 pub fn requests_per_second(&self) -> f64 {
86 let elapsed = self.elapsed_seconds();
87 if elapsed == 0.0 {
88 0.0
89 } else {
90 self.requests_count as f64 / elapsed
91 }
92 }
93
94 /// Increments the counter for the given HTTP status code. Status codes are
95 /// stored under keys like `"status_200"` or `"status_404"` in the
96 /// `response_status_count` map, making it easy to spot error patterns.
97 pub fn increment_status(&mut self, status: u16) {
98 let key = format!("status_{status}");
99 *self.response_status_count.entry(key).or_insert(0) += 1;
100 }
101
102 /// Adds `count` bytes to both the global `response_bytes` total and the
103 /// per-domain counter in `domains_response_bytes`. This is called by the
104 /// engine after every successful fetch so you can identify bandwidth-heavy
105 /// domains.
106 pub fn increment_response_bytes(&mut self, domain: &str, count: u64) {
107 self.response_bytes += count;
108 *self
109 .domains_response_bytes
110 .entry(domain.to_owned())
111 .or_insert(0) += count;
112 }
113
114 /// Increments the total `requests_count` and the per-session counter in
115 /// `sessions_requests_count`. The engine calls this before every fetch so
116 /// you can see how load is distributed across sessions.
117 pub fn increment_requests_count(&mut self, sid: &str) {
118 self.requests_count += 1;
119 *self
120 .sessions_requests_count
121 .entry(sid.to_owned())
122 .or_insert(0) += 1;
123 }
124}
125
126/// A collection of scraped JSON items with serialization helpers.
127///
128/// `ItemList` wraps a `Vec<serde_json::Value>` and adds convenience methods for
129/// writing the collected data to disk as JSON or JSON Lines. It implements
130/// `IntoIterator`, `Index`, and the standard `len` / `is_empty` API so you can
131/// treat it like a regular collection.
132#[derive(Debug, Default)]
133pub struct ItemList(Vec<serde_json::Value>);
134
135impl ItemList {
136 /// Creates an empty item list. This is equivalent to `ItemList::default()`
137 /// and is what the crawler engine uses at the start of every crawl run.
138 pub fn new() -> Self {
139 Self(Vec::new())
140 }
141
142 /// Appends a JSON item to the list. The engine calls this for every item
143 /// that passes through [`Spider::on_scraped_item`](crate::spider::Spider::on_scraped_item)
144 /// without being dropped.
145 pub fn push(&mut self, item: serde_json::Value) {
146 self.0.push(item);
147 }
148
149 /// Returns the number of items in the list.
150 pub fn len(&self) -> usize {
151 self.0.len()
152 }
153
154 /// Returns `true` if the list contains no items.
155 pub fn is_empty(&self) -> bool {
156 self.0.is_empty()
157 }
158
159 /// Returns an iterator over the items.
160 pub fn iter(&self) -> std::slice::Iter<'_, serde_json::Value> {
161 self.0.iter()
162 }
163
164 /// Writes all items to a JSON file at `path`, optionally pretty-printed.
165 /// Parent directories are created automatically if they do not exist. Pass
166 /// `indent: true` for human-readable output or `false` for compact output.
167 pub fn to_json(&self, path: &Path, indent: bool) -> std::io::Result<()> {
168 path.parent().map(std::fs::create_dir_all).transpose()?;
169 let data = match indent {
170 true => serde_json::to_vec_pretty(&self.0),
171 false => serde_json::to_vec(&self.0),
172 }
173 .unwrap_or_default();
174 std::fs::write(path, data)
175 }
176
177 /// Writes all items to a JSON Lines file (one JSON object per line).
178 /// This format is convenient for streaming ingestion into data pipelines
179 /// because each line is a self-contained JSON document. Parent directories
180 /// are created automatically.
181 pub fn to_jsonl(&self, path: &Path) -> std::io::Result<()> {
182 path.parent().map(std::fs::create_dir_all).transpose()?;
183 let content = self
184 .0
185 .iter()
186 .map(|item| serde_json::to_string(item).unwrap_or_default())
187 .collect::<Vec<_>>()
188 .join("\n");
189 std::fs::write(path, content)
190 }
191}
192
193impl IntoIterator for ItemList {
194 type Item = serde_json::Value;
195 type IntoIter = std::vec::IntoIter<serde_json::Value>;
196
197 fn into_iter(self) -> Self::IntoIter {
198 self.0.into_iter()
199 }
200}
201
202impl<'a> IntoIterator for &'a ItemList {
203 type Item = &'a serde_json::Value;
204 type IntoIter = std::slice::Iter<'a, serde_json::Value>;
205
206 fn into_iter(self) -> Self::IntoIter {
207 self.0.iter()
208 }
209}
210
211impl std::ops::Index<usize> for ItemList {
212 type Output = serde_json::Value;
213
214 fn index(&self, idx: usize) -> &Self::Output {
215 &self.0[idx]
216 }
217}
218
219/// The final output of a crawl run, bundling together statistics, scraped items,
220/// and a flag indicating whether the crawl ran to completion or was paused.
221///
222/// You obtain a `CrawlResult` by calling [`CrawlerEngine::crawl`](crate::spider::CrawlerEngine::crawl).
223/// If `paused` is `true`, the engine saved a checkpoint to disk and you can
224/// resume later by creating a new engine pointed at the same `crawldir`.
225pub struct CrawlResult {
226 /// The aggregate crawl statistics for the entire run, including request
227 /// counts, byte totals, cache hit/miss ratios, and per-domain breakdowns.
228 pub stats: CrawlStats,
229 /// The collected scraped items. Use [`ItemList::to_json`] or
230 /// [`ItemList::to_jsonl`] to persist them to disk.
231 pub items: ItemList,
232 /// Whether the crawl was paused (via [`CrawlerEngine::request_pause`](crate::spider::CrawlerEngine::request_pause))
233 /// rather than completing naturally. When `true`, a checkpoint was saved and
234 /// the crawl can be resumed.
235 pub paused: bool,
236}
237
238impl CrawlResult {
239 /// Returns `true` if the crawl ran to completion (was not paused). This is
240 /// the inverse of `self.paused` and exists as a convenience for readability
241 /// in conditional checks.
242 pub fn completed(&self) -> bool {
243 !self.paused
244 }
245
246 /// Returns the number of scraped items.
247 pub fn len(&self) -> usize {
248 self.items.len()
249 }
250
251 /// Returns `true` if no items were scraped.
252 pub fn is_empty(&self) -> bool {
253 self.items.is_empty()
254 }
255}