Skip to main content

spider_lib/
checkpoint.rs

1use crate::request::Request;
2use dashmap::DashSet;
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5use std::collections::{HashMap, VecDeque};
6
7/// A snapshot of the scheduler's state.
8#[derive(Serialize, Deserialize, Default, Clone, Debug)]
9pub struct SchedulerCheckpoint {
10    /// The queue of pending requests.
11    pub request_queue: VecDeque<Request>,
12    /// The set of visited URL fingerprints.
13    pub visited_urls: DashSet<String>,
14}
15
16/// A complete checkpoint of the crawler's state.
17#[derive(Debug, Serialize, Deserialize, Default)]
18pub struct Checkpoint {
19    /// The state of the scheduler.
20    pub scheduler: SchedulerCheckpoint,
21    /// A map of pipeline states, keyed by pipeline name.
22    pub pipelines: HashMap<String, Value>,
23}