Skip to main content

scrapling_spider/
checkpoint.rs

1//! Pause/resume support via JSON checkpoint files.
2//!
3//! Long-running crawls can be interrupted and resumed without losing progress.
4//! The [`CheckpointManager`] periodically (or on demand) writes a
5//! [`CheckpointData`] snapshot to disk. The snapshot contains the URLs of all
6//! pending requests and the set of seen fingerprints, which is enough to
7//! reconstruct the scheduler's state on restart.
8//!
9//! Checkpoints are written atomically (write to a temp file, then rename) to
10//! prevent corruption if the process is killed mid-write. When the crawl
11//! completes normally, the checkpoint file is cleaned up automatically.
12//!
13//! The engine creates a `CheckpointManager` when you pass a `crawldir` path to
14//! [`CrawlerEngine::new`](crate::spider::CrawlerEngine::new). If no `crawldir`
15//! is provided, checkpointing is disabled entirely.
16
17use std::path::PathBuf;
18
19use serde::{Deserialize, Serialize};
20use tracing::{debug, info, warn};
21
22use crate::error::{Result, SpiderError};
23
/// The default filename used for checkpoint data on disk.
///
/// `CheckpointManager::new` joins this name onto the crawl directory to build
/// the full checkpoint path.
const CHECKPOINT_FILE: &str = "checkpoint.json";
26
27/// Serializable snapshot of crawler state for pause/resume support.
28///
29/// This struct is serialized to JSON and written to the crawl directory. On
30/// resume, the engine reads it back, re-enqueues the pending URLs, and
31/// repopulates the "seen" fingerprint set so that already-fetched pages are
32/// not fetched again.
33#[derive(Debug, Clone, Serialize, Deserialize, Default)]
34pub struct CheckpointData {
35    /// URLs of requests still pending in the scheduler's queue at the time the
36    /// checkpoint was taken. These are re-enqueued on resume.
37    pub request_urls: Vec<String>,
38    /// SHA-1 fingerprints of requests that have already been seen. Restoring
39    /// these into the scheduler prevents re-fetching pages that were completed
40    /// before the pause.
41    pub seen_fingerprints: Vec<Vec<u8>>,
42}
43
/// Handles saving and loading crawler checkpoints to disk.
///
/// You do not create this directly; the [`CrawlerEngine`](crate::spider::CrawlerEngine)
/// manages it internally. It writes to `<crawldir>/checkpoint.json` and
/// supports configurable save intervals so you can trade off between
/// checkpoint frequency and disk I/O overhead.
///
/// `Debug` is derived so the manager can be logged and embedded in other
/// `#[derive(Debug)]` types.
#[derive(Debug)]
pub struct CheckpointManager {
    /// Directory that holds the checkpoint file.
    crawldir: PathBuf,
    /// Full path of the checkpoint file (`crawldir` joined with the
    /// checkpoint file name).
    checkpoint_path: PathBuf,
    /// The minimum interval in seconds between automatic checkpoint saves.
    pub interval_secs: f64,
}
56
57impl CheckpointManager {
58    /// Creates a new checkpoint manager writing to the given directory.
59    ///
60    /// `interval_secs` controls how often the engine auto-saves during the
61    /// crawl loop. A value of 0.0 disables periodic saves (checkpoints are
62    /// still taken on pause). Returns an error if `interval_secs` is negative.
63    pub fn new(crawldir: impl Into<PathBuf>, interval_secs: f64) -> Result<Self> {
64        if interval_secs < 0.0 {
65            return Err(SpiderError::Checkpoint("interval must be >= 0".into()));
66        }
67        let crawldir = crawldir.into();
68        let checkpoint_path = crawldir.join(CHECKPOINT_FILE);
69        Ok(Self {
70            crawldir,
71            checkpoint_path,
72            interval_secs,
73        })
74    }
75
76    /// Returns `true` if a checkpoint file exists on disk. The engine checks
77    /// this at startup to decide whether to resume from a previous run or start
78    /// fresh.
79    pub fn has_checkpoint(&self) -> bool {
80        self.checkpoint_path.exists()
81    }
82
83    /// Atomically writes checkpoint data to disk using a temporary file and
84    /// rename. The two-step write ensures that a crash during serialization
85    /// cannot corrupt an existing checkpoint. The crawl directory is created
86    /// automatically if it does not exist.
87    pub fn save(&self, data: &CheckpointData) -> Result<()> {
88        std::fs::create_dir_all(&self.crawldir)
89            .map_err(|e| SpiderError::Checkpoint(format!("failed to create crawldir: {e}")))?;
90
91        let temp_path = self.crawldir.join(".checkpoint.tmp");
92        let json = serde_json::to_vec(data)
93            .map_err(|e| SpiderError::Checkpoint(format!("serialization failed: {e}")))?;
94
95        std::fs::write(&temp_path, &json)
96            .map_err(|e| SpiderError::Checkpoint(format!("failed to write checkpoint: {e}")))?;
97
98        std::fs::rename(&temp_path, &self.checkpoint_path).map_err(|e| {
99            let _ = std::fs::remove_file(&temp_path);
100            SpiderError::Checkpoint(format!("failed to rename checkpoint: {e}"))
101        })?;
102
103        debug!("checkpoint saved");
104        Ok(())
105    }
106
107    /// Loads checkpoint data from disk, returning `None` if no checkpoint
108    /// exists. If the file exists but cannot be deserialized (e.g., it was
109    /// corrupted), a warning is logged and `None` is returned rather than
110    /// propagating an error, so the crawl can start fresh.
111    pub fn load(&self) -> Result<Option<CheckpointData>> {
112        if !self.has_checkpoint() {
113            return Ok(None);
114        }
115
116        let data = std::fs::read(&self.checkpoint_path)
117            .map_err(|e| SpiderError::Checkpoint(format!("failed to read checkpoint: {e}")))?;
118
119        match serde_json::from_slice(&data) {
120            Ok(cp) => {
121                info!("checkpoint loaded");
122                Ok(Some(cp))
123            }
124            Err(e) => {
125                warn!(error = %e, "failed to deserialize checkpoint");
126                Ok(None)
127            }
128        }
129    }
130
131    /// Deletes the checkpoint file from disk if it exists. The engine calls
132    /// this after a crawl completes normally (not paused) to avoid accidentally
133    /// resuming a finished crawl on the next run.
134    pub fn cleanup(&self) -> Result<()> {
135        if self.checkpoint_path.exists() {
136            std::fs::remove_file(&self.checkpoint_path).map_err(|e| {
137                SpiderError::Checkpoint(format!("failed to cleanup checkpoint: {e}"))
138            })?;
139            debug!("checkpoint cleaned up");
140        }
141        Ok(())
142    }
143}