scrapling_spider/checkpoint.rs
1//! Pause/resume support via JSON checkpoint files.
2//!
3//! Long-running crawls can be interrupted and resumed without losing progress.
4//! The [`CheckpointManager`] periodically (or on demand) writes a
5//! [`CheckpointData`] snapshot to disk. The snapshot contains the URLs of all
6//! pending requests and the set of seen fingerprints, which is enough to
7//! reconstruct the scheduler's state on restart.
8//!
9//! Checkpoints are written atomically (write to a temp file, then rename) to
10//! prevent corruption if the process is killed mid-write. When the crawl
11//! completes normally, the checkpoint file is cleaned up automatically.
12//!
13//! The engine creates a `CheckpointManager` when you pass a `crawldir` path to
14//! [`CrawlerEngine::new`](crate::spider::CrawlerEngine::new). If no `crawldir`
15//! is provided, checkpointing is disabled entirely.
16
17use std::path::PathBuf;
18
19use serde::{Deserialize, Serialize};
20use tracing::{debug, info, warn};
21
22use crate::error::{Result, SpiderError};
23
/// The default filename used for checkpoint data on disk.
///
/// [`CheckpointManager::new`] joins this name onto the crawl directory to
/// build the full checkpoint path.
const CHECKPOINT_FILE: &str = "checkpoint.json";
26
27/// Serializable snapshot of crawler state for pause/resume support.
28///
29/// This struct is serialized to JSON and written to the crawl directory. On
30/// resume, the engine reads it back, re-enqueues the pending URLs, and
31/// repopulates the "seen" fingerprint set so that already-fetched pages are
32/// not fetched again.
33#[derive(Debug, Clone, Serialize, Deserialize, Default)]
34pub struct CheckpointData {
35 /// URLs of requests still pending in the scheduler's queue at the time the
36 /// checkpoint was taken. These are re-enqueued on resume.
37 pub request_urls: Vec<String>,
38 /// SHA-1 fingerprints of requests that have already been seen. Restoring
39 /// these into the scheduler prevents re-fetching pages that were completed
40 /// before the pause.
41 pub seen_fingerprints: Vec<Vec<u8>>,
42}
43
/// Handles saving and loading crawler checkpoints to disk.
///
/// You do not create this directly; the [`CrawlerEngine`](crate::spider::CrawlerEngine)
/// manages it internally. It writes to `<crawldir>/checkpoint.json` and
/// supports configurable save intervals so you can trade off between
/// checkpoint frequency and disk I/O overhead.
///
/// The type is `Debug` so it can appear in logs and in `Debug` output of
/// structs that embed it, and `Clone` because it only holds paths and a
/// number (cloning does not touch the filesystem).
#[derive(Debug, Clone)]
pub struct CheckpointManager {
    /// Directory that holds the checkpoint file (and the temporary file used
    /// while writing it).
    crawldir: PathBuf,
    /// Full path of the checkpoint file inside `crawldir`.
    checkpoint_path: PathBuf,
    /// The minimum interval in seconds between automatic checkpoint saves.
    pub interval_secs: f64,
}
56
57impl CheckpointManager {
58 /// Creates a new checkpoint manager writing to the given directory.
59 ///
60 /// `interval_secs` controls how often the engine auto-saves during the
61 /// crawl loop. A value of 0.0 disables periodic saves (checkpoints are
62 /// still taken on pause). Returns an error if `interval_secs` is negative.
63 pub fn new(crawldir: impl Into<PathBuf>, interval_secs: f64) -> Result<Self> {
64 if interval_secs < 0.0 {
65 return Err(SpiderError::Checkpoint("interval must be >= 0".into()));
66 }
67 let crawldir = crawldir.into();
68 let checkpoint_path = crawldir.join(CHECKPOINT_FILE);
69 Ok(Self {
70 crawldir,
71 checkpoint_path,
72 interval_secs,
73 })
74 }
75
76 /// Returns `true` if a checkpoint file exists on disk. The engine checks
77 /// this at startup to decide whether to resume from a previous run or start
78 /// fresh.
79 pub fn has_checkpoint(&self) -> bool {
80 self.checkpoint_path.exists()
81 }
82
83 /// Atomically writes checkpoint data to disk using a temporary file and
84 /// rename. The two-step write ensures that a crash during serialization
85 /// cannot corrupt an existing checkpoint. The crawl directory is created
86 /// automatically if it does not exist.
87 pub fn save(&self, data: &CheckpointData) -> Result<()> {
88 std::fs::create_dir_all(&self.crawldir)
89 .map_err(|e| SpiderError::Checkpoint(format!("failed to create crawldir: {e}")))?;
90
91 let temp_path = self.crawldir.join(".checkpoint.tmp");
92 let json = serde_json::to_vec(data)
93 .map_err(|e| SpiderError::Checkpoint(format!("serialization failed: {e}")))?;
94
95 std::fs::write(&temp_path, &json)
96 .map_err(|e| SpiderError::Checkpoint(format!("failed to write checkpoint: {e}")))?;
97
98 std::fs::rename(&temp_path, &self.checkpoint_path).map_err(|e| {
99 let _ = std::fs::remove_file(&temp_path);
100 SpiderError::Checkpoint(format!("failed to rename checkpoint: {e}"))
101 })?;
102
103 debug!("checkpoint saved");
104 Ok(())
105 }
106
107 /// Loads checkpoint data from disk, returning `None` if no checkpoint
108 /// exists. If the file exists but cannot be deserialized (e.g., it was
109 /// corrupted), a warning is logged and `None` is returned rather than
110 /// propagating an error, so the crawl can start fresh.
111 pub fn load(&self) -> Result<Option<CheckpointData>> {
112 if !self.has_checkpoint() {
113 return Ok(None);
114 }
115
116 let data = std::fs::read(&self.checkpoint_path)
117 .map_err(|e| SpiderError::Checkpoint(format!("failed to read checkpoint: {e}")))?;
118
119 match serde_json::from_slice(&data) {
120 Ok(cp) => {
121 info!("checkpoint loaded");
122 Ok(Some(cp))
123 }
124 Err(e) => {
125 warn!(error = %e, "failed to deserialize checkpoint");
126 Ok(None)
127 }
128 }
129 }
130
131 /// Deletes the checkpoint file from disk if it exists. The engine calls
132 /// this after a crawl completes normally (not paused) to avoid accidentally
133 /// resuming a finished crawl on the next run.
134 pub fn cleanup(&self) -> Result<()> {
135 if self.checkpoint_path.exists() {
136 std::fs::remove_file(&self.checkpoint_path).map_err(|e| {
137 SpiderError::Checkpoint(format!("failed to cleanup checkpoint: {e}"))
138 })?;
139 debug!("checkpoint cleaned up");
140 }
141 Ok(())
142 }
143}