use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use tracing::{debug, info, warn};
use crate::error::{Result, SpiderError};
/// File name of the checkpoint; joined onto the crawl directory to form the checkpoint path.
const CHECKPOINT_FILE: &str = "checkpoint.json";
/// Serializable snapshot that `CheckpointManager` writes to and reads from the checkpoint file.
///
/// Serialized as JSON; `Default` yields two empty lists.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct CheckpointData {
/// URLs of the requests recorded in the checkpoint
/// (presumably the ones still pending at save time — confirm with the caller).
pub request_urls: Vec<String>,
/// Raw fingerprint bytes, one inner vector per fingerprint
/// (presumably of requests already seen — confirm with the caller).
pub seen_fingerprints: Vec<Vec<u8>>,
}
/// Saves, loads and removes the JSON checkpoint kept inside a crawl directory.
pub struct CheckpointManager {
/// Directory that holds the checkpoint file; created on demand by `save`.
crawldir: PathBuf,
/// Full path of the checkpoint file (`crawldir` joined with `CHECKPOINT_FILE`).
checkpoint_path: PathBuf,
/// Checkpoint interval in seconds; `new` rejects negative values.
/// Not read by the methods visible in this file — presumably used by the
/// caller to decide how often to call `save` (confirm with the caller).
pub interval_secs: f64,
}
impl CheckpointManager {
    /// Creates a manager whose checkpoint lives at `crawldir/checkpoint.json`.
    ///
    /// Nothing is touched on disk here; the directory is created lazily by
    /// [`Self::save`].
    ///
    /// # Errors
    /// Returns `SpiderError::Checkpoint` when `interval_secs` is negative or NaN.
    pub fn new(crawldir: impl Into<PathBuf>, interval_secs: f64) -> Result<Self> {
        // `NaN < 0.0` evaluates to false, so NaN has to be rejected explicitly.
        if interval_secs.is_nan() || interval_secs < 0.0 {
            return Err(SpiderError::Checkpoint("interval must be >= 0".into()));
        }
        let crawldir = crawldir.into();
        let checkpoint_path = crawldir.join(CHECKPOINT_FILE);
        Ok(Self {
            crawldir,
            checkpoint_path,
            interval_secs,
        })
    }

    /// Returns `true` when something exists at the checkpoint path.
    ///
    /// This is only a snapshot: the file may appear or vanish right after the
    /// call, so prefer [`Self::load`] over "check, then read".
    pub fn has_checkpoint(&self) -> bool {
        self.checkpoint_path.exists()
    }

    /// Atomically replaces the checkpoint file with the JSON form of `data`.
    ///
    /// The payload is written to a temporary file in the same directory,
    /// flushed to stable storage, and then renamed over the final path, so a
    /// reader never observes a half-written checkpoint.
    ///
    /// # Errors
    /// Returns `SpiderError::Checkpoint` when the directory cannot be created,
    /// serialization fails, or the temporary file cannot be written or renamed.
    /// The temporary file is removed (best effort) on write/rename failure.
    pub fn save(&self, data: &CheckpointData) -> Result<()> {
        std::fs::create_dir_all(&self.crawldir)
            .map_err(|e| SpiderError::Checkpoint(format!("failed to create crawldir: {e}")))?;
        let json = serde_json::to_vec(data)
            .map_err(|e| SpiderError::Checkpoint(format!("serialization failed: {e}")))?;

        let temp_path = self.crawldir.join(".checkpoint.tmp");
        if let Err(e) = Self::write_synced(&temp_path, &json) {
            // Do not leave a partial temp file behind; removal is best effort.
            let _ = std::fs::remove_file(&temp_path);
            return Err(SpiderError::Checkpoint(format!(
                "failed to write checkpoint: {e}"
            )));
        }
        if let Err(e) = std::fs::rename(&temp_path, &self.checkpoint_path) {
            let _ = std::fs::remove_file(&temp_path);
            return Err(SpiderError::Checkpoint(format!(
                "failed to rename checkpoint: {e}"
            )));
        }
        debug!("checkpoint saved");
        Ok(())
    }

    /// Writes `bytes` to `path` (truncating it) and syncs them to disk.
    ///
    /// Syncing before the rename keeps a crash from exposing a renamed but
    /// still empty or truncated checkpoint.
    fn write_synced(path: &std::path::Path, bytes: &[u8]) -> std::io::Result<()> {
        let mut file = std::fs::File::create(path)?;
        std::io::Write::write_all(&mut file, bytes)?;
        file.sync_all()
    }

    /// Loads the checkpoint, if there is a usable one.
    ///
    /// Returns `Ok(None)` when the file does not exist, and also when it exists
    /// but cannot be deserialized (a warning is logged and the checkpoint is
    /// treated as absent).
    ///
    /// # Errors
    /// Returns `SpiderError::Checkpoint` for any read failure other than
    /// "not found".
    pub fn load(&self) -> Result<Option<CheckpointData>> {
        // Read directly instead of "exists() then read": the file may be
        // removed in between, and `exists()` hides errors such as EACCES.
        let data = match std::fs::read(&self.checkpoint_path) {
            Ok(bytes) => bytes,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
            Err(e) => {
                return Err(SpiderError::Checkpoint(format!(
                    "failed to read checkpoint: {e}"
                )))
            }
        };
        match serde_json::from_slice(&data) {
            Ok(cp) => {
                info!("checkpoint loaded");
                Ok(Some(cp))
            }
            Err(e) => {
                // Deliberately lenient: an unreadable checkpoint is ignored.
                warn!(error = %e, "failed to deserialize checkpoint");
                Ok(None)
            }
        }
    }

    /// Deletes the checkpoint file; a missing file is not an error.
    ///
    /// # Errors
    /// Returns `SpiderError::Checkpoint` when the file exists but cannot be
    /// removed.
    pub fn cleanup(&self) -> Result<()> {
        // Remove directly and tolerate NotFound, avoiding a check-then-act race.
        match std::fs::remove_file(&self.checkpoint_path) {
            Ok(()) => {
                debug!("checkpoint cleaned up");
                Ok(())
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
            Err(e) => Err(SpiderError::Checkpoint(format!(
                "failed to cleanup checkpoint: {e}"
            ))),
        }
    }
}