Skip to main content

scrapling_spider/
cache.rs

1//! Filesystem-based HTTP response cache for development mode.
2//!
3//! When [`Spider::development_mode`](crate::spider::Spider::development_mode)
4//! returns `true`, the engine creates a [`ResponseCacheManager`] that saves
5//! every fetched response to disk as a JSON file. On subsequent runs, cached
6//! responses are served directly without hitting the network, which dramatically
7//! speeds up iterative development of parse logic.
8//!
9//! Each cached response is stored as `<hex-fingerprint>.json` inside the cache
10//! directory. The response body is base64-encoded to keep the JSON valid
11//! regardless of the body's content type. Writes are atomic (temp file + rename)
12//! to prevent partial files.
13//!
14//! This cache is not intended for production use. It does not implement
15//! expiration, size limits, or cache-control header semantics.
16
17use std::collections::HashMap;
18use std::path::PathBuf;
19
20use bytes::Bytes;
21use serde::{Deserialize, Serialize};
22use tracing::{debug, warn};
23
24use scrapling_fetch::Response;
25
26use crate::error::{Result, SpiderError};
27
/// Manages a filesystem cache of HTTP responses, keyed by request fingerprint.
///
/// Each response is stored as a JSON file named after the hex-encoded
/// fingerprint of the request that produced it (documented upstream as a SHA-1
/// digest). The cache is only active when the spider has
/// [`development_mode`](crate::spider::Spider::development_mode) enabled;
/// production crawls skip the cache entirely.
///
/// The manager holds nothing but the cache directory path, so cloning it is
/// cheap and every clone operates on the same on-disk cache.
#[derive(Debug, Clone)]
pub struct ResponseCacheManager {
    /// Directory that holds the `<hex-fingerprint>.json` cache entries.
    cache_dir: PathBuf,
}
37
38#[derive(Serialize, Deserialize)]
39struct CachedResponse {
40    url: String,
41    status: u16,
42    reason: String,
43    encoding: String,
44    cookies: HashMap<String, String>,
45    headers: HashMap<String, String>,
46    request_headers: HashMap<String, String>,
47    method: String,
48    content_base64: String,
49}
50
51impl ResponseCacheManager {
52    /// Creates a new cache manager that stores entries in the given directory.
53    /// The directory is created lazily on the first [`put`](ResponseCacheManager::put)
54    /// call, so it does not need to exist at construction time.
55    pub fn new(cache_dir: impl Into<PathBuf>) -> Self {
56        Self {
57            cache_dir: cache_dir.into(),
58        }
59    }
60
61    fn cache_path(&self, fingerprint: &[u8]) -> PathBuf {
62        self.cache_dir
63            .join(format!("{}.json", hex::encode(fingerprint)))
64    }
65
66    /// Retrieves a cached response by its fingerprint, or `None` if not found.
67    ///
68    /// The method reads the JSON file, deserializes the cached fields, and
69    /// base64-decodes the response body. If any step fails (missing file,
70    /// corrupt JSON, invalid base64), a warning is logged and `None` is
71    /// returned so the engine falls through to a live fetch.
72    pub fn get(&self, fingerprint: &[u8]) -> Option<Response> {
73        use base64::Engine;
74
75        let path = self.cache_path(fingerprint);
76        let data = std::fs::read(&path)
77            .inspect_err(|e| warn!(error = %e, "failed to read cache file"))
78            .ok()?;
79
80        let cached: CachedResponse = serde_json::from_slice(&data)
81            .inspect_err(|e| warn!(error = %e, "failed to deserialize cache entry"))
82            .ok()?;
83
84        let body = base64::engine::general_purpose::STANDARD
85            .decode(&cached.content_base64)
86            .inspect_err(|e| warn!(error = %e, "failed to decode cached body"))
87            .ok()
88            .map(Bytes::from)?;
89
90        Some(Response::new(
91            &cached.url,
92            body,
93            cached.status,
94            Some(cached.reason),
95            cached.cookies,
96            cached.headers,
97            cached.request_headers,
98            cached.encoding,
99            cached.method,
100            Vec::new(),
101            HashMap::new(),
102        ))
103    }
104
105    /// Stores a response in the cache under the given fingerprint.
106    ///
107    /// The response body is base64-encoded and the entire entry is serialized
108    /// to JSON. The write is atomic: data goes to a temp file first, then is
109    /// renamed to the final path, so a crash mid-write cannot corrupt an
110    /// existing cache entry.
111    pub fn put(&self, fingerprint: &[u8], response: &Response, method: &str) -> Result<()> {
112        std::fs::create_dir_all(&self.cache_dir)
113            .map_err(|e| SpiderError::Other(format!("failed to create cache dir: {e}")))?;
114
115        use base64::Engine;
116        let content_base64 = base64::engine::general_purpose::STANDARD.encode(&response.body);
117
118        let cached = CachedResponse {
119            url: response.url().to_owned(),
120            status: response.status,
121            reason: response.reason.clone(),
122            encoding: response.encoding.clone(),
123            cookies: response.cookies.clone(),
124            headers: response.headers.clone(),
125            request_headers: response.request_headers.clone(),
126            method: method.to_owned(),
127            content_base64,
128        };
129
130        let temp_path = self.cache_dir.join(".cache.tmp");
131        let json = serde_json::to_vec(&cached)
132            .map_err(|e| SpiderError::Other(format!("cache serialization failed: {e}")))?;
133
134        std::fs::write(&temp_path, &json)
135            .map_err(|e| SpiderError::Other(format!("failed to write cache: {e}")))?;
136
137        let target = self.cache_path(fingerprint);
138        std::fs::rename(&temp_path, &target).map_err(|e| {
139            let _ = std::fs::remove_file(&temp_path);
140            SpiderError::Other(format!("failed to rename cache file: {e}"))
141        })?;
142
143        debug!("response cached");
144        Ok(())
145    }
146
147    /// Removes all cached response files (`.json`) from the cache directory.
148    /// Call this when you want to force a fresh crawl during development. Only
149    /// files with a `.json` extension are deleted; other files in the directory
150    /// are left untouched.
151    pub fn clear(&self) -> Result<()> {
152        if self.cache_dir.exists() {
153            for entry in std::fs::read_dir(&self.cache_dir)
154                .map_err(|e| SpiderError::Other(format!("failed to read cache dir: {e}")))?
155                .flatten()
156            {
157                if entry.path().extension().is_some_and(|e| e == "json") {
158                    let _ = std::fs::remove_file(entry.path());
159                }
160            }
161        }
162        Ok(())
163    }
164}