scrapling_spider/
cache.rs1use std::collections::HashMap;
18use std::path::PathBuf;
19
20use bytes::Bytes;
21use serde::{Deserialize, Serialize};
22use tracing::{debug, warn};
23
24use scrapling_fetch::Response;
25
26use crate::error::{Result, SpiderError};
27
/// File-backed cache of fetched responses.
///
/// Each entry is a JSON file inside `cache_dir`, named after the hex-encoded
/// request fingerprint it belongs to.
#[derive(Debug, Clone)]
pub struct ResponseCacheManager {
    /// Directory that holds the cache entry files.
    cache_dir: PathBuf,
}
37
38#[derive(Serialize, Deserialize)]
39struct CachedResponse {
40 url: String,
41 status: u16,
42 reason: String,
43 encoding: String,
44 cookies: HashMap<String, String>,
45 headers: HashMap<String, String>,
46 request_headers: HashMap<String, String>,
47 method: String,
48 content_base64: String,
49}
50
51impl ResponseCacheManager {
52 pub fn new(cache_dir: impl Into<PathBuf>) -> Self {
56 Self {
57 cache_dir: cache_dir.into(),
58 }
59 }
60
61 fn cache_path(&self, fingerprint: &[u8]) -> PathBuf {
62 self.cache_dir
63 .join(format!("{}.json", hex::encode(fingerprint)))
64 }
65
66 pub fn get(&self, fingerprint: &[u8]) -> Option<Response> {
73 use base64::Engine;
74
75 let path = self.cache_path(fingerprint);
76 let data = std::fs::read(&path)
77 .inspect_err(|e| warn!(error = %e, "failed to read cache file"))
78 .ok()?;
79
80 let cached: CachedResponse = serde_json::from_slice(&data)
81 .inspect_err(|e| warn!(error = %e, "failed to deserialize cache entry"))
82 .ok()?;
83
84 let body = base64::engine::general_purpose::STANDARD
85 .decode(&cached.content_base64)
86 .inspect_err(|e| warn!(error = %e, "failed to decode cached body"))
87 .ok()
88 .map(Bytes::from)?;
89
90 Some(Response::new(
91 &cached.url,
92 body,
93 cached.status,
94 Some(cached.reason),
95 cached.cookies,
96 cached.headers,
97 cached.request_headers,
98 cached.encoding,
99 cached.method,
100 Vec::new(),
101 HashMap::new(),
102 ))
103 }
104
105 pub fn put(&self, fingerprint: &[u8], response: &Response, method: &str) -> Result<()> {
112 std::fs::create_dir_all(&self.cache_dir)
113 .map_err(|e| SpiderError::Other(format!("failed to create cache dir: {e}")))?;
114
115 use base64::Engine;
116 let content_base64 = base64::engine::general_purpose::STANDARD.encode(&response.body);
117
118 let cached = CachedResponse {
119 url: response.url().to_owned(),
120 status: response.status,
121 reason: response.reason.clone(),
122 encoding: response.encoding.clone(),
123 cookies: response.cookies.clone(),
124 headers: response.headers.clone(),
125 request_headers: response.request_headers.clone(),
126 method: method.to_owned(),
127 content_base64,
128 };
129
130 let temp_path = self.cache_dir.join(".cache.tmp");
131 let json = serde_json::to_vec(&cached)
132 .map_err(|e| SpiderError::Other(format!("cache serialization failed: {e}")))?;
133
134 std::fs::write(&temp_path, &json)
135 .map_err(|e| SpiderError::Other(format!("failed to write cache: {e}")))?;
136
137 let target = self.cache_path(fingerprint);
138 std::fs::rename(&temp_path, &target).map_err(|e| {
139 let _ = std::fs::remove_file(&temp_path);
140 SpiderError::Other(format!("failed to rename cache file: {e}"))
141 })?;
142
143 debug!("response cached");
144 Ok(())
145 }
146
147 pub fn clear(&self) -> Result<()> {
152 if self.cache_dir.exists() {
153 for entry in std::fs::read_dir(&self.cache_dir)
154 .map_err(|e| SpiderError::Other(format!("failed to read cache dir: {e}")))?
155 .flatten()
156 {
157 if entry.path().extension().is_some_and(|e| e == "json") {
158 let _ = std::fs::remove_file(entry.path());
159 }
160 }
161 }
162 Ok(())
163 }
164}