use sha2::{Sha256, Digest};
use std::collections::HashMap;
use std::sync::RwLock;
use url::Url;
/// A captured page body together with the metadata recorded alongside it.
///
/// All fields are public; the notes below describe the values the
/// constructors visible in this file assign to them.
#[derive(Debug, Clone)]
pub struct RawSnapshot {
/// URL the snapshot is associated with.
pub url: Url,
/// Page content as text. `from_response` decodes the raw body as UTF-8,
/// replacing invalid sequences.
pub html: String,
/// Header name/value pairs. `from_response` additionally inserts an
/// `x-status-code` entry carrying the numeric status code.
pub headers: HashMap<String, String>,
/// Encoding label; the constructors in this file always set `"utf-8"`.
pub encoding: String,
/// The constructors in this file always set this to `None`.
/// NOTE(review): presumably the on-the-wire body size when known — confirm
/// against whichever caller fills it in.
pub compressed_size: Option<u64>,
/// Byte length of the content: `html.len()` in `new`, the raw body length in
/// `from_response`.
pub decompressed_size: u64,
/// Set to the current UTC time when a constructor builds the value.
pub timestamp: chrono::DateTime<chrono::Utc>,
/// Lowercase hexadecimal SHA-256 digest of `html`, as computed by the
/// constructors.
pub hash: String,
}
impl RawSnapshot {
    /// Builds a snapshot from already-decoded HTML.
    ///
    /// The hash is the SHA-256 of `html`, `decompressed_size` is the UTF-8
    /// byte length of `html`, the encoding label is `"utf-8"`, and the
    /// timestamp is the current UTC time.
    pub fn new(url: Url, html: String, headers: HashMap<String, String>) -> Self {
        let hash = Self::compute_hash(&html);
        // usize -> u64 is lossless on every platform Rust supports.
        let decompressed_size = html.len() as u64;
        Self {
            url,
            html,
            headers,
            encoding: "utf-8".to_string(),
            compressed_size: None,
            decompressed_size,
            timestamp: chrono::Utc::now(),
            hash,
        }
    }

    /// Builds a snapshot from a raw response body.
    ///
    /// `body` is decoded as UTF-8; invalid sequences are replaced with
    /// U+FFFD. The hash is computed over the decoded text, while
    /// `decompressed_size` is the length of the raw `body` in bytes.
    /// `status_code` is recorded in the headers under `x-status-code`,
    /// overwriting any existing entry with that name.
    pub fn from_response(
        url: Url,
        status_code: u16,
        mut headers: HashMap<String, String>,
        body: Vec<u8>,
    ) -> Self {
        // Measured on the raw bytes: lossy replacement can change the length.
        let decompressed_size = body.len() as u64;
        // Valid UTF-8 is adopted without copying; only invalid input pays for
        // the lossy re-encoding, which yields the same text as decoding the
        // original bytes lossily.
        let html = String::from_utf8(body)
            .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned());
        let hash = Self::compute_hash(&html);
        headers.insert("x-status-code".to_string(), status_code.to_string());
        Self {
            url,
            html,
            headers,
            encoding: "utf-8".to_string(),
            compressed_size: None,
            decompressed_size,
            timestamp: chrono::Utc::now(),
            hash,
        }
    }

    /// Returns the lowercase hexadecimal SHA-256 digest of `content`.
    fn compute_hash(content: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(content.as_bytes());
        format!("{:x}", hasher.finalize())
    }

    /// Renders the snapshot as bytes: `Name: value` lines, a separator, then
    /// the HTML.
    ///
    /// The lines are `URL`, `Timestamp` (RFC 3339), `Hash` and `Encoding`,
    /// followed by every header sorted by name so that the same snapshot
    /// always serializes to the same bytes. Three newline characters follow
    /// the last line, and the HTML comes after them unchanged.
    ///
    /// Header names and values are written verbatim, without escaping.
    pub fn serialize(&self) -> Vec<u8> {
        /// Appends one `name: value` line to `out`.
        fn push_field(out: &mut String, name: &str, value: &str) {
            out.push_str(name);
            out.push_str(": ");
            out.push_str(value);
            out.push('\n');
        }

        // `HashMap` iteration order is unspecified, so sort for stable output.
        // Keys are unique, which makes an unstable sort deterministic here.
        let mut headers: Vec<(&String, &String)> = self.headers.iter().collect();
        headers.sort_unstable();

        // Rough size hint: the body dominates; the rest covers the preamble.
        let header_bytes: usize = headers
            .iter()
            .map(|(name, value)| name.len() + value.len() + 3)
            .sum();
        let mut output = String::with_capacity(self.html.len() + header_bytes + 256);

        push_field(&mut output, "URL", self.url.as_str());
        push_field(&mut output, "Timestamp", &self.timestamp.to_rfc3339());
        push_field(&mut output, "Hash", &self.hash);
        push_field(&mut output, "Encoding", &self.encoding);
        for (name, value) in headers {
            push_field(&mut output, name, value);
        }
        output.push_str("\n\n\n");
        output.push_str(&self.html);
        output.into_bytes()
    }
}
/// In-memory collection of `RawSnapshot`s, addressable by content hash or by
/// URL. Each map is guarded by its own `RwLock`.
pub struct SnapshotStore {
// Snapshot hash -> snapshot.
snapshots: RwLock<HashMap<String, RawSnapshot>>,
// URL string -> hash of the snapshot most recently stored for that URL.
url_index: RwLock<HashMap<String, String>>,
// Entry count at which `store` starts evicting existing snapshots before
// inserting a new one.
max_snapshots: usize,
}
impl Default for SnapshotStore {
    /// Creates a store with the default limit of 10,000 snapshots.
    fn default() -> Self {
        const DEFAULT_MAX_SNAPSHOTS: usize = 10_000;
        Self::new(DEFAULT_MAX_SNAPSHOTS)
    }
}
impl SnapshotStore {
    /// Creates an empty store that starts evicting once it holds
    /// `max_snapshots` entries.
    pub fn new(max_snapshots: usize) -> Self {
        Self {
            snapshots: RwLock::new(HashMap::new()),
            url_index: RwLock::new(HashMap::new()),
            max_snapshots,
        }
    }

    /// Stores `snapshot` under its hash, points its URL at that hash, and
    /// returns the hash.
    ///
    /// If the store is full and the hash is new, roughly a tenth of the
    /// capacity (always at least one entry) is evicted first. Victims are
    /// chosen in `HashMap` iteration order, i.e. arbitrarily, not by age.
    /// Index entries that pointed at evicted snapshots are dropped too.
    ///
    /// # Panics
    ///
    /// Panics if either internal lock is poisoned.
    pub fn store(&self, snapshot: RawSnapshot) -> String {
        let hash = snapshot.hash.clone();
        let url_key = snapshot.url.to_string();

        // Hold both write locks for the whole operation so the capacity
        // check, eviction and insertion are atomic with respect to other
        // writers. Lock order (snapshots, then url_index) matches `clear`.
        let mut snapshots = self.snapshots.write().unwrap();
        let mut url_index = self.url_index.write().unwrap();

        // Re-storing a known hash replaces in place and cannot grow the map,
        // so eviction is only needed for new content.
        if snapshots.len() >= self.max_snapshots && !snapshots.contains_key(&hash) {
            // `max_snapshots / 10` is zero for limits below ten; always make
            // room for at least one entry so the store stays bounded.
            let evict_count = (self.max_snapshots / 10).max(1);
            let victims: Vec<String> = snapshots.keys().take(evict_count).cloned().collect();
            for victim in &victims {
                snapshots.remove(victim);
            }
            // One pass over the index instead of one pass per evicted hash.
            url_index.retain(|_, indexed_hash| snapshots.contains_key(indexed_hash.as_str()));
        }

        snapshots.insert(hash.clone(), snapshot);
        url_index.insert(url_key, hash.clone());
        hash
    }

    /// Returns a clone of the snapshot stored under `hash`, if any.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn get_by_hash(&self, hash: &str) -> Option<RawSnapshot> {
        self.snapshots.read().unwrap().get(hash).cloned()
    }

    /// Returns a clone of the snapshot most recently stored for `url`, if it
    /// is still present.
    ///
    /// # Panics
    ///
    /// Panics if either internal lock is poisoned.
    pub fn get_by_url(&self, url: &Url) -> Option<RawSnapshot> {
        // The index guard is a temporary dropped at the end of this
        // statement, so the two locks are never held at the same time here.
        let hash = self.url_index.read().unwrap().get(url.as_str())?.clone();
        self.get_by_hash(&hash)
    }

    /// Reports whether a snapshot with this hash is stored.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn has_hash(&self, hash: &str) -> bool {
        self.snapshots.read().unwrap().contains_key(hash)
    }

    /// Number of snapshots currently stored.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn len(&self) -> usize {
        self.snapshots.read().unwrap().len()
    }

    /// Reports whether the store holds no snapshots.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn is_empty(&self) -> bool {
        self.snapshots.read().unwrap().is_empty()
    }

    /// Removes every snapshot and every URL index entry.
    ///
    /// # Panics
    ///
    /// Panics if either internal lock is poisoned.
    pub fn clear(&self) {
        // Take both locks before clearing so no writer can slip in between
        // and leave an index entry pointing at a removed snapshot.
        let mut snapshots = self.snapshots.write().unwrap();
        let mut url_index = self.url_index.write().unwrap();
        snapshots.clear();
        url_index.clear();
    }
}