halldyll_core/storage/
snapshot.rs

//! Snapshot - Raw snapshot storage

3use sha2::{Sha256, Digest};
4use std::collections::HashMap;
5use std::sync::RwLock;
6use url::Url;
7
8
/// Raw, unprocessed snapshot of a fetched page.
///
/// Snapshots are identified by the SHA-256 hash of their HTML content
/// (see `hash`), which is also the primary key used by `SnapshotStore`.
#[derive(Debug, Clone)]
pub struct RawSnapshot {
    /// Source URL the page was fetched from
    pub url: Url,
    /// Raw HTML body, decoded to a `String`
    pub html: String,
    /// Raw HTTP response headers; `from_response` additionally stores the
    /// status code here under the synthetic key `x-status-code`
    pub headers: HashMap<String, String>,
    /// Encoding used to decode the body (both constructors set "utf-8")
    pub encoding: String,
    /// On-the-wire compressed size in bytes, if known (never set by the
    /// constructors in this file)
    pub compressed_size: Option<u64>,
    /// Decompressed size in bytes of the original body
    pub decompressed_size: u64,
    /// Capture timestamp (UTC, set at construction time)
    pub timestamp: chrono::DateTime<chrono::Utc>,
    /// SHA-256 hash of the HTML, rendered as lowercase hex
    pub hash: String,
}
29
30impl RawSnapshot {
31    /// Create a new snapshot from HTML string
32    pub fn new(url: Url, html: String, headers: HashMap<String, String>) -> Self {
33        let hash = Self::compute_hash(&html);
34        let decompressed_size = html.len() as u64;
35        
36        Self {
37            url,
38            html,
39            headers,
40            encoding: "utf-8".to_string(),
41            compressed_size: None,
42            decompressed_size,
43            timestamp: chrono::Utc::now(),
44            hash,
45        }
46    }
47
48    /// Create a new snapshot from raw response
49    pub fn from_response(
50        url: Url,
51        status_code: u16,
52        headers: HashMap<String, String>,
53        body: Vec<u8>,
54    ) -> Self {
55        let html = String::from_utf8_lossy(&body).to_string();
56        let hash = Self::compute_hash(&html);
57        let decompressed_size = body.len() as u64;
58
59        let mut snapshot_headers = headers;
60        snapshot_headers.insert("x-status-code".to_string(), status_code.to_string());
61
62        Self {
63            url,
64            html,
65            headers: snapshot_headers,
66            encoding: "utf-8".to_string(),
67            compressed_size: None,
68            decompressed_size,
69            timestamp: chrono::Utc::now(),
70            hash,
71        }
72    }
73
74    /// Compute SHA-256 hash
75    fn compute_hash(content: &str) -> String {
76        let mut hasher = Sha256::new();
77        hasher.update(content.as_bytes());
78        format!("{:x}", hasher.finalize())
79    }
80
81    /// Serialize to bytes
82    pub fn serialize(&self) -> Vec<u8> {
83        // Simple format: headers\n\n\nhtml
84        let mut output = String::new();
85        
86        // URL
87        output.push_str(&format!("URL: {}\n", self.url));
88        output.push_str(&format!("Timestamp: {}\n", self.timestamp.to_rfc3339()));
89        output.push_str(&format!("Hash: {}\n", self.hash));
90        output.push_str(&format!("Encoding: {}\n", self.encoding));
91        
92        // Headers
93        for (key, value) in &self.headers {
94            output.push_str(&format!("{}: {}\n", key, value));
95        }
96        
97        // Separator
98        output.push_str("\n\n\n");
99        
100        // HTML
101        output.push_str(&self.html);
102        
103        output.into_bytes()
104    }
105}
106
/// In-memory, thread-safe snapshot store with a soft size limit.
pub struct SnapshotStore {
    /// Snapshots keyed by their SHA-256 content hash
    snapshots: RwLock<HashMap<String, RawSnapshot>>,
    /// Index from URL string to the hash of the last snapshot stored for it
    url_index: RwLock<HashMap<String, String>>,
    /// Maximum number of snapshots kept before `store` starts evicting
    max_snapshots: usize,
}
116
117impl Default for SnapshotStore {
118    fn default() -> Self {
119        Self::new(10000)
120    }
121}
122
123impl SnapshotStore {
124    /// New store
125    pub fn new(max_snapshots: usize) -> Self {
126        Self {
127            snapshots: RwLock::new(HashMap::new()),
128            url_index: RwLock::new(HashMap::new()),
129            max_snapshots,
130        }
131    }
132
133    /// Store a snapshot
134    pub fn store(&self, snapshot: RawSnapshot) -> String {
135        let hash = snapshot.hash.clone();
136        let url_key = snapshot.url.to_string();
137
138        // Check the limit
139        {
140            let snapshots = self.snapshots.read().unwrap();
141            if snapshots.len() >= self.max_snapshots {
142                // Simple eviction (first in)
143                drop(snapshots);
144                let mut snapshots = self.snapshots.write().unwrap();
145                let mut url_index = self.url_index.write().unwrap();
146                
147                // Remove 10% of the oldest
148                let to_remove: Vec<_> = snapshots
149                    .keys()
150                    .take(self.max_snapshots / 10)
151                    .cloned()
152                    .collect();
153                for h in to_remove {
154                    snapshots.remove(&h);
155                    // Clean up the index
156                    url_index.retain(|_, v| v != &h);
157                }
158            }
159        }
160
161        // Store
162        self.snapshots.write().unwrap().insert(hash.clone(), snapshot);
163        self.url_index.write().unwrap().insert(url_key, hash.clone());
164
165        hash
166    }
167
168    /// Get by hash
169    pub fn get_by_hash(&self, hash: &str) -> Option<RawSnapshot> {
170        self.snapshots.read().unwrap().get(hash).cloned()
171    }
172
173    /// Get by URL
174    pub fn get_by_url(&self, url: &Url) -> Option<RawSnapshot> {
175        let url_key = url.to_string();
176        let hash = self.url_index.read().unwrap().get(&url_key)?.clone();
177        self.get_by_hash(&hash)
178    }
179
180    /// Check if a hash exists
181    pub fn has_hash(&self, hash: &str) -> bool {
182        self.snapshots.read().unwrap().contains_key(hash)
183    }
184
185    /// Number of snapshots
186    pub fn len(&self) -> usize {
187        self.snapshots.read().unwrap().len()
188    }
189
190    /// Is store empty?
191    pub fn is_empty(&self) -> bool {
192        self.snapshots.read().unwrap().is_empty()
193    }
194
195    /// Clear the store
196    pub fn clear(&self) {
197        self.snapshots.write().unwrap().clear();
198        self.url_index.write().unwrap().clear();
199    }
200}