Skip to main content

hashiverse_server_lib/environment/
disk_environment_store.rs

1//! # Production disk [`EnvironmentStore`]
2//!
3//! Backing store used by the real server binary. Persists to a `fjall` keyspace for
//! metadata and a two-level directory tree (256² = 65,536 leaf directories, so ≈16 M bundles
//! fit before the average leaf directory grows past ~256 entries) for the bundle bodies. Writes go
6//! through a temp-file + rename dance so a mid-write crash never leaves a torn
7//! bundle, and feedback-update batches are atomic at the `fjall` level.
8//!
9//! Library choice rationale:
10//! - **fjall** over **redb** (redb compacts synchronously and blocks writes during
11//!   it, which we can't afford on a single-node server) and over **sled** (stalled
12//!   upstream; its v1.0 rewrite hasn't landed).
13//! - **postcard** for metadata serialisation — compact, fast, stable format, no
14//!   schema on disk.
15//!
16//! Feedback entries are stored under composite keys `(location_id, post_id,
17//! feedback_type)` so range iteration can walk all feedback for a given bundle
18//! location during decimation in a single sequential scan.
19
20use bytes::Bytes;
21use crate::environment::environment::{Environment, EnvironmentDimensions, EnvironmentFactory, PostBundleMetadata};
22use crate::environment::environment_store::EnvironmentStore;
23use anyhow::anyhow;
24use async_trait::async_trait;
25use fjall::{Database, Keyspace};
26use fs2::FileExt;
27use hashiverse_lib::tools::time::{TimeMillis, TimeMillisBytes};
28use hashiverse_lib::tools::types::{ID_BYTES, Id, SALT_BYTES, Salt, Pow};
29use log::{info, trace, warn};
30use std::collections::HashMap;
31use std::fs;
32use std::fs::OpenOptions;
33use std::path::PathBuf;
34use std::sync::Arc;
35use hashiverse_lib::protocol::posting::encoded_post_feedback::EncodedPostFeedbackV1;
36
37const MAX_ENVIRONMENTS_PER_NODE: usize = 256;
38
39pub struct DiskEnvironmentFactory {
40    base_path: String,
41}
42
43#[async_trait]
44impl EnvironmentFactory for DiskEnvironmentFactory {
45    fn new(base_path: &str) -> Self {
46        Self { base_path: base_path.to_string() }
47    }
48
49    async fn open_next_available(&self, environment_dimensions: EnvironmentDimensions) -> anyhow::Result<Environment> {
50        for env_id in 1..=MAX_ENVIRONMENTS_PER_NODE {
51            let disk_environment_store = DiskEnvironmentStore::new(&self.base_path, env_id);
52            match disk_environment_store {
53                Ok(disk_environment_store) => return Environment::new(Arc::new(disk_environment_store), environment_dimensions).await,
54                Err(_) => continue,
55            }
56        }
57
58        anyhow::bail!("no environments available")
59    }
60}
61
62/// DiskEnvironmentStore implements the `EnvironmentStore` trait for production use.  PostBundles are stored as files in a directory tree, while config, metadata, feedback, and post-expiry metainformation are stored in a key-value database.
63///
64/// A lot of time was spent comparing the different available databases.  Given the nature of hashiverse, it makes sense that some sort of key-value (KV) database is used.
65/// For simplicity of building and maintenance, we decided to use a pure-Rust implementation.  This narrowed teh field down to redb, sled, and fjall.
66/// We would have liked to use redb, because of its self-reported performance with individual writes and random range reads - both things that hashiverse does a lot of.
67/// Unfortunately it requires occasional compaction of the ever-growing files on disk, something that needs downtime and 2x disk space to do.  This essentially killed it for us.
68/// Next was sled, which autocompacts and seemed the next most performant, but there seems to be little improvement and support for it - it's own github repo says use this as beta software only, and work on its v1.0 release seems to have stalled.
69/// That leaves fjall - seemingly third place in performance, but gauging by its github activity, is ever improving and is highly supported.  It also does autocompaction.
70pub struct DiskEnvironmentStore {
71    path: PathBuf,
72    #[allow(dead_code)]
73    lock_file: fs::File, // Blocks other processes from claiming this environment on disk
74    database: Database,
75    keyspace_config: Keyspace,
76    keyspace_post_bundle_last_accessed: Keyspace, // location_id -> TimeMillis
77    keyspace_post_bundle_metadata: Keyspace,      // location_id -> PostBundleMetadata
78    keyspace_post_bundle_feedback: Keyspace,      // [location_id,post_id,feedback_type] -> [salt,pow]
79}
80
81impl DiskEnvironmentStore {
82    fn new(base_path: &str, env_id: usize) -> anyhow::Result<Self> {
83        let path = PathBuf::from(base_path).join(env_id.to_string());
84
85        // Check that the path exists
86        fs::create_dir_all(&path)?;
87
88        // Attempt to grab the lock
89        let lock_path = path.join("lock");
90        let lock_file = OpenOptions::new().create(true).truncate(true).read(true).write(true).open(&lock_path)?;
91        lock_file.try_lock_exclusive()?;
92
93        // Make sure the various stores exist
94        let database = Database::builder(path.join("database")).open()?;
95        let keyspace_config = database.keyspace("config", fjall::KeyspaceCreateOptions::default)?;
96        let keyspace_post_bundle_last_accessed = database.keyspace("post_bundle_last_accessed", fjall::KeyspaceCreateOptions::default)?;
97        let keyspace_post_bundle_metadata = database.keyspace("keyspace_post_bundle_metadata", fjall::KeyspaceCreateOptions::default)?;
98        let keyspace_post_bundle_feedback = database.keyspace("keyspace_post_bundle_feedback", fjall::KeyspaceCreateOptions::default)?;
99
100        info!("using environment {} at {}", env_id, path.to_str().unwrap());
101
102        Ok(Self {
103            path,
104            lock_file,
105            database,
106            keyspace_config,
107            keyspace_post_bundle_last_accessed,
108            keyspace_post_bundle_metadata,
109            keyspace_post_bundle_feedback,
110        })
111    }
112
113    fn path_for_location_id(&self, location_id: &Id) -> (PathBuf, PathBuf) {
114        // Two indirections of with 256 files in the folder allows 256^3 = 16million postbundles
115        // Assuming each is at tiniest 1kb, that implies about 16Gb of disk space, which is what we are aiming for.
116        // If we wanted to substantially increase the disk size supported by a hashiverse node, we might have to intro another level of indirection.
117        // Note that it is likely ext4 will also start running out of inodes if posts are too small - inodes on ext4 assume a 16kb file on average.
118        // Perhaps XFS is mandated?  Though the tradeoff is XFS is slower than ext4 when handling little files...
119        let b0 = format!("{:02x}", location_id.0[0]);
120        let b1 = format!("{:02x}", location_id.0[1]);
121
122        let directory = self.path.join("post_bundles").join(b0).join(b1);
123        let filename = directory.join(location_id.to_hex_str());
124
125        (directory, filename)
126    }
127}
128
129impl EnvironmentStore for DiskEnvironmentStore {
130    fn post_bundle_count(&self) -> anyhow::Result<usize> {
131        let len = self.keyspace_post_bundle_last_accessed.len()?;
132        Ok(len)
133    }
134
135    fn post_bundle_feedback_count(&self) -> anyhow::Result<usize> {
136        let len = self.keyspace_post_bundle_feedback.len()?;
137        Ok(len)
138    }
139    fn post_bundle_metadata_get(&self, location_id: &Id) -> anyhow::Result<Option<PostBundleMetadata>> {
140        let guard = self.keyspace_post_bundle_metadata.get(location_id)?;
141        match guard {
142            Some(guard) => Ok(Some(postcard::from_bytes(&guard)?)),
143            None => Ok(None),
144        }
145    }
146
147    fn post_bundle_metadata_put(&self, location_id: &Id, post_bundle_metadata: &PostBundleMetadata) -> anyhow::Result<()> {
148        // Much faster than allocating a Vec each time...
149        let mut scratch = [0u8; 64];
150        let scratch_used = postcard::to_slice(post_bundle_metadata, &mut scratch)?;
151        self.keyspace_post_bundle_metadata.insert(location_id.0, scratch_used.as_ref())?;
152        Ok(())
153    }
154
155    fn post_bundle_bytes_get(&self, location_id: &Id) -> anyhow::Result<Option<Bytes>> {
156        let (_directory, filename) = self.path_for_location_id(location_id);
157        let result = fs::read(filename).ok().map(Bytes::from);
158        Ok(result)
159    }
160
161    fn post_bundle_bytes_put(&self, location_id: &Id, bytes: &[u8]) -> anyhow::Result<()> {
162        let (directory, filename) = self.path_for_location_id(location_id);
163        let filename_temp = filename.with_added_extension("tmp");
164        fs::create_dir_all(&directory)?;
165        fs::write(&filename_temp, bytes)?;
166        fs::rename(&filename_temp, &filename)?;
167        Ok(())
168    }
169
170    fn post_bundle_feedbacks_bytes_get(&self, post_bundle_location_id: &Id) -> anyhow::Result<Bytes> {
171        let mut bytes = Vec::new();
172
173        self.keyspace_post_bundle_feedback.prefix(post_bundle_location_id).for_each(|guard| {
174            let try_result = try {
175                let (key, value) = guard.into_inner().map_err(|e| anyhow!("{}", e))?;
176                let post_bundle_feedback_key = PostBundleFeedbackKey::from_slice(&key)?;
177                let post_bundle_feedback_value = PostBundleFeedbackValue::from_slice(&value)?;
178                EncodedPostFeedbackV1::append_encode_direct_to_bytes(
179                    &mut bytes,
180                    post_bundle_feedback_key.post_id_bytes(),
181                    post_bundle_feedback_key.feedback_type(),
182                    post_bundle_feedback_value.salt_bytes(),
183                    post_bundle_feedback_value.pow(),
184                )?;
185            };
186
187            if let Err(e) = try_result {
188                warn!("unexpectedly unable to encode post bundle feedback: {}", e);
189            }
190        });
191
192        Ok(Bytes::from(bytes))
193    }
194
195
196    fn post_feedback_put_if_more_powerful(&self, location_id: &Id, encoded_post_feedback: &EncodedPostFeedbackV1) -> anyhow::Result<()> {
197        let post_bundle_feedback_key = PostBundleFeedbackKey::new(location_id, &encoded_post_feedback.post_id, encoded_post_feedback.feedback_type);
198
199        // Check that we don't already have much better feedback
200        let post_bundle_feedback_value = self.keyspace_post_bundle_feedback.get(&post_bundle_feedback_key)?;
201        if let Some(post_bundle_feedback_value) = post_bundle_feedback_value {
202            let post_bundle_feedback_value = PostBundleFeedbackValue::from_slice(&post_bundle_feedback_value)?;
203            if post_bundle_feedback_value.pow() >= encoded_post_feedback.pow {
204                trace!("Not storing lesser feedback for location_id={} with existing pow={}: feedback={:?}", location_id, post_bundle_feedback_value.pow(), encoded_post_feedback);
205                return Ok(());
206            }
207        }
208
209        // Commit the improved feedback
210        {
211            let post_bundle_feedback_value = PostBundleFeedbackValue::new(encoded_post_feedback.salt, encoded_post_feedback.pow);
212            self.keyspace_post_bundle_feedback.insert(post_bundle_feedback_key.0, post_bundle_feedback_value.0)?;
213        }
214
215        Ok(())
216    }
217
218    fn post_bundles_delete(&self, location_ids: &[Id]) -> anyhow::Result<()> {
219        let post_bundle_bytes_delete = |location_id: &Id| -> anyhow::Result<()> {
220            let (_directory, filename) = self.path_for_location_id(location_id);
221            let _result = fs::remove_file(filename);
222            Ok(())
223        };
224
225        // Collect feedback keys before opening the batch to avoid borrow conflicts
226        let mut feedback_keys_to_delete: Vec<Vec<u8>> = Vec::new();
227        for location_id in location_ids {
228            self.keyspace_post_bundle_feedback.prefix(location_id).for_each(|guard| {
229                let try_result: anyhow::Result<()> = try {
230                    let (key, _) = guard.into_inner().map_err(|e| anyhow!("{}", e))?;
231                    feedback_keys_to_delete.push(key.to_vec());
232                };
233                if let Err(e) = try_result {
234                    warn!("failed to collect feedback key for deletion: {}", e);
235                }
236            });
237        }
238
239        let mut batch = self.database.batch();
240        for location_id in location_ids {
241            batch.remove(&self.keyspace_post_bundle_metadata, location_id.0);
242            batch.remove(&self.keyspace_post_bundle_last_accessed, location_id.0);
243            post_bundle_bytes_delete(location_id)?;
244        }
245        for key in &feedback_keys_to_delete {
246            batch.remove(&self.keyspace_post_bundle_feedback, key.as_slice());
247        }
248        batch.commit()?;
249
250        Ok(())
251    }
252
253    fn post_bundles_last_accessed_flush(&self, post_bundles_last_accessed: &HashMap<Id, TimeMillis>) -> anyhow::Result<()> {
254        let mut batch = self.database.batch();
255        for (location_id, time_millis) in post_bundles_last_accessed.iter() {
256            let time_millis_bytes = time_millis.encode_be();
257            batch.insert(&self.keyspace_post_bundle_last_accessed, location_id.0, time_millis_bytes.0);
258        }
259        batch.commit()?;
260        Ok(())
261    }
262
263    fn post_bundles_last_accessed_iter(&self, location_id: &Id) -> Box<dyn Iterator<Item = Result<(Id, TimeMillisBytes), anyhow::Error>> + '_> {
264        let it = self
265            .keyspace_post_bundle_last_accessed
266            .range(location_id.to_string()..)
267            .chain(self.keyspace_post_bundle_last_accessed.range(..location_id.to_string()))
268            .map(|guard| {
269                let (location_id, time_millis_bytes) = guard.into_inner().map_err(|e| anyhow!("{}", e))?;
270                let location_id = Id::from_slice(&location_id)?;
271                let time_millis_bytes = TimeMillisBytes::from_bytes(&time_millis_bytes)?;
272                Ok((location_id, time_millis_bytes))
273            });
274
275        Box::new(it)
276    }
277
278    fn config_get_bytes(&self, key: &str) -> anyhow::Result<Option<Vec<u8>>> {
279        Ok(self.keyspace_config.get(key)?.map(|v| v.to_vec()))
280    }
281
282    fn config_put_bytes(&self, key: &str, v: Vec<u8>) -> anyhow::Result<()> {
283        self.keyspace_config.insert(key, v)?;
284        Ok(())
285    }
286}
287
288const POST_BUNDLE_FEEDBACK_KEY_SIZE: usize = ID_BYTES + ID_BYTES + 1;
289pub struct PostBundleFeedbackKey(pub [u8; POST_BUNDLE_FEEDBACK_KEY_SIZE]);
290
291impl PostBundleFeedbackKey {
292    pub fn new(location_id: &Id, post_id: &Id, feedback_type: u8) -> Self {
293        let mut bytes = [0u8; POST_BUNDLE_FEEDBACK_KEY_SIZE];
294        bytes[0..ID_BYTES].copy_from_slice(location_id.as_bytes());
295        bytes[ID_BYTES..2*ID_BYTES].copy_from_slice(post_id.as_bytes());
296        bytes[2*ID_BYTES] = feedback_type;
297        Self(bytes)
298    }
299
300    pub fn from_slice(bytes: &[u8]) -> anyhow::Result<Self> {
301        let bytes: [u8; POST_BUNDLE_FEEDBACK_KEY_SIZE] = bytes
302            .try_into()
303            .map_err(|_| anyhow::anyhow!("Invalid PostBundleFeedbackKey length: expected {}, got {}", POST_BUNDLE_FEEDBACK_KEY_SIZE, bytes.len()))?;
304        Ok(Self(bytes))
305    }
306
307    pub fn post_bundle_location_id_bytes(&self) -> &[u8] {
308        &self.0[0..ID_BYTES]
309    }
310    pub fn post_id_bytes(&self) -> &[u8] {
311        &self.0[ID_BYTES..2*ID_BYTES]
312    }
313
314    pub fn feedback_type(&self) -> u8 {
315        self.0[2*ID_BYTES]
316    }
317}
318
319impl AsRef<[u8]> for PostBundleFeedbackKey {
320    fn as_ref(&self) -> &[u8] {
321        &self.0
322    }
323}
324
325const POST_BUNDLE_FEEDBACK_VALUE_SIZE: usize = SALT_BYTES + 1;
326pub struct PostBundleFeedbackValue(pub [u8; POST_BUNDLE_FEEDBACK_VALUE_SIZE]);
327
328impl PostBundleFeedbackValue {
329    pub fn new(salt: Salt, pow: Pow) -> Self {
330        let mut bytes = [0u8; POST_BUNDLE_FEEDBACK_VALUE_SIZE];
331        bytes[0..SALT_BYTES].copy_from_slice(salt.as_slice());
332        bytes[SALT_BYTES] = pow.0;
333        Self(bytes)
334    }
335
336    pub fn from_slice(bytes: &[u8]) -> anyhow::Result<Self> {
337        let bytes: [u8; POST_BUNDLE_FEEDBACK_VALUE_SIZE] = bytes
338            .try_into()
339            .map_err(|_| anyhow::anyhow!("Invalid PostBundleFeedbackValue length: expected {}, got {}", POST_BUNDLE_FEEDBACK_VALUE_SIZE, bytes.len()))?;
340        Ok(Self(bytes))
341    }
342    pub fn salt_bytes(&self) -> &[u8] {
343        &self.0[0..SALT_BYTES]
344    }
345    pub fn pow(&self) -> Pow {
346        Pow(self.0[SALT_BYTES])
347    }
348}
349
#[cfg(test)]
mod tests {
    //! Runs the shared `EnvironmentStore` test suite against the disk-backed factory.

    use super::DiskEnvironmentFactory;
    use crate::environment::environment::tests as shared;

    #[tokio::test]
    async fn basics_test() -> anyhow::Result<()> {
        shared::basics_test::<DiskEnvironmentFactory>().await
    }

    #[tokio::test]
    async fn feedback_bytes_get_test() -> anyhow::Result<()> {
        shared::feedback_bytes_get_test::<DiskEnvironmentFactory>().await
    }

    #[tokio::test]
    async fn feedback_put_if_more_powerful_test() -> anyhow::Result<()> {
        shared::feedback_put_if_more_powerful_test::<DiskEnvironmentFactory>().await
    }

    // These are brutally expensive on disk - we rely on the memory stub for the decimation tests
    //
    // #[tokio::test]
    // async fn decimation_feedback_deleted_test() -> anyhow::Result<()> {
    //     shared::decimation_feedback_deleted_test::<DiskEnvironmentFactory>().await
    // }
    // #[tokio::test]
    // async fn decimation_test() -> anyhow::Result<()> {
    //     shared::decimation_test::<DiskEnvironmentFactory>().await
    // }
    // #[tokio::test]
    // async fn decimation_convergence_test() -> anyhow::Result<()> {
    //     shared::decimation_convergence_test::<DiskEnvironmentFactory>(2 * 1000).await
    // }
    // #[tokio::test]
    // async fn decimation_existence_test() -> anyhow::Result<()> { shared::decimation_existence_test::<DiskEnvironmentFactory>().await }
}