Skip to main content

graphrefly_storage/
file.rs

//! Filesystem-backed kv backend (M4.C — DS-14-storage Audit 4).
//!
//! [`FileBackend`] maps each key to a `.bin` file under a configured directory.
//! Keys are percent-encoded so that any UTF-8 string can be stored safely:
//! characters in `[a-zA-Z0-9_-]` pass through unchanged; every other character
//! is UTF-8 encoded and each resulting byte is written as lowercase `%xx`. The
//! encoded filename for any given key is byte-identical to the one produced by
//! the TS `fileBackend` impl
//! ([`packages/pure-ts/src/extra/storage/tiers-node.ts`](https://github.com/graphrefly/graphrefly-ts/blob/main/packages/pure-ts/src/extra/storage/tiers-node.ts) — D159),
//! so a file written by TS can be loaded by a Rust reader on the same directory.
//!
//! Writes are atomic via [`tempfile::NamedTempFile::persist`]: a tempfile is
//! created in the target directory, written in full, then renamed onto the
//! key path. A partially written file is never visible at the final path,
//! even if the process crashes. The `NamedTempFile` `Drop` impl deletes any
//! tempfile that never made it through `persist` (this covers panics between
//! create and commit).
//!
//! `flush()` is a no-op — durability is handled per write, via the rename.
//! `read` / `delete` / `list` tolerate a missing directory or a missing key by
//! returning `Ok(None)` / `Ok(())` / `Ok(vec![])` respectively (D158).
//!
//! Cargo feature: gated behind `file` (default-on).
23
24use std::fs;
25use std::io;
26use std::io::Write as _;
27use std::path::{Path, PathBuf};
28use std::sync::Arc;
29
30use serde::{de::DeserializeOwned, Serialize};
31use tempfile::NamedTempFile;
32
33use crate::backend::StorageBackend;
34use crate::codec::{Codec, JsonCodec};
35use crate::error::StorageError;
36use crate::memory::{
37    append_log_storage, kv_storage, snapshot_storage, AppendLogStorage, AppendLogStorageOptions,
38    KvStorage, KvStorageOptions, SnapshotStorage, SnapshotStorageOptions,
39};
40
/// Suffix appended to every encoded key to form its on-disk filename.
/// `decode_filename_to_key` refuses any filename that does not end in it.
const FILE_SUFFIX: &str = ".bin";

/// Digit table for `%xx` escapes, indexed by nibble value. The digits are
/// lowercase on purpose: filenames must be byte-equal to those written by the
/// TS implementation, which gets lowercase hex from `Number.toString(16)`.
const HEX_LOWER: &[u8; 16] = b"0123456789abcdef";
49
50/// Filesystem-backed [`StorageBackend`].
51///
52/// One file per key under `dir`. Concurrent writers are safe at the
53/// per-key granularity (atomic rename via `tempfile`); concurrent writers
54/// to the SAME key race in unspecified-but-atomic fashion (last commit wins).
55///
56/// # Filesystem portability
57///
58/// Key→filename encoding preserves ASCII case: `Foo` and `foo` encode to
59/// `Foo.bin` and `foo.bin`. On case-insensitive filesystems (default macOS
60/// APFS, default Windows NTFS) these collide. graphrefly-internal keys
61/// (tier names, WAL frame paths) are case-consistent by construction, so
62/// the collision is only reachable with adversarial user-supplied keys.
63/// Lift documented in `porting-deferred.md` "M4.C `FileBackend`
64/// case-insensitive-filesystem key collision".
65///
66/// # Example
67///
68/// ```ignore
69/// use std::sync::Arc;
70/// use graphrefly_storage::{file_backend, snapshot_storage, SnapshotStorageOptions};
71///
72/// let backend = file_backend("./checkpoints");
73/// let tier = snapshot_storage(backend, SnapshotStorageOptions::<MyState, _>::default());
74/// tier.save(state).unwrap();
75/// ```
76#[derive(Debug)]
77pub struct FileBackend {
78    dir: PathBuf,
79    name: String,
80    include_hidden: bool,
81}
82
83impl FileBackend {
84    /// Construct a backend rooted at `dir`. The directory is created lazily on
85    /// first `write()` — `read` / `list` / `delete` tolerate its absence.
86    #[must_use]
87    pub fn new(dir: impl AsRef<Path>) -> Self {
88        let dir = dir.as_ref().to_path_buf();
89        let name = format!("file:{}", dir.display());
90        Self {
91            dir,
92            name,
93            include_hidden: false,
94        }
95    }
96
97    /// Override whether `list()` includes filenames beginning with `.` (D161).
98    ///
99    /// Default `false`: hidden filenames are skipped. This protects against
100    /// in-flight `tempfile::NamedTempFile` temp files (which are created with
101    /// a leading-`.` prefix) leaking into enumeration results during a
102    /// concurrent flush.
103    ///
104    /// Pass `true` if your application intentionally writes keys whose
105    /// percent-encoding produces a leading-`.` filename and you need them
106    /// visible in `list()`.
107    #[must_use]
108    pub fn with_include_hidden(mut self, include: bool) -> Self {
109        self.include_hidden = include;
110        self
111    }
112
113    /// Backend root directory.
114    #[must_use]
115    pub fn dir(&self) -> &Path {
116        &self.dir
117    }
118
119    /// Whether `list()` includes dot-prefixed filenames.
120    #[must_use]
121    pub fn include_hidden(&self) -> bool {
122        self.include_hidden
123    }
124
125    /// Per-key filesystem path (`<dir>/<encoded-key>.bin`).
126    fn path_for(&self, key: &str) -> PathBuf {
127        let mut filename = encode_key_to_filename(key);
128        filename.push_str(FILE_SUFFIX);
129        self.dir.join(filename)
130    }
131}
132
133/// Convenience constructor returning an `Arc<FileBackend>`. Use this when
134/// sharing a single backend across multiple tiers (the paired
135/// `{ snapshot, wal }` pattern from DS-14-storage §a). For non-default
136/// configuration use `Arc::new(FileBackend::new(dir).with_include_hidden(true))`.
137#[must_use]
138pub fn file_backend(dir: impl AsRef<Path>) -> Arc<FileBackend> {
139    Arc::new(FileBackend::new(dir))
140}
141
142impl StorageBackend for FileBackend {
143    fn name(&self) -> &str {
144        &self.name
145    }
146
147    fn read(&self, key: &str) -> Result<Option<Vec<u8>>, StorageError> {
148        match fs::read(self.path_for(key)) {
149            Ok(bytes) => Ok(Some(bytes)),
150            Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(None),
151            Err(e) => Err(io_error("read", &self.dir, e)),
152        }
153    }
154
155    fn write(&self, key: &str, bytes: &[u8]) -> Result<(), StorageError> {
156        fs::create_dir_all(&self.dir).map_err(|e| io_error("mkdir", &self.dir, e))?;
157        let target = self.path_for(key);
158        let mut tmp =
159            NamedTempFile::new_in(&self.dir).map_err(|e| io_error("tempfile", &self.dir, e))?;
160        tmp.write_all(bytes)
161            .map_err(|e| io_error("write tmp", &self.dir, e))?;
162        tmp.persist(&target)
163            .map_err(|e| io_error("rename", &self.dir, e.error))?;
164        Ok(())
165    }
166
167    fn delete(&self, key: &str) -> Result<(), StorageError> {
168        match fs::remove_file(self.path_for(key)) {
169            Ok(()) => Ok(()),
170            Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
171            Err(e) => Err(io_error("delete", &self.dir, e)),
172        }
173    }
174
175    fn list(&self, prefix: &str) -> Result<Vec<String>, StorageError> {
176        let entries = match fs::read_dir(&self.dir) {
177            Ok(e) => e,
178            Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(Vec::new()),
179            Err(e) => return Err(io_error("list", &self.dir, e)),
180        };
181        let mut keys = Vec::new();
182        for entry in entries {
183            let entry = entry.map_err(|e| io_error("list-entry", &self.dir, e))?;
184            let raw = entry.file_name();
185            let Some(name) = raw.to_str() else { continue };
186            if !self.include_hidden && name.starts_with('.') {
187                continue;
188            }
189            let Some(key) = decode_filename_to_key(name) else {
190                continue;
191            };
192            if !prefix.is_empty() && !key.starts_with(prefix) {
193                continue;
194            }
195            keys.push(key);
196        }
197        keys.sort();
198        Ok(keys)
199    }
200}
201
202fn io_error(op: &str, dir: &Path, source: io::Error) -> StorageError {
203    StorageError::BackendError {
204        message: format!("file backend {op} failed at {}: {source}", dir.display()),
205        source: Some(Box::new(source)),
206    }
207}
208
209/// Encode an arbitrary key to a safe filename stem.
210///
211/// `[a-zA-Z0-9_-]` pass through unencoded; everything else is UTF-8 encoded
212/// and each byte is formatted as lowercase `%xx`. Cross-impl byte-identical
213/// with TS [`pathFor`](https://github.com/graphrefly/graphrefly-ts/blob/main/packages/pure-ts/src/extra/storage/tiers-node.ts).
214fn encode_key_to_filename(key: &str) -> String {
215    let mut out = String::with_capacity(key.len());
216    let mut buf = [0u8; 4];
217    for ch in key.chars() {
218        if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
219            out.push(ch);
220            continue;
221        }
222        for &byte in ch.encode_utf8(&mut buf).as_bytes() {
223            out.push('%');
224            out.push(HEX_LOWER[(byte >> 4) as usize] as char);
225            out.push(HEX_LOWER[(byte & 0x0F) as usize] as char);
226        }
227    }
228    out
229}
230
231/// Inverse of [`encode_key_to_filename`].
232///
233/// Returns `None` when:
234/// - the filename does not end in `.bin`
235/// - the decoded byte sequence is not valid UTF-8
236/// - the filename contains non-ASCII characters outside `%xx` escapes
237///   (those can't have come from our encoder; matches TS behavior of treating
238///   such filenames as un-decodable)
239///
240/// Truncated (`abc%5`) or invalid-hex (`abc%5z`) escapes fall through to
241/// literal-byte semantics — matches the TS `keyFromFilename` regex-fallthrough
242/// branch.
243fn decode_filename_to_key(filename: &str) -> Option<String> {
244    let stem = filename.strip_suffix(FILE_SUFFIX)?;
245    let chars: Vec<char> = stem.chars().collect();
246    let mut bytes: Vec<u8> = Vec::with_capacity(chars.len());
247    let mut i = 0;
248    while i < chars.len() {
249        let ch = chars[i];
250        if ch == '%' && i + 2 < chars.len() {
251            if let (Some(hi), Some(lo)) = (nibble(chars[i + 1]), nibble(chars[i + 2])) {
252                bytes.push((hi << 4) | lo);
253                i += 3;
254                continue;
255            }
256        }
257        if !ch.is_ascii() {
258            return None;
259        }
260        bytes.push(ch as u8);
261        i += 1;
262    }
263    String::from_utf8(bytes).ok()
264}
265
/// Value of the hex digit `c`, or `None` when `c` is not a base-16 digit
/// (either letter case is accepted).
fn nibble(c: char) -> Option<u8> {
    let digit = c.to_digit(16)?;
    u8::try_from(digit).ok()
}
269
270// ── Convenience tier wrappers ───────────────────────────────────────────────
271
272/// Convenience: snapshot tier over a fresh file backend rooted at `dir`.
273/// Mirror of [`crate::memory_snapshot`] for filesystem persistence.
274#[must_use]
275pub fn file_snapshot<T, C>(
276    dir: impl AsRef<Path>,
277    opts: SnapshotStorageOptions<T, C>,
278) -> SnapshotStorage<FileBackend, T, C>
279where
280    T: Send + Sync + 'static,
281    C: Codec<T>,
282{
283    snapshot_storage(Arc::new(FileBackend::new(dir)), opts)
284}
285
286/// Convenience: snapshot tier over a fresh file backend with
287/// [`SnapshotStorageOptions::default`] + a `JsonCodec`.
288#[must_use]
289pub fn file_snapshot_default<T>(dir: impl AsRef<Path>) -> SnapshotStorage<FileBackend, T, JsonCodec>
290where
291    T: Serialize + DeserializeOwned + Send + Sync + 'static,
292{
293    file_snapshot(dir, SnapshotStorageOptions::default())
294}
295
296/// Convenience: append-log tier over a fresh file backend rooted at `dir`.
297#[must_use]
298pub fn file_append_log<T, C>(
299    dir: impl AsRef<Path>,
300    opts: AppendLogStorageOptions<T, C>,
301) -> AppendLogStorage<FileBackend, T, C>
302where
303    T: Serialize + DeserializeOwned + Clone + Send + Sync + 'static,
304    C: Codec<Vec<T>>,
305{
306    append_log_storage(Arc::new(FileBackend::new(dir)), opts)
307}
308
309/// Convenience: append-log tier over a fresh file backend with
310/// [`AppendLogStorageOptions::default`] + a `JsonCodec`.
311#[must_use]
312pub fn file_append_log_default<T>(
313    dir: impl AsRef<Path>,
314) -> AppendLogStorage<FileBackend, T, JsonCodec>
315where
316    T: Serialize + DeserializeOwned + Clone + Send + Sync + 'static,
317{
318    file_append_log(dir, AppendLogStorageOptions::default())
319}
320
321/// Convenience: kv tier over a fresh file backend rooted at `dir`.
322#[must_use]
323pub fn file_kv<T, C>(
324    dir: impl AsRef<Path>,
325    opts: KvStorageOptions<T, C>,
326) -> KvStorage<FileBackend, T, C>
327where
328    T: Send + Sync + 'static,
329    C: Codec<T>,
330{
331    kv_storage(Arc::new(FileBackend::new(dir)), opts)
332}
333
334/// Convenience: kv tier over a fresh file backend with
335/// [`KvStorageOptions::default`] + a `JsonCodec`.
336#[must_use]
337pub fn file_kv_default<T>(dir: impl AsRef<Path>) -> KvStorage<FileBackend, T, JsonCodec>
338where
339    T: Serialize + DeserializeOwned + Send + Sync + 'static,
340{
341    file_kv(dir, KvStorageOptions::default())
342}
343
#[cfg(test)]
mod tests {
    //! Unit tests for the key↔filename codec. Expected filenames are spelled
    //! out literally because they must stay byte-identical across impls.

    use super::*;

    #[test]
    fn encode_alphanumeric_passthrough() {
        assert_eq!(encode_key_to_filename("abcXYZ-_09"), "abcXYZ-_09");
    }

    #[test]
    fn encode_special_chars_percent_escape() {
        assert_eq!(
            encode_key_to_filename("app/with:slashes"),
            "app%2fwith%3aslashes"
        );
    }

    #[test]
    fn encode_escapes_percent_and_dot() {
        // `%` must itself be escaped or decoding would be ambiguous; `.` must
        // be escaped so a key can never produce a hidden (dot-prefixed) name
        // or clash with the `.bin` suffix.
        assert_eq!(encode_key_to_filename("100%"), "100%25");
        assert_eq!(encode_key_to_filename("%41"), "%2541");
        assert_eq!(encode_key_to_filename("a.b"), "a%2eb");
    }

    #[test]
    fn encode_non_ascii_two_byte_utf8() {
        // U+00E9 'é' = 0xC3 0xA9
        assert_eq!(encode_key_to_filename("café"), "caf%c3%a9");
    }

    #[test]
    fn encode_non_ascii_three_byte_utf8() {
        // U+20AC '€' = 0xE2 0x82 0xAC
        assert_eq!(encode_key_to_filename("€100"), "%e2%82%ac100");
    }

    #[test]
    fn encode_emoji_four_byte_utf8() {
        // U+1F44B 👋 = 0xF0 0x9F 0x91 0x8B
        assert_eq!(encode_key_to_filename("👋"), "%f0%9f%91%8b");
    }

    #[test]
    fn encode_empty_key() {
        assert_eq!(encode_key_to_filename(""), "");
    }

    #[test]
    fn decode_round_trip_covers_canonical_set() {
        for key in [
            "simple",
            "app/with:slashes",
            "café",
            "€100",
            "👋 hello",
            "a-b_c",
            "",
            // Keys that contain the escape character or the suffix itself.
            "100%",
            "%41",
            "a.b",
            "archive.bin",
        ] {
            let filename = format!("{}.bin", encode_key_to_filename(key));
            assert_eq!(
                decode_filename_to_key(&filename).as_deref(),
                Some(key),
                "round-trip failed for {key:?}",
            );
        }
    }

    #[test]
    fn decode_rejects_non_bin_suffix() {
        assert!(decode_filename_to_key("foo.txt").is_none());
        assert!(decode_filename_to_key("foo").is_none());
        assert!(decode_filename_to_key(".bin").is_some()); // empty stem decodes to ""
    }

    #[test]
    fn decode_truncated_percent_escape_treated_literally() {
        // Matches TS keyFromFilename: incomplete `%x` at end falls through to
        // ASCII branch — `abc%5` decodes to `abc%5`.
        assert_eq!(
            decode_filename_to_key("abc%5.bin").as_deref(),
            Some("abc%5")
        );
    }

    #[test]
    fn decode_invalid_hex_treated_literally() {
        // `%5z` fails the hex check, falls through to per-char ASCII bytes.
        assert_eq!(
            decode_filename_to_key("abc%5z.bin").as_deref(),
            Some("abc%5z")
        );
    }

    #[test]
    fn decode_uppercase_hex_accepted() {
        // TS regex is /[0-9a-f]{2}$/i (case-insensitive); Rust mirrors via
        // char::to_digit which accepts both cases.
        assert_eq!(
            decode_filename_to_key("caf%C3%A9.bin").as_deref(),
            Some("café")
        );
    }

    #[test]
    fn decode_rejects_non_ascii_outside_escapes() {
        // A filename containing a literal non-ASCII char (not `%xx`) cannot
        // have come from our encoder; treat as un-decodable.
        assert!(decode_filename_to_key("café.bin").is_none());
    }

    #[test]
    fn decode_rejects_invalid_utf8_bytes() {
        // Well-formed escapes whose bytes are not valid UTF-8: a lone 0xFF and
        // a two-byte sequence missing its continuation byte.
        assert!(decode_filename_to_key("%ff.bin").is_none());
        assert!(decode_filename_to_key("caf%c3.bin").is_none());
    }

    #[test]
    fn nibble_validates_hex_set() {
        for c in ['0', '5', '9', 'a', 'f', 'A', 'F'] {
            assert!(nibble(c).is_some(), "{c} should be a hex digit");
        }
        for c in ['g', 'G', '/', '@', '\u{00e9}'] {
            assert!(nibble(c).is_none(), "{c} should not be a hex digit");
        }
    }
}