fsblobstore/
lib.rs

//! An abstraction over a filesystem blob storage where each blob is
2//! named/key'd by its own hash.
3//!
4//! # Features
5//! | Feature     | Function
6//! |-------------|----------
7//! | `enumerate` | Enable method for enumerating all keys in storage.
8//! | `get-fname` | Enable method for acquiring the path of a blob.
9//! | `mkbasedir` | Auto-create the base directory in factory methods.
10//!
//! The use of the `enumerate` and `get-fname` features is discouraged since
12//! they may encourage breaking the intended usage pattern for `FsBlobStore`
13//! instances.
14
15#![cfg_attr(docsrs, feature(doc_cfg))]
16
17mod ch;
18mod err;
19
20use std::{
21  fs,
22  path::{Path, PathBuf}
23};
24
25#[cfg(feature = "enumerate")]
26use {
27  std::{path::Component, thread},
28  walkdir::WalkDir
29};
30
31use idbag::IdBagU32;
32
33use tmpfile::TmpProc;
34
35use sha2::{Digest, Sha256};
36
37pub use ch::ContentHash;
38pub use tmpfile::{self, TmpFile};
39
40pub use err::Error;
41
42
43/// Internal type used by the [`TmpFile`] to hash and move blobs into their
44/// final location.
45struct Hasher {
46  inner: Sha256,
47  _id: idbag::IdU32
48}
49
50impl TmpProc for Hasher {
51  type Output = ContentHash;
52  type Error = Error;
53
54  /// Called when a buffer is about to be written.
55  fn update(&mut self, buf: &[u8]) {
56    self.inner.update(buf);
57  }
58
59  fn finalize(
60    &mut self,
61    tmpfile: Option<&Path>
62  ) -> Result<(Self::Output, Option<PathBuf>), Self::Error> {
63    let result = self.inner.clone().finalize();
64    let hash = result.to_vec();
65
66    let fname = if let Some(tmpfile) = tmpfile {
67      let Some(basedir) = tmpfile.parent() else {
68        panic!("foo");
69      };
70
71      let hexhash = hex::encode(&hash);
72      let (subdir1, rest) = hexhash.split_at(2);
73      let (subdir2, fname) = rest.split_at(2);
74      let dir = basedir.join(subdir1).join(subdir2);
75      if !dir.exists() {
76        std::fs::create_dir_all(&dir)?;
77      }
78      Some(dir.join(fname))
79    } else {
80      None
81    };
82    Ok((ContentHash::from(hash), fname))
83  }
84}
85
86
87/// An abstraction over a blob storage in a file system directory.
88pub struct FsBlobStore {
89  basedir: PathBuf,
90
91  minsize: Option<usize>,
92
93  /// Used to allocate unique identifiers for naming temporary files.
94  idbag: IdBagU32
95}
96
97impl FsBlobStore {
98  fn fsparts(hexhash: &str) -> (&str, &str, &str) {
99    let (subdir1, rest) = hexhash.split_at(2);
100    let (subdir2, fname) = rest.split_at(2);
101
102    (subdir1, subdir2, fname)
103  }
104
105  fn relpathname(hash: &[u8]) -> PathBuf {
106    assert_eq!(hash.len(), 32);
107
108    let hexhash = hex::encode(hash);
109    let (subdir1, subdir2, fname) = Self::fsparts(&hexhash);
110    PathBuf::from(subdir1).join(subdir2).join(fname)
111  }
112
113  fn abspathname(&self, hash: &[u8]) -> PathBuf {
114    let p = Self::relpathname(hash);
115    self.basedir.join(p)
116  }
117}
118
119
120impl FsBlobStore {
121  /// Create a new file system-backed blob storage engine.
122  ///
123  /// The `basedir` is where the blobs and temporary files will be stored.  The
124  /// caller must ensure that either `basedir` is absolute, or that the path
125  /// remains valid throughout the object's lifetime.
126  ///
127  /// If the basedir does not exist, it will automatically be created if the
128  /// `mkbasedir` feature is enabled.
129  ///
130  /// # Errors
131  /// If `mkbasedir` feature is enabled, [`Error::IO`] indicates that the base
132  /// directory can not be created.
133  pub fn new(basedir: impl AsRef<Path>) -> Result<Self, Error> {
134    let basedir = basedir.as_ref();
135
136    #[cfg(feature = "mkbasedir")]
137    if !basedir.exists() {
138      fs::create_dir_all(basedir)?;
139    }
140
141    Ok(Self {
142      basedir: basedir.to_path_buf(),
143      minsize: None,
144      idbag: IdBagU32::new()
145    })
146  }
147
148  /// This function serves the purpose as [`FsBlobStore::new()`], but will
149  /// enable support for storing small files in memory, rather than be written
150  /// to disk.
151  ///
152  /// # Notes
153  /// If support for storing small files in memory is enabled, "files" that
154  /// will fall into this category will not actually be stored in the file
155  /// system, and thus will neither be enumerable or read.
156  ///
157  /// The calling application must maintain its own databasse for such cases.
158  #[allow(clippy::missing_errors_doc)]
159  pub fn with_minsize(
160    basedir: impl AsRef<Path>,
161    minsize: usize
162  ) -> Result<Self, Error> {
163    let basedir = basedir.as_ref();
164
165    #[cfg(feature = "mkbasedir")]
166    if !basedir.exists() {
167      fs::create_dir_all(basedir)?;
168    }
169
170    Ok(Self {
171      basedir: basedir.to_path_buf(),
172      minsize: Some(minsize),
173      idbag: IdBagU32::new()
174    })
175  }
176
177
178  /// Check if content for a hash exists in store.
179  ///
180  /// # Errors
181  /// [`Error::IO`] indicates that it was not possible to determine whether the
182  /// file exists.
183  pub fn have(&self, hash: &[u8]) -> Result<bool, std::io::Error> {
184    let fname = self.abspathname(hash);
185    fname.try_exists()
186  }
187
188  /// Get a reader for a blob.
189  ///
190  /// # Errors
191  /// [`Error::IO`] means the file could not be opened.
192  pub fn reader(
193    &self,
194    hash: &[u8]
195  ) -> Result<impl std::io::Read, std::io::Error> {
196    let fname = self.abspathname(hash);
197    fs::File::open(fname)
198  }
199
200  /// Return a [`TmpFile`] writer for writing to temporary file.
201  ///
202  /// If the caller wishes to keep the file it must call `TmpFile::persist()`.
203  /// Dropping the `TmpFile`, without persisting it, will remove the temporary
204  /// file.
205  ///
206  /// # Errors
207  /// `std::io::Error` indicates that the temporary file could not be created.
208  pub fn writer(&self) -> Result<TmpFile<ContentHash, Error>, std::io::Error> {
209    let id = self.idbag.alloc();
210    let tmpfname = format!("tmp-{:08x}", id.get());
211    let tp = Hasher {
212      inner: Sha256::new(),
213      _id: id
214    };
215    let tmpfname = self.basedir.join(tmpfname);
216    if let Some(minsize) = self.minsize {
217      TmpFile::with_minsize(tmpfname, Box::new(tp), minsize)
218    } else {
219      TmpFile::new(tmpfname, Box::new(tp))
220    }
221  }
222
223  /// Remove a blob, by its hash, from the blob store.
224  ///
225  /// # Errors
226  /// `std::io::Error` indicates the file could not be removed.
227  ///
228  /// # Panics
229  /// If the `hash` is not 32 bytes long this method will panic.
230  pub fn rm(&self, hash: &[u8]) -> Result<(), std::io::Error> {
231    let fname = self.abspathname(hash);
232
233    fs::remove_file(&fname)?;
234
235    let Some(subdir) = fname.parent() else {
236      panic!("Unexpectedly unable to get parent directory.");
237    };
238    let Ok(()) = fs::remove_dir(subdir) else {
239      // Assume there are other files in this directory
240      return Ok(());
241    };
242
243    let Some(subdir) = subdir.parent() else {
244      panic!("Unexpectedly unable to get parent directory.");
245    };
246    let Ok(()) = fs::remove_dir(subdir) else {
247      // Assume there are other directories in this directory
248      return Ok(());
249    };
250
251    Ok(())
252  }
253
254  /// Get a list of all hashes in the fs blob store.
255  ///
256  /// On success, returns an object that will stream the records in an
257  /// unspecified order.
258  ///
259  /// # Caveat
260  /// This method exists, despite it being incongruous with the overall
261  /// philosophy of the blob store.  The application should maintain a
262  /// separate database of the blob hashes stored in the `FsBlobStore`, and
263  /// enumerations of hashes should be performed in the database instead.
264  ///
265  /// Enumerating the `FsBlobStore` is potentially slow.  Its use should be
266  /// limited to infrequent integrity checks.
267  ///
268  /// This method will launch a background thread which lives as long as it
269  /// performs its work.  It is inadvisable to allow end users to trigger this
270  /// method to be run.
271  #[cfg(feature = "enumerate")]
272  #[cfg_attr(docsrs, doc(cfg(feature = "enumerate")))]
273  #[allow(clippy::missing_panics_doc)]
274  #[must_use]
275  pub fn enumerate(
276    &self
277  ) -> (recstrm::Receiver<ContentHash, ()>, thread::JoinHandle<()>) {
278    let (tx, rx) = recstrm::channel::<ContentHash, ()>(32, None);
279    let basedir = self.basedir.clone();
280    let jh = thread::spawn(move || {
281      // Send hashes in batches
282      let mut batch = Vec::with_capacity(16);
283      for entry in WalkDir::new(&basedir).into_iter().filter_map(Result::ok) {
284        // Only care about entries of depth 3 (<subdir1>/<subdir2>/<file>)
285        if entry.depth() != 3 {
286          continue;
287        }
288
289        // Only care about regular files
290        if !entry.file_type().is_file() {
291          continue;
292        }
293
294        // Strip base directory from path
295        let pth = entry.path();
296        // unwrap() should be okay, because path was constructed from basedir
297        let pth = pth.strip_prefix(&basedir).unwrap();
298
299        // Construct a string from path components
300        // Ignore any paths that have components that are not utf-8, and
301        // ignore components that aren't "normal".
302        let mut p = String::with_capacity(64);
303        for c in pth.components() {
304          match c {
305            Component::Normal(os) => {
306              let Some(s) = os.to_str() else {
307                // Not utf-8, ignore this
308                continue;
309              };
310              p.push_str(s);
311            }
312            _ => {
313              // Igmore this path because it contains unexpected component type
314              continue;
315            }
316          }
317        }
318
319        // Ignore anything that isn't 64 characters long.
320        // (256 bit hashes that are hex encoded are 64 characters long)
321        if p.len() != 64 {
322          continue;
323        }
324
325        // Ignore strings that aren't purely hex digits
326        if !p.chars().all(|c| c.is_ascii_hexdigit()) {
327          continue;
328        }
329
330        // unwrap() is okay, since the it should have been sufficiently
331        // validated above
332        let hash = hex::decode(p).unwrap();
333
334        batch.push(ContentHash::from(hash));
335        #[allow(clippy::iter_with_drain)]
336        if batch.len() >= 16 && tx.send_batch(batch.drain(..)).is_err() {
337          break;
338        }
339      }
340      if !batch.is_empty() {
341        let _ = tx.send_batch(batch.into_iter());
342      }
343    });
344
345    (rx, jh)
346  }
347
348  /// Get complete filename of an existing blob.
349  ///
350  /// Returns `Ok(PathBuf)` containing the path to the content, if it exists.
351  ///
352  /// # Caveat
353  /// The use of this method is strongly discouraged.  Use
354  /// `FsBlobStore::have()` to check if a blob exists in the datastore,
355  /// `FsBlobStore::reader()` to read a blob, and `FsBlobStore::rm()` to remove
356  /// a blob.
357  ///
358  /// # Errors
359  /// `std::io::Error` indicates the file doesn't exists or its metadata could
360  /// not be read.
361  #[cfg(feature = "get-fname")]
362  #[cfg_attr(docsrs, doc(cfg(feature = "get-fname")))]
363  pub fn get_fname(&self, hash: &[u8]) -> Result<PathBuf, std::io::Error> {
364    let fname = self.abspathname(hash);
365    fs::metadata(&fname)?;
366    Ok(fname)
367  }
368}
369
370// vim: set ft=rust et sw=2 ts=2 sts=2 cinoptions=2 tw=79 :