// fsblobstore 0.0.5
//
// A file-system backed blob storage abstraction.
// (See the module documentation below.)
//! An abstraction over a filesystem blob storage where each blob is
//! named/keyed by its own hash.
//!
//! # Features
//! | Feature     | Function
//! |-------------|----------
//! | `enumerate` | Enable method for enumerating all keys in storage.
//! | `get-fname` | Enable method for acquiring the path of a blob.
//! | `mkbasedir` | Auto-create the base directory in factory methods.
//!
//! The use of the `enumerate` and `get-fname` features is discouraged since
//! they may encourage breaking the intended usage pattern for `FsBlobStore`
//! instances.

#![cfg_attr(docsrs, feature(doc_cfg))]

mod ch;
mod err;

use std::{
  fs,
  path::{Path, PathBuf}
};

#[cfg(feature = "enumerate")]
use {
  std::{path::Component, thread},
  walkdir::WalkDir
};

use idbag::IdBagU32;

use tmpfile::TmpProc;

use sha2::{Digest, Sha256};

pub use ch::ContentHash;
pub use tmpfile::{self, TmpFile};

pub use err::Error;


/// Internal type used by the [`TmpFile`] to hash and move blobs into their
/// final location.
struct Hasher {
  /// Running SHA-256 state, fed incrementally as the blob is written.
  inner: Sha256,

  /// Id allocated from the store's [`IdBagU32`]; never read, held only so it
  /// remains reserved for the lifetime of the temporary file (presumably
  /// returned to the bag on drop -- see the `idbag` crate).
  _id: idbag::IdU32
}

impl TmpProc for Hasher {
  type Output = ContentHash;
  type Error = Error;

  /// Called when a buffer is about to be written.
  fn update(&mut self, buf: &[u8]) {
    self.inner.update(buf);
  }

  fn finalize(
    &mut self,
    tmpfile: Option<&Path>
  ) -> Result<(Self::Output, Option<PathBuf>), Self::Error> {
    let result = self.inner.clone().finalize();
    let hash = result.to_vec();

    let fname = if let Some(tmpfile) = tmpfile {
      let Some(basedir) = tmpfile.parent() else {
        panic!("foo");
      };

      let hexhash = hex::encode(&hash);
      let (subdir1, rest) = hexhash.split_at(2);
      let (subdir2, fname) = rest.split_at(2);
      let dir = basedir.join(subdir1).join(subdir2);
      if !dir.exists() {
        std::fs::create_dir_all(&dir)?;
      }
      Some(dir.join(fname))
    } else {
      None
    };
    Ok((ContentHash::from(hash), fname))
  }
}


/// An abstraction over a blob storage in a file system directory.
pub struct FsBlobStore {
  /// Base directory under which all blobs and temporary files are stored.
  basedir: PathBuf,

  /// If set, "files" smaller than this threshold are kept in memory by the
  /// [`TmpFile`] rather than written to disk (see
  /// [`FsBlobStore::with_minsize()`]).
  minsize: Option<usize>,

  /// Used to allocate unique identifiers for naming temporary files.
  idbag: IdBagU32
}

impl FsBlobStore {
  /// Split a hex-encoded hash into the three components of its on-disk
  /// layout: `<2 chars>/<2 chars>/<remaining chars>`.
  fn fsparts(hexhash: &str) -> (&str, &str, &str) {
    let (first, tail) = hexhash.split_at(2);
    let (second, leaf) = tail.split_at(2);

    (first, second, leaf)
  }

  /// Build a blob's storage path relative to the base directory.
  ///
  /// Panics unless `hash` is exactly 32 bytes (a SHA-256 digest).
  fn relpathname(hash: &[u8]) -> PathBuf {
    assert_eq!(hash.len(), 32);

    let hexhash = hex::encode(hash);
    let (dir1, dir2, leaf) = Self::fsparts(&hexhash);
    [dir1, dir2, leaf].iter().collect()
  }

  /// Build a blob's absolute storage path under the base directory.
  fn abspathname(&self, hash: &[u8]) -> PathBuf {
    self.basedir.join(Self::relpathname(hash))
  }
}


impl FsBlobStore {
  /// Create a new file system-backed blob storage engine.
  ///
  /// The `basedir` is where the blobs and temporary files will be stored.  The
  /// caller must ensure that either `basedir` is absolute, or that the path
  /// remains valid throughout the object's lifetime.
  ///
  /// If the basedir does not exist, it will automatically be created if the
  /// `mkbasedir` feature is enabled.
  ///
  /// # Errors
  /// If `mkbasedir` feature is enabled, [`Error::IO`] indicates that the base
  /// directory can not be created.
  pub fn new(basedir: impl AsRef<Path>) -> Result<Self, Error> {
    let basedir = basedir.as_ref();

    #[cfg(feature = "mkbasedir")]
    if !basedir.exists() {
      fs::create_dir_all(basedir)?;
    }

    Ok(Self {
      basedir: basedir.to_path_buf(),
      // No minimum size: every blob is written to disk.
      minsize: None,
      idbag: IdBagU32::new()
    })
  }

  /// This function serves the same purpose as [`FsBlobStore::new()`], but will
  /// enable support for storing small files in memory, rather than be written
  /// to disk.
  ///
  /// Blobs smaller than `minsize` bytes are kept in memory by the returned
  /// [`TmpFile`] writers rather than written to the file system.
  ///
  /// # Notes
  /// If support for storing small files in memory is enabled, "files" that
  /// will fall into this category will not actually be stored in the file
  /// system, and thus will neither be enumerable nor readable.
  ///
  /// The calling application must maintain its own database for such cases.
  #[allow(clippy::missing_errors_doc)]
  pub fn with_minsize(
    basedir: impl AsRef<Path>,
    minsize: usize
  ) -> Result<Self, Error> {
    let basedir = basedir.as_ref();

    // Same optional auto-creation of the base directory as in `new()`.
    #[cfg(feature = "mkbasedir")]
    if !basedir.exists() {
      fs::create_dir_all(basedir)?;
    }

    Ok(Self {
      basedir: basedir.to_path_buf(),
      minsize: Some(minsize),
      idbag: IdBagU32::new()
    })
  }


  /// Check if content for a hash exists in store.
  ///
  /// # Errors
  /// `std::io::Error` indicates that it was not possible to determine whether
  /// the file exists.
  ///
  /// # Panics
  /// Panics if `hash` is not 32 bytes long.
  pub fn have(&self, hash: &[u8]) -> Result<bool, std::io::Error> {
    let fname = self.abspathname(hash);
    fname.try_exists()
  }

  /// Get a reader for a blob.
  ///
  /// # Errors
  /// `std::io::Error` means the file could not be opened.
  ///
  /// # Panics
  /// Panics if `hash` is not 32 bytes long.
  pub fn reader(
    &self,
    hash: &[u8]
  ) -> Result<impl std::io::Read, std::io::Error> {
    let fname = self.abspathname(hash);
    fs::File::open(fname)
  }

  /// Return a [`TmpFile`] writer for writing to temporary file.
  ///
  /// If the caller wishes to keep the file it must call `TmpFile::persist()`.
  /// Dropping the `TmpFile`, without persisting it, will remove the temporary
  /// file.
  ///
  /// # Errors
  /// `std::io::Error` indicates that the temporary file could not be created.
  pub fn writer(&self) -> Result<TmpFile<ContentHash, Error>, std::io::Error> {
    // Allocate a unique id so concurrent writers get distinct temporary
    // file names ("tmp-<id as 8 hex digits>") in the base directory.
    let id = self.idbag.alloc();
    let tmpfname = format!("tmp-{:08x}", id.get());
    // The Hasher keeps the id alive for as long as the TmpFile exists.
    let tp = Hasher {
      inner: Sha256::new(),
      _id: id
    };
    let tmpfname = self.basedir.join(tmpfname);
    if let Some(minsize) = self.minsize {
      TmpFile::with_minsize(tmpfname, Box::new(tp), minsize)
    } else {
      TmpFile::new(tmpfname, Box::new(tp))
    }
  }

  /// Remove a blob, by its hash, from the blob store.
  ///
  /// After removing the blob, the two levels of hash-prefix subdirectories
  /// are opportunistically removed if they became empty.
  ///
  /// # Errors
  /// `std::io::Error` indicates the file could not be removed.
  ///
  /// # Panics
  /// If the `hash` is not 32 bytes long this method will panic.
  pub fn rm(&self, hash: &[u8]) -> Result<(), std::io::Error> {
    let fname = self.abspathname(hash);

    fs::remove_file(&fname)?;

    // Try to remove <basedir>/<subdir1>/<subdir2>.  remove_dir() fails on
    // non-empty directories, which is treated as "still in use", not an error.
    let Some(subdir) = fname.parent() else {
      panic!("Unexpectedly unable to get parent directory.");
    };
    let Ok(()) = fs::remove_dir(subdir) else {
      // Assume there are other files in this directory
      return Ok(());
    };

    // Then try to remove <basedir>/<subdir1> in the same manner.
    let Some(subdir) = subdir.parent() else {
      panic!("Unexpectedly unable to get parent directory.");
    };
    let Ok(()) = fs::remove_dir(subdir) else {
      // Assume there are other directories in this directory
      return Ok(());
    };

    Ok(())
  }

  /// Get a list of all hashes in the fs blob store.
  ///
  /// On success, returns an object that will stream the records in an
  /// unspecified order.
  ///
  /// # Caveat
  /// This method exists, despite it being incongruous with the overall
  /// philosophy of the blob store.  The application should maintain a
  /// separate database of the blob hashes stored in the `FsBlobStore`, and
  /// enumerations of hashes should be performed in the database instead.
  ///
  /// Enumerating the `FsBlobStore` is potentially slow.  Its use should be
  /// limited to infrequent integrity checks.
  ///
  /// This method will launch a background thread which lives as long as it
  /// performs its work.  It is inadvisable to allow end users to trigger this
  /// method to be run.
  #[cfg(feature = "enumerate")]
  #[cfg_attr(docsrs, doc(cfg(feature = "enumerate")))]
  #[allow(clippy::missing_panics_doc)]
  #[must_use]
  pub fn enumerate(
    &self
  ) -> (recstrm::Receiver<ContentHash, ()>, thread::JoinHandle<()>) {
    let (tx, rx) = recstrm::channel::<ContentHash, ()>(32, None);
    let basedir = self.basedir.clone();
    let jh = thread::spawn(move || {
      // Send hashes in batches
      let mut batch = Vec::with_capacity(16);
      for entry in WalkDir::new(&basedir).into_iter().filter_map(Result::ok) {
        // Only care about entries of depth 3 (<subdir1>/<subdir2>/<file>)
        if entry.depth() != 3 {
          continue;
        }

        // Only care about regular files
        if !entry.file_type().is_file() {
          continue;
        }

        // Strip base directory from path
        let pth = entry.path();
        // unwrap() should be okay, because path was constructed from basedir
        let pth = pth.strip_prefix(&basedir).unwrap();

        // Reassemble the hex hash by concatenating the path components
        // (<subdir1><subdir2><fname>).  Components that are not valid utf-8,
        // or are not "normal" components, are skipped; paths containing such
        // components are expected to fail the length check below.
        let mut p = String::with_capacity(64);
        for c in pth.components() {
          match c {
            Component::Normal(os) => {
              let Some(s) = os.to_str() else {
                // Not utf-8, ignore this
                continue;
              };
              p.push_str(s);
            }
            _ => {
              // Ignore this path because it contains unexpected component type
              continue;
            }
          }
        }

        // Ignore anything that isn't 64 characters long.
        // (256 bit hashes that are hex encoded are 64 characters long)
        if p.len() != 64 {
          continue;
        }

        // Ignore strings that aren't purely hex digits
        if !p.chars().all(|c| c.is_ascii_hexdigit()) {
          continue;
        }

        // unwrap() is okay, since the it should have been sufficiently
        // validated above
        let hash = hex::decode(p).unwrap();

        // Flush a full batch; a send error means the receiver is gone, so
        // stop enumerating early.
        batch.push(ContentHash::from(hash));
        #[allow(clippy::iter_with_drain)]
        if batch.len() >= 16 && tx.send_batch(batch.drain(..)).is_err() {
          break;
        }
      }
      // Flush any remaining partial batch; the receiver may already be gone.
      if !batch.is_empty() {
        let _ = tx.send_batch(batch.into_iter());
      }
    });

    (rx, jh)
  }

  /// Get complete filename of an existing blob.
  ///
  /// Returns `Ok(PathBuf)` containing the path to the content, if it exists.
  ///
  /// # Caveat
  /// The use of this method is strongly discouraged.  Use
  /// `FsBlobStore::have()` to check if a blob exists in the datastore,
  /// `FsBlobStore::reader()` to read a blob, and `FsBlobStore::rm()` to remove
  /// a blob.
  ///
  /// # Errors
  /// `std::io::Error` indicates the file doesn't exist or its metadata could
  /// not be read.
  #[cfg(feature = "get-fname")]
  #[cfg_attr(docsrs, doc(cfg(feature = "get-fname")))]
  pub fn get_fname(&self, hash: &[u8]) -> Result<PathBuf, std::io::Error> {
    let fname = self.abspathname(hash);
    // Probe metadata so a missing/unreadable file surfaces as an error.
    fs::metadata(&fname)?;
    Ok(fname)
  }
}

// vim: set ft=rust et sw=2 ts=2 sts=2 cinoptions=2 tw=79 :