fsblobstore/lib.rs
//! An abstraction over a filesystem blob storage where each blob is
//! named/key'd by its own hash.
3//!
4//! # Features
5//! | Feature | Function
6//! |-------------|----------
7//! | `enumerate` | Enable method for enumerating all keys in storage.
8//! | `get-fname` | Enable method for acquiring the path of a blob.
9//! | `mkbasedir` | Auto-create the base directory in factory methods.
10//!
//! The use of the `enumerate` and `get-fname` features is discouraged since
//! they may encourage breaking the intended usage pattern for `FsBlobStore`
//! instances.
14
15#![cfg_attr(docsrs, feature(doc_cfg))]
16
17mod ch;
18mod err;
19
20use std::{
21 fs,
22 path::{Path, PathBuf}
23};
24
25#[cfg(feature = "enumerate")]
26use {
27 std::{path::Component, thread},
28 walkdir::WalkDir
29};
30
31use idbag::IdBagU32;
32
33use tmpfile::TmpProc;
34
35use sha2::{Digest, Sha256};
36
37pub use ch::ContentHash;
38pub use tmpfile::{self, TmpFile};
39
40pub use err::Error;
41
42
/// Internal type used by the [`TmpFile`] to hash and move blobs into their
/// final location.
///
/// An instance is created for every writer handed out by
/// `FsBlobStore::writer()` and is passed to the `TmpFile` as its [`TmpProc`]
/// implementation.
struct Hasher {
  /// Running SHA-256 state; every buffer passed to `update()` is fed into it.
  inner: Sha256,

  /// Identifier the temporary file was named after.  It is never read.
  ///
  /// NOTE(review): presumably it is stored here so that the id remains
  /// allocated in the store's `IdBagU32` for as long as the hasher lives --
  /// confirm against the `idbag` documentation.
  _id: idbag::IdU32
}
49
50impl TmpProc for Hasher {
51 type Output = ContentHash;
52 type Error = Error;
53
54 /// Called when a buffer is about to be written.
55 fn update(&mut self, buf: &[u8]) {
56 self.inner.update(buf);
57 }
58
59 fn finalize(
60 &mut self,
61 tmpfile: Option<&Path>
62 ) -> Result<(Self::Output, Option<PathBuf>), Self::Error> {
63 let result = self.inner.clone().finalize();
64 let hash = result.to_vec();
65
66 let fname = if let Some(tmpfile) = tmpfile {
67 let Some(basedir) = tmpfile.parent() else {
68 panic!("foo");
69 };
70
71 let hexhash = hex::encode(&hash);
72 let (subdir1, rest) = hexhash.split_at(2);
73 let (subdir2, fname) = rest.split_at(2);
74 let dir = basedir.join(subdir1).join(subdir2);
75 if !dir.exists() {
76 std::fs::create_dir_all(&dir)?;
77 }
78 Some(dir.join(fname))
79 } else {
80 None
81 };
82 Ok((ContentHash::from(hash), fname))
83 }
84}
85
86
/// An abstraction over a blob storage in a file system directory.
///
/// Blobs are stored below the base directory as
/// `<basedir>/<hex[0..2]>/<hex[2..4]>/<hex[4..]>`, where `hex` is the
/// hex-encoded hash of the blob.
pub struct FsBlobStore {
  /// Directory below which blobs and temporary files are stored.
  basedir: PathBuf,

  /// When set, writers are created using `TmpFile::with_minsize()` with this
  /// value, rather than using `TmpFile::new()`.
  minsize: Option<usize>,

  /// Used to allocate unique identifiers for naming temporary files.
  idbag: IdBagU32
}
96
97impl FsBlobStore {
98 fn fsparts(hexhash: &str) -> (&str, &str, &str) {
99 let (subdir1, rest) = hexhash.split_at(2);
100 let (subdir2, fname) = rest.split_at(2);
101
102 (subdir1, subdir2, fname)
103 }
104
105 fn relpathname(hash: &[u8]) -> PathBuf {
106 assert_eq!(hash.len(), 32);
107
108 let hexhash = hex::encode(hash);
109 let (subdir1, subdir2, fname) = Self::fsparts(&hexhash);
110 PathBuf::from(subdir1).join(subdir2).join(fname)
111 }
112
113 fn abspathname(&self, hash: &[u8]) -> PathBuf {
114 let p = Self::relpathname(hash);
115 self.basedir.join(p)
116 }
117}
118
119
120impl FsBlobStore {
121 /// Create a new file system-backed blob storage engine.
122 ///
123 /// The `basedir` is where the blobs and temporary files will be stored. The
124 /// caller must ensure that either `basedir` is absolute, or that the path
125 /// remains valid throughout the object's lifetime.
126 ///
127 /// If the basedir does not exist, it will automatically be created if the
128 /// `mkbasedir` feature is enabled.
129 ///
130 /// # Errors
131 /// If `mkbasedir` feature is enabled, [`Error::IO`] indicates that the base
132 /// directory can not be created.
133 pub fn new(basedir: impl AsRef<Path>) -> Result<Self, Error> {
134 let basedir = basedir.as_ref();
135
136 #[cfg(feature = "mkbasedir")]
137 if !basedir.exists() {
138 fs::create_dir_all(basedir)?;
139 }
140
141 Ok(Self {
142 basedir: basedir.to_path_buf(),
143 minsize: None,
144 idbag: IdBagU32::new()
145 })
146 }
147
148 /// This function serves the purpose as [`FsBlobStore::new()`], but will
149 /// enable support for storing small files in memory, rather than be written
150 /// to disk.
151 ///
152 /// # Notes
153 /// If support for storing small files in memory is enabled, "files" that
154 /// will fall into this category will not actually be stored in the file
155 /// system, and thus will neither be enumerable or read.
156 ///
157 /// The calling application must maintain its own databasse for such cases.
158 #[allow(clippy::missing_errors_doc)]
159 pub fn with_minsize(
160 basedir: impl AsRef<Path>,
161 minsize: usize
162 ) -> Result<Self, Error> {
163 let basedir = basedir.as_ref();
164
165 #[cfg(feature = "mkbasedir")]
166 if !basedir.exists() {
167 fs::create_dir_all(basedir)?;
168 }
169
170 Ok(Self {
171 basedir: basedir.to_path_buf(),
172 minsize: Some(minsize),
173 idbag: IdBagU32::new()
174 })
175 }
176
177
178 /// Check if content for a hash exists in store.
179 ///
180 /// # Errors
181 /// [`Error::IO`] indicates that it was not possible to determine whether the
182 /// file exists.
183 pub fn have(&self, hash: &[u8]) -> Result<bool, std::io::Error> {
184 let fname = self.abspathname(hash);
185 fname.try_exists()
186 }
187
188 /// Get a reader for a blob.
189 ///
190 /// # Errors
191 /// [`Error::IO`] means the file could not be opened.
192 pub fn reader(
193 &self,
194 hash: &[u8]
195 ) -> Result<impl std::io::Read, std::io::Error> {
196 let fname = self.abspathname(hash);
197 fs::File::open(fname)
198 }
199
200 /// Return a [`TmpFile`] writer for writing to temporary file.
201 ///
202 /// If the caller wishes to keep the file it must call `TmpFile::persist()`.
203 /// Dropping the `TmpFile`, without persisting it, will remove the temporary
204 /// file.
205 ///
206 /// # Errors
207 /// `std::io::Error` indicates that the temporary file could not be created.
208 pub fn writer(&self) -> Result<TmpFile<ContentHash, Error>, std::io::Error> {
209 let id = self.idbag.alloc();
210 let tmpfname = format!("tmp-{:08x}", id.get());
211 let tp = Hasher {
212 inner: Sha256::new(),
213 _id: id
214 };
215 let tmpfname = self.basedir.join(tmpfname);
216 if let Some(minsize) = self.minsize {
217 TmpFile::with_minsize(tmpfname, Box::new(tp), minsize)
218 } else {
219 TmpFile::new(tmpfname, Box::new(tp))
220 }
221 }
222
223 /// Remove a blob, by its hash, from the blob store.
224 ///
225 /// # Errors
226 /// `std::io::Error` indicates the file could not be removed.
227 ///
228 /// # Panics
229 /// If the `hash` is not 32 bytes long this method will panic.
230 pub fn rm(&self, hash: &[u8]) -> Result<(), std::io::Error> {
231 let fname = self.abspathname(hash);
232
233 fs::remove_file(&fname)?;
234
235 let Some(subdir) = fname.parent() else {
236 panic!("Unexpectedly unable to get parent directory.");
237 };
238 let Ok(()) = fs::remove_dir(subdir) else {
239 // Assume there are other files in this directory
240 return Ok(());
241 };
242
243 let Some(subdir) = subdir.parent() else {
244 panic!("Unexpectedly unable to get parent directory.");
245 };
246 let Ok(()) = fs::remove_dir(subdir) else {
247 // Assume there are other directories in this directory
248 return Ok(());
249 };
250
251 Ok(())
252 }
253
254 /// Get a list of all hashes in the fs blob store.
255 ///
256 /// On success, returns an object that will stream the records in an
257 /// unspecified order.
258 ///
259 /// # Caveat
260 /// This method exists, despite it being incongruous with the overall
261 /// philosophy of the blob store. The application should maintain a
262 /// separate database of the blob hashes stored in the `FsBlobStore`, and
263 /// enumerations of hashes should be performed in the database instead.
264 ///
265 /// Enumerating the `FsBlobStore` is potentially slow. Its use should be
266 /// limited to infrequent integrity checks.
267 ///
268 /// This method will launch a background thread which lives as long as it
269 /// performs its work. It is inadvisable to allow end users to trigger this
270 /// method to be run.
271 #[cfg(feature = "enumerate")]
272 #[cfg_attr(docsrs, doc(cfg(feature = "enumerate")))]
273 #[allow(clippy::missing_panics_doc)]
274 #[must_use]
275 pub fn enumerate(
276 &self
277 ) -> (recstrm::Receiver<ContentHash, ()>, thread::JoinHandle<()>) {
278 let (tx, rx) = recstrm::channel::<ContentHash, ()>(32, None);
279 let basedir = self.basedir.clone();
280 let jh = thread::spawn(move || {
281 // Send hashes in batches
282 let mut batch = Vec::with_capacity(16);
283 for entry in WalkDir::new(&basedir).into_iter().filter_map(Result::ok) {
284 // Only care about entries of depth 3 (<subdir1>/<subdir2>/<file>)
285 if entry.depth() != 3 {
286 continue;
287 }
288
289 // Only care about regular files
290 if !entry.file_type().is_file() {
291 continue;
292 }
293
294 // Strip base directory from path
295 let pth = entry.path();
296 // unwrap() should be okay, because path was constructed from basedir
297 let pth = pth.strip_prefix(&basedir).unwrap();
298
299 // Construct a string from path components
300 // Ignore any paths that have components that are not utf-8, and
301 // ignore components that aren't "normal".
302 let mut p = String::with_capacity(64);
303 for c in pth.components() {
304 match c {
305 Component::Normal(os) => {
306 let Some(s) = os.to_str() else {
307 // Not utf-8, ignore this
308 continue;
309 };
310 p.push_str(s);
311 }
312 _ => {
313 // Igmore this path because it contains unexpected component type
314 continue;
315 }
316 }
317 }
318
319 // Ignore anything that isn't 64 characters long.
320 // (256 bit hashes that are hex encoded are 64 characters long)
321 if p.len() != 64 {
322 continue;
323 }
324
325 // Ignore strings that aren't purely hex digits
326 if !p.chars().all(|c| c.is_ascii_hexdigit()) {
327 continue;
328 }
329
330 // unwrap() is okay, since the it should have been sufficiently
331 // validated above
332 let hash = hex::decode(p).unwrap();
333
334 batch.push(ContentHash::from(hash));
335 #[allow(clippy::iter_with_drain)]
336 if batch.len() >= 16 && tx.send_batch(batch.drain(..)).is_err() {
337 break;
338 }
339 }
340 if !batch.is_empty() {
341 let _ = tx.send_batch(batch.into_iter());
342 }
343 });
344
345 (rx, jh)
346 }
347
348 /// Get complete filename of an existing blob.
349 ///
350 /// Returns `Ok(PathBuf)` containing the path to the content, if it exists.
351 ///
352 /// # Caveat
353 /// The use of this method is strongly discouraged. Use
354 /// `FsBlobStore::have()` to check if a blob exists in the datastore,
355 /// `FsBlobStore::reader()` to read a blob, and `FsBlobStore::rm()` to remove
356 /// a blob.
357 ///
358 /// # Errors
359 /// `std::io::Error` indicates the file doesn't exists or its metadata could
360 /// not be read.
361 #[cfg(feature = "get-fname")]
362 #[cfg_attr(docsrs, doc(cfg(feature = "get-fname")))]
363 pub fn get_fname(&self, hash: &[u8]) -> Result<PathBuf, std::io::Error> {
364 let fname = self.abspathname(hash);
365 fs::metadata(&fname)?;
366 Ok(fname)
367 }
368}
369
370// vim: set ft=rust et sw=2 ts=2 sts=2 cinoptions=2 tw=79 :