// crev_recursive_digest/lib.rs

1//! Recursive digest for filesystem path content
2use std::marker::PhantomData;
3use std::{
4    collections::HashSet,
5    ffi::OsStr,
6    fs,
7    io::BufRead,
8    path::{Path, PathBuf},
9};
10use thiserror::Error;
11
12/// Re-export `walkdir`
13pub use walkdir;
14
15/// Read file content into a `digest::Digest`
16fn read_file_to_digest_input(path: &Path, input: &mut impl digest::Digest) -> std::io::Result<()> {
17    let file = fs::File::open(path)?;
18
19    let mut reader = std::io::BufReader::new(file);
20
21    loop {
22        let length = {
23            let buffer = reader.fill_buf()?;
24            input.update(buffer);
25            buffer.len()
26        };
27        if length == 0 {
28            break;
29        }
30        reader.consume(length);
31    }
32
33    Ok(())
34}
35
36#[derive(Debug, Error)]
37pub enum DigestError {
38    #[error("could not convert OsStr string to utf8")]
39    OsStrConversionError,
40    #[error("io Error: {}", _0)]
41    IoError(std::io::Error),
42    #[error("walkdir Error: {}", _0)]
43    WalkdirError(walkdir::Error),
44    #[error("an entry that was supposed to be a file, contains sub-entries")]
45    FileWithSubentriesError,
46    #[error("file not supported: {}", _0)]
47    FileNotSupported(String),
48}
49
50impl From<std::io::Error> for DigestError {
51    fn from(err: std::io::Error) -> Self {
52        DigestError::IoError(err)
53    }
54}
55
56impl From<walkdir::Error> for DigestError {
57    fn from(err: walkdir::Error) -> Self {
58        DigestError::WalkdirError(err)
59    }
60}
61
/// Handle passed to the user to add optional path data
pub struct AdditionalDataWriter<'a, D> {
    // Destination hasher the extra bytes are fed into
    hasher: &'a mut D,
    // Whether any bytes were written yet; the first non-empty write is
    // prefixed with a single 0 byte (see `input`)
    used: bool,
}
67
68impl<'a, D> AdditionalDataWriter<'a, D>
69where
70    D: digest::Digest,
71{
72    pub fn input(&mut self, bytes: &[u8]) {
73        if !bytes.is_empty() {
74            if !self.used {
75                self.hasher.update([0]);
76                self.used = true;
77            }
78            self.hasher.update(bytes);
79        }
80    }
81}
82
/// Builder for [`RecursiveDigest`]
///
/// Obtained from [`RecursiveDigest::new`]; configure with `filter` /
/// `additional_data`, then call `build`.
pub struct RecursiveDigestBuilder<Digest, FFilter, FAData> {
    filter: FFilter,
    additional_data: FAData,
    // `PhantomData` is imported at the top of the file; use it bare for
    // consistency with the `RecursiveDigest` struct below.
    digest: PhantomData<Digest>,
}
88
89impl<D, FFilter, FAData> RecursiveDigestBuilder<D, FFilter, FAData>
90where
91    FFilter: Fn(&walkdir::DirEntry) -> bool,
92    FAData: Fn(&walkdir::DirEntry, &mut AdditionalDataWriter<'_, D>) -> Result<(), DigestError>,
93{
94    /// Set filter function just like [`walkdir::IntoIterator::filter_entry`]
95    pub fn filter<F: Fn(&walkdir::DirEntry) -> bool>(
96        self,
97        filter: F,
98    ) -> RecursiveDigestBuilder<D, F, FAData> {
99        RecursiveDigestBuilder {
100            filter,
101            additional_data: self.additional_data,
102            digest: self.digest,
103        }
104    }
105
106    pub fn additional_data<
107        F: Fn(&walkdir::DirEntry, &mut AdditionalDataWriter<'_, D>) -> Result<(), DigestError>,
108    >(
109        self,
110        f: F,
111    ) -> RecursiveDigestBuilder<D, FFilter, F> {
112        RecursiveDigestBuilder {
113            filter: self.filter,
114            additional_data: f,
115            digest: self.digest,
116        }
117    }
118
119    pub fn build(self) -> RecursiveDigest<D, FFilter, FAData> {
120        RecursiveDigest {
121            digest: self.digest,
122            filter: self.filter,
123            additional_data: self.additional_data,
124        }
125    }
126}
127
/// Recursive Digest
///
/// Can calculate a recursive digest for a path
pub struct RecursiveDigest<Digest, FFilter, FAData> {
    // Entry filter (see `RecursiveDigestBuilder::filter`)
    filter: FFilter,
    // Optional per-entry extra data callback
    additional_data: FAData,
    // The digest algorithm is a type parameter only; nothing is stored
    digest: PhantomData<Digest>,
}
136
137impl<D>
138    RecursiveDigest<
139        D,
140        Box<dyn Fn(&walkdir::DirEntry) -> bool>,
141        Box<
142            dyn Fn(&walkdir::DirEntry, &mut AdditionalDataWriter<'_, D>) -> Result<(), DigestError>,
143        >,
144    >
145where
146    D: digest::Digest + digest::FixedOutput,
147{
148    /// Create `RecursiveDigest` by configuring `RecursiveDigestBuilder`
149    #[must_use]
150    pub fn new() -> RecursiveDigestBuilder<
151        D,
152        Box<dyn Fn(&walkdir::DirEntry) -> bool>,
153        Box<
154            dyn Fn(&walkdir::DirEntry, &mut AdditionalDataWriter<'_, D>) -> Result<(), DigestError>,
155        >,
156    > {
157        RecursiveDigestBuilder {
158            filter: Box::new(|_| true),
159            additional_data: Box::new(|_, _| Ok(())),
160            digest: PhantomData,
161        }
162    }
163}
164
165#[cfg(unix)]
166fn hash_osstr<D: digest::Digest>(digest: &mut D, s: &OsStr) {
167    use std::os::unix::ffi::OsStrExt;
168    digest.update(s.as_bytes());
169}
170
/// Feed an `OsStr` into `digest` on non-Unix platforms.
///
/// NOTE(review): `to_string_lossy` maps invalid sequences to U+FFFD,
/// so two distinct names could hash identically here — confirm this
/// is acceptable for non-Unix targets.
#[cfg(not(unix))]
fn hash_osstr<D: digest::Digest>(digest: &mut D, s: &OsStr) {
    let text = s.to_string_lossy();
    digest.update(text.as_bytes());
}
175
176impl<D, FFilter, FAData> RecursiveDigest<D, FFilter, FAData>
177where
178    FFilter: Fn(&walkdir::DirEntry) -> bool,
179    FAData: Fn(&walkdir::DirEntry, &mut AdditionalDataWriter<'_, D>) -> Result<(), DigestError>,
180    D: digest::Digest + digest::FixedOutput,
181{
182    pub fn get_digest_of(&self, root_path: &Path) -> Result<Vec<u8>, DigestError> {
183        let mut hashers = vec![];
184
185        // pop the top hasher and output it to the one just above it
186        fn flush_up_one_level<D: digest::Digest + digest::FixedOutput>(hashers: &mut Vec<D>) {
187            let hasher = hashers.pop().expect("must not be empty yet");
188            let h2 = hashers
189                .last_mut()
190                .expect("must not happen");
191            <D as digest::Digest>::update(h2, hasher.finalize_fixed().as_slice());
192        }
193
194        let base_depth = root_path.components().count();
195
196        let mut first = true;
197        for entry in walkdir::WalkDir::new(root_path)
198            .follow_links(false)
199            .sort_by(|a, b| a.path().cmp(b.path()))
200            .into_iter()
201            .filter_entry(|entry| {
202                // can't skip the `root_path`
203                if first {
204                    debug_assert_eq!(root_path, entry.path());
205                    first = false;
206                    return true;
207                }
208
209                (self.filter)(entry)
210            })
211        {
212            let entry = entry?;
213            let entry_depth = entry.path().components().count();
214
215            debug_assert!(base_depth <= entry_depth);
216            let depth = entry_depth - base_depth;
217            let hasher_size_required = depth + 1;
218
219            // we finished with (potentially multiple levels of) recursive content
220            // in the previous iterations, now:
221            // we flush it upwards, and replace the top one with a fresh one
222            while hasher_size_required <= hashers.len() {
223                flush_up_one_level(&mut hashers);
224            }
225            hashers.push(D::new());
226
227            debug_assert_eq!(hashers.len(), hasher_size_required);
228
229            let file_type = entry.file_type();
230
231            // top level directory includes only content, no name or additional_data
232            // names and additional data go the hasher above the one we just prepared
233            if 0 < depth {
234                let hasher = hashers.get_mut(depth - 1).expect("must not happen");
235
236                let mut name_hasher = D::new();
237                // name
238                hash_osstr(
239                    &mut name_hasher,
240                    entry.path().file_name().expect("must have a file_name"),
241                );
242                // additional data (optional)
243                (self.additional_data)(
244                    &entry,
245                    &mut AdditionalDataWriter {
246                        hasher,
247                        used: false,
248                    },
249                )?;
250                <D as digest::Digest>::update(hasher, name_hasher.finalize_fixed().as_slice());
251            }
252
253            // content
254            if file_type.is_file() {
255                self.read_content_of_file(
256                    entry.path(),
257                    hashers.last_mut().expect("must not happen"),
258                )?;
259            } else if file_type.is_symlink() {
260                self.read_content_of_symlink(
261                    entry.path(),
262                    hashers.last_mut().expect("must not happen"),
263                )?;
264            } else if file_type.is_dir() {
265                let hasher = hashers.last_mut().expect("must not happen");
266                <D as digest::Digest>::update(hasher, b"D");
267            } else {
268                return Err(DigestError::FileNotSupported(
269                    entry.path().display().to_string(),
270                ));
271            }
272        }
273
274        loop {
275            if hashers.len() == 1 {
276                return Ok(hashers
277                    .pop()
278                    .expect("must not fail")
279                    .finalize_fixed()
280                    .to_vec());
281            }
282            flush_up_one_level(&mut hashers);
283        }
284    }
285
286    fn read_content_of_file(
287        &self,
288        full_path: &Path,
289        parent_hasher: &mut D,
290    ) -> Result<(), DigestError> {
291        <D as digest::Digest>::update(parent_hasher, b"F");
292        read_file_to_digest_input(full_path, parent_hasher)?;
293        Ok(())
294    }
295
296    fn read_content_of_symlink(
297        &self,
298        full_path: &Path,
299        parent_hasher: &mut D,
300    ) -> Result<(), DigestError> {
301        <D as digest::Digest>::update(parent_hasher, b"L");
302        <D as digest::Digest>::update(parent_hasher,
303            full_path
304                .read_link()?
305                .to_str()
306                .ok_or(DigestError::OsStrConversionError)?
307                .as_bytes(),
308        );
309        Ok(())
310    }
311}
312
313#[deprecated]
314pub fn get_recursive_digest_for_paths<D: digest::Digest + digest::FixedOutput, H>(
315    root_path: &Path,
316    paths: HashSet<PathBuf, H>,
317) -> Result<Vec<u8>, DigestError>
318where
319    H: std::hash::BuildHasher,
320{
321    let h = RecursiveDigest::<D, _, _>::new()
322        .filter(|entry| {
323            let rel_path = entry
324                .path()
325                .strip_prefix(root_path)
326                .expect("must be prefix");
327            paths.contains(rel_path)
328        })
329        .build();
330
331    h.get_digest_of(root_path)
332}
333
334#[deprecated]
335pub fn get_recursive_digest_for_dir<
336    Digest: digest::Digest + digest::FixedOutput,
337    H: std::hash::BuildHasher,
338>(
339    root_path: &Path,
340    rel_path_ignore_list: &HashSet<PathBuf, H>,
341) -> Result<Vec<u8>, DigestError> {
342    let h = RecursiveDigest::<Digest, _, _>::new()
343        .filter(|entry| {
344            let rel_path = entry
345                .path()
346                .strip_prefix(root_path)
347                .expect("must be prefix");
348            !rel_path_ignore_list.contains(rel_path)
349        })
350        .build();
351
352    h.get_digest_of(root_path)
353}