dasher/
lib.rs

1use sha3::{Digest, Sha3_256};
2use std::collections::{HashMap, VecDeque};
3use std::fmt;
4use std::fs;
5use std::path::PathBuf;
6use walkdir::{DirEntry, WalkDir};
7
8/// Defines the different types of nodes the directory walk can encounter. These types
9/// define, through the `to_u8` implementation, the byte that will be inserted between a
10/// node's name and content hash.
11#[derive(Debug, Clone, Copy)]
12enum NodeType {
13    /// A Directory node is a directory - this byte will be inserted between the
14    /// directory's name hash and its first content hash.
15    Directory,
16    /// The Directory Separator is used between content hashes for a directory. While a
17    /// normal file will have exactly one name and content hash, a directory can have an
18    /// arbitrary number of content hashes - one per node in the directory. The
19    /// Directory Separator byte is inserted between these hashes.
20    DirSeparator,
21    /// A File is any normal file - this byte will be inserted between the file's name
22    /// hash and its content hash.
23    File,
24    /// A Symlink is a symbolic link (on platforms that support it; specifically, where
25    /// [`is_symlink`] returns true). This byte is used instead of [`NodeType::File`]
26    /// when the node is a symlink.
27    ///
28    /// [`is_symlink`]: std::path::Path::is_symlink
29    Symlink,
30}
31
32impl NodeType {
33    pub fn to_u8(self) -> u8 {
34        match self {
35            NodeType::Directory => 2,
36            NodeType::DirSeparator => 3,
37            NodeType::File => 5,
38            NodeType::Symlink => 7,
39        }
40    }
41}
42
43/// Defines the different types of returns from [`hash_content`]
44#[derive(Debug, Clone)]
45enum ContentResult {
46    /// A File result is any return that actually includes data, that is a regular file
47    /// or a symlink, handled as needed.
48    File(Vec<u8>),
49    /// A Directory result is a signal that the node is a directory and queued node
50    /// hashes need to be appended.
51    Directory,
52}
53
54/// Defines the error states arising from hashing the content of a node
55#[derive(Debug)]
56pub enum ContentError {
57    UnknownNodeType,
58    IOError(std::io::Error),
59}
60
61impl fmt::Display for ContentError {
62    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
63        match self {
64            ContentError::UnknownNodeType => write!(f, "Node is not a directory, file, or symlink"),
65            ContentError::IOError(_) => write!(f, "Encountered a problem opening a node"),
66        }
67    }
68}
69
70impl From<std::io::Error> for ContentError {
71    fn from(err: std::io::Error) -> Self {
72        Self::IOError(err)
73    }
74}
75
76/// Error types from the [`hash_directory`] function
77#[derive(Debug)]
78pub enum HashError {
79    ContentError(ContentError),
80}
81
82impl fmt::Display for HashError {
83    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
84        write!(f, "Hash could not be computed")
85    }
86}
87
88impl From<ContentError> for HashError {
89    fn from(err: ContentError) -> Self {
90        Self::ContentError(err)
91    }
92}
93
94/// Given a list of directory paths, computes their directory hash and returns it as a
95/// byte array.
96///
97/// This function is equivalent to calling `hash_directory` on each directory in turn,
98/// concatenating all those hashes with the `NodeType::DirSeparator` byte, then hashing
99/// the result. The input does not have to be sorted, but will be sorted prior to
100/// hashing, in order to ensure quirks of calling this function won't result in
101/// inconsistent results.
102///
103/// If the input is a singleton vector, i.e. a vector with only one element, calling
104/// this function is equivalent to calling `hash_directory` on that element. If the
105/// input is an empty vector, the null-hash is returned.
106pub fn hash_directories(paths: Vec<PathBuf>) -> Result<Vec<u8>, HashError> {
107    let mut paths = paths.clone();
108    paths.sort();
109
110    let data = paths
111        .iter()
112        .cloned()
113        .map(|p| hash_directory(p))
114        .collect::<Result<Vec<_>, HashError>>()?
115        .join(&NodeType::DirSeparator.to_u8());
116
117    match paths.len() {
118        1 => Ok(data),
119        _ => Ok(Vec::from(
120            Sha3_256::new().chain_update(data).finalize().as_slice(),
121        )),
122    }
123}
124
125/// Given a path to a directory, as a `PathBuf`, computes the directory hash and returns
126/// it as a byte array.
127///
128/// # Examples
129///
130/// no
131///
132/// # Scheme
133///
134/// The hashing scheme is, in essence, generating a [Merkle
135/// tree](https://en.wikipedia.org/wiki/Merkle_tree), but with extra steps. Each node in
136/// the directory tree has its name hashed, then its contents, then those hashes are
137/// concatenated with a separator byte based on the node's type, and that data is hashed
138/// again to generate the node's hash. This process is repeated, from the bottom up in
139/// the directory tree, until all nodes have been hashed and a final hash for the entire
140/// directory can be returned.
141///
142/// For normal **files**, the node hash is simply:
143/// `hash(hash(name) + byte + hash(content))`
144///
145/// For **directories**, the node hash includes arbitrarily many content hashes, one per
146/// sub-node:
147/// `hash(hash(name) + byte + hash(content_1) + byte + hash(content_2) + ... + byte +
148/// hash(content_n))`
149///
150/// Finally, for **symlinks**, the link isn't followed. Instead, the content hash is the
151/// hash of the path to the file the link points to.
152/// `hash(hash(name) + byte + hash(path))`
153pub fn hash_directory(path: PathBuf) -> Result<Vec<u8>, HashError> {
154    let mut cache_map: HashMap<usize, VecDeque<Vec<u8>>> = HashMap::new();
155
156    for entry in WalkDir::new(&path).sort_by_file_name().contents_first(true) {
157        let entry = match entry {
158            Ok(v) => v,
159            Err(_) => continue,
160        };
161
162        let content = hash_content(&entry)?;
163        let name = Vec::from(
164            Sha3_256::new()
165                .chain_update(entry.file_name().to_string_lossy().as_bytes())
166                .finalize()
167                .as_slice(),
168        );
169
170        let node = match content {
171            crate::ContentResult::File(content) => node_file_hash(&entry, name, content),
172            crate::ContentResult::Directory => node_dir_hash(&mut cache_map, &entry, name),
173        };
174
175        match cache_map.get_mut(&entry.depth()) {
176            Some(q) => q.push_back(node),
177            None => {
178                let mut q: VecDeque<Vec<u8>> = VecDeque::new();
179                q.push_back(node);
180                cache_map.insert(entry.depth(), q);
181            }
182        }
183    }
184
185    Ok(cache_map.get_mut(&0).unwrap().pop_front().unwrap())
186}
187
188/// Given a [`DirEntry`], hashes its contents according to the type of node.
189fn hash_content(entry: &DirEntry) -> Result<ContentResult, ContentError> {
190    if entry.file_type().is_symlink() {
191        Ok(ContentResult::File(Vec::from(
192            Sha3_256::new()
193                .chain_update(fs::read_link(entry.path())?.to_string_lossy().as_bytes())
194                .finalize()
195                .as_slice(),
196        )))
197    } else if entry.file_type().is_dir() {
198        Ok(ContentResult::Directory)
199    } else if entry.file_type().is_file() {
200        Ok(ContentResult::File(Vec::from(
201            Sha3_256::new()
202                .chain_update(fs::read(entry.path())?)
203                .finalize()
204                .as_slice(),
205        )))
206    } else {
207        Err(ContentError::UnknownNodeType)
208    }
209}
210
211/// Generates the node hash of a given File node
212fn node_file_hash(entry: &DirEntry, name: Vec<u8>, content: Vec<u8>) -> Vec<u8> {
213    Vec::from(
214        Sha3_256::new()
215            .chain_update(name)
216            .chain_update(if entry.file_type().is_symlink() {
217                [NodeType::Symlink.to_u8()]
218            } else {
219                [NodeType::File.to_u8()]
220            })
221            .chain_update(content)
222            .finalize()
223            .as_slice(),
224    )
225}
226
227/// Generates the node hash of a given Directory node
228fn node_dir_hash(
229    cache: &mut HashMap<usize, VecDeque<Vec<u8>>>,
230    entry: &DirEntry,
231    name: Vec<u8>,
232) -> Vec<u8> {
233    let hasher = Sha3_256::new()
234        .chain_update(name)
235        .chain_update([NodeType::Directory.to_u8()]);
236    let data = match cache.get_mut(&(entry.depth() + 1)) {
237        Some(q) => q
238            .drain(..)
239            .collect::<Vec<Vec<u8>>>()
240            .join(&NodeType::DirSeparator.to_u8()),
241        None => Vec::new(),
242    };
243    Vec::from(hasher.chain_update(data).finalize().as_slice())
244}
245
246#[cfg(test)]
247mod tests {
248    use std::path::PathBuf;
249
250    use crate::{Digest, Sha3_256};
251    use hex_literal::hex;
252    #[test]
253    fn basic_sha256() {
254        let mut hasher = Sha3_256::new();
255        hasher.update(b"coffee");
256        let result = hasher.finalize();
257        assert_eq!(
258            result[..],
259            hex!("2250fa0b557f93b1d92a26e2ca55cfe2354e416e9d674277784cfb09184b41b0")[..]
260        );
261    }
262
263    #[test]
264    fn hash_test_data_one() {
265        // one.test content: 9241024260f87e2b901ed6972c48a17c4dc71e0939b0dd445f431f9cf406ca3a
266        // one.test name:    8d80e5830940407463f61fe1ef751de17cb095f8646ff71a72b1374efe5d84c5
267        // one.test node:    7716a22d94ecef97998c296ec7914ee0f6bcd66d8b37ac82688b4a3a4ba0a0ca
268        // one name:         6f70f27e13fc073a2541cd1e8b38ba9dbd5ec6de7bfeb24328534c417697381f
269        // one node:         9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d
270        let result = crate::hash_directory(PathBuf::from("test_data/one")).unwrap();
271        assert_eq!(
272            result[..],
273            hex!("9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d")
274        )
275    }
276
277    #[test]
278    fn hash_test_data_two() {
279        // one.test content: 9241024260f87e2b901ed6972c48a17c4dc71e0939b0dd445f431f9cf406ca3a
280        // one.test name:    8d80e5830940407463f61fe1ef751de17cb095f8646ff71a72b1374efe5d84c5
281        // one.test node:    7716a22d94ecef97998c296ec7914ee0f6bcd66d8b37ac82688b4a3a4ba0a0ca
282        // two.test content: f2ee51400cb7890e88835039d97b3411df6d2460843c8e84b3f7541c40eec1ba
283        // two.test name:    af148b4b830b9882103cf3ca767e34a6e4dc52a0b6f08df31a77ae9903d1949d
284        // two.test node:    a6e7380dabe9ba94240a56570ba58dc457e57924ad78bc2476867dcbfd57c8eb
285        // subone name:      55fc1f2b086b1f8bde1078ae015e788ec2766e38f68a19deedc1d4cc1a882a57
286        // subone node:      2c57a0604a04b43168f02b177d6ac0f0205c70c1896a6c0b4eefd870dfe7089d
287        // bacon.test cont:  5d9121bfbe2ccd96c4e2e94e2c0c4a9940fb325f728fd5de26fc3e0f8df37914
288        // bacon.test name:  e176fb76eb3d39d67764c86a91dfb5ff11ce95ae556145bef4112434da0cffcf
289        // bacon.test node:  0ffcbf39480556cf4438cbe5bd18b926d9abecb8f1ea99f75d31f28a84429c8d
290        // two.test(l) cont: 29077be8b7ce3922951fb75ff84cf5aa29edbddd86725ee7b34a4508345b93a5
291        // two.test(l) node: b97f5177d45d1cac7b7b381609684fe2c689f8538bcc92db0a8312a0c6b83185
292        // subtwo name:      996340b04c5f9afc98dc7b81278eff5e796a814c6078101391277461246cc5b7
293        // subtwo node:      fa44844774f5f9e126e5a8d6b3c426b1f8b9af9abf7984371fae9ff4ef57e305
294        // two name:         cad32d6da454536a0412369e78baf227a81309b9579df2f450d1b5f5c8c26bf0
295        // two node:         9119ffd015d217097164f944331ee865fb6ac8c0b670728cf42c9e45c21ea0df
296        let result = crate::hash_directory(PathBuf::from("test_data/two")).unwrap();
297        assert_eq!(
298            result[..],
299            hex!("9119ffd015d217097164f944331ee865fb6ac8c0b670728cf42c9e45c21ea0df")
300        )
301    }
302
303    #[test]
304    fn hash_test_data_all() {
305        // one node:         9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d
306        // two node:         9119ffd015d217097164f944331ee865fb6ac8c0b670728cf42c9e45c21ea0df
307        // test_data name:   d9c66fed039088497293a5155e68c9722336edef5991a67fa285d9a2565582bf
308        // test_data node:   3b5e49ac9126759771d677bdacbc18a63ff94ad4e07718c18347254d7b9c6cb1
309        let result = crate::hash_directory(PathBuf::from("test_data")).unwrap();
310        assert_eq!(
311            result[..],
312            hex!("3b5e49ac9126759771d677bdacbc18a63ff94ad4e07718c18347254d7b9c6cb1")
313        )
314    }
315
316    #[test]
317    fn hash_test_data_all_via_directories() {
318        // one node:        9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d
319        // two node:        9119ffd015d217097164f944331ee865fb6ac8c0b670728cf42c9e45c21ea0df
320        // collected node:  9fe82ef81c21f04ed6050b650fdef5c441fa49db43b2aaf7e383f7466778b026
321        let input = vec![
322            PathBuf::from("test_data/two"),
323            PathBuf::from("test_data/one"),
324        ];
325        let result = crate::hash_directories(input).unwrap();
326        assert_eq!(
327            result[..],
328            hex!("9fe82ef81c21f04ed6050b650fdef5c441fa49db43b2aaf7e383f7466778b026")
329        )
330    }
331
332    #[test]
333    fn hash_test_data_just_one_via_directories() {
334        // one node:        9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d
335        let input = vec![PathBuf::from("test_data/one")];
336        let result = crate::hash_directories(input).unwrap();
337        assert_eq!(
338            result[..],
339            hex!("9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d")
340        )
341    }
342
343    #[test]
344    fn hash_directories_empty_input() {
345        let result = crate::hash_directories(vec![]).unwrap();
346        assert_eq!(
347            result[..],
348            hex!("a7ffc6f8bf1ed76651c14756a061d662f580ff4de43b49fa82d80a4b80f8434a")
349        )
350    }
351}