dasher/lib.rs
1use sha3::{Digest, Sha3_256};
2use std::collections::{HashMap, VecDeque};
3use std::fmt;
4use std::fs;
5use std::path::PathBuf;
6use walkdir::{DirEntry, WalkDir};
7
8/// Defines the different types of nodes the directory walk can encounter. These types
9/// define, through the `to_u8` implementation, the byte that will be inserted between a
10/// node's name and content hash.
11#[derive(Debug, Clone, Copy)]
12enum NodeType {
13 /// A Directory node is a directory - this byte will be inserted between the
14 /// directory's name hash and its first content hash.
15 Directory,
16 /// The Directory Separator is used between content hashes for a directory. While a
17 /// normal file will have exactly one name and content hash, a directory can have an
18 /// arbitrary number of content hashes - one per node in the directory. The
19 /// Directory Separator byte is inserted between these hashes.
20 DirSeparator,
21 /// A File is any normal file - this byte will be inserted between the file's name
22 /// hash and its content hash.
23 File,
24 /// A Symlink is a symbolic link (on platforms that support it; specifically, where
25 /// [`is_symlink`] returns true). This byte is used instead of [`NodeType::File`]
26 /// when the node is a symlink.
27 ///
28 /// [`is_symlink`]: std::path::Path::is_symlink
29 Symlink,
30}
31
32impl NodeType {
33 pub fn to_u8(self) -> u8 {
34 match self {
35 NodeType::Directory => 2,
36 NodeType::DirSeparator => 3,
37 NodeType::File => 5,
38 NodeType::Symlink => 7,
39 }
40 }
41}
42
43/// Defines the different types of returns from [`hash_content`]
44#[derive(Debug, Clone)]
45enum ContentResult {
46 /// A File result is any return that actually includes data, that is a regular file
47 /// or a symlink, handled as needed.
48 File(Vec<u8>),
49 /// A Directory result is a signal that the node is a directory and queued node
50 /// hashes need to be appended.
51 Directory,
52}
53
54/// Defines the error states arising from hashing the content of a node
55#[derive(Debug)]
56pub enum ContentError {
57 UnknownNodeType,
58 IOError(std::io::Error),
59}
60
61impl fmt::Display for ContentError {
62 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
63 match self {
64 ContentError::UnknownNodeType => write!(f, "Node is not a directory, file, or symlink"),
65 ContentError::IOError(_) => write!(f, "Encountered a problem opening a node"),
66 }
67 }
68}
69
70impl From<std::io::Error> for ContentError {
71 fn from(err: std::io::Error) -> Self {
72 Self::IOError(err)
73 }
74}
75
76/// Error types from the [`hash_directory`] function
77#[derive(Debug)]
78pub enum HashError {
79 ContentError(ContentError),
80}
81
82impl fmt::Display for HashError {
83 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
84 write!(f, "Hash could not be computed")
85 }
86}
87
88impl From<ContentError> for HashError {
89 fn from(err: ContentError) -> Self {
90 Self::ContentError(err)
91 }
92}
93
94/// Given a list of directory paths, computes their directory hash and returns it as a
95/// byte array.
96///
97/// This function is equivalent to calling `hash_directory` on each directory in turn,
98/// concatenating all those hashes with the `NodeType::DirSeparator` byte, then hashing
99/// the result. The input does not have to be sorted, but will be sorted prior to
100/// hashing, in order to ensure quirks of calling this function won't result in
101/// inconsistent results.
102///
103/// If the input is a singleton vector, i.e. a vector with only one element, calling
104/// this function is equivalent to calling `hash_directory` on that element. If the
105/// input is an empty vector, the null-hash is returned.
106pub fn hash_directories(paths: Vec<PathBuf>) -> Result<Vec<u8>, HashError> {
107 let mut paths = paths.clone();
108 paths.sort();
109
110 let data = paths
111 .iter()
112 .cloned()
113 .map(|p| hash_directory(p))
114 .collect::<Result<Vec<_>, HashError>>()?
115 .join(&NodeType::DirSeparator.to_u8());
116
117 match paths.len() {
118 1 => Ok(data),
119 _ => Ok(Vec::from(
120 Sha3_256::new().chain_update(data).finalize().as_slice(),
121 )),
122 }
123}
124
125/// Given a path to a directory, as a `PathBuf`, computes the directory hash and returns
126/// it as a byte array.
127///
128/// # Examples
129///
130/// no
131///
132/// # Scheme
133///
134/// The hashing scheme is, in essence, generating a [Merkle
135/// tree](https://en.wikipedia.org/wiki/Merkle_tree), but with extra steps. Each node in
136/// the directory tree has its name hashed, then its contents, then those hashes are
137/// concatenated with a separator byte based on the node's type, and that data is hashed
138/// again to generate the node's hash. This process is repeated, from the bottom up in
139/// the directory tree, until all nodes have been hashed and a final hash for the entire
140/// directory can be returned.
141///
142/// For normal **files**, the node hash is simply:
143/// `hash(hash(name) + byte + hash(content))`
144///
145/// For **directories**, the node hash includes arbitrarily many content hashes, one per
146/// sub-node:
147/// `hash(hash(name) + byte + hash(content_1) + byte + hash(content_2) + ... + byte +
148/// hash(content_n))`
149///
150/// Finally, for **symlinks**, the link isn't followed. Instead, the content hash is the
151/// hash of the path to the file the link points to.
152/// `hash(hash(name) + byte + hash(path))`
153pub fn hash_directory(path: PathBuf) -> Result<Vec<u8>, HashError> {
154 let mut cache_map: HashMap<usize, VecDeque<Vec<u8>>> = HashMap::new();
155
156 for entry in WalkDir::new(&path).sort_by_file_name().contents_first(true) {
157 let entry = match entry {
158 Ok(v) => v,
159 Err(_) => continue,
160 };
161
162 let content = hash_content(&entry)?;
163 let name = Vec::from(
164 Sha3_256::new()
165 .chain_update(entry.file_name().to_string_lossy().as_bytes())
166 .finalize()
167 .as_slice(),
168 );
169
170 let node = match content {
171 crate::ContentResult::File(content) => node_file_hash(&entry, name, content),
172 crate::ContentResult::Directory => node_dir_hash(&mut cache_map, &entry, name),
173 };
174
175 match cache_map.get_mut(&entry.depth()) {
176 Some(q) => q.push_back(node),
177 None => {
178 let mut q: VecDeque<Vec<u8>> = VecDeque::new();
179 q.push_back(node);
180 cache_map.insert(entry.depth(), q);
181 }
182 }
183 }
184
185 Ok(cache_map.get_mut(&0).unwrap().pop_front().unwrap())
186}
187
188/// Given a [`DirEntry`], hashes its contents according to the type of node.
189fn hash_content(entry: &DirEntry) -> Result<ContentResult, ContentError> {
190 if entry.file_type().is_symlink() {
191 Ok(ContentResult::File(Vec::from(
192 Sha3_256::new()
193 .chain_update(fs::read_link(entry.path())?.to_string_lossy().as_bytes())
194 .finalize()
195 .as_slice(),
196 )))
197 } else if entry.file_type().is_dir() {
198 Ok(ContentResult::Directory)
199 } else if entry.file_type().is_file() {
200 Ok(ContentResult::File(Vec::from(
201 Sha3_256::new()
202 .chain_update(fs::read(entry.path())?)
203 .finalize()
204 .as_slice(),
205 )))
206 } else {
207 Err(ContentError::UnknownNodeType)
208 }
209}
210
211/// Generates the node hash of a given File node
212fn node_file_hash(entry: &DirEntry, name: Vec<u8>, content: Vec<u8>) -> Vec<u8> {
213 Vec::from(
214 Sha3_256::new()
215 .chain_update(name)
216 .chain_update(if entry.file_type().is_symlink() {
217 [NodeType::Symlink.to_u8()]
218 } else {
219 [NodeType::File.to_u8()]
220 })
221 .chain_update(content)
222 .finalize()
223 .as_slice(),
224 )
225}
226
227/// Generates the node hash of a given Directory node
228fn node_dir_hash(
229 cache: &mut HashMap<usize, VecDeque<Vec<u8>>>,
230 entry: &DirEntry,
231 name: Vec<u8>,
232) -> Vec<u8> {
233 let hasher = Sha3_256::new()
234 .chain_update(name)
235 .chain_update([NodeType::Directory.to_u8()]);
236 let data = match cache.get_mut(&(entry.depth() + 1)) {
237 Some(q) => q
238 .drain(..)
239 .collect::<Vec<Vec<u8>>>()
240 .join(&NodeType::DirSeparator.to_u8()),
241 None => Vec::new(),
242 };
243 Vec::from(hasher.chain_update(data).finalize().as_slice())
244}
245
246#[cfg(test)]
247mod tests {
248 use std::path::PathBuf;
249
250 use crate::{Digest, Sha3_256};
251 use hex_literal::hex;
252 #[test]
253 fn basic_sha256() {
254 let mut hasher = Sha3_256::new();
255 hasher.update(b"coffee");
256 let result = hasher.finalize();
257 assert_eq!(
258 result[..],
259 hex!("2250fa0b557f93b1d92a26e2ca55cfe2354e416e9d674277784cfb09184b41b0")[..]
260 );
261 }
262
263 #[test]
264 fn hash_test_data_one() {
265 // one.test content: 9241024260f87e2b901ed6972c48a17c4dc71e0939b0dd445f431f9cf406ca3a
266 // one.test name: 8d80e5830940407463f61fe1ef751de17cb095f8646ff71a72b1374efe5d84c5
267 // one.test node: 7716a22d94ecef97998c296ec7914ee0f6bcd66d8b37ac82688b4a3a4ba0a0ca
268 // one name: 6f70f27e13fc073a2541cd1e8b38ba9dbd5ec6de7bfeb24328534c417697381f
269 // one node: 9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d
270 let result = crate::hash_directory(PathBuf::from("test_data/one")).unwrap();
271 assert_eq!(
272 result[..],
273 hex!("9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d")
274 )
275 }
276
277 #[test]
278 fn hash_test_data_two() {
279 // one.test content: 9241024260f87e2b901ed6972c48a17c4dc71e0939b0dd445f431f9cf406ca3a
280 // one.test name: 8d80e5830940407463f61fe1ef751de17cb095f8646ff71a72b1374efe5d84c5
281 // one.test node: 7716a22d94ecef97998c296ec7914ee0f6bcd66d8b37ac82688b4a3a4ba0a0ca
282 // two.test content: f2ee51400cb7890e88835039d97b3411df6d2460843c8e84b3f7541c40eec1ba
283 // two.test name: af148b4b830b9882103cf3ca767e34a6e4dc52a0b6f08df31a77ae9903d1949d
284 // two.test node: a6e7380dabe9ba94240a56570ba58dc457e57924ad78bc2476867dcbfd57c8eb
285 // subone name: 55fc1f2b086b1f8bde1078ae015e788ec2766e38f68a19deedc1d4cc1a882a57
286 // subone node: 2c57a0604a04b43168f02b177d6ac0f0205c70c1896a6c0b4eefd870dfe7089d
287 // bacon.test cont: 5d9121bfbe2ccd96c4e2e94e2c0c4a9940fb325f728fd5de26fc3e0f8df37914
288 // bacon.test name: e176fb76eb3d39d67764c86a91dfb5ff11ce95ae556145bef4112434da0cffcf
289 // bacon.test node: 0ffcbf39480556cf4438cbe5bd18b926d9abecb8f1ea99f75d31f28a84429c8d
290 // two.test(l) cont: 29077be8b7ce3922951fb75ff84cf5aa29edbddd86725ee7b34a4508345b93a5
291 // two.test(l) node: b97f5177d45d1cac7b7b381609684fe2c689f8538bcc92db0a8312a0c6b83185
292 // subtwo name: 996340b04c5f9afc98dc7b81278eff5e796a814c6078101391277461246cc5b7
293 // subtwo node: fa44844774f5f9e126e5a8d6b3c426b1f8b9af9abf7984371fae9ff4ef57e305
294 // two name: cad32d6da454536a0412369e78baf227a81309b9579df2f450d1b5f5c8c26bf0
295 // two node: 9119ffd015d217097164f944331ee865fb6ac8c0b670728cf42c9e45c21ea0df
296 let result = crate::hash_directory(PathBuf::from("test_data/two")).unwrap();
297 assert_eq!(
298 result[..],
299 hex!("9119ffd015d217097164f944331ee865fb6ac8c0b670728cf42c9e45c21ea0df")
300 )
301 }
302
303 #[test]
304 fn hash_test_data_all() {
305 // one node: 9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d
306 // two node: 9119ffd015d217097164f944331ee865fb6ac8c0b670728cf42c9e45c21ea0df
307 // test_data name: d9c66fed039088497293a5155e68c9722336edef5991a67fa285d9a2565582bf
308 // test_data node: 3b5e49ac9126759771d677bdacbc18a63ff94ad4e07718c18347254d7b9c6cb1
309 let result = crate::hash_directory(PathBuf::from("test_data")).unwrap();
310 assert_eq!(
311 result[..],
312 hex!("3b5e49ac9126759771d677bdacbc18a63ff94ad4e07718c18347254d7b9c6cb1")
313 )
314 }
315
316 #[test]
317 fn hash_test_data_all_via_directories() {
318 // one node: 9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d
319 // two node: 9119ffd015d217097164f944331ee865fb6ac8c0b670728cf42c9e45c21ea0df
320 // collected node: 9fe82ef81c21f04ed6050b650fdef5c441fa49db43b2aaf7e383f7466778b026
321 let input = vec![
322 PathBuf::from("test_data/two"),
323 PathBuf::from("test_data/one"),
324 ];
325 let result = crate::hash_directories(input).unwrap();
326 assert_eq!(
327 result[..],
328 hex!("9fe82ef81c21f04ed6050b650fdef5c441fa49db43b2aaf7e383f7466778b026")
329 )
330 }
331
332 #[test]
333 fn hash_test_data_just_one_via_directories() {
334 // one node: 9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d
335 let input = vec![PathBuf::from("test_data/one")];
336 let result = crate::hash_directories(input).unwrap();
337 assert_eq!(
338 result[..],
339 hex!("9fd3dceb108e5f6067a623a592524a4014f5d7244e537891d147b51e8c1c147d")
340 )
341 }
342
343 #[test]
344 fn hash_directories_empty_input() {
345 let result = crate::hash_directories(vec![]).unwrap();
346 assert_eq!(
347 result[..],
348 hex!("a7ffc6f8bf1ed76651c14756a061d662f580ff4de43b49fa82d80a4b80f8434a")
349 )
350 }
351}