Skip to main content

substrate/tree/
blob_tree_blake3_nfc.rs

1//! `blob_tree_blake3_nfc` tree hashing algorithm.
2//!
3//! This algorithm uses Git-like blob/tree object construction, BLAKE3 hashing,
4//! and NFC filename normalization for cross-platform stability.
5
6use std::collections::HashSet;
7
8use anyhow::{anyhow, Result};
9use unicase::UniCase;
10use unicode_normalization::UnicodeNormalization;
11
12use crate::crypto::{format_hash, HashAlgorithm};
13
14pub const ALGORITHM: &str = "blob_tree_blake3_nfc";
15
16/// Hash output size in bytes (BLAKE3 = 32).
17const HASH_SIZE: usize = 32;
18
19/// Git-compatible file modes.
20const MODE_FILE: &str = "100644";
21const MODE_EXECUTABLE: &str = "100755";
22const MODE_DIRECTORY: &str = "40000";
23
24/// In-memory representation of a filesystem entry for tree hashing.
25///
26/// Symlinks are not supported — implementations MUST reject them at walk time.
27pub enum TreeEntry {
28    File {
29        name: String,
30        content: Vec<u8>,
31        executable: bool,
32    },
33    Directory {
34        name: String,
35        children: Vec<TreeEntry>,
36    },
37}
38
39fn hash_data(data: &[u8]) -> [u8; HASH_SIZE] {
40    *blake3::hash(data).as_bytes()
41}
42
43fn normalize_filename(name: &str) -> String {
44    name.nfc().collect::<String>()
45}
46
47/// Return CMN's deterministic key for portable sibling filename matching.
48///
49/// This intentionally uses canonical decomposition plus Unicode case folding,
50/// not compatibility folding, so compatibility glyphs are not over-rejected.
51pub fn portable_filename_key(name: &str) -> String {
52    let decomposed = name.nfd().collect::<String>();
53    UniCase::unicode(decomposed.as_str())
54        .to_folded_case()
55        .nfd()
56        .collect()
57}
58
59fn hash_blob(content: &[u8]) -> [u8; HASH_SIZE] {
60    let header = format!("blob {}\0", content.len());
61    let mut data = Vec::new();
62    data.extend_from_slice(header.as_bytes());
63    data.extend_from_slice(content);
64    hash_data(&data)
65}
66
67fn hash_tree(entries: &[(String, String, [u8; HASH_SIZE])]) -> [u8; HASH_SIZE] {
68    let mut entry_bytes = Vec::new();
69
70    for (mode, name, hash) in entries {
71        entry_bytes.extend_from_slice(mode.as_bytes());
72        entry_bytes.push(b' ');
73        entry_bytes.extend_from_slice(name.as_bytes());
74        entry_bytes.push(b'\0');
75        entry_bytes.extend_from_slice(hash);
76    }
77
78    let header = format!("tree {}\0", entry_bytes.len());
79    let mut tree_data = Vec::new();
80    tree_data.extend_from_slice(header.as_bytes());
81    tree_data.extend_from_slice(&entry_bytes);
82
83    hash_data(&tree_data)
84}
85
86fn should_exclude(name: &str, exclude_names: &[String]) -> bool {
87    exclude_names.iter().any(|pattern| name == pattern)
88}
89
90/// Compute a tree hash from in-memory entries (no filesystem I/O).
91pub fn compute_hash_from_entries(
92    entries: &[TreeEntry],
93    exclude_names: &[String],
94) -> Result<String> {
95    let (hash, _size) = hash_entries(entries, exclude_names)?;
96    Ok(format_hash(HashAlgorithm::B3, &hash))
97}
98
99/// Compute tree hash and total uncompressed source size (sum of all blob content bytes).
100pub fn compute_hash_and_size_from_entries(
101    entries: &[TreeEntry],
102    exclude_names: &[String],
103) -> Result<(String, u64)> {
104    let (hash, size) = hash_entries(entries, exclude_names)?;
105    Ok((format_hash(HashAlgorithm::B3, &hash), size))
106}
107
108fn hash_entries(entries: &[TreeEntry], exclude_names: &[String]) -> Result<([u8; HASH_SIZE], u64)> {
109    let mut tree_entries = Vec::new();
110    let mut seen_names = HashSet::new();
111    let mut seen_portable_names = HashSet::new();
112    let mut total_size: u64 = 0;
113
114    for entry in entries {
115        let raw_name = match entry {
116            TreeEntry::File { name, .. } | TreeEntry::Directory { name, .. } => name.as_str(),
117        };
118
119        if should_exclude(raw_name, exclude_names) {
120            continue;
121        }
122
123        let normalized_name = normalize_filename(raw_name);
124        if !seen_names.insert(normalized_name.clone()) {
125            return Err(anyhow!(
126                "filename_nfc_conflict: Filename conflict after NFC normalization: {} (multiple sibling entries normalize to same name)",
127                raw_name
128            ));
129        }
130
131        let portable_name = portable_filename_key(raw_name);
132        if !seen_portable_names.insert(portable_name) {
133            return Err(anyhow!(
134                "filename_portable_conflict: Filename conflict under CMN portable filename matching: {} (multiple sibling entries fold to same portable name)",
135                raw_name
136            ));
137        }
138
139        let (mode, hash) = match entry {
140            TreeEntry::File {
141                content,
142                executable,
143                ..
144            } => {
145                total_size += content.len() as u64;
146                let mode = if *executable {
147                    MODE_EXECUTABLE
148                } else {
149                    MODE_FILE
150                };
151                (mode.to_string(), hash_blob(content))
152            }
153            TreeEntry::Directory { children, .. } => {
154                let (dir_hash, dir_size) = hash_entries(children, exclude_names)?;
155                total_size += dir_size;
156                (MODE_DIRECTORY.to_string(), dir_hash)
157            }
158        };
159
160        tree_entries.push((mode, normalized_name, hash));
161    }
162
163    tree_entries.sort_by(|a, b| a.1.cmp(&b.1));
164    Ok((hash_tree(&tree_entries), total_size))
165}
166
167#[cfg(test)]
168#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
169mod tests {
170
171    use super::*;
172
173    #[test]
174    fn test_hash_empty_blob() {
175        let content = b"";
176        let hash = hash_blob(content);
177        assert_eq!(hash.len(), 32);
178
179        let expected_data = b"blob 0\0";
180        let expected_hash = blake3::hash(expected_data);
181        assert_eq!(hash, *expected_hash.as_bytes());
182    }
183
184    #[test]
185    fn test_hash_simple_blob() {
186        let content = b"hello";
187        let hash = hash_blob(content);
188        let hash2 = hash_blob(content);
189        assert_eq!(hash, hash2);
190
191        let expected_data = b"blob 5\0hello";
192        let expected_hash = blake3::hash(expected_data);
193        assert_eq!(hash, *expected_hash.as_bytes());
194    }
195
196    #[test]
197    fn test_normalize_filename() {
198        let nfd = "cafe\u{0301}";
199        let normalized = normalize_filename(nfd);
200        let nfc = "caf\u{00e9}";
201        assert_eq!(normalized, nfc);
202    }
203
204    #[test]
205    fn test_portable_filename_key() {
206        assert_eq!(
207            portable_filename_key("File.txt"),
208            portable_filename_key("file.txt")
209        );
210        assert_eq!(
211            portable_filename_key("cafe\u{0301}.txt"),
212            portable_filename_key("caf\u{00e9}.txt")
213        );
214        assert_eq!(
215            portable_filename_key("Maße.txt"),
216            portable_filename_key("MASSE.txt")
217        );
218        assert_eq!(
219            portable_filename_key("Σ.txt"),
220            portable_filename_key("ς.txt")
221        );
222        assert_eq!(
223            portable_filename_key("flour.txt"),
224            portable_filename_key("flour.txt")
225        );
226        assert_eq!(
227            portable_filename_key("K.txt"),
228            portable_filename_key("k.txt")
229        );
230        assert_ne!(
231            portable_filename_key("A.txt"),
232            portable_filename_key("A.txt")
233        );
234        assert_ne!(
235            portable_filename_key("①.txt"),
236            portable_filename_key("1.txt")
237        );
238    }
239
240    #[test]
241    fn test_hash_tree_empty() {
242        let entries: Vec<(String, String, [u8; 32])> = vec![];
243        let hash = hash_tree(&entries);
244        assert_eq!(hash.len(), 32);
245    }
246
247    #[test]
248    fn test_hash_tree_single_entry() {
249        let file_hash = [0u8; 32];
250        let entries = vec![(MODE_FILE.to_string(), "test.txt".to_string(), file_hash)];
251        let hash = hash_tree(&entries);
252        let hash2 = hash_tree(&entries);
253        assert_eq!(hash.len(), 32);
254        assert_eq!(hash, hash2);
255    }
256
257    #[test]
258    fn test_compute_hash_from_entries_format() {
259        let entries = vec![TreeEntry::File {
260            name: "test.txt".to_string(),
261            content: b"hello".to_vec(),
262            executable: false,
263        }];
264
265        let hash = compute_hash_from_entries(&entries, &[]).unwrap();
266        assert!(hash.starts_with("b3."));
267        let b58_part = &hash[3..];
268        assert!(b58_part.len() >= 40 && b58_part.len() <= 48);
269    }
270
271    #[test]
272    fn test_exclusion() {
273        let entries = vec![
274            TreeEntry::File {
275                name: "excluded.tmp".to_string(),
276                content: b"{}".to_vec(),
277                executable: false,
278            },
279            TreeEntry::File {
280                name: "test.txt".to_string(),
281                content: b"hello".to_vec(),
282                executable: false,
283            },
284        ];
285
286        let hash_all = compute_hash_from_entries(&entries, &[]).unwrap();
287        let hash_excluded =
288            compute_hash_from_entries(&entries, &["excluded.tmp".to_string()]).unwrap();
289        assert_ne!(hash_all, hash_excluded);
290    }
291
292    #[test]
293    fn test_deterministic_hash() {
294        let entries = vec![
295            TreeEntry::File {
296                name: "a.txt".to_string(),
297                content: b"content a".to_vec(),
298                executable: false,
299            },
300            TreeEntry::File {
301                name: "b.txt".to_string(),
302                content: b"content b".to_vec(),
303                executable: false,
304            },
305        ];
306
307        let hash1 = compute_hash_from_entries(&entries, &[]).unwrap();
308        let hash2 = compute_hash_from_entries(&entries, &[]).unwrap();
309        assert_eq!(hash1, hash2);
310    }
311
312    #[test]
313    fn test_should_exclude() {
314        assert!(!should_exclude("test.json", &[]));
315        assert!(!should_exclude("other.json", &[]));
316
317        let excludes = vec!["node_modules".to_string(), ".git".to_string()];
318        assert!(should_exclude("node_modules", &excludes));
319        assert!(should_exclude(".git", &excludes));
320        assert!(!should_exclude("src", &excludes));
321    }
322
323    #[test]
324    fn test_nested_directory() {
325        let entries = vec![
326            TreeEntry::File {
327                name: "main.rs".to_string(),
328                content: b"fn main() {}".to_vec(),
329                executable: false,
330            },
331            TreeEntry::Directory {
332                name: "src".to_string(),
333                children: vec![TreeEntry::File {
334                    name: "lib.rs".to_string(),
335                    content: b"pub fn foo() {}".to_vec(),
336                    executable: false,
337                }],
338            },
339        ];
340
341        let hash = compute_hash_from_entries(&entries, &[]).unwrap();
342        assert!(hash.starts_with("b3."));
343    }
344
345    #[test]
346    fn test_executable_flag() {
347        let entries_normal = vec![TreeEntry::File {
348            name: "script.sh".to_string(),
349            content: b"#!/bin/sh".to_vec(),
350            executable: false,
351        }];
352        let entries_exec = vec![TreeEntry::File {
353            name: "script.sh".to_string(),
354            content: b"#!/bin/sh".to_vec(),
355            executable: true,
356        }];
357
358        let hash_normal = compute_hash_from_entries(&entries_normal, &[]).unwrap();
359        let hash_exec = compute_hash_from_entries(&entries_exec, &[]).unwrap();
360        assert_ne!(hash_normal, hash_exec);
361    }
362
363    #[test]
364    fn rejects_sibling_case_collision() {
365        let entries = vec![
366            TreeEntry::File {
367                name: "File.txt".to_string(),
368                content: b"upper".to_vec(),
369                executable: false,
370            },
371            TreeEntry::File {
372                name: "file.txt".to_string(),
373                content: b"lower".to_vec(),
374                executable: false,
375            },
376        ];
377        let err = compute_hash_from_entries(&entries, &[]).unwrap_err();
378        assert!(err.to_string().contains("filename_portable_conflict"));
379    }
380
381    #[test]
382    fn rejects_sibling_canonical_unicode_collision() {
383        let entries = vec![
384            TreeEntry::File {
385                name: "caf\u{00e9}.txt".to_string(),
386                content: b"nfc".to_vec(),
387                executable: false,
388            },
389            TreeEntry::File {
390                name: "cafe\u{0301}.txt".to_string(),
391                content: b"nfd".to_vec(),
392                executable: false,
393            },
394        ];
395        let err = compute_hash_from_entries(&entries, &[]).unwrap_err();
396        assert!(err.to_string().contains("filename_nfc_conflict"));
397    }
398
399    #[test]
400    fn rejects_sibling_full_case_fold_collision() {
401        let entries = vec![
402            TreeEntry::File {
403                name: "Maße.txt".to_string(),
404                content: b"one".to_vec(),
405                executable: false,
406            },
407            TreeEntry::File {
408                name: "MASSE.txt".to_string(),
409                content: b"two".to_vec(),
410                executable: false,
411            },
412        ];
413        let err = compute_hash_from_entries(&entries, &[]).unwrap_err();
414        assert!(err.to_string().contains("filename_portable_conflict"));
415    }
416
417    #[test]
418    fn rejects_file_vs_directory_portable_collision() {
419        let entries = vec![
420            TreeEntry::File {
421                name: "foo".to_string(),
422                content: b"file".to_vec(),
423                executable: false,
424            },
425            TreeEntry::Directory {
426                name: "FOO".to_string(),
427                children: vec![TreeEntry::File {
428                    name: "bar".to_string(),
429                    content: b"child".to_vec(),
430                    executable: false,
431                }],
432            },
433        ];
434        let err = compute_hash_from_entries(&entries, &[]).unwrap_err();
435        assert!(err.to_string().contains("filename_portable_conflict"));
436    }
437}