1use std::collections::HashSet;
7
8use anyhow::{anyhow, Result};
9use unicase::UniCase;
10use unicode_normalization::UnicodeNormalization;
11
12use crate::crypto::{format_hash, HashAlgorithm};
13
14pub const ALGORITHM: &str = "blob_tree_blake3_nfc";
15
16const HASH_SIZE: usize = 32;
18
19const MODE_FILE: &str = "100644";
21const MODE_EXECUTABLE: &str = "100755";
22const MODE_DIRECTORY: &str = "40000";
23
24pub enum TreeEntry {
28 File {
29 name: String,
30 content: Vec<u8>,
31 executable: bool,
32 },
33 Directory {
34 name: String,
35 children: Vec<TreeEntry>,
36 },
37}
38
39fn hash_data(data: &[u8]) -> [u8; HASH_SIZE] {
40 *blake3::hash(data).as_bytes()
41}
42
43fn normalize_filename(name: &str) -> String {
44 name.nfc().collect::<String>()
45}
46
47pub fn portable_filename_key(name: &str) -> String {
52 let decomposed = name.nfd().collect::<String>();
53 UniCase::unicode(decomposed.as_str())
54 .to_folded_case()
55 .nfd()
56 .collect()
57}
58
59fn hash_blob(content: &[u8]) -> [u8; HASH_SIZE] {
60 let header = format!("blob {}\0", content.len());
61 let mut data = Vec::new();
62 data.extend_from_slice(header.as_bytes());
63 data.extend_from_slice(content);
64 hash_data(&data)
65}
66
67fn hash_tree(entries: &[(String, String, [u8; HASH_SIZE])]) -> [u8; HASH_SIZE] {
68 let mut entry_bytes = Vec::new();
69
70 for (mode, name, hash) in entries {
71 entry_bytes.extend_from_slice(mode.as_bytes());
72 entry_bytes.push(b' ');
73 entry_bytes.extend_from_slice(name.as_bytes());
74 entry_bytes.push(b'\0');
75 entry_bytes.extend_from_slice(hash);
76 }
77
78 let header = format!("tree {}\0", entry_bytes.len());
79 let mut tree_data = Vec::new();
80 tree_data.extend_from_slice(header.as_bytes());
81 tree_data.extend_from_slice(&entry_bytes);
82
83 hash_data(&tree_data)
84}
85
86fn should_exclude(name: &str, exclude_names: &[String]) -> bool {
87 exclude_names.iter().any(|pattern| name == pattern)
88}
89
90pub fn compute_hash_from_entries(
92 entries: &[TreeEntry],
93 exclude_names: &[String],
94) -> Result<String> {
95 let (hash, _size) = hash_entries(entries, exclude_names)?;
96 Ok(format_hash(HashAlgorithm::B3, &hash))
97}
98
99pub fn compute_hash_and_size_from_entries(
101 entries: &[TreeEntry],
102 exclude_names: &[String],
103) -> Result<(String, u64)> {
104 let (hash, size) = hash_entries(entries, exclude_names)?;
105 Ok((format_hash(HashAlgorithm::B3, &hash), size))
106}
107
108fn hash_entries(entries: &[TreeEntry], exclude_names: &[String]) -> Result<([u8; HASH_SIZE], u64)> {
109 let mut tree_entries = Vec::new();
110 let mut seen_names = HashSet::new();
111 let mut seen_portable_names = HashSet::new();
112 let mut total_size: u64 = 0;
113
114 for entry in entries {
115 let raw_name = match entry {
116 TreeEntry::File { name, .. } | TreeEntry::Directory { name, .. } => name.as_str(),
117 };
118
119 if should_exclude(raw_name, exclude_names) {
120 continue;
121 }
122
123 let normalized_name = normalize_filename(raw_name);
124 if !seen_names.insert(normalized_name.clone()) {
125 return Err(anyhow!(
126 "filename_nfc_conflict: Filename conflict after NFC normalization: {} (multiple sibling entries normalize to same name)",
127 raw_name
128 ));
129 }
130
131 let portable_name = portable_filename_key(raw_name);
132 if !seen_portable_names.insert(portable_name) {
133 return Err(anyhow!(
134 "filename_portable_conflict: Filename conflict under CMN portable filename matching: {} (multiple sibling entries fold to same portable name)",
135 raw_name
136 ));
137 }
138
139 let (mode, hash) = match entry {
140 TreeEntry::File {
141 content,
142 executable,
143 ..
144 } => {
145 total_size += content.len() as u64;
146 let mode = if *executable {
147 MODE_EXECUTABLE
148 } else {
149 MODE_FILE
150 };
151 (mode.to_string(), hash_blob(content))
152 }
153 TreeEntry::Directory { children, .. } => {
154 let (dir_hash, dir_size) = hash_entries(children, exclude_names)?;
155 total_size += dir_size;
156 (MODE_DIRECTORY.to_string(), dir_hash)
157 }
158 };
159
160 tree_entries.push((mode, normalized_name, hash));
161 }
162
163 tree_entries.sort_by(|a, b| a.1.cmp(&b.1));
164 Ok((hash_tree(&tree_entries), total_size))
165}
166
167#[cfg(test)]
168#[allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
169mod tests {
170
171 use super::*;
172
173 #[test]
174 fn test_hash_empty_blob() {
175 let content = b"";
176 let hash = hash_blob(content);
177 assert_eq!(hash.len(), 32);
178
179 let expected_data = b"blob 0\0";
180 let expected_hash = blake3::hash(expected_data);
181 assert_eq!(hash, *expected_hash.as_bytes());
182 }
183
184 #[test]
185 fn test_hash_simple_blob() {
186 let content = b"hello";
187 let hash = hash_blob(content);
188 let hash2 = hash_blob(content);
189 assert_eq!(hash, hash2);
190
191 let expected_data = b"blob 5\0hello";
192 let expected_hash = blake3::hash(expected_data);
193 assert_eq!(hash, *expected_hash.as_bytes());
194 }
195
196 #[test]
197 fn test_normalize_filename() {
198 let nfd = "cafe\u{0301}";
199 let normalized = normalize_filename(nfd);
200 let nfc = "caf\u{00e9}";
201 assert_eq!(normalized, nfc);
202 }
203
204 #[test]
205 fn test_portable_filename_key() {
206 assert_eq!(
207 portable_filename_key("File.txt"),
208 portable_filename_key("file.txt")
209 );
210 assert_eq!(
211 portable_filename_key("cafe\u{0301}.txt"),
212 portable_filename_key("caf\u{00e9}.txt")
213 );
214 assert_eq!(
215 portable_filename_key("Maße.txt"),
216 portable_filename_key("MASSE.txt")
217 );
218 assert_eq!(
219 portable_filename_key("Σ.txt"),
220 portable_filename_key("ς.txt")
221 );
222 assert_eq!(
223 portable_filename_key("flour.txt"),
224 portable_filename_key("flour.txt")
225 );
226 assert_eq!(
227 portable_filename_key("K.txt"),
228 portable_filename_key("k.txt")
229 );
230 assert_ne!(
231 portable_filename_key("A.txt"),
232 portable_filename_key("A.txt")
233 );
234 assert_ne!(
235 portable_filename_key("①.txt"),
236 portable_filename_key("1.txt")
237 );
238 }
239
240 #[test]
241 fn test_hash_tree_empty() {
242 let entries: Vec<(String, String, [u8; 32])> = vec![];
243 let hash = hash_tree(&entries);
244 assert_eq!(hash.len(), 32);
245 }
246
247 #[test]
248 fn test_hash_tree_single_entry() {
249 let file_hash = [0u8; 32];
250 let entries = vec![(MODE_FILE.to_string(), "test.txt".to_string(), file_hash)];
251 let hash = hash_tree(&entries);
252 let hash2 = hash_tree(&entries);
253 assert_eq!(hash.len(), 32);
254 assert_eq!(hash, hash2);
255 }
256
257 #[test]
258 fn test_compute_hash_from_entries_format() {
259 let entries = vec![TreeEntry::File {
260 name: "test.txt".to_string(),
261 content: b"hello".to_vec(),
262 executable: false,
263 }];
264
265 let hash = compute_hash_from_entries(&entries, &[]).unwrap();
266 assert!(hash.starts_with("b3."));
267 let b58_part = &hash[3..];
268 assert!(b58_part.len() >= 40 && b58_part.len() <= 48);
269 }
270
271 #[test]
272 fn test_exclusion() {
273 let entries = vec![
274 TreeEntry::File {
275 name: "excluded.tmp".to_string(),
276 content: b"{}".to_vec(),
277 executable: false,
278 },
279 TreeEntry::File {
280 name: "test.txt".to_string(),
281 content: b"hello".to_vec(),
282 executable: false,
283 },
284 ];
285
286 let hash_all = compute_hash_from_entries(&entries, &[]).unwrap();
287 let hash_excluded =
288 compute_hash_from_entries(&entries, &["excluded.tmp".to_string()]).unwrap();
289 assert_ne!(hash_all, hash_excluded);
290 }
291
292 #[test]
293 fn test_deterministic_hash() {
294 let entries = vec![
295 TreeEntry::File {
296 name: "a.txt".to_string(),
297 content: b"content a".to_vec(),
298 executable: false,
299 },
300 TreeEntry::File {
301 name: "b.txt".to_string(),
302 content: b"content b".to_vec(),
303 executable: false,
304 },
305 ];
306
307 let hash1 = compute_hash_from_entries(&entries, &[]).unwrap();
308 let hash2 = compute_hash_from_entries(&entries, &[]).unwrap();
309 assert_eq!(hash1, hash2);
310 }
311
312 #[test]
313 fn test_should_exclude() {
314 assert!(!should_exclude("test.json", &[]));
315 assert!(!should_exclude("other.json", &[]));
316
317 let excludes = vec!["node_modules".to_string(), ".git".to_string()];
318 assert!(should_exclude("node_modules", &excludes));
319 assert!(should_exclude(".git", &excludes));
320 assert!(!should_exclude("src", &excludes));
321 }
322
323 #[test]
324 fn test_nested_directory() {
325 let entries = vec![
326 TreeEntry::File {
327 name: "main.rs".to_string(),
328 content: b"fn main() {}".to_vec(),
329 executable: false,
330 },
331 TreeEntry::Directory {
332 name: "src".to_string(),
333 children: vec![TreeEntry::File {
334 name: "lib.rs".to_string(),
335 content: b"pub fn foo() {}".to_vec(),
336 executable: false,
337 }],
338 },
339 ];
340
341 let hash = compute_hash_from_entries(&entries, &[]).unwrap();
342 assert!(hash.starts_with("b3."));
343 }
344
345 #[test]
346 fn test_executable_flag() {
347 let entries_normal = vec![TreeEntry::File {
348 name: "script.sh".to_string(),
349 content: b"#!/bin/sh".to_vec(),
350 executable: false,
351 }];
352 let entries_exec = vec![TreeEntry::File {
353 name: "script.sh".to_string(),
354 content: b"#!/bin/sh".to_vec(),
355 executable: true,
356 }];
357
358 let hash_normal = compute_hash_from_entries(&entries_normal, &[]).unwrap();
359 let hash_exec = compute_hash_from_entries(&entries_exec, &[]).unwrap();
360 assert_ne!(hash_normal, hash_exec);
361 }
362
363 #[test]
364 fn rejects_sibling_case_collision() {
365 let entries = vec![
366 TreeEntry::File {
367 name: "File.txt".to_string(),
368 content: b"upper".to_vec(),
369 executable: false,
370 },
371 TreeEntry::File {
372 name: "file.txt".to_string(),
373 content: b"lower".to_vec(),
374 executable: false,
375 },
376 ];
377 let err = compute_hash_from_entries(&entries, &[]).unwrap_err();
378 assert!(err.to_string().contains("filename_portable_conflict"));
379 }
380
381 #[test]
382 fn rejects_sibling_canonical_unicode_collision() {
383 let entries = vec![
384 TreeEntry::File {
385 name: "caf\u{00e9}.txt".to_string(),
386 content: b"nfc".to_vec(),
387 executable: false,
388 },
389 TreeEntry::File {
390 name: "cafe\u{0301}.txt".to_string(),
391 content: b"nfd".to_vec(),
392 executable: false,
393 },
394 ];
395 let err = compute_hash_from_entries(&entries, &[]).unwrap_err();
396 assert!(err.to_string().contains("filename_nfc_conflict"));
397 }
398
399 #[test]
400 fn rejects_sibling_full_case_fold_collision() {
401 let entries = vec![
402 TreeEntry::File {
403 name: "Maße.txt".to_string(),
404 content: b"one".to_vec(),
405 executable: false,
406 },
407 TreeEntry::File {
408 name: "MASSE.txt".to_string(),
409 content: b"two".to_vec(),
410 executable: false,
411 },
412 ];
413 let err = compute_hash_from_entries(&entries, &[]).unwrap_err();
414 assert!(err.to_string().contains("filename_portable_conflict"));
415 }
416
417 #[test]
418 fn rejects_file_vs_directory_portable_collision() {
419 let entries = vec![
420 TreeEntry::File {
421 name: "foo".to_string(),
422 content: b"file".to_vec(),
423 executable: false,
424 },
425 TreeEntry::Directory {
426 name: "FOO".to_string(),
427 children: vec![TreeEntry::File {
428 name: "bar".to_string(),
429 content: b"child".to_vec(),
430 executable: false,
431 }],
432 },
433 ];
434 let err = compute_hash_from_entries(&entries, &[]).unwrap_err();
435 assert!(err.to_string().contains("filename_portable_conflict"));
436 }
437}