1use crate::cid::Cid;
20use crate::proto::{merkledag, unixfs};
21use crate::verify::{DAG_PB_CODEC, DagNode, Hasher, encode_multihash, encode_pbnode_canonical};
22use crate::{CidVersion, FolderEntry, UploadFolderOptions};
23use prost::Message;
24use sha2::{Digest, Sha256};
25use std::collections::BTreeMap;
26use std::fs::File;
27use std::io::Read;
28use std::path::PathBuf;
29
30#[derive(Debug, thiserror::Error)]
31pub enum FolderHashError {
32 #[error("I/O error reading {path}: {source}")]
33 Io {
34 path: PathBuf,
35 #[source]
36 source: std::io::Error,
37 },
38 #[error("HAMT recursion exceeded 8 levels (impossible hash collision)")]
39 HamtDepthExceeded,
40 #[error("sink error: {0}")]
41 Sink(#[source] std::io::Error),
42}
43
44pub type BlockSink<'a> = dyn FnMut(&[u8], &[u8]) -> std::io::Result<()> + 'a;
50
51#[derive(Debug)]
53enum TreeNode {
54 File(PathBuf),
55 Dir(BTreeMap<String, TreeNode>),
56}
57
58fn build_tree(entries: &[FolderEntry]) -> BTreeMap<String, TreeNode> {
61 let mut root: BTreeMap<String, TreeNode> = BTreeMap::new();
62 for entry in entries {
63 let mut cursor = &mut root;
64 let segments: Vec<&str> = entry.relative_path.split('/').collect();
65 let (last, parents) = segments.split_last().expect("non-empty relative path");
66 for part in parents {
67 let next = cursor
68 .entry((*part).to_string())
69 .or_insert_with(|| TreeNode::Dir(BTreeMap::new()));
70 cursor = match next {
71 TreeNode::Dir(map) => map,
72 TreeNode::File(_) => {
73 panic!(
74 "folder tree: {} is both a file and a directory prefix",
75 part
76 );
77 }
78 };
79 }
80 cursor.insert(
81 (*last).to_string(),
82 TreeNode::File(entry.absolute_path.clone()),
83 );
84 }
85 root
86}
87
88#[derive(Debug, Clone)]
90pub(crate) struct ChildLink {
91 pub name: String,
92 pub cid_bytes: Vec<u8>,
94 pub cumulative_size: u64,
95}
96
97fn build_cid_bytes(multihash: Vec<u8>, cid_v1: bool) -> Vec<u8> {
100 if cid_v1 {
101 let mut cid = Vec::with_capacity(2 + multihash.len());
102 cid.push(0x01);
103 cid.push(DAG_PB_CODEC as u8);
104 cid.extend_from_slice(&multihash);
105 cid
106 } else {
107 multihash
108 }
109}
110
111#[cfg_attr(not(test), allow(dead_code))]
123pub(crate) fn build_plain_directory(children: &[ChildLink], cid_v1: bool) -> DagNode {
124 build_plain_directory_with_sink(children, cid_v1, &mut |_, _| Ok(()))
125 .expect("no-op sink cannot fail")
126}
127
128fn build_plain_directory_with_sink(
129 children: &[ChildLink],
130 cid_v1: bool,
131 sink: &mut BlockSink<'_>,
132) -> Result<DagNode, FolderHashError> {
133 let links: Vec<merkledag::PbLink> = children
134 .iter()
135 .map(|c| merkledag::PbLink {
136 hash: Some(c.cid_bytes.clone()),
137 name: Some(c.name.clone()),
138 tsize: Some(c.cumulative_size),
139 })
140 .collect();
141
142 let dir_data = unixfs::Data {
143 r#type: unixfs::DataType::Directory as i32,
144 data: None,
145 filesize: None,
146 blocksizes: vec![],
147 hash_type: None,
148 fanout: None,
149 };
150 let mut dir_bytes = Vec::new();
151 dir_data
152 .encode(&mut dir_bytes)
153 .expect("protobuf encoding cannot fail");
154
155 let node = merkledag::PbNode {
156 links,
157 data: Some(dir_bytes),
158 };
159 let node_bytes = encode_pbnode_canonical(&node);
160 let digest = Sha256::digest(&node_bytes);
161 let mh = encode_multihash(&digest);
162 let cid_bytes = build_cid_bytes(mh, cid_v1);
163
164 let node_size = node_bytes.len() as u64;
165 let children_cumulative: u64 = children.iter().map(|c| c.cumulative_size).sum();
166
167 let dag_node = DagNode {
168 cid_bytes,
169 cumulative_size: node_size + children_cumulative,
170 data_size: 0, };
172 sink(&dag_node.cid_bytes, &node_bytes).map_err(FolderHashError::Sink)?;
173 Ok(dag_node)
174}
175
176const HAMT_SHARDING_SIZE: usize = 256 * 1024; pub(crate) fn estimated_dir_size(children: &[ChildLink]) -> usize {
186 children
187 .iter()
188 .map(|c| c.name.len() + c.cid_bytes.len())
189 .sum()
190}
191
192pub(crate) fn should_shard(children: &[ChildLink]) -> bool {
194 estimated_dir_size(children) >= HAMT_SHARDING_SIZE
195}
196
197pub(crate) fn set_bit(bf: &mut [u8; 32], slot: u8) {
205 let byte = 31 - (slot / 8) as usize;
206 let bit = slot % 8;
207 bf[byte] |= 1 << bit;
208}
209
210#[cfg(test)]
212pub(crate) fn test_bit(bf: &[u8; 32], slot: u8) -> bool {
213 let byte = 31 - (slot / 8) as usize;
214 let bit = slot % 8;
215 (bf[byte] & (1 << bit)) != 0
216}
217
218const HAMT_FANOUT: u32 = 256;
220
221const HAMT_HASH_TYPE: u64 = 0x22;
223
224const HAMT_MURMUR_SEED: u32 = 0;
226
227fn hamt_hash_name(name: &str) -> u64 {
232 use murmur3::murmur3_x64_128;
233 use std::io::Cursor;
234 let h128 = murmur3_x64_128(&mut Cursor::new(name.as_bytes()), HAMT_MURMUR_SEED)
235 .expect("in-memory cursor read cannot fail");
236 h128 as u64
237}
238
239fn hamt_slot(hash: u64, level: u32) -> u8 {
248 ((hash >> (56 - 8 * level)) & 0xff) as u8
249}
250
251#[cfg_attr(not(test), allow(dead_code))]
253pub(crate) fn build_hamt_root(
254 children: &[ChildLink],
255 cid_v1: bool,
256) -> Result<DagNode, FolderHashError> {
257 build_hamt_root_with_sink(children, cid_v1, &mut |_, _| Ok(()))
258}
259
260fn build_hamt_root_with_sink(
261 children: &[ChildLink],
262 cid_v1: bool,
263 sink: &mut BlockSink<'_>,
264) -> Result<DagNode, FolderHashError> {
265 let hashed: Vec<(u64, ChildLink)> = children
266 .iter()
267 .map(|c| (hamt_hash_name(&c.name), c.clone()))
268 .collect();
269 build_hamt_node_with_sink(&hashed, 0, cid_v1, sink)
270}
271
272fn build_hamt_node_with_sink(
273 entries: &[(u64, ChildLink)],
274 level: u32,
275 cid_v1: bool,
276 sink: &mut BlockSink<'_>,
277) -> Result<DagNode, FolderHashError> {
278 if level >= 8 {
279 return Err(FolderHashError::HamtDepthExceeded);
280 }
281
282 let mut buckets: [Vec<(u64, ChildLink)>; 256] = std::array::from_fn(|_| Vec::new());
284 for (h, c) in entries {
285 let slot = hamt_slot(*h, level) as usize;
286 buckets[slot].push((*h, c.clone()));
287 }
288
289 let mut bitfield = [0u8; 32];
290 let mut links: Vec<merkledag::PbLink> = Vec::new();
291 let mut children_cumulative: u64 = 0;
292
293 for (slot_idx, bucket) in buckets.iter().enumerate() {
294 if bucket.is_empty() {
295 continue;
296 }
297 set_bit(&mut bitfield, slot_idx as u8);
298
299 let (link_name, child_dag) = if bucket.len() == 1 {
300 let (_, c) = &bucket[0];
302 let name = format!("{:02X}{}", slot_idx, c.name);
303 (
304 name,
305 DagNode {
306 cid_bytes: c.cid_bytes.clone(),
307 cumulative_size: c.cumulative_size,
308 data_size: 0,
309 },
310 )
311 } else {
312 let sub = build_hamt_node_with_sink(bucket, level + 1, cid_v1, sink)?;
314 (format!("{slot_idx:02X}"), sub)
315 };
316
317 children_cumulative += child_dag.cumulative_size;
318 links.push(merkledag::PbLink {
319 hash: Some(child_dag.cid_bytes),
320 name: Some(link_name),
321 tsize: Some(child_dag.cumulative_size),
322 });
323 }
324
325 links.sort_by(|a, b| a.name.cmp(&b.name));
328
329 let bf_vec: Vec<u8> = {
332 let start = bitfield.iter().position(|&b| b != 0).unwrap_or(32);
333 bitfield[start..].to_vec()
334 };
335
336 let unixfs_data = unixfs::Data {
337 r#type: unixfs::DataType::HamtShard as i32,
338 data: Some(bf_vec),
339 filesize: None,
340 blocksizes: vec![],
341 hash_type: Some(HAMT_HASH_TYPE),
342 fanout: Some(HAMT_FANOUT as u64),
343 };
344 let mut data_bytes = Vec::new();
345 unixfs_data
346 .encode(&mut data_bytes)
347 .expect("protobuf encoding cannot fail");
348
349 let node = merkledag::PbNode {
350 links,
351 data: Some(data_bytes),
352 };
353 let node_bytes = encode_pbnode_canonical(&node);
354 let digest = Sha256::digest(&node_bytes);
355 let mh = encode_multihash(&digest);
356 let cid_bytes = build_cid_bytes(mh, cid_v1);
357
358 let node_size = node_bytes.len() as u64;
359
360 let dag_node = DagNode {
361 cid_bytes,
362 cumulative_size: node_size + children_cumulative,
363 data_size: 0,
364 };
365 sink(&dag_node.cid_bytes, &node_bytes).map_err(FolderHashError::Sink)?;
366 Ok(dag_node)
367}
368
369pub fn build_folder_dag(
375 entries: &[FolderEntry],
376 opts: &UploadFolderOptions,
377 sink: &mut BlockSink<'_>,
378) -> Result<Cid, FolderHashError> {
379 let tree = build_tree(entries);
380 let cid_v1 = matches!(opts.cid_version, CidVersion::V1);
381 let root = hash_dir_with_sink(&tree, cid_v1, sink)?;
382 Ok(cid_bytes_to_cid(&root.cid_bytes))
383}
384
385pub fn hash_folder_root(
405 entries: &[FolderEntry],
406 opts: &UploadFolderOptions,
407) -> Result<Cid, FolderHashError> {
408 build_folder_dag(entries, opts, &mut |_, _| Ok(()))
409}
410
411fn hash_dir_with_sink(
412 tree: &BTreeMap<String, TreeNode>,
413 cid_v1: bool,
414 sink: &mut BlockSink<'_>,
415) -> Result<DagNode, FolderHashError> {
416 let mut children: Vec<ChildLink> = Vec::with_capacity(tree.len());
417 for (name, node) in tree {
418 let dag = match node {
419 TreeNode::File(path) => hash_file_with_sink(path, cid_v1, sink)?,
420 TreeNode::Dir(sub) => hash_dir_with_sink(sub, cid_v1, sink)?,
421 };
422 children.push(ChildLink {
423 name: name.clone(),
424 cid_bytes: dag.cid_bytes,
425 cumulative_size: dag.cumulative_size,
426 });
427 }
428
429 if should_shard(&children) {
430 build_hamt_root_with_sink(&children, cid_v1, sink)
431 } else {
432 build_plain_directory_with_sink(&children, cid_v1, sink)
433 }
434}
435
436fn hash_file_with_sink(
437 path: &std::path::Path,
438 cid_v1: bool,
439 sink: &mut BlockSink<'_>,
440) -> Result<DagNode, FolderHashError> {
441 let mut hasher = if cid_v1 {
442 Hasher::for_ipfs_v1_raw_leaves()
443 } else {
444 Hasher::for_ipfs()
445 };
446
447 let mut f = File::open(path).map_err(|e| FolderHashError::Io {
448 path: path.to_path_buf(),
449 source: e,
450 })?;
451 let mut buf = vec![0u8; 64 * 1024];
452 loop {
453 let n = f.read(&mut buf).map_err(|e| FolderHashError::Io {
454 path: path.to_path_buf(),
455 source: e,
456 })?;
457 if n == 0 {
458 break;
459 }
460 hasher
461 .update_with_sink(&buf[..n], sink)
462 .map_err(FolderHashError::Sink)?;
463 }
464
465 hasher
469 .finalize_with_sink(sink)
470 .map_err(FolderHashError::Sink)
471}
472
473fn cid_bytes_to_cid(bytes: &[u8]) -> Cid {
474 let cid_str = if bytes.first() == Some(&0x01) {
478 let parsed = ::cid::Cid::try_from(bytes).expect("hash_folder_root produces valid CIDs");
479 parsed.to_string()
480 } else {
481 let mh = multihash::Multihash::<64>::from_bytes(bytes)
482 .expect("CIDv0 multihash bytes must parse");
483 let parsed = ::cid::Cid::new_v0(mh).expect("valid sha2-256 multihash");
484 parsed.to_string()
485 };
486 Cid::try_from(cid_str.as_str()).expect("round-trip CID parse")
487}
488
489#[cfg(test)]
490mod tests {
491 use super::*;
492
493 fn entry(rel: &str) -> FolderEntry {
494 FolderEntry {
495 relative_path: rel.to_string(),
496 absolute_path: PathBuf::from(format!("/abs/{rel}")),
497 }
498 }
499
500 #[test]
501 fn build_tree_flat() {
502 let entries = vec![entry("a.txt"), entry("b.txt")];
503 let tree = build_tree(&entries);
504 assert_eq!(tree.len(), 2);
505 assert!(matches!(tree.get("a.txt"), Some(TreeNode::File(_))));
506 assert!(matches!(tree.get("b.txt"), Some(TreeNode::File(_))));
507 }
508
509 #[test]
510 fn build_tree_nested() {
511 let entries = vec![
512 entry("top.txt"),
513 entry("sub/inner.txt"),
514 entry("sub/deeper/leaf.txt"),
515 ];
516 let tree = build_tree(&entries);
517 assert!(matches!(tree.get("top.txt"), Some(TreeNode::File(_))));
518 let sub = match tree.get("sub") {
519 Some(TreeNode::Dir(m)) => m,
520 _ => panic!("sub must be a Dir"),
521 };
522 assert!(matches!(sub.get("inner.txt"), Some(TreeNode::File(_))));
523 let deeper = match sub.get("deeper") {
524 Some(TreeNode::Dir(m)) => m,
525 _ => panic!("deeper must be a Dir"),
526 };
527 assert!(matches!(deeper.get("leaf.txt"), Some(TreeNode::File(_))));
528 }
529
530 #[test]
531 fn build_tree_alphabetic_via_btreemap() {
532 let entries = vec![entry("zebra.txt"), entry("apple.txt"), entry("mango.txt")];
533 let tree = build_tree(&entries);
534 let names: Vec<&String> = tree.keys().collect();
535 assert_eq!(names, vec!["apple.txt", "mango.txt", "zebra.txt"]);
536 }
537
538 #[test]
539 fn plain_dir_two_entries_canonical_encoding() {
540 let child_a = ChildLink {
544 name: "a.txt".to_string(),
545 cid_bytes: stub_cidv1_raw(0x00),
546 cumulative_size: 1,
547 };
548 let child_b = ChildLink {
549 name: "b.txt".to_string(),
550 cid_bytes: stub_cidv1_raw(0x01),
551 cumulative_size: 1,
552 };
553
554 let dag = build_plain_directory(&[child_a.clone(), child_b.clone()], true);
555
556 assert_eq!(dag.cid_bytes[0], 0x01); assert_eq!(dag.cid_bytes[1], 0x70); assert!(dag.cumulative_size > child_a.cumulative_size + child_b.cumulative_size);
561 }
562
563 fn stub_cidv1_raw(b: u8) -> Vec<u8> {
564 let mut v = vec![0x01u8, 0x55, 0x12, 0x20];
566 v.extend(std::iter::repeat_n(b, 32));
567 v
568 }
569
570 #[test]
571 fn estimated_dir_size_short_names() {
572 let children = vec![
576 ChildLink {
577 name: "a".repeat(10),
578 cid_bytes: stub_cidv1_raw(0x00),
579 cumulative_size: 0,
580 },
581 ChildLink {
582 name: "b".repeat(10),
583 cid_bytes: stub_cidv1_raw(0x01),
584 cumulative_size: 0,
585 },
586 ];
587 assert_eq!(estimated_dir_size(&children), 92);
588 }
589
590 #[test]
591 fn shard_threshold_below_and_above() {
592 let stub_cid = stub_cidv1_raw(0x00);
600 let small: Vec<ChildLink> = (0..5957)
601 .map(|i| ChildLink {
602 name: format!("{i:08}"),
603 cid_bytes: stub_cid.clone(),
604 cumulative_size: 0,
605 })
606 .collect();
607 let big: Vec<ChildLink> = (0..5958)
608 .map(|i| ChildLink {
609 name: format!("{i:08}"),
610 cid_bytes: stub_cid.clone(),
611 cumulative_size: 0,
612 })
613 .collect();
614 assert!(!should_shard(&small));
615 assert!(should_shard(&big));
616 }
617
618 #[test]
619 fn bitfield_set_and_test() {
620 let mut bf = [0u8; 32];
621 set_bit(&mut bf, 0);
622 set_bit(&mut bf, 7);
623 set_bit(&mut bf, 8);
624 set_bit(&mut bf, 255);
625 assert!(test_bit(&bf, 0));
626 assert!(test_bit(&bf, 7));
627 assert!(test_bit(&bf, 8));
628 assert!(test_bit(&bf, 255));
629 assert!(!test_bit(&bf, 1));
630 assert!(!test_bit(&bf, 254));
631 }
632
633 #[test]
634 fn hash_folder_root_single_file_v1_raw() {
635 use crate::{CidVersion, UploadFolderOptions};
636 use std::io::Write;
637 use tempfile::TempDir;
638
639 let dir = TempDir::new().unwrap();
640 let path = dir.path().join("hello.txt");
641 let mut f = std::fs::File::create(&path).unwrap();
642 f.write_all(b"hello\n").unwrap();
643 drop(f);
644
645 let entries = vec![FolderEntry {
646 relative_path: "hello.txt".to_string(),
647 absolute_path: path,
648 }];
649 let opts = UploadFolderOptions {
650 cid_version: CidVersion::V1,
651 ..Default::default()
652 };
653
654 let cid = hash_folder_root(&entries, &opts).expect("hash should succeed");
655
656 let s = cid.to_string();
657 assert!(s.starts_with("bafy"), "expected CIDv1 dag-pb root, got {s}");
658 }
659
660 #[test]
661 fn hamt_single_level_two_entries() {
662 let kids = vec![
668 ChildLink {
669 name: "alpha".to_string(),
670 cid_bytes: stub_cidv1_raw(0xaa),
671 cumulative_size: 1,
672 },
673 ChildLink {
674 name: "bravo".to_string(),
675 cid_bytes: stub_cidv1_raw(0xbb),
676 cumulative_size: 1,
677 },
678 ];
679 let dag = build_hamt_root(&kids, true).expect("HAMT must build");
680
681 assert_eq!(dag.cid_bytes[0], 0x01);
682 assert_eq!(dag.cid_bytes[1], 0x70);
683 assert!(dag.cumulative_size > 0);
684 }
685}