Skip to main content

aleph_cid/
verify.rs

1use crate::cid::Cid;
2use cid::Cid as LibCid;
3use prost::Message;
4use sha2::{Digest, Sha256};
5
/// Appends `value` to `buf` as a protobuf varint (unsigned LEB128).
///
/// The value is written seven bits at a time, least-significant group first.
/// Every byte except the last has its high bit set to signal that more bytes
/// follow.
fn encode_varint(mut value: u64, buf: &mut Vec<u8>) {
    loop {
        let low_bits = (value & 0x7f) as u8;
        value >>= 7;
        if value == 0 {
            // Final group: continuation bit stays clear.
            buf.push(low_bits);
            return;
        }
        buf.push(low_bits | 0x80);
    }
}
14
15/// Encode a protobuf field tag (field number + wire type).
16fn encode_tag(field_number: u32, wire_type: u8, buf: &mut Vec<u8>) {
17    encode_varint(((field_number as u64) << 3) | wire_type as u64, buf);
18}
19
/// Protobuf wire type 2 (length-delimited): the field key is followed by a
/// varint byte length and then that many payload bytes.
const WIRE_TYPE_LEN: u8 = 2;
22
23/// Encode a PBNode in canonical dag-pb order (Links before Data).
24///
25/// Standard prost encoding emits fields in field-number order (Data=1 before
26/// Links=2), but the IPFS dag-pb spec mandates Links before Data. Without this
27/// ordering the SHA-256 digest — and therefore the CID — will differ from what
28/// IPFS computes.
29pub(crate) fn encode_pbnode_canonical(node: &merkledag::PbNode) -> Vec<u8> {
30    let mut buf = Vec::new();
31
32    // Links (field 2) first
33    for link in &node.links {
34        let mut link_buf = Vec::new();
35        prost::Message::encode(link, &mut link_buf).expect("encoding PBLink");
36        encode_tag(2, WIRE_TYPE_LEN, &mut buf);
37        encode_varint(link_buf.len() as u64, &mut buf);
38        buf.extend_from_slice(&link_buf);
39    }
40
41    // Data (field 1) second
42    if let Some(data) = &node.data {
43        encode_tag(1, WIRE_TYPE_LEN, &mut buf);
44        encode_varint(data.len() as u64, &mut buf);
45        buf.extend_from_slice(data);
46    }
47
48    buf
49}
50
51use crate::proto::merkledag;
52use crate::proto::unixfs;
53
/// Error for a CID whose codec/version combination this crate cannot
/// reproduce.
///
/// The wrapped string is the detail shown after `unsupported CID format: ` in
/// the error's `Display` output.
#[derive(Debug, thiserror::Error)]
#[error("unsupported CID format: {0}")]
pub struct UnsupportedCid(pub String);
58
/// Codec code `raw` used in CIDv1: the block bytes are the data itself.
pub(crate) const RAW_CODEC: u64 = 0x55;

/// Codec code `dag-pb` used in CIDv1.
pub(crate) const DAG_PB_CODEC: u64 = 0x70;

/// Size of one file chunk: 256 KiB, the IPFS default.
pub(crate) const CHUNK_SIZE: usize = 256 * 1024;

/// Largest number of links placed in a single node, the IPFS default
/// (go-ipfs `helpers.DefaultLinksPerBlock`).
const MAX_LINKS: usize = 174;
70
/// Wraps a SHA-256 digest as a multihash: `[0x12, 0x20, ...digest bytes...]`.
///
/// The two header bytes are fixed (`0x12` = SHA-256 code, `0x20` = 32-byte
/// length) and are written regardless of `digest.len()`, so callers are
/// expected to pass a 32-byte digest.
pub(crate) fn encode_multihash(digest: &[u8]) -> Vec<u8> {
    const SHA256_HEADER: [u8; 2] = [0x12, 0x20];
    [&SHA256_HEADER[..], digest].concat()
}
79
/// Summary of one node (leaf or internal) produced while the DAG is built.
///
/// Only what a parent needs to link to the node is kept: its CID bytes and
/// two size counters. The block bytes themselves are not stored here.
#[derive(Debug)]
pub struct DagNode {
    /// The CID bytes stored in PBLink.Hash.
    /// For CIDv0: bare multihash [0x12, 0x20, ...32 bytes SHA-256 digest...]
    /// For CIDv1: full CID binary (varint version + varint codec + multihash)
    pub(crate) cid_bytes: Vec<u8>,
    /// Cumulative size: serialized node bytes + sum of children's cumulative sizes.
    /// For raw leaves this equals the raw chunk size. Used for PBLink.Tsize.
    pub(crate) cumulative_size: u64,
    /// Total file data bytes covered by this subtree. Parents sum this into
    /// their UnixFS `filesize` and list it per child in `blocksizes`.
    pub(crate) data_size: u64,
}
93
/// A streaming hasher that accumulates data and produces a [`Cid`] on
/// finalization.
///
/// Data is fed with `update`; `finalize` consumes the hasher and returns the
/// CID. The variant decides how the input is turned into blocks.
#[derive(Debug)]
pub enum Hasher {
    /// CIDv1 with the raw codec: the whole input is one raw block, hashed
    /// incrementally with SHA-256.
    CidRaw { hasher: Sha256 },
    /// dag-pb DAG (CIDv0 wrapped leaves, or CIDv1 with raw leaves).
    DagPb {
        /// Bytes of the chunk currently being filled (at most `CHUNK_SIZE`).
        buffer: Vec<u8>,
        /// Leaves already built from completed chunks, in input order.
        leaves: Vec<DagNode>,
        /// `true` for CIDv1 raw leaves, `false` for CIDv0 wrapped leaves.
        raw_leaves: bool,
    },
}
107
108impl Hasher {
109    /// Creates a hasher for IPFS CIDv0 dag-pb (wrapped leaves, balanced DAG).
110    pub fn for_ipfs() -> Self {
111        Self::DagPb {
112            buffer: Vec::with_capacity(CHUNK_SIZE),
113            leaves: Vec::new(),
114            raw_leaves: false,
115        }
116    }
117
118    /// Creates a hasher for IPFS CIDv1 dag-pb with raw leaves.
119    ///
120    /// Matches kubo's `ipfs add --cid-version=1 --raw-leaves` defaults: chunks
121    /// of 256 KiB are stored as raw blocks (codec 0x55) and the file root (if
122    /// the file spans multiple chunks) is a dag-pb node linking those raw leaves.
123    /// A single-chunk file collapses to a bare raw block whose CID is `bafkrei…`.
124    pub fn for_ipfs_v1_raw_leaves() -> Self {
125        Self::DagPb {
126            buffer: Vec::with_capacity(CHUNK_SIZE),
127            leaves: Vec::new(),
128            raw_leaves: true,
129        }
130    }
131
132    /// Creates a hasher that reproduces the construction of the given CID:
133    /// dag-pb with wrapped leaves for CIDv0, a single raw block for CIDv1
134    /// raw, dag-pb with raw leaves for CIDv1 dag-pb.
135    pub fn for_expected(expected: &Cid) -> Result<Self, UnsupportedCid> {
136        if expected.is_v0() {
137            // CIDv0 is always dag-pb codec with wrapped leaves
138            return Ok(Self::for_ipfs());
139        }
140
141        let parsed = LibCid::try_from(expected.as_str())
142            .map_err(|e| UnsupportedCid(format!("{expected}: {e}")))?;
143
144        match parsed.codec() {
145            RAW_CODEC => Ok(Self::CidRaw {
146                hasher: Sha256::new(),
147            }),
148            DAG_PB_CODEC => Ok(Self::for_ipfs_v1_raw_leaves()),
149            other => Err(UnsupportedCid(format!(
150                "{expected}: unsupported codec 0x{other:x}"
151            ))),
152        }
153    }
154
155    /// Feed data into the hasher.
156    pub fn update(&mut self, data: &[u8]) {
157        match self {
158            Self::CidRaw { hasher, .. } => hasher.update(data),
159            Self::DagPb {
160                buffer,
161                leaves,
162                raw_leaves,
163            } => {
164                let raw = *raw_leaves;
165                let mut remaining = data;
166                while !remaining.is_empty() {
167                    let space = CHUNK_SIZE - buffer.len();
168                    let take = remaining.len().min(space);
169                    buffer.extend_from_slice(&remaining[..take]);
170                    remaining = &remaining[take..];
171                    if buffer.len() == CHUNK_SIZE {
172                        let leaf = if raw {
173                            Self::build_raw_leaf(buffer)
174                        } else {
175                            Self::build_leaf(buffer)
176                        };
177                        leaves.push(leaf);
178                        buffer.clear();
179                    }
180                }
181            }
182        }
183    }
184
185    /// Build a dag-pb leaf node from a chunk of data.
186    pub(crate) fn build_leaf(chunk: &[u8]) -> DagNode {
187        let (node, _bytes) = Self::build_leaf_with_bytes(chunk);
188        node
189    }
190
191    /// Build a dag-pb leaf node from a chunk of data, also returning the
192    /// canonical pbnode bytes used to compute the CID.
193    pub(crate) fn build_leaf_with_bytes(chunk: &[u8]) -> (DagNode, Vec<u8>) {
194        let unixfs_data = unixfs::Data {
195            r#type: unixfs::DataType::File as i32,
196            data: if chunk.is_empty() {
197                None
198            } else {
199                Some(chunk.to_vec())
200            },
201            filesize: Some(chunk.len() as u64),
202            blocksizes: vec![],
203            hash_type: None,
204            fanout: None,
205        };
206        let mut unixfs_bytes = Vec::new();
207        unixfs_data
208            .encode(&mut unixfs_bytes)
209            .expect("protobuf encoding cannot fail for in-memory buffers");
210
211        let node = merkledag::PbNode {
212            links: vec![],
213            data: Some(unixfs_bytes),
214        };
215        let node_bytes = encode_pbnode_canonical(&node);
216
217        let digest = Sha256::digest(&node_bytes);
218        let cid_bytes = encode_multihash(&digest);
219
220        let dag_node = DagNode {
221            cid_bytes,
222            cumulative_size: node_bytes.len() as u64,
223            data_size: chunk.len() as u64,
224        };
225        (dag_node, node_bytes)
226    }
227
228    /// Build a raw leaf node: hash the chunk directly without dag-pb/UnixFS wrapping.
229    /// Used for CIDv1 dag-pb which defaults to raw leaves.
230    pub(crate) fn build_raw_leaf(chunk: &[u8]) -> DagNode {
231        let digest = Sha256::digest(chunk);
232        let mh = encode_multihash(&digest);
233        // CIDv1 raw binary: varint(1) + varint(RAW_CODEC) + multihash
234        let mut cid_bytes = Vec::with_capacity(2 + mh.len());
235        cid_bytes.push(0x01); // CID version 1
236        cid_bytes.push(RAW_CODEC as u8); // 0x55
237        cid_bytes.extend_from_slice(&mh);
238
239        DagNode {
240            cid_bytes,
241            cumulative_size: chunk.len() as u64,
242            data_size: chunk.len() as u64,
243        }
244    }
245
246    /// Build an internal dag-pb node from a list of children.
247    /// When `v1` is true, produces CIDv1 dag-pb binary; otherwise bare multihash (CIDv0).
248    pub(crate) fn build_internal_node(children: &[DagNode], v1: bool) -> DagNode {
249        let (node, _bytes) = Self::build_internal_node_with_bytes(children, v1);
250        node
251    }
252
253    /// Build an internal dag-pb node from a list of children, also returning the
254    /// canonical pbnode bytes used to compute the CID.
255    /// When `v1` is true, produces CIDv1 dag-pb binary; otherwise bare multihash (CIDv0).
256    pub(crate) fn build_internal_node_with_bytes(
257        children: &[DagNode],
258        v1: bool,
259    ) -> (DagNode, Vec<u8>) {
260        let total_data_size: u64 = children.iter().map(|c| c.data_size).sum();
261        let blocksizes: Vec<u64> = children.iter().map(|c| c.data_size).collect();
262
263        let links: Vec<merkledag::PbLink> = children
264            .iter()
265            .map(|c| merkledag::PbLink {
266                hash: Some(c.cid_bytes.clone()),
267                name: Some(String::new()),
268                tsize: Some(c.cumulative_size),
269            })
270            .collect();
271
272        let root_unixfs = unixfs::Data {
273            r#type: unixfs::DataType::File as i32,
274            data: None,
275            filesize: Some(total_data_size),
276            blocksizes,
277            hash_type: None,
278            fanout: None,
279        };
280        let mut root_unixfs_bytes = Vec::new();
281        root_unixfs
282            .encode(&mut root_unixfs_bytes)
283            .expect("protobuf encoding cannot fail");
284
285        let node = merkledag::PbNode {
286            links,
287            data: Some(root_unixfs_bytes),
288        };
289        let node_bytes = encode_pbnode_canonical(&node);
290
291        let digest = Sha256::digest(&node_bytes);
292        let mh = encode_multihash(&digest);
293
294        let cid_bytes = if v1 {
295            let mut cid = Vec::with_capacity(2 + mh.len());
296            cid.push(0x01); // CID version 1
297            cid.push(DAG_PB_CODEC as u8); // 0x70
298            cid.extend_from_slice(&mh);
299            cid
300        } else {
301            mh
302        };
303
304        let node_size = node_bytes.len() as u64;
305        let children_cumulative: u64 = children.iter().map(|c| c.cumulative_size).sum();
306
307        let dag_node = DagNode {
308            cid_bytes,
309            cumulative_size: node_size + children_cumulative,
310            data_size: total_data_size,
311        };
312        (dag_node, node_bytes)
313    }
314
315    /// Like `update`, but invokes `sink` for each complete leaf (raw or
316    /// pbnode-wrapped) as it is finalized.
317    #[allow(clippy::type_complexity)]
318    pub(crate) fn update_with_sink(
319        &mut self,
320        data: &[u8],
321        sink: &mut dyn FnMut(&[u8], &[u8]) -> std::io::Result<()>,
322    ) -> std::io::Result<()> {
323        match self {
324            Self::CidRaw { hasher, .. } => {
325                hasher.update(data);
326                Ok(())
327            }
328            Self::DagPb {
329                buffer,
330                leaves,
331                raw_leaves,
332            } => {
333                let raw = *raw_leaves;
334                let mut remaining = data;
335                while !remaining.is_empty() {
336                    let space = CHUNK_SIZE - buffer.len();
337                    let take = remaining.len().min(space);
338                    buffer.extend_from_slice(&remaining[..take]);
339                    remaining = &remaining[take..];
340                    if buffer.len() == CHUNK_SIZE {
341                        let (leaf, block_bytes) = if raw {
342                            let bytes = buffer.clone();
343                            (Self::build_raw_leaf(buffer), bytes)
344                        } else {
345                            Self::build_leaf_with_bytes(buffer)
346                        };
347                        sink(&leaf.cid_bytes, &block_bytes)?;
348                        leaves.push(leaf);
349                        buffer.clear();
350                    }
351                }
352                Ok(())
353            }
354        }
355    }
356
357    /// Drain the trailing partial buffer (if any), emit final leaf and all
358    /// internal nodes up to the root, invoking `sink` for each block.
359    /// Returns the root `DagNode`.
360    ///
361    /// Panics if called on a `CidRaw` hasher.
362    #[allow(clippy::type_complexity)]
363    pub(crate) fn finalize_with_sink(
364        self,
365        sink: &mut dyn FnMut(&[u8], &[u8]) -> std::io::Result<()>,
366    ) -> std::io::Result<DagNode> {
367        match self {
368            Self::DagPb {
369                buffer,
370                mut leaves,
371                raw_leaves,
372            } => {
373                let v1 = raw_leaves;
374
375                // Single-leaf (or empty-file) fast path: file fits in one chunk.
376                if leaves.is_empty() {
377                    let (leaf, block_bytes) = if raw_leaves {
378                        let bytes = buffer.clone();
379                        (Self::build_raw_leaf(&buffer), bytes)
380                    } else {
381                        Self::build_leaf_with_bytes(&buffer)
382                    };
383                    sink(&leaf.cid_bytes, &block_bytes)?;
384                    return Ok(leaf);
385                }
386
387                // Flush remaining partial chunk.
388                if !buffer.is_empty() {
389                    let (leaf, block_bytes) = if raw_leaves {
390                        let bytes = buffer.clone();
391                        (Self::build_raw_leaf(&buffer), bytes)
392                    } else {
393                        Self::build_leaf_with_bytes(&buffer)
394                    };
395                    sink(&leaf.cid_bytes, &block_bytes)?;
396                    leaves.push(leaf);
397                }
398
399                // Build internal node tree, emitting each level to sink.
400                let mut nodes = leaves;
401                while nodes.len() > 1 {
402                    let mut next_level = Vec::with_capacity(nodes.len().div_ceil(MAX_LINKS));
403                    for chunk in nodes.chunks(MAX_LINKS) {
404                        let (internal, node_bytes) =
405                            Self::build_internal_node_with_bytes(chunk, v1);
406                        sink(&internal.cid_bytes, &node_bytes)?;
407                        next_level.push(internal);
408                    }
409                    nodes = next_level;
410                }
411                Ok(nodes.into_iter().next().unwrap())
412            }
413            _ => panic!("finalize_with_sink called on non-DagPb hasher"),
414        }
415    }
416
417    /// Build the root `DagNode` for a `DagPb` hasher, including the correct
418    /// cumulative_size for use as `PBLink.Tsize` in a parent directory node.
419    ///
420    /// Panics if called on a `CidRaw` hasher (only `DagPb` produces a DAG
421    /// node with meaningful cumulative_size).
422    pub(crate) fn finalize_dag_node(self) -> DagNode {
423        match self {
424            Self::DagPb {
425                buffer,
426                mut leaves,
427                raw_leaves,
428            } => {
429                let make_leaf = |chunk: &[u8]| {
430                    if raw_leaves {
431                        Self::build_raw_leaf(chunk)
432                    } else {
433                        Self::build_leaf(chunk)
434                    }
435                };
436
437                // Flush any remaining bytes in buffer as the last chunk.
438                if !buffer.is_empty() {
439                    leaves.push(make_leaf(&buffer));
440                }
441
442                let v1 = raw_leaves;
443
444                if leaves.is_empty() {
445                    make_leaf(&[])
446                } else if leaves.len() == 1 {
447                    leaves.into_iter().next().unwrap()
448                } else {
449                    let mut nodes = leaves;
450                    while nodes.len() > 1 {
451                        nodes = nodes
452                            .chunks(MAX_LINKS)
453                            .map(|c| Self::build_internal_node(c, v1))
454                            .collect();
455                    }
456                    nodes.into_iter().next().unwrap()
457                }
458            }
459            _ => panic!("finalize_dag_node called on non-DagPb hasher"),
460        }
461    }
462
463    /// Finalize the hasher and return the computed [`Cid`].
464    pub fn finalize(self) -> Cid {
465        match self {
466            Self::CidRaw { hasher } => {
467                let digest = hasher.finalize();
468                // SHA-256 multihash code is 0x12
469                let mh = multihash::Multihash::<64>::wrap(0x12, &digest)
470                    .expect("SHA-256 digest fits in 64-byte multihash");
471                let computed_lib_cid = LibCid::new_v1(RAW_CODEC, mh);
472                let computed_cid_str = computed_lib_cid.to_string();
473                Cid::try_from(computed_cid_str.as_str()).expect("valid computed CID")
474            }
475            Self::DagPb { .. } => {
476                let root_node = self.finalize_dag_node();
477                let root_cid_bytes = root_node.cid_bytes;
478
479                // CIDv1 binary starts with 0x01 (version byte); CIDv0 bare
480                // multihash starts with 0x12 (sha2-256 code).
481                let computed_cid_str = if root_cid_bytes.first() == Some(&0x01) {
482                    let lib_cid =
483                        LibCid::try_from(&root_cid_bytes[..]).expect("valid CIDv1 from build");
484                    lib_cid.to_string()
485                } else {
486                    // CIDv0: bare multihash (raw_leaves=false)
487                    bs58::encode(&root_cid_bytes).into_string()
488                };
489
490                Cid::try_from(computed_cid_str.as_str()).expect("computed CID is always valid")
491            }
492        }
493    }
494}
495
496/// Computes an IPFS CIDv0 (dag-pb) for the given data.
497///
498/// Uses the same chunking and tree construction as IPFS's default settings
499/// (256 KiB chunks, balanced DAG, wrapped leaves).
500pub fn compute_cid(data: &[u8]) -> Cid {
501    let mut hasher = Hasher::for_ipfs();
502    hasher.update(data);
503    hasher.finalize()
504}
505
506#[cfg(test)]
507mod tests {
508    use super::*;
509
510    /// Recompute the data's CID with the hasher implied by `expected` and
511    /// assert it matches.
512    fn assert_cid_roundtrip(expected: &Cid, chunks: &[&[u8]]) {
513        let mut hasher = Hasher::for_expected(expected).unwrap();
514        for chunk in chunks {
515            hasher.update(chunk);
516        }
517        assert_eq!(&hasher.finalize(), expected);
518    }
519
520    #[test]
521    fn test_verify_cidv1_raw_success() {
522        use cid::Cid as LibCid;
523        use multihash_codetable::{Code, MultihashDigest};
524
525        let data = b"hello ipfs world";
526        // Compute the expected CIDv1 raw
527        let mh = Code::Sha2_256.digest(data);
528        let expected_cid = LibCid::new_v1(0x55, mh); // 0x55 = raw codec
529        // Convert to our Cid type (base32 encoded string)
530        let cid_string = expected_cid.to_string();
531        let expected = Cid::try_from(cid_string.as_str()).unwrap();
532
533        assert_cid_roundtrip(&expected, &[data]);
534    }
535
536    #[test]
537    fn test_verify_cidv1_raw_failure() {
538        use cid::Cid as LibCid;
539        use multihash_codetable::{Code, MultihashDigest};
540
541        let data = b"hello ipfs world";
542        let mh = Code::Sha2_256.digest(data);
543        let expected_cid = LibCid::new_v1(0x55, mh);
544        let cid_string = expected_cid.to_string();
545        let expected = Cid::try_from(cid_string.as_str()).unwrap();
546
547        let mut hasher = Hasher::for_expected(&expected).unwrap();
548        hasher.update(b"wrong data");
549        assert_ne!(hasher.finalize(), expected);
550    }
551
552    #[test]
553    fn test_for_expected_unsupported_codec_errors() {
554        use cid::Cid as LibCid;
555        use multihash_codetable::{Code, MultihashDigest};
556
557        // CIDv1 with dag-cbor codec (0x71), built from a valid sha-256
558        // multihash so it parses as a CID but is not reproducible here.
559        let mh = Code::Sha2_256.digest(b"x");
560        let dag_cbor = LibCid::new_v1(0x71, mh).to_string();
561        let expected = Cid::try_from(dag_cbor.as_str()).unwrap();
562        let err = Hasher::for_expected(&expected).unwrap_err();
563        assert!(err.to_string().contains("0x71"));
564    }
565
566    #[test]
567    fn test_verify_cidv0_single_chunk() {
568        // Small file (< 256KB) -> single leaf node, no intermediate DAG
569        let data = b"hello dag-pb world";
570
571        // Build the expected CIDv0 manually using our protobuf types
572        let unixfs_data = unixfs::Data {
573            r#type: unixfs::DataType::File as i32,
574            data: Some(data.to_vec()),
575            filesize: Some(data.len() as u64),
576            blocksizes: vec![],
577            hash_type: None,
578            fanout: None,
579        };
580        let mut unixfs_bytes = Vec::new();
581        unixfs_data.encode(&mut unixfs_bytes).unwrap();
582
583        let node = merkledag::PbNode {
584            links: vec![],
585            data: Some(unixfs_bytes),
586        };
587        let node_bytes = encode_pbnode_canonical(&node);
588
589        let digest = sha2::Sha256::digest(&node_bytes);
590        let mut multihash_bytes = vec![0x12, 0x20];
591        multihash_bytes.extend_from_slice(&digest);
592        let expected_cidv0 = bs58::encode(&multihash_bytes).into_string();
593
594        let expected = Cid::try_from(expected_cidv0.as_str()).unwrap();
595        assert_cid_roundtrip(&expected, &[data]);
596    }
597
598    #[test]
599    fn test_verify_cidv0_multi_chunk() {
600        // File larger than 256KB -> multiple leaves + root node
601        let chunk_size = 262144;
602        let data = vec![0xABu8; chunk_size + 100]; // slightly over one chunk
603
604        // Build leaf 1
605        let chunk1 = &data[..chunk_size];
606        let leaf1_unixfs = unixfs::Data {
607            r#type: unixfs::DataType::File as i32,
608            data: Some(chunk1.to_vec()),
609            filesize: Some(chunk1.len() as u64),
610            blocksizes: vec![],
611            hash_type: None,
612            fanout: None,
613        };
614        let mut leaf1_unixfs_bytes = Vec::new();
615        leaf1_unixfs.encode(&mut leaf1_unixfs_bytes).unwrap();
616        let leaf1_node = merkledag::PbNode {
617            links: vec![],
618            data: Some(leaf1_unixfs_bytes),
619        };
620        let leaf1_bytes = encode_pbnode_canonical(&leaf1_node);
621        let leaf1_digest = sha2::Sha256::digest(&leaf1_bytes);
622        let mut leaf1_mh = vec![0x12, 0x20];
623        leaf1_mh.extend_from_slice(&leaf1_digest);
624
625        // Build leaf 2
626        let chunk2 = &data[chunk_size..];
627        let leaf2_unixfs = unixfs::Data {
628            r#type: unixfs::DataType::File as i32,
629            data: Some(chunk2.to_vec()),
630            filesize: Some(chunk2.len() as u64),
631            blocksizes: vec![],
632            hash_type: None,
633            fanout: None,
634        };
635        let mut leaf2_unixfs_bytes = Vec::new();
636        leaf2_unixfs.encode(&mut leaf2_unixfs_bytes).unwrap();
637        let leaf2_node = merkledag::PbNode {
638            links: vec![],
639            data: Some(leaf2_unixfs_bytes),
640        };
641        let leaf2_bytes = encode_pbnode_canonical(&leaf2_node);
642        let leaf2_digest = sha2::Sha256::digest(&leaf2_bytes);
643        let mut leaf2_mh = vec![0x12, 0x20];
644        leaf2_mh.extend_from_slice(&leaf2_digest);
645
646        // Build root node
647        let root_unixfs = unixfs::Data {
648            r#type: unixfs::DataType::File as i32,
649            data: None,
650            filesize: Some(data.len() as u64),
651            blocksizes: vec![chunk1.len() as u64, chunk2.len() as u64],
652            hash_type: None,
653            fanout: None,
654        };
655        let mut root_unixfs_bytes = Vec::new();
656        root_unixfs.encode(&mut root_unixfs_bytes).unwrap();
657        let root_node = merkledag::PbNode {
658            links: vec![
659                merkledag::PbLink {
660                    hash: Some(leaf1_mh),
661                    name: Some(String::new()),
662                    tsize: Some(leaf1_bytes.len() as u64),
663                },
664                merkledag::PbLink {
665                    hash: Some(leaf2_mh),
666                    name: Some(String::new()),
667                    tsize: Some(leaf2_bytes.len() as u64),
668                },
669            ],
670            data: Some(root_unixfs_bytes),
671        };
672        let root_bytes = encode_pbnode_canonical(&root_node);
673        let root_digest = sha2::Sha256::digest(&root_bytes);
674        let mut root_multihash = vec![0x12, 0x20];
675        root_multihash.extend_from_slice(&root_digest);
676        let expected_cidv0 = bs58::encode(&root_multihash).into_string();
677
678        let expected = Cid::try_from(expected_cidv0.as_str()).unwrap();
679        assert_cid_roundtrip(&expected, &[&data]);
680    }
681
682    #[test]
683    fn test_verify_cidv0_multi_chunk_streamed() {
684        // Multi-chunk data fed in small increments must produce the same CID
685        // as a single update() call.
686        let chunk_size = 262144;
687        let data = vec![0xCDu8; chunk_size * 2 + 500]; // 2 full chunks + partial
688
689        let expected = compute_cid(&data);
690
691        let mut hasher = Hasher::for_ipfs();
692        let mut offset = 0;
693        let step = 1000;
694        while offset < data.len() {
695            let end = (offset + step).min(data.len());
696            hasher.update(&data[offset..end]);
697            offset = end;
698        }
699        assert_eq!(hasher.finalize(), expected);
700    }
701
702    #[test]
703    fn test_verify_cidv0_exact_chunk_boundary() {
704        // File is exactly one chunk (262144 bytes)
705        let data = vec![0x42u8; CHUNK_SIZE];
706
707        let unixfs_data = unixfs::Data {
708            r#type: unixfs::DataType::File as i32,
709            data: Some(data.clone()),
710            filesize: Some(data.len() as u64),
711            blocksizes: vec![],
712            hash_type: None,
713            fanout: None,
714        };
715        let mut unixfs_bytes = Vec::new();
716        unixfs_data.encode(&mut unixfs_bytes).unwrap();
717
718        let node = merkledag::PbNode {
719            links: vec![],
720            data: Some(unixfs_bytes),
721        };
722        let node_bytes = encode_pbnode_canonical(&node);
723
724        let digest = sha2::Sha256::digest(&node_bytes);
725        let mut multihash_bytes = vec![0x12, 0x20];
726        multihash_bytes.extend_from_slice(&digest);
727        let expected_cidv0 = bs58::encode(&multihash_bytes).into_string();
728
729        let expected = Cid::try_from(expected_cidv0.as_str()).unwrap();
730        assert_cid_roundtrip(&expected, &[&data]);
731    }
732
733    #[test]
734    fn test_verify_cidv0_empty_file() {
735        // Known IPFS empty file CID (produced by `echo -n '' | ipfs add`)
736        let expected_cid = "QmbFMke1KXqnYyBBWxB74N4c5SBnJMVAiMNRcGu6x1AwQH";
737        let expected = Cid::try_from(expected_cid).unwrap();
738
739        // No update() calls — empty file
740        assert_cid_roundtrip(&expected, &[]);
741    }
742
743    #[test]
744    fn test_verify_cidv0_multi_level_dag() {
745        // "deadbeef" repeated 16Mi times (128 MiB total) — CID obtained from IPFS directly
746        let expected_cid = "QmcYKke22MG2rnu4nPVj8Z3hMPi2wtVMKzqLcJwYRThYif";
747        let data = "deadbeef".repeat(16 * 1024 * 1024);
748
749        let expected = Cid::try_from(expected_cid).unwrap();
750        assert_cid_roundtrip(&expected, &[data.as_bytes()]);
751    }
752
753    #[test]
754    fn test_verify_cidv1_dagpb_multi_level_dag() {
755        // Same data as the CIDv0 test, CIDv1 dag-pb CID obtained from IPFS directly
756        let expected_cid = "bafybeiawhayvhrtunmsazigmne75kqyyb2z7oqlvky3abpk4tbkqyzv6iu";
757        let data = "deadbeef".repeat(16 * 1024 * 1024);
758
759        let expected = Cid::try_from(expected_cid).unwrap();
760        assert_cid_roundtrip(&expected, &[data.as_bytes()]);
761    }
762
763    #[test]
764    fn test_compute_cid_small_file() {
765        let data = b"hello dag-pb world";
766        let cid = compute_cid(data);
767        assert_cid_roundtrip(&cid, &[data]);
768    }
769
770    #[test]
771    fn test_compute_cid_large_file() {
772        let data = vec![0xABu8; 262144 + 100];
773        let cid = compute_cid(&data);
774        assert_cid_roundtrip(&cid, &[&data]);
775    }
776
777    #[test]
778    fn test_hasher_for_ipfs() {
779        let data = b"hello dag-pb world";
780        let mut hasher = Hasher::for_ipfs();
781        hasher.update(data);
782        assert_eq!(hasher.finalize(), compute_cid(data));
783    }
784
785    #[test]
786    fn test_hasher_for_ipfs_large() {
787        let data = vec![0xABu8; CHUNK_SIZE + 100];
788        let mut hasher = Hasher::for_ipfs();
789        hasher.update(&data);
790        assert_eq!(hasher.finalize(), compute_cid(&data));
791    }
792
793    #[test]
794    fn for_ipfs_v1_raw_leaves_hashes_short_input() {
795        let mut h = Hasher::for_ipfs_v1_raw_leaves();
796        h.update(b"hello\n");
797        let cid = h.finalize();
798        // CIDv1 raw for "hello\n" (6 bytes): SHA-256("hello\n") = 5891b5b5...
799        // encoded as [0x01, 0x55, 0x12, 0x20, ...digest...] in base32lower.
800        // Verified: printf 'hello\n' | sha256sum gives 5891b5b5...
801        assert_eq!(
802            cid.to_string(),
803            "bafkreicysg23kiwv34eg2d7qweipxwosdo2py4ldv42nbauguluen5v6am"
804        );
805    }
806}