Skip to main content

arloader/
merkle.rs

1//! Functionality for chunking file data and calculating and verifying root ids.
2
3use crate::{crypto::Provider, error::Error};
4use borsh::BorshDeserialize;
5
6/// Single struct used for original data chunks (Leaves) and branch nodes (hashes of pairs of child nodes).
7#[derive(Debug, PartialEq, Clone)]
8pub struct Node {
9    pub id: [u8; HASH_SIZE],
10    pub data_hash: Option<[u8; HASH_SIZE]>,
11    pub min_byte_range: usize,
12    pub max_byte_range: usize,
13    pub left_child: Option<Box<Node>>,
14    pub right_child: Option<Box<Node>>,
15}
16
17/// Concatenated ids and offsets for full set of nodes for an original data chunk, starting with the root.
18#[derive(Debug, PartialEq, Clone)]
19pub struct Proof {
20    pub offset: usize,
21    pub proof: Vec<u8>,
22}
23
24/// Populated with data from deserialized [`Proof`] for original data chunk (Leaf [`Node`]).
25#[repr(C)]
26#[derive(BorshDeserialize, Debug, PartialEq, Clone)]
27pub struct LeafProof {
28    data_hash: [u8; HASH_SIZE],
29    notepad: [u8; NOTE_SIZE - 8],
30    offset: [u8; 8],
31}
32
33/// Populated with data from deserialized [`Proof`] for branch [`Node`] (hash of pair of child nodes).
34#[derive(BorshDeserialize, Debug, PartialEq, Clone)]
35pub struct BranchProof {
36    left_id: [u8; HASH_SIZE],
37    right_id: [u8; HASH_SIZE],
38    notepad: [u8; NOTE_SIZE - 8],
39    offset: [u8; 8],
40}
41
42/// Includes methods to deserialize [`Proof`]s.
43pub trait ProofDeserialize<T> {
44    fn try_from_proof_slice(slice: &[u8]) -> Result<T, Error>;
45    fn offset(&self) -> usize;
46}
47
48impl ProofDeserialize<LeafProof> for LeafProof {
49    fn try_from_proof_slice(slice: &[u8]) -> Result<Self, Error> {
50        let proof = LeafProof::try_from_slice(slice)?;
51        Ok(proof)
52    }
53    fn offset(&self) -> usize {
54        usize::from_be_bytes(self.offset)
55    }
56}
57
58impl ProofDeserialize<BranchProof> for BranchProof {
59    fn try_from_proof_slice(slice: &[u8]) -> Result<Self, Error> {
60        let proof = BranchProof::try_from_slice(slice)?;
61        Ok(proof)
62    }
63    fn offset(&self) -> usize {
64        usize::from_be_bytes(self.offset)
65    }
66}
67
68pub const MAX_CHUNK_SIZE: usize = 256 * 1024;
69pub const MIN_CHUNK_SIZE: usize = 32 * 1024;
70pub const HASH_SIZE: usize = 32;
71const NOTE_SIZE: usize = 32;
72
73/// Includes a function to convert a number to a Vec of 32 bytes per the Arweave spec.
74pub trait Helpers<T> {
75    fn to_note_vec(&self) -> Vec<u8>;
76}
77
78impl Helpers<usize> for usize {
79    fn to_note_vec(&self) -> Vec<u8> {
80        let mut note = vec![0; NOTE_SIZE - 8];
81        note.extend((*self as u64).to_be_bytes());
82        note
83    }
84}
85
86/// Generates data chunks from which the calculation of root id starts.
87pub fn generate_leaves(data: Vec<u8>, crypto: &Provider) -> Result<Vec<Node>, Error> {
88    let mut data_chunks: Vec<&[u8]> = data.chunks(MAX_CHUNK_SIZE).collect();
89
90    #[allow(unused_assignments)]
91    let mut last_two = Vec::new();
92
93    if data_chunks.len() > 1 && data_chunks.last().unwrap().len() < MIN_CHUNK_SIZE {
94        last_two = data_chunks.split_off(data_chunks.len() - 2).concat();
95        let chunk_size = last_two.len() / 2 + (last_two.len() % 2 != 0) as usize;
96        data_chunks.append(&mut last_two.chunks(chunk_size).collect::<Vec<&[u8]>>());
97    }
98
99    if data_chunks.last().unwrap().len() == MAX_CHUNK_SIZE {
100        data_chunks.push(&[]);
101    }
102
103    let mut leaves = Vec::<Node>::new();
104    let mut min_byte_range = 0;
105    for chunk in data_chunks.into_iter() {
106        let data_hash = crypto.hash_sha256(chunk)?;
107        let max_byte_range = min_byte_range + &chunk.len();
108        let offset = max_byte_range.to_note_vec();
109        let id = crypto.hash_all_sha256(vec![&data_hash, &offset])?;
110
111        leaves.push(Node {
112            id,
113            data_hash: Some(data_hash),
114            min_byte_range,
115            max_byte_range,
116            left_child: None,
117            right_child: None,
118        });
119        min_byte_range = min_byte_range + &chunk.len();
120    }
121    Ok(leaves)
122}
123
124/// Hashes together a single branch node from a pair of child nodes.
125pub fn hash_branch(left: Node, right: Node, crypto: &Provider) -> Result<Node, Error> {
126    let max_byte_range = left.max_byte_range.to_note_vec();
127    let id = crypto.hash_all_sha256(vec![&left.id, &right.id, &max_byte_range])?;
128    Ok(Node {
129        id,
130        data_hash: None,
131        min_byte_range: left.max_byte_range,
132        max_byte_range: right.max_byte_range,
133        left_child: Some(Box::new(left)),
134        right_child: Some(Box::new(right)),
135    })
136}
137
138/// Builds one layer of branch nodes from a layer of child nodes.
139pub fn build_layer<'a>(nodes: Vec<Node>, crypto: &Provider) -> Result<Vec<Node>, Error> {
140    let mut layer = Vec::<Node>::with_capacity(nodes.len() / 2 + (nodes.len() % 2 != 0) as usize);
141    let mut nodes_iter = nodes.into_iter();
142    while let Some(left) = nodes_iter.next() {
143        if let Some(right) = nodes_iter.next() {
144            layer.push(hash_branch(left, right, &crypto).unwrap());
145        } else {
146            layer.push(left);
147        }
148    }
149    Ok(layer)
150}
151
152/// Builds all layers from leaves up to single root node.
153pub fn generate_data_root(mut nodes: Vec<Node>, crypto: &Provider) -> Result<Node, Error> {
154    while nodes.len() > 1 {
155        nodes = build_layer(nodes, &crypto)?;
156    }
157    let root = nodes.pop().unwrap();
158    Ok(root)
159}
160
161/// Calculates [`Proof`] for each data chunk contained in root [`Node`].
162pub fn resolve_proofs(node: Node, proof: Option<Proof>) -> Result<Vec<Proof>, Error> {
163    let mut proof = if let Some(proof) = proof {
164        proof
165    } else {
166        Proof {
167            offset: 0,
168            proof: Vec::new(),
169        }
170    };
171    match node {
172        // Leaf
173        Node {
174            data_hash: Some(data_hash),
175            max_byte_range,
176            left_child: None,
177            right_child: None,
178            ..
179        } => {
180            proof.offset = max_byte_range - 1;
181            proof.proof.extend(data_hash);
182            proof.proof.extend(max_byte_range.to_note_vec());
183            return Ok(vec![proof]);
184        }
185        // Branch
186        Node {
187            data_hash: None,
188            min_byte_range,
189            left_child: Some(left_child),
190            right_child: Some(right_child),
191            ..
192        } => {
193            proof.proof.extend(left_child.id.clone());
194            proof.proof.extend(right_child.id.clone());
195            proof.proof.extend(min_byte_range.to_note_vec());
196
197            let mut left_proof = resolve_proofs(*left_child, Some(proof.clone()))?;
198            let right_proof = resolve_proofs(*right_child, Some(proof))?;
199            left_proof.extend(right_proof);
200            return Ok(left_proof);
201        }
202        _ => unreachable!(),
203    }
204}
205
206/// Validates chunk of data against provided [`Proof`].
207pub fn validate_chunk(
208    mut root_id: [u8; HASH_SIZE],
209    chunk: Node,
210    proof: Proof,
211    crypto: &Provider,
212) -> Result<(), Error> {
213    match chunk {
214        Node {
215            data_hash: Some(data_hash),
216            max_byte_range,
217            ..
218        } => {
219            // Split proof into branches and leaf. Leaf is at the end and branches are ordered
220            // from root to leaf.
221            let (branches, leaf) = proof
222                .proof
223                .split_at(proof.proof.len() - HASH_SIZE - NOTE_SIZE);
224
225            // Deserialize proof.
226            let branch_proofs: Vec<BranchProof> = branches
227                .chunks(HASH_SIZE * 2 + NOTE_SIZE)
228                .map(|b| BranchProof::try_from_proof_slice(b).unwrap())
229                .collect();
230            let leaf_proof = LeafProof::try_from_proof_slice(leaf)?;
231
232            // Validate branches.
233            for branch_proof in branch_proofs.iter() {
234                // Calculate the id from the proof.
235                let id = crypto.hash_all_sha256(vec![
236                    &branch_proof.left_id,
237                    &branch_proof.right_id,
238                    &branch_proof.offset().to_note_vec(),
239                ])?;
240
241                // Ensure calculated id correct.
242                if !(id == root_id) {
243                    return Err(Error::InvalidProof.into());
244                }
245
246                // If the offset from the proof is greater than the offset in the data chunk,
247                // then the next id to validate against is from the left.
248                root_id = match max_byte_range > branch_proof.offset() {
249                    true => branch_proof.right_id,
250                    false => branch_proof.left_id,
251                }
252            }
253
254            // Validate leaf: both id and data_hash are correct.
255            let id = crypto.hash_all_sha256(vec![&data_hash, &max_byte_range.to_note_vec()])?;
256            if !(id == root_id) & !(data_hash == leaf_proof.data_hash) {
257                return Err(Error::InvalidProof.into());
258            }
259        }
260        _ => {
261            unreachable!()
262        }
263    }
264    Ok(())
265}
266
267#[cfg(test)]
268mod tests {
269    use super::*;
270    use crate::transaction::Base64;
271    use std::{path::PathBuf, str::FromStr};
272    use tokio::fs;
273
274    #[tokio::test]
275    async fn test_generate_leaves() -> Result<(), Error> {
276        let crypto = Provider::from_keypair_path(PathBuf::from(
277            "tests/fixtures/arweave-key-7eV1qae4qVNqsNChg3Scdi-DpOLJPCogct4ixoq1WNg.json",
278        ))
279        .await?;
280        let data = fs::read("tests/fixtures/1mb.bin").await?;
281        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
282        assert_eq!(
283            leaves[1],
284            Node {
285                id: [
286                    116, 162, 15, 141, 57, 10, 17, 205, 78, 2, 213, 56, 154, 61, 223, 174, 73, 226,
287                    192, 82, 70, 39, 237, 145, 89, 66, 199, 123, 31, 23, 88, 38
288                ],
289                data_hash: Some([
290                    49, 180, 221, 222, 226, 186, 75, 140, 193, 105, 70, 238, 149, 178, 153, 32,
291                    144, 208, 63, 136, 223, 103, 186, 4, 109, 24, 64, 127, 20, 38, 98, 56
292                ]),
293                min_byte_range: 262144,
294                max_byte_range: 524288,
295                left_child: None,
296                right_child: None
297            }
298        );
299        Ok(())
300    }
301
302    #[tokio::test]
303    async fn test_hash_branch() -> Result<(), Error> {
304        let crypto = Provider::from_keypair_path(PathBuf::from(
305            "tests/fixtures/arweave-key-7eV1qae4qVNqsNChg3Scdi-DpOLJPCogct4ixoq1WNg.json",
306        ))
307        .await?;
308
309        let data = fs::read("tests/fixtures/1mb.bin").await?;
310        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
311        let mut nodes_iter = leaves.into_iter();
312        let left = nodes_iter.next().unwrap();
313        let right = nodes_iter.next().unwrap();
314        let left_clone = left.clone();
315        let right_clone = right.clone();
316
317        let branch = hash_branch(left, right, &crypto)?;
318        assert_eq!(
319            branch,
320            Node {
321                id: [
322                    50, 116, 51, 211, 72, 86, 49, 84, 45, 220, 75, 153, 44, 133, 213, 88, 58, 246,
323                    8, 202, 100, 249, 227, 0, 10, 177, 116, 187, 113, 95, 41, 10,
324                ],
325                data_hash: None,
326                min_byte_range: 262144,
327                max_byte_range: 524288,
328                left_child: Some(Box::new(left_clone)),
329                right_child: Some(Box::new(right_clone))
330            }
331        );
332        Ok(())
333    }
334    #[tokio::test]
335    async fn test_build_layer() -> Result<(), Error> {
336        let crypto = Provider::from_keypair_path(PathBuf::from(
337            "tests/fixtures/arweave-key-7eV1qae4qVNqsNChg3Scdi-DpOLJPCogct4ixoq1WNg.json",
338        ))
339        .await?;
340        let data = fs::read("tests/fixtures/1mb.bin").await?;
341        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
342        let layer = build_layer(leaves, &crypto)?;
343        assert_eq!(
344            layer[0].id,
345            [
346                50, 116, 51, 211, 72, 86, 49, 84, 45, 220, 75, 153, 44, 133, 213, 88, 58, 246, 8,
347                202, 100, 249, 227, 0, 10, 177, 116, 187, 113, 95, 41, 10,
348            ]
349        );
350        assert_eq!(layer[0].min_byte_range, 262144);
351        assert_eq!(layer[0].max_byte_range, 524288);
352        Ok(())
353    }
354
355    #[tokio::test]
356    async fn test_generate_data_root_even_chunks() -> Result<(), Error> {
357        let crypto = Provider::default();
358        let data = fs::read("tests/fixtures/1mb.bin").await?;
359        // root id as calculate by arweave-js
360        let root_actual = Base64::from_str("o1tTTjbC7hIZN6KbUUYjlkQoDl2k8VXNuBDcGIs52Hc")?;
361        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
362        let root = generate_data_root(leaves, &crypto)?;
363        assert_eq!(root.id, root_actual.0.as_ref());
364        Ok(())
365    }
366
367    #[tokio::test]
368    async fn test_generate_proof() -> Result<(), Error> {
369        let crypto = Provider::default();
370        let proof_actual = Base64::from_str("7EAC9FsACQRwe4oIzu7Mza9KjgWKT4toYxDYGjWrCdp0QgsrYS6AueMJ_rM6ZEGslGqjUekzD3WSe7B5_fwipgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAnH6dASdQCigcL43lp0QclqBaSncF4TspuvxoFbn2L18EXpQrP1wkbwdIjSSWQQRt_F31yNvxtc09KkPFtzMKAwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAIHiHU9QwOImFzjqSlfxkJJCtSbAox6TbbFhQvlEapSgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAA")?;
371        let data = fs::read("tests/fixtures/rebar3").await?;
372        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
373        let root = generate_data_root(leaves, &crypto)?;
374
375        let proofs = resolve_proofs(root, None)?;
376        assert_eq!(
377            proofs[0],
378            Proof {
379                offset: 262143,
380                proof: proof_actual.0,
381            },
382        );
383        Ok(())
384    }
385    #[tokio::test]
386    async fn test_validate_chunks() -> Result<(), Error> {
387        let crypto = Provider::default();
388        let data = fs::read("tests/fixtures/1mb.bin").await?;
389        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
390        let root = generate_data_root(leaves.clone(), &crypto)?;
391        let root_id = root.id.clone();
392        let proofs = resolve_proofs(root, None)?;
393        println!("proofs_len: {}", proofs.len());
394        assert_eq!(leaves.len(), proofs.len());
395
396        for (chunk, proof) in leaves.into_iter().zip(proofs.into_iter()) {
397            assert_eq!((), validate_chunk(root_id.clone(), chunk, proof, &crypto)?);
398        }
399        Ok(())
400    }
401
402    #[tokio::test]
403    async fn test_valid_root() -> Result<(), Error> {
404        let crypto = Provider::default();
405        let data_root_actual = Base64::from_str("t-GCOnjPWxdox950JsrFMu3nzOE4RktXpMcIlkqSUTw")?;
406        let data = fs::read("tests/fixtures/rebar3").await?;
407        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
408        let root = generate_data_root(leaves.clone(), &crypto)?;
409        assert_eq!(root.id.to_vec(), data_root_actual.0);
410        Ok(())
411    }
412
413    #[tokio::test]
414    async fn test_valid_root_even_chunks() -> Result<(), Error> {
415        let crypto = Provider::default();
416        let data = fs::read("tests/fixtures/1mb.bin").await?;
417        // root id as calculate by arweave-js
418        let root_actual = Base64::from_str("o1tTTjbC7hIZN6KbUUYjlkQoDl2k8VXNuBDcGIs52Hc")?;
419        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
420        let root = generate_data_root(leaves, &crypto)?;
421        assert_eq!(root.id, root_actual.0.as_ref());
422        Ok(())
423    }
424
425    #[test]
426    fn test_valid_root_small_last_chunk() -> Result<(), Error> {
427        let crypto = Provider::default();
428        let data = vec![0; 256 * 1024 + 1];
429        // root id as calculate by arweave-js
430        let root_actual = Base64::from_str("br1Vtl3TS_NGWdHmYqBh3-MxrlckoluHCZGmUZk-dJc")?;
431        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
432        let root = generate_data_root(leaves, &crypto)?;
433        println!("{}", Base64(root.id.to_vec()));
434        assert_eq!(root.id, root_actual.0.as_ref());
435        Ok(())
436    }
437
438    #[tokio::test]
439    async fn test_even_chunks() -> Result<(), Error> {
440        let crypto = Provider::default();
441        let data = fs::read("tests/fixtures/1mb.bin").await?;
442        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
443        println!("{:?}", leaves[4]);
444        assert_eq!(leaves.len(), 5);
445        Ok(())
446    }
447
448    #[test]
449    fn test_small_last_chunk() -> Result<(), Error> {
450        let crypto = Provider::default();
451        let data = vec![0; 256 * 1024 + 1];
452        let leaves: Vec<Node> = generate_leaves(data, &crypto)?;
453        assert_eq!(131073, leaves[0].max_byte_range);
454        assert_eq!(131072, leaves[1].max_byte_range - leaves[1].min_byte_range);
455        Ok(())
456    }
457}