Skip to main content

openjd_snapshots/
hash.rs

1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// Copyright by contributors to this project.
3// SPDX-License-Identifier: (Apache-2.0 OR MIT)
4
5use serde::{Deserialize, Serialize};
6use std::fs::File;
7use std::io::Read;
8use std::path::Path;
9use xxhash_rust::xxh3::{xxh3_128, Xxh3Default};
10
/// Default chunk size: 256 MiB, expressed in bytes.
///
/// Signed so that it shares a type with the [`WHOLE_FILE_CHUNK_SIZE`] sentinel.
pub const DEFAULT_FILE_CHUNK_SIZE: i64 = 256 << 20;

/// Negative sentinel chunk size (`-1`).
///
/// NOTE(review): by its name this requests "one chunk covering the whole
/// file"; it is not a valid argument to [`hash_file_chunked`] — use
/// [`hash_file`] for whole-file hashing.
pub const WHOLE_FILE_CHUNK_SIZE: i64 = -1;

/// 32 MiB, expressed in bytes.
///
/// NOTE(review): presumably the default part size for S3 multipart uploads;
/// no code in this file consumes it — confirm against callers.
pub const DEFAULT_S3_MULTIPART_PART_SIZE: usize = 32 << 20;
14
15#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
16pub enum HashAlgorithm {
17    #[serde(rename = "xxh128")]
18    Xxh128,
19}
20
21impl HashAlgorithm {
22    pub fn extension(&self) -> &'static str {
23        match self {
24            Self::Xxh128 => "xxh128",
25        }
26    }
27}
28
29impl std::fmt::Display for HashAlgorithm {
30    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
31        f.write_str(self.extension())
32    }
33}
34
35/// Computes xxh128 hash of data, returns lowercase hex string.
36pub fn hash_data(data: &[u8]) -> String {
37    format!("{:032x}", xxh3_128(data))
38}
39
40/// Reads file in streaming fashion and computes xxh128 hash.
41pub fn hash_file(path: &Path) -> std::io::Result<String> {
42    let mut file = File::open(path)?;
43    let mut hasher = Xxh3Default::new();
44    let mut buf = [0u8; 64 * 1024];
45    loop {
46        let n = file.read(&mut buf)?;
47        if n == 0 {
48            break;
49        }
50        hasher.update(&buf[..n]);
51    }
52    Ok(format!("{:032x}", hasher.digest128()))
53}
54
55/// Hashes file in chunks, returns vec of hex hash strings.
56///
57/// `chunk_size` must be strictly positive. Pass a positive value in bytes.
58/// `WHOLE_FILE_CHUNK_SIZE` is not a valid argument — callers should use
59/// [`hash_file`] for whole-file hashing.
60///
61/// `expected_size` is the file size the caller expects on disk (typically
62/// from the manifest entry). If the actual file size differs, this function
63/// returns an `InvalidData` error — content-addressed hashing requires the
64/// file on disk to match what the manifest claims.
65///
66/// Uses `read_exact` to ensure chunk boundaries are determined by `chunk_size`,
67/// not by how many bytes a single `read()` call returns.
68pub fn hash_file_chunked(
69    path: &Path,
70    chunk_size: u64,
71    expected_size: u64,
72) -> std::io::Result<Vec<String>> {
73    if chunk_size == 0 {
74        return Err(std::io::Error::new(
75            std::io::ErrorKind::InvalidInput,
76            "hash_file_chunked requires chunk_size > 0",
77        ));
78    }
79    let file = File::open(path)?;
80    let actual_size = file.metadata()?.len();
81    if actual_size != expected_size {
82        return Err(std::io::Error::new(
83            std::io::ErrorKind::InvalidData,
84            format!(
85                "file size mismatch for {}: expected {expected_size}, found {actual_size}",
86                path.display()
87            ),
88        ));
89    }
90
91    let mut file = file;
92    let full_chunks = actual_size / chunk_size;
93    let remainder_len = (actual_size % chunk_size) as usize;
94    let mut hashes = Vec::with_capacity(full_chunks as usize + 1);
95    let mut buf = vec![0u8; chunk_size as usize];
96
97    for _ in 0..full_chunks {
98        file.read_exact(&mut buf)?;
99        hashes.push(hash_data(&buf));
100    }
101    if remainder_len > 0 {
102        buf.truncate(remainder_len);
103        file.read_exact(&mut buf)?;
104        hashes.push(hash_data(&buf));
105    }
106    if hashes.is_empty() {
107        hashes.push(hash_data(&[]));
108    }
109    Ok(hashes)
110}
111
/// Renders a byte count as a short human-readable string such as `"1.5 MB"`.
///
/// Units are decimal (each step is a factor of 1000) and run from `B` to
/// `EB`. Values are rounded to at most two decimal places, trailing zeros are
/// not printed (`"1 KB"`, not `"1.00 KB"`), and plain byte counts are printed
/// as integers.
pub fn human_readable_file_size(bytes: u64) -> String {
    const UNITS: [&str; 7] = ["B", "KB", "MB", "GB", "TB", "PB", "EB"];
    let round2 = |value: f64| (value * 100.0).round() / 100.0;

    let mut scaled = bytes as f64;
    for (idx, label) in UNITS.iter().enumerate() {
        let shown = round2(scaled);
        // Compare the *rounded* value so 999.999 KB becomes "1 MB", never
        // "1000 KB".
        if shown >= 1000.0 {
            scaled /= 1000.0;
            continue;
        }
        return if idx == 0 {
            format!("{} {}", shown as u64, label)
        } else {
            format!("{shown} {label}")
        };
    }
    // Fallback kept from the original; u64::MAX is about 18.45 EB, so the
    // loop above always returns first.
    format!("{} EB", round2(scaled))
}
127
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes `bytes` to `name` inside a fresh temp dir. The returned guard
    /// must be kept alive for as long as the path is used.
    fn temp_file_with(name: &str, bytes: &[u8]) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join(name);
        std::fs::write(&path, bytes).unwrap();
        (dir, path)
    }

    /// Non-repeating-per-chunk test data: the period (251) is prime, so
    /// power-of-two sized chunks all have different contents and a chunk
    /// ordering/offset bug changes the resulting hashes.
    fn patterned(len: usize) -> Vec<u8> {
        (0..len).map(|i| (i % 251) as u8).collect()
    }

    #[test]
    fn hash_known_data() {
        let h = hash_data(b"hello world");
        assert_eq!(h.len(), 32);
        // Deterministic — same input always produces same hash
        assert_eq!(h, hash_data(b"hello world"));
        // Different input produces different hash
        assert_ne!(h, hash_data(b"goodbye"));
    }

    #[test]
    fn hash_empty_data() {
        let h = hash_data(b"");
        assert_eq!(h.len(), 32);
        // Output is documented as lowercase hex.
        assert!(h
            .chars()
            .all(|c| c.is_ascii_hexdigit() && !c.is_ascii_uppercase()));
    }

    #[test]
    fn hash_temp_file() {
        let (_dir, p) = temp_file_with("test.txt", b"file content");
        let h = hash_file(&p).unwrap();
        assert_eq!(h, hash_data(b"file content"));
    }

    #[test]
    fn hash_file_spans_multiple_read_buffers() {
        // Larger than the 64 KiB streaming buffer, and not a multiple of it,
        // so the read loop runs several times with a short final read.
        let data = patterned(200_000);
        let (_dir, p) = temp_file_with("large.bin", &data);
        assert_eq!(hash_file(&p).unwrap(), hash_data(&data));
    }

    #[test]
    fn hash_file_missing_path_is_error() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("does-not-exist");
        let err = hash_file(&p).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::NotFound);
    }

    #[test]
    fn hash_chunked_file() {
        // 10 distinct bytes, chunk size 4 => 3 chunks (4+4+2)
        let data = patterned(10);
        let (_dir, p) = temp_file_with("chunked.bin", &data);
        let hashes = hash_file_chunked(&p, 4, 10).unwrap();
        assert_eq!(hashes.len(), 3);
        assert_eq!(hashes[0], hash_data(&data[0..4]));
        assert_eq!(hashes[1], hash_data(&data[4..8]));
        assert_eq!(hashes[2], hash_data(&data[8..10]));
    }

    #[test]
    fn hash_chunked_exact_multiple_has_no_trailing_chunk() {
        let data = patterned(8);
        let (_dir, p) = temp_file_with("exact.bin", &data);
        let hashes = hash_file_chunked(&p, 4, 8).unwrap();
        assert_eq!(
            hashes,
            vec![hash_data(&data[0..4]), hash_data(&data[4..8])]
        );
    }

    #[test]
    fn hash_chunked_chunk_larger_than_file() {
        let data = patterned(10);
        let (_dir, p) = temp_file_with("small.bin", &data);
        let hashes = hash_file_chunked(&p, 16, 10).unwrap();
        assert_eq!(hashes, vec![hash_data(&data)]);
    }

    #[test]
    fn hash_chunked_file_is_deterministic() {
        // Chunk hashing must be deterministic — two hashings of the same file
        // must produce identical chunk-hash vectors.
        let chunk_size: u64 = 1024;
        let data = patterned(3 * chunk_size as usize);
        let (_dir, p) = temp_file_with("testfile", &data);

        let h1 = hash_file_chunked(&p, chunk_size, data.len() as u64).unwrap();
        let h2 = hash_file_chunked(&p, chunk_size, data.len() as u64).unwrap();
        assert_eq!(h1.len(), 3);
        assert_eq!(h1, h2);

        // And each entry is the hash of the corresponding slice, in order.
        let expected: Vec<String> = data
            .chunks(chunk_size as usize)
            .map(hash_data)
            .collect();
        assert_eq!(h1, expected);
    }

    #[test]
    fn hash_chunked_empty_file() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("empty.bin");
        File::create(&p).unwrap();
        let hashes = hash_file_chunked(&p, 4, 0).unwrap();
        assert_eq!(hashes.len(), 1);
        assert_eq!(hashes[0], hash_data(b""));
    }

    #[test]
    fn hash_chunked_rejects_zero_chunk_size() {
        let (_dir, p) = temp_file_with("f.bin", b"data");
        let err = hash_file_chunked(&p, 0, 4).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput);
        assert!(err.to_string().contains("chunk_size > 0"));
    }

    #[test]
    fn hash_chunked_size_mismatch_longer_on_disk() {
        // Manifest says 5 bytes but file is 10 bytes. Must error.
        let (_dir, p) = temp_file_with("f.bin", &[0u8; 10]);
        let err = hash_file_chunked(&p, 4, 5).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("size"), "{err}");
    }

    #[test]
    fn hash_chunked_size_mismatch_shorter_on_disk() {
        // Manifest says 10 bytes but file is 5 bytes. Must error.
        let (_dir, p) = temp_file_with("f.bin", &[0u8; 5]);
        let err = hash_file_chunked(&p, 4, 10).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("size"), "{err}");
    }

    #[test]
    fn hash_algorithm_serde() {
        let json = serde_json::to_string(&HashAlgorithm::Xxh128).unwrap();
        assert_eq!(json, "\"xxh128\"");
        let parsed: HashAlgorithm = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed, HashAlgorithm::Xxh128);
    }

    #[test]
    fn hash_algorithm_extension() {
        assert_eq!(HashAlgorithm::Xxh128.extension(), "xxh128");
    }

    #[test]
    fn hash_algorithm_display_matches_extension() {
        assert_eq!(HashAlgorithm::Xxh128.to_string(), "xxh128");
    }

    #[test]
    fn human_readable_bytes() {
        assert_eq!(human_readable_file_size(0), "0 B");
        assert_eq!(human_readable_file_size(1), "1 B");
        assert_eq!(human_readable_file_size(999), "999 B");
    }

    #[test]
    fn human_readable_kilobytes() {
        assert_eq!(human_readable_file_size(1_000), "1 KB");
        assert_eq!(human_readable_file_size(1_500), "1.5 KB");
    }

    #[test]
    fn human_readable_megabytes() {
        assert_eq!(human_readable_file_size(1_000_000), "1 MB");
        assert_eq!(human_readable_file_size(256 * 1024 * 1024), "268.44 MB");
    }

    #[test]
    fn human_readable_rounds_up_into_next_unit() {
        // 999.999 KB rounds to 1000.00 KB, which must be shown as 1 MB.
        assert_eq!(human_readable_file_size(999_999), "1 MB");
    }

    #[test]
    fn human_readable_gigabytes() {
        assert_eq!(human_readable_file_size(1_000_000_000), "1 GB");
    }

    #[test]
    fn human_readable_terabytes() {
        assert_eq!(human_readable_file_size(1_000_000_000_000), "1 TB");
    }

    #[test]
    fn human_readable_petabytes() {
        assert_eq!(human_readable_file_size(1_000_000_000_000_000), "1 PB");
    }

    #[test]
    fn human_readable_exabytes() {
        assert_eq!(human_readable_file_size(1_000_000_000_000_000_000), "1 EB");
        assert_eq!(human_readable_file_size(u64::MAX), "18.45 EB");
    }
}