1use serde::{Deserialize, Serialize};
6use std::fs::File;
7use std::io::Read;
8use std::path::Path;
9use xxhash_rust::xxh3::{xxh3_128, Xxh3Default};
10
/// Default file chunk size: 256 MiB (268,435,456), presumably in bytes.
///
/// NOTE(review): typed `i64` rather than `u64`, presumably so it can share a
/// field with the [`WHOLE_FILE_CHUNK_SIZE`] sentinel — confirm against callers.
pub const DEFAULT_FILE_CHUNK_SIZE: i64 = 256 << 20;

/// Negative sentinel chunk size.
///
/// NOTE(review): judging by the name, this requests that a file be treated as
/// a single chunk instead of being split — confirm against callers.
pub const WHOLE_FILE_CHUNK_SIZE: i64 = -1;

/// Default S3 multipart part size: 32 MiB (33,554,432), presumably in bytes.
pub const DEFAULT_S3_MULTIPART_PART_SIZE: usize = 32 << 20;
14
15#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
16pub enum HashAlgorithm {
17 #[serde(rename = "xxh128")]
18 Xxh128,
19}
20
21impl HashAlgorithm {
22 pub fn extension(&self) -> &'static str {
23 match self {
24 Self::Xxh128 => "xxh128",
25 }
26 }
27}
28
29impl std::fmt::Display for HashAlgorithm {
30 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
31 f.write_str(self.extension())
32 }
33}
34
35pub fn hash_data(data: &[u8]) -> String {
37 format!("{:032x}", xxh3_128(data))
38}
39
40pub fn hash_file(path: &Path) -> std::io::Result<String> {
42 let mut file = File::open(path)?;
43 let mut hasher = Xxh3Default::new();
44 let mut buf = [0u8; 64 * 1024];
45 loop {
46 let n = file.read(&mut buf)?;
47 if n == 0 {
48 break;
49 }
50 hasher.update(&buf[..n]);
51 }
52 Ok(format!("{:032x}", hasher.digest128()))
53}
54
55pub fn hash_file_chunked(
69 path: &Path,
70 chunk_size: u64,
71 expected_size: u64,
72) -> std::io::Result<Vec<String>> {
73 if chunk_size == 0 {
74 return Err(std::io::Error::new(
75 std::io::ErrorKind::InvalidInput,
76 "hash_file_chunked requires chunk_size > 0",
77 ));
78 }
79 let file = File::open(path)?;
80 let actual_size = file.metadata()?.len();
81 if actual_size != expected_size {
82 return Err(std::io::Error::new(
83 std::io::ErrorKind::InvalidData,
84 format!(
85 "file size mismatch for {}: expected {expected_size}, found {actual_size}",
86 path.display()
87 ),
88 ));
89 }
90
91 let mut file = file;
92 let full_chunks = actual_size / chunk_size;
93 let remainder_len = (actual_size % chunk_size) as usize;
94 let mut hashes = Vec::with_capacity(full_chunks as usize + 1);
95 let mut buf = vec![0u8; chunk_size as usize];
96
97 for _ in 0..full_chunks {
98 file.read_exact(&mut buf)?;
99 hashes.push(hash_data(&buf));
100 }
101 if remainder_len > 0 {
102 buf.truncate(remainder_len);
103 file.read_exact(&mut buf)?;
104 hashes.push(hash_data(&buf));
105 }
106 if hashes.is_empty() {
107 hashes.push(hash_data(&[]));
108 }
109 Ok(hashes)
110}
111
/// Formats a byte count using decimal (power-of-1000) units.
///
/// Values below 1000 are printed as whole bytes (`"999 B"`). Larger values
/// are scaled to the first unit of `KB`, `MB`, `GB`, `TB`, `PB`, `EB` whose
/// value, rounded to two decimal places, is below 1000. Trailing zeros are
/// not printed (`"1 KB"`, `"1.5 KB"`, `"268.44 MB"`).
///
/// Rounding happens before the unit is chosen, so 999,999 bytes is reported
/// as `"1 MB"` rather than `"1000 KB"`.
pub fn human_readable_file_size(bytes: u64) -> String {
    // Units tried, in order, between plain bytes and the final exabyte case.
    const INTERMEDIATE_UNITS: [&str; 5] = ["KB", "MB", "GB", "TB", "PB"];

    if bytes < 1000 {
        return format!("{bytes} B");
    }

    // Precision loss above 2^53 is irrelevant at two decimal places.
    let mut size = bytes as f64;
    for unit in &INTERMEDIATE_UNITS {
        size /= 1000.0;
        let rounded = (size * 100.0).round() / 100.0;
        if rounded < 1000.0 {
            return format!("{rounded} {unit}");
        }
    }

    // `u64::MAX` is about 18.45 EB, so exabytes always suffice as the last unit.
    size /= 1000.0;
    let rounded = (size * 100.0).round() / 100.0;
    format!("{rounded} EB")
}
127
/// Unit tests for the hashing helpers and the human-readable size formatter.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Returns true when `s` is exactly 32 lowercase hexadecimal digits.
    fn is_hex_digest(s: &str) -> bool {
        s.len() == 32 && s.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
    }

    #[test]
    fn hash_known_data() {
        let h = hash_data(b"hello world");
        assert_eq!(h.len(), 32);
        assert!(is_hex_digest(&h), "{h}");
        assert_eq!(h, hash_data(b"hello world"));
        assert_ne!(h, hash_data(b"goodbye"));
    }

    #[test]
    fn hash_empty_data() {
        let h = hash_data(b"");
        assert_eq!(h.len(), 32);
        assert!(is_hex_digest(&h), "{h}");
    }

    #[test]
    fn hash_temp_file() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("test.txt");
        std::fs::write(&p, b"file content").unwrap();
        let h = hash_file(&p).unwrap();
        assert_eq!(h, hash_data(b"file content"));
    }

    #[test]
    fn hash_file_spanning_multiple_reads() {
        // Larger than the 64 KiB read buffer, so the streaming loop runs more than once.
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("large.bin");
        let data: Vec<u8> = (0..200_000u32).map(|i| (i % 251) as u8).collect();
        std::fs::write(&p, &data).unwrap();
        assert_eq!(hash_file(&p).unwrap(), hash_data(&data));
    }

    #[test]
    fn hash_empty_file() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("empty.txt");
        File::create(&p).unwrap();
        assert_eq!(hash_file(&p).unwrap(), hash_data(b""));
    }

    #[test]
    fn hash_chunked_file() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("chunked.bin");
        let mut f = File::create(&p).unwrap();
        f.write_all(&[0u8; 10]).unwrap();
        drop(f);
        let hashes = hash_file_chunked(&p, 4, 10).unwrap();
        assert_eq!(hashes.len(), 3);
        assert_eq!(hashes[0], hash_data(&[0u8; 4]));
        assert_eq!(hashes[1], hash_data(&[0u8; 4]));
        assert_eq!(hashes[2], hash_data(&[0u8; 2]));
    }

    #[test]
    fn hash_chunked_file_distinct_chunks() {
        // Distinct bytes per chunk so a reordered or repeated chunk is detected.
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("distinct.bin");
        let data: Vec<u8> = (0u8..10).collect();
        std::fs::write(&p, &data).unwrap();
        let hashes = hash_file_chunked(&p, 4, 10).unwrap();
        let expected = vec![
            hash_data(&data[0..4]),
            hash_data(&data[4..8]),
            hash_data(&data[8..10]),
        ];
        assert_eq!(hashes, expected);
    }

    #[test]
    fn hash_chunked_exact_multiple_has_no_trailing_chunk() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("exact.bin");
        let data: Vec<u8> = (0u8..8).collect();
        std::fs::write(&p, &data).unwrap();
        let hashes = hash_file_chunked(&p, 4, 8).unwrap();
        assert_eq!(hashes, vec![hash_data(&data[..4]), hash_data(&data[4..])]);
    }

    #[test]
    fn hash_chunked_chunk_larger_than_file() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("small.bin");
        let data: Vec<u8> = (0u8..10).collect();
        std::fs::write(&p, &data).unwrap();
        let hashes = hash_file_chunked(&p, 64, 10).unwrap();
        assert_eq!(hashes, vec![hash_data(&data)]);
    }

    #[test]
    fn hash_chunked_file_is_deterministic() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("testfile");
        let chunk_size: u64 = 1024;
        let data: Vec<u8> = (0..3 * chunk_size).map(|i| (i % 256) as u8).collect();
        std::fs::write(&p, &data).unwrap();

        let h1 = hash_file_chunked(&p, chunk_size, data.len() as u64).unwrap();
        let h2 = hash_file_chunked(&p, chunk_size, data.len() as u64).unwrap();
        assert_eq!(h1.len(), 3);
        assert_eq!(h1, h2);
    }

    #[test]
    fn hash_chunked_empty_file() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("empty.bin");
        File::create(&p).unwrap();
        let hashes = hash_file_chunked(&p, 4, 0).unwrap();
        assert_eq!(hashes.len(), 1);
        assert_eq!(hashes[0], hash_data(b""));
    }

    #[test]
    fn hash_chunked_rejects_zero_chunk_size() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("f.bin");
        std::fs::write(&p, b"data").unwrap();
        let err = hash_file_chunked(&p, 0, 4).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidInput);
        assert!(err.to_string().contains("chunk_size > 0"));
    }

    #[test]
    fn hash_chunked_size_mismatch_longer_on_disk() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("f.bin");
        std::fs::write(&p, [0u8; 10]).unwrap();
        let err = hash_file_chunked(&p, 4, 5).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("size"), "{err}");
    }

    #[test]
    fn hash_chunked_size_mismatch_shorter_on_disk() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("f.bin");
        std::fs::write(&p, [0u8; 5]).unwrap();
        let err = hash_file_chunked(&p, 4, 10).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("size"), "{err}");
    }

    #[test]
    fn hash_algorithm_serde() {
        let json = serde_json::to_string(&HashAlgorithm::Xxh128).unwrap();
        assert_eq!(json, "\"xxh128\"");
        let parsed: HashAlgorithm = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed, HashAlgorithm::Xxh128);
    }

    #[test]
    fn hash_algorithm_extension() {
        assert_eq!(HashAlgorithm::Xxh128.extension(), "xxh128");
    }

    #[test]
    fn hash_algorithm_display_matches_extension() {
        assert_eq!(HashAlgorithm::Xxh128.to_string(), "xxh128");
    }

    #[test]
    fn human_readable_bytes() {
        assert_eq!(human_readable_file_size(0), "0 B");
        assert_eq!(human_readable_file_size(1), "1 B");
        assert_eq!(human_readable_file_size(999), "999 B");
    }

    #[test]
    fn human_readable_kilobytes() {
        assert_eq!(human_readable_file_size(1_000), "1 KB");
        assert_eq!(human_readable_file_size(1_500), "1.5 KB");
    }

    #[test]
    fn human_readable_rounding_carries_into_next_unit() {
        // 999.994 KB rounds down and stays in KB; 999.999 KB rounds up to
        // 1000.00 and must be reported in the next unit instead.
        assert_eq!(human_readable_file_size(999_994), "999.99 KB");
        assert_eq!(human_readable_file_size(999_999), "1 MB");
    }

    #[test]
    fn human_readable_megabytes() {
        assert_eq!(human_readable_file_size(1_000_000), "1 MB");
        assert_eq!(human_readable_file_size(256 * 1024 * 1024), "268.44 MB");
    }

    #[test]
    fn human_readable_gigabytes() {
        assert_eq!(human_readable_file_size(1_000_000_000), "1 GB");
    }

    #[test]
    fn human_readable_terabytes() {
        assert_eq!(human_readable_file_size(1_000_000_000_000), "1 TB");
    }

    #[test]
    fn human_readable_petabytes() {
        assert_eq!(human_readable_file_size(1_000_000_000_000_000), "1 PB");
    }

    #[test]
    fn human_readable_exabytes() {
        assert_eq!(human_readable_file_size(1_000_000_000_000_000_000), "1 EB");
        assert_eq!(human_readable_file_size(u64::MAX), "18.45 EB");
    }
}