Skip to main content

rustack_s3_core/
checksums.rs

1//! Checksum computation for S3 objects.
2//!
//! Provides functions to compute MD5, SHA-1, SHA-256, CRC32, CRC32C, and
//! CRC64NVME checksums used by S3 for ETags and the additional checksum
//! algorithms supported by the `x-amz-checksum-*` headers.
6//!
7//! # Streaming Hashing
8//!
9//! For large objects that cannot be buffered entirely in memory, use
10//! [`StreamingHasher`] to incrementally feed data and obtain the final
11//! results via [`HasherResult`].
12
13use std::{fmt, str::FromStr};
14
15use base64::{Engine, engine::general_purpose::STANDARD as BASE64_STANDARD};
16use digest::Digest;
17
18// ---------------------------------------------------------------------------
19// ChecksumAlgorithm
20// ---------------------------------------------------------------------------
21
/// Additional checksum algorithms that can accompany an S3 object.
///
/// MD5 is deliberately absent: it is always computed for the ETag and is
/// therefore never selected through this enum.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum ChecksumAlgorithm {
    /// CRC-32 (IEEE 802.3).
    Crc32,
    /// CRC-32C (Castagnoli).
    Crc32c,
    /// CRC-64/NVME (NVMe polynomial, hardware-accelerated).
    Crc64Nvme,
    /// SHA-1.
    Sha1,
    /// SHA-256.
    Sha256,
}

impl ChecksumAlgorithm {
    /// Upper-case name of the algorithm as it is spelled in S3 headers.
    #[must_use]
    pub fn as_str(&self) -> &'static str {
        match *self {
            ChecksumAlgorithm::Sha256 => "SHA256",
            ChecksumAlgorithm::Sha1 => "SHA1",
            ChecksumAlgorithm::Crc64Nvme => "CRC64NVME",
            ChecksumAlgorithm::Crc32c => "CRC32C",
            ChecksumAlgorithm::Crc32 => "CRC32",
        }
    }
}

impl fmt::Display for ChecksumAlgorithm {
    /// Writes exactly the text returned by [`ChecksumAlgorithm::as_str`].
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = self.as_str();
        f.write_str(name)
    }
}
57
58/// Error returned when parsing a [`ChecksumAlgorithm`] from a string fails.
59#[derive(Debug, Clone, thiserror::Error)]
60#[error("unknown checksum algorithm: {0}")]
61pub struct ParseChecksumAlgorithmError(String);
62
63impl FromStr for ChecksumAlgorithm {
64    type Err = ParseChecksumAlgorithmError;
65
66    fn from_str(s: &str) -> Result<Self, Self::Err> {
67        match s.to_ascii_uppercase().as_str() {
68            "CRC32" => Ok(Self::Crc32),
69            "CRC32C" => Ok(Self::Crc32c),
70            "CRC64NVME" => Ok(Self::Crc64Nvme),
71            "SHA1" => Ok(Self::Sha1),
72            "SHA256" => Ok(Self::Sha256),
73            _ => Err(ParseChecksumAlgorithmError(s.to_owned())),
74        }
75    }
76}
77
78// ---------------------------------------------------------------------------
79// ChecksumValue
80// ---------------------------------------------------------------------------
81
82/// A base64-encoded checksum value paired with its algorithm.
83#[derive(Debug, Clone, PartialEq, Eq)]
84pub struct ChecksumValue {
85    /// The algorithm used to compute this checksum.
86    pub algorithm: ChecksumAlgorithm,
87    /// The base64-encoded checksum.
88    pub value: String,
89}
90
91// ---------------------------------------------------------------------------
92// Standalone checksum functions
93// ---------------------------------------------------------------------------
94
95/// Compute the hex-encoded MD5 digest of `data`.
96///
97/// This is the raw hex digest used internally. For an S3-formatted ETag (quoted),
98/// use [`compute_etag`].
99///
100/// # Examples
101///
102/// ```
103/// use rustack_s3_core::checksums::compute_md5;
104///
105/// let digest = compute_md5(b"hello");
106/// assert_eq!(digest, "5d41402abc4b2a76b9719d911017c592");
107/// ```
108#[must_use]
109pub fn compute_md5(data: &[u8]) -> String {
110    let hash = md5::Md5::digest(data);
111    hex::encode(hash)
112}
113
114/// Compute the quoted hex-encoded MD5 digest of `data`, suitable for use as
115/// an S3 ETag.
116///
117/// The returned string is surrounded by double quotes, e.g.
118/// `"5d41402abc4b2a76b9719d911017c592"`.
119///
120/// # Examples
121///
122/// ```
123/// use rustack_s3_core::checksums::compute_etag;
124///
125/// let etag = compute_etag(b"");
126/// assert_eq!(etag, "\"d41d8cd98f00b204e9800998ecf8427e\"");
127/// ```
128#[must_use]
129pub fn compute_etag(data: &[u8]) -> String {
130    let md5_hex = compute_md5(data);
131    format!("\"{md5_hex}\"")
132}
133
134/// Compute a composite ETag for a multipart upload.
135///
136/// The composite ETag is the MD5 of the concatenated binary MD5 digests of
137/// each part, formatted as `"<hex>-<part_count>"`.
138///
139/// Each entry in `part_md5_hexes` should be the *unquoted* hex MD5 of a part.
140///
141/// # Examples
142///
143/// ```
144/// use rustack_s3_core::checksums::compute_multipart_etag;
145///
146/// let part_hexes = ["5d41402abc4b2a76b9719d911017c592"];
147/// let etag = compute_multipart_etag(&part_hexes, 1);
148/// assert!(etag.ends_with("-1\""));
149/// ```
150#[must_use]
151pub fn compute_multipart_etag(part_md5_hexes: &[impl AsRef<str>], part_count: usize) -> String {
152    let mut combined = Vec::with_capacity(part_md5_hexes.len() * 16);
153    for hex_str in part_md5_hexes {
154        let hex_str = hex_str.as_ref().trim_matches('"');
155        if let Ok(bytes) = hex::decode(hex_str) {
156            combined.extend_from_slice(&bytes);
157        }
158    }
159    let final_md5 = hex::encode(md5::Md5::digest(&combined));
160    format!("\"{final_md5}-{part_count}\"")
161}
162
163/// Compute a base64-encoded checksum for the given algorithm.
164///
165/// # Examples
166///
167/// ```
168/// use rustack_s3_core::checksums::{ChecksumAlgorithm, compute_checksum};
169///
170/// let b64 = compute_checksum(ChecksumAlgorithm::Crc32, b"hello");
171/// assert!(!b64.is_empty());
172/// ```
173#[must_use]
174pub fn compute_checksum(algorithm: ChecksumAlgorithm, data: &[u8]) -> String {
175    match algorithm {
176        ChecksumAlgorithm::Crc32 => {
177            let mut hasher = crc32fast::Hasher::new();
178            hasher.update(data);
179            let value = hasher.finalize();
180            BASE64_STANDARD.encode(value.to_be_bytes())
181        }
182        ChecksumAlgorithm::Crc32c => {
183            let value = crc32c::crc32c(data);
184            BASE64_STANDARD.encode(value.to_be_bytes())
185        }
186        ChecksumAlgorithm::Crc64Nvme => {
187            let mut hasher = crc64fast_nvme::Digest::new();
188            hasher.write(data);
189            let value = hasher.sum64();
190            BASE64_STANDARD.encode(value.to_be_bytes())
191        }
192        ChecksumAlgorithm::Sha1 => {
193            let hash = sha1::Sha1::digest(data);
194            BASE64_STANDARD.encode(hash)
195        }
196        ChecksumAlgorithm::Sha256 => {
197            let hash = sha2::Sha256::digest(data);
198            BASE64_STANDARD.encode(hash)
199        }
200    }
201}
202
203/// Compute a composite checksum for a multipart upload.
204///
205/// The composite checksum is computed by concatenating the raw (decoded)
206/// checksums of each part and then computing the checksum of that
207/// concatenation. The result is base64-encoded with a `-<part_count>` suffix.
208///
209/// Each entry in `part_checksums_b64` should be the base64-encoded checksum
210/// of a single part (without a part-count suffix).
211///
212/// # Examples
213///
214/// ```
215/// use rustack_s3_core::checksums::{ChecksumAlgorithm, compute_checksum, compute_composite_checksum};
216///
217/// let part1 = compute_checksum(ChecksumAlgorithm::Crc32, b"hello");
218/// let composite = compute_composite_checksum(ChecksumAlgorithm::Crc32, &[part1]);
219/// assert!(composite.contains("-1"));
220/// ```
221#[must_use]
222pub fn compute_composite_checksum(
223    algorithm: ChecksumAlgorithm,
224    part_checksums_b64: &[impl AsRef<str>],
225) -> String {
226    let mut combined = Vec::new();
227    for b64 in part_checksums_b64 {
228        if let Ok(bytes) = BASE64_STANDARD.decode(b64.as_ref()) {
229            combined.extend_from_slice(&bytes);
230        }
231    }
232    let checksum_b64 = compute_checksum(algorithm, &combined);
233    format!("{checksum_b64}-{}", part_checksums_b64.len())
234}
235
236// ---------------------------------------------------------------------------
237// StreamingHasher
238// ---------------------------------------------------------------------------
239
240/// Result produced by [`StreamingHasher::finish`].
241#[derive(Debug, Clone)]
242pub struct HasherResult {
243    /// Hex-encoded MD5 digest.
244    pub md5_hex: String,
245    /// Per-algorithm base64-encoded checksums (only for algorithms that were
246    /// requested when the hasher was created).
247    pub checksums: Vec<ChecksumValue>,
248}
249
250/// Incremental hasher that computes MD5 and optionally additional checksums
251/// over a stream of data chunks.
252///
253/// # Examples
254///
255/// ```
256/// use rustack_s3_core::checksums::{ChecksumAlgorithm, StreamingHasher};
257///
258/// let mut hasher = StreamingHasher::new(&[ChecksumAlgorithm::Sha256]);
259/// hasher.update(b"hello ");
260/// hasher.update(b"world");
261/// let result = hasher.finish();
262/// assert!(!result.md5_hex.is_empty());
263/// assert_eq!(result.checksums.len(), 1);
264/// ```
265pub struct StreamingHasher {
266    md5: md5::Md5,
267    sha1: Option<sha1::Sha1>,
268    sha256: Option<sha2::Sha256>,
269    crc32: Option<crc32fast::Hasher>,
270    crc32c: Option<u32>,
271    crc64nvme: Option<crc64fast_nvme::Digest>,
272    algorithms: Vec<ChecksumAlgorithm>,
273}
274
275impl fmt::Debug for StreamingHasher {
276    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
277        f.debug_struct("StreamingHasher")
278            .field("algorithms", &self.algorithms)
279            .finish_non_exhaustive()
280    }
281}
282
283impl StreamingHasher {
284    /// Create a new streaming hasher.
285    ///
286    /// MD5 is always computed. Provide additional algorithms in `algorithms`
287    /// to compute extra checksums.
288    #[must_use]
289    pub fn new(algorithms: &[ChecksumAlgorithm]) -> Self {
290        let mut sha1 = None;
291        let mut sha256 = None;
292        let mut crc32 = None;
293        let mut crc32c = None;
294        let mut crc64nvme = None;
295
296        for &algo in algorithms {
297            match algo {
298                ChecksumAlgorithm::Sha1 => {
299                    sha1 = Some(<sha1::Sha1 as Digest>::new());
300                }
301                ChecksumAlgorithm::Sha256 => {
302                    sha256 = Some(<sha2::Sha256 as Digest>::new());
303                }
304                ChecksumAlgorithm::Crc32 => {
305                    crc32 = Some(crc32fast::Hasher::new());
306                }
307                ChecksumAlgorithm::Crc32c => {
308                    crc32c = Some(0);
309                }
310                ChecksumAlgorithm::Crc64Nvme => {
311                    crc64nvme = Some(crc64fast_nvme::Digest::new());
312                }
313            }
314        }
315
316        Self {
317            md5: <md5::Md5 as Digest>::new(),
318            sha1,
319            sha256,
320            crc32,
321            crc32c,
322            crc64nvme,
323            algorithms: algorithms.to_vec(),
324        }
325    }
326
327    /// Feed more data into the hasher.
328    pub fn update(&mut self, data: &[u8]) {
329        Digest::update(&mut self.md5, data);
330
331        if let Some(ref mut h) = self.sha1 {
332            Digest::update(h, data);
333        }
334        if let Some(ref mut h) = self.sha256 {
335            Digest::update(h, data);
336        }
337        if let Some(ref mut h) = self.crc32 {
338            h.update(data);
339        }
340        if let Some(ref mut val) = self.crc32c {
341            *val = crc32c::crc32c_append(*val, data);
342        }
343        if let Some(ref mut h) = self.crc64nvme {
344            h.write(data);
345        }
346    }
347
348    /// Finalize the hasher and return the results.
349    ///
350    /// This consumes the hasher.
351    #[must_use]
352    pub fn finish(self) -> HasherResult {
353        let md5_hex = hex::encode(Digest::finalize(self.md5));
354
355        let mut checksums = Vec::with_capacity(self.algorithms.len());
356        for algo in &self.algorithms {
357            let value = match algo {
358                ChecksumAlgorithm::Sha1 => {
359                    let hash = Digest::finalize(self.sha1.clone().unwrap_or_default());
360                    BASE64_STANDARD.encode(hash)
361                }
362                ChecksumAlgorithm::Sha256 => {
363                    let hash = Digest::finalize(self.sha256.clone().unwrap_or_default());
364                    BASE64_STANDARD.encode(hash)
365                }
366                ChecksumAlgorithm::Crc32 => {
367                    let val = self
368                        .crc32
369                        .as_ref()
370                        .map_or(0, crc32fast::Hasher::clone_finalize);
371                    BASE64_STANDARD.encode(val.to_be_bytes())
372                }
373                ChecksumAlgorithm::Crc32c => {
374                    let val = self.crc32c.unwrap_or(0);
375                    BASE64_STANDARD.encode(val.to_be_bytes())
376                }
377                ChecksumAlgorithm::Crc64Nvme => {
378                    let val = self
379                        .crc64nvme
380                        .as_ref()
381                        .map_or(0, crc64fast_nvme::Digest::sum64);
382                    BASE64_STANDARD.encode(val.to_be_bytes())
383                }
384            };
385            checksums.push(ChecksumValue {
386                algorithm: *algo,
387                value,
388            });
389        }
390
391        HasherResult { md5_hex, checksums }
392    }
393}
394
395// ---------------------------------------------------------------------------
396// crc32fast Hasher clone_finalize helper
397// ---------------------------------------------------------------------------
398
399/// Extension trait to finalize a cloned `crc32fast::Hasher` without
400/// consuming the original (used by `StreamingHasher::finish`).
401trait CloneFinalize {
402    /// Clone and finalize, returning the CRC-32 value.
403    fn clone_finalize(&self) -> u32;
404}
405
406impl CloneFinalize for crc32fast::Hasher {
407    fn clone_finalize(&self) -> u32 {
408        self.clone().finalize()
409    }
410}
411
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode a base64 checksum and report how many raw bytes it contains.
    fn raw_len(b64: &str) -> usize {
        BASE64_STANDARD.decode(b64).expect("test decode").len()
    }

    // -----------------------------------------------------------------------
    // ChecksumAlgorithm
    // -----------------------------------------------------------------------

    #[test]
    fn test_should_display_checksum_algorithm() {
        let cases = [
            (ChecksumAlgorithm::Crc32, "CRC32"),
            (ChecksumAlgorithm::Crc32c, "CRC32C"),
            (ChecksumAlgorithm::Crc64Nvme, "CRC64NVME"),
            (ChecksumAlgorithm::Sha1, "SHA1"),
            (ChecksumAlgorithm::Sha256, "SHA256"),
        ];
        for (algo, text) in cases {
            assert_eq!(algo.to_string(), text);
        }
    }

    #[test]
    fn test_should_parse_checksum_algorithm() {
        let accepted = [
            ("CRC32", ChecksumAlgorithm::Crc32),
            ("crc32c", ChecksumAlgorithm::Crc32c),
            ("CRC64NVME", ChecksumAlgorithm::Crc64Nvme),
            ("sha1", ChecksumAlgorithm::Sha1),
            ("SHA256", ChecksumAlgorithm::Sha256),
        ];
        for (input, expected) in accepted {
            assert_eq!(input.parse::<ChecksumAlgorithm>().ok(), Some(expected));
        }
        assert!("unknown".parse::<ChecksumAlgorithm>().is_err());
    }

    // -----------------------------------------------------------------------
    // MD5 / ETag
    // -----------------------------------------------------------------------

    #[test]
    fn test_should_compute_md5_empty() {
        let digest = compute_md5(b"");
        assert_eq!(digest, "d41d8cd98f00b204e9800998ecf8427e");
    }

    #[test]
    fn test_should_compute_md5_hello() {
        let digest = compute_md5(b"hello");
        assert_eq!(digest, "5d41402abc4b2a76b9719d911017c592");
    }

    #[test]
    fn test_should_compute_etag_empty() {
        let etag = compute_etag(b"");
        assert_eq!(etag, "\"d41d8cd98f00b204e9800998ecf8427e\"");
    }

    #[test]
    fn test_should_compute_etag_with_data() {
        let etag = compute_etag(b"hello");
        assert!(etag.starts_with('"'));
        assert!(etag.ends_with('"'));
        // 32 hex characters plus the two surrounding quotes.
        assert_eq!(etag.len(), 34);
    }

    // -----------------------------------------------------------------------
    // Multipart ETag
    // -----------------------------------------------------------------------

    #[test]
    fn test_should_compute_multipart_etag() {
        let parts = [compute_md5(b"hello"), compute_md5(b"world")];
        let etag = compute_multipart_etag(&parts, 2);
        assert!(etag.starts_with('"'));
        assert!(etag.ends_with("-2\""));
    }

    #[test]
    fn test_should_compute_multipart_etag_single_part() {
        let parts = [compute_md5(b"data")];
        let etag = compute_multipart_etag(&parts, 1);
        assert!(etag.ends_with("-1\""));
    }

    // -----------------------------------------------------------------------
    // Algorithm-specific checksums
    // -----------------------------------------------------------------------

    #[test]
    fn test_should_compute_crc32_checksum() {
        let b64 = compute_checksum(ChecksumAlgorithm::Crc32, b"hello");
        assert!(!b64.is_empty());
        // The value must decode back to a 4-byte CRC.
        assert_eq!(raw_len(&b64), 4);
    }

    #[test]
    fn test_should_compute_crc32c_checksum() {
        let b64 = compute_checksum(ChecksumAlgorithm::Crc32c, b"hello");
        assert!(!b64.is_empty());
    }

    #[test]
    fn test_should_compute_crc64nvme_checksum() {
        let b64 = compute_checksum(ChecksumAlgorithm::Crc64Nvme, b"hello");
        assert!(!b64.is_empty());
        assert_eq!(raw_len(&b64), 8);
    }

    #[test]
    fn test_should_compute_sha1_checksum() {
        let b64 = compute_checksum(ChecksumAlgorithm::Sha1, b"hello");
        assert_eq!(raw_len(&b64), 20);
    }

    #[test]
    fn test_should_compute_sha256_checksum() {
        let b64 = compute_checksum(ChecksumAlgorithm::Sha256, b"hello");
        assert_eq!(raw_len(&b64), 32);
    }

    // -----------------------------------------------------------------------
    // Composite checksums
    // -----------------------------------------------------------------------

    #[test]
    fn test_should_compute_composite_checksum() {
        let parts = [
            compute_checksum(ChecksumAlgorithm::Sha256, b"part1"),
            compute_checksum(ChecksumAlgorithm::Sha256, b"part2"),
        ];
        let composite = compute_composite_checksum(ChecksumAlgorithm::Sha256, &parts);
        assert!(composite.contains("-2"));
    }

    // -----------------------------------------------------------------------
    // StreamingHasher
    // -----------------------------------------------------------------------

    #[test]
    fn test_should_stream_md5_only() {
        let mut hasher = StreamingHasher::new(&[]);
        hasher.update(b"hello");
        let HasherResult { md5_hex, checksums } = hasher.finish();
        assert_eq!(md5_hex, "5d41402abc4b2a76b9719d911017c592");
        assert!(checksums.is_empty());
    }

    #[test]
    fn test_should_stream_with_sha256() {
        let mut hasher = StreamingHasher::new(&[ChecksumAlgorithm::Sha256]);
        for chunk in [&b"hello "[..], &b"world"[..]] {
            hasher.update(chunk);
        }
        let result = hasher.finish();

        // MD5 of "hello world"
        assert_eq!(result.md5_hex, compute_md5(b"hello world"));

        // SHA-256 of "hello world"
        assert_eq!(result.checksums.len(), 1);
        let only = &result.checksums[0];
        assert_eq!(only.algorithm, ChecksumAlgorithm::Sha256);
        assert_eq!(
            only.value,
            compute_checksum(ChecksumAlgorithm::Sha256, b"hello world"),
        );
    }

    #[test]
    fn test_should_stream_multiple_algorithms() {
        let algos = [
            ChecksumAlgorithm::Crc32,
            ChecksumAlgorithm::Crc32c,
            ChecksumAlgorithm::Crc64Nvme,
            ChecksumAlgorithm::Sha1,
            ChecksumAlgorithm::Sha256,
        ];
        let mut hasher = StreamingHasher::new(&algos);
        hasher.update(b"test data");
        let result = hasher.finish();

        // The length check guarantees the zip below covers every algorithm.
        assert_eq!(result.checksums.len(), 5);
        for (actual, algo) in result.checksums.iter().zip(algos) {
            assert_eq!(actual.algorithm, algo);
            assert_eq!(actual.value, compute_checksum(algo, b"test data"));
        }
    }

    #[test]
    fn test_should_match_single_shot_and_streaming_results() {
        let data = b"The quick brown fox jumps over the lazy dog";

        let expected_md5 = compute_md5(data);
        let expected_sha256 = compute_checksum(ChecksumAlgorithm::Sha256, data);

        let mut hasher = StreamingHasher::new(&[ChecksumAlgorithm::Sha256]);
        // Feed the input in three uneven chunks.
        let (head, rest) = data.split_at(10);
        let (middle, tail) = rest.split_at(20);
        hasher.update(head);
        hasher.update(middle);
        hasher.update(tail);
        let result = hasher.finish();

        assert_eq!(result.md5_hex, expected_md5);
        assert_eq!(result.checksums[0].value, expected_sha256);
    }
}