// c2pa/utils/hash_utils.rs
// Copyright 2022 Adobe. All rights reserved.
// This file is licensed to you under the Apache License,
// Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
// or the MIT license (http://opensource.org/licenses/MIT),
// at your option.

// Unless required by applicable law or agreed to in writing,
// this software is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR REPRESENTATIONS OF ANY KIND, either express or
// implied. See the LICENSE-MIT and LICENSE-APACHE files for the
// specific language governing permissions and limitations under
// each license.
13
14use std::{
15    fs::File,
16    io::{Cursor, Read, Seek, SeekFrom},
17    ops::RangeInclusive,
18    path::Path,
19};
20
21use range_set::RangeSet;
22use serde::{Deserialize, Serialize};
23use serde_json::Value;
24// direct sha functions
25use sha2::{Digest, Sha256, Sha384, Sha512};
26
27use crate::{crypto::base64::encode, utils::io_utils::stream_len, Error, Result};
28
// Largest buffer read and hashed in a single chunk.
const MAX_HASH_BUF: usize = 256 * 1024 * 1024; // cap memory usage to 256MB
30
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
/// Defines a hash range to be used with `hash_stream_by_alg`
///
/// A `HashRange` describes a contiguous byte span of a stream via
/// `start`/`length`. Whether the span is included in or excluded from
/// the hash is decided by the caller of `hash_stream_by_alg`.
pub struct HashRange {
    // byte offset where the range begins
    start: u64,
    // number of bytes covered by the range
    length: u64,

    // not serialized: only used transiently while computing BMFF V2 hashes
    #[serde(skip)]
    bmff_offset: Option<u64>, /* optional tracking of offset positions to include in BMFF_V2 hashes in BE format */
}
40
41impl HashRange {
42    pub fn new(start: u64, length: u64) -> Self {
43        HashRange {
44            start,
45            length,
46            bmff_offset: None,
47        }
48    }
49
50    /// update the start value
51    #[allow(dead_code)]
52    pub fn set_start(&mut self, start: u64) {
53        self.start = start;
54    }
55
56    /// return start as usize
57    pub fn start(&self) -> u64 {
58        self.start
59    }
60
61    /// return length as usize
62    pub fn length(&self) -> u64 {
63        self.length
64    }
65
66    pub fn set_length(&mut self, length: u64) {
67        self.length = length;
68    }
69
70    // set offset for BMFF_V2 to be hashed in addition to data
71    pub fn set_bmff_offset(&mut self, offset: u64) {
72        self.bmff_offset = Some(offset);
73    }
74
75    // get option offset for BMFF_V2 hash
76    pub fn bmff_offset(&self) -> Option<u64> {
77        self.bmff_offset
78    }
79}
80
/// Compare two byte vectors return true if match, false otherwise
///
/// Slice equality (`==`) already compares length first and then
/// contents, which is exactly what the previous hand-rolled
/// `zip`/`all` loop implemented.
pub fn vec_compare(va: &[u8], vb: &[u8]) -> bool {
    va == vb
}
88
/// Incremental hasher wrapping the supported SHA-2 digest widths.
/// Each variant owns the running digest state for its algorithm.
#[derive(Clone, Debug)]
pub enum Hasher {
    SHA256(Sha256),
    SHA384(Sha384),
    SHA512(Sha512),
}
95
96impl Default for Hasher {
97    fn default() -> Self {
98        Hasher::SHA256(Sha256::new())
99    }
100}
101
102impl Hasher {
103    // update hash value with new data
104    pub fn update(&mut self, data: &[u8]) {
105        use Hasher::*;
106        // update the hash
107        match self {
108            SHA256(ref mut d) => d.update(data),
109            SHA384(ref mut d) => d.update(data),
110            SHA512(ref mut d) => d.update(data),
111        }
112    }
113
114    // consume hasher and return the final digest
115    pub fn finalize(hasher_enum: Hasher) -> Vec<u8> {
116        use Hasher::*;
117        // return the hash
118        match hasher_enum {
119            SHA256(d) => d.finalize().to_vec(),
120            SHA384(d) => d.finalize().to_vec(),
121            SHA512(d) => d.finalize().to_vec(),
122        }
123    }
124
125    pub fn finalize_reset(&mut self) -> Vec<u8> {
126        use Hasher::*;
127
128        // return the hash and leave the Hasher open and reset
129        match self {
130            SHA256(ref mut d) => d.finalize_reset().to_vec(),
131            SHA384(ref mut d) => d.finalize_reset().to_vec(),
132            SHA512(ref mut d) => d.finalize_reset().to_vec(),
133        }
134    }
135
136    pub fn new(alg: &str) -> Result<Hasher> {
137        match alg {
138            "sha256" => Ok(Hasher::SHA256(Sha256::new())),
139            "sha384" => Ok(Hasher::SHA384(Sha384::new())),
140            "sha512" => Ok(Hasher::SHA512(Sha512::new())),
141            _ => Err(Error::UnsupportedType),
142        }
143    }
144}
145
146// Return hash bytes for desired hashing algorithm.
147pub fn hash_by_alg(alg: &str, data: &[u8], exclusions: Option<Vec<HashRange>>) -> Vec<u8> {
148    let mut reader = Cursor::new(data);
149
150    hash_stream_by_alg(alg, &mut reader, exclusions, true).unwrap_or_default()
151}
152
153// Return hash inclusive bytes for desired hashing algorithm.
154pub fn hash_by_alg_with_inclusions(alg: &str, data: &[u8], inclusions: Vec<HashRange>) -> Vec<u8> {
155    let mut reader = Cursor::new(data);
156
157    hash_stream_by_alg(alg, &mut reader, Some(inclusions), false).unwrap_or_default()
158}
159
160// Return hash bytes for asset using desired hashing algorithm.
161pub fn hash_asset_by_alg(
162    alg: &str,
163    asset_path: &Path,
164    exclusions: Option<Vec<HashRange>>,
165) -> Result<Vec<u8>> {
166    let mut file = File::open(asset_path)?;
167    hash_stream_by_alg(alg, &mut file, exclusions, true)
168}
169
170// Return hash inclusive bytes for asset using desired hashing algorithm.
171pub fn hash_asset_by_alg_with_inclusions(
172    alg: &str,
173    asset_path: &Path,
174    inclusions: Vec<HashRange>,
175) -> Result<Vec<u8>> {
176    let mut file = File::open(asset_path)?;
177    hash_stream_by_alg(alg, &mut file, Some(inclusions), false)
178}
179
/*  Returns hash bytes for a stream using the desired hashing algorithm.  The function handles the many
    possible hash requirements of C2PA.  The function accepts a source stream 'data', an optional
    set of hash ranges 'hash_range' and a boolean to indicate whether the hash range is an exclusion
    or inclusion set of hash ranges.

    The basic case is to hash a stream without hash ranges:
    The data represents a single contiguous stream of bytes to be hashed, where D are data bytes

    to_be_hashed: [DDDDDDDDD...DDDDDDDDDD]

    The data is then chunked and hashed in groups to reduce memory
    footprint and increase performance.

    The most common case for C2PA is the use of an exclusion hash.  In this case the 'hash_range' indicates
    which byte ranges should be excluded, shown here depicted with I for included bytes and X for excluded bytes

    to_be_hashed: [IIIIXXXIIIIXXXXXIIIXXIII...IIII]

    In this case the data is split into a set of ranges covering the included bytes.  The set of ranged bytes
    is then chunked and hashed just like the default case.

    The opposite of this is when 'is_exclusion' is set to false, indicating the 'hash_ranges' represent the bytes
    to include in the hash.  Here all the bytes in 'data' are excluded except those explicitly referenced.

    to_be_hashed: [XXXXXXIIIIXXXXXIIXXXX...XXXX]

    Again a set of ranged bytes is created and hashed as described above.

    The last case is a special requirement for BMFF-based assets (exclusion hashes only).  For this case we not
    only hash the data but also the location where the data was found in the asset.  To do this we add a special
    HashRange object to the hash ranges to indicate which locations in the stream require this special offset
    hash.  To make processing efficient we again split the data into ranges at not just the exclusion
    points but also at these markers.  The hashing loop knows to pause at these special marker ranges to insert
    the hash of the offset.  The stream sent to the hashing loop logically looks like this, where M is the marker.

    to_be_hashed: [IIIIIXXXXXMIIIIIMXXXXXMXXXXIII...III]

    The data is again split into range sets, breaking at the exclusion points and now also the markers.
*/
/// Internal implementation of [`hash_stream_by_alg`] with an optional per-range
/// progress/cancellation callback.  SDK internals that have a [`Context`] available
/// pass a closure that calls [`Context::check_progress`]; the public wrapper supplies
/// `None` so external callers are unaffected.
///
/// The callback is invoked once per hash range, before that range is hashed,
/// with `(current_step, total_ranges)`.  Returning an error from the callback
/// aborts hashing and propagates that error to the caller.
///
/// # Errors
/// * [`Error::UnsupportedType`] if `alg` is not "sha256"/"sha384"/"sha512".
/// * [`Error::OtherError`] if the stream is empty.
/// * [`Error::BadParam`] if a range is malformed or extends past the stream end.
/// * [`Error::ThreadReceiveError`] if the background hashing thread dies.
/// * Any error returned by the `progress` callback (cancellation).
pub(crate) fn hash_stream_by_alg_with_progress<R>(
    alg: &str,
    data: &mut R,
    hash_range: Option<Vec<HashRange>>,
    is_exclusion: bool,
    mut progress: Option<&mut dyn FnMut(u32, u32) -> Result<()>>,
) -> Result<Vec<u8>>
where
    R: Read + Seek + ?Sized,
{
    // BMFF V2 marker offsets collected while building the range list; each is
    // hashed as its big-endian byte representation instead of stream data.
    let mut bmff_v2_starts: Vec<u64> = Vec::new();

    use Hasher::*;
    let mut hasher_enum = match alg {
        "sha256" => SHA256(Sha256::new()),
        "sha384" => SHA384(Sha384::new()),
        "sha512" => SHA512(Sha512::new()),
        _ => {
            return Err(Error::UnsupportedType);
        }
    };

    let data_len = stream_len(data)?;
    data.rewind()?;

    if data_len < 1 {
        return Err(Error::OtherError("no data to hash".into()));
    }

    // Build the final, ordered list of inclusive byte ranges to hash.
    let ranges = match hash_range {
        Some(mut hr) if !hr.is_empty() => {
            // hash data skipping excluded regions
            // sort the exclusions
            hr.sort_by_key(|a| a.start());

            // verify structure of blocks
            let num_blocks = hr.len();
            // NOTE(review): this checks only the last range (after sorting by
            // start) against the stream length; an earlier range with a larger
            // end is assumed not to occur — TODO confirm.
            let range_end = hr[num_blocks - 1].start() + hr[num_blocks - 1].length();
            let data_end = data_len - 1;

            // range extends past end of file so fail
            if data_len < range_end {
                return Err(Error::BadParam(
                    "The exclusion range exceed the data length".to_string(),
                ));
            }

            if is_exclusion {
                //build final ranges
                let mut ranges_vec: Vec<RangeInclusive<u64>> = Vec::new();
                // start from the whole stream and carve out each exclusion
                let mut ranges = RangeSet::<[RangeInclusive<u64>; 1]>::from(0..=data_end);
                for exclusion in hr {
                    // add new BMFF V2 offset as a new range to be included so that we can
                    // pause to add the offset hash
                    if let Some(offset) = exclusion.bmff_offset() {
                        bmff_v2_starts.push(offset);
                        continue;
                    }

                    // zero-length exclusions are no-ops
                    if exclusion.length() == 0 {
                        continue;
                    }

                    // inclusive end of the exclusion; checked math rejects
                    // overflowing start+length combinations
                    let end = exclusion
                        .start()
                        .checked_add(exclusion.length())
                        .ok_or(Error::BadParam("No exclusion range".to_string()))?
                        .checked_sub(1)
                        .ok_or(Error::BadParam("No exclusion range".to_string()))?;
                    let exclusion_start = exclusion.start();
                    ranges.remove_range(exclusion_start..=end);
                }

                // merge standard ranges and BMFF V2 ranges into single list
                if !bmff_v2_starts.is_empty() {
                    bmff_v2_starts.sort();

                    // split ranges at BMFF V2 offsets and insert offset value
                    for r in ranges.into_smallvec() {
                        // if bmff_v2 offset is within the range then split the range at the offset
                        // and add both sides to ranges_vec
                        let mut current_range = r;
                        for os in &bmff_v2_starts {
                            if current_range.contains(os) {
                                if *current_range.start() == *os {
                                    // marker sits exactly at the range start; emit
                                    // only the single-byte marker range
                                    ranges_vec.push(RangeInclusive::new(*os, *os));
                                // offset
                                } else {
                                    ranges_vec
                                        .push(RangeInclusive::new(*current_range.start(), *os - 1)); // left side
                                    ranges_vec.push(RangeInclusive::new(*os, *os)); // offset
                                    current_range = RangeInclusive::new(*os, *current_range.end());
                                    // right side
                                }
                            }
                        }
                        ranges_vec.push(current_range);
                    }

                    // add in remaining BMFF V2 offsets that were not included in the ranges because of subsets
                    let range_start = RangeInclusive::new(0, 0);
                    let range_end = RangeInclusive::new(data_end, data_end);
                    let before_any_range = *ranges_vec.first().unwrap_or(&range_start).start();
                    let after_any_range = *ranges_vec.last().unwrap_or(&range_end).end();

                    for os in &bmff_v2_starts {
                        if !ranges_vec.iter().any(|r| r.contains(os))
                            && *os > before_any_range
                            && *os < after_any_range
                        {
                            ranges_vec.push(RangeInclusive::new(*os, *os));
                        }
                    }

                    // sort by start position
                    ranges_vec.sort_by(|a, b| {
                        let a_start = a.start();
                        let b_start = b.start();
                        a_start.cmp(b_start)
                    });

                    ranges_vec
                } else {
                    // no markers: the carved-up range set is used as-is
                    for r in ranges.into_smallvec() {
                        ranges_vec.push(r);
                    }
                    ranges_vec
                }
            } else {
                //build final ranges
                let mut ranges_vec: Vec<RangeInclusive<u64>> = Vec::new();
                for inclusion in hr {
                    if inclusion.length() == 0 {
                        continue;
                    }

                    let end = inclusion.start() + inclusion.length() - 1;
                    let inclusion_start = inclusion.start();

                    // add new BMFF V2 offset as a new range to be included so that we can
                    // pause to add the offset hash
                    if let Some(offset) = inclusion.bmff_offset() {
                        ranges_vec.push(RangeInclusive::new(offset, offset));
                        bmff_v2_starts.push(offset);
                    }

                    // add inclusion
                    ranges_vec.push(RangeInclusive::new(inclusion_start, end));
                }
                ranges_vec
            }
        }
        _ => {
            // no ranges supplied: hash the entire stream as one range
            let mut ranges_vec: Vec<RangeInclusive<u64>> = Vec::new();
            let data_end = data_len - 1;
            ranges_vec.push(RangeInclusive::new(0_u64, data_end));

            ranges_vec
        }
    };

    // progress reporting: one step per range
    let total = ranges.len() as u32;
    let mut step: u32 = 0;

    if cfg!(target_arch = "wasm32") {
        // wasm32 has no threads: hash each chunk inline on this thread.
        // hash the data for ranges
        for r in ranges {
            step += 1;
            if let Some(cb) = progress.as_mut() {
                cb(step, total)?;
            }

            let start = r.start();
            let end = r.end();
            let mut chunk_left = end - start + 1;

            // check to see if this range is an BMFF V2 offset to include in the hash
            // (single-byte marker range whose start matches a recorded offset)
            if bmff_v2_starts.contains(start) && end == start {
                hasher_enum.update(&start.to_be_bytes());
                continue;
            }

            // move to start of range
            data.seek(SeekFrom::Start(*start))?;

            loop {
                // read at most MAX_HASH_BUF bytes at a time to bound memory
                let mut chunk = vec![0u8; std::cmp::min(chunk_left as usize, MAX_HASH_BUF)];

                data.read_exact(&mut chunk)?;

                hasher_enum.update(&chunk);

                chunk_left -= chunk.len() as u64;
                if chunk_left == 0 {
                    break;
                }
            }
        }
    } else {
        // Native: pipeline the work — hash the current chunk on a spawned
        // thread while this thread reads the next chunk from the stream.
        // The hasher is moved into the worker and received back over the
        // channel before the next iteration uses it.
        // hash the data for ranges
        for r in ranges {
            step += 1;
            if let Some(cb) = progress.as_mut() {
                cb(step, total)?;
            }

            let start = r.start();
            let end = r.end();
            let mut chunk_left = end - start + 1;

            // check to see if this range is an BMFF V2 offset to include in the hash
            // (single-byte marker range whose start matches a recorded offset)
            if bmff_v2_starts.contains(start) && end == start {
                hasher_enum.update(&start.to_be_bytes());
                continue;
            }

            // move to start of range
            data.seek(SeekFrom::Start(*start))?;

            let mut chunk = vec![0u8; std::cmp::min(chunk_left as usize, MAX_HASH_BUF)];
            data.read_exact(&mut chunk)?;

            loop {
                let (tx, rx) = std::sync::mpsc::channel();

                chunk_left -= chunk.len() as u64;

                // hash the current chunk off-thread, sending the hasher back
                // when done; a send failure (receiver gone) is ignored here
                // and surfaces below as ThreadReceiveError
                std::thread::spawn(move || {
                    hasher_enum.update(&chunk);
                    tx.send(hasher_enum).unwrap_or_default();
                });

                // are we done
                if chunk_left == 0 {
                    hasher_enum = match rx.recv() {
                        Ok(hasher) => hasher,
                        Err(_) => return Err(Error::ThreadReceiveError),
                    };
                    break;
                }

                // read next chunk while we wait for hash
                let mut next_chunk = vec![0u8; std::cmp::min(chunk_left as usize, MAX_HASH_BUF)];
                data.read_exact(&mut next_chunk)?;

                // reclaim the hasher from the worker before reusing it
                hasher_enum = match rx.recv() {
                    Ok(hasher) => hasher,
                    Err(_) => return Err(Error::ThreadReceiveError),
                };

                chunk = next_chunk;
            }
        }
    }

    // return the hash
    Ok(Hasher::finalize(hasher_enum))
}
479
/// May be used to generate hashes in combination with embeddable APIs.
///
/// Hashes the stream `data` with algorithm `alg` ("sha256", "sha384", or
/// "sha512").  When `hash_range` is provided, `is_exclusion` selects whether
/// those ranges are excluded from (`true`) or included in (`false`) the hash;
/// see the module comment above for the full range semantics.
///
/// # Errors
/// Returns an error for an unsupported algorithm, an empty stream, malformed
/// or out-of-bounds ranges, or any underlying I/O failure.
pub fn hash_stream_by_alg<R>(
    alg: &str,
    data: &mut R,
    hash_range: Option<Vec<HashRange>>,
    is_exclusion: bool,
) -> Result<Vec<u8>>
where
    R: Read + Seek + ?Sized,
{
    // delegate to the internal implementation with no progress callback
    hash_stream_by_alg_with_progress(alg, data, hash_range, is_exclusion, None)
}
492
493// verify the hash using the specified algorithm
494pub fn verify_by_alg(
495    alg: &str,
496    hash: &[u8],
497    data: &[u8],
498    exclusions: Option<Vec<HashRange>>,
499) -> bool {
500    // hash with the same algorithm as target
501    let data_hash = hash_by_alg(alg, data, exclusions);
502    vec_compare(hash, &data_hash)
503}
504
505// verify the hash using the specified algorithm
506pub fn verify_asset_by_alg(
507    alg: &str,
508    hash: &[u8],
509    asset_path: &Path,
510    exclusions: Option<Vec<HashRange>>,
511) -> bool {
512    // hash with the same algorithm as target
513    if let Ok(data_hash) = hash_asset_by_alg(alg, asset_path, exclusions) {
514        vec_compare(hash, &data_hash)
515    } else {
516        false
517    }
518}
519
520pub fn verify_stream_by_alg<R>(
521    alg: &str,
522    hash: &[u8],
523    reader: &mut R,
524    hash_range: Option<Vec<HashRange>>,
525    is_exclusion: bool,
526) -> bool
527where
528    R: Read + Seek + ?Sized,
529{
530    if let Ok(data_hash) = hash_stream_by_alg(alg, reader, hash_range, is_exclusion) {
531        vec_compare(hash, &data_hash)
532    } else {
533        false
534    }
535}
536
537// Used by Merkle tree calculations to generate the pair wise hash
538pub fn concat_and_hash(alg: &str, left: &[u8], right: Option<&[u8]>) -> Vec<u8> {
539    let mut temp = left.to_vec();
540
541    if let Some(r) = right {
542        temp.append(&mut r.to_vec())
543    }
544
545    hash_by_alg(alg, &temp, None)
546}
547
548/// replace byte arrays with base64 encoded strings
549pub fn hash_to_b64(mut value: Value) -> Value {
550    use std::collections::VecDeque;
551
552    let mut queue = VecDeque::new();
553    queue.push_back(&mut value);
554
555    while let Some(current) = queue.pop_front() {
556        match current {
557            Value::Object(obj) => {
558                for (_, v) in obj.iter_mut() {
559                    if let Value::Array(hash_arr) = v {
560                        if !hash_arr.is_empty() && hash_arr.iter().all(|x| x.is_number()) {
561                            // Pre-allocate with capacity to avoid reallocations
562                            let mut hash_bytes = Vec::with_capacity(hash_arr.len());
563                            // Convert numbers to bytes safely
564                            for n in hash_arr.iter() {
565                                if let Some(num) = n.as_u64() {
566                                    hash_bytes.push(num as u8);
567                                }
568                            }
569                            *v = Value::String(encode(&hash_bytes));
570                        }
571                    }
572                    queue.push_back(v);
573                }
574            }
575            Value::Array(arr) => {
576                for v in arr.iter_mut() {
577                    queue.push_back(v);
578                }
579            }
580            _ => {}
581        }
582    }
583    value
584}
585
#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used)]

    use std::io::Cursor;

    use super::*;

    #[test]
    fn progress_callback_is_called() {
        // Hashing a small buffer must invoke the progress callback at
        // least once (the callback runs once per hash range).
        let data = vec![0u8; 64];
        let mut called = false;
        let mut reader = Cursor::new(&data);
        let mut cb = |_step, _total| {
            called = true;
            Ok(())
        };
        hash_stream_by_alg_with_progress("sha256", &mut reader, None, true, Some(&mut cb)).unwrap();
        assert!(called, "progress callback should have been invoked");
    }

    #[test]
    fn progress_callback_can_cancel() {
        // Returning an error from the callback must abort hashing and
        // surface that same error to the caller.
        let data = vec![0u8; 64];
        let mut reader = Cursor::new(&data);
        let mut cb = |_step, _total| Err(Error::OperationCancelled);
        let result =
            hash_stream_by_alg_with_progress("sha256", &mut reader, None, true, Some(&mut cb));
        assert!(
            matches!(result, Err(Error::OperationCancelled)),
            "expected OperationCancelled, got {result:?}"
        );
    }
}
619}