Skip to main content

copybook_determinism/
lib.rs

1#![cfg_attr(not(test), deny(clippy::unwrap_used, clippy::expect_used))]
2// SPDX-License-Identifier: AGPL-3.0-or-later
3#![allow(clippy::missing_inline_in_public_items)]
//! Determinism primitives for repeatable output validation.
//!
//! This crate has a single responsibility: compare two output byte streams,
//! hash them, and report a bounded list of byte-level differences.
//!
//! Use [`compare_outputs`] to check whether two codec runs produced identical bytes,
//! and [`blake3_hex`] for stable content hashing.
//!
//! # Typical Workflow
//!
//! ```rust,ignore
//! use copybook_determinism::{compare_outputs, DeterminismMode};
//!
//! let result = compare_outputs(DeterminismMode::DecodeOnly, &output_a, &output_b);
//! assert!(result.is_deterministic);
//! ```
20
21use serde::{Deserialize, Serialize};
22
/// Upper bound on reported byte differences when the caller does not supply a limit.
pub const DEFAULT_MAX_DIFFS: usize = 100;

/// Number of characters in a hex-encoded BLAKE3 digest as produced by this crate.
pub const BLAKE3_HEX_LEN: usize = 64;
28
29/// Mode of determinism checking (decode-only, encode-only, or full round-trip).
30///
31/// # Examples
32///
33/// ```
34/// use copybook_determinism::DeterminismMode;
35///
36/// let mode = DeterminismMode::DecodeOnly;
37/// assert_eq!(mode, DeterminismMode::DecodeOnly);
38/// assert_ne!(mode, DeterminismMode::RoundTrip);
39/// ```
40#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
41#[serde(rename_all = "snake_case")]
42pub enum DeterminismMode {
43    /// Check that decoding the same binary data twice produces identical JSON.
44    DecodeOnly,
45    /// Check that encoding the same JSON twice produces identical binary data.
46    EncodeOnly,
47    /// Check that decode→encode→decode produces identical JSON.
48    RoundTrip,
49}
50
51/// Details about a byte difference found during determinism checking.
52#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
53pub struct ByteDiff {
54    /// Byte offset where the difference was found.
55    pub offset: usize,
56    /// Byte value from the first run.
57    pub round1_byte: u8,
58    /// Byte value from the second run.
59    pub round2_byte: u8,
60}
61
62/// Result of a determinism check operation.
63#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
64pub struct DeterminismResult {
65    /// The mode of checking that was performed.
66    pub mode: DeterminismMode,
67    /// BLAKE3 hash of the first run's output.
68    pub round1_hash: String,
69    /// BLAKE3 hash of the second run's output.
70    pub round2_hash: String,
71    /// Whether the two runs produced identical outputs.
72    pub is_deterministic: bool,
73    /// If non-deterministic, details of the byte differences.
74    #[serde(skip_serializing_if = "Option::is_none")]
75    pub byte_differences: Option<Vec<ByteDiff>>,
76}
77
78impl DeterminismResult {
79    /// Returns true if both runs produced identical outputs.
80    #[must_use]
81    #[inline]
82    pub fn passed(&self) -> bool {
83        self.is_deterministic
84    }
85
86    /// Returns the number of byte differences found (0 if deterministic).
87    #[must_use]
88    #[inline]
89    pub fn diff_count(&self) -> usize {
90        self.byte_differences.as_ref().map_or(0, Vec::len)
91    }
92}
93
94/// Compute a lowercase hex BLAKE3 hash for a byte slice.
95#[must_use]
96#[inline]
97pub fn blake3_hex(data: &[u8]) -> String {
98    blake3::hash(data).to_hex().to_string()
99}
100
101/// Compare two byte slices and build a determinism result with default diff limit.
102#[must_use]
103#[inline]
104pub fn compare_outputs(mode: DeterminismMode, round1: &[u8], round2: &[u8]) -> DeterminismResult {
105    compare_outputs_with_limit(mode, round1, round2, DEFAULT_MAX_DIFFS)
106}
107
108/// Compare two byte slices and build a determinism result with an explicit diff limit.
109#[must_use]
110pub fn compare_outputs_with_limit(
111    mode: DeterminismMode,
112    round1: &[u8],
113    round2: &[u8],
114    max_diffs: usize,
115) -> DeterminismResult {
116    let hash1 = blake3::hash(round1);
117    let hash2 = blake3::hash(round2);
118    let is_deterministic = hash1 == hash2;
119
120    DeterminismResult {
121        mode,
122        round1_hash: hash1.to_hex().to_string(),
123        round2_hash: hash2.to_hex().to_string(),
124        is_deterministic,
125        byte_differences: if is_deterministic {
126            None
127        } else {
128            Some(find_byte_differences_with_limit(round1, round2, max_diffs))
129        },
130    }
131}
132
133/// Find byte-level differences between two slices using [`DEFAULT_MAX_DIFFS`] entries at most.
134#[must_use]
135#[inline]
136pub fn find_byte_differences(round1: &[u8], round2: &[u8]) -> Vec<ByteDiff> {
137    find_byte_differences_with_limit(round1, round2, DEFAULT_MAX_DIFFS)
138}
139
140/// Find byte-level differences between two slices with an explicit limit.
141#[must_use]
142pub fn find_byte_differences_with_limit(
143    round1: &[u8],
144    round2: &[u8],
145    max_diffs: usize,
146) -> Vec<ByteDiff> {
147    if max_diffs == 0 {
148        return Vec::new();
149    }
150
151    let min_len = round1.len().min(round2.len());
152    let max_len = round1.len().max(round2.len());
153    let mut diffs = Vec::with_capacity(max_diffs.min(max_len));
154
155    for (offset, (&byte_a, &byte_b)) in round1.iter().zip(round2.iter()).enumerate() {
156        if byte_a != byte_b {
157            diffs.push(ByteDiff {
158                offset,
159                round1_byte: byte_a,
160                round2_byte: byte_b,
161            });
162            if diffs.len() >= max_diffs {
163                return diffs;
164            }
165        }
166    }
167
168    if round1.len() != round2.len() {
169        for offset in min_len..max_len {
170            let byte_a = round1.get(offset).copied().unwrap_or(0);
171            let byte_b = round2.get(offset).copied().unwrap_or(0);
172            diffs.push(ByteDiff {
173                offset,
174                round1_byte: byte_a,
175                round2_byte: byte_b,
176            });
177            if diffs.len() >= max_diffs {
178                return diffs;
179            }
180        }
181    }
182
183    diffs
184}
185
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    /// Same-length inputs: every differing offset is reported with both byte values.
    #[test]
    fn diff_bytes_reports_mismatches() {
        let found = find_byte_differences(b"ABCDEF", b"ABxDEy");

        let expected = vec![
            ByteDiff {
                offset: 2,
                round1_byte: b'C',
                round2_byte: b'x',
            },
            ByteDiff {
                offset: 5,
                round1_byte: b'F',
                round2_byte: b'y',
            },
        ];
        assert_eq!(found, expected);
    }

    /// Offsets past the end of the shorter input are reported with 0 for the missing side.
    #[test]
    fn diff_bytes_handles_length_mismatch() {
        let found = find_byte_differences(b"ABC", b"ABCDE");

        let expected = vec![
            ByteDiff {
                offset: 3,
                round1_byte: 0,
                round2_byte: b'D',
            },
            ByteDiff {
                offset: 4,
                round1_byte: 0,
                round2_byte: b'E',
            },
        ];
        assert_eq!(found, expected);
    }

    /// An explicit limit truncates the list even when every byte differs.
    #[test]
    fn diff_bytes_limits_to_max() {
        let zeros = vec![0u8; 200];
        let ones = vec![1u8; 200];

        let found = find_byte_differences_with_limit(&zeros, &ones, 37);

        assert_eq!(found.len(), 37);
    }

    #[test]
    fn compare_outputs_marks_identical_bytes_deterministic() {
        let payload = b"{\"k\":\"v\"}";

        let outcome = compare_outputs(DeterminismMode::DecodeOnly, payload, payload);

        assert!(outcome.is_deterministic);
        assert!(outcome.byte_differences.is_none());
        assert_eq!(outcome.round1_hash, outcome.round2_hash);
        assert_eq!(outcome.round1_hash.len(), BLAKE3_HEX_LEN);
        assert!(outcome.round1_hash.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn compare_outputs_reports_non_determinism() {
        let outcome = compare_outputs(DeterminismMode::EncodeOnly, b"ABC", b"ABX");

        assert!(!outcome.is_deterministic);
        assert_ne!(outcome.round1_hash, outcome.round2_hash);
        assert_eq!(outcome.diff_count(), 1);
        let diffs = outcome.byte_differences.as_ref().unwrap();
        assert_eq!(diffs[0].offset, 2);
    }

    #[test]
    fn blake3_hex_empty_input_produces_valid_hash() {
        let digest = blake3_hex(b"");

        assert_eq!(digest.len(), BLAKE3_HEX_LEN);
        assert!(digest.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[test]
    fn blake3_hex_different_inputs_produce_different_hashes() {
        let hello = blake3_hex(b"hello");
        let world = blake3_hex(b"world");

        assert_ne!(hello, world);
    }

    #[test]
    fn blake3_hex_identical_inputs_produce_same_hash() {
        let first = blake3_hex(b"determinism");
        let second = blake3_hex(b"determinism");

        assert_eq!(first, second);
    }

    #[test]
    fn compare_outputs_empty_slices_are_deterministic() {
        let outcome = compare_outputs(DeterminismMode::DecodeOnly, b"", b"");

        assert!(outcome.passed());
        assert_eq!(outcome.diff_count(), 0);
        assert!(outcome.byte_differences.is_none());
        assert_eq!(outcome.round1_hash, outcome.round2_hash);
    }

    #[test]
    fn compare_outputs_empty_vs_non_empty_is_non_deterministic() {
        let outcome = compare_outputs(DeterminismMode::EncodeOnly, b"", b"X");

        assert!(!outcome.passed());
        assert_eq!(outcome.diff_count(), 1);
        let diffs = outcome.byte_differences.as_ref().unwrap();
        assert_eq!(diffs[0].offset, 0);
        assert_eq!(diffs[0].round1_byte, 0);
        assert_eq!(diffs[0].round2_byte, b'X');
    }

    #[test]
    fn compare_outputs_round_trip_mode_sets_mode_field() {
        let outcome = compare_outputs(DeterminismMode::RoundTrip, b"ABC", b"ABC");

        assert_eq!(outcome.mode, DeterminismMode::RoundTrip);
        assert!(outcome.passed());
    }

    #[test]
    fn compare_outputs_with_limit_caps_reported_diffs() {
        let zeros = vec![0u8; 50];
        let ones = vec![1u8; 50];

        let outcome = compare_outputs_with_limit(DeterminismMode::DecodeOnly, &zeros, &ones, 5);

        assert!(!outcome.passed());
        assert_eq!(outcome.diff_count(), 5);
    }

    #[test]
    fn find_byte_differences_identical_inputs_returns_empty() {
        let payload = b"identical bytes";

        let found = find_byte_differences(payload, payload);

        assert!(found.is_empty());
    }

    #[test]
    fn find_byte_differences_with_limit_zero_returns_empty() {
        let found = find_byte_differences_with_limit(b"AAA", b"BBB", 0);

        assert!(found.is_empty());
    }

    /// A result without byte differences survives a JSON round trip unchanged.
    #[test]
    fn determinism_result_serde_round_trip() {
        let original = DeterminismResult {
            mode: DeterminismMode::EncodeOnly,
            round1_hash: blake3_hex(b"test"),
            round2_hash: blake3_hex(b"test"),
            is_deterministic: true,
            byte_differences: None,
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: DeterminismResult = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded, original);
    }

    proptest! {
        #[test]
        fn prop_identical_inputs_always_deterministic(bytes in prop::collection::vec(any::<u8>(), 0..512)) {
            let outcome = compare_outputs(DeterminismMode::RoundTrip, &bytes, &bytes);
            prop_assert!(outcome.passed());
            prop_assert_eq!(outcome.diff_count(), 0);
        }

        #[test]
        fn prop_diff_count_never_exceeds_limit(
            left in prop::collection::vec(any::<u8>(), 0..256),
            right in prop::collection::vec(any::<u8>(), 0..256),
            cap in 0usize..64usize
        ) {
            let found = find_byte_differences_with_limit(&left, &right, cap);
            prop_assert!(found.len() <= cap);
        }

        #[test]
        fn prop_hash_is_stable(bytes in prop::collection::vec(any::<u8>(), 0..512)) {
            let first = blake3_hex(&bytes);
            let second = blake3_hex(&bytes);
            prop_assert_eq!(first.as_str(), second.as_str());
            prop_assert_eq!(first.len(), BLAKE3_HEX_LEN);
        }
    }
}