gmcrypto-core 0.7.0

Constant-time-designed pure-Rust SM2/SM3 primitives (no_std + alloc) with an in-CI dudect timing-leak regression harness
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
//! Streaming SM4-CBC encrypt / decrypt (v0.3 W5).
//!
//! Single-shot v0.2 [`super::mode_cbc::encrypt`] / [`super::mode_cbc::decrypt`]
//! still ship unchanged; this module adds an `update`/`finalize` shape
//! for callers who can't materialize the full plaintext / ciphertext
//! up front.
//!
//! # Equivalence with v0.2 single-shot
//!
//! For any plaintext `M` partitioned into chunks `M = c_0 || c_1 || ...
//! || c_n`, the streaming encryptor's concatenated output equals
//! `super::mode_cbc::encrypt(key, iv, M)` byte-for-byte. Same goes
//! for the decryptor.
//!
//! # Padding-oracle posture
//!
//! Same as v0.2's single-shot decrypt — see the
//! [`super::mode_cbc`] module-doc. Wrap with HMAC-SM3 + encrypt-then-
//! MAC if you need integrity in the presence of network attackers.
//! The streaming decryptor's PKCS#7 strip on `finalize` applies the
//! same constant-time scan as v0.2, through a private copy of that
//! helper kept in this module (so the v0.2 helper can stay non-public).
//!
//! # Streaming decrypt buffer-back-by-one rule
//!
//! [`Sm4CbcDecryptor::update`] holds the **most recent decrypted
//! block** back from emission so that `finalize` can apply PKCS#7
//! strip to it — even on a chunked-input call where the boundary
//! between "last block" and "not last block" is only known at
//! `finalize` time. This avoids an early-emit padding oracle: the
//! caller sees plaintext bytes only after `finalize` confirms the
//! overall structure is consistent.
//!
//! # Failure-mode invariant
//!
//! [`Sm4CbcDecryptor::finalize`] returns `Option<Vec<u8>>` — `None`
//! on any decrypt-side failure (length not multiple of 16, invalid
//! PKCS#7). Single uninformative shape per `CLAUDE.md`.

use crate::sm4::cipher::{BLOCK_SIZE, KEY_SIZE, Sm4Cipher};
use alloc::vec::Vec;
use subtle::{ConditionallySelectable, ConstantTimeEq, ConstantTimeGreater};

/// Streaming SM4-CBC encryptor with PKCS#7 padding.
///
/// Construct with `new(&key, &iv)`, feed plaintext via `update`,
/// finalize with `finalize` (returns the accumulated ciphertext as a
/// `Vec<u8>` — the full ciphertext unless part of it was already
/// drained through the hidden FFI helper `take_output`). The IV must
/// be **caller-supplied unpredictable** per NIST SP 800-38A
/// Appendix C — same contract as [`super::mode_cbc::encrypt`].
///
/// `update` may be called any number of times with arbitrary chunk
/// sizes. `finalize` must be called exactly once; after `finalize`
/// the instance is consumed.
pub struct Sm4CbcEncryptor {
    /// SM4 block cipher instance, keyed in `new`.
    cipher: Sm4Cipher,
    /// Most recent ciphertext block (or IV before the first block
    /// is emitted).
    prev: [u8; BLOCK_SIZE],
    /// Buffered partial-block plaintext bytes carried over from
    /// earlier `update` calls (a block may be assembled across
    /// several calls).
    buffer: [u8; BLOCK_SIZE],
    /// Number of valid bytes in `buffer`. Always `< BLOCK_SIZE`
    /// between calls: a filled buffer is encrypted immediately.
    buffer_len: usize,
    /// Accumulated ciphertext so far (full blocks only).
    output: Vec<u8>,
}

impl Sm4CbcEncryptor {
    /// Build a streaming encryptor keyed with `key` whose CBC chain
    /// starts at `iv`. The partial-block buffer and the ciphertext
    /// accumulator both start out empty.
    ///
    /// The IV must be CSPRNG-derived per the SM4-CBC IV contract.
    #[must_use]
    pub fn new(key: &[u8; KEY_SIZE], iv: &[u8; BLOCK_SIZE]) -> Self {
        let cipher = Sm4Cipher::new(key);
        let chain_start = *iv;
        Self {
            cipher,
            prev: chain_start,
            buffer: [0; BLOCK_SIZE],
            buffer_len: 0,
            output: Vec::new(),
        }
    }

    /// Feed more plaintext into the stream.
    ///
    /// Every completed 16-byte block is encrypted and appended to the
    /// internal ciphertext accumulator right away. Bytes that do not
    /// complete a block stay buffered until a later `update` supplies
    /// the rest, or until `finalize` pads them.
    pub fn update(&mut self, mut data: &[u8]) {
        // A partial block is pending: complete it before touching
        // whole blocks from the input.
        if self.buffer_len != 0 {
            let filled = self.buffer_len;
            let take = (BLOCK_SIZE - filled).min(data.len());
            let (head, rest) = data.split_at(take);
            self.buffer[filled..filled + take].copy_from_slice(head);
            self.buffer_len = filled + take;
            data = rest;
            if self.buffer_len == BLOCK_SIZE {
                let full = self.buffer;
                self.encrypt_one(&full);
                self.buffer_len = 0;
            }
        }

        // Encrypt every whole block available in the remaining input.
        let mut blocks = data.chunks_exact(BLOCK_SIZE);
        for chunk in blocks.by_ref() {
            let mut block = [0u8; BLOCK_SIZE];
            block.copy_from_slice(chunk);
            self.encrypt_one(&block);
        }

        // Whatever is left is shorter than a block; park it.
        let tail = blocks.remainder();
        if !tail.is_empty() {
            self.buffer[..tail.len()].copy_from_slice(tail);
            self.buffer_len = tail.len();
        }
    }

    /// Drain the accumulated ciphertext, leaving the encryptor ready
    /// for further `update` calls. The Rust streaming API has no
    /// inherent reason for this method — `finalize` consumes the
    /// encryptor and returns the full accumulation. This helper exists
    /// for the `gmcrypto-c` FFI shim's streaming pattern (v0.5 W1)
    /// which emits ciphertext incrementally as `update` produces full
    /// blocks.
    ///
    /// Only the ciphertext accumulator is emptied; the chaining block
    /// and any buffered partial plaintext are left untouched. A later
    /// `finalize` therefore returns only the ciphertext produced
    /// after the most recent drain.
    ///
    /// **Not SemVer-stable.** Same posture as
    /// [`crate::sm2::sign_raw_with_id`]: `#[doc(hidden)] pub` for FFI-
    /// shim consumption; its signature may change in any v0.5+ minor.
    #[doc(hidden)]
    pub fn take_output(&mut self) -> Vec<u8> {
        // Swap in an empty Vec and hand the accumulated bytes to the
        // caller without copying.
        core::mem::take(&mut self.output)
    }

    /// Pad the buffered tail with PKCS#7, encrypt that last block and
    /// return the accumulated ciphertext. Consumes the encryptor.
    ///
    /// The pad value is `BLOCK_SIZE - buffer_len`, repeated that many
    /// times. With an empty buffer this yields one whole block of
    /// `0x10` (RFC 5652 §6.3), so a padding block is always emitted.
    #[must_use]
    pub fn finalize(mut self) -> Vec<u8> {
        let filled = self.buffer_len;
        // The pad length is at most BLOCK_SIZE, so it fits in a u8.
        #[allow(clippy::cast_possible_truncation)]
        let pad_byte = (BLOCK_SIZE - filled) as u8;
        self.buffer[filled..].fill(pad_byte);

        let last_block = self.buffer;
        self.encrypt_one(&last_block);
        self.output
    }

    /// CBC-encrypt a single block: XOR the plaintext with the previous
    /// ciphertext block (or the IV), run the block cipher, append the
    /// result to `output` and remember it as the next chaining value.
    fn encrypt_one(&mut self, plaintext_block: &[u8; BLOCK_SIZE]) {
        // Start from the chaining value and fold the plaintext in;
        // XOR is symmetric, so the operand order is irrelevant.
        let mut chained = self.prev;
        for (acc, pt_byte) in chained.iter_mut().zip(plaintext_block) {
            *acc ^= *pt_byte;
        }
        self.cipher.encrypt_block(&mut chained);
        self.output.extend_from_slice(&chained);
        self.prev = chained;
    }
}

/// Streaming SM4-CBC decryptor with PKCS#7 strip.
///
/// Construct with `new(&key, &iv)`, feed ciphertext via `update`,
/// finalize with `finalize` (returns `Option<Vec<u8>>`).
///
/// **Buffer-back-by-one:** `update` decrypts every full 16-byte
/// block but holds the **most recent decrypted block** back from
/// emission until `finalize` confirms it is the last block. This
/// keeps the PKCS#7 strip uniform — no early-emit padding-oracle
/// surface during the streaming phase. Through the documented API,
/// callers see plaintext only after `finalize` validates the
/// trailing-block padding; the hidden FFI helper `take_output` can
/// hand out earlier blocks, but never the held-back one.
///
/// Same single-`None` failure-mode posture as the v0.2 single-shot
/// [`super::mode_cbc::decrypt`].
pub struct Sm4CbcDecryptor {
    /// SM4 block cipher instance, keyed in `new`.
    cipher: Sm4Cipher,
    /// Most recent ciphertext block (or IV before the first block).
    prev: [u8; BLOCK_SIZE],
    /// Buffered partial-block ciphertext bytes carried over from
    /// earlier `update` calls.
    buffer: [u8; BLOCK_SIZE],
    /// Number of valid bytes in `buffer`; must be zero at `finalize`
    /// time for the stream to be accepted.
    buffer_len: usize,
    /// Accumulated plaintext from "definitely-not-the-last" blocks.
    output: Vec<u8>,
    /// The **last decrypted block** held back from emission. None if
    /// no full block has been processed yet.
    held_back: Option<[u8; BLOCK_SIZE]>,
}

impl Sm4CbcDecryptor {
    /// Build a streaming decryptor keyed with `key` whose CBC chain
    /// starts at `iv`. No ciphertext is buffered, no plaintext is
    /// accumulated and no block is held back yet.
    #[must_use]
    pub fn new(key: &[u8; KEY_SIZE], iv: &[u8; BLOCK_SIZE]) -> Self {
        let cipher = Sm4Cipher::new(key);
        let chain_start = *iv;
        Self {
            cipher,
            prev: chain_start,
            buffer: [0; BLOCK_SIZE],
            buffer_len: 0,
            output: Vec::new(),
            held_back: None,
        }
    }

    /// Feed more ciphertext into the stream.
    ///
    /// Processing happens in up to four stages:
    ///
    /// 1. a pending partial block is topped up and, once complete,
    ///    decrypted on its own;
    /// 2. under the `sm4-bitsliced-simd` feature (v0.6 W6), the input
    ///    is consumed in groups of `SIMD_BATCH` blocks (8 on `x86_64`,
    ///    4 on `aarch64`, 1 elsewhere) through the batched decrypt
    ///    path;
    /// 3. remaining whole blocks go through [`Self::decrypt_one`];
    /// 4. a trailing fragment shorter than a block is buffered.
    ///
    /// The batched path is meant to be byte-identical to the
    /// per-block path; only the internal loop shape differs, and no
    /// public surface is added. In every path the most recently
    /// decrypted block stays held back for `finalize`.
    pub fn update(&mut self, mut data: &[u8]) {
        // Stage 1: complete a pending partial ciphertext block.
        if self.buffer_len != 0 {
            let filled = self.buffer_len;
            let take = (BLOCK_SIZE - filled).min(data.len());
            let (head, rest) = data.split_at(take);
            self.buffer[filled..filled + take].copy_from_slice(head);
            self.buffer_len = filled + take;
            data = rest;
            if self.buffer_len == BLOCK_SIZE {
                let full = self.buffer;
                self.decrypt_one(&full);
                self.buffer_len = 0;
            }
        }

        // Stage 2 (v0.6 W6): SIMD batch fanout. Each group of
        // `SIMD_BATCH` contiguous blocks is copied into a fixed-size
        // array and handed to `decrypt_batch`, which performs the CBC
        // chaining and maintains the held-back block. Input that does
        // not fill a whole batch falls through to stage 3.
        #[cfg(feature = "sm4-bitsliced-simd")]
        {
            use super::cipher::SIMD_BATCH;
            let batch_bytes = SIMD_BATCH * BLOCK_SIZE;
            let mut batches = data.chunks_exact(batch_bytes);
            for raw in batches.by_ref() {
                let mut batch = [[0u8; BLOCK_SIZE]; SIMD_BATCH];
                for (slot, src) in batch.iter_mut().zip(raw.chunks_exact(BLOCK_SIZE)) {
                    slot.copy_from_slice(src);
                }
                self.decrypt_batch(&batch);
            }
            data = batches.remainder();
        }

        // Stage 3: whole blocks, one at a time.
        let mut blocks = data.chunks_exact(BLOCK_SIZE);
        for chunk in blocks.by_ref() {
            let mut block = [0u8; BLOCK_SIZE];
            block.copy_from_slice(chunk);
            self.decrypt_one(&block);
        }

        // Stage 4: keep the sub-block remainder for the next call.
        let tail = blocks.remainder();
        if !tail.is_empty() {
            self.buffer[..tail.len()].copy_from_slice(tail);
            self.buffer_len = tail.len();
        }
    }

    /// Drain the emitted plaintext so far (i.e. all decrypted blocks
    /// EXCEPT the held-back final-candidate block). Same FFI-helper
    /// posture as [`Sm4CbcEncryptor::take_output`]: `#[doc(hidden)] pub`
    /// for the v0.5 W1 streaming FFI; not SemVer-stable.
    ///
    /// **Note**: the held-back block is *not* drained — the buffer-
    /// back-by-one invariant is preserved across this call. The
    /// drained bytes have not been covered by any padding check yet:
    /// if `finalize` later returns `None`, the caller is already
    /// holding plaintext from a stream that was ultimately rejected.
    #[doc(hidden)]
    pub fn take_output(&mut self) -> Vec<u8> {
        // Only `output` is swapped out; `held_back`, `prev` and the
        // partial-block buffer are left as they are.
        core::mem::take(&mut self.output)
    }

    /// Validate and remove the PKCS#7 padding of the held-back final
    /// block, append what remains to the accumulated plaintext and
    /// return it. Consumes the decryptor.
    ///
    /// Returns `None`, with no further detail, when:
    /// - a partial ciphertext block is still buffered (total length
    ///   is not a multiple of 16);
    /// - no full block was ever decrypted;
    /// - the padding of the final block is rejected.
    #[must_use]
    pub fn finalize(mut self) -> Option<Vec<u8>> {
        // Accept only a block-aligned stream that produced at least
        // one decrypted block.
        let final_block = match (self.buffer_len, self.held_back) {
            (0, Some(block)) => block,
            _ => return None,
        };
        let keep = strip_pkcs7_block(&final_block)?;
        self.output.extend_from_slice(&final_block[..keep]);
        Some(self.output)
    }

    /// CBC-decrypt a single ciphertext block and rotate it through
    /// the held-back slot.
    ///
    /// The block is run through the cipher and XOR-ed with the
    /// previous ciphertext block (or the IV). The ciphertext then
    /// becomes the new chaining value. The block that was held back
    /// so far is now known not to be the last one, so it moves to
    /// `output`; the fresh plaintext takes its place.
    fn decrypt_one(&mut self, ciphertext_block: &[u8; BLOCK_SIZE]) {
        let mut plain = *ciphertext_block;
        self.cipher.decrypt_block(&mut plain);
        for (out, chain) in plain.iter_mut().zip(self.prev.iter()) {
            *out ^= *chain;
        }
        self.prev = *ciphertext_block;

        // Install the new candidate-final block and release the old one.
        if let Some(confirmed) = self.held_back.replace(plain) {
            self.output.extend_from_slice(&confirmed);
        }
    }

    /// v0.6 W6 — decrypt one full batch of `SIMD_BATCH` ciphertext
    /// blocks in a single cipher call while keeping the
    /// buffer-back-by-one invariant.
    ///
    /// Effect on the decryptor state:
    ///
    /// ```text
    /// On entry:
    ///   self.prev      = ciphertext block preceding this batch (or the IV)
    ///   self.held_back = Some(plaintext of that preceding block),
    ///                    or None if no block has been decrypted yet
    ///
    /// With N = SIMD_BATCH and ct = ct_blocks:
    ///   pt[0..N]   = batched_decrypt(ct[0..N])
    ///   pt[0]     ^= self.prev
    ///   pt[i]     ^= ct[i-1]                 for i in 1..N
    ///   output    += old held_back (if any)
    ///   output    += pt[0..N-1]              (confirmed-not-last)
    ///   held_back  = Some(pt[N-1])           (PKCS#7 candidate)
    ///   prev       = ct[N-1]
    /// ```
    ///
    /// Provided `decrypt_blocks_simd` agrees block-for-block with
    /// `decrypt_block`, the resulting state is the one N consecutive
    /// [`Self::decrypt_one`] calls would leave behind.
    #[cfg(feature = "sm4-bitsliced-simd")]
    fn decrypt_batch(&mut self, ct_blocks: &[[u8; BLOCK_SIZE]; super::cipher::SIMD_BATCH]) {
        use super::cipher::SIMD_BATCH;

        // Decrypt a working copy; `ct_blocks` itself stays intact and
        // serves as the chaining input below.
        let mut plain = *ct_blocks;
        self.cipher.decrypt_blocks_simd(&mut plain);

        // Chaining inputs in order: prev, ct[0], ct[1], ..., ct[N-2].
        // `zip` stops after N pairs, so the trailing ct[N-1] is unused.
        let chain_inputs = core::iter::once(&self.prev).chain(ct_blocks.iter());
        for (pt, chain) in plain.iter_mut().zip(chain_inputs) {
            for (byte, mask) in pt.iter_mut().zip(chain.iter()) {
                *byte ^= *mask;
            }
        }

        let final_idx = SIMD_BATCH - 1;

        // The previously held-back block is now known not to be last:
        // emit it first, then every block of this batch but the final
        // one, which becomes the new PKCS#7 candidate.
        let confirmed = self.held_back.replace(plain[final_idx]);
        if let Some(block) = confirmed {
            self.output.extend_from_slice(&block);
        }
        for pt in &plain[..final_idx] {
            self.output.extend_from_slice(pt);
        }

        // The next block chains from the last ciphertext of this batch.
        self.prev = ct_blocks[final_idx];
    }
}

/// Constant-time PKCS#7 strip on a 16-byte block. Returns the byte
/// count that should be retained (`BLOCK_SIZE - pad_len`) on success,
/// `None` on any malformed padding.
///
/// Same scan logic as [`super::mode_cbc::decrypt`]'s helper —
/// re-implemented here to avoid making the v0.2 helper public, but
/// byte-identical in behavior.
///
/// The scan always visits all `BLOCK_SIZE` bytes and combines the
/// individual checks with `subtle` mask operations; the only
/// data-dependent branch is the final accept/reject decision.
fn strip_pkcs7_block(block: &[u8; BLOCK_SIZE]) -> Option<usize> {
    // PKCS#7 stores the pad length in the last byte of the block.
    let last = block[BLOCK_SIZE - 1];
    // Range check `1 <= last <= BLOCK_SIZE`, computed as masks rather
    // than with early returns.
    let pad_nonzero = !last.ct_eq(&0u8);
    #[allow(clippy::cast_possible_truncation)]
    let pad_le_block = !last.ct_gt(&(BLOCK_SIZE as u8));
    let pad_in_range = pad_nonzero & pad_le_block;

    // OR together `byte ^ last` over every position that belongs to
    // the claimed padding; `acc` stays zero only if all of those
    // bytes equal `last`.
    let mut acc: u8 = 0;
    for (i, byte) in block.iter().enumerate() {
        // Distance of this byte from the end of the block, counting
        // the final byte as 1 and the first byte as BLOCK_SIZE.
        #[allow(clippy::cast_possible_truncation)]
        let pos_from_end = (BLOCK_SIZE - i) as u8;
        // The byte is inside the padding when `pos_from_end <= last`.
        let in_padding = !pos_from_end.ct_gt(&last);
        let diff = *byte ^ last;
        // Contribute `diff` for padding positions, zero elsewhere
        // (assumes `conditional_select(a, b, c)` yields `b` when `c`
        // is set, per the `subtle` contract).
        let masked = u8::conditional_select(&0u8, &diff, in_padding);
        acc |= masked;
    }
    let acc_zero = acc.ct_eq(&0u8);
    let valid = pad_in_range & acc_zero;
    // Single branch on the combined verdict. When `valid` holds,
    // `last` is within 1..=BLOCK_SIZE, so the subtraction cannot
    // underflow.
    if bool::from(valid) {
        Some(BLOCK_SIZE - last as usize)
    } else {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::sm4::mode_cbc;

    /// One `update` call followed by `finalize` must yield exactly
    /// the ciphertext of the single-shot encrypt.
    #[test]
    fn encrypt_single_chunk_matches_v02() {
        let (key, iv) = ([0x42u8; KEY_SIZE], [0x33u8; BLOCK_SIZE]);
        let message = b"streaming round trip";

        let oneshot_ct = mode_cbc::encrypt(&key, &iv, message);

        let mut encryptor = Sm4CbcEncryptor::new(&key, &iv);
        encryptor.update(message);
        let stream_ct = encryptor.finalize();

        assert_eq!(stream_ct, oneshot_ct);
    }

    /// However the plaintext is sliced across `update` calls, the
    /// streamed ciphertext must equal the single-shot ciphertext.
    #[test]
    fn encrypt_chunked_matches_v02() {
        let (key, iv) = ([0x42u8; KEY_SIZE], [0x33u8; BLOCK_SIZE]);
        // 100 bytes: several whole blocks plus a partial tail.
        let pt: Vec<u8> = (0..100u8).collect();
        // The reference ciphertext does not depend on the chunking.
        let oneshot_ct = mode_cbc::encrypt(&key, &iv, &pt);

        // Sub-block, block-aligned, straddling and whole-input sizes.
        for chunk_size in [1usize, 7, 16, 17, 31, 32, 100] {
            let mut enc = Sm4CbcEncryptor::new(&key, &iv);
            pt.chunks(chunk_size).for_each(|piece| enc.update(piece));
            let stream_ct = enc.finalize();
            assert_eq!(stream_ct, oneshot_ct, "chunk_size={chunk_size}");
        }
    }

    /// Streaming encrypt followed by streaming decrypt returns the
    /// original plaintext, for plaintext lengths around the block
    /// boundaries and for several ciphertext chunkings.
    #[test]
    fn streaming_round_trip() {
        let (key, iv) = ([0x42u8; KEY_SIZE], [0x33u8; BLOCK_SIZE]);

        for len in [0usize, 1, 15, 16, 17, 31, 32, 33, 100, 256] {
            #[allow(clippy::cast_possible_truncation)]
            let pt: Vec<u8> = (0..len).map(|i| (i as u8).wrapping_mul(13)).collect();

            let ct = {
                let mut enc = Sm4CbcEncryptor::new(&key, &iv);
                enc.update(&pt);
                enc.finalize()
            };

            // The last entry feeds the whole ciphertext in one call;
            // `max(1)` keeps `chunks` from seeing a zero size.
            let whole = ct.len().max(1);
            for chunk_size in [1usize, 7, 16, 17, 31, 32, whole] {
                let mut dec = Sm4CbcDecryptor::new(&key, &iv);
                ct.chunks(chunk_size).for_each(|piece| dec.update(piece));
                let recovered = dec.finalize().expect("decrypt");
                assert_eq!(recovered, pt, "len={len} chunk_size={chunk_size}");
            }
        }
    }

    /// A ciphertext whose length is not a multiple of 16 is rejected
    /// at `finalize`.
    #[test]
    fn decrypt_rejects_truncated() {
        let (key, iv) = ([0x42u8; KEY_SIZE], [0x33u8; BLOCK_SIZE]);
        // 31 bytes: one whole block, then 15 bytes left in the buffer.
        let truncated = [0xABu8; 31];

        let mut dec = Sm4CbcDecryptor::new(&key, &iv);
        dec.update(&truncated);
        let outcome = dec.finalize();

        assert!(outcome.is_none());
    }

    /// Finalizing without having fed a single full block is rejected:
    /// there is no held-back block to strip padding from.
    #[test]
    fn decrypt_rejects_empty() {
        let (key, iv) = ([0x42u8; KEY_SIZE], [0x33u8; BLOCK_SIZE]);
        let outcome = Sm4CbcDecryptor::new(&key, &iv).finalize();
        assert!(outcome.is_none());
    }

    /// Decrypt rejects bad padding, exercised with two tamperings of
    /// the same ciphertext.
    ///
    /// 1. Flip a bit in the final ciphertext block. The whole final
    ///    plaintext block decrypts to unrelated bytes, so rejection
    ///    relies on those bytes not forming valid padding (true for
    ///    this fixed key / IV / message).
    /// 2. Flip the low bit of the last byte of the *penultimate*
    ///    ciphertext block. CBC chaining flips exactly that bit in
    ///    the final plaintext byte, turning pad value `p` into
    ///    `p ^ 1`, while the remaining pad bytes still read `p`.
    ///    For every valid `p` in `1..=16` this is malformed, so the
    ///    rejection does not depend on what the cipher outputs.
    #[test]
    fn decrypt_rejects_bad_padding() {
        let key = [0x42u8; KEY_SIZE];
        let iv = [0x33u8; BLOCK_SIZE];
        let pt = b"this is a test message that spans multiple blocks";
        let mut enc = Sm4CbcEncryptor::new(&key, &iv);
        enc.update(pt);
        let ct = enc.finalize();

        // Case 1: corrupt the final ciphertext block itself.
        let mut garbled_final = ct.clone();
        let last = garbled_final.len() - 1;
        garbled_final[last] ^= 0x01;
        let mut dec = Sm4CbcDecryptor::new(&key, &iv);
        dec.update(&garbled_final);
        assert!(dec.finalize().is_none(), "tampered final block");

        // Case 2: corrupt the chaining input of the final block. The
        // ciphertext spans several blocks, so the index is in range.
        let mut flipped_pad = ct.clone();
        let chain_idx = flipped_pad.len() - BLOCK_SIZE - 1;
        flipped_pad[chain_idx] ^= 0x01;
        let mut dec = Sm4CbcDecryptor::new(&key, &iv);
        dec.update(&flipped_pad);
        assert!(dec.finalize().is_none(), "tampered pad byte via chaining");
    }

    /// v0.6 W6 — sweep of plaintext sizes around the SIMD batch size.
    ///
    /// Per codex's phase 3 design flag #3: "chunk-boundary tests
    /// around `0, 1, N-1, N, N+1, 2N` blocks, including
    /// `take_output`." The batched path in `update` is only taken
    /// when at least `SIMD_BATCH * BLOCK_SIZE` bytes are available,
    /// so the sizes below land on both sides of that threshold.
    ///
    /// `SIMD_BATCH` is fixed per target at compile time: 8 on
    /// `x86_64`, 4 on `aarch64`, 1 elsewhere (where the sweep
    /// degenerates to single-block processing).
    #[cfg(feature = "sm4-bitsliced-simd")]
    #[test]
    fn cbc_decrypt_simd_batch_boundary_sweep() {
        use super::super::cipher::SIMD_BATCH;
        let (key, iv) = ([0x42u8; KEY_SIZE], [0x33u8; BLOCK_SIZE]);

        // Plaintext block counts. SIMD_BATCH=8 gives
        // 0, 1, 7, 8, 9, 15, 16, 17; SIMD_BATCH=4 gives
        // 0, 1, 3, 4, 5, 7, 8, 9; SIMD_BATCH=1 gives
        // 0, 1, 0, 1, 2, 1, 2, 3 (repetitive but still valid).
        let sweep: [usize; 8] = [
            0,
            1,
            SIMD_BATCH.saturating_sub(1),
            SIMD_BATCH,
            SIMD_BATCH + 1,
            (2 * SIMD_BATCH).saturating_sub(1),
            2 * SIMD_BATCH,
            2 * SIMD_BATCH + 1,
        ];

        for n_blocks in sweep {
            // `n_blocks` whole blocks of plaintext; zero blocks means
            // an empty message, which encrypts to a single padding
            // block of 0x10 bytes.
            let byte_len = n_blocks * BLOCK_SIZE;
            let pt: Vec<u8> = (0..byte_len)
                .map(|i| u8::try_from(i & 0xFF).unwrap_or(0))
                .collect();
            let canonical = mode_cbc::encrypt(&key, &iv, &pt);

            // Feed the whole reference ciphertext in one `update`.
            let mut dec = Sm4CbcDecryptor::new(&key, &iv);
            dec.update(&canonical);
            let recovered = dec.finalize().expect("decrypt");

            assert_eq!(
                recovered, pt,
                "boundary sweep: n_blocks={n_blocks} (SIMD_BATCH={SIMD_BATCH})",
            );
        }
    }

    /// v0.6 W6 — chunked-update sweep. One fixed ciphertext is fed
    /// through `update` in chunks of many different sizes (below a
    /// block, exactly a batch, one past a batch, ...) and must
    /// decrypt to the same plaintext every time. Targets
    /// state-machine bugs in the hand-over between the
    /// partial-block buffer, the batch loop and the single-block
    /// loop.
    #[cfg(feature = "sm4-bitsliced-simd")]
    #[test]
    fn cbc_decrypt_simd_chunked_update_sweep() {
        use super::super::cipher::SIMD_BATCH;
        let (key, iv) = ([0x42u8; KEY_SIZE], [0x33u8; BLOCK_SIZE]);

        // Plaintext is 5 bytes short of `3 * SIMD_BATCH + 1` blocks:
        // it spans more than three batches and ends on a partial
        // block, so the final block carries real padding.
        let total_blocks = 3 * SIMD_BATCH + 1;
        let pt_len = total_blocks * BLOCK_SIZE - 5;
        let pt: Vec<u8> = (0..pt_len)
            .map(|i| u8::try_from((i * 17) & 0xFF).unwrap_or(0))
            .collect();
        let ct = mode_cbc::encrypt(&key, &iv, &pt);

        let batch_bytes = SIMD_BATCH * BLOCK_SIZE;
        let whole = ct.len().max(1);
        let chunk_sizes = [
            1,
            7,
            BLOCK_SIZE,
            BLOCK_SIZE + 1,
            batch_bytes - 1,
            batch_bytes,
            batch_bytes + 1,
            2 * batch_bytes,
            whole,
        ];

        for chunk_size in chunk_sizes {
            let mut dec = Sm4CbcDecryptor::new(&key, &iv);
            ct.chunks(chunk_size).for_each(|piece| dec.update(piece));
            let recovered = dec.finalize().expect("decrypt");
            assert_eq!(
                recovered, pt,
                "chunked update: chunk_size={chunk_size} (batch_bytes={batch_bytes})",
            );
        }
    }

    /// v0.6 W6 — `take_output` interaction with the SIMD batch
    /// path. The FFI-shim helper drains emitted plaintext partway
    /// through a stream; the held-back block must remain held back
    /// across the take.
    ///
    /// Checking only the concatenation is not enough: if
    /// `take_output` wrongly flushed the held-back block, the
    /// concatenation would still equal the plaintext as long as more
    /// ciphertext follows. The test therefore also pins *which*
    /// bytes each call returns: after `SIMD_BATCH + 1` ciphertext
    /// blocks, exactly `SIMD_BATCH` plaintext blocks are drainable
    /// and one is held back.
    #[cfg(feature = "sm4-bitsliced-simd")]
    #[test]
    fn cbc_decrypt_simd_take_output_preserves_held_back() {
        use super::super::cipher::SIMD_BATCH;
        let key = [0x42u8; KEY_SIZE];
        let iv = [0x33u8; BLOCK_SIZE];
        // 2 * SIMD_BATCH + 1 blocks of plaintext to force at least
        // one batch and one straggler-block path.
        let total_blocks = 2 * SIMD_BATCH + 1;
        let pt: Vec<u8> = (0..(total_blocks * BLOCK_SIZE))
            .map(|i| u8::try_from((i ^ 0xA5) & 0xFF).unwrap_or(0))
            .collect();
        let ct = mode_cbc::encrypt(&key, &iv, &pt);

        // Feed in two chunks: first SIMD_BATCH + 1 blocks, then the
        // rest. Drain emitted plaintext via take_output after the
        // first chunk; the held_back block must still be there for
        // finalize to consume.
        let mut dec = Sm4CbcDecryptor::new(&key, &iv);
        let split = (SIMD_BATCH + 1) * BLOCK_SIZE;
        dec.update(&ct[..split]);
        let first_chunk_pt = dec.take_output();

        // Of the SIMD_BATCH + 1 decrypted blocks, the newest one is
        // held back; everything before it must have been emitted.
        let emitted = SIMD_BATCH * BLOCK_SIZE;
        assert_eq!(
            first_chunk_pt.len(),
            emitted,
            "take_output must not drain the held-back block",
        );
        assert_eq!(first_chunk_pt.as_slice(), &pt[..emitted]);

        // Nothing new was decrypted, so a second drain is empty.
        assert!(dec.take_output().is_empty());

        dec.update(&ct[split..]);
        let rest = dec.finalize().expect("decrypt");

        // The remainder starts with the block held back at drain time.
        assert_eq!(rest.as_slice(), &pt[emitted..]);

        // Concatenated result must equal the original plaintext.
        let mut combined = first_chunk_pt;
        combined.extend_from_slice(&rest);
        assert_eq!(combined, pt);
    }

    /// Cross-validation against the single-shot API in both
    /// directions: the streaming decryptor must recover a message
    /// encrypted by the one-shot `encrypt`, and the one-shot
    /// `decrypt` must recover a message encrypted by the streaming
    /// encryptor.
    #[test]
    fn streaming_decrypt_matches_v02_oneshot() {
        let (key, iv) = ([0x42u8; KEY_SIZE], [0x33u8; BLOCK_SIZE]);
        let pt = b"test message for cross-validation";

        // Direction 1: one-shot encrypt, streaming decrypt.
        let canonical = mode_cbc::encrypt(&key, &iv, pt);
        let stream_pt = {
            let mut dec = Sm4CbcDecryptor::new(&key, &iv);
            dec.update(&canonical);
            dec.finalize().expect("streaming decrypt")
        };
        assert_eq!(stream_pt, pt);

        // Direction 2: streaming encrypt, one-shot decrypt.
        let blob = {
            let mut enc = Sm4CbcEncryptor::new(&key, &iv);
            enc.update(pt);
            enc.finalize()
        };
        let recovered = mode_cbc::decrypt(&key, &iv, &blob).expect("oneshot decrypt");
        assert_eq!(recovered, pt);
    }
}