ethrex-trie 17.0.0

Ethereum Merkle Patricia Trie for the ethrex Ethereum execution client
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
use std::{cmp, mem};

use ethrex_rlp::{
    decode::RLPDecode,
    encode::RLPEncode,
    error::RLPDecodeError,
    structs::{Decoder, Encoder},
};

// ── SIMD nibble expansion ────────────────────────────────────────────────────
//
// The hot path during block execution converts 32-byte keccak keys to 64
// nibbles on every trie lookup / insert.  We replace the original flat_map
// iterator chain with a SIMD kernel that processes 16 or 32 bytes per cycle.

/// Expands each byte in `bytes` into two nibbles (high nibble first),
/// writing `bytes.len() * 2` bytes to the uninitialized `output` pointer.
///
/// # Safety
/// `output` must be valid for writes of at least `bytes.len() * 2` bytes.
#[inline]
#[allow(unsafe_code)]
unsafe fn expand_bytes_to_nibbles(bytes: &[u8], output: *mut u8) {
    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: caller guarantees output is writable for bytes.len() * 2 bytes.
        unsafe { expand_bytes_to_nibbles_x86_64(bytes, output) };
        return;
    }
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: caller guarantees output is writable for bytes.len() * 2 bytes.
        unsafe { expand_bytes_to_nibbles_aarch64(bytes, output) };
        return;
    }
    // Portable scalar fallback for other architectures.
    #[allow(unreachable_code)]
    // SAFETY: caller guarantees output is writable for bytes.len() * 2 bytes.
    unsafe {
        expand_bytes_to_nibbles_scalar(bytes, output)
    };
}

#[cfg(target_arch = "x86_64")]
#[allow(unsafe_code)]
#[inline]
unsafe fn expand_bytes_to_nibbles_x86_64(bytes: &[u8], output: *mut u8) {
    use std::arch::x86_64::*;

    let n = bytes.len();
    let mut i = 0usize;

    // --- AVX2 path: 32 bytes → 64 nibbles per iteration ---
    // Enabled only when the compiler has +avx2 in target-feature
    // (set via .cargo/config.toml for the production x86_64-linux target).
    #[cfg(target_feature = "avx2")]
    // SAFETY: AVX2 is enabled at compile time via .cargo/config.toml target-feature.
    unsafe {
        let mask256 = _mm256_set1_epi8(0x0F_u8 as i8);
        while i + 32 <= n {
            // Load 32 input bytes.
            let v = _mm256_loadu_si256(bytes.as_ptr().add(i).cast::<__m256i>());
            // Extract high nibbles: shift each 16-bit word right by 4, then mask.
            let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), mask256);
            // Extract low nibbles.
            let lo = _mm256_and_si256(v, mask256);
            // Interleave hi/lo within each 128-bit lane.
            // _mm256_unpacklo_epi8 → [hi0,lo0,hi1,lo1,…,hi7,lo7 | hi16,lo16,…,hi23,lo23]
            // _mm256_unpackhi_epi8 → [hi8,lo8,…,hi15,lo15       | hi24,lo24,…,hi31,lo31]
            let unpack_lo = _mm256_unpacklo_epi8(hi, lo);
            let unpack_hi = _mm256_unpackhi_epi8(hi, lo);
            // Cross-lane permute to restore sequential byte order:
            //   out_lo = [lane0(unpack_lo), lane0(unpack_hi)] = bytes  0-15 nibbles
            //   out_hi = [lane1(unpack_lo), lane1(unpack_hi)] = bytes 16-31 nibbles
            let out_lo = _mm256_permute2x128_si256::<0x20>(unpack_lo, unpack_hi);
            let out_hi = _mm256_permute2x128_si256::<0x31>(unpack_lo, unpack_hi);
            _mm256_storeu_si256(output.add(i * 2).cast::<__m256i>(), out_lo);
            _mm256_storeu_si256(output.add(i * 2 + 32).cast::<__m256i>(), out_hi);
            i += 32;
        }
    }

    // --- SSE2 path: 16 bytes → 32 nibbles per iteration ---
    // SSE2 is part of the x86_64 baseline; no runtime check needed.
    // SAFETY: SSE2 is always available on x86_64; pointer arithmetic stays within bounds.
    unsafe {
        let mask128 = _mm_set1_epi8(0x0F_u8 as i8);
        while i + 16 <= n {
            let v = _mm_loadu_si128(bytes.as_ptr().add(i).cast::<__m128i>());
            let hi = _mm_and_si128(_mm_srli_epi16(v, 4), mask128);
            let lo = _mm_and_si128(v, mask128);
            let lo16 = _mm_unpacklo_epi8(hi, lo);
            let hi16 = _mm_unpackhi_epi8(hi, lo);
            _mm_storeu_si128(output.add(i * 2).cast::<__m128i>(), lo16);
            _mm_storeu_si128(output.add(i * 2 + 16).cast::<__m128i>(), hi16);
            i += 16;
        }

        // Scalar tail for remaining bytes (0-15).
        while i < n {
            let b = *bytes.get_unchecked(i);
            *output.add(i * 2) = b >> 4;
            *output.add(i * 2 + 1) = b & 0x0F;
            i += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_code)]
#[inline]
unsafe fn expand_bytes_to_nibbles_aarch64(bytes: &[u8], output: *mut u8) {
    use std::arch::aarch64::*;

    let n = bytes.len();
    let mut i = 0usize;

    // NEON is mandatory on aarch64; no runtime detection needed.
    // SAFETY: NEON is always available on aarch64; bounds are maintained by the loop guard.
    unsafe {
        let mask_0f = vdupq_n_u8(0x0F);
        while i + 16 <= n {
            let v = vld1q_u8(bytes.as_ptr().add(i));
            // vshrq_n_u8 shifts each 8-bit lane independently → high nibbles directly.
            let hi = vshrq_n_u8(v, 4);
            let lo = vandq_u8(v, mask_0f);
            // vzip1q_u8 / vzip2q_u8 interleave bytes from two vectors.
            let lo16 = vzip1q_u8(hi, lo); // [hi0,lo0,hi1,lo1,…,hi7,lo7]
            let hi16 = vzip2q_u8(hi, lo); // [hi8,lo8,…,hi15,lo15]
            vst1q_u8(output.add(i * 2), lo16);
            vst1q_u8(output.add(i * 2 + 16), hi16);
            i += 16;
        }

        // Scalar tail.
        while i < n {
            let b = *bytes.get_unchecked(i);
            *output.add(i * 2) = b >> 4;
            *output.add(i * 2 + 1) = b & 0x0F;
            i += 1;
        }
    }
}

#[allow(unsafe_code)]
#[inline]
unsafe fn expand_bytes_to_nibbles_scalar(bytes: &[u8], output: *mut u8) {
    // SAFETY: caller guarantees output is valid for bytes.len() * 2 bytes.
    unsafe {
        for (i, &b) in bytes.iter().enumerate() {
            *output.add(i * 2) = b >> 4;
            *output.add(i * 2 + 1) = b & 0x0F;
        }
    }
}

// ── SIMD nibble packing ──────────────────────────────────────────────────────
//
// pack_nibble_pairs combines pairs of nibbles [hi, lo, hi, lo, …] into bytes
// [(hi<<4)|lo, …].  This is the hot path inside encode_compact, called on every
// trie node when computing the Merkle root.
//
// Strategy: use SSSE3 _mm_maddubs_epi16 which does
//   result[i] = a[2i]*16 + a[2i+1]  (treating a as unsigned, b as signed)
// Setting b = [16, 1, 16, 1, …] gives the packed nibble byte for each pair.
// This is enabled when SSSE3 is available (always on x86-64-v3).

/// Packs pairs of nibbles in `nibbles` into bytes, writing to `output`.
/// `nibbles.len()` must be even.
///
/// # Safety
/// `output` must be writable for `nibbles.len() / 2` bytes.
#[inline]
#[allow(unsafe_code)]
unsafe fn pack_nibble_pairs(nibbles: &[u8], output: *mut u8) {
    debug_assert!(nibbles.len().is_multiple_of(2));
    #[cfg(target_arch = "x86_64")]
    {
        unsafe { pack_nibble_pairs_x86_64(nibbles, output) };
        return;
    }
    #[cfg(target_arch = "aarch64")]
    {
        unsafe { pack_nibble_pairs_aarch64(nibbles, output) };
        return;
    }
    #[allow(unreachable_code)]
    unsafe {
        pack_nibble_pairs_scalar(nibbles, output)
    };
}

#[cfg(target_arch = "x86_64")]
#[allow(unsafe_code)]
#[inline]
unsafe fn pack_nibble_pairs_x86_64(nibbles: &[u8], output: *mut u8) {
    let n = nibbles.len(); // always even
    let mut i = 0usize; // index into nibbles (steps of 32)
    let mut o = 0usize; // index into output (steps of 16)

    // SSSE3 path: 32 nibbles → 16 output bytes per iteration.
    // _mm_maddubs_epi16(a, b): result[k] = (a[2k]*b[2k] + a[2k+1]*b[2k+1]) as i16
    // With b=[16,1,16,1,...] and a=[hi,lo,...]: result[k] = hi*16 + lo
    #[cfg(target_feature = "ssse3")]
    // SAFETY: SSSE3 enabled at compile time; pointer arithmetic stays within bounds.
    unsafe {
        use std::arch::x86_64::*;
        // Multiplier: weight = [16, 1] repeated → multiply even nibble by 16, odd by 1
        let weights = _mm_set1_epi16(0x0110_u16 as i16); // bytes: [16, 1, 16, 1, ...]
        while i + 32 <= n {
            // Load 32 nibbles (16 pairs) in two 128-bit chunks.
            let lo_chunk = _mm_loadu_si128(nibbles.as_ptr().add(i).cast::<__m128i>());
            let hi_chunk = _mm_loadu_si128(nibbles.as_ptr().add(i + 16).cast::<__m128i>());
            // maddubs: [hi0*16+lo0, hi1*16+lo1, …, hi7*16+lo7] as 16-bit lanes
            let lo_packed = _mm_maddubs_epi16(lo_chunk, weights);
            let hi_packed = _mm_maddubs_epi16(hi_chunk, weights);
            // packus: saturate to u8 and pack both 8×i16 → 16×u8
            let result = _mm_packus_epi16(lo_packed, hi_packed);
            _mm_storeu_si128(output.add(o).cast::<__m128i>(), result);
            i += 32;
            o += 16;
        }
    }

    // Scalar tail (handles remaining pairs when n is not a multiple of 32,
    // or when SSSE3 is unavailable).
    unsafe {
        while i + 2 <= n {
            *output.add(o) = (*nibbles.get_unchecked(i) << 4) | *nibbles.get_unchecked(i + 1);
            i += 2;
            o += 1;
        }
    }
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_code)]
#[inline]
unsafe fn pack_nibble_pairs_aarch64(nibbles: &[u8], output: *mut u8) {
    use std::arch::aarch64::*;

    let n = nibbles.len();
    let mut i = 0usize;
    let mut o = 0usize;

    // SAFETY: NEON always available; bounds maintained by loop guard.
    unsafe {
        while i + 32 <= n {
            // Load 32 nibbles interleaved as [hi, lo] pairs.
            let v = vld2q_u8(nibbles.as_ptr().add(i));
            // v.0 = hi nibbles, v.1 = lo nibbles
            // Pack: (hi << 4) | lo
            let packed = vorrq_u8(vshlq_n_u8(v.0, 4), v.1);
            vst1q_u8(output.add(o), packed);
            i += 32;
            o += 16;
        }
        while i + 2 <= n {
            *output.add(o) = (*nibbles.get_unchecked(i) << 4) | *nibbles.get_unchecked(i + 1);
            i += 2;
            o += 1;
        }
    }
}

#[allow(unsafe_code)]
#[inline]
unsafe fn pack_nibble_pairs_scalar(nibbles: &[u8], output: *mut u8) {
    // SAFETY: caller ensures `output` is valid for nibbles.len()/2 bytes.
    unsafe {
        let mut o = 0usize;
        let mut i = 0usize;
        let n = nibbles.len();
        while i + 2 <= n {
            *output.add(o) = (*nibbles.get_unchecked(i) << 4) | *nibbles.get_unchecked(i + 1);
            i += 2;
            o += 1;
        }
    }
}
// ─────────────────────────────────────────────────────────────────────────────

// ── SIMD prefix comparison ───────────────────────────────────────────────────
//
// count_common_prefix finds the length of the longest common prefix of two
// byte slices.  The trie uses this on every insert/lookup to navigate branch
// nodes.  Using SIMD we can compare 16 (SSE2) or 32 (AVX2) bytes at once.

#[allow(unsafe_code)]
#[inline]
fn count_common_prefix(a: &[u8], b: &[u8]) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: x86_64 SIMD; bounds are maintained within the function.
        return unsafe { count_common_prefix_x86_64(a, b) };
    }
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: NEON enabled; bounds are maintained within the function.
        return unsafe { count_common_prefix_aarch64(a, b) };
    }
    #[allow(unreachable_code)]
    count_common_prefix_scalar(a, b)
}

#[cfg(target_arch = "x86_64")]
#[allow(unsafe_code)]
#[inline]
unsafe fn count_common_prefix_x86_64(a: &[u8], b: &[u8]) -> usize {
    use std::arch::x86_64::*;

    let n = a.len().min(b.len());
    let mut i = 0usize;

    #[cfg(target_feature = "avx2")]
    // SAFETY: AVX2 enabled at compile time; pointer arithmetic stays within bounds.
    unsafe {
        while i + 32 <= n {
            let va = _mm256_loadu_si256(a.as_ptr().add(i).cast::<__m256i>());
            let vb = _mm256_loadu_si256(b.as_ptr().add(i).cast::<__m256i>());
            // Compare bytes: equal → 0xFF, else 0x00
            let eq = _mm256_cmpeq_epi8(va, vb);
            // Create a 32-bit mask where bit k = 1 iff byte k was equal.
            let mask = _mm256_movemask_epi8(eq) as u32;
            if mask != 0xFFFF_FFFF {
                // First differing byte is at bit position (trailing ones).
                return i + mask.trailing_ones() as usize;
            }
            i += 32;
        }
    }

    // SSE2 (16-byte chunks). SSE2 is x86_64 baseline.
    // SAFETY: SSE2 always available; bounds maintained by loop guard.
    unsafe {
        while i + 16 <= n {
            let va = _mm_loadu_si128(a.as_ptr().add(i).cast::<__m128i>());
            let vb = _mm_loadu_si128(b.as_ptr().add(i).cast::<__m128i>());
            let eq = _mm_cmpeq_epi8(va, vb);
            let mask = _mm_movemask_epi8(eq) as u16;
            if mask != 0xFFFF {
                return i + mask.trailing_ones() as usize;
            }
            i += 16;
        }
    }

    // Scalar tail.
    i + count_common_prefix_scalar(&a[i..n], &b[i..n])
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(unsafe_code)]
#[inline]
unsafe fn count_common_prefix_aarch64(a: &[u8], b: &[u8]) -> usize {
    use std::arch::aarch64::*;

    let n = a.len().min(b.len());
    let mut i = 0usize;

    // SAFETY: NEON always available; pointer arithmetic stays within bounds.
    unsafe {
        while i + 16 <= n {
            let va = vld1q_u8(a.as_ptr().add(i));
            let vb = vld1q_u8(b.as_ptr().add(i));
            // vceqq_u8: equal lanes → 0xFF, else 0x00
            let eq = vceqq_u8(va, vb);
            // vminvq_u8: reduce to minimum; if all 0xFF then all bytes matched.
            if vminvq_u8(eq) == 0xFF {
                i += 16;
                continue;
            }
            // Find first non-matching byte by scanning the 16-byte window.
            let mut eq_arr = [0u8; 16];
            vst1q_u8(eq_arr.as_mut_ptr(), eq);
            for (j, &byte) in eq_arr.iter().enumerate() {
                if byte == 0 {
                    return i + j;
                }
            }
            unreachable!()
        }
    }

    i + count_common_prefix_scalar(&a[i..n], &b[i..n])
}

#[inline]
fn count_common_prefix_scalar(a: &[u8], b: &[u8]) -> usize {
    a.iter().zip(b.iter()).take_while(|(x, y)| x == y).count()
}
// ─────────────────────────────────────────────────────────────────────────────

// TODO: move path-tracking logic somewhere else
// PERF: try using a stack-allocated array
/// Struct representing a list of nibbles (half-bytes)
#[derive(
    Debug,
    Clone,
    Default,
    serde::Serialize,
    serde::Deserialize,
    rkyv::Deserialize,
    rkyv::Serialize,
    rkyv::Archive,
)]
pub struct Nibbles {
    data: Vec<u8>,
    /// Parts of the path that have already been consumed (used for tracking
    /// current position when visiting nodes). See `current()`.
    already_consumed: Vec<u8>,
}

// NOTE: custom impls to ignore the `already_consumed` field

impl PartialEq for Nibbles {
    fn eq(&self, other: &Nibbles) -> bool {
        self.data == other.data
    }
}

impl Eq for Nibbles {}

impl PartialOrd for Nibbles {
    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Nibbles {
    fn cmp(&self, other: &Self) -> cmp::Ordering {
        self.data.cmp(&other.data)
    }
}

impl std::hash::Hash for Nibbles {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.data.hash(state);
    }
}

impl Nibbles {
    /// Create `Nibbles` from  hex-encoded nibbles
    pub const fn from_hex(hex: Vec<u8>) -> Self {
        Self {
            data: hex,
            already_consumed: vec![],
        }
    }

    /// Splits incoming bytes into nibbles and appends the leaf flag (a 16 nibble at the end)
    pub fn from_bytes(bytes: &[u8]) -> Self {
        Self::from_raw(bytes, true)
    }

    /// Splits incoming bytes into nibbles and appends the leaf flag (a 16 nibble at the end) if is_leaf is true
    pub fn from_raw(bytes: &[u8], is_leaf: bool) -> Self {
        let extra = usize::from(is_leaf);
        let mut data = Vec::with_capacity(bytes.len() * 2 + extra);

        // SAFETY: we allocated at least `bytes.len() * 2` capacity (plus `extra`),
        // and we set_len to exactly `bytes.len() * 2` after the SIMD kernel fills them.
        #[allow(unsafe_code)]
        unsafe {
            expand_bytes_to_nibbles(bytes, data.as_mut_ptr());
            data.set_len(bytes.len() * 2);
        }

        if is_leaf {
            data.push(16);
        }

        Self {
            data,
            already_consumed: vec![],
        }
    }

    pub fn into_vec(self) -> Vec<u8> {
        self.data
    }

    /// Returns the amount of nibbles
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Returns true if there are no nibbles
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    /// If `prefix` is a prefix of self, move the offset after
    /// the prefix and return true, otherwise return false.
    pub fn skip_prefix(&mut self, prefix: &Nibbles) -> bool {
        if self.len() >= prefix.len() && &self.data[..prefix.len()] == prefix.as_ref() {
            self.already_consumed.extend_from_slice(&prefix.data);
            self.data.drain(..prefix.len());
            true
        } else {
            false
        }
    }

    /// Compares self to another, comparing prefixes only in case of unequal lengths.
    pub fn compare_prefix(&self, prefix: &Nibbles) -> cmp::Ordering {
        if self.len() > prefix.len() {
            self.data[..prefix.len()].cmp(&prefix.data)
        } else {
            self.data[..].cmp(&prefix.data[..self.len()])
        }
    }

    /// Compares self to another and returns the shared nibble count (amount of nibbles that are equal, from the start)
    pub fn count_prefix(&self, other: &Nibbles) -> usize {
        count_common_prefix(self.as_ref(), other.as_ref())
    }

    /// Removes and returns the first nibble
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> Option<u8> {
        (!self.is_empty()).then(|| {
            self.already_consumed.push(self.data[0]);
            self.data.remove(0)
        })
    }

    /// Removes and returns the first nibble if it is a suitable choice index (aka < 16)
    pub fn next_choice(&mut self) -> Option<usize> {
        self.next().filter(|choice| *choice < 16).map(usize::from)
    }

    /// Returns the nibbles after the given offset
    pub fn offset(&self, offset: usize) -> Nibbles {
        let mut already_consumed = Vec::with_capacity(self.already_consumed.len() + offset);
        already_consumed.extend_from_slice(&self.already_consumed);
        already_consumed.extend_from_slice(&self.data[..offset]);
        Nibbles {
            data: self.data[offset..].to_vec(),
            already_consumed,
        }
    }

    /// Returns the nibbles beween the start and end indexes
    pub fn slice(&self, start: usize, end: usize) -> Nibbles {
        Nibbles::from_hex(self.data[start..end].to_vec())
    }

    /// Extends the nibbles with another list of nibbles
    pub fn extend(&mut self, other: &Nibbles) {
        self.data.extend_from_slice(other.as_ref());
    }

    /// Return the nibble at the given index, will panic if the index is out of range
    pub fn at(&self, i: usize) -> usize {
        self.data[i] as usize
    }

    /// Inserts a nibble at the start
    pub fn prepend(&mut self, nibble: u8) {
        self.data.insert(0, nibble);
    }

    /// Inserts a nibble at the end
    pub fn append(&mut self, nibble: u8) {
        self.data.push(nibble);
    }

    /// Taken from https://github.com/citahub/cita_trie/blob/master/src/nibbles.rs#L56
    /// Encodes the nibbles in compact form
    #[allow(unsafe_code)]
    pub fn encode_compact(&self) -> Vec<u8> {
        let is_leaf = self.is_leaf();
        let mut hex = if is_leaf {
            &self.data[0..self.data.len() - 1]
        } else {
            &self.data[0..]
        };
        // node type    path length    |    prefix    hexchar
        // --------------------------------------------------
        // extension    even           |    0000      0x0
        // extension    odd            |    0001      0x1
        // leaf         even           |    0010      0x2
        // leaf         odd            |    0011      0x3
        let prefix_nibble = if hex.len() % 2 == 1 {
            let v = 0x10 + hex[0];
            hex = &hex[1..];
            v
        } else {
            0x00
        };

        let pair_count = hex.len() / 2;
        let mut compact = Vec::with_capacity(1 + pair_count);
        compact.push(prefix_nibble + if is_leaf { 0x20 } else { 0x00 });

        // SIMD-accelerated packing of nibble pairs → bytes.
        // SAFETY: compact has capacity for `pair_count` bytes beyond the one already pushed.
        // pack_nibble_pairs writes exactly `pair_count` bytes starting at offset 1;
        // set_len then exposes those initialized bytes.
        unsafe {
            let out_ptr = compact.as_mut_ptr().add(1);
            pack_nibble_pairs(hex, out_ptr);
            compact.set_len(1 + pair_count);
        }

        compact
    }

    /// Encodes the nibbles in compact form
    pub fn decode_compact(compact: &[u8]) -> Self {
        Self::from_hex(compact_to_hex(compact))
    }

    /// Returns true if the nibbles contain the leaf flag (16) at the end
    pub fn is_leaf(&self) -> bool {
        if self.is_empty() {
            false
        } else {
            self.data[self.data.len() - 1] == 16
        }
    }

    /// Combines the nibbles into bytes, trimming the leaf flag if necessary
    pub fn to_bytes(&self) -> Vec<u8> {
        // Trim leaf flag
        let data = if !self.is_empty() && self.is_leaf() {
            &self.data[..self.len() - 1]
        } else {
            &self.data[..]
        };
        // Combine nibbles into bytes
        data.chunks(2)
            .map(|chunk| match chunk.len() {
                1 => chunk[0] << 4,
                _ => chunk[0] << 4 | chunk[1],
            })
            .collect::<Vec<_>>()
    }

    /// Concatenates self and another Nibbles returning a new Nibbles
    pub fn concat(&self, other: &Nibbles) -> Nibbles {
        let mut data = Vec::with_capacity(self.data.len() + other.data.len());
        data.extend_from_slice(&self.data);
        data.extend_from_slice(&other.data);
        Nibbles {
            data,
            already_consumed: self.already_consumed.clone(),
        }
    }

    /// Returns a copy of self with the nibble added at the end
    pub fn append_new(&self, nibble: u8) -> Nibbles {
        let mut data = Vec::with_capacity(self.data.len() + 1);
        data.extend_from_slice(&self.data);
        data.push(nibble);
        Nibbles {
            data,
            already_consumed: self.already_consumed.clone(),
        }
    }

    /// Return already consumed parts of path
    pub fn current(&self) -> Nibbles {
        Nibbles {
            data: self.already_consumed.clone(),
            already_consumed: vec![],
        }
    }

    /// Empties `self.data` and returns the content
    pub fn take(&mut self) -> Self {
        Nibbles {
            data: mem::take(&mut self.data),
            already_consumed: mem::take(&mut self.already_consumed),
        }
    }
}

impl AsRef<[u8]> for Nibbles {
    fn as_ref(&self) -> &[u8] {
        &self.data
    }
}

impl RLPEncode for Nibbles {
    fn encode(&self, buf: &mut dyn bytes::BufMut) {
        Encoder::new(buf).encode_field(&self.data).finish();
    }
}

impl RLPDecode for Nibbles {
    fn decode_unfinished(rlp: &[u8]) -> Result<(Self, &[u8]), RLPDecodeError> {
        let decoder = Decoder::new(rlp)?;
        let (data, decoder) = decoder.decode_field("data")?;
        Ok((
            Self {
                data,
                already_consumed: vec![],
            },
            decoder.finish()?,
        ))
    }
}

// Code taken from https://github.com/ethereum/go-ethereum/blob/a1093d98eb3260f2abf340903c2d968b2b891c11/trie/encoding.go#L82
fn compact_to_hex(compact: &[u8]) -> Vec<u8> {
    if compact.is_empty() {
        return vec![];
    }
    let mut base = keybytes_to_hex(compact);
    // delete terminator flag
    let end = if base[0] < 2 {
        base.len() - 1
    } else {
        base.len()
    };
    // apply odd flag
    let chop = 2 - (base[0] & 1) as usize;
    base.drain(..chop);
    base.truncate(end - chop);
    base
}

// Code taken from https://github.com/ethereum/go-ethereum/blob/a1093d98eb3260f2abf340903c2d968b2b891c11/trie/encoding.go#L96
fn keybytes_to_hex(keybytes: &[u8]) -> Vec<u8> {
    let nibble_count = keybytes.len() * 2;
    let mut nibbles = Vec::with_capacity(nibble_count + 1);

    // SAFETY: we just allocated `nibble_count` capacity; SIMD kernel fills them.
    #[allow(unsafe_code)]
    unsafe {
        expand_bytes_to_nibbles(keybytes, nibbles.as_mut_ptr());
        nibbles.set_len(nibble_count);
    }
    nibbles.push(16); // leaf terminator
    nibbles
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Scalar reference for expand_bytes_to_nibbles (no SIMD).
    fn expand_bytes_scalar_ref(bytes: &[u8]) -> Vec<u8> {
        bytes.iter().flat_map(|&b| [b >> 4, b & 0x0F]).collect()
    }

    /// Scalar reference for pack_nibble_pairs (no SIMD).
    fn pack_nibble_pairs_scalar_ref(nibbles: &[u8]) -> Vec<u8> {
        nibbles
            .chunks_exact(2)
            .map(|pair| (pair[0] << 4) | pair[1])
            .collect()
    }

    #[test]
    fn expand_bytes_to_nibbles_matches_scalar() {
        // Test edge-case lengths: 0, 1, 15, 16, 17, 31, 32, 33, 64
        for &len in &[0, 1, 2, 15, 16, 17, 31, 32, 33, 48, 64] {
            let input: Vec<u8> = (0..len).map(|i| (i * 37 + 13) as u8).collect();
            let expected = expand_bytes_scalar_ref(&input);

            let mut actual = vec![0u8; input.len() * 2];
            #[allow(unsafe_code)]
            unsafe {
                expand_bytes_to_nibbles(&input, actual.as_mut_ptr());
            }
            assert_eq!(actual, expected, "mismatch at input length {len}");
        }
    }

    #[test]
    fn pack_nibble_pairs_matches_scalar() {
        // Test edge-case pair counts: 0, 2, 14, 16, 30, 32, 34, 64
        for &nibble_count in &[0, 2, 4, 14, 16, 30, 32, 34, 48, 64] {
            let input: Vec<u8> = (0..nibble_count).map(|i| (i % 16) as u8).collect();
            let expected = pack_nibble_pairs_scalar_ref(&input);

            let mut actual = vec![0u8; nibble_count / 2];
            #[allow(unsafe_code)]
            unsafe {
                pack_nibble_pairs(&input, actual.as_mut_ptr());
            }
            assert_eq!(actual, expected, "mismatch at nibble count {nibble_count}");
        }
    }

    #[test]
    fn expand_then_pack_roundtrip() {
        for &len in &[0, 1, 16, 32, 33] {
            let input: Vec<u8> = (0..len).map(|i| (i * 53 + 7) as u8).collect();
            let mut nibbles = vec![0u8; input.len() * 2];
            #[allow(unsafe_code)]
            unsafe {
                expand_bytes_to_nibbles(&input, nibbles.as_mut_ptr());
            }

            let mut packed = vec![0u8; input.len()];
            #[allow(unsafe_code)]
            unsafe {
                pack_nibble_pairs(&nibbles, packed.as_mut_ptr());
            }
            assert_eq!(packed, input, "roundtrip failed at length {len}");
        }
    }

    #[test]
    fn count_common_prefix_correctness() {
        // Identical slices
        let a = vec![1u8, 2, 3, 4, 5];
        assert_eq!(count_common_prefix(&a, &a), 5);

        // No common prefix
        assert_eq!(count_common_prefix(&[1, 2, 3], &[4, 5, 6]), 0);

        // Partial match
        assert_eq!(count_common_prefix(&[1, 2, 3, 4], &[1, 2, 5, 6]), 2);

        // Empty
        assert_eq!(count_common_prefix(&[], &[1, 2]), 0);
        assert_eq!(count_common_prefix(&[1, 2], &[]), 0);
        assert_eq!(count_common_prefix(&[], &[]), 0);

        // Long match crossing SIMD boundaries (>16 bytes)
        let long_a: Vec<u8> = (0..33).collect();
        let mut long_b = long_a.clone();
        long_b[32] = 255;
        assert_eq!(count_common_prefix(&long_a, &long_b), 32);
    }

    #[test]
    fn from_raw_leaf_flag() {
        let bytes = &[0xAB, 0xCD];
        let with_leaf = Nibbles::from_raw(bytes, true);
        let without_leaf = Nibbles::from_raw(bytes, false);

        assert_eq!(with_leaf.data, vec![0x0A, 0x0B, 0x0C, 0x0D, 16]);
        assert_eq!(without_leaf.data, vec![0x0A, 0x0B, 0x0C, 0x0D]);
    }
}