edgefirst-decoder 0.20.0

ML model output decoding for YOLO and ModelPack object detection and segmentation
// SPDX-FileCopyrightText: Copyright 2026 Au-Zone Technologies
// SPDX-License-Identifier: Apache-2.0

//! NCHW → NHWC tensor transpose used by the per-scale pipeline when a
//! schema-declared NCHW child is bound at frame time.
//!
//! The per-scale level kernels (`level_box`, `level_score`, `level_mc`)
//! take flat slices and use NHWC-contiguous indexing — each anchor's
//! channel block is read as one consecutive run of `channels` elements.
//! NCHW children, by contrast, place all `h*w` values of channel 0 first,
//! then channel 1, and so on, so a per-anchor read becomes a strided
//! walk that breaks both the kernel's index math and cache locality.
//!
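//! In index terms (matching the scalar kernel below), element
//! `(hi, wi, ci)` of an `h × w × c` scale lives at:
//!
//! ```text
//! nhwc_idx = (hi * w + wi) * c + ci       // channels contiguous per anchor
//! nchw_idx = ci * (h * w) + hi * w + wi   // whole channel plane comes first
//! ```
//!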
//! Rather than maintain a parallel NCHW kernel matrix (which would
//! roughly double the kernel count), the pipeline pays a small
//! per-frame transpose cost into a scratch buffer and then dispatches
//! the existing NHWC kernel. Cost on Cortex-A53 scalar: ~6 ms / frame
//! for a YOLO at 640×640 (memory-bandwidth-bound). The Phase 2 NEON
//! tile transpose (the same pattern that powers
//! `tiled_proto_transpose_nchw_to_nhwc` on `main`) drops this to
//! ~1.5 ms.
//!
//! NHWC schemas (TFLite, current canonical fixtures) take the
//! pass-through code path in `pipeline::run` and never invoke this
//! module — zero overhead for the canonical case.
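//!
//! The frame-time decision on the pipeline side looks roughly like the
//! following (a minimal sketch; `is_nchw`, `raw`, and `scratch` are
//! illustrative names, not the real `pipeline::run` internals):
//!
//! ```ignore
//! let data: &[u8] = if is_nchw {
//!     // pay the per-frame transpose into a reusable scratch buffer
//!     nchw_to_nhwc(raw, h, w, c, &mut scratch);
//!     &scratch
//! } else {
//!     raw // NHWC child: pass straight through, no extra cost
//! };
//! // `data` is NHWC-contiguous either way; dispatch the existing kernel.
//! ```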

/// Generic NCHW → NHWC element transpose.
///
/// `src` is laid out as `[N, C, H, W]` with `N == 1` (the per-scale
/// subsystem only handles single-batch tensors). `dst` is written as
/// `[N, H, W, C]`. Both must be exactly `h * w * c` elements.
///
/// Walks the destination in row-major (`hi`, `wi`, `ci`) order so the
/// destination stores are sequential — typically faster on cores with
/// write-combining stores than walking the source sequentially. Both
/// orderings have the same cache-line count; the difference is in store
/// merging.
///
/// On aarch64, byte-sized inputs (`u8` / `i8`) are routed through a
/// 16×16 NEON tile transpose for the bulk of the `hw × c` matrix;
/// tail rows / columns fall back to scalar. The byte path is the only
/// one with a fast lane today; halfword (`i16` / `u16`) and float
/// dtypes use the scalar fallback because they are rarer in the
/// per-scale NCHW use case (Ara-240 quantizes to int8 throughout).
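///
/// # Example
///
/// A minimal usage sketch (the shapes and the `nchw_plane` / `scratch`
/// bindings are illustrative, not taken from a real model):
///
/// ```ignore
/// let (h, w, c) = (20, 20, 64);
/// let mut scratch = vec![0u8; h * w * c];
/// // `nchw_plane`: the raw [1, C, H, W] child output, h * w * c bytes.
/// nchw_to_nhwc(nchw_plane, h, w, c, &mut scratch);
/// // `scratch` is now [1, H, W, C] and can feed the NHWC level kernels.
/// ```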
pub(crate) fn nchw_to_nhwc<T: Copy>(src: &[T], h: usize, w: usize, c: usize, dst: &mut [T]) {
    debug_assert_eq!(src.len(), h * w * c, "src length mismatch");
    debug_assert_eq!(dst.len(), h * w * c, "dst length mismatch");

    // Byte-sized dtype fast path on aarch64. The transpose is a pure
    // permutation, so the bit pattern of i8/u8 is preserved and we can
    // safely route the call through the u8 NEON routine via raw bytes.
    #[cfg(target_arch = "aarch64")]
    if std::mem::size_of::<T>() == 1 && c >= 16 && (h * w) >= 16 {
        // SAFETY: T is byte-sized so the raw-byte view is well-formed and
        // the transpose is a bitwise permutation. Dispatching by size
        // matches both i8 and u8 (and any other 1-byte Copy type).
        unsafe {
            let src_bytes = core::slice::from_raw_parts(src.as_ptr() as *const u8, src.len());
            let dst_bytes = core::slice::from_raw_parts_mut(dst.as_mut_ptr() as *mut u8, dst.len());
            nchw_to_nhwc_u8_neon(src_bytes, h, w, c, dst_bytes);
        }
        return;
    }

    let hw = h * w;
    let mut dst_idx = 0;
    for hi in 0..h {
        for wi in 0..w {
            let src_base = hi * w + wi;
            for ci in 0..c {
                dst[dst_idx] = src[ci * hw + src_base];
                dst_idx += 1;
            }
        }
    }
}

/// NEON 16×16 byte tile transpose for NCHW → NHWC.
///
/// Source layout: `[c, hw]` with row stride `hw`. Destination layout:
/// `[hw, c]` with row stride `c`. The bulk of the matrix is processed
/// in 16×16 byte tiles using the canonical 4-stage TRN1/TRN2 pattern;
/// the right and bottom edges fall through to scalar.
///
/// The tile transpose loads each 16-byte register-wide row of the
/// source, then permutes lanes via four pairwise interleaving stages
/// (`.16b` → `.8h` → `.4s` → `.2d`). Each stage halves the column-vs-row
/// disagreement; after four stages the loaded "rows" hold what were
/// originally "columns", at which point we store them out as the
/// transposed 16×16 destination block.
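///
/// In address terms, the 16×16 tile at (`tile_c`, `tile_hw`) moves
/// bytes as:
///
/// ```text
/// load  src[(tile_c  * 16 + j) * hw + tile_hw * 16 ..][..16]  // row j: one channel, 16 anchors
/// store dst[(tile_hw * 16 + i) * c  + tile_c  * 16 ..][..16]  // row i: one anchor, 16 channels
/// ```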
///
/// # Safety
/// Requires aarch64 NEON (mandatory on aarch64). Caller must ensure
/// `src.len() == dst.len() == h * w * c` and that `c` and `h * w` are
/// each at least 16 — the bulk path indexes 16-wide tiles unchecked.
#[cfg(target_arch = "aarch64")]
pub(crate) unsafe fn nchw_to_nhwc_u8_neon(
    src: &[u8],
    h: usize,
    w: usize,
    c: usize,
    dst: &mut [u8],
) {
    use core::arch::aarch64::*;
    let hw = h * w;
    debug_assert_eq!(src.len(), hw * c);
    debug_assert_eq!(dst.len(), hw * c);
    debug_assert!(hw >= 16 && c >= 16);

    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();

    // 16-wide tile counts. Tail rows / columns are scalar; the kernel
    // is most useful when both dims are exact multiples of 16 (channel
    // counts of 32 / 64 / 80 are the common per-scale cases).
    let n_tiles_hw = hw / 16;
    let n_tiles_c = c / 16;

    for tile_c in 0..n_tiles_c {
        let c_base = tile_c * 16;
        for tile_hw in 0..n_tiles_hw {
            let hw_base = tile_hw * 16;

            // Load 16 source rows of 16 bytes — each row is 16 spatial
            // positions of one channel.
            let r0 = vld1q_u8(src_ptr.add((c_base) * hw + hw_base));
            let r1 = vld1q_u8(src_ptr.add((c_base + 1) * hw + hw_base));
            let r2 = vld1q_u8(src_ptr.add((c_base + 2) * hw + hw_base));
            let r3 = vld1q_u8(src_ptr.add((c_base + 3) * hw + hw_base));
            let r4 = vld1q_u8(src_ptr.add((c_base + 4) * hw + hw_base));
            let r5 = vld1q_u8(src_ptr.add((c_base + 5) * hw + hw_base));
            let r6 = vld1q_u8(src_ptr.add((c_base + 6) * hw + hw_base));
            let r7 = vld1q_u8(src_ptr.add((c_base + 7) * hw + hw_base));
            let r8 = vld1q_u8(src_ptr.add((c_base + 8) * hw + hw_base));
            let r9 = vld1q_u8(src_ptr.add((c_base + 9) * hw + hw_base));
            let r10 = vld1q_u8(src_ptr.add((c_base + 10) * hw + hw_base));
            let r11 = vld1q_u8(src_ptr.add((c_base + 11) * hw + hw_base));
            let r12 = vld1q_u8(src_ptr.add((c_base + 12) * hw + hw_base));
            let r13 = vld1q_u8(src_ptr.add((c_base + 13) * hw + hw_base));
            let r14 = vld1q_u8(src_ptr.add((c_base + 14) * hw + hw_base));
            let r15 = vld1q_u8(src_ptr.add((c_base + 15) * hw + hw_base));

            // Stage 1: pair-wise byte interleave (TRN1/TRN2 .16b).
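            //   e.g. a0 = [r0[0], r1[0], r0[2], r1[2], …],
            //        a1 = [r0[1], r1[1], r0[3], r1[3], …].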
            let a0 = vtrn1q_u8(r0, r1);
            let a1 = vtrn2q_u8(r0, r1);
            let a2 = vtrn1q_u8(r2, r3);
            let a3 = vtrn2q_u8(r2, r3);
            let a4 = vtrn1q_u8(r4, r5);
            let a5 = vtrn2q_u8(r4, r5);
            let a6 = vtrn1q_u8(r6, r7);
            let a7 = vtrn2q_u8(r6, r7);
            let a8 = vtrn1q_u8(r8, r9);
            let a9 = vtrn2q_u8(r8, r9);
            let a10 = vtrn1q_u8(r10, r11);
            let a11 = vtrn2q_u8(r10, r11);
            let a12 = vtrn1q_u8(r12, r13);
            let a13 = vtrn2q_u8(r12, r13);
            let a14 = vtrn1q_u8(r14, r15);
            let a15 = vtrn2q_u8(r14, r15);

            // Stage 2: pair-wise halfword interleave (TRN1/TRN2 .8h).
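            //   After this stage every 4-byte group holds one source
            //   column from four consecutive source rows.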
            macro_rules! trn_h {
                ($lo:expr, $hi:expr, $kind:ident) => {
                    vreinterpretq_u8_u16($kind(
                        vreinterpretq_u16_u8($lo),
                        vreinterpretq_u16_u8($hi),
                    ))
                };
            }
            let b0 = trn_h!(a0, a2, vtrn1q_u16);
            let b1 = trn_h!(a1, a3, vtrn1q_u16);
            let b2 = trn_h!(a0, a2, vtrn2q_u16);
            let b3 = trn_h!(a1, a3, vtrn2q_u16);
            let b4 = trn_h!(a4, a6, vtrn1q_u16);
            let b5 = trn_h!(a5, a7, vtrn1q_u16);
            let b6 = trn_h!(a4, a6, vtrn2q_u16);
            let b7 = trn_h!(a5, a7, vtrn2q_u16);
            let b8 = trn_h!(a8, a10, vtrn1q_u16);
            let b9 = trn_h!(a9, a11, vtrn1q_u16);
            let b10 = trn_h!(a8, a10, vtrn2q_u16);
            let b11 = trn_h!(a9, a11, vtrn2q_u16);
            let b12 = trn_h!(a12, a14, vtrn1q_u16);
            let b13 = trn_h!(a13, a15, vtrn1q_u16);
            let b14 = trn_h!(a12, a14, vtrn2q_u16);
            let b15 = trn_h!(a13, a15, vtrn2q_u16);

            // Stage 3: pair-wise word interleave (TRN1/TRN2 .4s).
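            //   After this stage every 8-byte group holds one source
            //   column from eight consecutive source rows.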
            macro_rules! trn_s {
                ($lo:expr, $hi:expr, $kind:ident) => {
                    vreinterpretq_u8_u32($kind(
                        vreinterpretq_u32_u8($lo),
                        vreinterpretq_u32_u8($hi),
                    ))
                };
            }
            let d0 = trn_s!(b0, b4, vtrn1q_u32);
            let d1 = trn_s!(b1, b5, vtrn1q_u32);
            let d2 = trn_s!(b2, b6, vtrn1q_u32);
            let d3 = trn_s!(b3, b7, vtrn1q_u32);
            let d4 = trn_s!(b0, b4, vtrn2q_u32);
            let d5 = trn_s!(b1, b5, vtrn2q_u32);
            let d6 = trn_s!(b2, b6, vtrn2q_u32);
            let d7 = trn_s!(b3, b7, vtrn2q_u32);
            let d8 = trn_s!(b8, b12, vtrn1q_u32);
            let d9 = trn_s!(b9, b13, vtrn1q_u32);
            let d10 = trn_s!(b10, b14, vtrn1q_u32);
            let d11 = trn_s!(b11, b15, vtrn1q_u32);
            let d12 = trn_s!(b8, b12, vtrn2q_u32);
            let d13 = trn_s!(b9, b13, vtrn2q_u32);
            let d14 = trn_s!(b10, b14, vtrn2q_u32);
            let d15 = trn_s!(b11, b15, vtrn2q_u32);

            // Stage 4: pair-wise doubleword interleave (TRN1/TRN2 .2d).
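            //   After this stage t_i holds source column i of the
            //   tile, i.e. the bytes of destination row i.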
            macro_rules! trn_d {
                ($lo:expr, $hi:expr, $kind:ident) => {
                    vreinterpretq_u8_u64($kind(
                        vreinterpretq_u64_u8($lo),
                        vreinterpretq_u64_u8($hi),
                    ))
                };
            }
            let t0 = trn_d!(d0, d8, vtrn1q_u64);
            let t1 = trn_d!(d1, d9, vtrn1q_u64);
            let t2 = trn_d!(d2, d10, vtrn1q_u64);
            let t3 = trn_d!(d3, d11, vtrn1q_u64);
            let t4 = trn_d!(d4, d12, vtrn1q_u64);
            let t5 = trn_d!(d5, d13, vtrn1q_u64);
            let t6 = trn_d!(d6, d14, vtrn1q_u64);
            let t7 = trn_d!(d7, d15, vtrn1q_u64);
            let t8 = trn_d!(d0, d8, vtrn2q_u64);
            let t9 = trn_d!(d1, d9, vtrn2q_u64);
            let t10 = trn_d!(d2, d10, vtrn2q_u64);
            let t11 = trn_d!(d3, d11, vtrn2q_u64);
            let t12 = trn_d!(d4, d12, vtrn2q_u64);
            let t13 = trn_d!(d5, d13, vtrn2q_u64);
            let t14 = trn_d!(d6, d14, vtrn2q_u64);
            let t15 = trn_d!(d7, d15, vtrn2q_u64);

            // Store 16 destination rows of 16 bytes — each row is 16
            // channels at one spatial position.
            vst1q_u8(dst_ptr.add((hw_base) * c + c_base), t0);
            vst1q_u8(dst_ptr.add((hw_base + 1) * c + c_base), t1);
            vst1q_u8(dst_ptr.add((hw_base + 2) * c + c_base), t2);
            vst1q_u8(dst_ptr.add((hw_base + 3) * c + c_base), t3);
            vst1q_u8(dst_ptr.add((hw_base + 4) * c + c_base), t4);
            vst1q_u8(dst_ptr.add((hw_base + 5) * c + c_base), t5);
            vst1q_u8(dst_ptr.add((hw_base + 6) * c + c_base), t6);
            vst1q_u8(dst_ptr.add((hw_base + 7) * c + c_base), t7);
            vst1q_u8(dst_ptr.add((hw_base + 8) * c + c_base), t8);
            vst1q_u8(dst_ptr.add((hw_base + 9) * c + c_base), t9);
            vst1q_u8(dst_ptr.add((hw_base + 10) * c + c_base), t10);
            vst1q_u8(dst_ptr.add((hw_base + 11) * c + c_base), t11);
            vst1q_u8(dst_ptr.add((hw_base + 12) * c + c_base), t12);
            vst1q_u8(dst_ptr.add((hw_base + 13) * c + c_base), t13);
            vst1q_u8(dst_ptr.add((hw_base + 14) * c + c_base), t14);
            vst1q_u8(dst_ptr.add((hw_base + 15) * c + c_base), t15);
        }

        // Right edge: hw % 16 columns. Scalar tail walks the unfilled
        // spatial positions, each touching 16 channels at this c-tile.
        let hw_tail_start = n_tiles_hw * 16;
        for hw_idx in hw_tail_start..hw {
            for ci in 0..16 {
                let src_idx = (c_base + ci) * hw + hw_idx;
                let dst_idx = hw_idx * c + (c_base + ci);
                *dst.get_unchecked_mut(dst_idx) = *src.get_unchecked(src_idx);
            }
        }
    }

    // Bottom edge: c % 16 channel rows. Scalar tail covers all spatial
    // positions for any unaligned trailing channel block.
    let c_tail_start = n_tiles_c * 16;
    for hw_idx in 0..hw {
        for ci in c_tail_start..c {
            let src_idx = ci * hw + hw_idx;
            let dst_idx = hw_idx * c + ci;
            *dst.get_unchecked_mut(dst_idx) = *src.get_unchecked(src_idx);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Hand-checked small case: NCHW `[1, 2, 2, 3]` → NHWC `[1, 2, 3, 2]`.
    /// Source layout (channel-major): chan0[(0,0), (0,1), (0,2), (1,0), …]
    /// chan1[(0,0), (0,1), (0,2), (1,0), …].
    #[test]
    fn nchw_to_nhwc_small_hand_check() {
        // h=2, w=3, c=2 → 12 elements.
        // NCHW src (channel-major):
        //   chan0: 0  1  2 | 3  4  5
        //   chan1: 6  7  8 | 9 10 11
        let src: [i32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
        // Expected NHWC dst (anchor-major, then channel):
        //   row 0: (c0,c1) = (0,6), (1,7), (2,8)
        //   row 1: (c0,c1) = (3,9), (4,10), (5,11)
        let expected: [i32; 12] = [0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11];
        let mut dst = [0i32; 12];
        nchw_to_nhwc(&src, 2, 3, 2, &mut dst);
        assert_eq!(dst, expected);
    }

    /// Roundtrip: NCHW → NHWC, then a hand-written NHWC → NCHW inverse,
    /// recovers the original. We don't ship the inverse; the roundtrip
    /// confirms the forward transpose is exactly the permutation the
    /// index map intends.
    #[test]
    fn nchw_to_nhwc_roundtrip() {
        let h = 5;
        let w = 7;
        let c = 4;
        let n = h * w * c;
        let src: Vec<i16> = (0..n as i16).collect();
        let mut nhwc = vec![0i16; n];
        nchw_to_nhwc(&src, h, w, c, &mut nhwc);
        // Manual inverse: NHWC → NCHW.
        let mut back = vec![0i16; n];
        let hw = h * w;
        for hi in 0..h {
            for wi in 0..w {
                for ci in 0..c {
                    let nhwc_idx = (hi * w + wi) * c + ci;
                    let nchw_idx = ci * hw + hi * w + wi;
                    back[nchw_idx] = nhwc[nhwc_idx];
                }
            }
        }
        assert_eq!(back, src);
    }

    /// Single-channel NCHW vs NHWC are bit-equal (the layout collapses).
    #[test]
    fn nchw_to_nhwc_single_channel_passthrough() {
        let h = 4;
        let w = 5;
        let c = 1;
        let src: Vec<u8> = (0..(h * w) as u8).collect();
        let mut dst = vec![0u8; h * w];
        nchw_to_nhwc(&src, h, w, c, &mut dst);
        assert_eq!(dst, src);
    }

    /// Single-pixel NCHW vs NHWC are bit-equal (h=w=1).
    #[test]
    fn nchw_to_nhwc_single_pixel_passthrough() {
        let src: [f32; 8] = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let mut dst = [0f32; 8];
        nchw_to_nhwc(&src, 1, 1, 8, &mut dst);
        assert_eq!(dst, src);
    }

    /// Bit-exact parity between NEON tile transpose and the scalar
    /// reference. Generic `nchw_to_nhwc::<u8>` dispatches to the NEON
    /// kernel on aarch64 (when c >= 16 and hw >= 16), so we run both
    /// the scalar reference and the same dispatch and compare.
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn nchw_to_nhwc_u8_neon_matches_scalar_aligned_16_16() {
        // Exact 16×16 case: one full tile, no edges.
        let h = 4;
        let w = 4;
        let c = 16;
        let n = h * w * c;
        // Values cycle mod 251, so a misplaced element almost certainly
        // shows up in the comparison.
        let src: Vec<u8> = (0..n).map(|i| (i % 251) as u8).collect();
        let mut dst_neon = vec![0u8; n];
        let mut dst_scalar = vec![0u8; n];
        super::nchw_to_nhwc(&src, h, w, c, &mut dst_neon);
        // Force scalar by inlining the loop here (mirrors the generic
        // fallback after the cfg block).
        let hw = h * w;
        let mut idx = 0;
        for hi in 0..h {
            for wi in 0..w {
                let base = hi * w + wi;
                for ci in 0..c {
                    dst_scalar[idx] = src[ci * hw + base];
                    idx += 1;
                }
            }
        }
        assert_eq!(dst_neon, dst_scalar, "NEON 16x16 tile mismatched scalar");
    }

    /// NEON path with a tail in BOTH directions: c=20 (one 16-tile +
    /// 4-channel tail), hw=20 (one 16-tile + 4-spatial tail). Exercises
    /// every code path in the kernel.
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn nchw_to_nhwc_u8_neon_matches_scalar_with_both_tails() {
        let h = 5;
        let w = 4;
        let c = 20;
        let n = h * w * c;
        let src: Vec<u8> = (0..n).map(|i| ((i * 37) % 251) as u8).collect();
        let mut dst_neon = vec![0u8; n];
        let mut dst_scalar = vec![0u8; n];
        super::nchw_to_nhwc(&src, h, w, c, &mut dst_neon);
        let hw = h * w;
        let mut idx = 0;
        for hi in 0..h {
            for wi in 0..w {
                let base = hi * w + wi;
                for ci in 0..c {
                    dst_scalar[idx] = src[ci * hw + base];
                    idx += 1;
                }
            }
        }
        assert_eq!(dst_neon, dst_scalar);
    }

    /// Realistic per-scale shape: c=80 (score), h*w=400 (level 2's
    /// 20×20 grid). Exercises 5 channel tiles × 25 spatial tiles.
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn nchw_to_nhwc_u8_neon_realistic_score_shape() {
        let h = 20;
        let w = 20;
        let c = 80;
        let n = h * w * c;
        let src: Vec<u8> = (0..n).map(|i| ((i ^ (i >> 7)) % 251) as u8).collect();
        let mut dst_neon = vec![0u8; n];
        let mut dst_scalar = vec![0u8; n];
        super::nchw_to_nhwc(&src, h, w, c, &mut dst_neon);
        let hw = h * w;
        let mut idx = 0;
        for hi in 0..h {
            for wi in 0..w {
                let base = hi * w + wi;
                for ci in 0..c {
                    dst_scalar[idx] = src[ci * hw + base];
                    idx += 1;
                }
            }
        }
        assert_eq!(dst_neon, dst_scalar);
    }

    /// i8 inputs go through the same NEON byte path (size_of::<i8>() == 1).
    /// Bit-pattern preservation is what we rely on; this confirms it
    /// holds across the full signed range including negatives.
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn nchw_to_nhwc_i8_neon_matches_scalar() {
        let h = 4;
        let w = 4;
        let c = 32;
        let n = h * w * c;
        let src: Vec<i8> = (0..n).map(|i| (i as i32 - 100) as i8).collect();
        let mut dst_neon = vec![0i8; n];
        let mut dst_scalar = vec![0i8; n];
        super::nchw_to_nhwc(&src, h, w, c, &mut dst_neon);
        let hw = h * w;
        let mut idx = 0;
        for hi in 0..h {
            for wi in 0..w {
                let base = hi * w + wi;
                for ci in 0..c {
                    dst_scalar[idx] = src[ci * hw + base];
                    idx += 1;
                }
            }
        }
        assert_eq!(dst_neon, dst_scalar);
    }
}