1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
//! Safe ARM NEON implementation of refmvs functions.
//!
//! splat_mv: Fills rows of RefMvsBlock arrays with a single value.
//! Uses 16-byte NEON stores for the 12-byte RefMvsBlock struct (with R_PAD overflow).
#![cfg_attr(not(feature = "unchecked"), forbid(unsafe_code))]
#![cfg_attr(feature = "unchecked", deny(unsafe_code))]
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
use core::arch::aarch64::*;
#[cfg(feature = "asm")]
use crate::src::align::Align16;
#[cfg(feature = "asm")]
use crate::src::refmvs::RefMvsBlock;
/// ARM NEON implementation of splat_mv.
///
/// Fills bh4 rows of RefMvsBlock arrays with the same value.
/// Each row pointer rr[y] points to an array, and we fill rr[y][bx4..bx4+bw4].
///
/// RefMvsBlock is 12 bytes. We use 16-byte stores at stride 12,
/// which safely overwrites 4 bytes into the next element (or padding at end).
#[cfg(all(feature = "asm", target_arch = "aarch64"))]
#[allow(unsafe_code)]
pub unsafe extern "C" fn splat_mv_neon(
rr: *mut *mut RefMvsBlock,
rmv: &Align16<RefMvsBlock>,
bx4: i32,
bw4: i32,
bh4: i32,
) {
let bx4 = bx4 as usize;
let bw4 = bw4 as usize;
let bh4 = bh4 as usize;
// Load the 16-byte aligned value (12 bytes data + 4 bytes padding)
let rmv_ptr = rmv as *const Align16<RefMvsBlock> as *const u8;
let val128 = unsafe { vld1q_u8(rmv_ptr) };
for y in 0..bh4 {
let row = unsafe { *rr.add(y) };
if row.is_null() {
continue;
}
let base = unsafe { (row as *mut u8).add(bx4 * 12) };
// Each RefMvsBlock is 12 bytes. Store 16 bytes at stride 12.
let mut i = 0;
while i < bw4 {
unsafe { vst1q_u8(base.add(i * 12), val128) };
i += 1;
}
}
}