openjph_core/arch.rs
1//! CPU architecture utilities — port of `ojph_arch.h/cpp`.
2//!
3//! Provides CPU feature detection, alignment constants, and bit-manipulation
4//! helpers that wrap Rust intrinsics for clarity and C++ API compatibility.
5
6// ---------------------------------------------------------------------------
7// Alignment constants
8// ---------------------------------------------------------------------------
9
/// Byte alignment used for SIMD buffers; 64 bytes corresponds to AVX-512.
pub const BYTE_ALIGNMENT: u32 = 64;

/// Base-2 logarithm of [`BYTE_ALIGNMENT`], obtained by counting its trailing
/// zero bits.
pub const LOG_BYTE_ALIGNMENT: u32 = u32::trailing_zeros(BYTE_ALIGNMENT);

/// Byte alignment used for heap-allocated objects.
pub const OBJECT_ALIGNMENT: u32 = 8;
18
19// ---------------------------------------------------------------------------
20// CPU extension levels — x86-64
21// ---------------------------------------------------------------------------
22
/// x86-64 SIMD capability tiers.
///
/// Variants are declared from least to most capable, so both the derived
/// ordering and the `i32` discriminant grow with capability.
#[repr(i32)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum CpuExtLevel {
    /// Baseline: no SIMD extensions.
    Generic = 0,
    /// MMX (legacy; rarely targeted).
    Mmx = 1,
    /// Streaming SIMD Extensions.
    Sse = 2,
    /// SSE2.
    Sse2 = 3,
    /// SSE3.
    Sse3 = 4,
    /// Supplemental SSE3 (SSSE3).
    Ssse3 = 5,
    /// SSE4.1.
    Sse41 = 6,
    /// SSE4.2.
    Sse42 = 7,
    /// AVX.
    Avx = 8,
    /// AVX2.
    Avx2 = 9,
    /// AVX2 together with FMA.
    Avx2Fma = 10,
    /// AVX-512 (at minimum the F, BW, CD, DQ and VL subsets).
    Avx512 = 11,
}
52
53// ---------------------------------------------------------------------------
54// CPU extension levels — ARM
55// ---------------------------------------------------------------------------
56
/// ARM SIMD capability tiers.
///
/// Variants are declared in ascending discriminant order, which is also the
/// order used by the derived comparison traits.
#[repr(i32)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum ArmCpuExtLevel {
    /// Baseline: no SIMD extensions.
    Generic = 0,
    /// NEON (also known as ASIMD).
    Neon = 1,
    /// Scalable Vector Extension (SVE).
    Sve = 2,
    /// Scalable Vector Extension 2 (SVE2).
    Sve2 = 3,
}
70
71// ---------------------------------------------------------------------------
72// Runtime CPU feature detection
73// ---------------------------------------------------------------------------
74
75/// Detects the highest supported x86-64 SIMD level at runtime.
76///
77/// On non-x86 targets this always returns `CpuExtLevel::Generic as i32`.
78#[cfg(target_arch = "x86_64")]
79pub fn get_cpu_ext_level() -> i32 {
80 // Probe from highest to lowest.
81 if is_x86_feature_detected!("avx512f")
82 && is_x86_feature_detected!("avx512bw")
83 && is_x86_feature_detected!("avx512cd")
84 && is_x86_feature_detected!("avx512dq")
85 && is_x86_feature_detected!("avx512vl")
86 {
87 return CpuExtLevel::Avx512 as i32;
88 }
89 if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
90 return CpuExtLevel::Avx2Fma as i32;
91 }
92 if is_x86_feature_detected!("avx2") {
93 return CpuExtLevel::Avx2 as i32;
94 }
95 if is_x86_feature_detected!("avx") {
96 return CpuExtLevel::Avx as i32;
97 }
98 if is_x86_feature_detected!("sse4.2") {
99 return CpuExtLevel::Sse42 as i32;
100 }
101 if is_x86_feature_detected!("sse4.1") {
102 return CpuExtLevel::Sse41 as i32;
103 }
104 if is_x86_feature_detected!("ssse3") {
105 return CpuExtLevel::Ssse3 as i32;
106 }
107 if is_x86_feature_detected!("sse3") {
108 return CpuExtLevel::Sse3 as i32;
109 }
110 if is_x86_feature_detected!("sse2") {
111 return CpuExtLevel::Sse2 as i32;
112 }
113 if is_x86_feature_detected!("sse") {
114 return CpuExtLevel::Sse as i32;
115 }
116 CpuExtLevel::Generic as i32
117}
118
/// Reports the SIMD tier for AArch64 builds.
///
/// No runtime probing takes place: the function unconditionally returns the
/// [`ArmCpuExtLevel::Neon`] discriminant, so SVE and SVE2 are never reported.
///
/// This definition is compiled only when `target_arch = "aarch64"`.
#[cfg(target_arch = "aarch64")]
pub fn get_cpu_ext_level() -> i32 {
    // NEON is treated as always present on aarch64.
    let level = ArmCpuExtLevel::Neon;
    level as i32
}
125
/// Generic fallback compiled for targets other than x86-64 and AArch64.
///
/// Always reports level `0`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
pub fn get_cpu_ext_level() -> i32 {
    // 0 is the discriminant of the `Generic` variant in both level enums.
    const NO_SIMD_LEVEL: i32 = 0;
    NO_SIMD_LEVEL
}
131
132// ---------------------------------------------------------------------------
133// Bit-manipulation helpers
134// ---------------------------------------------------------------------------
135
/// Returns how many bits of `val` are set to one.
#[inline]
pub const fn population_count(val: u32) -> u32 {
    u32::count_ones(val)
}
141
/// Returns the number of zero bits above the most significant set bit of a
/// 32-bit value; a zero input yields 32.
#[inline]
pub const fn count_leading_zeros(val: u32) -> u32 {
    u32::leading_zeros(val)
}
147
/// Returns the number of zero bits above the most significant set bit of a
/// 64-bit value; a zero input yields 64.
#[inline]
pub const fn count_leading_zeros_u64(val: u64) -> u32 {
    u64::leading_zeros(val)
}
153
/// Returns the number of zero bits below the least significant set bit of a
/// 32-bit value; a zero input yields 32.
#[inline]
pub const fn count_trailing_zeros(val: u32) -> u32 {
    u32::trailing_zeros(val)
}
159
160// ---------------------------------------------------------------------------
161// Rounding helpers
162// ---------------------------------------------------------------------------
163
/// Rounds `val` to the nearest integer with ties going away from zero,
/// mirroring the formula of the C++ `ojph_round`.
///
/// A bias of `+0.5` (for `val >= 0.0`) or `-0.5` (otherwise) is added in
/// `f32`, and the sum is then converted with an `as i32` cast, which
/// truncates toward zero, saturates at the `i32` bounds and maps NaN to `0`.
/// Because the bias is added in single precision, the sum itself is subject
/// to floating-point rounding.
#[inline]
pub fn ojph_round(val: f32) -> i32 {
    let bias: f32 = if val >= 0.0 { 0.5 } else { -0.5 };
    let biased = val + bias;
    biased as i32
}
170
/// Converts `val` to an integer by discarding its fractional part (rounding
/// toward zero), mirroring the C++ `ojph_trunc`.
///
/// The conversion is an `as i32` cast, so out-of-range inputs saturate at the
/// `i32` bounds and NaN becomes `0`.
#[inline]
pub fn ojph_trunc(val: f32) -> i32 {
    let truncated: i32 = val as i32;
    truncated
}
176
177// ---------------------------------------------------------------------------
178// Alignment helpers
179// ---------------------------------------------------------------------------
180
/// Returns the smallest multiple of `alignment` (in bytes) that can hold
/// `count` elements of type `T`.
///
/// This is the Rust equivalent of the C++ `calc_aligned_size<T>()` template.
///
/// The byte size `count * size_of::<T>()` is rounded up to the next multiple
/// of `alignment`. For power-of-two alignments the result equals the usual
/// `(size + align - 1) & !(align - 1)` mask formula; other non-zero
/// alignments are rounded up to a true multiple as well.
///
/// All arithmetic is overflow-checked, so an unrepresentable size is never
/// silently wrapped into a smaller one.
///
/// # Panics
///
/// Panics if `alignment` is zero, or if either the raw byte size or the
/// rounded-up size does not fit in a `usize`.
#[inline]
pub const fn calc_aligned_size<T>(count: usize, alignment: u32) -> usize {
    let align = alignment as usize;
    assert!(align != 0, "calc_aligned_size: alignment must be non-zero");

    // Checked multiply: a wrapped product would under-report the size needed.
    let byte_size = match count.checked_mul(std::mem::size_of::<T>()) {
        Some(bytes) => bytes,
        None => panic!("calc_aligned_size: byte size overflows usize"),
    };

    // Already a multiple of the alignment: no padding required.
    let remainder = byte_size % align;
    if remainder == 0 {
        return byte_size;
    }

    // Pad up to the next multiple, again refusing to wrap.
    match byte_size.checked_add(align - remainder) {
        Some(padded) => padded,
        None => panic!("calc_aligned_size: aligned size overflows usize"),
    }
}
191
192// ---------------------------------------------------------------------------
193// Tests
194// ---------------------------------------------------------------------------
195
#[cfg(test)]
mod tests {
    use super::*;

    /// Population count on a mixed pattern plus the all-zero and all-one cases.
    #[test]
    fn popcount() {
        assert_eq!(population_count(0b1010_1010), 4);
        assert_eq!(population_count(0), 0);
        assert_eq!(population_count(u32::MAX), 32);
    }

    /// Leading-zero counts for both widths, including a zero input.
    #[test]
    fn leading_zeros() {
        assert_eq!(count_leading_zeros(1), 31);
        assert_eq!(count_leading_zeros(0), 32);
        assert_eq!(count_leading_zeros_u64(1), 63);
        assert_eq!(count_leading_zeros_u64(0), 64);
    }

    /// Trailing-zero counts, including a zero input.
    #[test]
    fn trailing_zeros() {
        assert_eq!(count_trailing_zeros(8), 3);
        assert_eq!(count_trailing_zeros(1), 0);
        assert_eq!(count_trailing_zeros(0), 32);
    }

    /// Rounding goes to nearest with ties away from zero; truncation goes
    /// toward zero. Both signs are exercised.
    #[test]
    fn round_trunc() {
        assert_eq!(ojph_round(2.3), 2);
        assert_eq!(ojph_round(2.7), 3);
        assert_eq!(ojph_round(-2.3), -2);
        assert_eq!(ojph_round(-2.7), -3);
        assert_eq!(ojph_round(0.5), 1);
        assert_eq!(ojph_round(-0.5), -1);
        assert_eq!(ojph_trunc(2.9), 2);
        assert_eq!(ojph_trunc(-2.9), -2);
    }

    /// Sizes are rounded up to the alignment; exact multiples and empty
    /// buffers are left unchanged.
    #[test]
    fn aligned_size() {
        // 10 i32 = 40 bytes → next multiple of 64 = 64
        assert_eq!(calc_aligned_size::<i32>(10, 64), 64);
        // 20 i32 = 80 bytes → next multiple of 64 = 128
        assert_eq!(calc_aligned_size::<i32>(20, 64), 128);
        // 16 i32 = 64 bytes → already a multiple of 64
        assert_eq!(calc_aligned_size::<i32>(16, 64), 64);
        // 0 elements need 0 bytes
        assert_eq!(calc_aligned_size::<i32>(0, 64), 0);
    }

    /// The detected level lies within the range of defined levels and does
    /// not change between calls.
    #[test]
    fn cpu_detection_runs() {
        let lowest = CpuExtLevel::Generic as i32;
        let highest = CpuExtLevel::Avx512 as i32;
        let level = get_cpu_ext_level();
        assert!((lowest..=highest).contains(&level));
        assert_eq!(level, get_cpu_ext_level());
    }
}
238}