use super::Histogram;
use core::mem::size_of;
use core::ops::{Deref, DerefMut};

/// A byte histogram: 256 `u32` counters, one per possible byte value.
#[repr(C)]
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)]
pub struct Histogram32 {
    pub inner: Histogram<u32>,
}

impl Default for Histogram<u32> {
    fn default() -> Self {
        Histogram { counter: [0; 256] }
    }
}

impl Deref for Histogram32 {
    type Target = Histogram<u32>;

    fn deref(&self) -> &Self::Target {
        &self.inner
    }
}

impl DerefMut for Histogram32 {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.inner
    }
}

impl Histogram32 {
    /// Builds a histogram by counting the occurrences of every byte in `bytes`.
    pub fn from_bytes(bytes: &[u8]) -> Self {
        let mut histogram = Histogram32::default();
        histogram32_from_bytes(bytes, &mut histogram);
        histogram
    }
}
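
// A minimal usage sketch for `from_bytes` and the `Deref` impls above; the
// module name and input literal are illustrative additions, not part of the
// original test suite.
#[cfg(test)]
mod usage_sketch_tests {
    use super::*;

    #[test]
    fn from_bytes_counts_and_derefs() {
        let hist = Histogram32::from_bytes(b"abracadabra");
        // Field access reaches `Histogram<u32>::counter` through `Deref`.
        assert_eq!(hist.counter[b'a' as usize], 5);
        assert_eq!(hist.counter[b'b' as usize], 2);
        assert_eq!(hist.counter[b'z' as usize], 0);
    }
}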

/// Fills `hist` with the byte counts of `bytes`.
pub fn histogram32_from_bytes(bytes: &[u8], hist: &mut Histogram32) {
    // Short inputs take the plain scalar loop; the batched path only pays
    // off once there is enough data to amortize its setup.
    if bytes.len() < 64 {
        histogram32_reference(bytes, hist)
    } else {
        histogram32_generic_batched_unroll_4_u32(bytes, hist)
    }
}
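
// A sanity sketch for the dispatcher itself (module name and size list are
// assumptions, not original code): inputs on both sides of the 64-byte
// cutoff should match the reference loop exactly.
#[cfg(test)]
mod dispatch_sketch_tests {
    use super::*;
    use std::vec::Vec;

    #[test]
    fn dispatcher_matches_reference_around_cutoff() {
        for size in [0usize, 1, 63, 64, 65, 256] {
            let data: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();
            let mut via_dispatch = Histogram32::default();
            let mut via_reference = Histogram32::default();
            histogram32_from_bytes(&data, &mut via_dispatch);
            histogram32_reference(&data, &mut via_reference);
            assert_eq!(via_dispatch.inner.counter, via_reference.inner.counter);
        }
    }
}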

pub(crate) fn histogram32_generic_batched_unroll_4_u32(bytes: &[u8], histogram: &mut Histogram32) {
    if bytes.is_empty() {
        return;
    }

    unsafe {
        let histo_ptr = histogram.inner.counter.as_mut_ptr();
        let mut current_ptr = bytes.as_ptr() as *const u32;
        let ptr_end = bytes.as_ptr().add(bytes.len());

        // Round the length down to a multiple of 16 bytes (four u32 reads per
        // iteration); the remainder is handled by the scalar tail loop below.
        let ptr_end_unroll = bytes
            .as_ptr()
            .add(bytes.len() & !(4 * size_of::<u32>() - 1))
            as *const u32;

        #[cfg(all(target_arch = "x86_64", feature = "std"))]
        if std::is_x86_feature_detected!("bmi1") {
            if current_ptr < ptr_end_unroll {
                process_four_u32_bmi(histo_ptr, &mut current_ptr, ptr_end_unroll);
            }
        } else if current_ptr < ptr_end_unroll {
            process_four_u32_generic(histo_ptr, &mut current_ptr, ptr_end_unroll);
        }

        #[cfg(all(target_arch = "x86", feature = "nightly", feature = "std"))]
        if std::is_x86_feature_detected!("bmi1") {
            if current_ptr < ptr_end_unroll {
                process_four_u32_bmi(histo_ptr, &mut current_ptr, ptr_end_unroll);
            }
        } else if current_ptr < ptr_end_unroll {
            process_four_u32_generic(histo_ptr, &mut current_ptr, ptr_end_unroll);
        }

        #[cfg(not(any(
            all(target_arch = "x86_64", feature = "std"),
            all(target_arch = "x86", feature = "nightly", feature = "std")
        )))]
        if current_ptr < ptr_end_unroll {
            process_four_u32_generic(histo_ptr, &mut current_ptr, ptr_end_unroll);
        }

        // Scalar tail: count any bytes left over after the unrolled batches.
        let mut current_ptr = current_ptr as *const u8;
        while current_ptr < ptr_end {
            let byte = *current_ptr;
            current_ptr = current_ptr.add(1);
            *histo_ptr.add(byte as usize) += 1;
        }
    }
}
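
// A worked example of the rounding mask above (an added sketch, not original
// code): `!(4 * size_of::<u32>() - 1)` is `!15`, so lengths round down to the
// 16-byte batch size consumed per unrolled iteration.
#[cfg(test)]
mod unroll_mask_tests {
    use core::mem::size_of;

    #[test]
    fn mask_rounds_length_down_to_16_bytes() {
        for (len, expected) in [(0usize, 0usize), (15, 0), (16, 16), (63, 48), (64, 64), (70, 64)] {
            assert_eq!(len & !(4 * size_of::<u32>() - 1), expected);
        }
    }
}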

/// BMI1 core for x86_64: consumes 16 bytes (four u32 loads) per iteration,
/// splitting each u32 into its four bytes and incrementing their counters.
#[inline(never)]
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1")]
unsafe extern "sysv64" fn process_four_u32_bmi(
    histo_ptr: *mut u32,
    values_ptr: &mut *const u32,
    ptr_end_unroll: *const u32,
) {
    core::arch::asm!(
        // rbp cannot be named as an asm operand, so save it and use it as scratch.
        "push rbp",
        "2:",
        // Load four u32s and advance the cursor one 16-byte batch.
        "mov {eax:e}, [{cur_ptr}]",
        "mov {ebx:e}, [{cur_ptr} + 4]",
        "mov {ecx:e}, [{cur_ptr} + 8]",
        "mov {edx:e}, [{cur_ptr} + 12]",
        "add {cur_ptr}, 16",
        // First u32: bytes 0/1 via movzx, byte 2 via bextr, byte 3 via shr.
        "movzx {tmp_e:e}, {eax:l}",
        "movzx ebp, {eax:h}",
        "inc dword ptr [{hist_ptr} + 4*{tmp_e:r}]",
        "bextr {tmp_e:e}, {eax:e}, {bextr_pat:e}",
        "shr {eax:e}, 24",
        "inc dword ptr [{hist_ptr} + 4*rbp]",
        "inc dword ptr [{hist_ptr} + 4*{tmp_e:r}]",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",

        // Second u32.
        "movzx {eax:e}, {ebx:l}",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "movzx {eax:e}, {ebx:h}",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "bextr {eax:e}, {ebx:e}, {bextr_pat:e}",
        "shr {ebx:e}, 24",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "inc dword ptr [{hist_ptr} + 4*{ebx:r}]",

        // Third u32.
        "movzx {eax:e}, {ecx:l}",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "movzx {eax:e}, {ecx:h}",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "bextr {eax:e}, {ecx:e}, {bextr_pat:e}",
        "shr {ecx:e}, 24",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "inc dword ptr [{hist_ptr} + 4*{ecx:r}]",

        // Fourth u32.
        "movzx {eax:e}, {edx:l}",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "movzx {eax:e}, {edx:h}",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "bextr {eax:e}, {edx:e}, {bextr_pat:e}",
        "shr {edx:e}, 24",
        "inc dword ptr [{hist_ptr} + 4*{eax:r}]",
        "inc dword ptr [{hist_ptr} + 4*{edx:r}]",

        "cmp {cur_ptr}, {end_ptr}",
        "jb 2b",
        "pop rbp",

        cur_ptr = inout(reg) *values_ptr,
        hist_ptr = in(reg) histo_ptr,
        end_ptr = in(reg) ptr_end_unroll,
        // 2064 == 0x0810: BEXTR control word for "start at bit 16, length 8".
        bextr_pat = in(reg) 2064u32,
        eax = out(reg_abcd) _,
        ebx = out(reg_abcd) _,
        ecx = out(reg_abcd) _,
        edx = out(reg_abcd) _,
        tmp_e = out(reg) _,
        // No `nostack` option: the block pushes/pops rbp, so it does use the stack.
    );
}
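
// Why the control word is 2064: BEXTR encodes the start bit in its low byte
// and the field length in the next byte, so 2064 == 0x0810 means "start at
// bit 16, take 8 bits", i.e. extract byte 2 of a u32 in one instruction. A
// portable sketch of that encoding (an added test, not original code):
#[cfg(test)]
mod bextr_pattern_tests {
    #[test]
    fn control_word_2064_extracts_byte_two() {
        let control: u32 = 2064; // 0x0810
        let start = control & 0xFF; // 16
        let len = (control >> 8) & 0xFF; // 8
        let x: u32 = 0xAABB_CCDD;
        let extracted = (x >> start) & ((1u32 << len) - 1);
        assert_eq!(extracted, 0xBB); // same as (x >> 16) & 0xFF
    }
}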

/// 32-bit x86 variant, written as a naked stdcall function: arguments are
/// read straight off the stack rather than through `asm!` operands, leaving
/// all eight GPRs free for the loop itself.
#[cfg(all(target_arch = "x86", feature = "nightly"))]
#[unsafe(naked)]
#[target_feature(enable = "bmi1")]
unsafe extern "stdcall" fn process_four_u32_bmi(
    histo_ptr: *mut u32,
    values_ptr: &mut *const u32,
    ptr_end_unroll: *const u32,
) {
    core::arch::naked_asm!(
        // Save the callee-saved registers this routine clobbers.
        "push ebp",
        "push ebx",
        "push edi",
        "push esi",
        // One scratch stack slot (holds the fourth u32 while edi is busy).
        "push eax",
        // After 5 pushes + return address: [esp + 24] = histo_ptr,
        // [esp + 28] = values_ptr, [esp + 32] = ptr_end_unroll.
        "mov eax, dword ptr [esp + 28]",
        "mov esi, dword ptr [esp + 24]",
        "mov edx, dword ptr [eax]",
        ".p2align 4, 0x90",
        "2:",
        // Load four u32s; park the fourth in the stack slot because edi also
        // holds the BEXTR control word (2064 = 0x0810: start 16, length 8).
        "mov eax, dword ptr [edx]",
        "mov edi, dword ptr [edx + 12]",
        "mov ecx, dword ptr [edx + 4]",
        "mov ebx, dword ptr [edx + 8]",
        "add edx, 16",
        "movzx ebp, al",
        "mov dword ptr [esp], edi",
        "mov edi, 2064",
        "inc dword ptr [esi + 4*ebp]",
        "movzx ebp, ah",
        "inc dword ptr [esi + 4*ebp]",
        "bextr ebp, eax, edi",
        "shr eax, 24",
        "inc dword ptr [esi + 4*ebp]",
        "inc dword ptr [esi + 4*eax]",
        // Second u32 (in ecx).
        "movzx eax, cl",
        "inc dword ptr [esi + 4*eax]",
        "movzx eax, ch",
        "inc dword ptr [esi + 4*eax]",
        "bextr eax, ecx, edi",
        "shr ecx, 24",
        "inc dword ptr [esi + 4*eax]",
        "inc dword ptr [esi + 4*ecx]",
        // Third u32 (in ebx); reload the fourth from the stack slot into ecx.
        "mov ecx, dword ptr [esp]",
        "movzx eax, bl",
        "inc dword ptr [esi + 4*eax]",
        "movzx eax, bh",
        "inc dword ptr [esi + 4*eax]",
        "bextr eax, ebx, edi",
        "shr ebx, 24",
        "inc dword ptr [esi + 4*eax]",
        "inc dword ptr [esi + 4*ebx]",
        // Fourth u32 (now in ecx).
        "movzx eax, cl",
        "inc dword ptr [esi + 4*eax]",
        "movzx eax, ch",
        "inc dword ptr [esi + 4*eax]",
        "bextr eax, ecx, edi",
        "shr ecx, 24",
        "inc dword ptr [esi + 4*eax]",
        "inc dword ptr [esi + 4*ecx]",
        "cmp edx, dword ptr [esp + 32]",
        "jb 2b",
        // Write the advanced cursor back through values_ptr, then unwind.
        "mov eax, dword ptr [esp + 28]",
        "mov dword ptr [eax], edx",
        "add esp, 4",
        "pop esi",
        "pop edi",
        "pop ebx",
        "pop ebp",
        // stdcall: the callee pops its 12 bytes of arguments.
        "ret 12",
    );
}

/// Portable unrolled core: processes four u32s (16 bytes) per iteration with
/// shifts and masks, for targets/builds without the BMI1 assembly path.
#[inline(never)]
unsafe extern "C" fn process_four_u32_generic(
    histo_ptr: *mut u32,
    values_ptr: &mut *const u32,
    ptr_end_unroll: *const u32,
) {
    // Do-while loop: the caller guarantees at least one full batch of four u32s.
    while {
        let value1 = values_ptr.read_unaligned();
        let value2 = values_ptr.add(1).read_unaligned();
        let value3 = values_ptr.add(2).read_unaligned();
        let value4 = values_ptr.add(3).read_unaligned();

        *histo_ptr.add((value1 & 0xFF) as usize) += 1;
        *histo_ptr.add(((value1 >> 8) & 0xFF) as usize) += 1;
        *histo_ptr.add(((value1 >> 16) & 0xFF) as usize) += 1;
        *histo_ptr.add((value1 >> 24) as usize) += 1;

        *histo_ptr.add((value2 & 0xFF) as usize) += 1;
        *histo_ptr.add(((value2 >> 8) & 0xFF) as usize) += 1;
        *histo_ptr.add(((value2 >> 16) & 0xFF) as usize) += 1;
        *histo_ptr.add((value2 >> 24) as usize) += 1;

        *histo_ptr.add((value3 & 0xFF) as usize) += 1;
        *histo_ptr.add(((value3 >> 8) & 0xFF) as usize) += 1;
        *histo_ptr.add(((value3 >> 16) & 0xFF) as usize) += 1;
        *histo_ptr.add((value3 >> 24) as usize) += 1;

        *histo_ptr.add((value4 & 0xFF) as usize) += 1;
        *histo_ptr.add(((value4 >> 8) & 0xFF) as usize) += 1;
        *histo_ptr.add(((value4 >> 16) & 0xFF) as usize) += 1;
        *histo_ptr.add((value4 >> 24) as usize) += 1;

        *values_ptr = values_ptr.add(4);
        *values_ptr < ptr_end_unroll
    } {}
}

/// Simple scalar reference implementation: one byte per iteration.
pub(crate) fn histogram32_reference(bytes: &[u8], histogram: &mut Histogram32) {
    let histo_ptr = histogram.inner.counter.as_mut_ptr();
    let mut current_ptr = bytes.as_ptr();
    let ptr_end = unsafe { current_ptr.add(bytes.len()) };

    unsafe {
        while current_ptr < ptr_end {
            let byte = *current_ptr;
            current_ptr = current_ptr.add(1);
            *histo_ptr.add(byte as usize) += 1;
        }
    }
}
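
// A safe-Rust restatement of the reference loop, added here as a sketch for
// clarity (module and helper names are assumptions, not original code): a
// `u8` index can never exceed 255, so the array accesses cannot fail.
#[cfg(test)]
mod reference_equivalence_tests {
    use super::*;
    use std::vec::Vec;

    fn histogram32_reference_safe(bytes: &[u8], histogram: &mut Histogram32) {
        for &byte in bytes {
            histogram.inner.counter[byte as usize] += 1;
        }
    }

    #[test]
    fn pointer_walk_matches_safe_loop() {
        let data: Vec<u8> = (0..1024).map(|i| (i * 31 % 256) as u8).collect();
        let mut a = Histogram32::default();
        let mut b = Histogram32::default();
        histogram32_reference(&data, &mut a);
        histogram32_reference_safe(&data, &mut b);
        assert_eq!(a.inner.counter, b.inner.counter);
    }
}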

#[cfg(test)]
mod reference_tests {
    use super::*;
    use std::vec::Vec;

    #[test]
    fn verify_full_range_in_reference_impl() {
        let input: Vec<u8> = (0..=255).collect();
        let mut histogram = Histogram32::default();
        histogram32_reference(&input, &mut histogram);

        for count in histogram.inner.counter.iter() {
            assert_eq!(*count, 1);
        }
    }
}

#[cfg(test)]
mod alternative_implementation_tests {
    use super::*;
    use crate::histogram::histogram32_private::*;
    use rstest::rstest;
    use std::vec::Vec;

    fn generate_test_data(size: usize) -> Vec<u8> {
        (0..size).map(|i| (i % 256) as u8).collect()
    }

    #[rstest]
    #[case::batched_u32(histogram32_generic_batched_u32)]
    #[case::batched_u64(histogram32_generic_batched_u64)]
    #[case::batched_unroll2_u32(histogram32_generic_batched_unroll_2_u32)]
    #[case::batched_unroll2_u64(histogram32_generic_batched_unroll_2_u64)]
    #[case::batched_unroll4_u32(histogram32_generic_batched_unroll_4_u32)]
    #[case::batched_unroll4_u64(histogram32_generic_batched_unroll_4_u64)]
    #[case::nonaliased_withruns(histogram_nonaliased_withruns_core)]
    fn test_against_reference(#[case] implementation: fn(&[u8], &mut Histogram32)) {
        // Sizes span empty input through several full unroll batches plus tails.
        for size in 0..=767 {
            let test_data = generate_test_data(size);

            let mut implementation_result = Histogram32::default();
            let mut reference_result = Histogram32::default();
            implementation(&test_data, &mut implementation_result);
            histogram32_reference(&test_data, &mut reference_result);

            assert_eq!(
                implementation_result.inner.counter, reference_result.inner.counter,
                "Implementation failed for size {size}"
            );
        }
    }
}