1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
//! Software prefetch helpers for SIMD CRC kernels.
//!
// SAFETY: This module provides low-level prefetch intrinsics that require unsafe.
// Prefetch instructions are hints to the CPU and cannot cause memory unsafety;
// invalid addresses are silently ignored.
//! This module provides platform-tuned prefetch constants and inline helpers
//! for optimal memory access patterns in large-buffer CRC computation.
//!
//! # Background
//!
//! Modern CPUs have hardware prefetchers that work well for sequential access,
//! but software prefetch hints can still provide 5-15% gains in tight loops by:
//! - Reducing cache miss stalls when hardware prefetch falls behind
//! - Ensuring data arrives in L1 before the CPU needs it
//! - Working better with double-unrolled loops that process larger chunks
//!
//! # Prefetch Distance Tuning
//!
//! The optimal prefetch distance depends on:
//! - Memory latency (~70-100 cycles on modern x86, ~60-80 cycles on ARM)
//! - Loop iteration time (cycles per block processed)
//! - Cache line size (64 bytes on all modern platforms)
//!
//! Formula: `prefetch_distance = (memory_latency / cycles_per_block) * block_size`
//!
//! For a kernel processing 256B blocks at ~80 GiB/s on a 4GHz CPU:
//! - Time per block: 256B / 80GiB/s ≈ 3ns ≈ 12 cycles
//! - With 80-cycle memory latency: 80/12 * 256B ≈ 1.7KB
//! - Practical value: 512B-1KB (2-4 blocks ahead)
//!
//! # Usage Pattern
//!
//! ```text
//! use crate::checksum::common::prefetch::{prefetch_read_l1, LARGE_BLOCK_DISTANCE};
//!
//! // In a double-unrolled loop processing 512B per iteration:
//! while ptr.add(DOUBLE_BLOCK) <= end {
//!     // Prefetch 2 iterations ahead (1KB for 512B blocks)
//!     prefetch_read_l1(ptr.add(LARGE_BLOCK_DISTANCE));
//!
//!     // Process first 256B block
//!     // ... fold operations ...
//!
//!     // Process second 256B block
//!     // ... fold operations ...
//!
//!     ptr = ptr.add(DOUBLE_BLOCK);
//! }
//! ```
// ─────────────────────────────────────────────────────────────────────────────
// Platform-Tuned Constants
// ─────────────────────────────────────────────────────────────────────────────
/// Prefetch distance for large buffer kernels (xl size, 1MB+) on x86-64.
///
/// Tuned for double-unrolled loops processing 512B per iteration.
/// Value: 1024 bytes (2 iterations ahead).
///
/// Compiled only for `x86_64` targets, so that exactly one definition of
/// this name is visible per build.
///
/// # Rationale
/// - At 80 GiB/s, 512B takes ~6ns ≈ 24 cycles at 4GHz
/// - Memory latency ~80-100 cycles on Zen4/Ice Lake
/// - 100 cycles / 24 cycles ≈ 4 iterations, but 2 iterations (1KB) is the
///   practical sweet spot
/// - Prefetching too far ahead wastes L1 cache space
#[cfg(target_arch = "x86_64")]
pub const LARGE_BLOCK_DISTANCE: usize = 1024;
/// Prefetch distance for large buffer kernels on ARM64.
///
/// Tuned for Graviton2/3 and Apple Silicon.
/// Value: 768 bytes (3 iterations ahead for 256B blocks).
///
/// Compiled only for `aarch64` targets, so that exactly one definition of
/// this name is visible per build.
///
/// # Rationale
/// - Graviton2: ~60-70 cycle memory latency, narrower memory bus than x86
/// - Apple M1-M3: Excellent hardware prefetch, but software hints still help
/// - ARM NEON processes 128B blocks, so 768B = 6 blocks ahead
#[cfg(target_arch = "aarch64")]
pub const LARGE_BLOCK_DISTANCE: usize = 768;
// ─────────────────────────────────────────────────────────────────────────────
// x86-64 Prefetch Intrinsics
// ─────────────────────────────────────────────────────────────────────────────
// ─────────────────────────────────────────────────────────────────────────────
// ARM64 Prefetch Intrinsics
// ─────────────────────────────────────────────────────────────────────────────
// ─────────────────────────────────────────────────────────────────────────────
// Public API
// ─────────────────────────────────────────────────────────────────────────────
pub use prefetch_read_l1;
pub use prefetch_read_l1;
// Fallback for other architectures (no-op)
/// Prefetch distance for architectures without a tuned value.
///
/// Value: 512 bytes.
///
/// Compiled only when the target is neither `x86_64` nor `aarch64`, so that
/// exactly one definition of this name is visible per build.
// NOTE(review): the "(no-op)" remark above presumably describes the fallback
// prefetch function rather than this constant — confirm against the full file.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
pub const LARGE_BLOCK_DISTANCE: usize = 512;
/// No-op prefetch fallback used only by tests on unsupported architectures.
///
/// # Safety
///
/// This function performs no memory access and is always safe to call.
pub unsafe
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────