oximedia_timecode/simd_manchester.rs
1//! SIMD-accelerated Manchester encoding and decoding for LTC bitstreams.
2//!
3//! Manchester coding maps each data bit to two half-bit audio periods:
4//! - Bit `0`: **high** half-period followed by **low** half-period
5//! - Bit `1`: **low** half-period followed by **high** half-period
6//!
7//! The audio samples are `i16`: `HIGH_LEVEL` and `LOW_LEVEL`.
8//!
9//! # SIMD strategy
10//! On `x86_64` targets with the `avx2` CPU feature available at runtime,
11//! [`manchester_encode_simd`] processes 32 input bytes (= 256 bits) per
12//! iteration using `_mm256_movemask_epi8` to extract the sign bit of each
13//! byte, then expands each bit into two `i16` samples. All other targets (and
14//! AVX2-capable targets without the feature at runtime) fall back to a pure
15//! scalar path that produces identical output.
16
17#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
18#![allow(unsafe_code)] // AVX2 intrinsics require unsafe blocks
19
20/// PCM level for the high half-period.
21pub const HIGH_LEVEL: i16 = i16::MAX;
22/// PCM level for the low half-period.
23pub const LOW_LEVEL: i16 = i16::MIN;
24
25// ── Encoding ──────────────────────────────────────────────────────────────────
26
27/// Encode `bits` using Manchester coding, returning PCM audio samples.
28///
29/// Each byte in `bits` is treated as a single data bit (non-zero ⇒ `1`,
30/// zero ⇒ `0`). The returned `Vec<i16>` has exactly `bits.len() × 2`
31/// elements: two samples (high–low or low–high) per input bit.
32///
33/// On `x86_64` CPUs that report `avx2` at runtime the AVX2 fast-path is
34/// taken; otherwise the scalar path is used. Both paths produce identical
35/// output.
36pub fn manchester_encode_simd(bits: &[u8]) -> Vec<i16> {
37 #[cfg(target_arch = "x86_64")]
38 {
39 if is_x86_feature_detected!("avx2") {
40 return manchester_encode_avx2(bits);
41 }
42 }
43 manchester_encode_scalar(bits)
44}
45
46/// Scalar Manchester encoding — reference implementation.
47///
48/// Exposed publicly so callers can force the scalar path in tests.
49pub fn manchester_encode_scalar(bits: &[u8]) -> Vec<i16> {
50 let mut out = Vec::with_capacity(bits.len() * 2);
51 for &b in bits {
52 if b == 0 {
53 // Bit 0: high then low
54 out.push(HIGH_LEVEL);
55 out.push(LOW_LEVEL);
56 } else {
57 // Bit 1: low then high
58 out.push(LOW_LEVEL);
59 out.push(HIGH_LEVEL);
60 }
61 }
62 out
63}
64
65/// AVX2 Manchester encoding path — processes 32 bytes per loop iteration.
66///
67/// Only compiled and called on `x86_64` targets at runtime when AVX2 is
68/// available. On other targets the function is not compiled at all.
69#[cfg(target_arch = "x86_64")]
70fn manchester_encode_avx2(bits: &[u8]) -> Vec<i16> {
71 use std::arch::x86_64::*;
72
73 let mut out = Vec::with_capacity(bits.len() * 2);
74
75 // Process 32 bytes at a time using AVX2.
76 let chunks = bits.chunks_exact(32);
77 let remainder = chunks.remainder();
78
79 for chunk in chunks {
80 // SAFETY: we check for avx2 at runtime before calling this function.
81 unsafe {
82 // Load 32 bytes into a 256-bit register.
83 let v = _mm256_loadu_si256(chunk.as_ptr().cast::<__m256i>());
84
85 // Detect non-zero bytes using an unsigned-safe approach:
86 // cmpeq produces 0xFF where the byte == 0, 0x00 where byte != 0.
87 // Inverting via andnot(eq_zero, all_ones) gives 0xFF for non-zero
88 // bytes regardless of the sign bit, covering the full 0–255 range.
89 let zero = _mm256_setzero_si256();
90 let all_ones = _mm256_set1_epi8(-1_i8);
91 // andnot(a, b) = (!a) & b; here gives 0xFF for lanes where v != 0.
92 let nonzero_mask: u32 =
93 _mm256_movemask_epi8(_mm256_andnot_si256(_mm256_cmpeq_epi8(v, zero), all_ones))
94 as u32;
95
96 // Each bit in nonzero_mask corresponds to one input byte (LSB = lane 0).
97 for lane in 0..32usize {
98 let is_one = (nonzero_mask >> lane) & 1;
99 if is_one == 0 {
100 // Bit 0: high then low
101 out.push(HIGH_LEVEL);
102 out.push(LOW_LEVEL);
103 } else {
104 // Bit 1: low then high
105 out.push(LOW_LEVEL);
106 out.push(HIGH_LEVEL);
107 }
108 }
109 }
110 }
111
112 // Handle the remaining bytes with the scalar path.
113 for &b in remainder {
114 if b == 0 {
115 out.push(HIGH_LEVEL);
116 out.push(LOW_LEVEL);
117 } else {
118 out.push(LOW_LEVEL);
119 out.push(HIGH_LEVEL);
120 }
121 }
122
123 out
124}
125
126// ── Decoding ──────────────────────────────────────────────────────────────────
127
128/// Decode Manchester-coded audio `samples` back to a bit stream.
129///
130/// Each pair of consecutive samples is examined:
131/// - `(HIGH, LOW)` → bit `0`
132/// - `(LOW, HIGH)` → bit `1`
133/// - Ambiguous / noisy pair → bit value is determined by the majority level
134/// (the sample further from zero wins).
135///
136/// `threshold` is the *absolute* amplitude level that distinguishes a
137/// meaningful HIGH/LOW from noise near zero. Typical value: `8192` for
138/// 25% of full-scale `i16`.
139///
140/// Returns one `u8` per decoded bit (value `0` or `1`). If `samples.len()`
141/// is odd the trailing unpaired sample is ignored.
142pub fn manchester_decode_simd(samples: &[i16], threshold: i16) -> Vec<u8> {
143 #[cfg(target_arch = "x86_64")]
144 {
145 if is_x86_feature_detected!("avx2") {
146 return manchester_decode_avx2(samples, threshold);
147 }
148 }
149 manchester_decode_scalar(samples, threshold)
150}
151
152/// Scalar Manchester decoding — reference implementation.
153pub fn manchester_decode_scalar(samples: &[i16], threshold: i16) -> Vec<u8> {
154 let mut out = Vec::with_capacity(samples.len() / 2);
155 let mut i = 0;
156 while i + 1 < samples.len() {
157 let first = samples[i];
158 let second = samples[i + 1];
159
160 let first_positive = first >= threshold;
161 let first_negative = first <= -threshold;
162 let second_positive = second >= threshold;
163 let second_negative = second <= -threshold;
164
165 let bit = if first_positive && second_negative {
166 // (HIGH, LOW) → bit 0
167 0u8
168 } else if first_negative && second_positive {
169 // (LOW, HIGH) → bit 1
170 1u8
171 } else {
172 // Ambiguous: use magnitude to decide.
173 // The half with the larger absolute value determines the polarity.
174 if first.unsigned_abs() >= second.unsigned_abs() {
175 // First sample dominates
176 if first >= 0 {
177 0u8 // First=HIGH → bit 0
178 } else {
179 1u8 // First=LOW → bit 1
180 }
181 } else {
182 // Second sample dominates
183 if second >= 0 {
184 1u8 // Second=HIGH → bit 1
185 } else {
186 0u8 // Second=LOW → bit 0
187 }
188 }
189 };
190
191 out.push(bit);
192 i += 2;
193 }
194 out
195}
196
197/// AVX2 Manchester decoding — processes 16 interleaved sample-pairs per iter.
198///
199/// Manchester samples are interleaved: `[f0, s0, f1, s1, ..., f_k, s_k, ...]`.
200/// Each iteration loads 32 consecutive i16 (= 16 pairs) and deinterleaves them
201/// using `_mm256_srli_epi32` so that `firsts` holds `f_k` and `seconds` holds
202/// `s_k` in the low 16 bits of each i32 word. The high 16 bits remain zero,
203/// which prevents false comparisons.
204#[cfg(target_arch = "x86_64")]
205fn manchester_decode_avx2(samples: &[i16], threshold: i16) -> Vec<u8> {
206 use std::arch::x86_64::*;
207
208 let mut out = Vec::with_capacity(samples.len() / 2);
209
210 // 16 pairs per AVX2 iteration: each iteration consumes 32 i16 elements.
211 let pair_count = samples.len() / 2;
212 let avx_pairs_per_iter = 16usize;
213 let full_iters = pair_count / avx_pairs_per_iter;
214 let remainder_start = full_iters * avx_pairs_per_iter * 2; // in i16 elements
215
216 unsafe {
217 let thresh_vec = _mm256_set1_epi16(threshold);
218 let neg_thresh_vec = _mm256_set1_epi16(-threshold);
219 // Mask to isolate the low 16 bits of each i32 word (= the first sample
220 // of each pair after casting the interleaved stream to i32 words).
221 let mask_lo16 = _mm256_set1_epi32(0x0000_FFFF_u32 as i32);
222 // Pre-compute (threshold - 1) so we can use strict cmpgt for >=.
223 let thresh_m1 = _mm256_sub_epi16(thresh_vec, _mm256_set1_epi16(1));
224 let neg_thresh_m1 = _mm256_sub_epi16(neg_thresh_vec, _mm256_set1_epi16(1));
225
226 for iter in 0..full_iters {
227 // base is an i16-element offset; each iteration spans 32 i16.
228 let base = iter * avx_pairs_per_iter * 2;
229
230 // Load 32 consecutive i16: pairs 0–7 in raw_lo, pairs 8–15 in raw_hi.
231 // Memory layout (little-endian i16): [f0, s0, f1, s1, ..., f7, s7, f8, s8, ...]
232 let raw_lo = _mm256_loadu_si256(samples[base..].as_ptr().cast::<__m256i>());
233 let raw_hi = _mm256_loadu_si256(samples[base + 16..].as_ptr().cast::<__m256i>());
234
235 // Deinterleave: treat each i32 word as (s_k << 16 | f_k) on
236 // little-endian. AND with mask isolates f_k in i16[2k]; shift
237 // right 16 brings s_k into i16[2k]. Odd lanes (i16[2k+1]) are
238 // zero in both cases, so they never trigger threshold comparisons.
239 let firsts_lo = _mm256_and_si256(raw_lo, mask_lo16);
240 let firsts_hi = _mm256_and_si256(raw_hi, mask_lo16);
241 let seconds_lo = _mm256_srli_epi32(raw_lo, 16);
242 let seconds_hi = _mm256_srli_epi32(raw_hi, 16);
243
244 // first >= threshold ⟺ first > (threshold - 1)
245 let first_high_lo = _mm256_cmpgt_epi16(firsts_lo, thresh_m1);
246 let first_high_hi = _mm256_cmpgt_epi16(firsts_hi, thresh_m1);
247 // second <= -threshold ⟺ (−threshold − 1) > second
248 let second_low_lo = _mm256_cmpgt_epi16(neg_thresh_m1, seconds_lo);
249 let second_low_hi = _mm256_cmpgt_epi16(neg_thresh_m1, seconds_hi);
250
251 // Both conditions → (HIGH, LOW) = bit 0.
252 let bit0_lo = _mm256_and_si256(first_high_lo, second_low_lo);
253 let bit0_hi = _mm256_and_si256(first_high_hi, second_low_hi);
254
255 // movemask_epi8 produces one bit per byte. Each i16 lane occupies
256 // 2 bytes; i16[2k] occupies bytes 4k and 4k+1 (little-endian i32).
257 // The sign bit (= compare result) of i16[2k] appears at bit 4k.
258 let bits_lo: u32 = _mm256_movemask_epi8(bit0_lo) as u32;
259 let bits_hi: u32 = _mm256_movemask_epi8(bit0_hi) as u32;
260
261 // Extract 8 decisions from each half (pairs 0–7 from bits_lo,
262 // pairs 8–15 from bits_hi). Bit 4k in the mask corresponds to
263 // pair k in this half.
264 for k in 0..8usize {
265 let is_bit0 = (bits_lo >> (k * 4)) & 1;
266 out.push(if is_bit0 != 0 { 0u8 } else { 1u8 });
267 }
268 for k in 0..8usize {
269 let is_bit0 = (bits_hi >> (k * 4)) & 1;
270 out.push(if is_bit0 != 0 { 0u8 } else { 1u8 });
271 }
272 }
273 }
274
275 // Scalar fallback for the remaining pairs (< 16 pairs).
276 out.extend(manchester_decode_scalar(
277 &samples[remainder_start..],
278 threshold,
279 ));
280
281 out
282}
283
284// ── Tests ──────────────────────────────────────────────────────────────────────
285
286#[cfg(test)]
287mod tests {
288 use super::*;
289
290 // ── Encoding tests ──────────────────────────────────────────────────────
291
292 #[test]
293 fn test_encode_zero_bit() {
294 let encoded = manchester_encode_scalar(&[0]);
295 assert_eq!(encoded.len(), 2);
296 assert_eq!(encoded[0], HIGH_LEVEL, "bit 0: first sample should be HIGH");
297 assert_eq!(encoded[1], LOW_LEVEL, "bit 0: second sample should be LOW");
298 }
299
300 #[test]
301 fn test_encode_one_bit() {
302 let encoded = manchester_encode_scalar(&[1]);
303 assert_eq!(encoded.len(), 2);
304 assert_eq!(encoded[0], LOW_LEVEL, "bit 1: first sample should be LOW");
305 assert_eq!(
306 encoded[1], HIGH_LEVEL,
307 "bit 1: second sample should be HIGH"
308 );
309 }
310
311 #[test]
312 fn test_encode_empty() {
313 let encoded = manchester_encode_scalar(&[]);
314 assert!(encoded.is_empty());
315 }
316
317 #[test]
318 fn test_encode_zero_stream() {
319 // All-zero bit stream: every pair should be (HIGH, LOW).
320 let bits: Vec<u8> = vec![0; 8];
321 let encoded = manchester_encode_scalar(&bits);
322 assert_eq!(encoded.len(), 16);
323 for i in 0..8 {
324 assert_eq!(encoded[i * 2], HIGH_LEVEL, "bit {i}: first must be HIGH");
325 assert_eq!(encoded[i * 2 + 1], LOW_LEVEL, "bit {i}: second must be LOW");
326 }
327 }
328
329 #[test]
330 fn test_encode_256_random_roundtrip_scalar() {
331 // Generate a pseudo-random 256-bit sequence and verify round-trip.
332 let bits: Vec<u8> = (0u32..256)
333 .map(|i| ((i.wrapping_mul(2654435761) >> 16) & 1) as u8)
334 .collect();
335
336 let encoded = manchester_encode_scalar(&bits);
337 let decoded = manchester_decode_scalar(&encoded, HIGH_LEVEL / 2);
338
339 assert_eq!(decoded.len(), bits.len());
340 for (i, (&orig, &dec)) in bits.iter().zip(decoded.iter()).enumerate() {
341 assert_eq!(orig, dec, "round-trip mismatch at bit {i}");
342 }
343 }
344
345 #[test]
346 fn test_encode_simd_matches_scalar() {
347 // The SIMD path (or its scalar fallback) must match the reference.
348 let bits: Vec<u8> = (0u8..255).collect(); // 255 bytes: non-zero = 1, zero = 0
349 let scalar = manchester_encode_scalar(&bits);
350 let simd = manchester_encode_simd(&bits);
351 assert_eq!(scalar, simd, "SIMD encode must match scalar encode");
352 }
353
354 // ── Decoding tests ──────────────────────────────────────────────────────
355
356 #[test]
357 fn test_decode_zero_bit() {
358 let samples = [HIGH_LEVEL, LOW_LEVEL];
359 let decoded = manchester_decode_scalar(&samples, HIGH_LEVEL / 2);
360 assert_eq!(decoded, vec![0u8]);
361 }
362
363 #[test]
364 fn test_decode_one_bit() {
365 let samples = [LOW_LEVEL, HIGH_LEVEL];
366 let decoded = manchester_decode_scalar(&samples, HIGH_LEVEL / 2);
367 assert_eq!(decoded, vec![1u8]);
368 }
369
370 #[test]
371 fn test_decode_empty() {
372 let decoded = manchester_decode_scalar(&[], 1000);
373 assert!(decoded.is_empty());
374 }
375
376 #[test]
377 fn test_decode_odd_length_ignores_tail() {
378 // Three samples: one valid pair + one trailing sample.
379 let samples = [HIGH_LEVEL, LOW_LEVEL, HIGH_LEVEL]; // pair(0) + orphan
380 let decoded = manchester_decode_scalar(&samples, HIGH_LEVEL / 2);
381 assert_eq!(decoded.len(), 1);
382 assert_eq!(decoded[0], 0);
383 }
384
385 #[test]
386 fn test_decode_simd_matches_scalar() {
387 let bits: Vec<u8> = (0u8..255).map(|b| b & 1).collect();
388 let samples = manchester_encode_scalar(&bits);
389 let scalar = manchester_decode_scalar(&samples, HIGH_LEVEL / 2);
390 let simd = manchester_decode_simd(&samples, HIGH_LEVEL / 2);
391 assert_eq!(scalar, simd, "SIMD decode must match scalar decode");
392 }
393
394 #[test]
395 fn test_encode_decode_256_random_simd_roundtrip() {
396 // Full round-trip via SIMD paths.
397 // Use a u64 LCG with a 64-bit multiplier.
398 let bits: Vec<u8> = (0u64..256)
399 .map(|i| {
400 let hash = i
401 .wrapping_mul(6364136223846793005_u64)
402 .wrapping_add(1442695040888963407_u64);
403 ((hash >> 32) & 1) as u8
404 })
405 .collect();
406
407 let encoded = manchester_encode_simd(&bits);
408 let decoded = manchester_decode_simd(&encoded, HIGH_LEVEL / 2);
409
410 assert_eq!(decoded.len(), bits.len());
411 for (i, (&orig, &dec)) in bits.iter().zip(decoded.iter()).enumerate() {
412 assert_eq!(orig, dec, "SIMD round-trip mismatch at bit {i}");
413 }
414 }
415}