oximedia_timecode/
simd_manchester.rs

1//! SIMD-accelerated Manchester encoding and decoding for LTC bitstreams.
2//!
3//! Manchester coding maps each data bit to two half-bit audio periods:
4//! - Bit `0`: **high** half-period followed by **low** half-period
5//! - Bit `1`: **low** half-period followed by **high** half-period
6//!
7//! The audio samples are `i16`: `HIGH_LEVEL` and `LOW_LEVEL`.
8//!
9//! # SIMD strategy
10//! On `x86_64` targets with the `avx2` CPU feature available at runtime,
11//! [`manchester_encode_simd`] processes 32 input bytes (= 256 bits) per
12//! iteration using `_mm256_movemask_epi8` to extract the sign bit of each
13//! byte, then expands each bit into two `i16` samples.  All other targets (and
14//! AVX2-capable targets without the feature at runtime) fall back to a pure
15//! scalar path that produces identical output.
16
17#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
18#![allow(unsafe_code)] // AVX2 intrinsics require unsafe blocks
19
20/// PCM level for the high half-period.
21pub const HIGH_LEVEL: i16 = i16::MAX;
22/// PCM level for the low half-period.
23pub const LOW_LEVEL: i16 = i16::MIN;
24
25// ── Encoding ──────────────────────────────────────────────────────────────────
26
27/// Encode `bits` using Manchester coding, returning PCM audio samples.
28///
29/// Each byte in `bits` is treated as a single data bit (non-zero ⇒ `1`,
30/// zero ⇒ `0`).  The returned `Vec<i16>` has exactly `bits.len() × 2`
31/// elements: two samples (high–low or low–high) per input bit.
32///
33/// On `x86_64` CPUs that report `avx2` at runtime the AVX2 fast-path is
34/// taken; otherwise the scalar path is used.  Both paths produce identical
35/// output.
36pub fn manchester_encode_simd(bits: &[u8]) -> Vec<i16> {
37    #[cfg(target_arch = "x86_64")]
38    {
39        if is_x86_feature_detected!("avx2") {
40            return manchester_encode_avx2(bits);
41        }
42    }
43    manchester_encode_scalar(bits)
44}
45
46/// Scalar Manchester encoding — reference implementation.
47///
48/// Exposed publicly so callers can force the scalar path in tests.
49pub fn manchester_encode_scalar(bits: &[u8]) -> Vec<i16> {
50    let mut out = Vec::with_capacity(bits.len() * 2);
51    for &b in bits {
52        if b == 0 {
53            // Bit 0: high then low
54            out.push(HIGH_LEVEL);
55            out.push(LOW_LEVEL);
56        } else {
57            // Bit 1: low then high
58            out.push(LOW_LEVEL);
59            out.push(HIGH_LEVEL);
60        }
61    }
62    out
63}
64
65/// AVX2 Manchester encoding path — processes 32 bytes per loop iteration.
66///
67/// Only compiled and called on `x86_64` targets at runtime when AVX2 is
68/// available.  On other targets the function is not compiled at all.
69#[cfg(target_arch = "x86_64")]
70fn manchester_encode_avx2(bits: &[u8]) -> Vec<i16> {
71    use std::arch::x86_64::*;
72
73    let mut out = Vec::with_capacity(bits.len() * 2);
74
75    // Process 32 bytes at a time using AVX2.
76    let chunks = bits.chunks_exact(32);
77    let remainder = chunks.remainder();
78
79    for chunk in chunks {
80        // SAFETY: we check for avx2 at runtime before calling this function.
81        unsafe {
82            // Load 32 bytes into a 256-bit register.
83            let v = _mm256_loadu_si256(chunk.as_ptr().cast::<__m256i>());
84
85            // Detect non-zero bytes using an unsigned-safe approach:
86            // cmpeq produces 0xFF where the byte == 0, 0x00 where byte != 0.
87            // Inverting via andnot(eq_zero, all_ones) gives 0xFF for non-zero
88            // bytes regardless of the sign bit, covering the full 0–255 range.
89            let zero = _mm256_setzero_si256();
90            let all_ones = _mm256_set1_epi8(-1_i8);
91            // andnot(a, b) = (!a) & b; here gives 0xFF for lanes where v != 0.
92            let nonzero_mask: u32 =
93                _mm256_movemask_epi8(_mm256_andnot_si256(_mm256_cmpeq_epi8(v, zero), all_ones))
94                    as u32;
95
96            // Each bit in nonzero_mask corresponds to one input byte (LSB = lane 0).
97            for lane in 0..32usize {
98                let is_one = (nonzero_mask >> lane) & 1;
99                if is_one == 0 {
100                    // Bit 0: high then low
101                    out.push(HIGH_LEVEL);
102                    out.push(LOW_LEVEL);
103                } else {
104                    // Bit 1: low then high
105                    out.push(LOW_LEVEL);
106                    out.push(HIGH_LEVEL);
107                }
108            }
109        }
110    }
111
112    // Handle the remaining bytes with the scalar path.
113    for &b in remainder {
114        if b == 0 {
115            out.push(HIGH_LEVEL);
116            out.push(LOW_LEVEL);
117        } else {
118            out.push(LOW_LEVEL);
119            out.push(HIGH_LEVEL);
120        }
121    }
122
123    out
124}
125
126// ── Decoding ──────────────────────────────────────────────────────────────────
127
128/// Decode Manchester-coded audio `samples` back to a bit stream.
129///
130/// Each pair of consecutive samples is examined:
131/// - `(HIGH, LOW)` → bit `0`
132/// - `(LOW, HIGH)` → bit `1`
133/// - Ambiguous / noisy pair → bit value is determined by the majority level
134///   (the sample further from zero wins).
135///
136/// `threshold` is the *absolute* amplitude level that distinguishes a
137/// meaningful HIGH/LOW from noise near zero.  Typical value: `8192` for
138/// 25% of full-scale `i16`.
139///
140/// Returns one `u8` per decoded bit (value `0` or `1`).  If `samples.len()`
141/// is odd the trailing unpaired sample is ignored.
142pub fn manchester_decode_simd(samples: &[i16], threshold: i16) -> Vec<u8> {
143    #[cfg(target_arch = "x86_64")]
144    {
145        if is_x86_feature_detected!("avx2") {
146            return manchester_decode_avx2(samples, threshold);
147        }
148    }
149    manchester_decode_scalar(samples, threshold)
150}
151
152/// Scalar Manchester decoding — reference implementation.
153pub fn manchester_decode_scalar(samples: &[i16], threshold: i16) -> Vec<u8> {
154    let mut out = Vec::with_capacity(samples.len() / 2);
155    let mut i = 0;
156    while i + 1 < samples.len() {
157        let first = samples[i];
158        let second = samples[i + 1];
159
160        let first_positive = first >= threshold;
161        let first_negative = first <= -threshold;
162        let second_positive = second >= threshold;
163        let second_negative = second <= -threshold;
164
165        let bit = if first_positive && second_negative {
166            // (HIGH, LOW) → bit 0
167            0u8
168        } else if first_negative && second_positive {
169            // (LOW, HIGH) → bit 1
170            1u8
171        } else {
172            // Ambiguous: use magnitude to decide.
173            // The half with the larger absolute value determines the polarity.
174            if first.unsigned_abs() >= second.unsigned_abs() {
175                // First sample dominates
176                if first >= 0 {
177                    0u8 // First=HIGH → bit 0
178                } else {
179                    1u8 // First=LOW  → bit 1
180                }
181            } else {
182                // Second sample dominates
183                if second >= 0 {
184                    1u8 // Second=HIGH → bit 1
185                } else {
186                    0u8 // Second=LOW  → bit 0
187                }
188            }
189        };
190
191        out.push(bit);
192        i += 2;
193    }
194    out
195}
196
197/// AVX2 Manchester decoding — processes 16 interleaved sample-pairs per iter.
198///
199/// Manchester samples are interleaved: `[f0, s0, f1, s1, ..., f_k, s_k, ...]`.
200/// Each iteration loads 32 consecutive i16 (= 16 pairs) and deinterleaves them
201/// using `_mm256_srli_epi32` so that `firsts` holds `f_k` and `seconds` holds
202/// `s_k` in the low 16 bits of each i32 word.  The high 16 bits remain zero,
203/// which prevents false comparisons.
204#[cfg(target_arch = "x86_64")]
205fn manchester_decode_avx2(samples: &[i16], threshold: i16) -> Vec<u8> {
206    use std::arch::x86_64::*;
207
208    let mut out = Vec::with_capacity(samples.len() / 2);
209
210    // 16 pairs per AVX2 iteration: each iteration consumes 32 i16 elements.
211    let pair_count = samples.len() / 2;
212    let avx_pairs_per_iter = 16usize;
213    let full_iters = pair_count / avx_pairs_per_iter;
214    let remainder_start = full_iters * avx_pairs_per_iter * 2; // in i16 elements
215
216    unsafe {
217        let thresh_vec = _mm256_set1_epi16(threshold);
218        let neg_thresh_vec = _mm256_set1_epi16(-threshold);
219        // Mask to isolate the low 16 bits of each i32 word (= the first sample
220        // of each pair after casting the interleaved stream to i32 words).
221        let mask_lo16 = _mm256_set1_epi32(0x0000_FFFF_u32 as i32);
222        // Pre-compute (threshold - 1) so we can use strict cmpgt for >=.
223        let thresh_m1 = _mm256_sub_epi16(thresh_vec, _mm256_set1_epi16(1));
224        let neg_thresh_m1 = _mm256_sub_epi16(neg_thresh_vec, _mm256_set1_epi16(1));
225
226        for iter in 0..full_iters {
227            // base is an i16-element offset; each iteration spans 32 i16.
228            let base = iter * avx_pairs_per_iter * 2;
229
230            // Load 32 consecutive i16: pairs 0–7 in raw_lo, pairs 8–15 in raw_hi.
231            // Memory layout (little-endian i16): [f0, s0, f1, s1, ..., f7, s7, f8, s8, ...]
232            let raw_lo = _mm256_loadu_si256(samples[base..].as_ptr().cast::<__m256i>());
233            let raw_hi = _mm256_loadu_si256(samples[base + 16..].as_ptr().cast::<__m256i>());
234
235            // Deinterleave: treat each i32 word as (s_k << 16 | f_k) on
236            // little-endian.  AND with mask isolates f_k in i16[2k]; shift
237            // right 16 brings s_k into i16[2k].  Odd lanes (i16[2k+1]) are
238            // zero in both cases, so they never trigger threshold comparisons.
239            let firsts_lo = _mm256_and_si256(raw_lo, mask_lo16);
240            let firsts_hi = _mm256_and_si256(raw_hi, mask_lo16);
241            let seconds_lo = _mm256_srli_epi32(raw_lo, 16);
242            let seconds_hi = _mm256_srli_epi32(raw_hi, 16);
243
244            // first >= threshold  ⟺  first > (threshold - 1)
245            let first_high_lo = _mm256_cmpgt_epi16(firsts_lo, thresh_m1);
246            let first_high_hi = _mm256_cmpgt_epi16(firsts_hi, thresh_m1);
247            // second <= -threshold  ⟺  (−threshold − 1) > second
248            let second_low_lo = _mm256_cmpgt_epi16(neg_thresh_m1, seconds_lo);
249            let second_low_hi = _mm256_cmpgt_epi16(neg_thresh_m1, seconds_hi);
250
251            // Both conditions → (HIGH, LOW) = bit 0.
252            let bit0_lo = _mm256_and_si256(first_high_lo, second_low_lo);
253            let bit0_hi = _mm256_and_si256(first_high_hi, second_low_hi);
254
255            // movemask_epi8 produces one bit per byte.  Each i16 lane occupies
256            // 2 bytes; i16[2k] occupies bytes 4k and 4k+1 (little-endian i32).
257            // The sign bit (= compare result) of i16[2k] appears at bit 4k.
258            let bits_lo: u32 = _mm256_movemask_epi8(bit0_lo) as u32;
259            let bits_hi: u32 = _mm256_movemask_epi8(bit0_hi) as u32;
260
261            // Extract 8 decisions from each half (pairs 0–7 from bits_lo,
262            // pairs 8–15 from bits_hi).  Bit 4k in the mask corresponds to
263            // pair k in this half.
264            for k in 0..8usize {
265                let is_bit0 = (bits_lo >> (k * 4)) & 1;
266                out.push(if is_bit0 != 0 { 0u8 } else { 1u8 });
267            }
268            for k in 0..8usize {
269                let is_bit0 = (bits_hi >> (k * 4)) & 1;
270                out.push(if is_bit0 != 0 { 0u8 } else { 1u8 });
271            }
272        }
273    }
274
275    // Scalar fallback for the remaining pairs (< 16 pairs).
276    out.extend(manchester_decode_scalar(
277        &samples[remainder_start..],
278        threshold,
279    ));
280
281    out
282}
283
284// ── Tests ──────────────────────────────────────────────────────────────────────
285
286#[cfg(test)]
287mod tests {
288    use super::*;
289
290    // ── Encoding tests ──────────────────────────────────────────────────────
291
292    #[test]
293    fn test_encode_zero_bit() {
294        let encoded = manchester_encode_scalar(&[0]);
295        assert_eq!(encoded.len(), 2);
296        assert_eq!(encoded[0], HIGH_LEVEL, "bit 0: first sample should be HIGH");
297        assert_eq!(encoded[1], LOW_LEVEL, "bit 0: second sample should be LOW");
298    }
299
300    #[test]
301    fn test_encode_one_bit() {
302        let encoded = manchester_encode_scalar(&[1]);
303        assert_eq!(encoded.len(), 2);
304        assert_eq!(encoded[0], LOW_LEVEL, "bit 1: first sample should be LOW");
305        assert_eq!(
306            encoded[1], HIGH_LEVEL,
307            "bit 1: second sample should be HIGH"
308        );
309    }
310
311    #[test]
312    fn test_encode_empty() {
313        let encoded = manchester_encode_scalar(&[]);
314        assert!(encoded.is_empty());
315    }
316
317    #[test]
318    fn test_encode_zero_stream() {
319        // All-zero bit stream: every pair should be (HIGH, LOW).
320        let bits: Vec<u8> = vec![0; 8];
321        let encoded = manchester_encode_scalar(&bits);
322        assert_eq!(encoded.len(), 16);
323        for i in 0..8 {
324            assert_eq!(encoded[i * 2], HIGH_LEVEL, "bit {i}: first must be HIGH");
325            assert_eq!(encoded[i * 2 + 1], LOW_LEVEL, "bit {i}: second must be LOW");
326        }
327    }
328
329    #[test]
330    fn test_encode_256_random_roundtrip_scalar() {
331        // Generate a pseudo-random 256-bit sequence and verify round-trip.
332        let bits: Vec<u8> = (0u32..256)
333            .map(|i| ((i.wrapping_mul(2654435761) >> 16) & 1) as u8)
334            .collect();
335
336        let encoded = manchester_encode_scalar(&bits);
337        let decoded = manchester_decode_scalar(&encoded, HIGH_LEVEL / 2);
338
339        assert_eq!(decoded.len(), bits.len());
340        for (i, (&orig, &dec)) in bits.iter().zip(decoded.iter()).enumerate() {
341            assert_eq!(orig, dec, "round-trip mismatch at bit {i}");
342        }
343    }
344
345    #[test]
346    fn test_encode_simd_matches_scalar() {
347        // The SIMD path (or its scalar fallback) must match the reference.
348        let bits: Vec<u8> = (0u8..255).collect(); // 255 bytes: non-zero = 1, zero = 0
349        let scalar = manchester_encode_scalar(&bits);
350        let simd = manchester_encode_simd(&bits);
351        assert_eq!(scalar, simd, "SIMD encode must match scalar encode");
352    }
353
354    // ── Decoding tests ──────────────────────────────────────────────────────
355
356    #[test]
357    fn test_decode_zero_bit() {
358        let samples = [HIGH_LEVEL, LOW_LEVEL];
359        let decoded = manchester_decode_scalar(&samples, HIGH_LEVEL / 2);
360        assert_eq!(decoded, vec![0u8]);
361    }
362
363    #[test]
364    fn test_decode_one_bit() {
365        let samples = [LOW_LEVEL, HIGH_LEVEL];
366        let decoded = manchester_decode_scalar(&samples, HIGH_LEVEL / 2);
367        assert_eq!(decoded, vec![1u8]);
368    }
369
370    #[test]
371    fn test_decode_empty() {
372        let decoded = manchester_decode_scalar(&[], 1000);
373        assert!(decoded.is_empty());
374    }
375
376    #[test]
377    fn test_decode_odd_length_ignores_tail() {
378        // Three samples: one valid pair + one trailing sample.
379        let samples = [HIGH_LEVEL, LOW_LEVEL, HIGH_LEVEL]; // pair(0) + orphan
380        let decoded = manchester_decode_scalar(&samples, HIGH_LEVEL / 2);
381        assert_eq!(decoded.len(), 1);
382        assert_eq!(decoded[0], 0);
383    }
384
385    #[test]
386    fn test_decode_simd_matches_scalar() {
387        let bits: Vec<u8> = (0u8..255).map(|b| b & 1).collect();
388        let samples = manchester_encode_scalar(&bits);
389        let scalar = manchester_decode_scalar(&samples, HIGH_LEVEL / 2);
390        let simd = manchester_decode_simd(&samples, HIGH_LEVEL / 2);
391        assert_eq!(scalar, simd, "SIMD decode must match scalar decode");
392    }
393
394    #[test]
395    fn test_encode_decode_256_random_simd_roundtrip() {
396        // Full round-trip via SIMD paths.
397        // Use a u64 LCG with a 64-bit multiplier.
398        let bits: Vec<u8> = (0u64..256)
399            .map(|i| {
400                let hash = i
401                    .wrapping_mul(6364136223846793005_u64)
402                    .wrapping_add(1442695040888963407_u64);
403                ((hash >> 32) & 1) as u8
404            })
405            .collect();
406
407        let encoded = manchester_encode_simd(&bits);
408        let decoded = manchester_decode_simd(&encoded, HIGH_LEVEL / 2);
409
410        assert_eq!(decoded.len(), bits.len());
411        for (i, (&orig, &dec)) in bits.iter().zip(decoded.iter()).enumerate() {
412            assert_eq!(orig, dec, "SIMD round-trip mismatch at bit {i}");
413        }
414    }
415}
oximedia_timecode/simd_manchester.rs

oximedia_timecode/
simd_manchester.rs