colconv 0.1.0

SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat space, with a Sink-based API so consumers pick which derived outputs (RGB / Luma / HSV / custom) they want without paying for the ones they don't.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
use core::arch::aarch64::*;

use crate::{ColorMatrix, row::scalar};

// ===== RGB → HSV =========================================================

/// NEON RGB → planar HSV. Semantics match
/// [`scalar::rgb_to_hsv_row`] byte‑identically.
///
/// # Safety
///
/// The caller must uphold **all** of the following. Violating any
/// causes undefined behavior:
///
/// 1. **NEON must be available on the current CPU** (same obligation
///    as `yuv_420_to_rgb_row`; the dispatcher checks this via
///    `is_aarch64_feature_detected!("neon")`).
/// 2. `rgb.len() >= 3 * width`.
/// 3. `h_out.len() >= width`.
/// 4. `s_out.len() >= width`.
/// 5. `v_out.len() >= width`.
///
/// Bounds are verified by `debug_assert` in debug builds. The kernel
/// relies on unchecked pointer arithmetic (`vld3q_u8`, `vst1q_u8`).
///
/// # Numerical contract
///
/// Bit‑identical to the scalar reference. Every scalar op has the
/// same SIMD counterpart in the same order: `vmaxq_f32` / `vminq_f32`
/// mirror `f32::max` / `f32::min`; `vdivq_f32` is true f32 division
/// (not reciprocal estimate); branch cascade uses `vbslq_f32` in the
/// same `delta == 0 → v == r → v == g → v == b` priority.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgb_to_hsv_row(
  rgb: &[u8],
  h_out: &mut [u8],
  s_out: &mut [u8],
  v_out: &mut [u8],
  width: usize,
) {
  debug_assert!(rgb.len() >= width * 3, "rgb row too short");
  debug_assert!(h_out.len() >= width, "H row too short");
  debug_assert!(s_out.len() >= width, "S row too short");
  debug_assert!(v_out.len() >= width, "V row too short");

  // SAFETY: NEON availability is the caller's obligation per the
  // `# Safety` section. All pointer adds below are bounded by the
  // `while x + 16 <= width` loop condition and the caller‑promised
  // slice lengths.
  unsafe {
    let mut x = 0usize;
    while x + 16 <= width {
      // Deinterleave 16 RGB pixels → three u8x16 channel vectors.
      let rgb_vec = vld3q_u8(rgb.as_ptr().add(x * 3));
      let r_u8 = rgb_vec.0;
      let g_u8 = rgb_vec.1;
      let b_u8 = rgb_vec.2;

      // Widen each u8x16 to four f32x4 (16 values split into four
      // 4‑pixel groups) for the f32 HSV math.
      let (b0, b1, b2, b3) = u8x16_to_f32x4_quad(b_u8);
      let (g0, g1, g2, g3) = u8x16_to_f32x4_quad(g_u8);
      let (r0, r1, r2, r3) = u8x16_to_f32x4_quad(r_u8);

      // HSV per 4‑pixel group. Each returns (h_quant, s_quant, v_quant)
      // as f32x4 values already in [0, 179] / [0, 255] / [0, 255].
      let (h0, s0, v0) = hsv_group(b0, g0, r0);
      let (h1, s1, v1) = hsv_group(b1, g1, r1);
      let (h2, s2, v2) = hsv_group(b2, g2, r2);
      let (h3, s3, v3) = hsv_group(b3, g3, r3);

      // Truncate f32 → u8 via u32 intermediate, matching scalar `as u8`
      // (which saturates then truncates; values are pre‑clamped so the
      // narrow is safe).
      let h_u8 = f32x4_quad_to_u8x16(h0, h1, h2, h3);
      let s_u8 = f32x4_quad_to_u8x16(s0, s1, s2, s3);
      let v_u8 = f32x4_quad_to_u8x16(v0, v1, v2, v3);

      vst1q_u8(h_out.as_mut_ptr().add(x), h_u8);
      vst1q_u8(s_out.as_mut_ptr().add(x), s_u8);
      vst1q_u8(v_out.as_mut_ptr().add(x), v_u8);

      x += 16;
    }

    // Scalar tail for the 0..15 leftover pixels.
    if x < width {
      scalar::rgb_to_hsv_row(
        &rgb[x * 3..width * 3],
        &mut h_out[x..width],
        &mut s_out[x..width],
        &mut v_out[x..width],
        width - x,
      );
    }
  }
}

/// Widens a u8x16 to four f32x4 groups (covering lanes 0..3, 4..7,
/// 8..11, 12..15 respectively). Lanes are zero‑extended at each
/// widening step, so f32 values land exactly in `[0.0, 255.0]`.
#[inline(always)]
fn u8x16_to_f32x4_quad(v: uint8x16_t) -> (float32x4_t, float32x4_t, float32x4_t, float32x4_t) {
  unsafe {
    let u16_lo = vmovl_u8(vget_low_u8(v)); // u16x8 = lanes 0..7
    let u16_hi = vmovl_u8(vget_high_u8(v)); // u16x8 = lanes 8..15
    let u32_0 = vmovl_u16(vget_low_u16(u16_lo)); // lanes 0..3
    let u32_1 = vmovl_u16(vget_high_u16(u16_lo)); // lanes 4..7
    let u32_2 = vmovl_u16(vget_low_u16(u16_hi)); // lanes 8..11
    let u32_3 = vmovl_u16(vget_high_u16(u16_hi)); // lanes 12..15
    (
      vcvtq_f32_u32(u32_0),
      vcvtq_f32_u32(u32_1),
      vcvtq_f32_u32(u32_2),
      vcvtq_f32_u32(u32_3),
    )
  }
}

/// Computes HSV for 4 pixels. Mirrors the scalar `rgb_to_hsv_pixel`
/// op‑for‑op. Returns `(h_quant, s_quant, v_quant)` — each already
/// clamped to the scalar's output range (`h ≤ 179`, `s ≤ 255`,
/// `v ≤ 255`), still as f32 awaiting u8 conversion in the caller.
#[inline(always)]
fn hsv_group(
  b: float32x4_t,
  g: float32x4_t,
  r: float32x4_t,
) -> (float32x4_t, float32x4_t, float32x4_t) {
  unsafe {
    let zero = vdupq_n_f32(0.0);
    let half = vdupq_n_f32(0.5);
    let sixty = vdupq_n_f32(60.0);
    let one_twenty = vdupq_n_f32(120.0);
    let two_forty = vdupq_n_f32(240.0);
    let three_sixty = vdupq_n_f32(360.0);
    let one_seventy_nine = vdupq_n_f32(179.0);
    let two_fifty_five = vdupq_n_f32(255.0);

    // V = max(b, g, r); min = min(b, g, r); delta = V - min.
    // vmaxq_f32 / vminq_f32 are NaN‑tolerant, matching f32::max / f32::min.
    let v = vmaxq_f32(vmaxq_f32(b, g), r);
    let min_bgr = vminq_f32(vminq_f32(b, g), r);
    let delta = vsubq_f32(v, min_bgr);

    // S = if v == 0 { 0 } else { 255 * delta / v }.
    let mask_v_nonzero = vmvnq_u32(vceqq_f32(v, zero));
    let s_nonzero = vdivq_f32(vmulq_f32(two_fifty_five, delta), v);
    let s = vbslq_f32(mask_v_nonzero, s_nonzero, zero);

    // Hue — compute all three candidate formulas then select.
    let mask_delta_zero = vceqq_f32(delta, zero);
    let mask_v_is_r = vceqq_f32(v, r);
    let mask_v_is_g = vceqq_f32(v, g);

    // Branch 1 (v == r): 60 * (g - b) / delta, wrap negatives by +360.
    let h_r = {
      let raw = vdivq_f32(vmulq_f32(sixty, vsubq_f32(g, b)), delta);
      let mask_neg = vcltq_f32(raw, zero);
      vbslq_f32(mask_neg, vaddq_f32(raw, three_sixty), raw)
    };
    // Branch 2 (v == g): 60 * (b - r) / delta + 120.
    let h_g = vaddq_f32(
      vdivq_f32(vmulq_f32(sixty, vsubq_f32(b, r)), delta),
      one_twenty,
    );
    // Branch 3 (v == b, implicit): 60 * (r - g) / delta + 240.
    let h_b = vaddq_f32(
      vdivq_f32(vmulq_f32(sixty, vsubq_f32(r, g)), delta),
      two_forty,
    );

    // Cascade: if delta == 0 → 0; else if v == r → h_r; else if v == g
    // → h_g; else → h_b. Same priority order as the scalar.
    let hue_g_or_b = vbslq_f32(mask_v_is_g, h_g, h_b);
    let hue_nonzero_delta = vbslq_f32(mask_v_is_r, h_r, hue_g_or_b);
    let hue = vbslq_f32(mask_delta_zero, zero, hue_nonzero_delta);

    // Quantize to the scalar's output ranges. Scalar:
    //   h_quant = (hue * 0.5 + 0.5).clamp(0, 179)
    //   s_quant = (s + 0.5).clamp(0, 255)
    //   v_quant = (v + 0.5).clamp(0, 255)
    // clamp → vminq(vmaxq(v, lo), hi). Inputs are all finite so NaN
    // handling is irrelevant here.
    let h_quant = vminq_f32(
      vmaxq_f32(vaddq_f32(vmulq_f32(hue, half), half), zero),
      one_seventy_nine,
    );
    let s_quant = vminq_f32(vmaxq_f32(vaddq_f32(s, half), zero), two_fifty_five);
    let v_quant = vminq_f32(vmaxq_f32(vaddq_f32(v, half), zero), two_fifty_five);

    (h_quant, s_quant, v_quant)
  }
}

/// Converts four f32x4 vectors (16 values in [0, 255]) to one u8x16.
/// Truncates f32 → u32 via `vcvtq_u32_f32` (matches scalar `as u8`
/// which saturates‑then‑truncates; values are pre‑clamped so the
/// narrowing steps below are exact).
#[inline(always)]
fn f32x4_quad_to_u8x16(
  a: float32x4_t,
  b: float32x4_t,
  c: float32x4_t,
  d: float32x4_t,
) -> uint8x16_t {
  unsafe {
    let a_u32 = vcvtq_u32_f32(a);
    let b_u32 = vcvtq_u32_f32(b);
    let c_u32 = vcvtq_u32_f32(c);
    let d_u32 = vcvtq_u32_f32(d);
    let ab_u16 = vcombine_u16(vmovn_u32(a_u32), vmovn_u32(b_u32));
    let cd_u16 = vcombine_u16(vmovn_u32(c_u32), vmovn_u32(d_u32));
    vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16))
  }
}

// ===== RGB → luma (Y') ===================================================

/// NEON RGB → planar luma (Y'). Byte‑identical to
/// [`scalar::rgb_to_luma_row`] (Q15 weighted sum, optional limited‑range
/// post‑scale to `[16, 235]`).
///
/// Block size: 16 px / iter (one `vld3q_u8` deinterleave = 48 input
/// bytes, one `vst1q_u8` luma store = 16 output bytes). Coefficients
/// are hoisted outside the loop — `matrix` is selected once via
/// `luma_coefficients_q15`, then splatted to i16x4 vectors so the
/// in‑loop multiplies use `vmull_s16` (i16 x i16 → i32 widening).
///
/// # Safety
///
/// 1. **NEON must be available on the current CPU** (caller obligation).
/// 2. `rgb.len() >= 3 * width`.
/// 3. `luma_out.len() >= width`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn rgb_to_luma_row(
  rgb: &[u8],
  luma_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  debug_assert!(rgb.len() >= width * 3, "rgb row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");

  let (k_r, k_g, k_b) = scalar::luma_coefficients_q15(matrix);
  // SAFETY block carries unchecked NEON intrinsics; the splats below
  // are safe (only register operations) but live inside the same
  // unsafe context as the load/multiply chain.
  // All matrix coefficients fit in i16 (max ≈ 23436 ≪ 32767), so we
  // can use the cheap i16xi16 → i32 widening multiply (`vmull_s16`).
  let kr_v = vdup_n_s16(k_r as i16);
  let kg_v = vdup_n_s16(k_g as i16);
  let kb_v = vdup_n_s16(k_b as i16);
  let rnd_v = vdupq_n_s32(1 << 14);
  // Limited‑range post‑scale constants. Hoisted once even when unused;
  // unused branches inline as dead and the splats don't cost anything.
  let lim_scale_v = vdup_n_s16(28142);
  let lim_off_v = vdupq_n_s16(16);

  // SAFETY: NEON availability is the caller's obligation; loop guard
  // `x + 16 <= width` keeps the 48‑byte read and 16‑byte write inside
  // the caller‑promised slice lengths.
  unsafe {
    let mut x = 0usize;
    while x + 16 <= width {
      // Deinterleave 16 RGB pixels (48 bytes) → 3 x u8x16 channels.
      let rgb_vec = vld3q_u8(rgb.as_ptr().add(x * 3));
      let r_u8 = rgb_vec.0;
      let g_u8 = rgb_vec.1;
      let b_u8 = rgb_vec.2;

      // Widen each u8x16 to two i16x8 halves (zero‑extend), since
      // every sample is in [0, 255] and the multiply target is i16 x i16.
      let r_lo_u16 = vmovl_u8(vget_low_u8(r_u8));
      let r_hi_u16 = vmovl_u8(vget_high_u8(r_u8));
      let g_lo_u16 = vmovl_u8(vget_low_u8(g_u8));
      let g_hi_u16 = vmovl_u8(vget_high_u8(g_u8));
      let b_lo_u16 = vmovl_u8(vget_low_u8(b_u8));
      let b_hi_u16 = vmovl_u8(vget_high_u8(b_u8));
      let r_lo = vreinterpretq_s16_u16(r_lo_u16);
      let r_hi = vreinterpretq_s16_u16(r_hi_u16);
      let g_lo = vreinterpretq_s16_u16(g_lo_u16);
      let g_hi = vreinterpretq_s16_u16(g_hi_u16);
      let b_lo = vreinterpretq_s16_u16(b_lo_u16);
      let b_hi = vreinterpretq_s16_u16(b_hi_u16);

      // Y_full per i32x4 quarter: (k_r·R + k_g·G + k_b·B + RND) >> 15.
      let y0 = q15_luma(
        vget_low_s16(r_lo),
        vget_low_s16(g_lo),
        vget_low_s16(b_lo),
        kr_v,
        kg_v,
        kb_v,
        rnd_v,
      );
      let y1 = q15_luma(
        vget_high_s16(r_lo),
        vget_high_s16(g_lo),
        vget_high_s16(b_lo),
        kr_v,
        kg_v,
        kb_v,
        rnd_v,
      );
      let y2 = q15_luma(
        vget_low_s16(r_hi),
        vget_low_s16(g_hi),
        vget_low_s16(b_hi),
        kr_v,
        kg_v,
        kb_v,
        rnd_v,
      );
      let y3 = q15_luma(
        vget_high_s16(r_hi),
        vget_high_s16(g_hi),
        vget_high_s16(b_hi),
        kr_v,
        kg_v,
        kb_v,
        rnd_v,
      );

      // Saturate‑narrow to i16x8 (clamps negatives → 0 and >32767 → 32767;
      // both extremes are well outside our [0,255] expected range and
      // can only occur when coefficient sums round slightly above 1.0).
      let y_lo_i16 = vcombine_s16(vqmovn_s32(y0), vqmovn_s32(y1));
      let y_hi_i16 = vcombine_s16(vqmovn_s32(y2), vqmovn_s32(y3));

      let y_u8 = if full_range {
        // Saturate‑narrow i16x8x2 → u8x16 ([0,255] clamp).
        vcombine_u8(vqmovun_s16(y_lo_i16), vqmovun_s16(y_hi_i16))
      } else {
        // Limited‑range post‑scale: clamp Y_full to [0,255] first
        // (matches the scalar's `y_full_clamped` step), then apply the
        // 28142/32768 Q15 multiply and add 16. Re‑use Q15 widening
        // multiply via vmull_s16.
        let y_clamp_u8_lo = vqmovun_s16(y_lo_i16);
        let y_clamp_u8_hi = vqmovun_s16(y_hi_i16);
        // Re‑widen u8 → i16 (always non‑negative, so signed and
        // unsigned widen produce the same bit pattern).
        let yc_lo_i16 = vreinterpretq_s16_u16(vmovl_u8(y_clamp_u8_lo));
        let yc_hi_i16 = vreinterpretq_s16_u16(vmovl_u8(y_clamp_u8_hi));
        let y_lim_lo = limited_range_scale(yc_lo_i16, lim_scale_v, lim_off_v, rnd_v);
        let y_lim_hi = limited_range_scale(yc_hi_i16, lim_scale_v, lim_off_v, rnd_v);
        vcombine_u8(vqmovun_s16(y_lim_lo), vqmovun_s16(y_lim_hi))
      };

      vst1q_u8(luma_out.as_mut_ptr().add(x), y_u8);
      x += 16;
    }

    if x < width {
      scalar::rgb_to_luma_row(
        &rgb[x * 3..width * 3],
        &mut luma_out[x..width],
        width - x,
        matrix,
        full_range,
      );
    }
  }
}

/// Q15 weighted sum for one 4‑pixel group.
/// Returns `(k_r·R + k_g·G + k_b·B + RND) >> 15` as i32x4.
#[inline(always)]
fn q15_luma(
  r: int16x4_t,
  g: int16x4_t,
  b: int16x4_t,
  kr: int16x4_t,
  kg: int16x4_t,
  kb: int16x4_t,
  rnd: int32x4_t,
) -> int32x4_t {
  unsafe {
    let acc = vmull_s16(r, kr);
    let acc = vmlal_s16(acc, g, kg);
    let acc = vmlal_s16(acc, b, kb);
    let acc = vaddq_s32(acc, rnd);
    vshrq_n_s32::<15>(acc)
  }
}

/// Limited‑range post‑scale: `16 + ((y_clamped * 28142 + RND) >> 15)`,
/// applied in i16 arithmetic. `y_clamped` is in `[0, 255]` (already
/// clamped by the caller), so `y_clamped * 28142 ≤ 7.18M` fits in i32.
#[inline(always)]
fn limited_range_scale(
  yc: int16x8_t,
  scale: int16x4_t,
  off: int16x8_t,
  rnd: int32x4_t,
) -> int16x8_t {
  unsafe {
    let lo = vmull_s16(vget_low_s16(yc), scale);
    let hi = vmull_s16(vget_high_s16(yc), scale);
    let lo = vshrq_n_s32::<15>(vaddq_s32(lo, rnd));
    let hi = vshrq_n_s32::<15>(vaddq_s32(hi, rnd));
    let scaled_i16 = vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi));
    vaddq_s16(scaled_i16, off)
  }
}