scenesdetect 0.1.0

Scene/shot cut detection ported from PySceneDetect — Sans-I/O streaming API with SIMD-accelerated detectors for histogram, pHash, threshold, content, and adaptive algorithms.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
//! x86 / x86_64 SSSE3 backend for BGR→HSV.
//!
//! No native 3-channel deinterleave on x86; we emulate it with `PSHUFB`
//! (SSSE3). Nine shuffle masks + six ORs deinterleave 48 packed BGR bytes
//! into three `u8x16` vectors. The rest of the pipeline mirrors the NEON
//! version: widen u8→u16→u32, convert to f32x4, run the branch-free HSV
//! math on four 4-pixel groups, narrow back to u8x16 via saturating packs.
//!
//! SSE4.1's `_mm_blendv_ps` would be nicer for mask blending but we stick to
//! SSSE3 + SSE2 (universal on x86_64). The manual `(mask & t) | (!mask & f)`
//! pattern compiles to the same handful of ops.

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

// Shuffle masks for PSHUFB (`_mm_shuffle_epi8`). Each mask has one byte per
// output lane: if high bit is set, output lane is zeroed; else low 4 bits
// select the input byte. We use `-1` for "zero this lane".
//
// Input blocks (16 bytes each):
//   blk0: B0 G0 R0 B1 G1 R1 B2 G2 R2 B3 G3 R3 B4 G4 R4 B5
//   blk1: G5 R5 B6 G6 R6 B7 G7 R7 B8 G8 R8 B9 G9 R9 B10 G10
//   blk2: R10 B11 G11 R11 B12 G12 R12 B13 G13 R13 B14 G14 R14 B15 G15 R15
//
// Each channel's final u8x16 is the OR of the three shuffled blocks; the
// masks are constructed so every output lane is non-zero in exactly one of
// the three fragments.

// When AVX2 is also enabled at compile time, the BGR→HSV dispatch takes
// the AVX2 path, leaving the SSSE3 BGR function + its helpers and shuffle
// constants unused. `mean_abs_diff` and `sobel` are still called via SSSE3
// even when AVX2 is present (no AVX2 variants of those exist).

// blk0 supplies pixels 0..=5 of B and 0..=4 of G/R (output lanes 0..=5 / 0..=4).
#[allow(dead_code)]
const BLK0_B: [i8; 16] = [0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];
#[allow(dead_code)]
const BLK0_G: [i8; 16] = [1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];
#[allow(dead_code)]
const BLK0_R: [i8; 16] = [2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1];

// blk1 supplies pixels 6..=10 of B, 5..=10 of G, 5..=9 of R.
#[allow(dead_code)]
const BLK1_B: [i8; 16] = [-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1];
#[allow(dead_code)]
const BLK1_G: [i8; 16] = [-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1];
#[allow(dead_code)]
const BLK1_R: [i8; 16] = [-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1];

// blk2 supplies pixels 11..=15 of B/G and 10..=15 of R.
#[allow(dead_code)]
const BLK2_B: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13];
#[allow(dead_code)]
const BLK2_G: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14];
#[allow(dead_code)]
const BLK2_R: [i8; 16] = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15];

/// SSSE3 BGR→HSV: 16 pixels per iteration.
///
/// Deinterleaves 48 packed BGR bytes into B/G/R `u8x16` vectors (nine
/// PSHUFBs + six ORs, see the `BLK*` masks), runs the branch-free HSV core
/// on four 4-pixel groups, and writes tightly-packed planar H/S/V rows of
/// `width` bytes each. Hue is halved and clamped to `0..=179` so it fits a
/// `u8` (OpenCV layout); any `width % 16` leftover uses the scalar path.
///
/// # Safety
///
/// Caller must ensure SSSE3 is available (`is_x86_feature_detected!("ssse3")`
/// or `target_feature = "ssse3"`). Buffers must cover the ranges indicated by
/// `width`, `height`, `stride` — in particular each `src` row must provide at
/// least `3 * width` readable bytes (so `stride >= 3 * width`), and each
/// output plane must hold `width * height` bytes.
#[allow(dead_code)] // AVX2 takes the BGR path when both are compiled
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
pub(super) unsafe fn bgr_to_hsv_planes(
  h_out: &mut [u8],
  s_out: &mut [u8],
  v_out: &mut [u8],
  src: &[u8],
  width: u32,
  height: u32,
  stride: u32,
) {
  const LANES: usize = 16;
  let w = width as usize;
  let h = height as usize;
  let s = stride as usize;
  // Pixels covered by the SIMD loop; the rest is handled by the scalar tail.
  let whole = w / LANES * LANES;

  // Hoist the nine deinterleave masks out of the per-row loop.
  let m_b0 = unsafe { _mm_loadu_si128(BLK0_B.as_ptr() as *const __m128i) };
  let m_g0 = unsafe { _mm_loadu_si128(BLK0_G.as_ptr() as *const __m128i) };
  let m_r0 = unsafe { _mm_loadu_si128(BLK0_R.as_ptr() as *const __m128i) };
  let m_b1 = unsafe { _mm_loadu_si128(BLK1_B.as_ptr() as *const __m128i) };
  let m_g1 = unsafe { _mm_loadu_si128(BLK1_G.as_ptr() as *const __m128i) };
  let m_r1 = unsafe { _mm_loadu_si128(BLK1_R.as_ptr() as *const __m128i) };
  let m_b2 = unsafe { _mm_loadu_si128(BLK2_B.as_ptr() as *const __m128i) };
  let m_g2 = unsafe { _mm_loadu_si128(BLK2_G.as_ptr() as *const __m128i) };
  let m_r2 = unsafe { _mm_loadu_si128(BLK2_R.as_ptr() as *const __m128i) };
  let zero_i = unsafe { _mm_setzero_si128() };

  for y in 0..h {
    let row_base = y * s; // byte offset of this row in `src`
    let dst_off = y * w; // pixel offset of this row in the planar outputs

    let mut x = 0;
    while x < whole {
      // `p` points at the first byte of this 16-pixel (48-byte) group.
      let p = unsafe { src.as_ptr().add(row_base + x * 3) };
      let blk0 = unsafe { _mm_loadu_si128(p as *const __m128i) };
      let blk1 = unsafe { _mm_loadu_si128(p.add(16) as *const __m128i) };
      let blk2 = unsafe { _mm_loadu_si128(p.add(32) as *const __m128i) };

      // OR-merge the three shuffled fragments: each mask zeroes exactly the
      // lanes the other two fill, so the union is the full channel vector.
      let b = unsafe {
        _mm_or_si128(
          _mm_or_si128(_mm_shuffle_epi8(blk0, m_b0), _mm_shuffle_epi8(blk1, m_b1)),
          _mm_shuffle_epi8(blk2, m_b2),
        )
      };
      let g = unsafe {
        _mm_or_si128(
          _mm_or_si128(_mm_shuffle_epi8(blk0, m_g0), _mm_shuffle_epi8(blk1, m_g1)),
          _mm_shuffle_epi8(blk2, m_g2),
        )
      };
      let r = unsafe {
        _mm_or_si128(
          _mm_or_si128(_mm_shuffle_epi8(blk0, m_r0), _mm_shuffle_epi8(blk1, m_r1)),
          _mm_shuffle_epi8(blk2, m_r2),
        )
      };

      // Widen u8x16 → two u16x8 halves per channel (zero-extend via
      // interleave with a zero vector).
      let b_lo16 = unsafe { _mm_unpacklo_epi8(b, zero_i) };
      let b_hi16 = unsafe { _mm_unpackhi_epi8(b, zero_i) };
      let g_lo16 = unsafe { _mm_unpacklo_epi8(g, zero_i) };
      let g_hi16 = unsafe { _mm_unpackhi_epi8(g, zero_i) };
      let r_lo16 = unsafe { _mm_unpacklo_epi8(r, zero_i) };
      let r_hi16 = unsafe { _mm_unpackhi_epi8(r, zero_i) };

      // Process four groups of 4 pixels each. `$half` is the u16→u32
      // zero-extending unpack selecting the low or high half.
      macro_rules! group {
        ($b16:expr, $g16:expr, $r16:expr, $half:ident) => {{
          let bu = unsafe { $half($b16, zero_i) };
          let gu = unsafe { $half($g16, zero_i) };
          let ru = unsafe { $half($r16, zero_i) };
          let bf = unsafe { _mm_cvtepi32_ps(bu) };
          let gf = unsafe { _mm_cvtepi32_ps(gu) };
          let rf = unsafe { _mm_cvtepi32_ps(ru) };
          let (hue, sat, val) = unsafe { bgr_to_hsv_f32x4(bf, gf, rf) };
          // Use add-0.5 + truncate (round half-up for non-negative values)
          // to match the scalar `round()` semantics instead of MXCSR's
          // default round-to-nearest-even via `_mm_cvtps_epi32`.
          let half = unsafe { _mm_set1_ps(0.5) };
          let hh = unsafe { _mm_mul_ps(hue, _mm_set1_ps(0.5)) };
          let h_u32 = unsafe { clamp_i32_max(_mm_cvttps_epi32(_mm_add_ps(hh, half)), 179) };
          let s_u32 = unsafe { clamp_i32_max(_mm_cvttps_epi32(_mm_add_ps(sat, half)), 255) };
          let v_u32 = unsafe { clamp_i32_max(_mm_cvttps_epi32(_mm_add_ps(val, half)), 255) };
          (h_u32, s_u32, v_u32)
        }};
      }

      let (h0, s0, v0) = group!(b_lo16, g_lo16, r_lo16, _mm_unpacklo_epi16);
      let (h1, s1, v1) = group!(b_lo16, g_lo16, r_lo16, _mm_unpackhi_epi16);
      let (h2, s2, v2) = group!(b_hi16, g_hi16, r_hi16, _mm_unpacklo_epi16);
      let (h3, s3, v3) = group!(b_hi16, g_hi16, r_hi16, _mm_unpackhi_epi16);

      // Narrow the four i32x4 results per channel back to one u8x16 store.
      let h_vec = unsafe { pack_quad(h0, h1, h2, h3) };
      let s_vec = unsafe { pack_quad(s0, s1, s2, s3) };
      let v_vec = unsafe { pack_quad(v0, v1, v2, v3) };

      unsafe {
        _mm_storeu_si128(h_out.as_mut_ptr().add(dst_off + x) as *mut __m128i, h_vec);
        _mm_storeu_si128(s_out.as_mut_ptr().add(dst_off + x) as *mut __m128i, s_vec);
        _mm_storeu_si128(v_out.as_mut_ptr().add(dst_off + x) as *mut __m128i, v_vec);
      }

      x += LANES;
    }

    // Scalar tail: same conversion, one pixel at a time, via the shared
    // scalar implementation so both paths agree bit-for-bit.
    let row = &src[row_base..row_base + w * 3];
    while x < w {
      let b = row[x * 3] as f32;
      let g = row[x * 3 + 1] as f32;
      let r = row[x * 3 + 2] as f32;
      let (hue, sat, val) = super::scalar::Scalar::bgr_to_hsv_pixel(b, g, r);
      h_out[dst_off + x] = hue;
      s_out[dst_off + x] = sat;
      v_out[dst_off + x] = val;
      x += 1;
    }
  }
}

/// Clamp each `i32x4` lane to `[0, max]`. Lanes are never negative here
/// (they were zero-extended from `u8` upstream), so only the upper bound
/// needs enforcing.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn clamp_i32_max(v: __m128i, max: i32) -> __m128i {
  let limit = unsafe { _mm_set1_epi32(max) };
  let over = unsafe { _mm_cmpgt_epi32(v, limit) };
  // Branchless XOR-select: v ^ ((v ^ limit) & over) yields `limit` in lanes
  // where `over` is all-ones and `v` where it is all-zeros.
  unsafe { _mm_xor_si128(v, _mm_and_si128(_mm_xor_si128(v, limit), over)) }
}

/// Narrow four `i32x4` vectors (lanes must already lie in `0..=255`) into a
/// single `u8x16`, preserving lane order `a | b | c | d`.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn pack_quad(a: __m128i, b: __m128i, c: __m128i, d: __m128i) -> __m128i {
  // Two-level saturating narrow: `_mm_packs_epi32` saturates to the i16
  // range (lossless for 0..=255), then `_mm_packus_epi16` saturates to u8.
  unsafe { _mm_packus_epi16(_mm_packs_epi32(a, b), _mm_packs_epi32(c, d)) }
}

/// Branch-free 4-lane BGR→HSV core. Returns `(hue ∈ [0, 360), sat, val)` as
/// `f32x4`. Caller divides hue by 2, rounds, and narrows to u8.
///
/// Inputs are the B/G/R channel values as floats in `[0, 255]`. `val` is the
/// raw per-lane max (no rescaling); `sat` is `255 * delta / max` (0 when
/// `max == 0`); `hue` follows the standard sector formula with gray pixels
/// (`delta == 0`) forced to hue 0.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn bgr_to_hsv_f32x4(b: __m128, g: __m128, r: __m128) -> (__m128, __m128, __m128) {
  let zero = unsafe { _mm_setzero_ps() };
  let one = unsafe { _mm_set1_ps(1.0) };

  let v = unsafe { _mm_max_ps(_mm_max_ps(b, g), r) };
  let min = unsafe { _mm_min_ps(_mm_min_ps(b, g), r) };
  let delta = unsafe { _mm_sub_ps(v, min) };

  // Replace zero divisors with 1.0 so the unconditional divides below never
  // produce inf/NaN; the affected lanes are overwritten by `blend` at the end.
  let delta_zero = unsafe { _mm_cmpeq_ps(delta, zero) };
  let v_zero = unsafe { _mm_cmpeq_ps(v, zero) };
  let delta_safe = unsafe { blend(delta_zero, one, delta) };

  let sixty = unsafe { _mm_set1_ps(60.0) };
  let c120 = unsafe { _mm_set1_ps(120.0) };
  let c240 = unsafe { _mm_set1_ps(240.0) };
  let c360 = unsafe { _mm_set1_ps(360.0) };
  let c255 = unsafe { _mm_set1_ps(255.0) };

  // Candidate hues for each "which channel is max" case.
  let h_r = unsafe { _mm_div_ps(_mm_mul_ps(sixty, _mm_sub_ps(g, b)), delta_safe) };
  let h_g = unsafe {
    _mm_add_ps(
      _mm_div_ps(_mm_mul_ps(sixty, _mm_sub_ps(b, r)), delta_safe),
      c120,
    )
  };
  let h_b = unsafe {
    _mm_add_ps(
      _mm_div_ps(_mm_mul_ps(sixty, _mm_sub_ps(r, g)), delta_safe),
      c240,
    )
  };

  // Select with priority r > g > b on ties: `not_r_and_g` masks lanes where
  // g is the max and r is not, so an r/g tie resolves to the r branch.
  let is_r = unsafe { _mm_cmpeq_ps(v, r) };
  let is_g = unsafe { _mm_cmpeq_ps(v, g) };
  let not_r_and_g = unsafe { _mm_andnot_ps(is_r, is_g) };
  let hue_rg = unsafe { blend(is_r, h_r, h_b) };
  let hue = unsafe { blend(not_r_and_g, h_g, hue_rg) };
  // Wrap negative hues (only the r-max branch can go negative) into [0, 360),
  // then force gray lanes (delta == 0) to hue 0.
  let neg = unsafe { _mm_cmplt_ps(hue, zero) };
  let hue = unsafe { blend(neg, _mm_add_ps(hue, c360), hue) };
  let hue = unsafe { blend(delta_zero, zero, hue) };

  // Saturation: 255 * delta / v, with v == 0 lanes forced to 0.
  let v_safe = unsafe { blend(v_zero, one, v) };
  let sat = unsafe { _mm_div_ps(_mm_mul_ps(c255, delta), v_safe) };
  let sat = unsafe { blend(v_zero, zero, sat) };

  (hue, sat, v)
}

/// Lane-wise select `mask ? t : f`, where `mask` comes from a comparison
/// intrinsic (each lane all-ones or all-zeros). SSE2 stand-in for SSE4.1's
/// `_mm_blendv_ps`.
#[allow(dead_code)]
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
#[inline]
unsafe fn blend(mask: __m128, t: __m128, f: __m128) -> __m128 {
  // XOR form of the select: f ^ ((t ^ f) & mask). Bit-identical to the
  // AND/ANDNOT/OR formulation for all-ones / all-zeros masks.
  unsafe { _mm_xor_ps(f, _mm_and_ps(_mm_xor_ps(t, f), mask)) }
}

/// SSE2 `mean_abs_diff`: `Σ|a[i] - b[i]| / n` over the first `n` bytes.
///
/// Uses `_mm_sad_epu8` — a single instruction that computes the sum of
/// absolute u8 differences for 16 bytes, returning two u16 partial sums
/// in lanes 0 and 8 of a `__m128i` (the other lanes are zero).
///
/// NOTE(review): `n == 0` yields `0.0 / 0.0 == NaN`; presumably callers
/// never pass empty frames — confirm at the call sites.
///
/// # Safety
///
/// Caller must ensure at least SSE2 is available (true on every x86_64 target).
/// Marked `ssse3` because the parent module is ssse3-gated, but only SSE2
/// instructions are used here. `a` and `b` must each hold at least `n` bytes.
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
pub(super) unsafe fn mean_abs_diff(a: &[u8], b: &[u8], n: usize) -> f64 {
  const LANES: usize = 16;
  let whole = n / LANES * LANES;
  let mut acc = unsafe { _mm_setzero_si128() }; // u64x2 accumulator

  let mut i = 0;
  while i < whole {
    let va = unsafe { _mm_loadu_si128(a.as_ptr().add(i) as *const __m128i) };
    let vb = unsafe { _mm_loadu_si128(b.as_ptr().add(i) as *const __m128i) };
    // _mm_sad_epu8: per 8-byte half, sums |a[j]-b[j]| into a u16 in
    // lanes 0 and 8. The other 6 lanes of each half are zero, so the
    // result can be accumulated directly as two u64 lanes without overflow.
    let sad = unsafe { _mm_sad_epu8(va, vb) };
    acc = unsafe { _mm_add_epi64(acc, sad) };
    i += LANES;
  }

  // Horizontal reduce u64x2 → u64.
  let hi = unsafe { _mm_srli_si128::<8>(acc) };
  let total = unsafe { _mm_add_epi64(acc, hi) };
  // `_mm_cvtsi128_si64` is x86_64-only (no 64-bit GPRs on i686).
  // Fall back to a memory round-trip on 32-bit.
  #[cfg(target_arch = "x86_64")]
  let mut sum: u64 = unsafe { _mm_cvtsi128_si64(total) as u64 };
  #[cfg(target_arch = "x86")]
  let mut sum: u64 = {
    let mut tmp = 0u64;
    unsafe { _mm_storel_epi64(&mut tmp as *mut u64 as *mut __m128i, total) };
    tmp
  };

  // Scalar tail for the last n % 16 bytes.
  while i < n {
    let da = a[i] as i32 - b[i] as i32;
    sum += da.unsigned_abs() as u64;
    i += 1;
  }

  sum as f64 / n as f64
}

/// SSSE3 Sobel 3×3. Same structure as NEON: i16x8 stencil for magnitude,
/// scalar direction.
///
/// Writes `mag[y*w + x] = |Gx| + |Gy|` (L1 gradient magnitude) and
/// `dir[y*w + x] ∈ {0, 1, 2, 3}` (direction quantized to 4 sectors, split
/// at tan(22.5°) ≈ 414/1000 and tan(67.5°) ≈ 2414/1000). The one-pixel
/// image border stays 0; degenerate sizes (`w < 3` or `h < 3`) produce
/// all-zero output.
///
/// # Safety
///
/// Caller must ensure SSSE3 is available. `input`, `mag`, and `dir` must
/// each hold at least `w * h` elements.
#[target_feature(enable = "ssse3")]
#[allow(unused_unsafe)]
pub(super) unsafe fn sobel(input: &[u8], mag: &mut [i32], dir: &mut [u8], w: usize, h: usize) {
  // Zero everything up front so border pixels need no special casing.
  mag.fill(0);
  dir.fill(0);

  const LANES: usize = 8;
  let zero_i = unsafe { _mm_setzero_si128() };

  for y in 1..h.saturating_sub(1) {
    let prev = &input[(y - 1) * w..];
    let curr = &input[y * w..];
    let next = &input[(y + 1) * w..];
    let off = y * w;

    let mut x = 1usize;

    // SIMD body: each iteration reads bytes x-1 ..= x+LANES from three rows,
    // so it requires x + LANES <= w - 1, i.e. x + LANES < w.
    while x + LANES < w {
      macro_rules! ld {
        ($row:expr, $o:expr) => {{
          let v = unsafe { _mm_loadl_epi64($row.as_ptr().add($o) as *const __m128i) };
          unsafe { _mm_unpacklo_epi8(v, zero_i) } // u8→u16, treated as i16 (values 0..255)
        }};
      }
      let pl = ld!(prev, x - 1);
      let pm = ld!(prev, x);
      let pr = ld!(prev, x + 1);
      let cl = ld!(curr, x - 1);
      let cr = ld!(curr, x + 1);
      let nl = ld!(next, x - 1);
      let nm = ld!(next, x);
      let nr = ld!(next, x + 1);

      // Gx = (pr + 2*cr + nr) - (pl + 2*cl + nl)
      let gx = unsafe {
        let pos = _mm_add_epi16(_mm_add_epi16(pr, _mm_slli_epi16::<1>(cr)), nr);
        let neg = _mm_add_epi16(_mm_add_epi16(pl, _mm_slli_epi16::<1>(cl)), nl);
        _mm_sub_epi16(pos, neg)
      };
      // Gy = (nl + 2*nm + nr) - (pl + 2*pm + pr)
      let gy = unsafe {
        let pos = _mm_add_epi16(_mm_add_epi16(nl, _mm_slli_epi16::<1>(nm)), nr);
        let neg = _mm_add_epi16(_mm_add_epi16(pl, _mm_slli_epi16::<1>(pm)), pr);
        _mm_sub_epi16(pos, neg)
      };

      // |Gx| + |Gy| fits i16: each gradient is at most 4*255 = 1020.
      let mag_i16 = unsafe { _mm_add_epi16(_mm_abs_epi16(gx), _mm_abs_epi16(gy)) };

      // Widen i16→i32 (sign-extend via interleave with the sign mask) and store.
      let lo = unsafe { _mm_unpacklo_epi16(mag_i16, _mm_cmpgt_epi16(zero_i, mag_i16)) };
      let hi = unsafe { _mm_unpackhi_epi16(mag_i16, _mm_cmpgt_epi16(zero_i, mag_i16)) };
      unsafe {
        _mm_storeu_si128(mag.as_mut_ptr().add(off + x) as *mut __m128i, lo);
        _mm_storeu_si128(mag.as_mut_ptr().add(off + x + 4) as *mut __m128i, hi);
      }

      // Direction: scalar (no cheap SIMD atan2 equivalent needed for 4 bins).
      let gx_arr: [i16; 8] = unsafe { core::mem::transmute(gx) };
      let gy_arr: [i16; 8] = unsafe { core::mem::transmute(gy) };
      for j in 0..LANES {
        let ax = gx_arr[j].unsigned_abs() as u32;
        let ay = gy_arr[j].unsigned_abs() as u32;
        dir[off + x + j] = if ay * 1000 < ax * 414 {
          0
        } else if ay * 1000 > ax * 2414 {
          2
        } else if (gx_arr[j] >= 0) == (gy_arr[j] >= 0) {
          1
        } else {
          3
        };
      }

      x += LANES;
    }

    // Scalar tail. Written as `x + 1 < w` rather than `x < w - 1`: the
    // latter underflows `usize` when `w == 0`, which would wrap around in
    // release builds and index out of bounds.
    while x + 1 < w {
      let i = |yy: usize, xx: usize| input[yy * w + xx] as i32;
      let gx = -i(y - 1, x - 1) - 2 * i(y, x - 1) - i(y + 1, x - 1)
        + i(y - 1, x + 1)
        + 2 * i(y, x + 1)
        + i(y + 1, x + 1);
      let gy = -i(y - 1, x - 1) - 2 * i(y - 1, x) - i(y - 1, x + 1)
        + i(y + 1, x - 1)
        + 2 * i(y + 1, x)
        + i(y + 1, x + 1);
      mag[off + x] = gx.abs() + gy.abs();
      let ax = gx.unsigned_abs();
      let ay = gy.unsigned_abs();
      dir[off + x] = if ay * 1000 < ax * 414 {
        0
      } else if ay * 1000 > ax * 2414 {
        2
      } else if gx.signum() == gy.signum() {
        1
      } else {
        3
      };
      x += 1;
    }
  }
}