scirs2-vision 0.4.4

Computer vision module for SciRS2 (scirs2-vision)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
//! Video frame buffer and temporal processing utilities.
//!
//! This module provides data structures for working with video frame sequences
//! and temporal algorithms such as per-pixel temporal median filtering and
//! background subtraction using a Mixture of Gaussians (MoG) model.
//!
//! # Overview
//!
//! - [`VideoFrame`] -- a single video frame with metadata
//! - [`FrameBuffer`] -- a circular (ring) buffer of [`VideoFrame`]s
//! - [`temporal_median_filter`] -- per-pixel median over a window of RGB frames
//! - [`MogBackground`] / [`background_subtraction_mog`] -- per-pixel MoG foreground detection
//! - [`frame_interpolation`] -- flow-based frame interpolation

use crate::error::{Result, VisionError};
use crate::optical_flow_dense::warp_image;
use scirs2_core::ndarray::{Array2, Array3};
use std::collections::VecDeque;

// ---------------------------------------------------------------------------
// VideoFrame
// ---------------------------------------------------------------------------

/// A single video frame with associated timestamp and spatial metadata.
#[derive(Debug, Clone)]
pub struct VideoFrame {
    /// Pixel data stored as `[height, width, channels]` in `[0, 1]`.
    pub data: Array3<f64>,
    /// Timestamp in seconds.
    pub timestamp: f64,
    /// Frame width in pixels.
    pub width: usize,
    /// Frame height in pixels.
    pub height: usize,
}

impl VideoFrame {
    /// Create a new [`VideoFrame`].
    ///
    /// # Errors
    ///
    /// Returns an error if the array dimensions are inconsistent with the
    /// supplied `width` and `height`.
    pub fn new(data: Array3<f64>, timestamp: f64) -> Result<Self> {
        let shape = data.dim();
        let height = shape.0;
        let width = shape.1;
        Ok(Self {
            data,
            timestamp,
            width,
            height,
        })
    }

    /// Return the number of colour channels.
    pub fn channels(&self) -> usize {
        self.data.dim().2
    }

    /// Extract a single channel as an `Array2<f64>`.
    pub fn channel(&self, ch: usize) -> Result<Array2<f64>> {
        if ch >= self.channels() {
            return Err(VisionError::InvalidParameter(format!(
                "channel index {ch} out of range (frame has {} channels)",
                self.channels()
            )));
        }
        Ok(self
            .data
            .slice(scirs2_core::ndarray::s![.., .., ch])
            .to_owned())
    }

    /// Convert to grayscale by averaging all channels.
    pub fn to_grayscale(&self) -> Array2<f64> {
        let (h, w, c) = self.data.dim();
        let mut gray = Array2::<f64>::zeros((h, w));
        let weight = 1.0 / c as f64;
        for ch in 0..c {
            for r in 0..h {
                for col in 0..w {
                    gray[[r, col]] += self.data[[r, col, ch]] * weight;
                }
            }
        }
        gray
    }
}

// ---------------------------------------------------------------------------
// FrameBuffer (circular buffer)
// ---------------------------------------------------------------------------

/// A capacity-bounded circular buffer of [`VideoFrame`]s.
///
/// When the buffer is full, the oldest frame is automatically evicted.
#[derive(Debug, Clone)]
pub struct FrameBuffer {
    /// Internal double-ended queue acting as a ring buffer.
    pub frames: VecDeque<VideoFrame>,
    /// Maximum number of frames retained.
    pub capacity: usize,
}

impl FrameBuffer {
    /// Create a new `FrameBuffer` with the given capacity.
    pub fn new(capacity: usize) -> Result<Self> {
        if capacity == 0 {
            return Err(VisionError::InvalidParameter(
                "FrameBuffer: capacity must be at least 1".into(),
            ));
        }
        Ok(Self {
            frames: VecDeque::with_capacity(capacity),
            capacity,
        })
    }

    /// Push a frame onto the back.  If the buffer is at capacity the oldest
    /// (front) frame is dropped first.
    pub fn push(&mut self, frame: VideoFrame) {
        if self.frames.len() == self.capacity {
            self.frames.pop_front();
        }
        self.frames.push_back(frame);
    }

    /// Number of frames currently stored.
    pub fn len(&self) -> usize {
        self.frames.len()
    }

    /// Returns `true` if the buffer contains no frames.
    pub fn is_empty(&self) -> bool {
        self.frames.is_empty()
    }

    /// Whether the buffer has reached its maximum capacity.
    pub fn is_full(&self) -> bool {
        self.frames.len() == self.capacity
    }

    /// Retrieve a reference to the most recent frame, if any.
    pub fn latest(&self) -> Option<&VideoFrame> {
        self.frames.back()
    }

    /// Iterate over frames in chronological order.
    pub fn iter(&self) -> impl Iterator<Item = &VideoFrame> {
        self.frames.iter()
    }
}

// ---------------------------------------------------------------------------
// Temporal median filter
// ---------------------------------------------------------------------------

/// Apply a per-pixel temporal median filter over a slice of RGB (or N-channel) frames.
///
/// All frames must have the same spatial shape `[H, W, C]`.  The output is
/// the per-pixel, per-channel median over the supplied window.
///
/// # Arguments
///
/// * `frames`  – slice of at least 1 frame, each `[H, W, C]` in `[0, 1]`
/// * `window`  – number of frames to include in the median (capped to
///   `frames.len()` if larger)
pub fn temporal_median_filter(frames: &[Array3<f64>], window: usize) -> Result<Array3<f64>> {
    if frames.is_empty() {
        return Err(VisionError::InvalidParameter(
            "temporal_median_filter: frames slice must not be empty".into(),
        ));
    }
    let window = window.min(frames.len()).max(1);
    let ref_shape = frames[0].dim();
    for (i, f) in frames.iter().enumerate() {
        if f.dim() != ref_shape {
            return Err(VisionError::DimensionMismatch(format!(
                "temporal_median_filter: frame {i} shape {:?} != reference {:?}",
                f.dim(),
                ref_shape
            )));
        }
    }

    let (h, w, c) = ref_shape;
    let start = frames.len().saturating_sub(window);
    let window_frames = &frames[start..];
    let n = window_frames.len();
    let mut output = Array3::<f64>::zeros((h, w, c));

    for row in 0..h {
        for col in 0..w {
            for ch in 0..c {
                let mut vals: Vec<f64> = window_frames.iter().map(|f| f[[row, col, ch]]).collect();
                vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
                output[[row, col, ch]] = if n % 2 == 1 {
                    vals[n / 2]
                } else {
                    (vals[n / 2 - 1] + vals[n / 2]) * 0.5
                };
            }
        }
    }

    Ok(output)
}

// ---------------------------------------------------------------------------
// Mixture of Gaussians background subtraction
// ---------------------------------------------------------------------------

/// Configuration for the Mixture of Gaussians background model.
#[derive(Debug, Clone)]
pub struct MogBackground {
    /// Number of Gaussian components per pixel.
    pub n_gaussians: usize,
    /// Learning rate for updating the model (α).  Typical: 0.005.
    pub learning_rate: f64,
    /// Mahalanobis distance threshold for foreground / background decision.
    pub threshold: f64,
    // Per-pixel model: [H, W, K] tensors for mean, variance and weight.
    means: Option<Array3<f64>>,
    variances: Option<Array3<f64>>,
    weights: Option<Array3<f64>>,
}

impl MogBackground {
    /// Create a new uninitialised MoG background model.
    ///
    /// The model is initialised on the first call to [`background_subtraction_mog`].
    pub fn new(n_gaussians: usize, learning_rate: f64, threshold: f64) -> Result<Self> {
        if n_gaussians == 0 {
            return Err(VisionError::InvalidParameter(
                "MogBackground: n_gaussians must be at least 1".into(),
            ));
        }
        if !(0.0..=1.0).contains(&learning_rate) {
            return Err(VisionError::InvalidParameter(
                "MogBackground: learning_rate must be in (0, 1]".into(),
            ));
        }
        Ok(Self {
            n_gaussians,
            learning_rate,
            threshold,
            means: None,
            variances: None,
            weights: None,
        })
    }
}

impl Default for MogBackground {
    fn default() -> Self {
        Self {
            n_gaussians: 3,
            learning_rate: 0.005,
            threshold: 2.5,
            means: None,
            variances: None,
            weights: None,
        }
    }
}

/// Apply Mixture of Gaussians background subtraction to a single-channel frame.
///
/// Updates `background_model` in place and returns a boolean mask where
/// `true` indicates a foreground (moving) pixel.
///
/// The frame values are expected in `[0, 1]`.
pub fn background_subtraction_mog(
    frame: &Array2<f64>,
    background_model: &mut MogBackground,
) -> Result<Array2<bool>> {
    let (rows, cols) = frame.dim();
    let k = background_model.n_gaussians;
    let alpha = background_model.learning_rate;
    let thr = background_model.threshold;

    // Initialise model on first call.
    if background_model.means.is_none() {
        // All components start at the current frame value with high variance
        // and equal weight.
        let init_weight = 1.0 / k as f64;
        let mut means = Array3::<f64>::zeros((rows, cols, k));
        let variances = Array3::<f64>::from_elem((rows, cols, k), 0.01);
        let weights = Array3::<f64>::from_elem((rows, cols, k), init_weight);
        // Copy frame pixel into each component mean.
        for r in 0..rows {
            for c in 0..cols {
                for ki in 0..k {
                    means[[r, c, ki]] = frame[[r, c]];
                }
            }
        }
        background_model.means = Some(means);
        background_model.variances = Some(variances);
        background_model.weights = Some(weights);
    }

    let means = background_model.means.as_mut().expect("means initialised");
    let variances = background_model
        .variances
        .as_mut()
        .expect("variances initialised");
    let weights = background_model
        .weights
        .as_mut()
        .expect("weights initialised");

    let mut fg_mask = Array2::<bool>::from_elem((rows, cols), false);

    for r in 0..rows {
        for c in 0..cols {
            let pixel = frame[[r, c]];
            let mut matched = false;
            let mut best_ki = 0usize;

            // Find matching component (Mahalanobis distance check).
            for ki in 0..k {
                let diff = pixel - means[[r, c, ki]];
                let var = variances[[r, c, ki]];
                if var > 1e-12 && diff * diff / var < thr * thr {
                    // Update matching component.
                    let rho = alpha / weights[[r, c, ki]].max(1e-12);
                    means[[r, c, ki]] += rho * diff;
                    variances[[r, c, ki]] = (1.0 - rho) * var + rho * diff * diff;
                    weights[[r, c, ki]] = (1.0 - alpha) * weights[[r, c, ki]] + alpha;

                    matched = true;
                    best_ki = ki;
                    break;
                }
            }

            if !matched {
                // Replace the least-weighted component.
                let mut min_w = weights[[r, c, 0]];
                let mut min_ki = 0;
                for ki in 1..k {
                    if weights[[r, c, ki]] < min_w {
                        min_w = weights[[r, c, ki]];
                        min_ki = ki;
                    }
                }
                means[[r, c, min_ki]] = pixel;
                variances[[r, c, min_ki]] = 0.01;
                weights[[r, c, min_ki]] = alpha;
                best_ki = min_ki;
            }

            // Renormalise weights.
            let mut w_sum = 0.0_f64;
            for ki in 0..k {
                w_sum += weights[[r, c, ki]];
            }
            if w_sum > 1e-12 {
                for ki in 0..k {
                    weights[[r, c, ki]] /= w_sum;
                }
            }

            // Foreground decision: pixel belongs to background if its matched
            // component has weight above 1/K (dominant component check).
            let is_bg = matched && weights[[r, c, best_ki]] > 1.0 / k as f64;
            fg_mask[[r, c]] = !is_bg;
        }
    }

    Ok(fg_mask)
}

// ---------------------------------------------------------------------------
// Flow-based frame interpolation
// ---------------------------------------------------------------------------

/// Interpolate a frame between `frame1` and `frame2` at time `t ∈ [0, 1]`.
///
/// Uses the supplied optical flow `flow` (computed from `frame1` to `frame2`)
/// to warp `frame1` forward by `t` and `frame2` backward by `(1 - t)`, then
/// blends the two warped images linearly.
///
/// # Arguments
///
/// * `frame1` – source frame `[H, W, C]`
/// * `frame2` – target frame `[H, W, C]`
/// * `t`      – interpolation parameter in `[0, 1]` (0 → frame1, 1 → frame2)
/// * `flow`   – `(u, v)` flow field from frame1 to frame2 (shape `[H, W]` each)
pub fn frame_interpolation(
    frame1: &Array3<f64>,
    frame2: &Array3<f64>,
    t: f64,
    flow: (&Array2<f64>, &Array2<f64>),
) -> Result<Array3<f64>> {
    let shape = frame1.dim();
    if shape != frame2.dim() {
        return Err(VisionError::DimensionMismatch(
            "frame_interpolation: frame1 and frame2 must have identical shapes".into(),
        ));
    }
    let (u, v) = flow;
    let (h, w, _) = shape;
    if u.dim() != (h, w) || v.dim() != (h, w) {
        return Err(VisionError::DimensionMismatch(
            "frame_interpolation: flow field spatial shape must match frame spatial shape".into(),
        ));
    }
    if !(0.0..=1.0).contains(&t) {
        return Err(VisionError::InvalidParameter(
            "frame_interpolation: t must be in [0, 1]".into(),
        ));
    }

    let (rows, cols, channels) = shape;

    // Forward warp: scale flow by t.
    let u_fwd = u.mapv(|x| x * t);
    let v_fwd = v.mapv(|x| x * t);
    // Backward warp: scale flow by -(1-t).
    let u_bwd = u.mapv(|x| -x * (1.0 - t));
    let v_bwd = v.mapv(|x| -x * (1.0 - t));

    let mut output = Array3::<f64>::zeros((rows, cols, channels));

    for ch in 0..channels {
        let ch1 = frame1
            .slice(scirs2_core::ndarray::s![.., .., ch])
            .to_owned();
        let ch2 = frame2
            .slice(scirs2_core::ndarray::s![.., .., ch])
            .to_owned();

        let warped1 = warp_image(&ch1, &u_fwd, &v_fwd)?;
        let warped2 = warp_image(&ch2, &u_bwd, &v_bwd)?;

        for r in 0..rows {
            for c in 0..cols {
                output[[r, c, ch]] = (1.0 - t) * warped1[[r, c]] + t * warped2[[r, c]];
            }
        }
    }

    Ok(output)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::{Array2, Array3};

    fn rgb_frame(h: usize, w: usize, val: f64) -> Array3<f64> {
        Array3::from_elem((h, w, 3), val)
    }

    #[test]
    fn frame_buffer_circular_eviction() {
        let mut buf = FrameBuffer::new(3).expect("FrameBuffer::new failed");
        for i in 0..5u32 {
            let data = rgb_frame(4, 4, i as f64 / 10.0);
            let frame = VideoFrame::new(data, i as f64).expect("VideoFrame::new failed");
            buf.push(frame);
        }
        assert_eq!(buf.len(), 3);
        // Oldest kept should be frame 2 (timestamp = 2.0).
        assert!((buf.frames[0].timestamp - 2.0).abs() < 1e-9);
    }

    #[test]
    fn temporal_median_filter_single_frame() {
        let f = rgb_frame(4, 4, 0.7);
        let out =
            temporal_median_filter(std::slice::from_ref(&f), 1).expect("median filter failed");
        for &v in out.iter() {
            assert!((v - 0.7).abs() < 1e-10);
        }
    }

    #[test]
    fn temporal_median_filter_three_frames() {
        // Three frames: 0.2, 0.5, 0.8 → median should be 0.5.
        let f1 = rgb_frame(2, 2, 0.2);
        let f2 = rgb_frame(2, 2, 0.5);
        let f3 = rgb_frame(2, 2, 0.8);
        let out = temporal_median_filter(&[f1, f2, f3], 3).expect("median filter failed");
        for &v in out.iter() {
            assert!((v - 0.5).abs() < 1e-10, "expected 0.5, got {v}");
        }
    }

    #[test]
    fn mog_background_first_frame_all_foreground_zero() {
        // On the very first frame everything is initialised to match the pixel,
        // so we expect either all background or well-defined behaviour.
        let frame = Array2::from_elem((4, 4), 0.5_f64);
        let mut model = MogBackground::new(3, 0.005, 2.5).expect("MogBackground::new failed");
        let mask = background_subtraction_mog(&frame, &mut model).expect("mog failed");
        // After first frame init, every pixel "matched" the newly created component
        // but the component weight equals alpha (< 1/K), so all are foreground.
        // Either way the mask must have the correct shape.
        assert_eq!(mask.dim(), (4, 4));
    }

    #[test]
    fn mog_background_converges_to_background() {
        // After many identical frames the model should classify them as background.
        let frame = Array2::from_elem((4, 4), 0.5_f64);
        let mut model = MogBackground::new(3, 0.1, 2.5).expect("MogBackground::new failed");
        for _ in 0..50 {
            let _ = background_subtraction_mog(&frame, &mut model);
        }
        let mask = background_subtraction_mog(&frame, &mut model).expect("mog failed");
        // All pixels should be background after convergence.
        for val in mask.iter() {
            assert!(!val, "expected background, got foreground");
        }
    }

    #[test]
    fn frame_interpolation_at_zero_returns_frame1() {
        let f1 = rgb_frame(4, 4, 0.2);
        let f2 = rgb_frame(4, 4, 0.8);
        let u = Array2::zeros((4, 4));
        let v = Array2::zeros((4, 4));
        let out = frame_interpolation(&f1, &f2, 0.0, (&u, &v)).expect("interpolation failed");
        for &val in out.iter() {
            assert!((val - 0.2).abs() < 1e-10, "expected 0.2, got {val}");
        }
    }

    #[test]
    fn frame_interpolation_at_one_returns_frame2() {
        let f1 = rgb_frame(4, 4, 0.2);
        let f2 = rgb_frame(4, 4, 0.8);
        let u = Array2::zeros((4, 4));
        let v = Array2::zeros((4, 4));
        let out = frame_interpolation(&f1, &f2, 1.0, (&u, &v)).expect("interpolation failed");
        for &val in out.iter() {
            assert!((val - 0.8).abs() < 1e-10, "expected 0.8, got {val}");
        }
    }
}