od_opencv 0.10.1

Object detection utilities in the Rust programming language for YOLO-based neural networks in the OpenCV ecosystem
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
//! Ultralytics YOLO models (v8, v9, v11) using RKNN NPU.
//!
//! Warning: aggressively optimized for one SPECIFIC embedded ARM setup (Cortex-A7 / RV1106):
//! - Pre-allocated resize buffer (zero allocation per frame)
//! - Custom nearest-neighbor resize on raw bytes (no image crate overhead)
//! - Precomputed NC1HWC2 channel offsets (no division in hot loop)
//! - i8-space confidence threshold (skip float math for rejected predictions)
//! - Lazy bbox dequantization (only for detections above threshold)
//! - Incremental offset arithmetic (addition, not multiplication)

use rknn_runtime::{RknnModel, Nc1hwc2Layout};

use crate::bbox::BBox;
use crate::image_buffer::ImageBuffer;
use crate::postprocess::{Detection, nms, filter_by_class, detections_to_vecs};
use crate::preprocessing::{LetterboxMeta, PreprocessMeta, StretchMeta};

/// Error type for RKNN model operations.
///
/// Returned by the constructors and by inference on [`ModelUltralyticsRknn`].
#[derive(Debug)]
pub enum RknnModelError {
    /// Error reported by the `rknn_runtime` crate; the underlying error is
    /// wrapped unchanged.
    Rknn(rknn_runtime::Error),
    /// The model's output tensor cannot be used by this parser; the string
    /// carries a human-readable description of the problem.
    InvalidOutputShape(String),
}

impl std::fmt::Display for RknnModelError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            RknnModelError::Rknn(e) => write!(f, "RKNN error: {}", e),
            RknnModelError::InvalidOutputShape(s) => write!(f, "Invalid output shape: {}", s),
        }
    }
}

// Marker impl: all trait methods keep their defaults, so `source()` is not
// overridden and the wrapped runtime error is only visible through `Display`/`Debug`.
impl std::error::Error for RknnModelError {}

impl From<rknn_runtime::Error> for RknnModelError {
    fn from(e: rknn_runtime::Error) -> Self {
        RknnModelError::Rknn(e)
    }
}

/// Ultralytics YOLO model (v8, v9, v11) using RKNN NPU.
///
/// Expects models converted with `onnx_to_rknn.py` which:
/// - Normalizes bbox coordinates to 0-1
/// - Applies sigmoid to class scores
/// - Reshapes to 4D to avoid the RV1106 zero-copy bug
///
/// Output layout and channel offsets are precomputed at load time
/// to eliminate all division from the per-frame hot path.
pub struct ModelUltralyticsRknn {
    /// Loaded RKNN model handle; used to run inference and read the raw output.
    model: RknnModel,
    /// Network input width in pixels, read from the model's input shape at load time.
    input_width: u32,
    /// Network input height in pixels, read from the model's input shape at load time.
    input_height: u32,
    /// Class indices passed to `filter_by_class` after parsing
    /// (documented on the constructors as "empty for all classes").
    class_filters: Vec<usize>,
    /// When `true`, images that need resizing are letterboxed instead of stretched.
    use_letterbox: bool,
    /// Pre-allocated buffer for the resized input, sized
    /// `input_width * input_height * 3` bytes at load time so no per-frame
    /// allocation is needed (e.g. ~307 KB for a 320x320 input).
    resize_buf: Vec<u8>,
    /// NC1HWC2 output layout (precomputed from tensor attributes at load time).
    layout: Nc1hwc2Layout,
    /// Precomputed raw-data offset for each class channel relative to the
    /// per-prediction base offset.
    /// Computed once at load time; eliminates ch/c2 and ch%c2 division from inner loop.
    class_raw_offsets: Vec<usize>,
}

impl ModelUltralyticsRknn {
    /// Creates a new model from an RKNN file.
    ///
    /// Input size is read from the model automatically (NHWC shape).
    /// Validates that the output is NC1HWC2 format and precomputes the
    /// channel layout for zero-division inference.
    ///
    /// # Arguments
    /// * `model_path` - Path to the `.rknn` model file
    /// * `num_classes` - Number of detection classes in the model
    /// * `class_filters` - List of class indices to detect (empty for all classes)
    ///
    /// # Example
    /// ```ignore
    /// let model = ModelUltralyticsRknn::new_from_file(
    ///     "yolov8n.rknn",
    ///     80,       // COCO classes
    ///     vec![],   // detect all classes
    /// )?;
    /// ```
    pub fn new_from_file(
        model_path: &str,
        num_classes: usize,
        class_filters: Vec<usize>,
    ) -> Result<Self, RknnModelError> {
        let model = RknnModel::load(model_path)?;
        Self::from_model(model, num_classes, class_filters)
    }

    /// Creates a new model with a custom library path.
    ///
    /// Use this when `librknnmrt.so` is not at the default path (`/usr/lib/librknnmrt.so`).
    ///
    /// # Arguments
    /// * `model_path` - Path to the `.rknn` model file
    /// * `lib_path` - Path to `librknnmrt.so`
    /// * `num_classes` - Number of detection classes in the model
    /// * `class_filters` - List of class indices to detect (empty for all classes)
    ///
    /// # Example
    /// ```ignore
    /// let model = ModelUltralyticsRknn::new_with_lib(
    ///     "yolov8n.rknn",
    ///     "/opt/rknn/lib/librknnmrt.so",
    ///     80,
    ///     vec![],
    /// )?;
    /// ```
    pub fn new_with_lib(
        model_path: &str,
        lib_path: &str,
        num_classes: usize,
        class_filters: Vec<usize>,
    ) -> Result<Self, RknnModelError> {
        let model = RknnModel::load_with_lib(model_path, lib_path)?;
        Self::from_model(model, num_classes, class_filters)
    }

    /// Common tail of both constructors: reads the input size, validates the
    /// output layout and precomputes everything the per-frame path needs.
    ///
    /// # Errors
    /// * Propagates the runtime error if the NC1HWC2 layout of output 0 cannot be built.
    /// * [`RknnModelError::InvalidOutputShape`] if the layout's `c2` block size is
    ///   below 4, because the four bbox channels must live in the first block.
    ///
    /// # Panics
    /// Indexes positions 1 and 2 of the model's input shape; presumably panics if
    /// the shape has fewer than three dimensions (depends on the shape's type).
    fn from_model(
        model: RknnModel,
        num_classes: usize,
        class_filters: Vec<usize>,
    ) -> Result<Self, RknnModelError> {
        // Input tensor is NHWC ([1, H, W, 3]): pick out H and W.
        let (input_height, input_width) = {
            let dims = &model.input_attr().shape;
            (dims[1], dims[2])
        };

        // Layout construction validates format + shape and precomputes
        // the stride/offset parameters of output 0.
        let layout = model.output_nc1hwc2_layout(0)?;

        let c2 = layout.c2();
        if c2 < 4 {
            let reason = format!("NC1HWC2 c2={} < 4: bbox channels must fit in one block", c2);
            return Err(RknnModelError::InvalidOutputShape(reason));
        }

        // Class scores occupy channels 4..4+num_classes.
        let class_raw_offsets = layout.precompute_channel_offsets(4, num_classes);

        // One RGB frame at network resolution, reused for every resize.
        let resize_buf = vec![0u8; input_width as usize * input_height as usize * 3];

        Ok(Self {
            model,
            input_width,
            input_height,
            class_filters,
            // Letterboxing defaults to on only when the crate feature is enabled.
            use_letterbox: cfg!(feature = "letterbox"),
            resize_buf,
            layout,
            class_raw_offsets,
        })
    }

    /// Enables or disables letterbox preprocessing.
    ///
    /// Letterbox preserves aspect ratio by padding with gray (114, 114, 114).
    /// Default is `true` when the `letterbox` feature is enabled, `false` otherwise.
    ///
    /// The flag is read on each `forward` call, and only matters for images whose
    /// size differs from the model input size (same-size images are passed through
    /// without resizing).
    pub fn set_letterbox(&mut self, enabled: bool) {
        self.use_letterbox = enabled;
    }

    /// Returns the input size (width, height).
    ///
    /// Both values are in pixels and were read from the model's input shape
    /// when the model was loaded.
    pub fn input_size(&self) -> (u32, u32) {
        (self.input_width, self.input_height)
    }

    /// Runs the full detection pipeline on one image: preprocess, NPU
    /// inference, output parsing, class filtering and NMS.
    ///
    /// Images that already match the model input size are fed to the NPU
    /// as-is; anything else is resized (letterboxed or stretched, depending on
    /// the letterbox setting) into the pre-allocated buffer first.
    ///
    /// # Arguments
    /// * `image` - Input image buffer (RGB)
    /// * `conf_threshold` - Confidence threshold (0.0 - 1.0)
    /// * `nms_threshold` - NMS IoU threshold (0.0 - 1.0)
    ///
    /// # Returns
    /// Tuple of (bounding boxes, class IDs, confidence scores)
    ///
    /// # Errors
    /// Propagates runtime errors from running the model or reading its raw output.
    ///
    /// # Panics
    /// Panics with "ImageBuffer not contiguous" if the image cannot be viewed
    /// as a single byte slice.
    ///
    /// NOTE(review): the channel count returned by `image.shape()` is ignored;
    /// the code assumes 3 bytes per pixel.
    pub fn forward(
        &mut self,
        image: &ImageBuffer,
        conf_threshold: f32,
        nms_threshold: f32,
    ) -> Result<(Vec<BBox>, Vec<usize>, Vec<f32>), RknnModelError> {
        let (src_h, src_w, _) = image.shape();
        let net_w = self.input_width as usize;
        let net_h = self.input_height as usize;
        let needs_resize = src_w != net_w || src_h != net_h;

        // Preprocess. The returned metadata is what later maps boxes back to
        // the coordinate system of the original image.
        let meta = if !needs_resize {
            // Identity mapping: the image is handed to the NPU untouched.
            PreprocessMeta::Stretch(StretchMeta {
                scale_x: 1.0,
                scale_y: 1.0,
                original_width: src_w as i32,
                original_height: src_h as i32,
            })
        } else {
            let pixels = image.as_slice().expect("ImageBuffer not contiguous");
            if !self.use_letterbox {
                resize_nearest_rgb_into(
                    pixels, src_w, src_h,
                    &mut self.resize_buf, net_w, net_h,
                );
                PreprocessMeta::Stretch(StretchMeta {
                    scale_x: src_w as f32 / net_w as f32,
                    scale_y: src_h as f32 / net_h as f32,
                    original_width: src_w as i32,
                    original_height: src_h as i32,
                })
            } else {
                PreprocessMeta::Letterbox(resize_letterbox_nearest_into(
                    pixels, src_w, src_h,
                    &mut self.resize_buf, net_w, net_h,
                ))
            }
        };

        // Pick the bytes for the NPU: the caller's pixels when no resize was
        // needed, otherwise the pre-allocated resize buffer.
        let input_bytes: &[u8] = if needs_resize {
            &self.resize_buf
        } else {
            image.as_slice().expect("ImageBuffer not contiguous")
        };
        self.model.run(input_bytes)?;

        // Decode the raw NC1HWC2 tensor using the offsets computed at load time.
        let raw = self.model.output_raw(0)?;
        let candidates = parse_nc1hwc2_direct(
            raw,
            &self.class_raw_offsets,
            &self.layout,
            conf_threshold,
            self.input_width as f32,
            self.input_height as f32,
            &meta,
        );

        let wanted = filter_by_class(&candidates, &self.class_filters);
        let survivors = nms(&wanted, nms_threshold);
        Ok(detections_to_vecs(survivors))
    }
}

/// Exposes the RKNN model through the crate-level `ObjectDetector` trait.
impl crate::ObjectDetector for ModelUltralyticsRknn {
    /// Images are supplied as [`ImageBuffer`] (documented as RGB on `forward`).
    type Input = ImageBuffer;
    /// Same error type as the inherent API.
    type Error = RknnModelError;

    /// Delegates to [`ModelUltralyticsRknn::forward`] with the arguments unchanged
    /// and returns its result as-is: (bounding boxes, class IDs, confidence scores).
    fn detect(
        &mut self,
        input: &Self::Input,
        conf_threshold: f32,
        nms_threshold: f32,
    ) -> Result<(Vec<BBox>, Vec<usize>, Vec<f32>), Self::Error> {
        self.forward(input, conf_threshold, nms_threshold)
    }
}

// ---------------------------------------------------------------------------
// Fast preprocessing: nearest-neighbor resize into pre-allocated buffer.
// Pure Rust on raw RGB bytes - no image crate, no intermediate ImageBuffer.
// ---------------------------------------------------------------------------

/// Nearest-neighbor stretch resize into a caller-provided buffer.
///
/// Both buffers hold tightly packed 3-byte (RGB) pixels in row-major order.
/// Destination pixel `(x, y)` is copied from source pixel
/// `(x * src_w / dst_w, y * src_h / dst_h)` (integer division).
///
/// If any of the four dimensions is zero the function returns without
/// touching `dst`: there is nothing to sample from or nothing to write.
///
/// # Panics
/// Panics if `src` is shorter than `src_w * src_h * 3` bytes or `dst` is
/// shorter than `dst_w * dst_h * 3` bytes. The check runs once, before the
/// copy loop, so the loop itself stays free of bounds checks.
#[inline(never)]
fn resize_nearest_rgb_into(
    src: &[u8], src_w: usize, src_h: usize,
    dst: &mut [u8], dst_w: usize, dst_h: usize,
) {
    if src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0 {
        return;
    }

    // Validate both buffers up front; this is what makes the raw copies below sound.
    let src_need = src_w.checked_mul(src_h).and_then(|n| n.checked_mul(3));
    let dst_need = dst_w.checked_mul(dst_h).and_then(|n| n.checked_mul(3));
    assert!(
        matches!(src_need, Some(n) if n <= src.len()),
        "resize source buffer ({} bytes) is too small for a {}x{} RGB image",
        src.len(), src_w, src_h
    );
    assert!(
        matches!(dst_need, Some(n) if n <= dst.len()),
        "resize destination buffer ({} bytes) is too small for a {}x{} RGB image",
        dst.len(), dst_w, dst_h
    );

    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();

    for y in 0..dst_h {
        let src_y = (y * src_h) / dst_h;
        let dst_row = y * dst_w * 3;
        let src_row = src_y * src_w * 3;

        for x in 0..dst_w {
            let src_x = (x * src_w) / dst_w;
            let si = src_row + src_x * 3;
            let di = dst_row + x * 3;
            // SAFETY: y < dst_h and x < dst_w give src_y < src_h and src_x < src_w,
            // so si + 3 <= src_w * src_h * 3 <= src.len() and
            // di + 3 <= dst_w * dst_h * 3 <= dst.len() (both asserted above).
            // `src` and `dst` cannot overlap: one is a shared and the other a
            // unique borrow.
            unsafe {
                std::ptr::copy_nonoverlapping(src_ptr.add(si), dst_ptr.add(di), 3);
            }
        }
    }
}

/// Nearest-neighbor letterbox resize into a caller-provided buffer.
/// Preserves aspect ratio and pads with gray (114, 114, 114).
///
/// Both buffers hold tightly packed 3-byte (RGB) pixels in row-major order.
/// The source is scaled by `min(dst_w / src_w, dst_h / src_h)`, centered in
/// the destination, and the whole of `dst` is first filled with 114.
///
/// Returns the scale, the left/top padding and the original size, i.e. the
/// values needed to map coordinates back to the source image.
///
/// # Panics
/// Panics if `src` is shorter than `src_w * src_h * 3` bytes or `dst` is
/// shorter than `dst_w * dst_h * 3` bytes. The check runs once, before the
/// copy loop, so the loop itself stays free of bounds checks.
#[inline(never)]
fn resize_letterbox_nearest_into(
    src: &[u8], src_w: usize, src_h: usize,
    dst: &mut [u8], dst_w: usize, dst_h: usize,
) -> LetterboxMeta {
    // Validate both buffers up front; this is what makes the raw copies below sound.
    let src_need = src_w.checked_mul(src_h).and_then(|n| n.checked_mul(3));
    let dst_need = dst_w.checked_mul(dst_h).and_then(|n| n.checked_mul(3));
    assert!(
        matches!(src_need, Some(n) if n <= src.len()),
        "letterbox source buffer ({} bytes) is too small for a {}x{} RGB image",
        src.len(), src_w, src_h
    );
    assert!(
        matches!(dst_need, Some(n) if n <= dst.len()),
        "letterbox destination buffer ({} bytes) is too small for a {}x{} RGB image",
        dst.len(), dst_w, dst_h
    );

    let scale = f32::min(dst_w as f32 / src_w as f32, dst_h as f32 / src_h as f32);
    // Clamp so float rounding can never make the scaled image larger than the
    // destination (which would underflow the padding computation below).
    let new_w = ((src_w as f32 * scale).round() as usize).min(dst_w);
    let new_h = ((src_h as f32 * scale).round() as usize).min(dst_h);
    let pad_left = (dst_w - new_w) / 2;
    let pad_top = (dst_h - new_h) / 2;

    // Gray padding
    dst.fill(114);

    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();

    for y in 0..new_h {
        let src_y = (y * src_h) / new_h;
        let dst_row = (y + pad_top) * dst_w * 3;
        let src_row = src_y * src_w * 3;

        for x in 0..new_w {
            let src_x = (x * src_w) / new_w;
            let si = src_row + src_x * 3;
            let di = dst_row + (x + pad_left) * 3;
            // SAFETY: y < new_h and x < new_w give src_y < src_h and src_x < src_w,
            // so si + 3 <= src_w * src_h * 3 <= src.len(). Because new_h <= dst_h
            // and new_w <= dst_w, y + pad_top < dst_h and x + pad_left < dst_w, so
            // di + 3 <= dst_w * dst_h * 3 <= dst.len(). Both lengths are asserted
            // above; `src` and `dst` cannot overlap (shared vs. unique borrow).
            unsafe {
                std::ptr::copy_nonoverlapping(src_ptr.add(si), dst_ptr.add(di), 3);
            }
        }
    }

    LetterboxMeta {
        scale,
        pad_left: pad_left as i32,
        pad_top: pad_top as i32,
        original_width: src_w as i32,
        original_height: src_h as i32,
    }
}

// ---------------------------------------------------------------------------
// Direct NC1HWC2 parser with precomputed offsets.
//
// Hot loop: one table lookup + one i8 load + one comparison per class.
// No division, no float math until a detection passes the i8 threshold.
// ---------------------------------------------------------------------------

/// Parse NC1HWC2 output directly from raw i8 data.
///
/// `class_raw_offsets` maps each class index to its raw-data offset relative
/// to the prediction base. These are precomputed at model load time,
/// eliminating `ch/c2` and `ch%c2` division from the inner loop.
///
/// `layout` provides prediction stride, dequantization, and i8 threshold
/// computation - all precomputed in `rknn-runtime`.
///
/// For every prediction the best class is found by comparing raw i8 scores.
/// Only if that score reaches the i8 threshold are the score and the four
/// bbox channels (at base offsets 0..=3) dequantized. The box is scaled by
/// `input_width` / `input_height`, mapped back through `meta`, and pushed as
/// a [`Detection`]. Boxes with non-positive width or height are dropped.
///
/// # Panics
/// Panics if `raw` is too short for the offsets implied by `layout` and
/// `class_raw_offsets` (for example when the class count does not match the
/// model). The check costs one comparison per prediction and is what makes
/// the unchecked loads below sound.
#[inline(never)]
fn parse_nc1hwc2_direct(
    raw: &[i8],
    class_raw_offsets: &[usize],
    layout: &Nc1hwc2Layout,
    conf_threshold: f32,
    input_width: f32,
    input_height: f32,
    meta: &PreprocessMeta,
) -> Vec<Detection> {
    let threshold_i8 = layout.threshold_i8(conf_threshold);
    let stride = layout.prediction_stride();

    // Largest offset, relative to a prediction base, that may be read:
    // the bbox channels 0..=3 and every class channel.
    let max_rel_offset = class_raw_offsets.iter().copied().fold(3usize, usize::max);

    let mut detections = Vec::new();

    // Incremental p_offset: addition instead of p * stride multiplication
    let mut p_offset = 0usize;

    for _p in 0..layout.num_predictions() {
        // One bounds check per prediction covers every load in this iteration.
        assert!(
            matches!(p_offset.checked_add(max_rel_offset), Some(last) if last < raw.len()),
            "NC1HWC2 output too short: need index {} + {} but raw has {} elements",
            p_offset, max_rel_offset, raw.len()
        );

        // --- Scan class scores with precomputed offsets (no division) ---
        let mut best_raw = i8::MIN;
        let mut best_cls = 0usize;

        for (c, &off) in class_raw_offsets.iter().enumerate() {
            // SAFETY: off <= max_rel_offset and p_offset + max_rel_offset < raw.len()
            // was asserted at the top of this iteration.
            let v = unsafe { *raw.get_unchecked(off + p_offset) };
            if v > best_raw {
                best_raw = v;
                best_cls = c;
            }
        }

        // i8-space threshold: rejects ~99% of predictions with zero float math
        if best_raw >= threshold_i8 {
            // Dequantize only the winning score
            let best_conf = layout.dequant(best_raw);

            // Bbox channels 0-3 are at p_offset+0..3 (always in block 0, c2 >= 4)
            // SAFETY: max_rel_offset >= 3, so p_offset + 3 < raw.len() by the
            // assertion at the top of this iteration.
            let (raw_cx, raw_cy, raw_w, raw_h) = unsafe {
                (
                    *raw.get_unchecked(p_offset),
                    *raw.get_unchecked(p_offset + 1),
                    *raw.get_unchecked(p_offset + 2),
                    *raw.get_unchecked(p_offset + 3),
                )
            };
            let cx = layout.dequant(raw_cx) * input_width;
            let cy = layout.dequant(raw_cy) * input_height;
            let bw = layout.dequant(raw_w) * input_width;
            let bh = layout.dequant(raw_h) * input_height;

            if bw > 0.0 && bh > 0.0 {
                let (x, y, w_out, h_out) = meta.inverse_transform(cx, cy, bw, bh);
                detections.push(Detection::new(
                    BBox::from_center(x, y, w_out, h_out),
                    best_cls,
                    best_conf,
                ));
            }
        }

        p_offset += stride;
    }

    detections
}