1use super::CPUProcessor;
5use crate::Result;
6use edgefirst_decoder::{DetectBox, Segmentation};
7use ndarray::Axis;
8
9impl CPUProcessor {
10 #[allow(clippy::too_many_arguments)]
11 pub(super) fn render_modelpack_segmentation(
12 &mut self,
13 dst_w: usize,
14 dst_h: usize,
15 dst_rs: usize,
16 dst_c: usize,
17 dst_slice: &mut [u8],
18 segmentation: &Segmentation,
19 opacity: f32,
20 ) -> Result<()> {
21 use ndarray_stats::QuantileExt;
22
23 let seg = &segmentation.segmentation;
24 let [seg_height, seg_width, seg_classes] = *seg.shape() else {
25 unreachable!("Array3 did not have [usize; 3] as shape");
26 };
27 let start_y = (dst_h as f32 * segmentation.ymin).round();
28 let end_y = (dst_h as f32 * segmentation.ymax).round();
29 let start_x = (dst_w as f32 * segmentation.xmin).round();
30 let end_x = (dst_w as f32 * segmentation.xmax).round();
31
32 let scale_x = (seg_width as f32 - 1.0) / ((end_x - start_x) - 1.0);
33 let scale_y = (seg_height as f32 - 1.0) / ((end_y - start_y) - 1.0);
34
35 let start_x_u = (start_x as usize).min(dst_w);
36 let start_y_u = (start_y as usize).min(dst_h);
37 let end_x_u = (end_x as usize).min(dst_w);
38 let end_y_u = (end_y as usize).min(dst_h);
39
40 let argmax = seg.map_axis(Axis(2), |r| r.argmax().unwrap());
41 let get_value_at_nearest = |x: f32, y: f32| -> usize {
42 let x = x.round() as usize;
43 let y = y.round() as usize;
44 argmax
45 .get([y.min(seg_height - 1), x.min(seg_width - 1)])
46 .copied()
47 .unwrap_or(0)
48 };
49
50 for y in start_y_u..end_y_u {
51 for x in start_x_u..end_x_u {
52 let seg_x = (x as f32 - start_x) * scale_x;
53 let seg_y = (y as f32 - start_y) * scale_y;
54 let label = get_value_at_nearest(seg_x, seg_y);
55
56 if label == seg_classes - 1 {
57 continue;
58 }
59
60 let color = self.colors[label % self.colors.len()];
61
62 let alpha = if opacity == 1.0 {
63 color[3] as u16
64 } else {
65 (color[3] as f32 * opacity).round() as u16
66 };
67
68 let dst_index = (y * dst_rs) + (x * dst_c);
69 for c in 0..3 {
70 dst_slice[dst_index + c] = ((color[c] as u16 * alpha
71 + dst_slice[dst_index + c] as u16 * (255 - alpha))
72 / 255) as u8;
73 }
74 }
75 }
76
77 Ok(())
78 }
79
80 #[allow(clippy::too_many_arguments)]
81 pub(super) fn render_yolo_segmentation(
82 &mut self,
83 dst_w: usize,
84 dst_h: usize,
85 dst_rs: usize,
86 dst_c: usize,
87 dst_slice: &mut [u8],
88 segmentation: &Segmentation,
89 class: usize,
90 opacity: f32,
91 ) -> Result<()> {
92 let seg = &segmentation.segmentation;
93 let [seg_height, seg_width, classes] = *seg.shape() else {
94 unreachable!("Array3 did not have [usize;3] as shape");
95 };
96 debug_assert_eq!(classes, 1);
97
98 let start_y = (dst_h as f32 * segmentation.ymin).round();
99 let end_y = (dst_h as f32 * segmentation.ymax).round();
100 let start_x = (dst_w as f32 * segmentation.xmin).round();
101 let end_x = (dst_w as f32 * segmentation.xmax).round();
102
103 let scale_x = (seg_width as f32 - 1.0) / ((end_x - start_x) - 1.0);
104 let scale_y = (seg_height as f32 - 1.0) / ((end_y - start_y) - 1.0);
105
106 let start_x_u = (start_x as usize).min(dst_w);
107 let start_y_u = (start_y as usize).min(dst_h);
108 let end_x_u = (end_x as usize).min(dst_w);
109 let end_y_u = (end_y as usize).min(dst_h);
110
111 for y in start_y_u..end_y_u {
112 for x in start_x_u..end_x_u {
113 let seg_x = ((x as f32 - start_x) * scale_x) as usize;
114 let seg_y = ((y as f32 - start_y) * scale_y) as usize;
115 let val = *seg.get([seg_y, seg_x, 0]).unwrap_or(&0);
116
117 if val < 127 {
118 continue;
119 }
120
121 let color = self.colors[class % self.colors.len()];
122
123 let alpha = if opacity == 1.0 {
124 color[3] as u16
125 } else {
126 (color[3] as f32 * opacity).round() as u16
127 };
128
129 let dst_index = (y * dst_rs) + (x * dst_c);
130 for c in 0..3 {
131 dst_slice[dst_index + c] = ((color[c] as u16 * alpha
132 + dst_slice[dst_index + c] as u16 * (255 - alpha))
133 / 255) as u8;
134 }
135 }
136 }
137
138 Ok(())
139 }
140
141 pub(super) fn render_box(
142 &mut self,
143 dst_w: usize,
144 dst_h: usize,
145 dst_rs: usize,
146 dst_c: usize,
147 dst_slice: &mut [u8],
148 detect: &[DetectBox],
149 ) -> Result<()> {
150 const LINE_THICKNESS: usize = 3;
151
152 for d in detect {
153 use edgefirst_decoder::BoundingBox;
154
155 let label = d.label;
156 let [r, g, b, _] = self.colors[label % self.colors.len()];
157 let bbox = d.bbox.to_canonical();
158 let bbox = BoundingBox {
159 xmin: bbox.xmin.clamp(0.0, 1.0),
160 ymin: bbox.ymin.clamp(0.0, 1.0),
161 xmax: bbox.xmax.clamp(0.0, 1.0),
162 ymax: bbox.ymax.clamp(0.0, 1.0),
163 };
164 let inner = [
165 ((dst_w - 1) as f32 * bbox.xmin - 0.5).round() as usize,
166 ((dst_h - 1) as f32 * bbox.ymin - 0.5).round() as usize,
167 ((dst_w - 1) as f32 * bbox.xmax + 0.5).round() as usize,
168 ((dst_h - 1) as f32 * bbox.ymax + 0.5).round() as usize,
169 ];
170
171 let outer = [
172 inner[0].saturating_sub(LINE_THICKNESS),
173 inner[1].saturating_sub(LINE_THICKNESS),
174 (inner[2] + LINE_THICKNESS).min(dst_w),
175 (inner[3] + LINE_THICKNESS).min(dst_h),
176 ];
177
178 for y in outer[1] + 1..=inner[1] {
180 for x in outer[0] + 1..outer[2] {
181 let index = (y * dst_rs) + (x * dst_c);
182 dst_slice[index..(index + 3)].copy_from_slice(&[r, g, b]);
183 }
184 }
185
186 for y in inner[1]..inner[3] {
188 for x in outer[0] + 1..=inner[0] {
189 let index = (y * dst_rs) + (x * dst_c);
190 dst_slice[index..(index + 3)].copy_from_slice(&[r, g, b]);
191 }
192
193 for x in inner[2]..outer[2] {
194 let index = (y * dst_rs) + (x * dst_c);
195 dst_slice[index..(index + 3)].copy_from_slice(&[r, g, b]);
196 }
197 }
198
199 for y in inner[3]..outer[3] {
201 for x in outer[0] + 1..outer[2] {
202 let index = (y * dst_rs) + (x * dst_c);
203 dst_slice[index..(index + 3)].copy_from_slice(&[r, g, b]);
204 }
205 }
206 }
207 Ok(())
208 }
209
210 pub fn materialize_segmentations(
221 &self,
222 detect: &[crate::DetectBox],
223 proto_data: &crate::ProtoData,
224 ) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
225 use edgefirst_decoder::ProtoTensor;
226
227 if detect.is_empty() || proto_data.mask_coefficients.is_empty() {
228 return Ok(Vec::new());
229 }
230
231 let (proto_h, proto_w, num_protos) = match &proto_data.protos {
233 ProtoTensor::Quantized { protos, .. } => {
234 (protos.shape()[0], protos.shape()[1], protos.shape()[2])
235 }
236 ProtoTensor::Float(arr) => (arr.shape()[0], arr.shape()[1], arr.shape()[2]),
237 };
238
239 detect
240 .iter()
241 .zip(proto_data.mask_coefficients.iter())
242 .map(|(det, coeff)| {
243 let xmin = det.bbox.xmin.clamp(0.0, 1.0);
245 let ymin = det.bbox.ymin.clamp(0.0, 1.0);
246 let xmax = det.bbox.xmax.clamp(0.0, 1.0);
247 let ymax = det.bbox.ymax.clamp(0.0, 1.0);
248
249 let x0 = ((xmin * proto_w as f32) as usize).min(proto_w.saturating_sub(1));
251 let y0 = ((ymin * proto_h as f32) as usize).min(proto_h.saturating_sub(1));
252 let x1 = ((xmax * proto_w as f32).ceil() as usize).min(proto_w);
253 let y1 = ((ymax * proto_h as f32).ceil() as usize).min(proto_h);
254
255 let roi_w = x1.saturating_sub(x0).max(1);
256 let roi_h = y1.saturating_sub(y0).max(1);
257
258 if coeff.len() != num_protos {
259 return Err(crate::Error::Internal(format!(
260 "mask coeff length {} != proto channels {num_protos}",
261 coeff.len()
262 )));
263 }
264
265 let mask = match &proto_data.protos {
268 ProtoTensor::Quantized {
269 protos,
270 quantization,
271 } => {
272 let scale = quantization.scale;
273 let zp = quantization.zero_point as f32;
274 fused_dequant_dot_sigmoid_i8(
275 protos, coeff, scale, zp, y0, x0, roi_h, roi_w, num_protos,
276 )
277 }
278 ProtoTensor::Float(protos) => {
279 fused_dot_sigmoid_f32(protos, coeff, y0, x0, roi_h, roi_w, num_protos)
280 }
281 };
282
283 Ok(edgefirst_decoder::Segmentation {
284 xmin: x0 as f32 / proto_w as f32,
285 ymin: y0 as f32 / proto_h as f32,
286 xmax: x1 as f32 / proto_w as f32,
287 ymax: y1 as f32 / proto_h as f32,
288 segmentation: mask,
289 })
290 })
291 .collect::<crate::Result<Vec<_>>>()
292 }
293}
294
295#[inline]
301pub(super) fn bilinear_dot(
302 protos: &ndarray::Array3<f32>,
303 coeff: &[f32],
304 num_protos: usize,
305 px: f32,
306 py: f32,
307 proto_w: usize,
308 proto_h: usize,
309) -> f32 {
310 let x0 = (px.floor() as isize).clamp(0, proto_w as isize - 1) as usize;
311 let y0 = (py.floor() as isize).clamp(0, proto_h as isize - 1) as usize;
312 let x1 = (x0 + 1).min(proto_w - 1);
313 let y1 = (y0 + 1).min(proto_h - 1);
314
315 let fx = px - px.floor();
316 let fy = py - py.floor();
317
318 let w00 = (1.0 - fx) * (1.0 - fy);
319 let w10 = fx * (1.0 - fy);
320 let w01 = (1.0 - fx) * fy;
321 let w11 = fx * fy;
322
323 let mut acc = 0.0f32;
324 for p in 0..num_protos {
325 let val = w00 * protos[[y0, x0, p]]
326 + w10 * protos[[y0, x1, p]]
327 + w01 * protos[[y1, x0, p]]
328 + w11 * protos[[y1, x1, p]];
329 acc += coeff[p] * val;
330 }
331 acc
332}
333
/// Approximate logistic function `1 / (1 + e^-x)`.
///
/// Inputs at or beyond +/-16 return exactly `1.0` / `0.0`. In between,
/// `e^-x` is approximated by writing `-x * log2(e)`, scaled by 2^23 and
/// offset by the bit pattern of `1.0`, directly into the bits of an `f32`.
/// That is exact at powers of two and linearly interpolated in between.
#[inline(always)]
fn fast_sigmoid(x: f32) -> f32 {
    /// 2^23: one unit of the f32 exponent field, expressed in raw bits.
    const EXPONENT_UNIT: f32 = (1u32 << 23) as f32;
    /// Converts a natural-log exponent into raw exponent-field units.
    const BITS_PER_NAT: f32 = EXPONENT_UNIT * std::f32::consts::LOG2_E;
    /// Bit pattern of `1.0f32` (biased exponent 127, zero mantissa).
    const ONE_BITS: u32 = 127 << 23;

    match x {
        saturated if saturated >= 16.0 => 1.0,
        saturated if saturated <= -16.0 => 0.0,
        inner => {
            let raw = (BITS_PER_NAT * -inner) as i32 + ONE_BITS as i32;
            let exp_neg = f32::from_bits(raw as u32);
            1.0 / (1.0 + exp_neg)
        }
    }
}
357
358#[allow(clippy::too_many_arguments)]
367fn fused_dequant_dot_sigmoid_i8(
368 protos: &ndarray::Array3<i8>,
369 coeff: &[f32],
370 scale: f32,
371 zp: f32,
372 y0: usize,
373 x0: usize,
374 roi_h: usize,
375 roi_w: usize,
376 num_protos: usize,
377) -> ndarray::Array3<u8> {
378 debug_assert!(
379 protos.strides().iter().all(|&s| s >= 0),
380 "negative strides unsupported"
381 );
382 let scaled_coeff: Vec<f32> = coeff.iter().map(|&c| c * scale).collect();
385 let zp_offset: f32 = zp * scaled_coeff.iter().sum::<f32>();
390
391 let proto_stride_y = protos.strides()[0] as usize;
392 let proto_stride_x = protos.strides()[1] as usize;
393 let proto_stride_k = protos.strides()[2] as usize;
394 let proto_ptr = protos.as_ptr();
395
396 let mut mask = ndarray::Array3::<u8>::zeros((roi_h, roi_w, 1));
397
398 for y in 0..roi_h {
399 for x in 0..roi_w {
400 let base = (y0 + y) * proto_stride_y + (x0 + x) * proto_stride_x;
402
403 let mut acc = 0.0f32;
404 let mut k = 0;
405
406 let chunks = num_protos / 4;
408 for _ in 0..chunks {
409 unsafe {
412 let p0 = *proto_ptr.add(base + k * proto_stride_k) as f32;
413 let p1 = *proto_ptr.add(base + (k + 1) * proto_stride_k) as f32;
414 let p2 = *proto_ptr.add(base + (k + 2) * proto_stride_k) as f32;
415 let p3 = *proto_ptr.add(base + (k + 3) * proto_stride_k) as f32;
416 acc += scaled_coeff[k] * p0
417 + scaled_coeff[k + 1] * p1
418 + scaled_coeff[k + 2] * p2
419 + scaled_coeff[k + 3] * p3;
420 }
421 k += 4;
422 }
423 while k < num_protos {
425 unsafe {
428 let p = *proto_ptr.add(base + k * proto_stride_k) as f32;
429 acc += scaled_coeff[k] * p;
430 }
431 k += 1;
432 }
433
434 acc -= zp_offset;
435 let sigmoid = fast_sigmoid(acc);
436 mask[[y, x, 0]] = (sigmoid * 255.0 + 0.5) as u8;
437 }
438 }
439 mask
440}
441
442fn fused_dot_sigmoid_f32(
444 protos: &ndarray::Array3<f32>,
445 coeff: &[f32],
446 y0: usize,
447 x0: usize,
448 roi_h: usize,
449 roi_w: usize,
450 num_protos: usize,
451) -> ndarray::Array3<u8> {
452 debug_assert!(
453 protos.strides().iter().all(|&s| s >= 0),
454 "negative strides unsupported"
455 );
456 let proto_stride_y = protos.strides()[0] as usize;
457 let proto_stride_x = protos.strides()[1] as usize;
458 let proto_stride_k = protos.strides()[2] as usize;
459 let proto_ptr = protos.as_ptr();
460
461 let mut mask = ndarray::Array3::<u8>::zeros((roi_h, roi_w, 1));
462
463 for y in 0..roi_h {
464 for x in 0..roi_w {
465 let base = (y0 + y) * proto_stride_y + (x0 + x) * proto_stride_x;
466
467 let mut acc = 0.0f32;
468 let mut k = 0;
469 let chunks = num_protos / 4;
470 for _ in 0..chunks {
471 unsafe {
474 let p0 = *proto_ptr.add(base + k * proto_stride_k);
475 let p1 = *proto_ptr.add(base + (k + 1) * proto_stride_k);
476 let p2 = *proto_ptr.add(base + (k + 2) * proto_stride_k);
477 let p3 = *proto_ptr.add(base + (k + 3) * proto_stride_k);
478 acc +=
479 coeff[k] * p0 + coeff[k + 1] * p1 + coeff[k + 2] * p2 + coeff[k + 3] * p3;
480 }
481 k += 4;
482 }
483 while k < num_protos {
484 unsafe {
487 let p = *proto_ptr.add(base + k * proto_stride_k);
488 acc += coeff[k] * p;
489 }
490 k += 1;
491 }
492
493 let sigmoid = fast_sigmoid(acc);
494 mask[[y, x, 0]] = (sigmoid * 255.0 + 0.5) as u8;
495 }
496 }
497 mask
498}