use super::CPUProcessor;
use crate::Result;
use edgefirst_decoder::{DetectBox, Segmentation};
use ndarray::Axis;
impl CPUProcessor {
/// Alpha-blends a multi-class segmentation mask onto an interleaved destination image.
///
/// `segmentation.segmentation` has shape `[height, width, classes]`. Every destination
/// pixel inside the mask's normalized box is mapped to the nearest mask cell, the class
/// with the highest score at that cell is selected, and the pixel is blended with
/// `self.colors[label % self.colors.len()]`. Cells whose winning class is the last class
/// index are skipped (presumably the background class — confirm against the model).
///
/// # Arguments
///
/// * `dst_w`, `dst_h` - destination size in pixels; scales the normalized mask box.
/// * `dst_rs` - index step between consecutive rows of `dst_slice`.
/// * `dst_c` - index step between consecutive pixels of a row; only the first three
///   elements of each pixel are written.
/// * `dst_slice` - destination pixel data, modified in place.
/// * `segmentation` - mask and its normalized `xmin`/`ymin`/`xmax`/`ymax` placement.
/// * `opacity` - multiplier for the colour's fourth (alpha) component. The resulting
///   alpha is capped at 255, so values above `1.0` cannot overflow the blend.
///
/// # Panics
///
/// Panics if `dst_slice` is too short for a pixel that has to be written.
#[allow(clippy::too_many_arguments)]
pub(super) fn render_modelpack_segmentation(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    segmentation: &Segmentation,
    opacity: f32,
) -> Result<()> {
    use ndarray_stats::QuantileExt;

    let seg = &segmentation.segmentation;
    let [seg_height, seg_width, seg_classes] = *seg.shape() else {
        unreachable!("Array3 did not have [usize; 3] as shape");
    };
    // An empty mask carries nothing to draw. Bailing out here also keeps the
    // `- 1` computations below from underflowing and `argmax` from seeing an
    // empty class lane.
    if seg_height == 0 || seg_width == 0 || seg_classes == 0 {
        return Ok(());
    }

    let start_y = (dst_h as f32 * segmentation.ymin).round();
    let end_y = (dst_h as f32 * segmentation.ymax).round();
    let start_x = (dst_w as f32 * segmentation.xmin).round();
    let end_x = (dst_w as f32 * segmentation.xmax).round();

    // Maps the first/last destination pixel of a span onto the first/last mask
    // cell. A span of a single pixel always samples cell 0; spelling that out
    // avoids producing an infinite or NaN scale.
    let axis_scale = |cells: usize, span: f32| -> f32 {
        if span > 1.0 {
            (cells as f32 - 1.0) / (span - 1.0)
        } else {
            0.0
        }
    };
    let scale_x = axis_scale(seg_width, end_x - start_x);
    let scale_y = axis_scale(seg_height, end_y - start_y);

    // Float-to-integer casts saturate, so a negative start becomes 0.
    let start_x_u = (start_x as usize).min(dst_w);
    let start_y_u = (start_y as usize).min(dst_h);
    let end_x_u = (end_x as usize).min(dst_w);
    let end_y_u = (end_y as usize).min(dst_h);

    // Winning class per mask cell, computed once up front.
    let argmax = seg.map_axis(Axis(2), |scores| {
        scores.argmax().expect("class axis is non-empty")
    });
    let max_row = seg_height - 1;
    let max_col = seg_width - 1;
    let skipped_label = seg_classes - 1;

    for y in start_y_u..end_y_u {
        let row = (((y as f32 - start_y) * scale_y).round() as usize).min(max_row);
        for x in start_x_u..end_x_u {
            let col = (((x as f32 - start_x) * scale_x).round() as usize).min(max_col);
            let label = argmax[[row, col]];
            if label == skipped_label {
                continue;
            }

            let color = self.colors[label % self.colors.len()];
            // Cap at 255 so `255 - alpha` cannot underflow when `opacity > 1.0`.
            // With alpha <= 255 the weighted sum below is at most 255 * 255 and
            // therefore fits in a u16.
            let alpha = ((color[3] as f32 * opacity).round() as u16).min(255);
            let inv_alpha = 255 - alpha;

            let dst_index = y * dst_rs + x * dst_c;
            for (dst, &src) in dst_slice[dst_index..dst_index + 3]
                .iter_mut()
                .zip(&color[..3])
            {
                *dst = ((src as u16 * alpha + *dst as u16 * inv_alpha) / 255) as u8;
            }
        }
    }
    Ok(())
}
/// Alpha-blends a single-channel instance mask onto an interleaved destination image.
///
/// `segmentation.segmentation` has shape `[height, width, 1]` (checked with a
/// `debug_assert_eq!`). Every destination pixel inside the mask's normalized box is
/// mapped to a mask cell by truncation; cells with a value of at least 127 are blended
/// with `self.colors[class % self.colors.len()]`, all other pixels are left untouched.
///
/// # Arguments
///
/// * `dst_w`, `dst_h` - destination size in pixels; scales the normalized mask box.
/// * `dst_rs` - index step between consecutive rows of `dst_slice`.
/// * `dst_c` - index step between consecutive pixels of a row; only the first three
///   elements of each pixel are written.
/// * `dst_slice` - destination pixel data, modified in place.
/// * `segmentation` - mask and its normalized `xmin`/`ymin`/`xmax`/`ymax` placement.
/// * `class` - selects the colour used for every pixel of this mask.
/// * `opacity` - multiplier for the colour's fourth (alpha) component. The resulting
///   alpha is capped at 255, so values above `1.0` cannot overflow the blend.
///
/// # Panics
///
/// Panics if `dst_slice` is too short for a pixel that has to be written.
#[allow(clippy::too_many_arguments)]
pub(super) fn render_yolo_segmentation(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    segmentation: &Segmentation,
    class: usize,
    opacity: f32,
) -> Result<()> {
    let seg = &segmentation.segmentation;
    let [seg_height, seg_width, classes] = *seg.shape() else {
        unreachable!("Array3 did not have [usize;3] as shape");
    };
    debug_assert_eq!(classes, 1);

    let start_y = (dst_h as f32 * segmentation.ymin).round();
    let end_y = (dst_h as f32 * segmentation.ymax).round();
    let start_x = (dst_w as f32 * segmentation.xmin).round();
    let end_x = (dst_w as f32 * segmentation.xmax).round();

    // Maps the first/last destination pixel of a span onto the first/last mask
    // cell. A span of a single pixel always samples cell 0; spelling that out
    // avoids producing an infinite or NaN scale.
    let axis_scale = |cells: usize, span: f32| -> f32 {
        if span > 1.0 {
            (cells as f32 - 1.0) / (span - 1.0)
        } else {
            0.0
        }
    };
    let scale_x = axis_scale(seg_width, end_x - start_x);
    let scale_y = axis_scale(seg_height, end_y - start_y);

    // Float-to-integer casts saturate, so a negative start becomes 0.
    let start_x_u = (start_x as usize).min(dst_w);
    let start_y_u = (start_y as usize).min(dst_h);
    let end_x_u = (end_x as usize).min(dst_w);
    let end_y_u = (end_y as usize).min(dst_h);

    // Colour and alpha depend only on `class`, so resolve them once.
    let color = self.colors[class % self.colors.len()];
    // Cap at 255 so `255 - alpha` cannot underflow when `opacity > 1.0`. With
    // alpha <= 255 the weighted sum below is at most 255 * 255 and fits in a u16.
    let alpha = ((color[3] as f32 * opacity).round() as u16).min(255);
    let inv_alpha = 255 - alpha;

    for y in start_y_u..end_y_u {
        let seg_y = ((y as f32 - start_y) * scale_y) as usize;
        for x in start_x_u..end_x_u {
            let seg_x = ((x as f32 - start_x) * scale_x) as usize;
            // Cells outside the mask count as "not set".
            let val = seg.get([seg_y, seg_x, 0]).copied().unwrap_or(0);
            if val < 127 {
                continue;
            }

            let dst_index = y * dst_rs + x * dst_c;
            for (dst, &src) in dst_slice[dst_index..dst_index + 3]
                .iter_mut()
                .zip(&color[..3])
            {
                *dst = ((src as u16 * alpha + *dst as u16 * inv_alpha) / 255) as u8;
            }
        }
    }
    Ok(())
}
/// Draws the outline of every detection box onto an interleaved destination image.
///
/// Each box is converted with `to_canonical()`, clamped to `[0, 1]`, scaled to pixel
/// coordinates and outlined with a border of up to `LINE_THICKNESS` pixels that grows
/// outwards from the box and is clipped to the image. The colour comes from
/// `self.colors`, selected through `color_mode.index(idx, label)`; its fourth component
/// is ignored, so the outline is opaque.
///
/// # Arguments
///
/// * `dst_w`, `dst_h` - destination size in pixels. A zero-sized destination is a no-op.
/// * `dst_rs` - index step between consecutive rows of `dst_slice`.
/// * `dst_c` - index step between consecutive pixels of a row; only the first three
///   elements of each pixel are written.
/// * `dst_slice` - destination pixel data, modified in place.
/// * `detect` - detections to outline.
/// * `color_mode` - maps a detection's position and label to a colour index.
///
/// # Panics
///
/// Panics if `dst_slice` is too short for a pixel that has to be written.
#[allow(clippy::too_many_arguments)]
pub(super) fn render_box(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    detect: &[DetectBox],
    color_mode: crate::ColorMode,
) -> Result<()> {
    const LINE_THICKNESS: usize = 3;

    /// Writes `rgb` into every pixel of the half-open rectangle `xs` x `ys`.
    fn fill_rect(
        dst: &mut [u8],
        row_step: usize,
        pixel_step: usize,
        xs: std::ops::Range<usize>,
        ys: std::ops::Range<usize>,
        rgb: [u8; 3],
    ) {
        for y in ys {
            let row_start = y * row_step;
            for x in xs.clone() {
                let index = row_start + x * pixel_step;
                dst[index..index + 3].copy_from_slice(&rgb);
            }
        }
    }

    // `dst_w - 1` / `dst_h - 1` below would underflow for an empty destination.
    if dst_w == 0 || dst_h == 0 {
        return Ok(());
    }

    for (idx, d) in detect.iter().enumerate() {
        let color_index = color_mode.index(idx, d.label);
        let [r, g, b, _] = self.colors[color_index % self.colors.len()];
        let rgb = [r, g, b];

        let bbox = d.bbox.to_canonical();
        let xmin = bbox.xmin.clamp(0.0, 1.0);
        let ymin = bbox.ymin.clamp(0.0, 1.0);
        let xmax = bbox.xmax.clamp(0.0, 1.0);
        let ymax = bbox.ymax.clamp(0.0, 1.0);

        // Inner edge of the border. The half-pixel bias pushes the min edges
        // down and the max edges up before rounding; negative results
        // saturate to 0 in the cast.
        let left = ((dst_w - 1) as f32 * xmin - 0.5).round() as usize;
        let top = ((dst_h - 1) as f32 * ymin - 0.5).round() as usize;
        let right = ((dst_w - 1) as f32 * xmax + 0.5).round() as usize;
        let bottom = ((dst_h - 1) as f32 * ymax + 0.5).round() as usize;

        // Outer edge of the border, clipped to the image.
        let outer_left = left.saturating_sub(LINE_THICKNESS);
        let outer_top = top.saturating_sub(LINE_THICKNESS);
        let outer_right = (right + LINE_THICKNESS).min(dst_w);
        let outer_bottom = (bottom + LINE_THICKNESS).min(dst_h);

        let full_width = outer_left + 1..outer_right;

        // Top edge: rows above the box up to and including `top`.
        fill_rect(
            dst_slice,
            dst_rs,
            dst_c,
            full_width.clone(),
            outer_top + 1..top + 1,
            rgb,
        );
        // Left edge: columns left of the box up to and including `left`.
        fill_rect(
            dst_slice,
            dst_rs,
            dst_c,
            outer_left + 1..left + 1,
            top..bottom,
            rgb,
        );
        // Right edge: columns from `right` outwards.
        fill_rect(
            dst_slice,
            dst_rs,
            dst_c,
            right..outer_right,
            top..bottom,
            rgb,
        );
        // Bottom edge: rows from `bottom` outwards.
        fill_rect(
            dst_slice,
            dst_rs,
            dst_c,
            full_width,
            bottom..outer_bottom,
            rgb,
        );
    }
    Ok(())
}
/// Builds one instance mask per detection from prototype masks and mask coefficients.
///
/// Detections are paired with `proto_data.mask_coefficients` in order (pairing stops at
/// the shorter of the two). For each pair the detection box is clamped to `[0, 1]`,
/// converted to a region of the `[height, width, channels]` prototype tensor that is at
/// least one cell in each direction, and the mask for that region is computed as the
/// sigmoid of the coefficient/prototype dot product, stored as `u8` in `0..=255`.
///
/// # Arguments
///
/// * `detect` - detections whose boxes select the region of each mask.
/// * `proto_data` - prototype tensor (quantized or float) and per-detection coefficients.
/// * `letterbox` - optional `[lx0, ly0, lx1, ly1]` region. When given, the returned
///   coordinates are shifted by `(lx0, ly0)` and divided by the region's width/height
///   (a non-positive width or height leaves that axis unscaled).
///
/// # Returns
///
/// One `Segmentation` per processed detection, with coordinates clamped to `[0, 1]`.
/// Returns an empty vector when `detect` or the coefficient list is empty.
///
/// # Errors
///
/// Returns `Error::Internal` when the prototype tensor has a zero-sized height or width,
/// or when a coefficient vector's length differs from the number of prototype channels.
pub fn materialize_segmentations(
    &self,
    detect: &[crate::DetectBox],
    proto_data: &crate::ProtoData,
    letterbox: Option<[f32; 4]>,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    use edgefirst_decoder::ProtoTensor;

    if detect.is_empty() || proto_data.mask_coefficients.is_empty() {
        return Ok(Vec::new());
    }

    let (proto_h, proto_w, num_protos) = match &proto_data.protos {
        ProtoTensor::Quantized { protos, .. } => protos.dim(),
        ProtoTensor::Float(arr) => arr.dim(),
    };
    // The region below is forced to be at least 1x1, so an empty prototype
    // plane would make the mask kernels read cells that do not exist.
    if proto_h == 0 || proto_w == 0 {
        return Err(crate::Error::Internal(format!(
            "proto tensor has an empty spatial extent ({proto_h}x{proto_w})"
        )));
    }

    // Origin and inverse size of the letterbox region; identity when absent.
    let (lx0, inv_lw, ly0, inv_lh) = match letterbox {
        Some([lx0, ly0, lx1, ly1]) => {
            let lw = lx1 - lx0;
            let lh = ly1 - ly0;
            (
                lx0,
                if lw > 0.0 { 1.0 / lw } else { 1.0 },
                ly0,
                if lh > 0.0 { 1.0 / lh } else { 1.0 },
            )
        }
        None => (0.0_f32, 1.0_f32, 0.0_f32, 1.0_f32),
    };

    detect
        .iter()
        .zip(proto_data.mask_coefficients.iter())
        .map(|(det, coeff)| {
            if coeff.len() != num_protos {
                return Err(crate::Error::Internal(format!(
                    "mask coeff length {} != proto channels {num_protos}",
                    coeff.len()
                )));
            }

            let bbox = det.bbox.to_canonical();
            let xmin = bbox.xmin.clamp(0.0, 1.0);
            let ymin = bbox.ymin.clamp(0.0, 1.0);
            let xmax = bbox.xmax.clamp(0.0, 1.0);
            let ymax = bbox.ymax.clamp(0.0, 1.0);

            // Region in prototype cells: start rounded down, end rounded up,
            // both kept inside the tensor.
            let x0 = ((xmin * proto_w as f32) as usize).min(proto_w - 1);
            let y0 = ((ymin * proto_h as f32) as usize).min(proto_h - 1);
            let x1 = ((xmax * proto_w as f32).ceil() as usize).min(proto_w);
            let y1 = ((ymax * proto_h as f32).ceil() as usize).min(proto_h);
            let roi_w = x1.saturating_sub(x0).max(1);
            let roi_h = y1.saturating_sub(y0).max(1);

            let mask = match &proto_data.protos {
                ProtoTensor::Quantized {
                    protos,
                    quantization,
                } => {
                    let scale = quantization.scale;
                    let zp = quantization.zero_point as f32;
                    fused_dequant_dot_sigmoid_i8(
                        protos, coeff, scale, zp, y0, x0, roi_h, roi_w, num_protos,
                    )
                }
                ProtoTensor::Float(protos) => {
                    fused_dot_sigmoid_f32(protos, coeff, y0, x0, roi_h, roi_w, num_protos)
                }
            };

            // Region corners back in normalized coordinates, with the
            // letterbox mapping applied.
            let seg_xmin = ((x0 as f32 / proto_w as f32) - lx0) * inv_lw;
            let seg_ymin = ((y0 as f32 / proto_h as f32) - ly0) * inv_lh;
            let seg_xmax = ((x1 as f32 / proto_w as f32) - lx0) * inv_lw;
            let seg_ymax = ((y1 as f32 / proto_h as f32) - ly0) * inv_lh;

            Ok(edgefirst_decoder::Segmentation {
                xmin: seg_xmin.clamp(0.0, 1.0),
                ymin: seg_ymin.clamp(0.0, 1.0),
                xmax: seg_xmax.clamp(0.0, 1.0),
                ymax: seg_ymax.clamp(0.0, 1.0),
                segmentation: mask,
            })
        })
        .collect::<crate::Result<Vec<_>>>()
}
}
/// Bilinearly samples the prototype tensor at `(px, py)` and dots it with `coeff`.
///
/// For each of the first `num_protos` channels the four cells surrounding the sample
/// position are blended with bilinear weights, and the blended values are accumulated
/// as `sum(coeff[p] * value[p])`.
///
/// The sample position is clamped to `[0, proto_w - 1]` x `[0, proto_h - 1]` before it is
/// split into cell index and fraction, so positions outside the tensor return the value
/// of the nearest edge cell.
///
/// # Arguments
///
/// * `protos` - prototype tensor indexed as `[y, x, channel]`.
/// * `coeff` - per-channel weights; at least `num_protos` long.
/// * `num_protos` - number of channels to accumulate.
/// * `px`, `py` - sample position in prototype cells.
/// * `proto_w`, `proto_h` - extent used for clamping.
///
/// # Panics
///
/// Panics if `coeff` has fewer than `num_protos` elements or if an addressed cell lies
/// outside `protos`.
#[inline]
pub(super) fn bilinear_dot(
    protos: &ndarray::Array3<f32>,
    coeff: &[f32],
    num_protos: usize,
    px: f32,
    py: f32,
    proto_w: usize,
    proto_h: usize,
) -> f32 {
    debug_assert!(proto_w > 0 && proto_h > 0, "empty proto extent");
    let max_x = proto_w.saturating_sub(1);
    let max_y = proto_h.saturating_sub(1);

    // Clamp the coordinate itself rather than only the cell index: otherwise a
    // position such as -0.3 keeps its fractional part (0.7) and gets blended
    // towards cell 1 instead of sticking to the edge cell.
    let px = px.clamp(0.0, max_x as f32);
    let py = py.clamp(0.0, max_y as f32);

    let x0 = (px.floor() as usize).min(max_x);
    let y0 = (py.floor() as usize).min(max_y);
    let x1 = (x0 + 1).min(max_x);
    let y1 = (y0 + 1).min(max_y);

    let fx = px - px.floor();
    let fy = py - py.floor();
    let w00 = (1.0 - fx) * (1.0 - fy);
    let w10 = fx * (1.0 - fy);
    let w01 = (1.0 - fx) * fy;
    let w11 = fx * fy;

    let mut acc = 0.0f32;
    for (p, &c) in coeff[..num_protos].iter().enumerate() {
        let val = w00 * protos[[y0, x0, p]]
            + w10 * protos[[y0, x1, p]]
            + w01 * protos[[y1, x0, p]]
            + w11 * protos[[y1, x1, p]];
        acc += c * val;
    }
    acc
}
/// Cheap approximation of the logistic function `1 / (1 + e^-x)`.
///
/// Inputs at or beyond `+16` / `-16` return exactly `1.0` / `0.0`. In between, `e^-x`
/// is approximated by writing a scaled and biased copy of `-x` straight into the bit
/// pattern of an `f32`, which lets the float's exponent field do the exponentiation.
#[inline(always)]
fn fast_sigmoid(x: f32) -> f32 {
    // One unit of the f32 exponent field, i.e. 2^23.
    const EXPONENT_UNIT: f32 = (1u32 << 23) as f32;
    // Converts a natural-log exponent into exponent-field units (base 2).
    const UNITS_PER_NAT: f32 = EXPONENT_UNIT * std::f32::consts::LOG2_E;
    // Bit pattern of 1.0f32: the exponent bias shifted into place.
    const ONE_BITS: u32 = 127 << 23;

    match x {
        v if v >= 16.0 => 1.0,
        v if v <= -16.0 => 0.0,
        v => {
            let raw = (UNITS_PER_NAT * -v) as i32 + ONE_BITS as i32;
            let approx_exp = f32::from_bits(raw as u32);
            (1.0 + approx_exp).recip()
        }
    }
}
/// Computes a `u8` mask for a region of a quantized prototype tensor in one pass.
///
/// For every cell `(y0 + y, x0 + x)` of the region this evaluates
/// `sigmoid(sum_k coeff[k] * scale * (protos[.., k] - zp))` over the first `num_protos`
/// channels and stores `sigmoid * 255` rounded to the nearest integer. Dequantization is
/// folded into the coefficients (`coeff[k] * scale`) and into a single zero-point offset
/// that is subtracted after the dot product.
///
/// # Arguments
///
/// * `protos` - quantized prototype tensor indexed as `[y, x, channel]`.
/// * `coeff` - mask coefficients; only the first `num_protos` are used.
/// * `scale`, `zp` - quantization scale and zero point of `protos`.
/// * `y0`, `x0` - top-left cell of the region.
/// * `roi_h`, `roi_w` - size of the region in cells.
/// * `num_protos` - number of channels to accumulate.
///
/// # Returns
///
/// An array of shape `[roi_h, roi_w, 1]`.
///
/// # Panics
///
/// Panics if `protos` has a negative stride, if the region or `num_protos` exceeds the
/// tensor's shape, or if `coeff` has fewer than `num_protos` elements. These checks run
/// in every build profile because the inner loop reads through a raw pointer.
#[allow(clippy::too_many_arguments)]
fn fused_dequant_dot_sigmoid_i8(
    protos: &ndarray::Array3<i8>,
    coeff: &[f32],
    scale: f32,
    zp: f32,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    let mut mask = ndarray::Array3::<u8>::zeros((roi_h, roi_w, 1));
    if roi_h == 0 || roi_w == 0 {
        return mask;
    }

    // Preconditions of the unchecked reads below.
    let (proto_h, proto_w, proto_k) = protos.dim();
    assert!(
        protos.strides().iter().all(|&s| s >= 0),
        "negative strides unsupported"
    );
    assert!(
        matches!(y0.checked_add(roi_h), Some(end) if end <= proto_h)
            && matches!(x0.checked_add(roi_w), Some(end) if end <= proto_w),
        "mask region exceeds the proto tensor"
    );
    assert!(
        num_protos <= proto_k,
        "num_protos exceeds the proto channel count"
    );

    // Restrict to the channels that are actually accumulated so the zero-point
    // offset and the dot product always cover the same terms.
    let scaled_coeff: Vec<f32> = coeff[..num_protos].iter().map(|&c| c * scale).collect();
    let zp_offset: f32 = zp * scaled_coeff.iter().sum::<f32>();

    let proto_stride_y = protos.strides()[0] as usize;
    let proto_stride_x = protos.strides()[1] as usize;
    let proto_stride_k = protos.strides()[2] as usize;
    let proto_ptr = protos.as_ptr();
    let chunks = num_protos / 4;

    for y in 0..roi_h {
        for x in 0..roi_w {
            let base = (y0 + y) * proto_stride_y + (x0 + x) * proto_stride_x;
            let mut acc = 0.0f32;
            let mut k = 0;
            // Four channels per step; the summation order is part of the result.
            for _ in 0..chunks {
                // SAFETY: strides are non-negative, `y0 + y < proto_h`,
                // `x0 + x < proto_w` and `k + 3 < num_protos <= proto_k`
                // (asserted above), so every offset addresses an element of
                // `protos`.
                let (p0, p1, p2, p3) = unsafe {
                    (
                        *proto_ptr.add(base + k * proto_stride_k) as f32,
                        *proto_ptr.add(base + (k + 1) * proto_stride_k) as f32,
                        *proto_ptr.add(base + (k + 2) * proto_stride_k) as f32,
                        *proto_ptr.add(base + (k + 3) * proto_stride_k) as f32,
                    )
                };
                acc += scaled_coeff[k] * p0
                    + scaled_coeff[k + 1] * p1
                    + scaled_coeff[k + 2] * p2
                    + scaled_coeff[k + 3] * p3;
                k += 4;
            }
            // Remaining 0..=3 channels.
            while k < num_protos {
                // SAFETY: same bounds as above with `k < num_protos <= proto_k`.
                let p = unsafe { *proto_ptr.add(base + k * proto_stride_k) as f32 };
                acc += scaled_coeff[k] * p;
                k += 1;
            }
            acc -= zp_offset;
            let sigmoid = fast_sigmoid(acc);
            // `+ 0.5` turns the truncating cast into round-to-nearest.
            mask[[y, x, 0]] = (sigmoid * 255.0 + 0.5) as u8;
        }
    }
    mask
}
/// Computes a `u8` mask for a region of a float prototype tensor in one pass.
///
/// For every cell `(y0 + y, x0 + x)` of the region this evaluates
/// `sigmoid(sum_k coeff[k] * protos[.., k])` over the first `num_protos` channels and
/// stores `sigmoid * 255` rounded to the nearest integer.
///
/// # Arguments
///
/// * `protos` - prototype tensor indexed as `[y, x, channel]`.
/// * `coeff` - mask coefficients; only the first `num_protos` are used.
/// * `y0`, `x0` - top-left cell of the region.
/// * `roi_h`, `roi_w` - size of the region in cells.
/// * `num_protos` - number of channels to accumulate.
///
/// # Returns
///
/// An array of shape `[roi_h, roi_w, 1]`.
///
/// # Panics
///
/// Panics if `protos` has a negative stride, if the region or `num_protos` exceeds the
/// tensor's shape, or if `coeff` has fewer than `num_protos` elements. These checks run
/// in every build profile because the inner loop reads through a raw pointer.
fn fused_dot_sigmoid_f32(
    protos: &ndarray::Array3<f32>,
    coeff: &[f32],
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    let mut mask = ndarray::Array3::<u8>::zeros((roi_h, roi_w, 1));
    if roi_h == 0 || roi_w == 0 {
        return mask;
    }

    // Preconditions of the unchecked reads below.
    let (proto_h, proto_w, proto_k) = protos.dim();
    assert!(
        protos.strides().iter().all(|&s| s >= 0),
        "negative strides unsupported"
    );
    assert!(
        matches!(y0.checked_add(roi_h), Some(end) if end <= proto_h)
            && matches!(x0.checked_add(roi_w), Some(end) if end <= proto_w),
        "mask region exceeds the proto tensor"
    );
    assert!(
        num_protos <= proto_k,
        "num_protos exceeds the proto channel count"
    );

    // One bounds check here instead of one per coefficient access.
    let coeff = &coeff[..num_protos];

    let proto_stride_y = protos.strides()[0] as usize;
    let proto_stride_x = protos.strides()[1] as usize;
    let proto_stride_k = protos.strides()[2] as usize;
    let proto_ptr = protos.as_ptr();
    let chunks = num_protos / 4;

    for y in 0..roi_h {
        for x in 0..roi_w {
            let base = (y0 + y) * proto_stride_y + (x0 + x) * proto_stride_x;
            let mut acc = 0.0f32;
            let mut k = 0;
            // Four channels per step; the summation order is part of the result.
            for _ in 0..chunks {
                // SAFETY: strides are non-negative, `y0 + y < proto_h`,
                // `x0 + x < proto_w` and `k + 3 < num_protos <= proto_k`
                // (asserted above), so every offset addresses an element of
                // `protos`.
                let (p0, p1, p2, p3) = unsafe {
                    (
                        *proto_ptr.add(base + k * proto_stride_k),
                        *proto_ptr.add(base + (k + 1) * proto_stride_k),
                        *proto_ptr.add(base + (k + 2) * proto_stride_k),
                        *proto_ptr.add(base + (k + 3) * proto_stride_k),
                    )
                };
                acc += coeff[k] * p0 + coeff[k + 1] * p1 + coeff[k + 2] * p2 + coeff[k + 3] * p3;
                k += 4;
            }
            // Remaining 0..=3 channels.
            while k < num_protos {
                // SAFETY: same bounds as above with `k < num_protos <= proto_k`.
                let p = unsafe { *proto_ptr.add(base + k * proto_stride_k) };
                acc += coeff[k] * p;
                k += 1;
            }
            let sigmoid = fast_sigmoid(acc);
            // `+ 0.5` turns the truncating cast into round-to-nearest.
            mask[[y, x, 0]] = (sigmoid * 255.0 + 0.5) as u8;
        }
    }
    mask
}