use super::CPUProcessor;
use crate::Result;
use edgefirst_decoder::{DetectBox, Segmentation};
use ndarray::Axis;
impl CPUProcessor {
/// Alpha-blends a multi-class segmentation score map onto the destination image.
///
/// For every destination pixel inside the segmentation's normalized
/// `[xmin, xmax] x [ymin, ymax]` rectangle the nearest score-map cell is looked
/// up, the class with the highest score along the last axis is selected and the
/// matching entry of `self.colors` (index taken modulo the palette length) is
/// blended over the first three bytes of that pixel. Pixels whose winning class
/// is the last class index are left untouched (presumably the background
/// class — confirm against the model definition).
///
/// # Parameters
/// * `dst_w`, `dst_h` - destination size in pixels.
/// * `dst_rs` - distance in bytes between the starts of two destination rows.
/// * `dst_c` - distance in bytes between two horizontally adjacent pixels.
/// * `dst_slice` - destination bytes; only the first three bytes of a pixel are written.
/// * `segmentation` - score map plus its normalized placement rectangle.
/// * `opacity` - extra multiplier applied to the palette alpha; clamped to `[0, 1]`.
///
/// # Returns
/// Always `Ok(())`.
///
/// # Panics
/// Panics if `dst_slice` is too short for a pixel addressed through
/// `dst_rs`/`dst_c`.
#[allow(clippy::too_many_arguments)]
pub(super) fn render_modelpack_segmentation(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    segmentation: &Segmentation,
    opacity: f32,
) -> Result<()> {
    use ndarray_stats::QuantileExt;

    let seg = &segmentation.segmentation;
    let [seg_height, seg_width, seg_classes] = *seg.shape() else {
        unreachable!("Array3 did not have [usize; 3] as shape");
    };
    // An empty score map has nothing to draw. Returning here also keeps the
    // `- 1` computations and the per-cell argmax below from underflowing or
    // panicking on zero-length axes.
    if seg_height == 0 || seg_width == 0 || seg_classes == 0 {
        return Ok(());
    }
    // An opacity above 1 would push `alpha` past 255 and make `255 - alpha`
    // underflow in the blend below.
    let opacity = opacity.clamp(0.0, 1.0);

    // Placement rectangle in destination pixels (kept as floats for scaling).
    let start_y = (dst_h as f32 * segmentation.ymin).round();
    let end_y = (dst_h as f32 * segmentation.ymax).round();
    let start_x = (dst_w as f32 * segmentation.xmin).round();
    let end_x = (dst_w as f32 * segmentation.xmax).round();

    // Maps the first/last destination pixel of the rectangle onto the
    // first/last score-map cell. A one-pixel-wide rectangle yields a
    // non-finite scale; the float-to-usize casts below turn that into cell 0.
    let scale_x = (seg_width as f32 - 1.0) / ((end_x - start_x) - 1.0);
    let scale_y = (seg_height as f32 - 1.0) / ((end_y - start_y) - 1.0);

    let start_x_u = (start_x as usize).min(dst_w);
    let start_y_u = (start_y as usize).min(dst_h);
    let end_x_u = (end_x as usize).min(dst_w);
    let end_y_u = (end_y as usize).min(dst_h);

    // Winning class per cell, computed once up front.
    let argmax = seg.map_axis(Axis(2), |r| {
        r.argmax().expect("argmax over a non-empty class axis")
    });
    let get_value_at_nearest = |x: f32, y: f32| -> usize {
        let x = x.round() as usize;
        let y = y.round() as usize;
        argmax
            .get([y.min(seg_height - 1), x.min(seg_width - 1)])
            .copied()
            .unwrap_or(0)
    };

    for y in start_y_u..end_y_u {
        for x in start_x_u..end_x_u {
            let seg_x = (x as f32 - start_x) * scale_x;
            let seg_y = (y as f32 - start_y) * scale_y;
            let label = get_value_at_nearest(seg_x, seg_y);
            if label == seg_classes - 1 {
                continue;
            }

            let color = self.colors[label % self.colors.len()];
            let alpha = if opacity == 1.0 {
                color[3] as u16
            } else {
                (color[3] as f32 * opacity).round() as u16
            };

            // u16 is wide enough: color * alpha + dst * (255 - alpha) <= 255 * 255.
            let dst_index = (y * dst_rs) + (x * dst_c);
            for c in 0..3 {
                dst_slice[dst_index + c] = ((color[c] as u16 * alpha
                    + dst_slice[dst_index + c] as u16 * (255 - alpha))
                    / 255) as u8;
            }
        }
    }
    Ok(())
}
/// Alpha-blends a single-channel instance mask onto the destination image.
///
/// Every destination pixel inside the mask's normalized
/// `[xmin, xmax] x [ymin, ymax]` rectangle is mapped to a mask cell (the scaled
/// coordinate is truncated). Cells with a value below 127, and coordinates that
/// fall outside the mask, are skipped; all other pixels are blended with the
/// palette colour selected by `class` (taken modulo the palette length).
///
/// # Parameters
/// * `dst_w`, `dst_h` - destination size in pixels.
/// * `dst_rs` - distance in bytes between the starts of two destination rows.
/// * `dst_c` - distance in bytes between two horizontally adjacent pixels.
/// * `dst_slice` - destination bytes; only the first three bytes of a pixel are written.
/// * `segmentation` - mask of shape `[h, w, 1]` plus its normalized placement rectangle.
/// * `class` - class index used to pick the colour.
/// * `opacity` - extra multiplier applied to the palette alpha; clamped to `[0, 1]`.
///
/// # Returns
/// Always `Ok(())`.
///
/// # Panics
/// Panics if `dst_slice` is too short for a pixel addressed through
/// `dst_rs`/`dst_c`. In debug builds it also asserts that the mask has exactly
/// one channel.
#[allow(clippy::too_many_arguments)]
pub(super) fn render_yolo_segmentation(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    segmentation: &Segmentation,
    class: usize,
    opacity: f32,
) -> Result<()> {
    let seg = &segmentation.segmentation;
    let [seg_height, seg_width, classes] = *seg.shape() else {
        unreachable!("Array3 did not have [usize;3] as shape");
    };
    debug_assert_eq!(classes, 1);

    // An opacity above 1 would push `alpha` past 255 and make `255 - alpha`
    // underflow in the blend below.
    let opacity = opacity.clamp(0.0, 1.0);

    // Placement rectangle in destination pixels (kept as floats for scaling).
    let start_y = (dst_h as f32 * segmentation.ymin).round();
    let end_y = (dst_h as f32 * segmentation.ymax).round();
    let start_x = (dst_w as f32 * segmentation.xmin).round();
    let end_x = (dst_w as f32 * segmentation.xmax).round();

    // Maps the first/last destination pixel of the rectangle onto the
    // first/last mask cell. Degenerate rectangles give a non-finite scale,
    // which the casts below resolve to cell 0 or an out-of-range lookup.
    let scale_x = (seg_width as f32 - 1.0) / ((end_x - start_x) - 1.0);
    let scale_y = (seg_height as f32 - 1.0) / ((end_y - start_y) - 1.0);

    let start_x_u = (start_x as usize).min(dst_w);
    let start_y_u = (start_y as usize).min(dst_h);
    let end_x_u = (end_x as usize).min(dst_w);
    let end_y_u = (end_y as usize).min(dst_h);

    for y in start_y_u..end_y_u {
        for x in start_x_u..end_x_u {
            let seg_x = ((x as f32 - start_x) * scale_x) as usize;
            let seg_y = ((y as f32 - start_y) * scale_y) as usize;
            // Out-of-range lookups count as "not part of the mask".
            let val = *seg.get([seg_y, seg_x, 0]).unwrap_or(&0);
            if val < 127 {
                continue;
            }

            let color = self.colors[class % self.colors.len()];
            let alpha = if opacity == 1.0 {
                color[3] as u16
            } else {
                (color[3] as f32 * opacity).round() as u16
            };

            // u16 is wide enough: color * alpha + dst * (255 - alpha) <= 255 * 255.
            let dst_index = (y * dst_rs) + (x * dst_c);
            for c in 0..3 {
                dst_slice[dst_index + c] = ((color[c] as u16 * alpha
                    + dst_slice[dst_index + c] as u16 * (255 - alpha))
                    / 255) as u8;
            }
        }
    }
    Ok(())
}
/// Draws the outline of every detection box onto the destination image.
///
/// Each box is converted to canonical form, clamped to the unit square and
/// mapped to pixel coordinates. The outline is painted as a frame of up to
/// `LINE_THICKNESS` pixels around that rectangle using the opaque RGB part of
/// the palette colour chosen by `color_mode`.
///
/// # Parameters
/// * `dst_w`, `dst_h` - destination size in pixels.
/// * `dst_rs` - distance in bytes between the starts of two destination rows.
/// * `dst_c` - distance in bytes between two horizontally adjacent pixels.
/// * `dst_slice` - destination bytes; three bytes are written per painted pixel.
/// * `detect` - detections whose boxes are drawn, in order.
/// * `color_mode` - maps (detection index, label) to a palette index.
///
/// # Returns
/// Always `Ok(())`.
#[allow(clippy::too_many_arguments)]
pub(super) fn render_box(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    detect: &[DetectBox],
    color_mode: crate::ColorMode,
) -> Result<()> {
    use edgefirst_decoder::BoundingBox;

    const LINE_THICKNESS: usize = 3;

    for (idx, d) in detect.iter().enumerate() {
        let palette_slot = color_mode.index(idx, d.label);
        let [red, green, blue, _] = self.colors[palette_slot % self.colors.len()];
        let rgb = [red, green, blue];

        let canonical = d.bbox.to_canonical();
        let clamped = BoundingBox {
            xmin: canonical.xmin.clamp(0.0, 1.0),
            ymin: canonical.ymin.clamp(0.0, 1.0),
            xmax: canonical.xmax.clamp(0.0, 1.0),
            ymax: canonical.ymax.clamp(0.0, 1.0),
        };

        // Inner edge of the frame, in pixels.
        let in_left = ((dst_w - 1) as f32 * clamped.xmin - 0.5).round() as usize;
        let in_top = ((dst_h - 1) as f32 * clamped.ymin - 0.5).round() as usize;
        let in_right = ((dst_w - 1) as f32 * clamped.xmax + 0.5).round() as usize;
        let in_bottom = ((dst_h - 1) as f32 * clamped.ymax + 0.5).round() as usize;

        // Outer edge of the frame, kept inside the image.
        let out_left = in_left.saturating_sub(LINE_THICKNESS);
        let out_top = in_top.saturating_sub(LINE_THICKNESS);
        let out_right = (in_right + LINE_THICKNESS).min(dst_w);
        let out_bottom = (in_bottom + LINE_THICKNESS).min(dst_h);

        // Writes the box colour into the first three bytes of pixel (x, y).
        let mut put = |x: usize, y: usize| {
            let offset = (y * dst_rs) + (x * dst_c);
            dst_slice[offset..(offset + 3)].copy_from_slice(&rgb);
        };

        // Top band.
        for y in out_top + 1..=in_top {
            for x in out_left + 1..out_right {
                put(x, y);
            }
        }
        // Left and right bands.
        for y in in_top..in_bottom {
            for x in out_left + 1..=in_left {
                put(x, y);
            }
            for x in in_right..out_right {
                put(x, y);
            }
        }
        // Bottom band.
        for y in in_bottom..out_bottom {
            for x in out_left + 1..out_right {
                put(x, y);
            }
        }
    }
    Ok(())
}
/// Builds one proto-resolution mask per detection from prototype data.
///
/// For every detection the mask coefficients of the matching row are combined
/// with the prototype vectors inside the detection's bounding box; each result
/// is passed through a sigmoid and stored as a `u8` in `0..=255`. The returned
/// `Segmentation` carries the mask together with the normalized rectangle it
/// covers, mapped back through `letterbox` when one is given.
///
/// # Parameters
/// * `detect` - detections; row `i` of the coefficient tensor belongs to `detect[i]`.
/// * `proto_data` - prototype tensor (rank 3, `[h, w, num_protos]`) and coefficient
///   tensor (rank 2, `[N, num_protos]`).
/// * `letterbox` - optional `[x0, y0, x1, y1]` region used to rescale the output
///   rectangle.
///
/// # Errors
/// * `InvalidShape` for wrong tensor ranks, mismatching `num_protos`, unsupported
///   dtypes, or I8 prototypes without quantization metadata.
/// * `Internal` when the number of coefficient rows differs from `detect.len()`.
/// * Any error returned by mapping the tensors or by the I8 kernel.
pub fn materialize_segmentations(
&self,
detect: &[crate::DetectBox],
proto_data: &crate::ProtoData,
letterbox: Option<[f32; 4]>,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
use edgefirst_tensor::{DType, TensorMapTrait, TensorTrait};
if detect.is_empty() {
return Ok(Vec::new());
}
// --- Shape validation -------------------------------------------------
let proto_shape = proto_data.protos.shape();
if proto_shape.len() != 3 {
return Err(crate::Error::InvalidShape(format!(
"protos tensor must be rank-3, got {proto_shape:?}"
)));
}
let (proto_h, proto_w, num_protos) = (proto_shape[0], proto_shape[1], proto_shape[2]);
let coeff_shape = proto_data.mask_coefficients.shape();
if coeff_shape.len() != 2 || coeff_shape[1] != num_protos {
return Err(crate::Error::InvalidShape(format!(
"mask_coefficients shape {coeff_shape:?} incompatible with protos \
{proto_shape:?} (expected [N, {num_protos}])"
)));
}
if coeff_shape[0] == 0 {
return Ok(Vec::new());
}
// The per-detection closure slices row `i` for detection `i`, so the row
// count has to match exactly.
if coeff_shape[0] != detect.len() {
return Err(crate::Error::Internal(format!(
"mask_coefficients rows {} != detection count {}",
coeff_shape[0],
detect.len()
)));
}
// Offset and inverse extent used to map output coordinates back through
// the letterbox; a non-positive extent falls back to a factor of 1.
let (lx0, inv_lw, ly0, inv_lh) = match letterbox {
Some([lx0, ly0, lx1, ly1]) => {
let lw = lx1 - lx0;
let lh = ly1 - ly0;
(
lx0,
if lw > 0.0 { 1.0 / lw } else { 1.0 },
ly0,
if lh > 0.0 { 1.0 / lh } else { 1.0 },
)
}
None => (0.0_f32, 1.0_f32, 0.0_f32, 1.0_f32),
};
// Coefficients are copied into an owned f32 buffer once so every kernel
// below can work on plain `&[f32]` regardless of the stored dtype.
let coeff_f32_storage: Vec<f32>;
let coeff_f32_slice: &[f32] = match proto_data.mask_coefficients.dtype() {
DType::F32 => {
let t = proto_data
.mask_coefficients
.as_f32()
.expect("dtype matched F32");
let m = t.map()?;
coeff_f32_storage = m.as_slice().to_vec();
&coeff_f32_storage[..]
}
DType::F16 => {
let t = proto_data
.mask_coefficients
.as_f16()
.expect("dtype matched F16");
let m = t.map()?;
coeff_f32_storage = m.as_slice().iter().map(|v| v.to_f32()).collect();
&coeff_f32_storage[..]
}
other => {
return Err(crate::Error::InvalidShape(format!(
"mask_coefficients dtype {other:?} not supported; expected F32 or F16"
)));
}
};
// Computes the mask and output rectangle for detection `i`.
let per_detection = |i: usize,
det: &crate::DetectBox|
-> crate::Result<edgefirst_decoder::Segmentation> {
let coeff = &coeff_f32_slice[i * num_protos..(i + 1) * num_protos];
let bbox = det.bbox.to_canonical();
let xmin = bbox.xmin.clamp(0.0, 1.0);
let ymin = bbox.ymin.clamp(0.0, 1.0);
let xmax = bbox.xmax.clamp(0.0, 1.0);
let ymax = bbox.ymax.clamp(0.0, 1.0);
// Region of interest in prototype cells: the start is truncated and kept
// inside the grid, the end is rounded up, and the size is at least 1x1.
let x0 = ((xmin * proto_w as f32) as usize).min(proto_w.saturating_sub(1));
let y0 = ((ymin * proto_h as f32) as usize).min(proto_h.saturating_sub(1));
let x1 = ((xmax * proto_w as f32).ceil() as usize).min(proto_w);
let y1 = ((ymax * proto_h as f32).ceil() as usize).min(proto_h);
let roi_w = x1.saturating_sub(x0).max(1);
let roi_h = y1.saturating_sub(y0).max(1);
// Dispatch on the prototype dtype; every kernel returns a [roi_h, roi_w, 1] mask.
let mask = match proto_data.protos.dtype() {
DType::I8 => {
let t = proto_data.protos.as_i8().expect("dtype matched I8");
let quant = t.quantization().ok_or_else(|| {
crate::Error::InvalidShape("I8 protos require quantization metadata".into())
})?;
let m = t.map()?;
fused_dequant_dot_sigmoid_i8_slice(
m.as_slice(),
coeff,
quant,
proto_h,
proto_w,
y0,
x0,
roi_h,
roi_w,
num_protos,
)?
}
DType::F32 => {
let t = proto_data.protos.as_f32().expect("dtype matched F32");
let m = t.map()?;
fused_dot_sigmoid_f32_slice(
m.as_slice(),
coeff,
proto_h,
proto_w,
y0,
x0,
roi_h,
roi_w,
num_protos,
)
}
DType::F16 => {
let t = proto_data.protos.as_f16().expect("dtype matched F16");
let m = t.map()?;
fused_dot_sigmoid_f16_slice(
m.as_slice(),
coeff,
proto_h,
proto_w,
y0,
x0,
roi_h,
roi_w,
num_protos,
)
}
other => {
return Err(crate::Error::InvalidShape(format!(
"proto tensor dtype {other:?} not supported"
)));
}
};
// Normalized rectangle of the region of interest, mapped back through the
// letterbox. It is derived from x0/x1 and y0/y1, so a box that collapsed to
// zero cells reports a zero-size rectangle even though the mask is 1x1.
let seg_xmin = ((x0 as f32 / proto_w as f32) - lx0) * inv_lw;
let seg_ymin = ((y0 as f32 / proto_h as f32) - ly0) * inv_lh;
let seg_xmax = ((x1 as f32 / proto_w as f32) - lx0) * inv_lw;
let seg_ymax = ((y1 as f32 / proto_h as f32) - ly0) * inv_lh;
Ok(edgefirst_decoder::Segmentation {
xmin: seg_xmin.clamp(0.0, 1.0),
ymin: seg_ymin.clamp(0.0, 1.0),
xmax: seg_xmax.clamp(0.0, 1.0),
ymax: seg_ymax.clamp(0.0, 1.0),
segmentation: mask,
})
};
// Collecting into `Result<Vec<_>>` stops at the first failing detection.
detect
.iter()
.enumerate()
.map(|(i, det)| per_detection(i, det))
.collect()
}
pub fn materialize_scaled_segmentations(
&self,
detect: &[crate::DetectBox],
proto_data: &crate::ProtoData,
letterbox: Option<[f32; 4]>,
width: u32,
height: u32,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
use edgefirst_tensor::{DType, TensorMapTrait, TensorTrait};
if detect.is_empty() {
return Ok(Vec::new());
}
if width == 0 || height == 0 {
return Err(crate::Error::InvalidShape(
"Scaled mask width/height must be positive".into(),
));
}
let proto_shape = proto_data.protos.shape();
if proto_shape.len() != 3 {
return Err(crate::Error::InvalidShape(format!(
"protos tensor must be rank-3, got {proto_shape:?}"
)));
}
let (proto_h, proto_w, num_protos) = (proto_shape[0], proto_shape[1], proto_shape[2]);
let coeff_shape = proto_data.mask_coefficients.shape();
if coeff_shape.len() != 2 || coeff_shape[1] != num_protos {
return Err(crate::Error::InvalidShape(format!(
"mask_coefficients shape {coeff_shape:?} incompatible with protos \
{proto_shape:?}"
)));
}
if coeff_shape[0] == 0 {
return Ok(Vec::new());
}
let coeff_f32: Vec<f32> = match proto_data.mask_coefficients.dtype() {
DType::F32 => {
let t = proto_data.mask_coefficients.as_f32().expect("F32");
let m = t.map()?;
m.as_slice().to_vec()
}
DType::F16 => {
let t = proto_data.mask_coefficients.as_f16().expect("F16");
let m = t.map()?;
m.as_slice().iter().map(|v| v.to_f32()).collect()
}
other => {
return Err(crate::Error::InvalidShape(format!(
"mask_coefficients dtype {other:?} not supported"
)));
}
};
match proto_data.protos.dtype() {
DType::F32 => {
let t = proto_data.protos.as_f32().expect("F32");
let m = t.map()?;
scaled_segmentations_f32_slice(
detect,
&coeff_f32,
m.as_slice(),
proto_h,
proto_w,
num_protos,
letterbox,
width,
height,
)
}
DType::F16 => {
let t = proto_data.protos.as_f16().expect("F16");
let m = t.map()?;
scaled_segmentations_f16_slice(
detect,
&coeff_f32,
m.as_slice(),
proto_h,
proto_w,
num_protos,
letterbox,
width,
height,
)
}
DType::I8 => {
let t = proto_data.protos.as_i8().expect("I8");
let m = t.map()?;
let quant = t.quantization().ok_or_else(|| {
crate::Error::InvalidShape("I8 protos require quantization metadata".into())
})?;
scaled_segmentations_i8_slice(
detect,
&coeff_f32,
m.as_slice(),
proto_h,
proto_w,
num_protos,
quant,
letterbox,
width,
height,
)
}
other => Err(crate::Error::InvalidShape(format!(
"proto tensor dtype {other:?} not supported"
))),
}
}
}
/// Computes a sigmoid mask over a region of quantized (`i8`) prototypes.
///
/// Dequantization is folded into the coefficients: each coefficient is
/// pre-multiplied by its scale, and the zero-point contribution is subtracted
/// once per pixel. The result for every pixel of the
/// `roi_h x roi_w` region starting at (`x0`, `y0`) is
/// `sigmoid(sum_k coeff[k] * scale_k * (proto[k] - zero_point_k))`, stored as a
/// rounded `u8` in `0..=255`.
///
/// `protos` is indexed as `[y][x][k]` with `proto_w * num_protos` values per
/// row; `_proto_h` is not used.
///
/// # Errors
/// Returns `NotSupported` for per-channel quantization on an axis other than 2.
///
/// # Panics
/// Panics if `protos`, `coeff` or the per-channel tables are shorter than the
/// addressed range.
#[allow(clippy::too_many_arguments)]
fn fused_dequant_dot_sigmoid_i8_slice(
    protos: &[i8],
    coeff: &[f32],
    quant: &edgefirst_tensor::Quantization,
    _proto_h: usize,
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> crate::Result<ndarray::Array3<u8>> {
    use edgefirst_tensor::QuantMode;

    // Up to this many channels the scaled coefficients live on the stack.
    const STACK_CAPACITY: usize = 64;

    let row_stride = proto_w * num_protos;

    let mut inline_buf = [0.0_f32; STACK_CAPACITY];
    let mut spill_buf: Vec<f32> = Vec::new();
    let weights: &mut [f32] = if num_protos > STACK_CAPACITY {
        spill_buf.resize(num_protos, 0.0_f32);
        spill_buf.as_mut_slice()
    } else {
        &mut inline_buf[..num_protos]
    };

    // Fill `weights` with scale-adjusted coefficients and derive the constant
    // that accounts for the zero point(s).
    let zero_point_term: f32 = match quant.mode() {
        QuantMode::PerTensorSymmetric { scale } => {
            for (k, slot) in weights.iter_mut().enumerate() {
                *slot = coeff[k] * scale;
            }
            0.0
        }
        QuantMode::PerTensor { scale, zero_point } => {
            for (k, slot) in weights.iter_mut().enumerate() {
                *slot = coeff[k] * scale;
            }
            zero_point as f32 * weights.iter().sum::<f32>()
        }
        QuantMode::PerChannelSymmetric { scales, axis } => {
            if axis != 2 {
                return Err(crate::Error::NotSupported(format!(
                    "per-channel quantization on axis {axis} not supported \
                     (only channel axis 2 is implemented on this kernel)"
                )));
            }
            for (k, slot) in weights.iter_mut().enumerate() {
                *slot = coeff[k] * scales[k];
            }
            0.0
        }
        QuantMode::PerChannel {
            scales,
            zero_points,
            axis,
        } => {
            if axis != 2 {
                return Err(crate::Error::NotSupported(format!(
                    "per-channel quantization on axis {axis} not supported \
                     (only channel axis 2 is implemented on this kernel)"
                )));
            }
            for (k, slot) in weights.iter_mut().enumerate() {
                *slot = coeff[k] * scales[k];
            }
            (0..num_protos)
                .map(|k| weights[k] * zero_points[k] as f32)
                .sum()
        }
    };
    let weights: &[f32] = weights;

    let mut mask = ndarray::Array3::<u8>::zeros((roi_h, roi_w, 1));
    for dy in 0..roi_h {
        for dx in 0..roi_w {
            let start = (y0 + dy) * row_stride + (x0 + dx) * num_protos;
            let pixel = &protos[start..start + num_protos];

            // Four-wide partial sums, then the leftover channels, in index order.
            let weight_quads = weights.chunks_exact(4);
            let pixel_quads = pixel.chunks_exact(4);
            let weight_rest = weight_quads.remainder();
            let pixel_rest = pixel_quads.remainder();

            let mut acc = 0.0_f32;
            for (w, p) in weight_quads.zip(pixel_quads) {
                acc += w[0] * p[0] as f32
                    + w[1] * p[1] as f32
                    + w[2] * p[2] as f32
                    + w[3] * p[3] as f32;
            }
            for (w, p) in weight_rest.iter().zip(pixel_rest) {
                acc += *w * *p as f32;
            }
            acc -= zero_point_term;

            // `+ 0.5` rounds to nearest before the truncating cast.
            mask[[dy, dx, 0]] = (fast_sigmoid(acc) * 255.0 + 0.5) as u8;
        }
    }
    Ok(mask)
}
/// Computes a sigmoid mask over a region of `f32` prototypes.
///
/// For every pixel of the `roi_h x roi_w` region starting at (`x0`, `y0`) the
/// dot product of `coeff` with that pixel's `num_protos` prototype values is
/// passed through `fast_sigmoid` and stored as a rounded `u8` in `0..=255`.
///
/// `protos` is indexed as `[y][x][k]` with `proto_w * num_protos` values per
/// row; `_proto_h` is not used.
///
/// # Panics
/// Panics if `protos` or `coeff` is shorter than the addressed range.
#[allow(clippy::too_many_arguments)]
fn fused_dot_sigmoid_f32_slice(
    protos: &[f32],
    coeff: &[f32],
    _proto_h: usize,
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    let row_stride = proto_w * num_protos;
    let mut mask = ndarray::Array3::<u8>::zeros((roi_h, roi_w, 1));

    for dy in 0..roi_h {
        for dx in 0..roi_w {
            let start = (y0 + dy) * row_stride + (x0 + dx) * num_protos;
            let weights = &coeff[..num_protos];
            let pixel = &protos[start..start + num_protos];

            // Four-wide partial sums, then the leftover channels, in index order.
            let weight_quads = weights.chunks_exact(4);
            let pixel_quads = pixel.chunks_exact(4);
            let weight_rest = weight_quads.remainder();
            let pixel_rest = pixel_quads.remainder();

            let mut acc = 0.0_f32;
            for (w, p) in weight_quads.zip(pixel_quads) {
                acc += w[0] * p[0] + w[1] * p[1] + w[2] * p[2] + w[3] * p[3];
            }
            for (w, p) in weight_rest.iter().zip(pixel_rest) {
                acc += *w * *p;
            }

            // `+ 0.5` rounds to nearest before the truncating cast.
            mask[[dy, dx, 0]] = (fast_sigmoid(acc) * 255.0 + 0.5) as u8;
        }
    }
    mask
}
/// Computes a sigmoid mask over a region of half-precision prototypes.
///
/// This is a compile-time dispatcher: when the crate is built for `x86_64`
/// with the `f16c` and `fma` target features enabled it forwards to the SIMD
/// kernel, otherwise to the scalar kernel. Exactly one of the two blocks below
/// survives `cfg` stripping and becomes the function's tail expression.
///
/// Arguments are forwarded unchanged; `proto_h` is only passed to the SIMD
/// kernel and is discarded on the scalar path.
#[allow(clippy::too_many_arguments)]
fn fused_dot_sigmoid_f16_slice(
protos: &[half::f16],
coeff: &[f32],
proto_h: usize,
proto_w: usize,
y0: usize,
x0: usize,
roi_h: usize,
roi_w: usize,
num_protos: usize,
) -> ndarray::Array3<u8> {
#[cfg(all(
target_arch = "x86_64",
target_feature = "f16c",
target_feature = "fma"
))]
{
// SAFETY: this block is only compiled when `f16c` and `fma` are statically
// enabled for the target, so those instructions are available.
// NOTE(review): the callee additionally enables `avx` and reads `protos` and
// `coeff` through raw pointers — confirm `avx` is implied by the enabled
// features and that callers always pass slices covering the whole region.
unsafe {
fused_dot_sigmoid_f16_slice_f16c(
protos, coeff, proto_h, proto_w, y0, x0, roi_h, roi_w, num_protos,
)
}
}
#[cfg(not(all(
target_arch = "x86_64",
target_feature = "f16c",
target_feature = "fma"
)))]
{
// The scalar kernel has no use for the grid height.
let _ = proto_h;
fused_dot_sigmoid_f16_slice_scalar(protos, coeff, proto_w, y0, x0, roi_h, roi_w, num_protos)
}
}
/// Portable kernel: sigmoid mask over a region of half-precision prototypes.
///
/// For every pixel of the `roi_h x roi_w` region starting at (`x0`, `y0`) each
/// prototype value is widened to `f32`, the dot product with `coeff` is passed
/// through `fast_sigmoid`, and the result is stored as a rounded `u8` in
/// `0..=255`.
///
/// `protos` is indexed as `[y][x][k]` with `proto_w * num_protos` values per row.
///
/// # Panics
/// Panics if `protos` or `coeff` is shorter than the addressed range.
#[allow(clippy::too_many_arguments, dead_code)]
fn fused_dot_sigmoid_f16_slice_scalar(
    protos: &[half::f16],
    coeff: &[f32],
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    let row_stride = proto_w * num_protos;
    let mut mask = ndarray::Array3::<u8>::zeros((roi_h, roi_w, 1));

    for dy in 0..roi_h {
        for dx in 0..roi_w {
            let start = (y0 + dy) * row_stride + (x0 + dx) * num_protos;
            let weights = &coeff[..num_protos];
            let pixel = &protos[start..start + num_protos];

            // Four-wide partial sums, then the leftover channels, in index order.
            let weight_quads = weights.chunks_exact(4);
            let pixel_quads = pixel.chunks_exact(4);
            let weight_rest = weight_quads.remainder();
            let pixel_rest = pixel_quads.remainder();

            let mut acc = 0.0_f32;
            for (w, p) in weight_quads.zip(pixel_quads) {
                acc += w[0] * p[0].to_f32()
                    + w[1] * p[1].to_f32()
                    + w[2] * p[2].to_f32()
                    + w[3] * p[3].to_f32();
            }
            for (w, p) in weight_rest.iter().zip(pixel_rest) {
                acc += *w * p.to_f32();
            }

            // `+ 0.5` rounds to nearest before the truncating cast.
            mask[[dy, dx, 0]] = (fast_sigmoid(acc) * 255.0 + 0.5) as u8;
        }
    }
    mask
}
/// SIMD kernel: sigmoid mask over a region of half-precision prototypes.
///
/// Processes eight channels per step: eight `f16` values are loaded, widened
/// to `f32` and accumulated with a fused multiply-add against the matching
/// coefficients. The eight lanes are then summed, any leftover channels are
/// added one by one, and the total is passed through `fast_sigmoid` and stored
/// as a rounded `u8` in `0..=255`.
///
/// `protos` is indexed as `[y][x][k]` with `proto_w * num_protos` values per
/// row; `_proto_h` is not used.
///
/// # Safety
/// The executing CPU must support the `f16c`, `fma` and `avx` features this
/// function is compiled with. Memory accesses are bounds-checked.
///
/// # Panics
/// Panics if `protos` or `coeff` is shorter than the addressed range.
#[cfg(all(
    target_arch = "x86_64",
    target_feature = "f16c",
    target_feature = "fma"
))]
#[allow(clippy::too_many_arguments)]
#[target_feature(enable = "f16c,fma,avx")]
unsafe fn fused_dot_sigmoid_f16_slice_f16c(
    protos: &[half::f16],
    coeff: &[f32],
    _proto_h: usize,
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    use core::arch::x86_64::{
        __m128i, _mm256_castps256_ps128, _mm256_cvtph_ps, _mm256_extractf128_ps,
        _mm256_fmadd_ps, _mm256_loadu_ps, _mm256_setzero_ps, _mm_add_ps, _mm_cvtss_f32,
        _mm_hadd_ps, _mm_loadu_si128,
    };

    let stride_y = proto_w * num_protos;
    let chunks8 = num_protos / 8;
    let simd_len = chunks8 * 8;
    let mut mask = ndarray::Array3::<u8>::zeros((roi_h, roi_w, 1));

    for y in 0..roi_h {
        for x in 0..roi_w {
            let base = (y0 + y) * stride_y + (x0 + x) * num_protos;
            // Bounds-checked views of exactly `num_protos` elements. Taking
            // them before the raw-pointer loads turns a too-short input into a
            // panic instead of an out-of-bounds read.
            let pixel = &protos[base..base + num_protos];
            let weights = &coeff[..num_protos];

            let mut acc_v = _mm256_setzero_ps();
            for chunk in 0..chunks8 {
                let k = chunk * 8;
                // SAFETY: k + 8 <= simd_len <= num_protos, and both `pixel` and
                // `weights` hold exactly `num_protos` elements, so each unaligned
                // load of 8 elements stays inside its slice.
                let raw = _mm_loadu_si128(pixel.as_ptr().add(k).cast::<__m128i>());
                let widened = _mm256_cvtph_ps(raw);
                let coeffs_v = _mm256_loadu_ps(weights.as_ptr().add(k));
                acc_v = _mm256_fmadd_ps(coeffs_v, widened, acc_v);
            }

            // Horizontal sum of the eight lanes.
            let lo = _mm256_castps256_ps128(acc_v);
            let hi = _mm256_extractf128_ps::<1>(acc_v);
            let sum4 = _mm_add_ps(lo, hi);
            let sum2 = _mm_hadd_ps(sum4, sum4);
            let sum1 = _mm_hadd_ps(sum2, sum2);
            let mut acc = _mm_cvtss_f32(sum1);

            // Channels that did not fill a whole group of eight.
            for k in simd_len..num_protos {
                acc += weights[k] * pixel[k].to_f32();
            }

            let sigmoid = fast_sigmoid(acc);
            // `+ 0.5` rounds to nearest before the truncating cast.
            mask[[y, x, 0]] = (sigmoid * 255.0 + 0.5) as u8;
        }
    }
    mask
}
/// Bilinearly samples a prototype grid at (`px`, `py`) and returns the dot
/// product of the interpolated channel vector with `coeff`.
///
/// Sample positions outside the grid are clamped to the nearest edge texel
/// (clamp-to-edge), on all four sides.
///
/// # Parameters
/// * `protos` - grid values indexed as `[y][x][k]`.
/// * `stride_y` - number of elements between two grid rows.
/// * `num_protos` - number of channels per grid cell.
/// * `coeff` - one weight per channel.
/// * `px`, `py` - sample position in grid cells; integer values hit cell centres.
/// * `proto_w`, `proto_h` - grid size in cells.
/// * `load_f32` - converts a stored element to `f32`.
///
/// # Panics
/// Panics if `protos` or `coeff` is shorter than the addressed range.
#[allow(clippy::too_many_arguments)]
#[inline(always)]
fn bilinear_dot_slice<P: Copy>(
    protos: &[P],
    stride_y: usize,
    num_protos: usize,
    coeff: &[f32],
    px: f32,
    py: f32,
    proto_w: usize,
    proto_h: usize,
    load_f32: impl Fn(&P) -> f32,
) -> f32 {
    let max_x = proto_w.saturating_sub(1);
    let max_y = proto_h.saturating_sub(1);

    // Clamp the coordinate itself before splitting it into cell index and
    // fraction. Clamping only the index would keep the fraction of the
    // unclamped coordinate and blend texel 0 with texel 1 for positions left
    // of / above the grid instead of replicating the edge texel.
    let px = px.clamp(0.0, max_x as f32);
    let py = py.clamp(0.0, max_y as f32);

    // The coordinates are non-negative here, so truncation equals floor.
    let x0 = (px as usize).min(max_x);
    let y0 = (py as usize).min(max_y);
    let x1 = (x0 + 1).min(max_x);
    let y1 = (y0 + 1).min(max_y);
    let fx = px - x0 as f32;
    let fy = py - y0 as f32;

    // Weights of the four neighbouring cells.
    let w00 = (1.0 - fx) * (1.0 - fy);
    let w10 = fx * (1.0 - fy);
    let w01 = (1.0 - fx) * fy;
    let w11 = fx * fy;

    // Start offsets of the four neighbouring channel vectors.
    let b00 = y0 * stride_y + x0 * num_protos;
    let b10 = y0 * stride_y + x1 * num_protos;
    let b01 = y1 * stride_y + x0 * num_protos;
    let b11 = y1 * stride_y + x1 * num_protos;

    let mut acc = 0.0_f32;
    for p in 0..num_protos {
        let v00 = load_f32(&protos[b00 + p]);
        let v10 = load_f32(&protos[b10 + p]);
        let v01 = load_f32(&protos[b01 + p]);
        let v11 = load_f32(&protos[b11 + p]);
        let val = w00 * v00 + w10 * v10 + w01 * v01 + w11 * v11;
        acc += coeff[p] * val;
    }
    acc
}
/// Output-resolution masks from `f32` prototypes.
///
/// Thin adapter around `scaled_run`: the prototype values are used as they
/// are and no extra scale is applied to the accumulated dot product.
#[allow(clippy::too_many_arguments)]
fn scaled_segmentations_f32_slice(
    detect: &[crate::DetectBox],
    coeff_all: &[f32],
    protos: &[f32],
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    // Values are already f32; the second argument is ignored.
    let passthrough = |value: &f32, _: f32| *value;
    let unit_scale = 1.0;
    scaled_run(
        detect,
        coeff_all,
        protos,
        proto_h,
        proto_w,
        num_protos,
        letterbox,
        width,
        height,
        unit_scale,
        passthrough,
    )
}
/// Output-resolution masks from half-precision prototypes.
///
/// Thin adapter around `scaled_run`: every prototype value is widened to
/// `f32` on load and no extra scale is applied to the accumulated dot product.
#[allow(clippy::too_many_arguments)]
fn scaled_segmentations_f16_slice(
    detect: &[crate::DetectBox],
    coeff_all: &[f32],
    protos: &[half::f16],
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    // Widen on load; the second argument is ignored.
    let widen = |value: &half::f16, _: f32| value.to_f32();
    let unit_scale = 1.0;
    scaled_run(
        detect,
        coeff_all,
        protos,
        proto_h,
        proto_w,
        num_protos,
        letterbox,
        width,
        height,
        unit_scale,
        widen,
    )
}
/// Output-resolution masks from quantized (`i8`) prototypes.
///
/// Adapter around `scaled_run` for per-tensor quantization: the zero point is
/// subtracted from every prototype value on load and the quantization scale is
/// applied once to the accumulated dot product.
///
/// # Errors
/// Returns `NotSupported` for per-channel quantization modes.
#[allow(clippy::too_many_arguments)]
fn scaled_segmentations_i8_slice(
    detect: &[crate::DetectBox],
    coeff_all: &[f32],
    protos: &[i8],
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    quant: &edgefirst_tensor::Quantization,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    use edgefirst_tensor::QuantMode;

    let (scale, zp) = match quant.mode() {
        QuantMode::PerChannel { axis, .. } | QuantMode::PerChannelSymmetric { axis, .. } => {
            return Err(crate::Error::NotSupported(format!(
                "per-channel quantization (axis={axis}) on scaled seg path \
                 not yet supported"
            )));
        }
        QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
        QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
    };

    // Remove the zero point on load; the scale is applied by `scaled_run`.
    let dequant_offset = move |raw: &i8, _: f32| *raw as f32 - zp;
    scaled_run(
        detect,
        coeff_all,
        protos,
        proto_h,
        proto_w,
        num_protos,
        letterbox,
        width,
        height,
        scale,
        dequant_offset,
    )
}
/// Shared driver that renders one binary mask per detection at output resolution.
///
/// For each detection the box is mapped from model space into output space
/// through `letterbox`, converted to a pixel rectangle of at least 1x1, and
/// every pixel of that rectangle is mapped back into prototype space. There
/// the prototypes are sampled bilinearly, combined with the detection's
/// coefficients, multiplied by `acc_scale`, passed through `fast_sigmoid` and
/// thresholded at 0.5 into `255` or `0`.
///
/// # Parameters
/// * `detect` - detections; detection `i` uses coefficients
///   `coeff_all[i * num_protos..(i + 1) * num_protos]`.
/// * `protos` - prototype values indexed as `[y][x][k]`.
/// * `letterbox` - optional `[x0, y0, x1, y1]`; extents are floored at `f32::EPSILON`.
/// * `width`, `height` - output size in pixels.
/// * `acc_scale` - factor applied to the dot product before the sigmoid.
/// * `load_f32` - converts a stored prototype element to `f32`; its second
///   argument is always passed as `0.0`.
///
/// # Returns
/// `Ok` with one `Segmentation` per detection, in input order.
///
/// # Panics
/// Panics if `coeff_all` or `protos` is shorter than the addressed range.
#[allow(clippy::too_many_arguments)]
fn scaled_run<P: Copy>(
    detect: &[crate::DetectBox],
    coeff_all: &[f32],
    protos: &[P],
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
    acc_scale: f32,
    load_f32: impl Fn(&P, f32) -> f32 + Copy,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    // Letterbox origin and extent in normalized model coordinates.
    let (lx0, ly0, lw, lh) = if let Some([left, top, right, bottom]) = letterbox {
        (
            left,
            top,
            (right - left).max(f32::EPSILON),
            (bottom - top).max(f32::EPSILON),
        )
    } else {
        (0.0_f32, 0.0_f32, 1.0_f32, 1.0_f32)
    };

    let out_w = width as usize;
    let out_h = height as usize;
    let stride_y = proto_w * num_protos;

    let mut results = Vec::with_capacity(detect.len());
    for (i, det) in detect.iter().enumerate() {
        let coeff = &coeff_all[i * num_protos..(i + 1) * num_protos];

        // Box in normalized output coordinates.
        let model_box = det.bbox.to_canonical();
        let xmin = ((model_box.xmin - lx0) / lw).clamp(0.0, 1.0);
        let ymin = ((model_box.ymin - ly0) / lh).clamp(0.0, 1.0);
        let xmax = ((model_box.xmax - lx0) / lw).clamp(0.0, 1.0);
        let ymax = ((model_box.ymax - ly0) / lh).clamp(0.0, 1.0);

        // Box in output pixels; the tile is never smaller than 1x1.
        let px0 = (xmin * out_w as f32).round() as usize;
        let py0 = (ymin * out_h as f32).round() as usize;
        let px1 = ((xmax * out_w as f32).round() as usize).min(out_w);
        let py1 = ((ymax * out_h as f32).round() as usize).min(out_h);
        let tile_w = px1.saturating_sub(px0).max(1);
        let tile_h = py1.saturating_sub(py0).max(1);

        let mut tile = ndarray::Array3::<u8>::zeros((tile_h, tile_w, 1));
        for row in 0..tile_h {
            // Pixel centre -> normalized model coordinate -> prototype cell.
            let out_y = (py0 + row) as f32;
            let model_y_norm = ly0 + (out_y + 0.5) / out_h as f32 * lh;
            let sample_y = model_y_norm * proto_h as f32 - 0.5;

            for col in 0..tile_w {
                let out_x = (px0 + col) as f32;
                let model_x_norm = lx0 + (out_x + 0.5) / out_w as f32 * lw;
                let sample_x = model_x_norm * proto_w as f32 - 0.5;

                let dot = bilinear_dot_slice(
                    protos,
                    stride_y,
                    num_protos,
                    coeff,
                    sample_x,
                    sample_y,
                    proto_w,
                    proto_h,
                    |p: &P| load_f32(p, 0.0),
                );
                let inside = fast_sigmoid(acc_scale * dot) > 0.5;
                tile[[row, col, 0]] = if inside { 255 } else { 0 };
            }
        }

        results.push(edgefirst_decoder::Segmentation {
            xmin,
            ymin,
            xmax,
            ymax,
            segmentation: tile,
        });
    }
    Ok(results)
}
/// Fast approximation of the logistic function `1 / (1 + e^-x)`.
///
/// Inputs at or beyond `+16` / `-16` return exactly `1.0` / `0.0`. In between,
/// `e^-x` is approximated by writing `-x * log2(e)`, scaled by `2^23` and
/// offset by the exponent bias, directly into the bit pattern of an `f32`.
fn fast_sigmoid(x: f32) -> f32 {
    // 2^23: shifts a value into the exponent field of an f32.
    const MANTISSA_SCALE: f32 = (1u32 << 23) as f32;
    // Converts a natural exponent into a scaled base-2 exponent.
    const LOG2E_SCALED: f32 = MANTISSA_SCALE * std::f32::consts::LOG2_E;
    // Bit pattern of 1.0f32 (biased exponent 127, zero mantissa).
    const EXPONENT_BIAS: i32 = 127 << 23;

    if x >= 16.0 {
        1.0
    } else if x <= -16.0 {
        0.0
    } else {
        let raw_bits = (LOG2E_SCALED * -x) as i32 + EXPONENT_BIAS;
        let exp_neg_x = f32::from_bits(raw_bits as u32);
        1.0 / (1.0 + exp_neg_x)
    }
}