use super::CPUProcessor;
use crate::Result;
use edgefirst_decoder::{DetectBox, Segmentation};
use ndarray::Axis;
use rayon::prelude::*;
impl CPUProcessor {
#[allow(clippy::too_many_arguments)]
/// Alpha-blends a ModelPack multi-class segmentation map onto an
/// interleaved 8-bit image.
///
/// The normalized ROI of `segmentation` is mapped to destination pixels;
/// each destination pixel samples the class map with nearest-neighbor
/// rounding, takes the argmax class, and blends that class's color. The
/// highest class index is never rendered (presumably background — confirm
/// against the model's class layout).
///
/// * `dst_w`, `dst_h` - destination size in pixels
/// * `dst_rs` - destination row stride in bytes
/// * `dst_c` - bytes per destination pixel; only the first 3 channels of a
///   pixel are written
/// * `dst_slice` - destination pixel buffer
/// * `segmentation` - HxWxC per-class score map plus its normalized ROI
/// * `opacity` - multiplier applied to each color's alpha channel
pub(super) fn render_modelpack_segmentation(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    segmentation: &Segmentation,
    opacity: f32,
) -> Result<()> {
    use ndarray_stats::QuantileExt;
    let seg = &segmentation.segmentation;
    let [seg_height, seg_width, seg_classes] = *seg.shape() else {
        unreachable!("Array3 did not have [usize; 3] as shape");
    };
    // ROI corners in destination pixel space, kept as f32 for the scale math.
    let start_y = (dst_h as f32 * segmentation.ymin).round();
    let end_y = (dst_h as f32 * segmentation.ymax).round();
    let start_x = (dst_w as f32 * segmentation.xmin).round();
    let end_x = (dst_w as f32 * segmentation.xmax).round();
    // Endpoint-to-endpoint mapping: the last ROI pixel samples the last
    // segmentation-map cell.
    let scale_x = (seg_width as f32 - 1.0) / ((end_x - start_x) - 1.0);
    let scale_y = (seg_height as f32 - 1.0) / ((end_y - start_y) - 1.0);
    // Integer loop bounds, clamped to the destination image.
    let start_x_u = (start_x as usize).min(dst_w);
    let start_y_u = (start_y as usize).min(dst_h);
    let end_x_u = (end_x as usize).min(dst_w);
    let end_y_u = (end_y as usize).min(dst_h);
    // Winning class per segmentation-map cell, computed once up front.
    let argmax = seg.map_axis(Axis(2), |r| r.argmax().unwrap());
    // Nearest-neighbor lookup, clamped to the map edges; defaults to class 0
    // if the index is somehow out of range.
    let get_value_at_nearest = |x: f32, y: f32| -> usize {
        let x = x.round() as usize;
        let y = y.round() as usize;
        argmax
            .get([y.min(seg_height - 1), x.min(seg_width - 1)])
            .copied()
            .unwrap_or(0)
    };
    for y in start_y_u..end_y_u {
        for x in start_x_u..end_x_u {
            let seg_x = (x as f32 - start_x) * scale_x;
            let seg_y = (y as f32 - start_y) * scale_y;
            let label = get_value_at_nearest(seg_x, seg_y);
            // Skip the last class entirely.
            if label == seg_classes - 1 {
                continue;
            }
            let color = self.colors[label % self.colors.len()];
            // Effective alpha: the color's own alpha, optionally scaled by
            // `opacity` (the == 1.0 fast path avoids the float round-trip).
            let alpha = if opacity == 1.0 {
                color[3] as u16
            } else {
                (color[3] as f32 * opacity).round() as u16
            };
            let dst_index = (y * dst_rs) + (x * dst_c);
            // Integer alpha blend over the first three channels only.
            for c in 0..3 {
                dst_slice[dst_index + c] = ((color[c] as u16 * alpha
                    + dst_slice[dst_index + c] as u16 * (255 - alpha))
                    / 255) as u8;
            }
        }
    }
    Ok(())
}
#[allow(clippy::too_many_arguments)]
/// Alpha-blends a single-channel YOLO instance mask onto an interleaved
/// 8-bit image.
///
/// The normalized ROI of `segmentation` is mapped to destination pixels;
/// each pixel samples the mask (truncating nearest sample) and pixels whose
/// mask value is >= 127 are blended with the color assigned to `class`.
/// The mask kernels in this module emit 0/255 values, so 127 is a midpoint
/// threshold.
///
/// * `dst_w`, `dst_h` - destination size in pixels
/// * `dst_rs` - destination row stride in bytes
/// * `dst_c` - bytes per destination pixel; only the first 3 channels of a
///   pixel are written
/// * `dst_slice` - destination pixel buffer
/// * `segmentation` - HxWx1 mask plus its normalized ROI
/// * `class` - class index used to pick the overlay color
/// * `opacity` - multiplier applied to the color's alpha channel
pub(super) fn render_yolo_segmentation(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    segmentation: &Segmentation,
    class: usize,
    opacity: f32,
) -> Result<()> {
    let seg = &segmentation.segmentation;
    let [seg_height, seg_width, classes] = *seg.shape() else {
        unreachable!("Array3 did not have [usize;3] as shape");
    };
    // YOLO instance masks are single-channel.
    debug_assert_eq!(classes, 1);
    // ROI corners in destination pixel space.
    let start_y = (dst_h as f32 * segmentation.ymin).round();
    let end_y = (dst_h as f32 * segmentation.ymax).round();
    let start_x = (dst_w as f32 * segmentation.xmin).round();
    let end_x = (dst_w as f32 * segmentation.xmax).round();
    // Endpoint-to-endpoint mapping from ROI pixels to mask cells.
    let scale_x = (seg_width as f32 - 1.0) / ((end_x - start_x) - 1.0);
    let scale_y = (seg_height as f32 - 1.0) / ((end_y - start_y) - 1.0);
    // Integer loop bounds, clamped to the destination image.
    let start_x_u = (start_x as usize).min(dst_w);
    let start_y_u = (start_y as usize).min(dst_h);
    let end_x_u = (end_x as usize).min(dst_w);
    let end_y_u = (end_y as usize).min(dst_h);
    for y in start_y_u..end_y_u {
        for x in start_x_u..end_x_u {
            // Truncating nearest sample into the mask.
            let seg_x = ((x as f32 - start_x) * scale_x) as usize;
            let seg_y = ((y as f32 - start_y) * scale_y) as usize;
            let val = *seg.get([seg_y, seg_x, 0]).unwrap_or(&0);
            if val < 127 {
                continue;
            }
            let color = self.colors[class % self.colors.len()];
            // Effective alpha: the color's own alpha, optionally scaled.
            let alpha = if opacity == 1.0 {
                color[3] as u16
            } else {
                (color[3] as f32 * opacity).round() as u16
            };
            let dst_index = (y * dst_rs) + (x * dst_c);
            // Integer alpha blend over the first three channels only.
            for c in 0..3 {
                dst_slice[dst_index + c] = ((color[c] as u16 * alpha
                    + dst_slice[dst_index + c] as u16 * (255 - alpha))
                    / 255) as u8;
            }
        }
    }
    Ok(())
}
#[allow(clippy::too_many_arguments)]
/// Draws a rectangular border (3 px thick) for each detection onto an
/// interleaved 8-bit image.
///
/// For each box, `inner` is the box edge in pixel coordinates (the min edge
/// nudged out by half a pixel, the max edge in by half a pixel before
/// rounding) and `outer` is `inner` grown by `LINE_THICKNESS` and clamped to
/// the image. The border is painted as four solid strips: top, left, right,
/// bottom. Only the first 3 channels of each pixel are written.
///
/// * `dst_w`, `dst_h` - destination size in pixels
/// * `dst_rs` - destination row stride in bytes
/// * `dst_c` - bytes per destination pixel
/// * `detect` - detections whose (canonicalized, clamped) boxes are drawn
/// * `color_mode` - maps (detection index, label) to a palette index
pub(super) fn render_box(
    &mut self,
    dst_w: usize,
    dst_h: usize,
    dst_rs: usize,
    dst_c: usize,
    dst_slice: &mut [u8],
    detect: &[DetectBox],
    color_mode: crate::ColorMode,
) -> Result<()> {
    const LINE_THICKNESS: usize = 3;
    for (idx, d) in detect.iter().enumerate() {
        use edgefirst_decoder::BoundingBox;
        // Palette choice is delegated to the color mode (by index or label).
        let color_index = color_mode.index(idx, d.label);
        let [r, g, b, _] = self.colors[color_index % self.colors.len()];
        // Clamp the normalized box to the unit square before scaling.
        let bbox = d.bbox.to_canonical();
        let bbox = BoundingBox {
            xmin: bbox.xmin.clamp(0.0, 1.0),
            ymin: bbox.ymin.clamp(0.0, 1.0),
            xmax: bbox.xmax.clamp(0.0, 1.0),
            ymax: bbox.ymax.clamp(0.0, 1.0),
        };
        // Box edge in pixels: [xmin, ymin, xmax, ymax].
        let inner = [
            ((dst_w - 1) as f32 * bbox.xmin - 0.5).round() as usize,
            ((dst_h - 1) as f32 * bbox.ymin - 0.5).round() as usize,
            ((dst_w - 1) as f32 * bbox.xmax + 0.5).round() as usize,
            ((dst_h - 1) as f32 * bbox.ymax + 0.5).round() as usize,
        ];
        // Edge expanded by the line thickness, clamped to the image.
        let outer = [
            inner[0].saturating_sub(LINE_THICKNESS),
            inner[1].saturating_sub(LINE_THICKNESS),
            (inner[2] + LINE_THICKNESS).min(dst_w),
            (inner[3] + LINE_THICKNESS).min(dst_h),
        ];
        // Top strip (full width of the border rectangle).
        for y in outer[1] + 1..=inner[1] {
            for x in outer[0] + 1..outer[2] {
                let index = (y * dst_rs) + (x * dst_c);
                dst_slice[index..(index + 3)].copy_from_slice(&[r, g, b]);
            }
        }
        // Left and right strips between the top and bottom strips.
        for y in inner[1]..inner[3] {
            for x in outer[0] + 1..=inner[0] {
                let index = (y * dst_rs) + (x * dst_c);
                dst_slice[index..(index + 3)].copy_from_slice(&[r, g, b]);
            }
            for x in inner[2]..outer[2] {
                let index = (y * dst_rs) + (x * dst_c);
                dst_slice[index..(index + 3)].copy_from_slice(&[r, g, b]);
            }
        }
        // Bottom strip.
        for y in inner[3]..outer[3] {
            for x in outer[0] + 1..outer[2] {
                let index = (y * dst_rs) + (x * dst_c);
                dst_slice[index..(index + 3)].copy_from_slice(&[r, g, b]);
            }
        }
    }
    Ok(())
}
/// Materializes per-detection binary masks at proto resolution.
///
/// For each detection, the mask is the sign of the dot product between that
/// detection's mask coefficients and the proto tensor, evaluated only over
/// the detection's bounding-box ROI. F32/F16/I8 coefficients and protos are
/// supported; a fully-quantized I8xI8 fast path is tried first and falls
/// back to the float path when its quantization mode is unsupported.
///
/// * `detect` - decoded detections; row `i` of `mask_coefficients` must
///   correspond to `detect[i]`
/// * `proto_data` - proto tensor, mask coefficients, and their layout
/// * `letterbox` - optional normalized `[x0, y0, x1, y1]` letterbox region
///   used to map ROI coordinates back to the original image
///
/// # Errors
/// `InvalidShape` for rank/dtype/metadata problems, `Internal` when the
/// coefficient row count disagrees with the detection count, and
/// `NotSupported` for NCHW protos outside the quantized fast path.
pub fn materialize_segmentations(
    &self,
    detect: &[crate::DetectBox],
    proto_data: &crate::ProtoData,
    letterbox: Option<[f32; 4]>,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    use edgefirst_tensor::{DType, TensorMapTrait, TensorTrait};
    let _span = tracing::trace_span!(
        "materialize_masks",
        mode = "proto",
        n_detections = detect.len(),
    )
    .entered();
    if detect.is_empty() {
        return Ok(Vec::new());
    }
    let proto_shape = proto_data.protos.shape();
    if proto_shape.len() != 3 {
        return Err(crate::Error::InvalidShape(format!(
            "protos tensor must be rank-3, got {proto_shape:?}"
        )));
    }
    // Interpret the rank-3 shape according to the declared layout.
    let (proto_h, proto_w, num_protos) = match proto_data.layout {
        edgefirst_decoder::ProtoLayout::Nhwc => {
            (proto_shape[0], proto_shape[1], proto_shape[2])
        }
        edgefirst_decoder::ProtoLayout::Nchw => {
            (proto_shape[1], proto_shape[2], proto_shape[0])
        }
    };
    // Coefficients must be [N, num_protos].
    let coeff_shape = proto_data.mask_coefficients.shape();
    if coeff_shape.len() != 2 || coeff_shape[1] != num_protos {
        return Err(crate::Error::InvalidShape(format!(
            "mask_coefficients shape {coeff_shape:?} incompatible with protos \
            {proto_shape:?} (expected [N, {num_protos}])"
        )));
    }
    if coeff_shape[0] == 0 {
        return Ok(Vec::new());
    }
    if coeff_shape[0] != detect.len() {
        return Err(crate::Error::Internal(format!(
            "mask_coefficients rows {} != detection count {}",
            coeff_shape[0],
            detect.len()
        )));
    }
    // Precompute the inverse letterbox transform: offsets plus reciprocal
    // extents (degenerate extents fall back to 1.0).
    let (lx0, inv_lw, ly0, inv_lh) = match letterbox {
        Some([lx0, ly0, lx1, ly1]) => {
            let lw = lx1 - lx0;
            let lh = ly1 - ly0;
            (
                lx0,
                if lw > 0.0 { 1.0 / lw } else { 1.0 },
                ly0,
                if lh > 0.0 { 1.0 / lh } else { 1.0 },
            )
        }
        None => (0.0_f32, 1.0_f32, 0.0_f32, 1.0_f32),
    };
    // Fully-quantized fast path: integer dot products with zero-point
    // corrections, no dequantized copy of the tensors.
    if proto_data.mask_coefficients.dtype() == DType::I8
        && proto_data.protos.dtype() == DType::I8
    {
        let coeff_t = proto_data
            .mask_coefficients
            .as_i8()
            .expect("I8 coefficients");
        let coeff_m = coeff_t.map()?;
        let coeff_quant = coeff_t.quantization().ok_or_else(|| {
            crate::Error::InvalidShape(
                "I8 mask_coefficients require quantization metadata".into(),
            )
        })?;
        let proto_t = proto_data.protos.as_i8().expect("I8 protos");
        let proto_m = proto_t.map()?;
        let proto_quant = proto_t.quantization().ok_or_else(|| {
            crate::Error::InvalidShape("I8 protos require quantization metadata".into())
        })?;
        match proto_segmentations_i8_i8(
            detect,
            coeff_m.as_slice(),
            coeff_quant,
            proto_m.as_slice(),
            proto_quant,
            proto_h,
            proto_w,
            num_protos,
            lx0,
            inv_lw,
            ly0,
            inv_lh,
            proto_data.layout,
        ) {
            Ok(result) => return Ok(result),
            Err(crate::Error::NotSupported(_)) => {
                // Unsupported quantization mode on the fast path; fall
                // through to the generic float path below.
            }
            Err(e) => return Err(e),
        }
    }
    // The float path only implements NHWC protos.
    if proto_data.layout == edgefirst_decoder::ProtoLayout::Nchw {
        return Err(crate::Error::NotSupported(
            "NCHW proto layout requires I8 protos and coefficients with per-tensor quantization"
                .into(),
        ));
    }
    // Convert coefficients to f32. The owned storage is required because the
    // tensor map guard is dropped at the end of each match arm.
    let coeff_f32_storage: Vec<f32>;
    let coeff_f32_slice: &[f32] = match proto_data.mask_coefficients.dtype() {
        DType::F32 => {
            let t = proto_data
                .mask_coefficients
                .as_f32()
                .expect("dtype matched F32");
            let m = t.map()?;
            coeff_f32_storage = m.as_slice().to_vec();
            &coeff_f32_storage[..]
        }
        DType::F16 => {
            let t = proto_data
                .mask_coefficients
                .as_f16()
                .expect("dtype matched F16");
            let m = t.map()?;
            coeff_f32_storage = m.as_slice().iter().map(|v| v.to_f32()).collect();
            &coeff_f32_storage[..]
        }
        DType::I8 => {
            let t = proto_data
                .mask_coefficients
                .as_i8()
                .expect("dtype matched I8");
            let m = t.map()?;
            coeff_f32_storage = if let Some(q) = t.quantization() {
                use edgefirst_tensor::QuantMode;
                // Dequantize with per-tensor scale/zero-point; per-channel
                // coefficient quantization is not implemented.
                let (scale, zp) = match q.mode() {
                    QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
                    QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
                    other => {
                        return Err(crate::Error::NotSupported(format!(
                            "I8 mask_coefficients quantization mode {other:?} not supported"
                        )));
                    }
                };
                m.as_slice()
                    .iter()
                    .map(|&v| (v as f32 - zp) * scale)
                    .collect()
            } else {
                // No quantization metadata: use the raw i8 values directly.
                m.as_slice().iter().map(|&v| v as f32).collect()
            };
            &coeff_f32_storage[..]
        }
        other => {
            return Err(crate::Error::InvalidShape(format!(
                "mask_coefficients dtype {other:?} not supported; expected F32, F16, or I8"
            )));
        }
    };
    // Dispatch on the proto dtype; each arm runs one fused dot+sign kernel
    // per detection in parallel, then converts the ROI mask into a
    // normalized Segmentation.
    match proto_data.protos.dtype() {
        DType::I8 => {
            let t = proto_data.protos.as_i8().expect("dtype matched I8");
            let quant = t.quantization().ok_or_else(|| {
                crate::Error::InvalidShape("I8 protos require quantization metadata".into())
            })?;
            let m = t.map()?;
            let protos_slice = m.as_slice();
            detect
                .par_iter()
                .enumerate()
                .map(|(i, det)| {
                    let coeff = &coeff_f32_slice[i * num_protos..(i + 1) * num_protos];
                    let (x0, y0, x1, y1, roi_w, roi_h) =
                        bbox_to_proto_roi(det, proto_w, proto_h);
                    let mask = fused_dequant_dot_sign_i8_slice(
                        protos_slice,
                        coeff,
                        quant,
                        proto_h,
                        proto_w,
                        y0,
                        x0,
                        roi_h,
                        roi_w,
                        num_protos,
                    )?;
                    Ok(seg_from_roi(
                        mask, x0, y0, x1, y1, proto_w, proto_h, lx0, inv_lw, ly0, inv_lh,
                    ))
                })
                .collect()
        }
        DType::F32 => {
            let t = proto_data.protos.as_f32().expect("dtype matched F32");
            let m = t.map()?;
            let protos_slice = m.as_slice();
            detect
                .par_iter()
                .enumerate()
                .map(|(i, det)| {
                    let coeff = &coeff_f32_slice[i * num_protos..(i + 1) * num_protos];
                    let (x0, y0, x1, y1, roi_w, roi_h) =
                        bbox_to_proto_roi(det, proto_w, proto_h);
                    let mask = fused_dot_sign_f32_slice(
                        protos_slice,
                        coeff,
                        proto_h,
                        proto_w,
                        y0,
                        x0,
                        roi_h,
                        roi_w,
                        num_protos,
                    );
                    Ok(seg_from_roi(
                        mask, x0, y0, x1, y1, proto_w, proto_h, lx0, inv_lw, ly0, inv_lh,
                    ))
                })
                .collect()
        }
        DType::F16 => {
            let t = proto_data.protos.as_f16().expect("dtype matched F16");
            let m = t.map()?;
            let protos_slice = m.as_slice();
            detect
                .par_iter()
                .enumerate()
                .map(|(i, det)| {
                    let coeff = &coeff_f32_slice[i * num_protos..(i + 1) * num_protos];
                    let (x0, y0, x1, y1, roi_w, roi_h) =
                        bbox_to_proto_roi(det, proto_w, proto_h);
                    let mask = fused_dot_sign_f16_slice(
                        protos_slice,
                        coeff,
                        proto_h,
                        proto_w,
                        y0,
                        x0,
                        roi_h,
                        roi_w,
                        num_protos,
                    );
                    Ok(seg_from_roi(
                        mask, x0, y0, x1, y1, proto_w, proto_h, lx0, inv_lw, ly0, inv_lh,
                    ))
                })
                .collect()
        }
        other => Err(crate::Error::InvalidShape(format!(
            "proto tensor dtype {other:?} not supported"
        ))),
    }
}
/// Materializes per-detection masks scaled to a caller-requested
/// `width` x `height` output resolution (rather than proto resolution).
///
/// Validation mirrors [`materialize_segmentations`]: rank-3 protos,
/// `[N, num_protos]` coefficients, one coefficient row per detection. A
/// fully-quantized I8xI8 fast path is tried first; otherwise coefficients
/// are dequantized to f32 and the work is delegated to the
/// dtype-specialized `scaled_segmentations_*_slice` helpers.
///
/// * `letterbox` - optional normalized `[x0, y0, x1, y1]` letterbox region
///   forwarded to the scaled kernels
/// * `width`, `height` - output mask resolution; must both be non-zero
///
/// # Errors
/// `InvalidShape` for rank/dtype/metadata problems or zero output size,
/// `Internal` on detection/coefficient count mismatch, `NotSupported` for
/// NCHW protos outside the quantized fast path.
pub fn materialize_scaled_segmentations(
    &self,
    detect: &[crate::DetectBox],
    proto_data: &crate::ProtoData,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    use edgefirst_tensor::{DType, TensorMapTrait, TensorTrait};
    let _span = tracing::trace_span!(
        "materialize_masks",
        mode = "scaled",
        n_detections = detect.len(),
        width,
        height,
    )
    .entered();
    if detect.is_empty() {
        return Ok(Vec::new());
    }
    if width == 0 || height == 0 {
        return Err(crate::Error::InvalidShape(
            "Scaled mask width/height must be positive".into(),
        ));
    }
    let proto_shape = proto_data.protos.shape();
    if proto_shape.len() != 3 {
        return Err(crate::Error::InvalidShape(format!(
            "protos tensor must be rank-3, got {proto_shape:?}"
        )));
    }
    // Interpret the rank-3 shape according to the declared layout.
    let (proto_h, proto_w, num_protos) = match proto_data.layout {
        edgefirst_decoder::ProtoLayout::Nhwc => {
            (proto_shape[0], proto_shape[1], proto_shape[2])
        }
        edgefirst_decoder::ProtoLayout::Nchw => {
            (proto_shape[1], proto_shape[2], proto_shape[0])
        }
    };
    // Coefficients must be [N, num_protos].
    let coeff_shape = proto_data.mask_coefficients.shape();
    if coeff_shape.len() != 2 || coeff_shape[1] != num_protos {
        return Err(crate::Error::InvalidShape(format!(
            "mask_coefficients shape {coeff_shape:?} incompatible with protos \
            {proto_shape:?}"
        )));
    }
    if coeff_shape[0] == 0 {
        return Ok(Vec::new());
    }
    if coeff_shape[0] != detect.len() {
        return Err(crate::Error::Internal(format!(
            "mask_coefficients rows {} != detection count {}",
            coeff_shape[0],
            detect.len()
        )));
    }
    // Fully-quantized fast path; falls through on unsupported quantization.
    if proto_data.mask_coefficients.dtype() == DType::I8
        && proto_data.protos.dtype() == DType::I8
    {
        let coeff_t = proto_data
            .mask_coefficients
            .as_i8()
            .expect("I8 coefficients");
        let coeff_m = coeff_t.map()?;
        let coeff_quant = coeff_t.quantization().ok_or_else(|| {
            crate::Error::InvalidShape(
                "I8 mask_coefficients require quantization metadata".into(),
            )
        })?;
        let proto_t = proto_data.protos.as_i8().expect("I8 protos");
        let proto_m = proto_t.map()?;
        let proto_quant = proto_t.quantization().ok_or_else(|| {
            crate::Error::InvalidShape("I8 protos require quantization metadata".into())
        })?;
        match scaled_segmentations_i8_i8(
            detect,
            coeff_m.as_slice(),
            coeff_quant,
            proto_m.as_slice(),
            proto_quant,
            proto_h,
            proto_w,
            num_protos,
            letterbox,
            width,
            height,
            proto_data.layout,
        ) {
            Ok(result) => return Ok(result),
            Err(crate::Error::NotSupported(_)) => {
                // Unsupported quantization mode on the fast path; fall
                // through to the generic float path below.
            }
            Err(e) => return Err(e),
        }
    }
    // The float path only implements NHWC protos.
    if proto_data.layout == edgefirst_decoder::ProtoLayout::Nchw {
        return Err(crate::Error::NotSupported(
            "NCHW proto layout requires I8 protos and coefficients with per-tensor quantization"
                .into(),
        ));
    }
    // Convert all coefficient rows to f32 up front.
    let coeff_f32: Vec<f32> = match proto_data.mask_coefficients.dtype() {
        DType::F32 => {
            let t = proto_data.mask_coefficients.as_f32().expect("F32");
            let m = t.map()?;
            m.as_slice().to_vec()
        }
        DType::F16 => {
            let t = proto_data.mask_coefficients.as_f16().expect("F16");
            let m = t.map()?;
            m.as_slice().iter().map(|v| v.to_f32()).collect()
        }
        DType::I8 => {
            let t = proto_data.mask_coefficients.as_i8().expect("I8");
            let m = t.map()?;
            let q = t.quantization().ok_or_else(|| {
                crate::Error::InvalidShape(
                    "I8 mask_coefficients require quantization metadata".into(),
                )
            })?;
            use edgefirst_tensor::QuantMode;
            // Only per-tensor quantization is supported for coefficients.
            let (scale, zp) = match q.mode() {
                QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
                QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
                _ => {
                    return Err(crate::Error::NotSupported(
                        "per-channel mask_coefficients not supported".into(),
                    ))
                }
            };
            m.as_slice()
                .iter()
                .map(|&v| (v as f32 - zp) * scale)
                .collect()
        }
        other => {
            return Err(crate::Error::InvalidShape(format!(
                "mask_coefficients dtype {other:?} not supported"
            )));
        }
    };
    // Dispatch on the proto dtype to the scaled-resolution kernels.
    match proto_data.protos.dtype() {
        DType::F32 => {
            let t = proto_data.protos.as_f32().expect("F32");
            let m = t.map()?;
            scaled_segmentations_f32_slice(
                detect,
                &coeff_f32,
                m.as_slice(),
                proto_h,
                proto_w,
                num_protos,
                letterbox,
                width,
                height,
            )
        }
        DType::F16 => {
            let t = proto_data.protos.as_f16().expect("F16");
            let m = t.map()?;
            scaled_segmentations_f16_slice(
                detect,
                &coeff_f32,
                m.as_slice(),
                proto_h,
                proto_w,
                num_protos,
                letterbox,
                width,
                height,
            )
        }
        DType::I8 => {
            let t = proto_data.protos.as_i8().expect("I8");
            let m = t.map()?;
            let quant = t.quantization().ok_or_else(|| {
                crate::Error::InvalidShape("I8 protos require quantization metadata".into())
            })?;
            scaled_segmentations_i8_slice(
                detect,
                &coeff_f32,
                m.as_slice(),
                proto_h,
                proto_w,
                num_protos,
                quant,
                letterbox,
                width,
                height,
            )
        }
        other => Err(crate::Error::InvalidShape(format!(
            "proto tensor dtype {other:?} not supported"
        ))),
    }
}
}
/// Converts a detection's normalized bounding box into an integer region of
/// interest on the proto grid.
///
/// Returns `(x0, y0, x1, y1, roi_w, roi_h)`: `x0`/`y0` truncate toward zero
/// and are kept strictly inside the grid, `x1`/`y1` round up and are clamped
/// to the grid extent (exclusive bounds), and the ROI dimensions are forced
/// to at least one cell.
fn bbox_to_proto_roi(
    det: &DetectBox,
    proto_w: usize,
    proto_h: usize,
) -> (usize, usize, usize, usize, usize, usize) {
    let b = det.bbox.to_canonical();
    // Lower corner: truncate, then keep inside the grid.
    let lower = |v: f32, extent: usize| {
        ((v.clamp(0.0, 1.0) * extent as f32) as usize).min(extent.saturating_sub(1))
    };
    // Upper corner: round up so fractional coverage still includes the cell.
    let upper = |v: f32, extent: usize| {
        ((v.clamp(0.0, 1.0) * extent as f32).ceil() as usize).min(extent)
    };
    let (x0, x1) = (lower(b.xmin, proto_w), upper(b.xmax, proto_w));
    let (y0, y1) = (lower(b.ymin, proto_h), upper(b.ymax, proto_h));
    let roi_w = x1.saturating_sub(x0).max(1);
    let roi_h = y1.saturating_sub(y0).max(1);
    (x0, y0, x1, y1, roi_w, roi_h)
}
#[allow(clippy::too_many_arguments)]
/// Wraps a binarized ROI mask into a `Segmentation`, converting the ROI's
/// proto-grid corners back into normalized image coordinates while undoing
/// the letterbox transform (`lx0`/`ly0` offsets, `inv_lw`/`inv_lh`
/// reciprocal extents). All corners are clamped to the unit interval.
fn seg_from_roi(
    mask: ndarray::Array3<u8>,
    x0: usize,
    y0: usize,
    x1: usize,
    y1: usize,
    proto_w: usize,
    proto_h: usize,
    lx0: f32,
    inv_lw: f32,
    ly0: f32,
    inv_lh: f32,
) -> edgefirst_decoder::Segmentation {
    // Normalize a proto-grid coordinate and remove the letterbox padding.
    let unletterbox_x = |v: usize| (((v as f32 / proto_w as f32) - lx0) * inv_lw).clamp(0.0, 1.0);
    let unletterbox_y = |v: usize| (((v as f32 / proto_h as f32) - ly0) * inv_lh).clamp(0.0, 1.0);
    edgefirst_decoder::Segmentation {
        xmin: unletterbox_x(x0),
        ymin: unletterbox_y(y0),
        xmax: unletterbox_x(x1),
        ymax: unletterbox_y(y1),
        segmentation: mask,
    }
}
#[allow(clippy::too_many_arguments)]
/// Fully-quantized (i8 coefficients x i8 protos) mask materialization at
/// proto resolution.
///
/// Operates directly on the raw integer values: the sign of the dequantized
/// logit equals the sign of the zero-point-corrected integer dot product,
/// via the identity
/// `sum((c - zc)(p - zp)) = sum(c*p) - zc*sum(p) - zp*sum(c) + n*zc*zp`
/// (the positive `scale_c * scale_p` factor cannot change the sign).
/// Supports NHWC and NCHW layouts; on aarch64 it uses NEON, preferring the
/// SDOT instruction when the CPU reports the `dotprod` feature at runtime.
///
/// # Errors
/// `NotSupported` when either tensor uses per-channel quantization.
fn proto_segmentations_i8_i8(
    detect: &[crate::DetectBox],
    coeff_all: &[i8],
    coeff_quant: &edgefirst_tensor::Quantization,
    protos: &[i8],
    proto_quant: &edgefirst_tensor::Quantization,
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    lx0: f32,
    inv_lw: f32,
    ly0: f32,
    inv_lh: f32,
    layout: edgefirst_decoder::ProtoLayout,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    use edgefirst_tensor::QuantMode;
    let _span = tracing::trace_span!(
        "mask_i8_fastpath",
        n = detect.len(),
        proto_h,
        proto_w,
        num_protos,
        ?layout,
    )
    .entered();
    // Zero points; only per-tensor modes are handled on this path.
    let zp_c: i32 = match coeff_quant.mode() {
        QuantMode::PerTensor { zero_point, .. } => zero_point,
        QuantMode::PerTensorSymmetric { .. } => 0,
        _ => {
            return Err(crate::Error::NotSupported(
                "per-channel coeff quantization not supported on proto-res i8 path".into(),
            ))
        }
    };
    let zp_p: i32 = match proto_quant.mode() {
        QuantMode::PerTensor { zero_point, .. } => zero_point,
        QuantMode::PerTensorSymmetric { .. } => 0,
        _ => {
            return Err(crate::Error::NotSupported(
                "per-channel proto quantization not supported on proto-res i8 path".into(),
            ))
        }
    };
    let hw = proto_h * proto_w;
    // Per-pixel channel sums, used for the `zc * sum(p)` correction term.
    // Skipped entirely when the coefficient zero-point is zero.
    let proto_sums: Vec<i32> = if zp_c != 0 {
        match layout {
            edgefirst_decoder::ProtoLayout::Nhwc => (0..hw)
                .map(|px_idx| {
                    let base = px_idx * num_protos;
                    protos[base..base + num_protos]
                        .iter()
                        .map(|&v| v as i32)
                        .sum()
                })
                .collect(),
            edgefirst_decoder::ProtoLayout::Nchw => {
                // Accumulate plane by plane to keep reads sequential.
                let mut sums = vec![0i32; hw];
                for c in 0..num_protos {
                    let plane = &protos[c * hw..];
                    for (px, s) in sums.iter_mut().enumerate() {
                        *s += plane[px] as i32;
                    }
                }
                sums
            }
        }
    } else {
        Vec::new()
    };
    // SDOT is an optional aarch64 extension; detect it once, at runtime.
    #[cfg(target_arch = "aarch64")]
    let use_dotprod = std::arch::is_aarch64_feature_detected!("dotprod");
    detect
        .par_iter()
        .enumerate()
        .map(|(i, det)| {
            let coeff = &coeff_all[i * num_protos..(i + 1) * num_protos];
            let (x0, y0, x1, y1, roi_w, roi_h) = bbox_to_proto_roi(det, proto_w, proto_h);
            // Detection-constant part of the zero-point correction:
            // zp_p*sum(c) - n*zc*zp. The per-pixel part (zc*sum(p)) is
            // applied inside the loops below.
            let coeff_sum: i32 = coeff.iter().map(|&c| c as i32).sum();
            let bias = zp_p * coeff_sum - (num_protos as i32) * zp_c * zp_p;
            let mut mask_buf = vec![0u8; roi_h * roi_w];
            match layout {
                edgefirst_decoder::ProtoLayout::Nhwc => {
                    let stride_y = proto_w * num_protos;
                    #[cfg(target_arch = "aarch64")]
                    {
                        if use_dotprod {
                            // SDOT-accelerated inner loop.
                            for ly in 0..roi_h {
                                let py = y0 + ly;
                                let row_base = py * stride_y + x0 * num_protos;
                                for lx in 0..roi_w {
                                    let pix_base = row_base + lx * num_protos;
                                    let proto_px = &protos[pix_base..pix_base + num_protos];
                                    // SAFETY: `coeff` and `proto_px` each
                                    // hold `num_protos` readable i8 values,
                                    // and `use_dotprod` confirmed SDOT.
                                    let raw_dot = unsafe {
                                        dot_i8_neon_dotprod(
                                            coeff.as_ptr(),
                                            proto_px.as_ptr(),
                                            num_protos,
                                        )
                                    };
                                    let correction = if zp_c != 0 {
                                        zp_c * proto_sums[py * proto_w + x0 + lx]
                                    } else {
                                        0
                                    };
                                    let logit = raw_dot - correction - bias;
                                    if logit > 0 {
                                        mask_buf[ly * roi_w + lx] = 255;
                                    }
                                }
                            }
                        } else {
                            // Baseline NEON inner loop (no SDOT).
                            for ly in 0..roi_h {
                                let py = y0 + ly;
                                let row_base = py * stride_y + x0 * num_protos;
                                for lx in 0..roi_w {
                                    let pix_base = row_base + lx * num_protos;
                                    let proto_px = &protos[pix_base..pix_base + num_protos];
                                    // SAFETY: `coeff` and `proto_px` each
                                    // hold `num_protos` readable i8 values.
                                    let raw_dot = unsafe {
                                        dot_i8_neon_base(
                                            coeff.as_ptr(),
                                            proto_px.as_ptr(),
                                            num_protos,
                                        )
                                    };
                                    let correction = if zp_c != 0 {
                                        zp_c * proto_sums[py * proto_w + x0 + lx]
                                    } else {
                                        0
                                    };
                                    let logit = raw_dot - correction - bias;
                                    if logit > 0 {
                                        mask_buf[ly * roi_w + lx] = 255;
                                    }
                                }
                            }
                        }
                    }
                    #[cfg(not(target_arch = "aarch64"))]
                    {
                        // Portable scalar inner loop.
                        for ly in 0..roi_h {
                            let py = y0 + ly;
                            let row_base = py * stride_y + x0 * num_protos;
                            for lx in 0..roi_w {
                                let pix_base = row_base + lx * num_protos;
                                let proto_px = &protos[pix_base..pix_base + num_protos];
                                let raw_dot = dot_i8_scalar(coeff, proto_px, num_protos);
                                let correction = if zp_c != 0 {
                                    zp_c * proto_sums[py * proto_w + x0 + lx]
                                } else {
                                    0
                                };
                                let logit = raw_dot - correction - bias;
                                if logit > 0 {
                                    mask_buf[ly * roi_w + lx] = 255;
                                }
                            }
                        }
                    }
                }
                edgefirst_decoder::ProtoLayout::Nchw => {
                    // Planar layout: accumulate one channel plane at a time
                    // into an i32 buffer, then threshold in a second pass.
                    let mut accum = vec![0i32; roi_h * roi_w];
                    for c in 0..num_protos {
                        let plane = &protos[c * hw..];
                        let coeff_c = coeff[c] as i32;
                        for ly in 0..roi_h {
                            let py = y0 + ly;
                            let row_start = py * proto_w + x0;
                            let out_row_start = ly * roi_w;
                            for lx in 0..roi_w {
                                accum[out_row_start + lx] += coeff_c * plane[row_start + lx] as i32;
                            }
                        }
                    }
                    for ly in 0..roi_h {
                        let py = y0 + ly;
                        for lx in 0..roi_w {
                            let idx = ly * roi_w + lx;
                            let correction = if zp_c != 0 {
                                zp_c * proto_sums[py * proto_w + x0 + lx]
                            } else {
                                0
                            };
                            let logit = accum[idx] - correction - bias;
                            if logit > 0 {
                                mask_buf[idx] = 255;
                            }
                        }
                    }
                }
            }
            let mask = ndarray::Array3::from_shape_vec((roi_h, roi_w, 1), mask_buf)
                .expect("mask_buf length matches roi_h * roi_w");
            Ok(seg_from_roi(
                mask, x0, y0, x1, y1, proto_w, proto_h, lx0, inv_lw, ly0, inv_lh,
            ))
        })
        .collect()
}
#[allow(clippy::too_many_arguments)]
/// Sign-mask kernel for f32 NHWC protos: for every pixel in the ROI, writes
/// 255 where `coeff · protos[pixel]` is positive and 0 otherwise. The
/// returned mask has shape `(roi_h, roi_w, 1)`.
fn fused_dot_sign_f32_slice(
    protos: &[f32],
    coeff: &[f32],
    _proto_h: usize,
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    let row_stride = proto_w * num_protos;
    let weights = &coeff[..num_protos];
    let mut mask = vec![0u8; roi_h * roi_w];
    for ry in 0..roi_h {
        let row_start = (y0 + ry) * row_stride + x0 * num_protos;
        let out_row = &mut mask[ry * roi_w..(ry + 1) * roi_w];
        for (rx, out_px) in out_row.iter_mut().enumerate() {
            let px_start = row_start + rx * num_protos;
            let pixel = &protos[px_start..px_start + num_protos];
            // Dot product, unrolled 4-wide with the same accumulation
            // grouping as the tail loop's single-lane steps.
            let mut acc = 0.0_f32;
            let mut w4 = weights.chunks_exact(4);
            let mut p4 = pixel.chunks_exact(4);
            for (w, p) in (&mut w4).zip(&mut p4) {
                acc += w[0] * p[0] + w[1] * p[1] + w[2] * p[2] + w[3] * p[3];
            }
            for (w, p) in w4.remainder().iter().zip(p4.remainder()) {
                acc += w * p;
            }
            if acc > 0.0 {
                *out_px = 255;
            }
        }
    }
    ndarray::Array3::from_shape_vec((roi_h, roi_w, 1), mask)
        .expect("mask length matches roi_h * roi_w")
}
#[allow(clippy::too_many_arguments)]
/// Sign-mask kernel for f16 NHWC protos.
///
/// Dispatches at compile time: when the crate is built for x86-64 with the
/// `f16c` and `fma` target features enabled, the SIMD implementation is
/// used; otherwise the portable scalar fallback runs.
fn fused_dot_sign_f16_slice(
    protos: &[half::f16],
    coeff: &[f32],
    _proto_h: usize,
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    #[cfg(all(
        target_arch = "x86_64",
        target_feature = "f16c",
        target_feature = "fma"
    ))]
    {
        // SAFETY: this cfg guarantees the f16c/fma target features are
        // enabled for the whole compilation, satisfying the callee's
        // `#[target_feature]` requirement.
        unsafe {
            fused_dot_sign_f16_slice_f16c(protos, coeff, proto_w, y0, x0, roi_h, roi_w, num_protos)
        }
    }
    #[cfg(not(all(
        target_arch = "x86_64",
        target_feature = "f16c",
        target_feature = "fma"
    )))]
    {
        fused_dot_sign_f16_slice_scalar(protos, coeff, proto_w, y0, x0, roi_h, roi_w, num_protos)
    }
}
#[allow(clippy::too_many_arguments)]
/// Scalar sign-mask kernel for f16 NHWC protos: widens each proto value to
/// f32, dots it with `coeff`, and writes 255 where the result is positive
/// (0 otherwise). The returned mask has shape `(roi_h, roi_w, 1)`.
fn fused_dot_sign_f16_slice_scalar(
    protos: &[half::f16],
    coeff: &[f32],
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    let row_stride = proto_w * num_protos;
    let weights = &coeff[..num_protos];
    let mut mask = vec![0u8; roi_h * roi_w];
    for ry in 0..roi_h {
        let row_start = (y0 + ry) * row_stride + x0 * num_protos;
        let out_row = &mut mask[ry * roi_w..(ry + 1) * roi_w];
        for (rx, out_px) in out_row.iter_mut().enumerate() {
            let px_start = row_start + rx * num_protos;
            let pixel = &protos[px_start..px_start + num_protos];
            // Dot product, unrolled 4-wide with the same accumulation
            // grouping as the tail loop's single-lane steps.
            let mut acc = 0.0_f32;
            let mut w4 = weights.chunks_exact(4);
            let mut p4 = pixel.chunks_exact(4);
            for (w, p) in (&mut w4).zip(&mut p4) {
                acc += w[0] * p[0].to_f32()
                    + w[1] * p[1].to_f32()
                    + w[2] * p[2].to_f32()
                    + w[3] * p[3].to_f32();
            }
            for (w, p) in w4.remainder().iter().zip(p4.remainder()) {
                acc += w * p.to_f32();
            }
            if acc > 0.0 {
                *out_px = 255;
            }
        }
    }
    ndarray::Array3::from_shape_vec((roi_h, roi_w, 1), mask)
        .expect("mask length matches roi_h * roi_w")
}
#[cfg(all(
    target_arch = "x86_64",
    target_feature = "f16c",
    target_feature = "fma"
))]
#[allow(clippy::too_many_arguments)]
#[target_feature(enable = "f16c,fma,avx")]
/// SIMD sign-mask kernel for f16 NHWC protos on x86-64.
///
/// Per pixel: loads 8 f16 proto values at a time, widens them to f32 with
/// `VCVTPH2PS`, accumulates `coeff * proto` with FMA, horizontally sums the
/// vector accumulator, finishes any remaining lanes scalar, and writes 255
/// where the total is positive.
///
/// # Safety
/// Must only be called when the `f16c`, `fma`, and `avx` target features
/// are available on the executing CPU (enforced here at compile time by the
/// surrounding `cfg`). `protos` and `coeff` must cover the indices implied
/// by the ROI/stride arguments, as with the scalar variant.
unsafe fn fused_dot_sign_f16_slice_f16c(
    protos: &[half::f16],
    coeff: &[f32],
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> ndarray::Array3<u8> {
    use core::arch::x86_64::{
        _mm256_castps256_ps128, _mm256_cvtph_ps, _mm256_extractf128_ps, _mm256_fmadd_ps,
        _mm256_loadu_ps, _mm256_setzero_ps, _mm_add_ps, _mm_cvtss_f32, _mm_hadd_ps,
        _mm_loadu_si128,
    };
    let stride_y = proto_w * num_protos;
    let chunks8 = num_protos / 8;
    let mut mask_buf = vec![0u8; roi_h * roi_w];
    for y in 0..roi_h {
        let row_base = (y0 + y) * stride_y + x0 * num_protos;
        let out_row = &mut mask_buf[y * roi_w..(y + 1) * roi_w];
        for (x, out_px) in out_row.iter_mut().enumerate() {
            let base = row_base + x * num_protos;
            let mut acc_v = _mm256_setzero_ps();
            let mut k = 0;
            for _ in 0..chunks8 {
                // Unaligned load of 8 packed f16 values (128 bits).
                let p_ptr = protos
                    .as_ptr()
                    .add(base + k)
                    .cast::<core::arch::x86_64::__m128i>();
                let raw = _mm_loadu_si128(p_ptr);
                // Widen f16 -> f32 and fuse multiply-add into the accumulator.
                let widened = _mm256_cvtph_ps(raw);
                let coeffs_v = _mm256_loadu_ps(coeff.as_ptr().add(k));
                acc_v = _mm256_fmadd_ps(coeffs_v, widened, acc_v);
                k += 8;
            }
            // Horizontal sum of the 8 accumulator lanes.
            let lo = _mm256_castps256_ps128(acc_v);
            let hi = _mm256_extractf128_ps::<1>(acc_v);
            let sum4 = _mm_add_ps(lo, hi);
            let sum2 = _mm_hadd_ps(sum4, sum4);
            let sum1 = _mm_hadd_ps(sum2, sum2);
            let mut acc = _mm_cvtss_f32(sum1);
            // Scalar tail for num_protos not divisible by 8.
            while k < num_protos {
                acc += coeff[k] * protos[base + k].to_f32();
                k += 1;
            }
            if acc > 0.0 {
                *out_px = 255;
            }
        }
    }
    ndarray::Array3::from_shape_vec((roi_h, roi_w, 1), mask_buf)
        .expect("mask_buf length matches roi_h * roi_w")
}
#[allow(clippy::too_many_arguments)]
/// Sign-mask kernel for i8 NHWC protos that avoids materializing a
/// dequantized copy of the tensor.
///
/// Coefficients are pre-multiplied by the quantization scale(s) so the
/// inner loop is a plain f32 dot product against the raw i8 values; the
/// zero-point contribution is folded into a constant threshold, using
/// `sum(c_k*s_k*(p_k - zp_k)) > 0  <=>  sum(c_k*s_k*p_k) > sum(c_k*s_k*zp_k)`.
/// Output has shape `(roi_h, roi_w, 1)` with 255 where the dequantized
/// logit is positive and 0 elsewhere.
///
/// # Errors
/// `NotSupported` when per-channel quantization is declared on an axis
/// other than 2 (the channel axis is the only one this kernel implements).
fn fused_dequant_dot_sign_i8_slice(
    protos: &[i8],
    coeff: &[f32],
    quant: &edgefirst_tensor::Quantization,
    _proto_h: usize,
    proto_w: usize,
    y0: usize,
    x0: usize,
    roi_h: usize,
    roi_w: usize,
    num_protos: usize,
) -> crate::Result<ndarray::Array3<u8>> {
    use edgefirst_tensor::QuantMode;
    let stride_y = proto_w * num_protos;
    // Scratch for the scale-premultiplied coefficients: stack storage for
    // typical proto counts (<= 64), heap otherwise.
    let mut stack_scratch = [0.0_f32; 64];
    let mut heap_scratch: Vec<f32>;
    let scaled_coeff: &mut [f32] = if num_protos <= stack_scratch.len() {
        &mut stack_scratch[..num_protos]
    } else {
        heap_scratch = vec![0.0_f32; num_protos];
        heap_scratch.as_mut_slice()
    };
    // `zp_offset` is the constant threshold sum(c_k * s_k * zp_k).
    let zp_offset: f32;
    match quant.mode() {
        QuantMode::PerTensorSymmetric { scale } => {
            for k in 0..num_protos {
                scaled_coeff[k] = coeff[k] * scale;
            }
            zp_offset = 0.0;
        }
        QuantMode::PerTensor { scale, zero_point } => {
            for k in 0..num_protos {
                scaled_coeff[k] = coeff[k] * scale;
            }
            // `scaled_coeff` is exactly `num_protos` long, so the whole
            // slice sum is the threshold (no `.take` needed).
            zp_offset = zero_point as f32 * scaled_coeff.iter().sum::<f32>();
        }
        QuantMode::PerChannelSymmetric { scales, axis } => {
            if axis != 2 {
                return Err(crate::Error::NotSupported(format!(
                    "per-channel quantization on axis {axis} not supported \
                    (only channel axis 2 is implemented on this kernel)"
                )));
            }
            for k in 0..num_protos {
                scaled_coeff[k] = coeff[k] * scales[k];
            }
            zp_offset = 0.0;
        }
        QuantMode::PerChannel {
            scales,
            zero_points,
            axis,
        } => {
            if axis != 2 {
                return Err(crate::Error::NotSupported(format!(
                    "per-channel quantization on axis {axis} not supported \
                    (only channel axis 2 is implemented on this kernel)"
                )));
            }
            for k in 0..num_protos {
                scaled_coeff[k] = coeff[k] * scales[k];
            }
            zp_offset = (0..num_protos)
                .map(|k| scaled_coeff[k] * zero_points[k] as f32)
                .sum();
        }
    }
    let mut mask_buf = vec![0u8; roi_h * roi_w];
    for y in 0..roi_h {
        let row_base = (y0 + y) * stride_y + (x0) * num_protos;
        let out_row = &mut mask_buf[y * roi_w..(y + 1) * roi_w];
        for (x, out_px) in out_row.iter_mut().enumerate() {
            let base = row_base + x * num_protos;
            // Dot product of scaled coefficients with raw i8 protos,
            // manually unrolled 4-wide.
            let mut acc = 0.0_f32;
            let mut k = 0;
            let chunks = num_protos / 4;
            for _ in 0..chunks {
                let p0 = protos[base + k] as f32;
                let p1 = protos[base + k + 1] as f32;
                let p2 = protos[base + k + 2] as f32;
                let p3 = protos[base + k + 3] as f32;
                acc += scaled_coeff[k] * p0
                    + scaled_coeff[k + 1] * p1
                    + scaled_coeff[k + 2] * p2
                    + scaled_coeff[k + 3] * p3;
                k += 4;
            }
            while k < num_protos {
                acc += scaled_coeff[k] * protos[base + k] as f32;
                k += 1;
            }
            // Compare against the folded zero-point threshold instead of 0.
            if acc > zp_offset {
                *out_px = 255;
            }
        }
    }
    Ok(ndarray::Array3::from_shape_vec((roi_h, roi_w, 1), mask_buf)
        .expect("mask_buf length matches roi_h * roi_w"))
}
#[allow(clippy::too_many_arguments)]
/// Scaled-resolution mask materialization for f32 protos.
///
/// Thin wrapper that forwards to `scaled_run` (defined elsewhere in this
/// module) with a unit scale and a pass-through element accessor, since f32
/// protos need no dequantization.
fn scaled_segmentations_f32_slice(
    detect: &[crate::DetectBox],
    coeff_all: &[f32],
    protos: &[f32],
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    scaled_run(
        detect,
        coeff_all,
        protos,
        proto_h,
        proto_w,
        num_protos,
        letterbox,
        width,
        height,
        1.0,
        // Identity accessor: raw f32 proto value, second argument unused.
        |p, _| *p,
    )
}
#[allow(clippy::too_many_arguments)]
/// Scaled-resolution mask materialization for f16 protos.
///
/// Thin wrapper that forwards to `scaled_run` (defined elsewhere in this
/// module) with a unit scale; the element accessor widens each f16 proto
/// value to f32.
fn scaled_segmentations_f16_slice(
    detect: &[crate::DetectBox],
    coeff_all: &[f32],
    protos: &[half::f16],
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    scaled_run(
        detect,
        coeff_all,
        protos,
        proto_h,
        proto_w,
        num_protos,
        letterbox,
        width,
        height,
        1.0,
        // Widen f16 -> f32; second argument unused.
        |p: &half::f16, _| p.to_f32(),
    )
}
#[allow(clippy::too_many_arguments)]
/// Scaled-resolution mask materialization for i8 protos.
///
/// Resolves a per-tensor scale and zero-point, then forwards to
/// `scaled_run` (defined elsewhere in this module): the element accessor
/// subtracts the zero-point per value and the scale is passed as a separate
/// factor (NOTE(review): how `scaled_run` applies that factor is not
/// visible in this chunk — confirm it multiplies the accumulated dot
/// product).
///
/// # Errors
/// `NotSupported` for per-channel quantization on this path.
fn scaled_segmentations_i8_slice(
    detect: &[crate::DetectBox],
    coeff_all: &[f32],
    protos: &[i8],
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    quant: &edgefirst_tensor::Quantization,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    use edgefirst_tensor::QuantMode;
    let (scale, zp) = match quant.mode() {
        QuantMode::PerTensor { scale, zero_point } => (scale, zero_point as f32),
        QuantMode::PerTensorSymmetric { scale } => (scale, 0.0),
        QuantMode::PerChannel { axis, .. } | QuantMode::PerChannelSymmetric { axis, .. } => {
            return Err(crate::Error::NotSupported(format!(
                "per-channel quantization (axis={axis}) on scaled seg path \
                not yet supported"
            )));
        }
    };
    scaled_run(
        detect,
        coeff_all,
        protos,
        proto_h,
        proto_w,
        num_protos,
        letterbox,
        width,
        height,
        scale,
        // Zero-point-corrected value; second argument unused.
        move |p: &i8, _| *p as f32 - zp,
    )
}
/// Portable scalar i8 dot product over the first `n` lanes of `coeff` and
/// `proto`, with each operand widened to i32 before multiplying.
///
/// # Panics
/// Panics if either slice is shorter than `n`.
#[cfg_attr(target_arch = "aarch64", allow(dead_code))]
#[inline(always)]
fn dot_i8_scalar(coeff: &[i8], proto: &[i8], n: usize) -> i32 {
    let (lhs, rhs) = (&coeff[..n], &proto[..n]);
    let mut total = 0_i32;
    let mut l4 = lhs.chunks_exact(4);
    let mut r4 = rhs.chunks_exact(4);
    // 4-wide unrolled body; integer addition order is irrelevant.
    for (a, b) in (&mut l4).zip(&mut r4) {
        total += a[0] as i32 * b[0] as i32
            + a[1] as i32 * b[1] as i32
            + a[2] as i32 * b[2] as i32
            + a[3] as i32 * b[3] as i32;
    }
    // Scalar tail for n not divisible by 4.
    for (&a, &b) in l4.remainder().iter().zip(r4.remainder()) {
        total += a as i32 * b as i32;
    }
    total
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON i8 dot product for aarch64 CPUs without the `dotprod` extension.
///
/// Processes 16 lanes per iteration using widening multiplies (`vmull_s8` /
/// `vmull_high_s8`) followed by pairwise add-accumulate into four i32 lanes
/// (`vpadalq_s16`), then an optional 8-lane step and a scalar tail for the
/// remainder.
///
/// # Safety
/// `coeff` and `proto` must each be valid for reads of `n` consecutive
/// `i8` values.
unsafe fn dot_i8_neon_base(coeff: *const i8, proto: *const i8, n: usize) -> i32 {
    use std::arch::aarch64::*;
    let mut acc = vdupq_n_s32(0);
    let full_chunks = n / 16;
    let mut offset = 0usize;
    for _ in 0..full_chunks {
        let c = vld1q_s8(coeff.add(offset));
        let p = vld1q_s8(proto.add(offset));
        // Widen i8*i8 -> i16 products for the low and high halves.
        let lo = vmull_s8(vget_low_s8(c), vget_low_s8(p));
        let hi = vmull_high_s8(c, p);
        // Pairwise-add the i16 products into the i32 accumulator.
        acc = vpadalq_s16(acc, lo);
        acc = vpadalq_s16(acc, hi);
        offset += 16;
    }
    // One 8-lane step if at least 8 values remain.
    let remainder = n - offset;
    if remainder >= 8 {
        let c = vld1_s8(coeff.add(offset));
        let p = vld1_s8(proto.add(offset));
        let prod = vmull_s8(c, p);
        acc = vpadalq_s16(acc, prod);
        offset += 8;
    }
    // Horizontal sum, then scalar tail for the final < 8 values.
    let mut scalar_acc = vaddvq_s32(acc);
    while offset < n {
        scalar_acc += *coeff.add(offset) as i32 * *proto.add(offset) as i32;
        offset += 1;
    }
    scalar_acc
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
/// NEON i8 dot product using the `SDOT` instruction, 16 lanes per iteration
/// with a scalar tail.
///
/// The instruction is emitted via inline assembly (with a local
/// `.arch_extension dotprod` directive), presumably so the rest of the
/// crate does not need to be compiled with the `dotprod` target feature.
///
/// # Safety
/// `coeff` and `proto` must each be valid for reads of `n` consecutive
/// `i8` values, and the executing CPU must support the `dotprod` extension
/// (callers gate on `is_aarch64_feature_detected!("dotprod")`) or the
/// instruction will fault.
unsafe fn dot_i8_neon_dotprod(coeff: *const i8, proto: *const i8, n: usize) -> i32 {
    use std::arch::aarch64::*;
    let mut acc = vdupq_n_s32(0);
    let full_chunks = n / 16;
    let mut offset = 0usize;
    for _ in 0..full_chunks {
        let c = vld1q_s8(coeff.add(offset));
        let p = vld1q_s8(proto.add(offset));
        // acc.4s += sdot(c.16b, p.16b): four i32 lanes, each the sum of
        // four i8*i8 products.
        let result: int32x4_t;
        core::arch::asm!(
            ".arch_extension dotprod",
            "sdot {acc:v}.4s, {a:v}.16b, {b:v}.16b",
            acc = inout(vreg) acc => result,
            a = in(vreg) c,
            b = in(vreg) p,
            options(pure, nomem, nostack),
        );
        acc = result;
        offset += 16;
    }
    // Horizontal sum, then scalar tail for the final < 16 values.
    let mut scalar_acc = vaddvq_s32(acc);
    while offset < n {
        scalar_acc += *coeff.add(offset) as i32 * *proto.add(offset) as i32;
        offset += 1;
    }
    scalar_acc
}
/// Fills `logits` (row-major `roi_h` x `roi_w`) with the integer mask
/// logits of one detection over the ROI
/// `[proto_x0, proto_x0 + roi_w) x [proto_y0, proto_y0 + roi_h)` of an
/// NHWC prototype map, using the NEON `sdot` kernel.
///
/// Each logit is `dot(coeff, proto_px) - zp_c * proto_sum(px) - bias`,
/// i.e. the zero-point-corrected quantized dot product. `proto_sums` is
/// only indexed when `zp_c != 0` (it may be empty otherwise).
///
/// Caller must have verified that the CPU supports the `dotprod`
/// feature before calling this function.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn compute_logits_dotprod(
    logits: &mut [i32],
    coeff: &[i8],
    protos: &[i8],
    proto_sums: &[i32],
    proto_w: usize,
    proto_x0: usize,
    proto_y0: usize,
    roi_w: usize,
    roi_h: usize,
    stride_y: usize,
    num_protos: usize,
    zp_c: i32,
    bias: i32,
) {
    for ly_idx in 0..roi_h {
        let py = proto_y0 + ly_idx;
        // Element offset of the first ROI pixel in this prototype row.
        let row_base = py * stride_y + proto_x0 * num_protos;
        for lx_idx in 0..roi_w {
            let pix_base = row_base + lx_idx * num_protos;
            let proto_px = &protos[pix_base..pix_base + num_protos];
            // SAFETY: both slices hold at least `num_protos` elements,
            // and the caller guarantees the `dotprod` CPU feature.
            let raw_dot =
                unsafe { dot_i8_neon_dotprod(coeff.as_ptr(), proto_px.as_ptr(), num_protos) };
            // Per-pixel coefficient-zero-point correction term.
            let correction = if zp_c != 0 {
                zp_c * proto_sums[py * proto_w + proto_x0 + lx_idx]
            } else {
                0
            };
            logits[ly_idx * roi_w + lx_idx] = raw_dot - correction - bias;
        }
    }
}
/// Same contract as `compute_logits_dotprod` — fills `logits` (row-major
/// `roi_h` x `roi_w`) with zero-point-corrected quantized dot products
/// over an NHWC prototype ROI — but uses the baseline NEON kernel, so it
/// works on any aarch64 CPU without the `dotprod` extension.
///
/// `proto_sums` is only indexed when `zp_c != 0` (it may be empty
/// otherwise).
#[cfg(target_arch = "aarch64")]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn compute_logits_base(
    logits: &mut [i32],
    coeff: &[i8],
    protos: &[i8],
    proto_sums: &[i32],
    proto_w: usize,
    proto_x0: usize,
    proto_y0: usize,
    roi_w: usize,
    roi_h: usize,
    stride_y: usize,
    num_protos: usize,
    zp_c: i32,
    bias: i32,
) {
    for ly_idx in 0..roi_h {
        let py = proto_y0 + ly_idx;
        // Element offset of the first ROI pixel in this prototype row.
        let row_base = py * stride_y + proto_x0 * num_protos;
        for lx_idx in 0..roi_w {
            let pix_base = row_base + lx_idx * num_protos;
            let proto_px = &protos[pix_base..pix_base + num_protos];
            // SAFETY: both slices hold at least `num_protos` elements.
            let raw_dot =
                unsafe { dot_i8_neon_base(coeff.as_ptr(), proto_px.as_ptr(), num_protos) };
            // Per-pixel coefficient-zero-point correction term.
            let correction = if zp_c != 0 {
                zp_c * proto_sums[py * proto_w + proto_x0 + lx_idx]
            } else {
                0
            };
            logits[ly_idx * roi_w + lx_idx] = raw_dot - correction - bias;
        }
    }
}
/// Fully-integer fast path for instance-segmentation masks when both the
/// mask coefficients and the prototype tensor are `i8` with per-tensor
/// quantization.
///
/// Per detection: dot the `i8` coefficient vector against the prototype
/// map over a tight ROI (pure integer arithmetic with the zero points
/// folded in analytically — see `bias` below), then bilinearly resample
/// the integer logits in fixed point and threshold at 0 to produce a
/// 0/255 mask tile clipped to the detection's bounding box. Detections
/// are processed in parallel via rayon.
///
/// Returns one `Segmentation` per entry of `detect`, or
/// `Error::NotSupported` when either tensor uses per-channel
/// quantization.
#[allow(clippy::too_many_arguments)]
fn scaled_segmentations_i8_i8(
    detect: &[crate::DetectBox],
    coeff_all: &[i8],
    coeff_quant: &edgefirst_tensor::Quantization,
    protos: &[i8],
    proto_quant: &edgefirst_tensor::Quantization,
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
    layout: edgefirst_decoder::ProtoLayout,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    use edgefirst_tensor::QuantMode;
    let _span = tracing::trace_span!(
        "mask_i8_fastpath",
        n = detect.len(),
        proto_h,
        proto_w,
        num_protos,
        width,
        height,
        ?layout,
    )
    .entered();
    // Per-tensor zero points; symmetric quantization implies zero point 0.
    // Per-channel modes are rejected because the bias folding below
    // assumes a single zero point per tensor.
    let zp_c: i32 = match coeff_quant.mode() {
        QuantMode::PerTensor { zero_point, .. } => zero_point,
        QuantMode::PerTensorSymmetric { .. } => 0,
        _ => {
            return Err(crate::Error::NotSupported(
                "per-channel coeff quantization not supported".into(),
            ))
        }
    };
    let zp_p: i32 = match proto_quant.mode() {
        QuantMode::PerTensor { zero_point, .. } => zero_point,
        QuantMode::PerTensorSymmetric { .. } => 0,
        _ => {
            return Err(crate::Error::NotSupported(
                "per-channel proto quantization not supported".into(),
            ))
        }
    };
    // Letterbox rectangle in normalized model coordinates; identity when
    // absent. Extents are clamped away from zero to avoid dividing by 0.
    let (lx0, lw, ly0, lh) = match letterbox {
        Some([lx0, ly0, lx1, ly1]) => {
            let lw = (lx1 - lx0).max(f32::EPSILON);
            let lh = (ly1 - ly0).max(f32::EPSILON);
            (lx0, lw, ly0, lh)
        }
        None => (0.0_f32, 1.0_f32, 0.0_f32, 1.0_f32),
    };
    let out_w = width as usize;
    let out_h = height as usize;
    let hw = proto_h * proto_w;
    // Per-pixel sums of the raw prototype values across channels, needed
    // to cancel the coefficient zero point. Skipped entirely (left empty)
    // when zp_c == 0, so it must not be indexed in that case.
    let proto_sums: Vec<i32> = if zp_c != 0 {
        match layout {
            edgefirst_decoder::ProtoLayout::Nhwc => (0..hw)
                .map(|px_idx| {
                    let base = px_idx * num_protos;
                    let mut s: i32 = 0;
                    for k in 0..num_protos {
                        s += protos[base + k] as i32;
                    }
                    s
                })
                .collect(),
            edgefirst_decoder::ProtoLayout::Nchw => {
                // Channel-major: accumulate one plane at a time so each
                // plane is streamed sequentially.
                let mut sums = vec![0i32; hw];
                for c in 0..num_protos {
                    let plane = &protos[c * hw..];
                    for (px, s) in sums.iter_mut().enumerate() {
                        *s += plane[px] as i32;
                    }
                }
                sums
            }
        }
    } else {
        Vec::new()
    };
    // Runtime CPU-feature check: prefer the `sdot` kernel when available.
    #[cfg(target_arch = "aarch64")]
    let use_dotprod = std::arch::is_aarch64_feature_detected!("dotprod");
    let stride_y = proto_w * num_protos;
    detect
        .par_iter()
        .enumerate()
        .map(|(i, det)| {
            // Coefficient row for this detection.
            let coeff = &coeff_all[i * num_protos..(i + 1) * num_protos];
            // Un-letterbox the bbox into normalized output coordinates.
            let bbox = det.bbox.to_canonical();
            let xmin = ((bbox.xmin - lx0) / lw).clamp(0.0, 1.0);
            let ymin = ((bbox.ymin - ly0) / lh).clamp(0.0, 1.0);
            let xmax = ((bbox.xmax - lx0) / lw).clamp(0.0, 1.0);
            let ymax = ((bbox.ymax - ly0) / lh).clamp(0.0, 1.0);
            // Bbox in output pixels, clamped to the image; at least 1x1.
            let px0 = (xmin * out_w as f32).round() as usize;
            let py0 = (ymin * out_h as f32).round() as usize;
            let px1 = ((xmax * out_w as f32).round() as usize).min(out_w);
            let py1 = ((ymax * out_h as f32).round() as usize).min(out_h);
            let bbox_w = px1.saturating_sub(px0).max(1);
            let bbox_h = py1.saturating_sub(py0).max(1);
            // Map an output-pixel center to fractional prototype coords,
            // accounting for the letterbox and the half-pixel offsets of
            // both grids.
            let sample_x_at = |px: f32| -> f32 {
                let model_x_norm = lx0 + (px + 0.5) / out_w as f32 * lw;
                model_x_norm * proto_w as f32 - 0.5
            };
            let sample_y_at = |py: f32| -> f32 {
                let model_y_norm = ly0 + (py + 0.5) / out_h as f32 * lh;
                model_y_norm * proto_h as f32 - 0.5
            };
            // Prototype ROI touched by bilinear sampling of the bbox,
            // padded by one for the upper interpolation neighbor and
            // clamped to the prototype grid.
            let s_x_min = sample_x_at(px0 as f32);
            let s_x_max = sample_x_at((px1 as f32) - 1.0);
            let s_y_min = sample_y_at(py0 as f32);
            let s_y_max = sample_y_at((py1 as f32) - 1.0);
            let proto_x0 = (s_x_min.floor() as isize)
                .max(0)
                .min(proto_w.saturating_sub(1) as isize) as usize;
            let proto_x1 = ((s_x_max.ceil() as isize) + 1).max(0).min(proto_w as isize) as usize;
            let proto_y0 = (s_y_min.floor() as isize)
                .max(0)
                .min(proto_h.saturating_sub(1) as isize) as usize;
            let proto_y1 = ((s_y_max.ceil() as isize) + 1).max(0).min(proto_h as isize) as usize;
            let roi_w = proto_x1.saturating_sub(proto_x0).max(1);
            let roi_h = proto_y1.saturating_sub(proto_y0).max(1);
            // Expanding sum_k (c_k - zp_c)(p_k - zp_p) gives
            //   raw_dot - zp_c*proto_sum - zp_p*coeff_sum + N*zp_c*zp_p.
            // `bias` folds the two per-detection constant terms; the
            // per-pixel zp_c*proto_sum term is subtracted in the kernels.
            let coeff_sum: i32 = coeff.iter().map(|&c| c as i32).sum();
            let bias = zp_p * coeff_sum - (num_protos as i32) * zp_c * zp_p;
            let mut logits = vec![0_i32; roi_h * roi_w];
            match layout {
                edgefirst_decoder::ProtoLayout::Nhwc => {
                    #[cfg(target_arch = "aarch64")]
                    {
                        if use_dotprod {
                            compute_logits_dotprod(
                                &mut logits,
                                coeff,
                                protos,
                                &proto_sums,
                                proto_w,
                                proto_x0,
                                proto_y0,
                                roi_w,
                                roi_h,
                                stride_y,
                                num_protos,
                                zp_c,
                                bias,
                            );
                        } else {
                            compute_logits_base(
                                &mut logits,
                                coeff,
                                protos,
                                &proto_sums,
                                proto_w,
                                proto_x0,
                                proto_y0,
                                roi_w,
                                roi_h,
                                stride_y,
                                num_protos,
                                zp_c,
                                bias,
                            );
                        }
                    }
                    #[cfg(not(target_arch = "aarch64"))]
                    {
                        // Portable scalar kernel; same math as the NEON
                        // paths above.
                        for ly_idx in 0..roi_h {
                            let py = proto_y0 + ly_idx;
                            let row_base = py * stride_y + proto_x0 * num_protos;
                            for lx_idx in 0..roi_w {
                                let pix_base = row_base + lx_idx * num_protos;
                                let proto_px = &protos[pix_base..pix_base + num_protos];
                                let raw_dot = dot_i8_scalar(coeff, proto_px, num_protos);
                                let correction = if zp_c != 0 {
                                    zp_c * proto_sums[py * proto_w + proto_x0 + lx_idx]
                                } else {
                                    0
                                };
                                logits[ly_idx * roi_w + lx_idx] = raw_dot - correction - bias;
                            }
                        }
                    }
                }
                edgefirst_decoder::ProtoLayout::Nchw => {
                    // Channel-major layout: accumulate one plane at a time
                    // so each plane is streamed sequentially...
                    for c in 0..num_protos {
                        let plane = &protos[c * hw..];
                        let coeff_c = coeff[c] as i32;
                        for ly_idx in 0..roi_h {
                            let py = proto_y0 + ly_idx;
                            let row_start = py * proto_w + proto_x0;
                            let out_row_start = ly_idx * roi_w;
                            for lx_idx in 0..roi_w {
                                logits[out_row_start + lx_idx] +=
                                    coeff_c * plane[row_start + lx_idx] as i32;
                            }
                        }
                    }
                    // ...then apply the zero-point correction and bias in
                    // a second pass.
                    for ly_idx in 0..roi_h {
                        let py = proto_y0 + ly_idx;
                        for lx_idx in 0..roi_w {
                            let idx = ly_idx * roi_w + lx_idx;
                            let correction = if zp_c != 0 {
                                zp_c * proto_sums[py * proto_w + proto_x0 + lx_idx]
                            } else {
                                0
                            };
                            logits[idx] -= correction + bias;
                        }
                    }
                }
            }
            let roi_last_x = roi_w.saturating_sub(1);
            let roi_last_y = roi_h.saturating_sub(1);
            // Fixed-point bilinear weights (10 fractional bits) keep the
            // resampling integer-only.
            const FRAC_BITS: i32 = 10;
            const FRAC_SCALE: i32 = 1 << FRAC_BITS; let x_coords: Vec<(usize, usize, i32)> = (0..bbox_w)
                .map(|xi| {
                    // Per-column interpolation neighbors and weight,
                    // precomputed once and reused for every row.
                    let sample_x = sample_x_at((px0 + xi) as f32) - proto_x0 as f32;
                    let x_floor = sample_x.floor();
                    let x_lo = (x_floor as isize).max(0).min(roi_last_x as isize) as usize;
                    let x_hi = (x_lo + 1).min(roi_w - 1);
                    let x_frac = ((sample_x - x_floor).clamp(0.0, 1.0) * FRAC_SCALE as f32) as i32;
                    (x_lo, x_hi, x_frac)
                })
                .collect();
            let mut tile_buf = vec![0u8; bbox_h * bbox_w];
            for yi in 0..bbox_h {
                let sample_y = sample_y_at((py0 + yi) as f32) - proto_y0 as f32;
                let y_floor = sample_y.floor();
                let y_lo = (y_floor as isize).max(0).min(roi_last_y as isize) as usize;
                let y_hi = (y_lo + 1).min(roi_h - 1);
                let y_frac = ((sample_y - y_floor).clamp(0.0, 1.0) * FRAC_SCALE as f32) as i32;
                let y_frac_inv = FRAC_SCALE - y_frac;
                let row_lo = &logits[y_lo * roi_w..y_lo * roi_w + roi_w];
                let row_hi = &logits[y_hi * roi_w..y_hi * roi_w + roi_w];
                let out_row = &mut tile_buf[yi * bbox_w..(yi + 1) * bbox_w];
                for (xi, &(x_lo, x_hi, x_frac)) in x_coords.iter().enumerate() {
                    let tl = row_lo[x_lo];
                    let tr = row_lo[x_hi];
                    let bl = row_hi[x_lo];
                    let br = row_hi[x_hi];
                    // Sign-bit trick: the bitwise AND is negative only when
                    // all four corner logits are negative, in which case
                    // their convex combination is negative too — the pixel
                    // stays 0 without interpolating.
                    if (tl & tr & bl & br) < 0 {
                        continue;
                    }
                    // All four corners positive: interpolation is positive.
                    if tl > 0 && tr > 0 && bl > 0 && br > 0 {
                        out_row[xi] = 255;
                        continue;
                    }
                    // Mixed signs: full fixed-point bilinear interpolation,
                    // widened to i64 to avoid overflow, thresholded at 0.
                    let x_frac_inv = FRAC_SCALE - x_frac;
                    let l0 = tl as i64 * x_frac_inv as i64 + tr as i64 * x_frac as i64;
                    let l1 = bl as i64 * x_frac_inv as i64 + br as i64 * x_frac as i64;
                    let logit = l0 * y_frac_inv as i64 + l1 * y_frac as i64;
                    out_row[xi] = if logit > 0 { 255 } else { 0 };
                }
            }
            // Binary mask tile as an (H, W, 1) array, bbox-relative.
            let tile = ndarray::Array3::from_shape_vec((bbox_h, bbox_w, 1), tile_buf)
                .expect("tile_buf length matches bbox_h * bbox_w");
            Ok(edgefirst_decoder::Segmentation {
                xmin,
                ymin,
                xmax,
                ymax,
                segmentation: tile,
            })
        })
        .collect()
}
/// Shared floating-point mask renderer for the scaled segmentation
/// paths.
///
/// `load_f32` converts one prototype element to `f32` (its second
/// argument is passed as `0.0` by this function). Because the mask is
/// produced by thresholding each logit at 0, any finite positive
/// `acc_scale` cannot change the result, so the scale is validated but
/// never applied.
///
/// For each detection: compute float logits (`coeff` dotted with the
/// dequantized NHWC prototype) over a tight prototype ROI, bilinearly
/// resample them to the detection's output-pixel bbox, and threshold at
/// 0 into a 0/255 tile. Detections are processed in parallel via rayon.
///
/// Returns one `Segmentation` per entry of `detect`, or
/// `Error::NotSupported` when `acc_scale` is non-finite or non-positive.
#[allow(clippy::too_many_arguments)]
fn scaled_run<P: Copy + Sync>(
    detect: &[crate::DetectBox],
    coeff_all: &[f32],
    protos: &[P],
    proto_h: usize,
    proto_w: usize,
    num_protos: usize,
    letterbox: Option<[f32; 4]>,
    width: u32,
    height: u32,
    acc_scale: f32,
    load_f32: impl Fn(&P, f32) -> f32 + Copy + Sync,
) -> crate::Result<Vec<edgefirst_decoder::Segmentation>> {
    // The sign-threshold optimization requires a finite, positive scale.
    // This condition is invariant across detections, so fail fast once up
    // front instead of re-checking inside the parallel per-detection
    // closure (which also silently skipped the check when `detect` was
    // empty).
    if !acc_scale.is_finite() || acc_scale <= 0.0 {
        return Err(crate::Error::NotSupported(format!(
            "acc_scale must be finite and positive for sign-threshold optimization (got {acc_scale})"
        )));
    }
    // Letterbox rectangle in normalized model coordinates; identity when
    // absent. Extents are clamped away from zero to avoid dividing by 0.
    let (lx0, lw, ly0, lh) = match letterbox {
        Some([lx0, ly0, lx1, ly1]) => {
            let lw = (lx1 - lx0).max(f32::EPSILON);
            let lh = (ly1 - ly0).max(f32::EPSILON);
            (lx0, lw, ly0, lh)
        }
        None => (0.0_f32, 1.0_f32, 0.0_f32, 1.0_f32),
    };
    let out_w = width as usize;
    let out_h = height as usize;
    let stride_y = proto_w * num_protos;
    detect
        .par_iter()
        .enumerate()
        .map(|(i, det)| {
            // Coefficient row for this detection.
            let coeff = &coeff_all[i * num_protos..(i + 1) * num_protos];
            // Un-letterbox the bbox into normalized output coordinates.
            let bbox = det.bbox.to_canonical();
            let xmin = ((bbox.xmin - lx0) / lw).clamp(0.0, 1.0);
            let ymin = ((bbox.ymin - ly0) / lh).clamp(0.0, 1.0);
            let xmax = ((bbox.xmax - lx0) / lw).clamp(0.0, 1.0);
            let ymax = ((bbox.ymax - ly0) / lh).clamp(0.0, 1.0);
            // Bbox in output pixels, clamped to the image; at least 1x1.
            let px0 = (xmin * out_w as f32).round() as usize;
            let py0 = (ymin * out_h as f32).round() as usize;
            let px1 = ((xmax * out_w as f32).round() as usize).min(out_w);
            let py1 = ((ymax * out_h as f32).round() as usize).min(out_h);
            let bbox_w = px1.saturating_sub(px0).max(1);
            let bbox_h = py1.saturating_sub(py0).max(1);
            // Map an output-pixel center to fractional prototype coords,
            // accounting for the letterbox and half-pixel offsets.
            let sample_x_at = |px: f32| -> f32 {
                let model_x_norm = lx0 + (px + 0.5) / out_w as f32 * lw;
                model_x_norm * proto_w as f32 - 0.5
            };
            let sample_y_at = |py: f32| -> f32 {
                let model_y_norm = ly0 + (py + 0.5) / out_h as f32 * lh;
                model_y_norm * proto_h as f32 - 0.5
            };
            // Prototype ROI touched by bilinear sampling of the bbox,
            // padded by one for the upper interpolation neighbor and
            // clamped to the prototype grid.
            let s_x_min = sample_x_at(px0 as f32);
            let s_x_max = sample_x_at((px1 as f32) - 1.0);
            let s_y_min = sample_y_at(py0 as f32);
            let s_y_max = sample_y_at((py1 as f32) - 1.0);
            let proto_x0 = (s_x_min.floor() as isize)
                .max(0)
                .min(proto_w.saturating_sub(1) as isize) as usize;
            let proto_x1 = ((s_x_max.ceil() as isize) + 1).max(0).min(proto_w as isize) as usize;
            let proto_y0 = (s_y_min.floor() as isize)
                .max(0)
                .min(proto_h.saturating_sub(1) as isize) as usize;
            let proto_y1 = ((s_y_max.ceil() as isize) + 1).max(0).min(proto_h as isize) as usize;
            let roi_w = proto_x1.saturating_sub(proto_x0).max(1);
            let roi_h = proto_y1.saturating_sub(proto_y0).max(1);
            // Float logits over the ROI: coeff dotted with the dequantized
            // prototype pixel, 4-way unrolled with a scalar tail.
            let mut logits = vec![0.0_f32; roi_h * roi_w];
            for ly_idx in 0..roi_h {
                let py = proto_y0 + ly_idx;
                let row_base = py * stride_y + proto_x0 * num_protos;
                for lx_idx in 0..roi_w {
                    let pix_base = row_base + lx_idx * num_protos;
                    let mut acc = 0.0_f32;
                    let mut k = 0;
                    let chunks = num_protos / 4;
                    for _ in 0..chunks {
                        acc += coeff[k] * load_f32(&protos[pix_base + k], 0.0)
                            + coeff[k + 1] * load_f32(&protos[pix_base + k + 1], 0.0)
                            + coeff[k + 2] * load_f32(&protos[pix_base + k + 2], 0.0)
                            + coeff[k + 3] * load_f32(&protos[pix_base + k + 3], 0.0);
                        k += 4;
                    }
                    while k < num_protos {
                        acc += coeff[k] * load_f32(&protos[pix_base + k], 0.0);
                        k += 1;
                    }
                    logits[ly_idx * roi_w + lx_idx] = acc;
                }
            }
            let roi_last_x = roi_w.saturating_sub(1);
            let roi_last_y = roi_h.saturating_sub(1);
            // Per-column interpolation neighbors and weight, precomputed
            // once and reused for every row.
            let x_coords: Vec<(u32, u32, f32)> = (0..bbox_w)
                .map(|xi| {
                    let sample_x = sample_x_at((px0 + xi) as f32) - proto_x0 as f32;
                    let x_floor = sample_x.floor();
                    let x_lo = (x_floor as isize).max(0).min(roi_last_x as isize) as u32;
                    let x_hi = (x_lo as usize + 1).min(roi_w - 1) as u32;
                    let x_frac = (sample_x - x_floor).clamp(0.0, 1.0);
                    (x_lo, x_hi, x_frac)
                })
                .collect();
            let mut tile_buf = vec![0u8; bbox_h * bbox_w];
            for yi in 0..bbox_h {
                let sample_y = sample_y_at((py0 + yi) as f32) - proto_y0 as f32;
                let y_floor = sample_y.floor();
                let y_lo = (y_floor as isize).max(0).min(roi_last_y as isize) as usize;
                let y_hi = (y_lo + 1).min(roi_h - 1);
                let y_frac = (sample_y - y_floor).clamp(0.0, 1.0);
                let row_lo = &logits[y_lo * roi_w..y_lo * roi_w + roi_w];
                let row_hi = &logits[y_hi * roi_w..y_hi * roi_w + roi_w];
                let out_row = &mut tile_buf[yi * bbox_w..(yi + 1) * bbox_w];
                for (xi, &(x_lo, x_hi, x_frac)) in x_coords.iter().enumerate() {
                    let (xl, xh) = (x_lo as usize, x_hi as usize);
                    // Bilinear interpolation, then sign threshold.
                    let l0 = row_lo[xl] + (row_lo[xh] - row_lo[xl]) * x_frac;
                    let l1 = row_hi[xl] + (row_hi[xh] - row_hi[xl]) * x_frac;
                    let logit = l0 + (l1 - l0) * y_frac;
                    out_row[xi] = if logit > 0.0 { 255 } else { 0 };
                }
            }
            // Binary mask tile as an (H, W, 1) array, bbox-relative.
            let tile = ndarray::Array3::from_shape_vec((bbox_h, bbox_w, 1), tile_buf)
                .expect("tile_buf length matches bbox_h * bbox_w");
            Ok(edgefirst_decoder::Segmentation {
                xmin,
                ymin,
                xmax,
                ymax,
                segmentation: tile,
            })
        })
        .collect()
}