#![allow(
clippy::cast_sign_loss,
clippy::cast_possible_truncation,
clippy::cast_possible_wrap,
clippy::similar_names
)]
use std::marker::PhantomData;
use crate::camera_math::CameraState;
use crate::fixed::ftol;
use crate::gline::derive_gline_frustum;
use crate::grouscan::{grouscan_run, CfType, GrouscanInputs, CF_SEED_INDEX};
use crate::opticast::camera_column_slice;
use crate::opticast_prelude::{OpticastPrelude, PREC};
use crate::rasterizer::{Rasterizer, ScanScratch};
use crate::ray_step::RayStep;
use crate::scan_loops::ScanContext;
/// Copyable raw-pointer view of a colour framebuffer and a depth buffer.
///
/// Built from two exclusive slices borrowed for `'a`; the pointers and
/// lengths are captured once and the handle can then be copied freely.
/// All writes go through `unsafe` methods, so the caller is responsible
/// for index validity and for not racing on the same element.
#[derive(Clone, Copy, Debug)]
pub struct RasterTarget<'a> {
    /// Base pointer of the colour buffer.
    fb_ptr: *mut u32,
    /// Element count of the colour buffer.
    fb_len: usize,
    /// Base pointer of the depth buffer.
    zb_ptr: *mut f32,
    /// Element count of the depth buffer.
    zb_len: usize,
    /// Ties the handle to the lifetime of the borrowed buffers.
    _marker: PhantomData<&'a mut [u32]>,
}

// SAFETY: NOTE(review): presumably sound because every write goes through an
// `unsafe` method whose caller must guarantee non-overlapping access —
// confirm against the code that shares this handle across threads.
unsafe impl Send for RasterTarget<'_> {}
unsafe impl Sync for RasterTarget<'_> {}

impl<'a> RasterTarget<'a> {
    /// Captures the base pointer and length of both buffers.
    #[must_use]
    pub fn new(framebuffer: &'a mut [u32], zbuffer: &'a mut [f32]) -> Self {
        let fb_len = framebuffer.len();
        let zb_len = zbuffer.len();
        RasterTarget {
            fb_ptr: framebuffer.as_mut_ptr(),
            fb_len,
            zb_ptr: zbuffer.as_mut_ptr(),
            zb_len,
            _marker: PhantomData,
        }
    }

    /// Number of `u32` elements in the colour buffer.
    #[must_use]
    pub fn fb_len(self) -> usize {
        self.fb_len
    }

    /// Base pointer of the colour buffer.
    #[must_use]
    pub fn fb_ptr(self) -> *mut u32 {
        self.fb_ptr
    }

    /// Base pointer of the depth buffer.
    #[must_use]
    pub fn zb_ptr(self) -> *mut f32 {
        self.zb_ptr
    }

    /// Stores `color` at element `idx` of the colour buffer.
    ///
    /// # Safety
    ///
    /// `idx` must be less than the colour buffer length (only checked in
    /// debug builds), and no other access to that element may race with
    /// this write.
    pub unsafe fn write_color(self, idx: usize, color: u32) {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: the caller guarantees `idx` is in bounds (see `# Safety`).
        unsafe {
            *self.fb_ptr.add(idx) = color;
        }
    }

    /// Stores `z` at element `idx` of the depth buffer.
    ///
    /// # Safety
    ///
    /// `idx` must be less than the depth buffer length (only checked in
    /// debug builds), and no other access to that element may race with
    /// this write.
    pub unsafe fn write_depth(self, idx: usize, z: f32) {
        debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
        // SAFETY: the caller guarantees `idx` is in bounds (see `# Safety`).
        unsafe {
            *self.zb_ptr.add(idx) = z;
        }
    }
}
// The casts below deliberately reinterpret the shifted distance as an index;
// out-of-range results are absorbed by the checked table lookup.
#[allow(
    clippy::cast_sign_loss,
    clippy::cast_possible_wrap,
    clippy::many_single_char_names
)]
/// Blends the low three 8-bit channels of `col` toward `fog_col`.
///
/// The blend weight is `foglut[dist >> 20]`, masked to 15 bits; an index
/// outside the table (including one produced by a negative `dist`) uses the
/// maximum weight 32767. Bits 24..32 of `col` pass through unchanged.
/// An empty `foglut` disables fog and returns `col` as-is.
fn fog_blend(col: i32, dist: i32, foglut: &[i32], fog_col: i32) -> i32 {
    if foglut.is_empty() {
        return col;
    }
    let slot = (dist >> 20) as usize;
    let weight = foglut.get(slot).map_or(32767, |&w| w) & 32767;
    // Signed per-channel delta scaled by weight / 32768, moved back into place.
    let channel_delta = |shift: u32| -> i32 {
        let from = (col >> shift) & 255;
        let to = (fog_col >> shift) & 255;
        (((to - from) * weight) >> 15) << shift
    };
    channel_delta(0) + channel_delta(8) + channel_delta(16) + col
}
/// Moves `scratch.sky_cur_lng` to the sky row matching the ray direction
/// `(vx1, vy1)` and refreshes `scratch.sky_off` from it.
///
/// * A negative `sky_cur_lng` means no row has been chosen yet: the row is
///   computed from `atan2(vy1, vx1)` scaled by `sky.lng_mul` and wrapped into
///   `0..sky.ysiz`.
/// * Otherwise the row is walked one step at a time, wrapping at the table
///   ends: upwards when `sky_cur_dir` is negative, downwards otherwise,
///   until the `sky.lng` entry comparison against the ray stops the walk.
///
/// NOTE(review): the walks have no iteration cap; they rely on `sky.lng`
/// containing an entry that satisfies the stop test — confirm with the table
/// builder.
fn sky_per_ray_update(
    scratch: &mut crate::rasterizer::ScanScratch,
    sky: &crate::sky::Sky,
    vx1: f32,
    vy1: f32,
) {
    let rows = sky.ysiz;
    let current = scratch.sky_cur_lng;

    if current < 0 {
        // First ray: derive the row directly from the heading angle.
        let angle = vy1.atan2(vx1) + std::f32::consts::PI;
        let guess = ftol(angle * sky.lng_mul - 0.5);
        // One unsigned compare catches both negative and too-large rows.
        scratch.sky_cur_lng = if (guess as u32) >= (rows as u32) {
            guess.rem_euclid(rows)
        } else {
            guess
        };
    } else if scratch.sky_cur_dir < 0 {
        // Probe the next row up; commit it only while the test keeps failing.
        let mut probe = if current + 1 >= rows { 0 } else { current + 1 };
        loop {
            let edge = sky.lng[probe as usize];
            if edge[0] * vy1 <= edge[1] * vx1 {
                break;
            }
            scratch.sky_cur_lng = probe;
            probe = if probe + 1 >= rows { 0 } else { probe + 1 };
        }
    } else {
        // Step the committed row downwards until its entry passes the test.
        loop {
            let edge = sky.lng[scratch.sky_cur_lng as usize];
            if edge[0] * vy1 >= edge[1] * vx1 {
                break;
            }
            let below = scratch.sky_cur_lng - 1;
            scratch.sky_cur_lng = if below < 0 { rows - 1 } else { below };
        }
    }

    scratch.sky_off = scratch.sky_cur_lng * sky.bpl;
}
/// Per-frame values copied out of a `ScanContext` by `frame_setup` so the
/// per-ray and per-scanline callbacks can run without the context.
#[derive(Clone)]
struct FrameCache {
// Read by `hrend` / `vrend` to compute the per-pixel direction terms.
ray_step: RayStep,
// Passed to `derive_gline_frustum` in `gline`.
camera_state: CameraState,
// Source of `forward_z_sign`, `max_scan_dist`, `li_pos`, `column_index`,
// `y_lookup` and `x_mip` in `gline`.
prelude: OpticastPrelude,
// Stored into the seed `CfType` as `z0` / `z1` in `gline`.
gstartz0: i32,
gstartz1: i32,
// Forwarded to `grouscan_run` in `gline`.
vptr_offset: usize,
}
/// `Rasterizer` implementation that writes colour and depth into a
/// caller-provided framebuffer / z-buffer pair.
///
/// `frame_setup` must be called before `gline`, `hrend` or `vrend`; those
/// methods panic while `frame` is still `None`.
#[allow(dead_code)]
#[derive(Clone)]
pub struct ScalarRasterizer<'a> {
// Raw-pointer view of the output colour and depth buffers.
target: RasterTarget<'a>,
// Row stride in pixels: a pixel index is `sy * pitch_pixels + x`.
pitch_pixels: usize,
// World data handed to `camera_column_slice` and `grouscan_run`.
slab_buf: &'a [u8],
column_offsets: &'a [u32],
// One entry more than the mip count passed to `grouscan_run`
// (`len() - 1`, raised to at least 1).
mip_base_offsets: &'a [usize],
// Passed to `derive_gline_frustum` and used for the `gxmax` clamp in `gline`.
vsid: u32,
// Optional sky; `None` until `with_sky` is called.
sky: Option<&'a crate::sky::Sky>,
// Populated by `frame_setup`; `None` right after construction.
frame: Option<FrameCache>,
}
// Compile-time check that `ScalarRasterizer` is `Send` and `Sync`.
// The closure is never called; it only has to type-check, so the build
// fails if either bound stops holding.
const _: fn() = || {
fn assert_send<T: Send>() {}
fn assert_sync<T: Sync>() {}
assert_send::<ScalarRasterizer<'_>>();
assert_sync::<ScalarRasterizer<'_>>();
};
impl<'a> ScalarRasterizer<'a> {
#[must_use]
pub fn new(
framebuffer: &'a mut [u32],
zbuffer: &'a mut [f32],
pitch_pixels: usize,
slab_buf: &'a [u8],
column_offsets: &'a [u32],
mip_base_offsets: &'a [usize],
vsid: u32,
) -> Self {
Self {
target: RasterTarget::new(framebuffer, zbuffer),
pitch_pixels,
slab_buf,
column_offsets,
mip_base_offsets,
vsid,
sky: None,
frame: None,
}
}
#[must_use]
pub fn with_sky(mut self, sky: &'a crate::sky::Sky) -> Self {
self.sky = Some(sky);
self
}
}
impl Rasterizer for ScalarRasterizer<'_> {
/// Snapshots the per-frame values of `ctx` into `self.frame`, replacing
/// any previously cached frame.
fn frame_setup(&mut self, ctx: &ScanContext<'_>) {
    let snapshot = FrameCache {
        ray_step: *ctx.rs,
        camera_state: *ctx.camera_state,
        prelude: ctx.prelude.clone(),
        gstartz0: ctx.camera_gstartz0,
        gstartz1: ctx.camera_gstartz1,
        vptr_offset: ctx.camera_vptr_offset,
    };
    self.frame = Some(snapshot);
}
/// Prepares `scratch` for one ray and runs `grouscan_run` on it.
///
/// `length`, `(x0, y0)` and `(x1, y1)` are forwarded to
/// `derive_gline_frustum`; its result seeds the stepping state
/// (`gixy`, `gpz`, `gdz`), the initial `cf` entry and the scan limit
/// `gxmax`. The value returned by `grouscan_run` is discarded.
///
/// # Panics
///
/// Panics if `frame_setup` has not been called, or if the mip count
/// derived from `mip_base_offsets` does not fit in a `u32`.
#[allow(clippy::too_many_lines)]
fn gline(
&mut self,
scratch: &mut ScanScratch,
length: u32,
x0: f32,
y0: f32,
x1: f32,
y1: f32,
) {
let cache = self
.frame
.as_ref()
.expect("gline called before frame_setup");
let leng = length as i32;
let f = derive_gline_frustum(
&cache.camera_state,
&cache.prelude,
self.vsid,
length,
x0,
y0,
x1,
y1,
);
scratch.gixy = f.gixy;
scratch.gpz = f.gpz;
scratch.gdz = f.gdz;
// Fixed-point scale, and that scale divided by the ray length
// (0 when the length is not positive, which avoids a division by zero).
#[allow(clippy::cast_precision_loss)]
let cmpprec = PREC as f32;
#[allow(clippy::cast_precision_loss)]
let cmprecip = if leng > 0 {
cmpprec / (leng as f32)
} else {
0.0
};
// The sign of `forward_z_sign` selects which frustum endpoint is the start
// (`cx0`, `cy0`) and therefore the sign of the per-step increments.
let (gi0, gi1, cx0, cy0) = if cache.prelude.forward_z_sign < 0 {
(
ftol((f.vd1 - f.vd0) * cmprecip),
ftol((f.vz1 - f.vz0) * cmprecip),
ftol(f.vd0 * cmpprec),
ftol(f.vz0 * cmpprec),
)
} else {
(
ftol((f.vd0 - f.vd1) * cmprecip),
ftol((f.vz0 - f.vz1) * cmprecip),
ftol(f.vd1 * cmpprec),
ftol(f.vz1 * cmpprec),
)
};
// End values after `leng` steps, computed with wrapping arithmetic.
let cx1 = leng.wrapping_mul(gi0).wrapping_add(cx0);
let cy1 = leng.wrapping_mul(gi1).wrapping_add(cy0);
scratch.gi0 = gi0;
scratch.gi1 = gi1;
// Seed entry: covers `gscanptr .. gscanptr + leng` with the cached start z range.
let gscanptr_isize = scratch.gscanptr as isize;
scratch.cf[CF_SEED_INDEX] = CfType {
i0: gscanptr_isize,
i1: gscanptr_isize + leng as isize,
z0: cache.gstartz0,
z1: cache.gstartz1,
cx0,
cy0,
cx1,
cy1,
};
// Scan limit starts at the configured maximum and is lowered per axis below.
let mut gxmax = cache.prelude.max_scan_dist;
scratch.skycast.dist = gxmax;
let li_pos = cache.prelude.li_pos;
let vsid_signed = self.vsid as i32;
// Axis 0: cell count `j0` in the stepping direction given by the sign of
// `gixy[0]`, turned into `gdz[0] * j0 + gpz[0]` (with `gpz` reinterpreted as
// unsigned) in wrapping 64-bit arithmetic.
let j0 = if f.gixy[0] < 0 {
li_pos[0]
} else {
vsid_signed - 1 - li_pos[0]
};
let q0 = (i64::from(f.gdz[0]).wrapping_mul(i64::from(j0)))
.wrapping_add(i64::from(f.gpz[0] as u32));
// Unsigned comparison: a negative `q0` becomes huge and never lowers the limit.
if (q0 as u64) < u64::from(gxmax as u32) {
gxmax = q0 as i32;
scratch.skycast.dist = i32::MAX;
}
// Axis 1: same computation with the second component.
let j1 = if f.gixy[1] < 0 {
li_pos[1]
} else {
vsid_signed - 1 - li_pos[1]
};
let q1 = (i64::from(f.gdz[1]).wrapping_mul(i64::from(j1)))
.wrapping_add(i64::from(f.gpz[1] as u32));
if (q1 as u64) < u64::from(gxmax as u32) {
gxmax = q1 as i32;
scratch.skycast.dist = i32::MAX;
}
scratch.gxmax = gxmax;
// Only when a sky is attached: refresh the sky row state for this ray.
if let Some(sky) = self.sky {
sky_per_ray_update(scratch, sky, f.vx1, f.vy1);
}
// Column data at the cached column index; an empty slice if the lookup fails.
let column = camera_column_slice(
self.slab_buf,
self.column_offsets,
cache.prelude.column_index,
)
.unwrap_or(&[]);
// Local copy so the side-shade override below does not modify `scratch.gcsub`.
let mut gcsub_local: [i64; 9] = scratch.gcsub;
if scratch.sideshademode {
// Entries 0 and 1 are replaced by one of entries 4/5 and 6/7, chosen by
// the sign of the matching `gixy` component.
let lane0_idx = if f.gixy[0] < 0 { 4 } else { 5 };
let lane1_idx = if f.gixy[1] < 0 { 6 } else { 7 };
gcsub_local[0] = gcsub_local[lane0_idx];
gcsub_local[1] = gcsub_local[lane1_idx];
}
let inputs = GrouscanInputs {
column,
gylookup: &cache.prelude.y_lookup,
gcsub: &gcsub_local,
slab_buf: self.slab_buf,
column_offsets: self.column_offsets,
mip_base_offsets: self.mip_base_offsets,
vsid: self.vsid,
sky: self.sky.map(crate::grouscan::SkyRef::from_sky),
};
// Mip count is one less than the number of base offsets (saturating at 0);
// at least 1 is passed on.
let gmipnum = u32::try_from(self.mip_base_offsets.len().saturating_sub(1))
.expect("mip count fits in u32");
let _ = grouscan_run(
scratch,
&inputs,
cache.vptr_offset,
cache.prelude.column_index as usize,
cache.prelude.x_mip,
gmipnum.max(1),
);
}
/// Fills pixels `sx..p1` of row `sy` with colour and depth taken from the
/// radar samples in `scratch`.
///
/// For each pixel the ray index is `plc >> 16`, with `plc` advanced by
/// `incr` per pixel (wrapping); the sample read is
/// `radar[angstart[ray] + j]`. The colour is passed through `fog_blend`
/// and the depth written is `dist / sqrt(dirx² + diry²)`, where
/// `(dirx, diry)` is the per-pixel direction built from the cached
/// `RayStep`.
///
/// On x86_64, aarch64 and wasm32, pixels are first processed in blocks of
/// four; any remainder goes through the scalar tail. The block paths use
/// the platform's reciprocal square root (an estimate on x86_64, an estimate
/// plus one refinement step on aarch64, an exact division on wasm32), so
/// block depths can differ slightly from tail depths.
///
/// # Panics
///
/// Panics if `frame_setup` has not been called, or if a ray or sample index
/// falls outside `scratch.angstart` / `scratch.radar`. In debug builds it
/// also panics if a write would land outside the target buffers.
fn hrend(
    &mut self,
    scratch: &mut ScanScratch,
    sx: i32,
    sy: i32,
    p1: i32,
    plc: i32,
    incr: i32,
    j: i32,
) {
    let rs = self
        .frame
        .as_ref()
        .map(|f| f.ray_step)
        .expect("hrend/vrend called before frame_setup");
    #[allow(clippy::cast_precision_loss)]
    let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
    #[allow(clippy::cast_precision_loss)]
    let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
    let row_start = sy as usize * self.pitch_pixels;
    let mut plc_local = plc;
    let mut x = sx;
    // SAFETY: the raw stores below write four consecutive elements starting
    // at `pixel_idx`; this is valid only while `pixel_idx + 4` stays within
    // both target buffers, which is the caller's responsibility and is
    // verified by the debug assertion before each store.
    #[cfg(target_arch = "x86_64")]
    #[allow(clippy::cast_ptr_alignment)]
    unsafe {
        use core::arch::x86_64::{
            __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
            _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
        };
        // A 4-pixel block must fit in both buffers.
        let block_limit = self.target.fb_len().min(self.target.zb_len);
        let strx = rs.strx;
        let stry = rs.stry;
        let vstrx4 = _mm_set1_ps(strx * 4.0);
        let vstry4 = _mm_set1_ps(stry * 4.0);
        let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
        let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
        while p1 - x >= 4 {
            // Gather four samples, advancing the fixed-point ray position.
            let mut col = [0i32; 4];
            let mut dst = [0i32; 4];
            for k in 0..4 {
                let ray_idx = (plc_local >> 16) as usize;
                let cd_offset = scratch.angstart[ray_idx] + j as isize;
                let cd = scratch.radar[cd_offset as usize];
                col[k] = cd.col;
                dst[k] = cd.dist;
                plc_local = plc_local.wrapping_add(incr);
            }
            if !scratch.foglut.is_empty() {
                for k in 0..4 {
                    col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                }
            }
            let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
            let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
            let vdst = _mm_cvtepi32_ps(vdsi);
            let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
            let vinv = _mm_rsqrt_ps(vsqr);
            let vz = _mm_mul_ps(vdst, vinv);
            let pixel_idx = row_start + x as usize;
            debug_assert!(
                pixel_idx + 4 <= block_limit,
                "4-pixel block at {pixel_idx} overruns target of {block_limit} pixels",
            );
            _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
            _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
            vdx = _mm_add_ps(vdx, vstrx4);
            vdy = _mm_add_ps(vdy, vstry4);
            x += 4;
        }
        // Lane 0 holds the direction of the next unprocessed pixel.
        dirx = _mm_cvtss_f32(vdx);
        diry = _mm_cvtss_f32(vdy);
    }
    // SAFETY: same contract as the x86_64 block above.
    #[cfg(target_arch = "aarch64")]
    unsafe {
        use core::arch::aarch64::{
            float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
            vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
            vst1q_u32,
        };
        // A 4-pixel block must fit in both buffers.
        let block_limit = self.target.fb_len().min(self.target.zb_len);
        let strx = rs.strx;
        let stry = rs.stry;
        let vstrx4 = vdupq_n_f32(strx * 4.0);
        let vstry4 = vdupq_n_f32(stry * 4.0);
        let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
        let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
        let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
        let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
        while p1 - x >= 4 {
            // Gather four samples, advancing the fixed-point ray position.
            let mut col = [0i32; 4];
            let mut dst = [0i32; 4];
            for k in 0..4 {
                let ray_idx = (plc_local >> 16) as usize;
                let cd_offset = scratch.angstart[ray_idx] + j as isize;
                let cd = scratch.radar[cd_offset as usize];
                col[k] = cd.col;
                dst[k] = cd.dist;
                plc_local = plc_local.wrapping_add(incr);
            }
            if !scratch.foglut.is_empty() {
                for k in 0..4 {
                    col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                }
            }
            let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
            let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
            let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
            // Reciprocal-sqrt estimate followed by one refinement step.
            let est = vrsqrteq_f32(vsqr);
            let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
            let vz = vmulq_f32(vdst, vinv);
            let pixel_idx = row_start + x as usize;
            debug_assert!(
                pixel_idx + 4 <= block_limit,
                "4-pixel block at {pixel_idx} overruns target of {block_limit} pixels",
            );
            vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
            vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
            vdx = vaddq_f32(vdx, vstrx4);
            vdy = vaddq_f32(vdy, vstry4);
            x += 4;
        }
        // Lane 0 holds the direction of the next unprocessed pixel.
        dirx = vgetq_lane_f32(vdx, 0);
        diry = vgetq_lane_f32(vdy, 0);
    }
    // SAFETY: same contract as the x86_64 block above.
    #[cfg(target_arch = "wasm32")]
    unsafe {
        use core::arch::wasm32::{
            f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
            f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
        };
        // A 4-pixel block must fit in both buffers.
        let block_limit = self.target.fb_len().min(self.target.zb_len);
        let strx = rs.strx;
        let stry = rs.stry;
        let vstrx4 = f32x4_splat(strx * 4.0);
        let vstry4 = f32x4_splat(stry * 4.0);
        let one = f32x4_splat(1.0);
        let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
        let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
        while p1 - x >= 4 {
            // Gather four samples, advancing the fixed-point ray position.
            let mut col = [0i32; 4];
            let mut dst = [0i32; 4];
            for k in 0..4 {
                let ray_idx = (plc_local >> 16) as usize;
                let cd_offset = scratch.angstart[ray_idx] + j as isize;
                let cd = scratch.radar[cd_offset as usize];
                col[k] = cd.col;
                dst[k] = cd.dist;
                plc_local = plc_local.wrapping_add(incr);
            }
            if !scratch.foglut.is_empty() {
                for k in 0..4 {
                    col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                }
            }
            let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
            let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
            let vdst = f32x4_convert_i32x4(vdsi);
            let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
            let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
            let vz = f32x4_mul(vdst, vinv);
            let pixel_idx = row_start + x as usize;
            debug_assert!(
                pixel_idx + 4 <= block_limit,
                "4-pixel block at {pixel_idx} overruns target of {block_limit} pixels",
            );
            v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
            v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
            vdx = f32x4_add(vdx, vstrx4);
            vdy = f32x4_add(vdy, vstry4);
            x += 4;
        }
        // Lane 0 holds the direction of the next unprocessed pixel.
        dirx = f32x4_extract_lane::<0>(vdx);
        diry = f32x4_extract_lane::<0>(vdy);
    }
    // Scalar tail: the whole span on other targets, the last 0..=3 pixels
    // after a block loop.
    while x < p1 {
        let ray_idx = (plc_local >> 16) as usize;
        let cd_offset = scratch.angstart[ray_idx] + j as isize;
        let cd = scratch.radar[cd_offset as usize];
        let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
        let pixel_idx = row_start + x as usize;
        #[allow(clippy::cast_precision_loss)]
        let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
        // SAFETY: NOTE(review): relies on the caller keeping `sy` and
        // `sx..p1` inside the target; only checked by the debug assertions
        // in `write_color` / `write_depth`.
        unsafe {
            self.target.write_color(pixel_idx, col as u32);
            self.target.write_depth(pixel_idx, z);
        }
        dirx += rs.strx;
        diry += rs.stry;
        plc_local = plc_local.wrapping_add(incr);
        x += 1;
    }
}
/// Fills pixels `sx..p1` of row `sy` using a per-column ray position kept
/// in `scratch.uurend`.
///
/// For column `x`, `uurend[x] >> 16` selects the ray, and the sample read
/// is `radar[angstart[ray] + iplc]`, with `iplc` advanced by `iinc` per
/// pixel (wrapping). After the pixel is written, `uurend[x]` is advanced
/// by the increment stored at `uurend[x + uurend_half_stride]`. Colour is
/// passed through `fog_blend`; the depth written is
/// `dist / sqrt(dirx² + diry²)` with `(dirx, diry)` built from the cached
/// `RayStep`.
///
/// On x86_64, aarch64 and wasm32, pixels are first processed in blocks of
/// four; any remainder goes through the scalar tail. The block paths use
/// the platform's reciprocal square root (an estimate on x86_64, an estimate
/// plus one refinement step on aarch64, an exact division on wasm32), so
/// block depths can differ slightly from tail depths.
///
/// # Panics
///
/// Panics if `frame_setup` has not been called, or if an index falls
/// outside `scratch.uurend`, `scratch.angstart` or `scratch.radar`. In debug
/// builds it also panics if a write would land outside the target buffers.
fn vrend(
    &mut self,
    scratch: &mut ScanScratch,
    sx: i32,
    sy: i32,
    p1: i32,
    iplc: i32,
    iinc: i32,
) {
    let rs = self
        .frame
        .as_ref()
        .map(|f| f.ray_step)
        .expect("hrend/vrend called before frame_setup");
    #[allow(clippy::cast_precision_loss)]
    let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
    #[allow(clippy::cast_precision_loss)]
    let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
    let row_start = sy as usize * self.pitch_pixels;
    let half_stride = scratch.uurend_half_stride;
    let mut iplc_local = iplc;
    let mut x = sx;
    // SAFETY: the raw stores below write four consecutive elements starting
    // at `pixel_idx`; this is valid only while `pixel_idx + 4` stays within
    // both target buffers, which is the caller's responsibility and is
    // verified by the debug assertion before each store.
    #[cfg(target_arch = "x86_64")]
    #[allow(clippy::cast_ptr_alignment)]
    unsafe {
        use core::arch::x86_64::{
            __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
            _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
        };
        // A 4-pixel block must fit in both buffers.
        let block_limit = self.target.fb_len().min(self.target.zb_len);
        let strx = rs.strx;
        let stry = rs.stry;
        let vstrx4 = _mm_set1_ps(strx * 4.0);
        let vstry4 = _mm_set1_ps(stry * 4.0);
        let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
        let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
        while p1 - x >= 4 {
            let xu = x as usize;
            // Per-column ray positions and their increments.
            let mut u = [0i32; 4];
            let mut d = [0i32; 4];
            for k in 0..4 {
                u[k] = scratch.uurend[xu + k];
                d[k] = scratch.uurend[xu + k + half_stride];
            }
            let mut col = [0i32; 4];
            let mut dst = [0i32; 4];
            for k in 0..4 {
                let ray_idx = (u[k] >> 16) as usize;
                let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
                let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
                let cd = scratch.radar[cd_offset as usize];
                col[k] = cd.col;
                dst[k] = cd.dist;
            }
            if !scratch.foglut.is_empty() {
                for k in 0..4 {
                    col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                }
            }
            let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
            let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
            let vdst = _mm_cvtepi32_ps(vdsi);
            let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
            let vinv = _mm_rsqrt_ps(vsqr);
            let vz = _mm_mul_ps(vdst, vinv);
            let pixel_idx = row_start + xu;
            debug_assert!(
                pixel_idx + 4 <= block_limit,
                "4-pixel block at {pixel_idx} overruns target of {block_limit} pixels",
            );
            _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
            _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
            // Advance each column's ray position by its own increment.
            for k in 0..4 {
                scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
            }
            vdx = _mm_add_ps(vdx, vstrx4);
            vdy = _mm_add_ps(vdy, vstry4);
            iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
            x += 4;
        }
        // Lane 0 holds the direction of the next unprocessed pixel.
        dirx = _mm_cvtss_f32(vdx);
        diry = _mm_cvtss_f32(vdy);
    }
    // SAFETY: same contract as the x86_64 block above.
    #[cfg(target_arch = "aarch64")]
    unsafe {
        use core::arch::aarch64::{
            float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
            vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
            vst1q_u32,
        };
        // A 4-pixel block must fit in both buffers.
        let block_limit = self.target.fb_len().min(self.target.zb_len);
        let strx = rs.strx;
        let stry = rs.stry;
        let vstrx4 = vdupq_n_f32(strx * 4.0);
        let vstry4 = vdupq_n_f32(stry * 4.0);
        let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
        let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
        let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
        let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
        while p1 - x >= 4 {
            let xu = x as usize;
            // Per-column ray positions and their increments.
            let mut u = [0i32; 4];
            let mut d = [0i32; 4];
            for k in 0..4 {
                u[k] = scratch.uurend[xu + k];
                d[k] = scratch.uurend[xu + k + half_stride];
            }
            let mut col = [0i32; 4];
            let mut dst = [0i32; 4];
            for k in 0..4 {
                let ray_idx = (u[k] >> 16) as usize;
                let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
                let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
                let cd = scratch.radar[cd_offset as usize];
                col[k] = cd.col;
                dst[k] = cd.dist;
            }
            if !scratch.foglut.is_empty() {
                for k in 0..4 {
                    col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                }
            }
            let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
            let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
            let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
            // Reciprocal-sqrt estimate followed by one refinement step.
            let est = vrsqrteq_f32(vsqr);
            let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
            let vz = vmulq_f32(vdst, vinv);
            let pixel_idx = row_start + xu;
            debug_assert!(
                pixel_idx + 4 <= block_limit,
                "4-pixel block at {pixel_idx} overruns target of {block_limit} pixels",
            );
            vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
            vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
            // Advance each column's ray position by its own increment.
            for k in 0..4 {
                scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
            }
            vdx = vaddq_f32(vdx, vstrx4);
            vdy = vaddq_f32(vdy, vstry4);
            iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
            x += 4;
        }
        // Lane 0 holds the direction of the next unprocessed pixel.
        dirx = vgetq_lane_f32(vdx, 0);
        diry = vgetq_lane_f32(vdy, 0);
    }
    // SAFETY: same contract as the x86_64 block above.
    #[cfg(target_arch = "wasm32")]
    unsafe {
        use core::arch::wasm32::{
            f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
            f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
        };
        // A 4-pixel block must fit in both buffers.
        let block_limit = self.target.fb_len().min(self.target.zb_len);
        let strx = rs.strx;
        let stry = rs.stry;
        let vstrx4 = f32x4_splat(strx * 4.0);
        let vstry4 = f32x4_splat(stry * 4.0);
        let one = f32x4_splat(1.0);
        let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
        let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
        while p1 - x >= 4 {
            let xu = x as usize;
            // Per-column ray positions and their increments.
            let mut u = [0i32; 4];
            let mut d = [0i32; 4];
            for k in 0..4 {
                u[k] = scratch.uurend[xu + k];
                d[k] = scratch.uurend[xu + k + half_stride];
            }
            let mut col = [0i32; 4];
            let mut dst = [0i32; 4];
            for k in 0..4 {
                let ray_idx = (u[k] >> 16) as usize;
                let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
                let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
                let cd = scratch.radar[cd_offset as usize];
                col[k] = cd.col;
                dst[k] = cd.dist;
            }
            if !scratch.foglut.is_empty() {
                for k in 0..4 {
                    col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                }
            }
            let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
            let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
            let vdst = f32x4_convert_i32x4(vdsi);
            let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
            let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
            let vz = f32x4_mul(vdst, vinv);
            let pixel_idx = row_start + xu;
            debug_assert!(
                pixel_idx + 4 <= block_limit,
                "4-pixel block at {pixel_idx} overruns target of {block_limit} pixels",
            );
            v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
            v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
            // Advance each column's ray position by its own increment.
            for k in 0..4 {
                scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
            }
            vdx = f32x4_add(vdx, vstrx4);
            vdy = f32x4_add(vdy, vstry4);
            iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
            x += 4;
        }
        // Lane 0 holds the direction of the next unprocessed pixel.
        dirx = f32x4_extract_lane::<0>(vdx);
        diry = f32x4_extract_lane::<0>(vdy);
    }
    // Scalar tail: the whole span on other targets, the last 0..=3 pixels
    // after a block loop.
    while x < p1 {
        let xu = x as usize;
        let ray_idx = (scratch.uurend[xu] >> 16) as usize;
        let cd_offset = scratch.angstart[ray_idx] + iplc_local as isize;
        let cd = scratch.radar[cd_offset as usize];
        let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
        let pixel_idx = row_start + xu;
        #[allow(clippy::cast_precision_loss)]
        let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
        // SAFETY: NOTE(review): relies on the caller keeping `sy` and
        // `sx..p1` inside the target; only checked by the debug assertions
        // in `write_color` / `write_depth`.
        unsafe {
            self.target.write_color(pixel_idx, col as u32);
            self.target.write_depth(pixel_idx, z);
        }
        dirx += rs.strx;
        diry += rs.stry;
        scratch.uurend[xu] = scratch.uurend[xu].wrapping_add(scratch.uurend[xu + half_stride]);
        x += 1;
        iplc_local = iplc_local.wrapping_add(iinc);
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rasterizer::CastDat;

    /// Derives the per-frame inputs for an axis-aligned camera at the origin
    /// and a 64x64 viewport.
    fn dummy_per_frame() -> (
        crate::camera_math::CameraState,
        crate::projection::ProjectionRect,
        crate::ray_step::RayStep,
        crate::opticast_prelude::OpticastPrelude,
    ) {
        let cam = crate::Camera {
            pos: [0.0, 0.0, 0.0],
            right: [1.0, 0.0, 0.0],
            down: [0.0, 1.0, 0.0],
            forward: [0.0, 0.0, 1.0],
        };
        let cs = crate::camera_math::derive(&cam, 64, 64, 32.0, 32.0, 32.0);
        let proj = crate::projection::derive_projection(&cs, 64, 64, 32.0, 32.0, 32.0, 1);
        let rs = crate::ray_step::derive_ray_step(&cs, proj.cx, proj.cy, 32.0);
        let prelude = crate::opticast_prelude::derive_prelude(&cs, 2048, 1, 4, 1024);
        (cs, proj, rs, prelude)
    }

    /// Shared fixture: a pitch-64 rasterizer over `fb` / `zb` with empty
    /// world data, already passed through `frame_setup` with the values from
    /// `dummy_per_frame`. Also returns the `RayStep` that was fed in so a
    /// test can compare it with the cached copy.
    fn rasterizer_with_frame<'a>(
        fb: &'a mut [u32],
        zb: &'a mut [f32],
    ) -> (ScalarRasterizer<'a>, crate::ray_step::RayStep) {
        let mut r = ScalarRasterizer::new(fb, zb, 64, &[], &[], &[0usize, 0], 64);
        let (cs, proj, rs, prelude) = dummy_per_frame();
        let ctx = ScanContext {
            proj: &proj,
            rs: &rs,
            prelude: &prelude,
            xres: 64,
            y_start: 0,
            y_end: 64,
            anginc: 1,
            camera_state: &cs,
            camera_gstartz0: 0,
            camera_gstartz1: 0,
            camera_vptr_offset: 0,
        };
        r.frame_setup(&ctx);
        (r, rs)
    }

    #[test]
    fn frame_setup_caches_ray_step() {
        let mut fb = vec![0u32; 64 * 64];
        let mut zb = vec![0.0f32; 64 * 64];
        let (r, rs) = rasterizer_with_frame(&mut fb, &mut zb);
        let cached_rs = r.frame.as_ref().expect("frame populated").ray_step;
        assert_eq!(cached_rs.strx.to_bits(), rs.strx.to_bits());
        assert_eq!(cached_rs.stry.to_bits(), rs.stry.to_bits());
        assert_eq!(cached_rs.cx16, rs.cx16);
        assert_eq!(cached_rs.cy16, rs.cy16);
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn hrend_sse_batch_writes_4_pixel_block() {
        let mut fb = vec![0u32; 64 * 64];
        let mut zb = vec![0.0f32; 64 * 64];
        let (mut r, _rs) = rasterizer_with_frame(&mut fb, &mut zb);
        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
        // Ray k points at radar[k], whose colour carries k in its low bits.
        for (i, slot) in scratch.radar.iter_mut().enumerate().take(4) {
            slot.col = 0x8000_0000_u32 as i32 | i as i32;
            slot.dist = 1024;
        }
        for k in 0..4 {
            scratch.angstart[k] = k as isize;
        }
        // A span of exactly four pixels, stepping one ray per pixel.
        r.hrend(&mut scratch, 10, 5, 14, 0, 1 << 16, 0);
        let row_off = 5 * 64;
        for k in 0..4 {
            let want = 0x8000_0000_u32 | k as u32;
            assert_eq!(
                fb[row_off + 10 + k],
                want,
                "fb[5][{}] = {:#010x}, expected {:#010x}",
                10 + k,
                fb[row_off + 10 + k],
                want,
            );
            assert_ne!(zb[row_off + 10 + k].to_bits(), 0u32);
        }
    }

    #[test]
    fn fog_blend_disabled_returns_col_unchanged() {
        let foglut: Vec<i32> = Vec::new();
        let col = 0x0080_C040;
        assert_eq!(fog_blend(col, 0x1234_5678, &foglut, 0xFF_FFFF), col);
    }

    #[test]
    fn fog_blend_full_fog_returns_fog_col_per_channel() {
        let foglut = vec![32767; 2048];
        let col = 0x80_AA_BB_CC_u32 as i32;
        let fog = 0x00_11_22_33_i32;
        let blended = fog_blend(col, 0, &foglut, fog) as u32;
        assert_eq!(blended & 0x00FF_FFFF, fog as u32 & 0x00FF_FFFF);
        assert_eq!(blended & 0xFF00_0000, col as u32 & 0xFF00_0000);
    }

    #[test]
    fn set_fog_zero_distance_clears_table() {
        let mut s = ScanScratch::new_for_size(64, 64, 64);
        s.set_fog(0x1234_5678, 100);
        assert!(!s.foglut.is_empty());
        s.set_fog(0, 0);
        assert!(s.foglut.is_empty());
    }

    #[test]
    fn set_fog_table_starts_at_zero_and_climbs() {
        let mut s = ScanScratch::new_for_size(64, 64, 64);
        s.set_fog(0xFF, 1024);
        assert_eq!(s.foglut[0], 0);
        assert!(
            s.foglut[2047] > 30_000,
            "tail entry too low: {}",
            s.foglut[2047]
        );
    }

    #[test]
    fn hrend_writes_pixel_per_column_from_radar() {
        let mut fb = vec![0u32; 64 * 64];
        let mut zb = vec![0.0f32; 64 * 64];
        let (mut r, _rs) = rasterizer_with_frame(&mut fb, &mut zb);
        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
        for (i, slot) in scratch.radar.iter_mut().enumerate().take(16) {
            slot.col = 0x8000_0000_u32 as i32 | i as i32;
            slot.dist = 1024;
        }
        scratch.angstart[0] = 0;
        // Zero increment keeps every pixel on ray 0; j = 2 selects radar[2].
        r.hrend(&mut scratch, 10, 5, 14, 0, 0, 2);
        let row_off = 5 * 64;
        for x in 10..14 {
            let want = 0x8000_0000_u32 | 2;
            assert_eq!(
                fb[row_off + x],
                want,
                "fb[5][{x}] = {:#010x}, expected {:#010x}",
                fb[row_off + x],
                want,
            );
        }
        // Pixels on either side of the span stay untouched.
        assert_eq!(fb[row_off + 9], 0);
        assert_eq!(fb[row_off + 14], 0);
    }

    #[test]
    fn end_to_end_opticast_runs_through_real_gline() {
        use crate::opticast as opticast_fn;
        use crate::rasterizer::ScratchPool;
        use crate::OpticastSettings;
        let mut fb = vec![0u32; 640 * 480];
        let mut zb = vec![0.0f32; 640 * 480];
        let mut pool = ScratchPool::new(640, 480, 2048);
        let sky_col = 0x80AB_CDEF_u32 as i32;
        pool.set_skycast(sky_col, 0x7FFF_FFFF);
        // Only the camera's column has data: offsets up to and including it
        // stay 0, and every later offset equals the column length.
        let column = vec![0u8, 200, 254, 0];
        let cam_idx = 1024usize * 2048 + 1024;
        let mut column_offsets = vec![0u32; 2048 * 2048 + 1];
        let column_len_u32 = u32::try_from(column.len()).expect("column fits u32");
        for offset in &mut column_offsets[(cam_idx + 1)..] {
            *offset = column_len_u32;
        }
        let mip_base_offsets = [0usize, column_offsets.len()];
        let mut rasterizer = ScalarRasterizer::new(
            &mut fb,
            &mut zb,
            640,
            &column,
            &column_offsets,
            &mip_base_offsets,
            2048,
        );
        let cam = crate::Camera {
            pos: [1024.0, 1024.0, 128.0],
            right: [1.0, 0.0, 0.0],
            down: [0.0, 1.0, 0.0],
            forward: [0.0, 0.0, 1.0],
        };
        let settings = OpticastSettings::for_oracle_framebuffer(640, 480);
        let outcome = opticast_fn(
            &mut rasterizer,
            &mut pool,
            &cam,
            &settings,
            2048,
            &column,
            &column_offsets,
        );
        assert_eq!(outcome, crate::OpticastOutcome::Rendered);
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn vrend_sse_batch_writes_4_pixel_block() {
        let mut fb = vec![0u32; 64 * 64];
        let mut zb = vec![0.0f32; 64 * 64];
        let (mut r, _rs) = rasterizer_with_frame(&mut fb, &mut zb);
        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
        for k in 0..4 {
            scratch.radar[k] = CastDat {
                col: 0x8000_0000_u32 as i32 | k as i32,
                dist: 1024,
            };
            scratch.angstart[k] = k as isize;
        }
        // Column 10 + k starts on ray k and advances by 5 per rendered pixel.
        let half = scratch.uurend_half_stride;
        for k in 0..4 {
            scratch.uurend[10 + k] = (k as i32) << 16;
            scratch.uurend[10 + k + half] = 5;
        }
        r.vrend(&mut scratch, 10, 5, 14, 0, 0);
        let row_off = 5 * 64;
        for k in 0..4 {
            let want = 0x8000_0000_u32 | k as u32;
            assert_eq!(fb[row_off + 10 + k], want, "fb col[{}]", 10 + k);
            assert!(zb[row_off + 10 + k].to_bits() != 0, "z[{}]", 10 + k);
            assert_eq!(
                scratch.uurend[10 + k],
                ((k as i32) << 16) + 5,
                "uurend[{}]",
                10 + k
            );
        }
    }

    #[test]
    fn vrend_advances_uurend_per_pixel() {
        let mut fb = vec![0u32; 64 * 64];
        let mut zb = vec![0.0f32; 64 * 64];
        let (mut r, _rs) = rasterizer_with_frame(&mut fb, &mut zb);
        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
        scratch.radar[0] = CastDat {
            col: 0x8033_4455_u32 as i32,
            dist: 1024,
        };
        scratch.angstart[0] = 0;
        let half = scratch.uurend_half_stride;
        for sx in 10..14 {
            scratch.uurend[sx] = 0;
            scratch.uurend[sx + half] = 1;
        }
        r.vrend(&mut scratch, 10, 5, 14, 0, 0);
        for sx in 10..14 {
            assert_eq!(scratch.uurend[sx], 1, "uurend[{sx}] not advanced");
        }
    }
}