Skip to main content

oxigaf_flame/
normal_map.rs

1//! CPU software rasterizer for FLAME normal maps.
2//!
3//! Produces an RGB image where each pixel encodes the interpolated surface
4//! normal at that point: `RGB = (normal.xyz + 1) / 2 * 255`.
5//!
6//! ## Performance Optimizations
7//!
8//! This module includes several performance optimizations:
9//! - SIMD-accelerated vector operations using `portable_simd`
10//! - Tile-based rasterization for improved cache locality
11//! - Incremental edge evaluation to reduce computation
12//! - Inline hints on hot path functions
13//! - Pre-computed reciprocals to avoid divisions in inner loops
14
15use image::{Rgb, RgbImage};
16use nalgebra as na;
17
18#[cfg(all(feature = "simd", nightly))]
19use std::simd::{f32x4, num::SimdFloat};
20
21use crate::mesh::Mesh;
22
23// ---------------------------------------------------------------------------
24// Camera
25// ---------------------------------------------------------------------------
26
27/// Simple pinhole camera for rendering.
28#[derive(Debug, Clone)]
29pub struct Camera {
30    /// World-to-camera rotation matrix.
31    pub rotation: na::Matrix3<f32>,
32    /// World-to-camera translation.
33    pub translation: na::Vector3<f32>,
34    /// Focal length in pixels (horizontal).
35    pub focal_x: f32,
36    /// Focal length in pixels (vertical).
37    pub focal_y: f32,
38    /// Principal point x (pixels).
39    pub cx: f32,
40    /// Principal point y (pixels).
41    pub cy: f32,
42    /// Image width in pixels.
43    pub width: u32,
44    /// Image height in pixels.
45    pub height: u32,
46    /// Near clipping plane.
47    pub near: f32,
48    /// Far clipping plane.
49    pub far: f32,
50}
51
52impl Camera {
53    /// Create a default front-facing camera suitable for head rendering.
54    #[must_use]
55    pub fn default_front(width: u32, height: u32) -> Self {
56        let focal = width as f32 * 1.5;
57        Self {
58            rotation: na::Matrix3::identity(),
59            translation: na::Vector3::new(0.0, 0.0, 0.6),
60            focal_x: focal,
61            focal_y: focal,
62            cx: width as f32 / 2.0,
63            cy: height as f32 / 2.0,
64            width,
65            height,
66            near: 0.01,
67            far: 10.0,
68        }
69    }
70
71    /// Transform a world-space point to camera space.
72    #[inline]
73    #[must_use]
74    pub fn world_to_cam(&self, p: &na::Point3<f32>) -> na::Point3<f32> {
75        na::Point3::from(self.rotation * p.coords + self.translation)
76    }
77
78    /// Project a camera-space point to pixel coordinates.
79    #[inline]
80    #[must_use]
81    pub fn project(&self, p_cam: &na::Point3<f32>) -> [f32; 2] {
82        let x = self.focal_x * p_cam.x / p_cam.z + self.cx;
83        let y = self.focal_y * p_cam.y / p_cam.z + self.cy;
84        [x, y]
85    }
86
87    /// Project a camera-space point with pre-computed z reciprocal (optimized).
88    #[inline]
89    #[must_use]
90    pub fn project_with_recip_z(&self, p_cam: &na::Point3<f32>, recip_z: f32) -> [f32; 2] {
91        let x = self.focal_x * p_cam.x * recip_z + self.cx;
92        let y = self.focal_y * p_cam.y * recip_z + self.cy;
93        [x, y]
94    }
95}
96
97// ---------------------------------------------------------------------------
98// Helper Functions (with SIMD and non-SIMD variants)
99// ---------------------------------------------------------------------------
100
101/// Vector normalization (scalar version).
102#[cfg(not(all(feature = "simd", nightly)))]
103#[inline]
104fn normalize_vector(x: f32, y: f32, z: f32) -> (f32, f32, f32) {
105    let len_sq = x * x + y * y + z * z;
106    let len = len_sq.sqrt();
107
108    if len > 1e-10 {
109        let recip_len = 1.0 / len;
110        (x * recip_len, y * recip_len, z * recip_len)
111    } else {
112        (x, y, z)
113    }
114}
115
116/// SIMD-accelerated vector normalization.
117#[cfg(all(feature = "simd", nightly))]
118#[inline]
119fn normalize_vector(x: f32, y: f32, z: f32) -> (f32, f32, f32) {
120    let v = f32x4::from_array([x, y, z, 0.0]);
121    let len_sq = (v * v).reduce_sum();
122    let len = len_sq.sqrt();
123
124    if len > 1e-10 {
125        let recip_len = 1.0 / len;
126        (x * recip_len, y * recip_len, z * recip_len)
127    } else {
128        (x, y, z)
129    }
130}
131
132/// Normal interpolation and encoding (scalar version).
133#[cfg(not(all(feature = "simd", nightly)))]
134#[inline]
135fn interpolate_and_encode_normal(
136    n0: &na::Vector3<f32>,
137    n1: &na::Vector3<f32>,
138    n2: &na::Vector3<f32>,
139    w0: f32,
140    w1: f32,
141    w2: f32,
142) -> [u8; 3] {
143    // Interpolate
144    let nx = n0.x * w0 + n1.x * w1 + n2.x * w2;
145    let ny = n0.y * w0 + n1.y * w1 + n2.y * w2;
146    let nz = n0.z * w0 + n1.z * w1 + n2.z * w2;
147
148    // Normalize
149    let (nx_norm, ny_norm, nz_norm) = normalize_vector(nx, ny, nz);
150
151    // Encode: [-1, 1] -> [0, 255]
152    let encode = |v: f32| ((v * 0.5 + 0.5) * 255.0).clamp(0.0, 255.0) as u8;
153    [encode(nx_norm), encode(ny_norm), encode(nz_norm)]
154}
155
156/// SIMD-accelerated normal interpolation and encoding.
157#[cfg(all(feature = "simd", nightly))]
158#[inline]
159fn interpolate_and_encode_normal(
160    n0: &na::Vector3<f32>,
161    n1: &na::Vector3<f32>,
162    n2: &na::Vector3<f32>,
163    w0: f32,
164    w1: f32,
165    w2: f32,
166) -> [u8; 3] {
167    // Interpolate
168    let nx = n0.x * w0 + n1.x * w1 + n2.x * w2;
169    let ny = n0.y * w0 + n1.y * w1 + n2.y * w2;
170    let nz = n0.z * w0 + n1.z * w1 + n2.z * w2;
171
172    // Normalize using SIMD
173    let (nx_norm, ny_norm, nz_norm) = normalize_vector(nx, ny, nz);
174
175    // Encode: [-1, 1] -> [0, 255] using SIMD
176    let normal_vec = f32x4::from_array([nx_norm, ny_norm, nz_norm, 0.0]);
177    let encoded = (normal_vec * f32x4::splat(0.5) + f32x4::splat(0.5)) * f32x4::splat(255.0);
178    let clamped = encoded.simd_clamp(f32x4::splat(0.0), f32x4::splat(255.0));
179    let arr = clamped.to_array();
180
181    [arr[0] as u8, arr[1] as u8, arr[2] as u8]
182}
183
184// ---------------------------------------------------------------------------
185// NormalMapRenderer
186// ---------------------------------------------------------------------------
187
188/// CPU rasterizer that renders per-vertex normals of a [`Mesh`] into an image.
189pub struct NormalMapRenderer;
190
191/// Tile size for cache-friendly rasterization.
192const TILE_SIZE: i32 = 16;
193
194impl NormalMapRenderer {
195    /// Render a normal map of `mesh` as seen from `camera`.
196    ///
197    /// Returns an RGB image where `RGB = (normal + 1) / 2 * 255`.
198    ///
199    /// This implementation uses several optimizations:
200    /// - SIMD for barycentric coords and normal calculations
201    /// - Tile-based rasterization for cache locality
202    /// - Incremental edge evaluation
203    /// - Pre-computed reciprocals
204    #[must_use]
205    pub fn render(mesh: &Mesh, camera: &Camera) -> RgbImage {
206        let w = camera.width as usize;
207        let h = camera.height as usize;
208        let mut img = RgbImage::new(camera.width, camera.height);
209        let mut depth_buf = vec![f32::INFINITY; w * h];
210
211        for face in &mesh.faces {
212            let (i0, i1, i2) = (face[0] as usize, face[1] as usize, face[2] as usize);
213
214            // Pre-fetch normals for cache locality
215            let n0 = &mesh.normals[i0];
216            let n1 = &mesh.normals[i1];
217            let n2 = &mesh.normals[i2];
218
219            // Transform to camera space
220            let p0 = camera.world_to_cam(&mesh.vertices[i0]);
221            let p1 = camera.world_to_cam(&mesh.vertices[i1]);
222            let p2 = camera.world_to_cam(&mesh.vertices[i2]);
223
224            // Near-plane cull
225            if p0.z <= camera.near || p1.z <= camera.near || p2.z <= camera.near {
226                continue;
227            }
228
229            // Pre-compute z reciprocals for projection optimization
230            let recip_z0 = 1.0 / p0.z;
231            let recip_z1 = 1.0 / p1.z;
232            let recip_z2 = 1.0 / p2.z;
233
234            // Project to screen using optimized projection
235            let s0 = camera.project_with_recip_z(&p0, recip_z0);
236            let s1 = camera.project_with_recip_z(&p1, recip_z1);
237            let s2 = camera.project_with_recip_z(&p2, recip_z2);
238
239            // Bounding box (clipped to image)
240            let min_x = s0[0].min(s1[0]).min(s2[0]).max(0.0).floor() as i32;
241            let max_x = s0[0]
242                .max(s1[0])
243                .max(s2[0])
244                .min((camera.width - 1) as f32)
245                .ceil() as i32;
246            let min_y = s0[1].min(s1[1]).min(s2[1]).max(0.0).floor() as i32;
247            let max_y = s0[1]
248                .max(s1[1])
249                .max(s2[1])
250                .min((camera.height - 1) as f32)
251                .ceil() as i32;
252
253            if min_x > max_x || min_y > max_y {
254                continue;
255            }
256
257            // Triangle area (2x)
258            let area = edge_fn(s0, s1, s2);
259            if area.abs() < 1e-10 {
260                continue;
261            }
262
263            // Pre-compute reciprocal of area to avoid divisions in inner loop
264            let inv_area = 1.0 / area;
265
266            // Tile-based rasterization for better cache locality
267            let tile_min_x = (min_x / TILE_SIZE) * TILE_SIZE;
268            let tile_max_x = ((max_x + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE;
269            let tile_min_y = (min_y / TILE_SIZE) * TILE_SIZE;
270            let tile_max_y = ((max_y + TILE_SIZE - 1) / TILE_SIZE) * TILE_SIZE;
271
272            // Iterate over tiles
273            for tile_y in (tile_min_y..tile_max_y).step_by(TILE_SIZE as usize) {
274                for tile_x in (tile_min_x..tile_max_x).step_by(TILE_SIZE as usize) {
275                    // Compute actual pixel bounds within this tile
276                    let px_min = tile_x.max(min_x);
277                    let px_max = (tile_x + TILE_SIZE).min(max_x + 1);
278                    let py_min = tile_y.max(min_y);
279                    let py_max = (tile_y + TILE_SIZE).min(max_y + 1);
280
281                    // Process pixels within tile
282                    Self::rasterize_tile(
283                        px_min,
284                        px_max,
285                        py_min,
286                        py_max,
287                        s0,
288                        s1,
289                        s2,
290                        inv_area,
291                        p0.z,
292                        p1.z,
293                        p2.z,
294                        n0,
295                        n1,
296                        n2,
297                        camera.near,
298                        w,
299                        &mut depth_buf,
300                        &mut img,
301                    );
302                }
303            }
304        }
305
306        img
307    }
308
309    /// Rasterize a single tile of pixels with incremental edge evaluation.
310    #[inline]
311    #[allow(clippy::too_many_arguments)]
312    fn rasterize_tile(
313        px_min: i32,
314        px_max: i32,
315        py_min: i32,
316        py_max: i32,
317        s0: [f32; 2],
318        s1: [f32; 2],
319        s2: [f32; 2],
320        inv_area: f32,
321        z0: f32,
322        z1: f32,
323        z2: f32,
324        n0: &na::Vector3<f32>,
325        n1: &na::Vector3<f32>,
326        n2: &na::Vector3<f32>,
327        near: f32,
328        width: usize,
329        depth_buf: &mut [f32],
330        img: &mut RgbImage,
331    ) {
332        // Pre-compute edge equation coefficients for incremental evaluation
333        let a01 = s0[1] - s1[1];
334        let a12 = s1[1] - s2[1];
335        let a20 = s2[1] - s0[1];
336
337        for py in py_min..py_max {
338            let p_y = py as f32 + 0.5;
339
340            // Initial edge values at the start of this scanline
341            let p_x_start = px_min as f32 + 0.5;
342            let mut e0 = edge_fn(s1, s2, [p_x_start, p_y]);
343            let mut e1 = edge_fn(s2, s0, [p_x_start, p_y]);
344            let mut e2 = edge_fn(s0, s1, [p_x_start, p_y]);
345
346            for px in px_min..px_max {
347                // Check if point is inside triangle using edge values
348                if e0 >= 0.0 && e1 >= 0.0 && e2 >= 0.0 {
349                    // Compute barycentric weights
350                    let w0 = e0 * inv_area;
351                    let w1 = e1 * inv_area;
352                    let w2 = e2 * inv_area;
353
354                    let depth = w0 * z0 + w1 * z1 + w2 * z2;
355                    let idx = py as usize * width + px as usize;
356
357                    if depth < depth_buf[idx] && depth > near {
358                        depth_buf[idx] = depth;
359
360                        // Interpolate and encode normal (SIMD when feature enabled)
361                        let rgb = interpolate_and_encode_normal(n0, n1, n2, w0, w1, w2);
362
363                        img.put_pixel(px as u32, py as u32, Rgb(rgb));
364                    }
365                }
366
367                // Incremental update for next pixel in scanline
368                e0 += a12;
369                e1 += a20;
370                e2 += a01;
371            }
372        }
373    }
374}
375
376/// Edge function (signed area x 2) of the triangle formed by `a`, `b`, `p`.
377#[inline]
378fn edge_fn(a: [f32; 2], b: [f32; 2], p: [f32; 2]) -> f32 {
379    (b[0] - a[0]) * (p[1] - a[1]) - (b[1] - a[1]) * (p[0] - a[0])
380}