kiss2d 0.1.5

KISS 2d graphics engine
use std::mem;
use std::fmt::{Debug, Formatter, Result};
use std::cmp::min;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;


#[derive(Copy, Clone)]
pub struct Point {
    pub x: f32,
    pub y: f32,
}

impl Point {
    pub fn new<T: Into<f32>>(x: T, y: T) -> Self {
        Self {
            x: x.into(),
            y: y.into(),
        }
    }

    pub fn lerp(t: f32, p0: &Self, p1: &Self) -> Self {
        Self {
            x: p0.x + t * (p1.x - p0.x),
            y: p0.y + t * (p1.y - p0.y),
        }
    }
}

impl Debug for Point {
    fn fmt(&self, f: &mut Formatter) -> Result {
        write!(f, "({}, {})", self.x, self.y)
    }
}

#[derive(Debug)]
pub struct Affine {
    a: f32,
    b: f32,
    c: f32,
    d: f32,
    e: f32,
    f: f32,
}

impl Affine {
    pub fn new(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32) -> Self {
        Self { a, b, c, d, e, f }
    }

    /// Concatenate two affine transforms.
    pub fn concat(t1: &Self, t2: &Self) -> Self {
        Self {
            a: t1.a * t2.a + t1.c * t2.b,
            b: t1.b * t2.a + t1.d * t2.b,
            c: t1.a * t2.c + t1.c * t2.d,
            d: t1.b * t2.c + t1.d * t2.d,
            e: t1.a * t2.e + t1.c * t2.f + t1.e,
            f: t1.b * t2.e + t1.d * t2.f + t1.f,
        }
    }

    pub fn apply(&self, p: &Point) -> Point {
        Point {
            x: self.a * p.x + self.c * p.y + self.e,
            y: self.b * p.x + self.d * p.y + self.f,
        }
    }
}

#[doc(hidden)]
macro _mm_shuffle($z:expr, $y:expr, $x:expr, $w:expr) {
    ($z << 6) | ($y << 4) | ($x << 2) | $w
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
pub fn accumulate_sse(src: &[f32]) -> Vec<u8> {
    // SIMD instructions force us to align data since we iterate each 4 elements
    // So:
    // n (0) => 0
    // n (1 or 2 or 3 or 4) => 4,
    // n (5) => 8
    // and so on
    let len = src.len();
    let n = (len + 3) & !3; // align data
    let mut dst: Vec<u8> = vec![0; n];

    unsafe {
        let mut offset = _mm_setzero_ps();
        let sign_mask = _mm_set1_ps(-0.);
        let mask = _mm_set1_epi32(0x0c080400);

        for i in (0..n).step_by(4) {
            let mut x = _mm_loadu_ps(&src[i]);
            x = _mm_add_ps(x, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(x), 4)));
            x = _mm_add_ps(x, _mm_shuffle_ps(_mm_setzero_ps(), x, 0x40));
            x = _mm_add_ps(x, offset);

            let mut y = _mm_andnot_ps(sign_mask, x); // fabs(x)
            y = _mm_min_ps(y, _mm_set1_ps(1.0));
            y = _mm_mul_ps(y, _mm_set1_ps(255.0));

            let mut z = _mm_cvttps_epi32(y);
            z = _mm_shuffle_epi8(z, mask);

            _mm_store_ss(mem::transmute(&dst[i]), _mm_castsi128_ps(z));
            offset = _mm_shuffle_ps(x, x, _mm_shuffle!(3, 3, 3, 3));
        }

        dst.set_len(len); // we must return vec of the same length as src.len()
    }

    dst
}

pub fn accumulate_simple(src: &[f32]) -> Vec<u8> {
    let mut acc = 0.0;
    src.iter()
        .map(|c| {
            // This would translate really well to SIMD
            acc += c;
            let y = acc.abs();
            let y = if y < 1.0 { y } else { 1.0 };
            (255.0 * y) as u8
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    fn test_accumulate(src: Vec<f32>) {
        assert_eq!(accumulate_simple(&src), accumulate(&src));
    }

    #[test]
    fn max_255_from_1_0() {
        // 1.0 * 255.0 = 255.0 (max value)
        test_accumulate(vec![1.0]);
    }
    #[test]
    fn max_255_from_0_5() {
        // 0.5 * 2 = 1.0
        // 1.0 * 255.0 = 255.0 (max value)
        test_accumulate(vec![0.5; 2]);
    }
    #[test]
    fn max_255_from_0_25() {
        // 0.25 * 4 = 1.0
        // 1.0 * 255.0 = 255.0 (max value)
        test_accumulate(vec![0.25; 4]);
    }
    #[test]
    fn max_255_from_0_125() {
        // 0.125 * 8 = 1.0
        // 1.0 * 255.0 = 255.0 (max value)
        test_accumulate(vec![0.125; 8]);
    }
    #[test]
    fn max_255_from_0_0625() {
        // 0.0625 * 16 = 1.0
        // 1.0 * 255.0 = 255.0 (max value)
        test_accumulate(vec![0.0625; 16]);
    }
    #[test]
    fn max_255_from_0_03125() {
        // 0.03125 * 32 = 1.0
        // 1.0 * 255.0 = 255.0 (max value)
        test_accumulate(vec![0.03125; 32]);
    }
    #[test]
    fn max_255_from_0_015625() {
        // 0.015625 * 64 = 1.0
        // 1.0 * 255.0 = 255.0 (max value)
        test_accumulate(vec![0.015625; 64]);
    }
    #[test]
    fn max_255_from_0_0078125() {
        // 0.0078125 * 128 = 1.0
        // 1.0 * 255.0 = 255.0 (max value)
        test_accumulate(vec![0.0078125; 128]);
    }

    #[test]
    fn simple_0() {
        test_accumulate(vec![]);
    }
    #[test]
    fn simple_1() {
        test_accumulate(vec![0.1]);
    }
    #[test]
    fn simple_2() {
        test_accumulate(vec![0.1, 0.2]);
    }
    #[test]
    fn simple_3() {
        test_accumulate(vec![0.1, 0.2, 0.3]);
    }
    #[test]
    fn simple_4() {
        test_accumulate(vec![0.1, 0.2, 0.3, 0.4]);
    }
    #[test]
    fn simple_5() {
        test_accumulate(vec![0.1, 0.2, 0.3, 0.4, 0.5]);
    }
    #[test]
    fn simple_6() {
        test_accumulate(vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6]);
    }
    #[test]
    fn simple_7() {
        test_accumulate(vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]);
    }
}

pub struct Raster {
    w: usize,
    h: usize,
    a: Vec<f32>,
}

// TODO: is there a faster way? (investigate whether approx recip is good enough)
fn recip(x: f32) -> f32 {
    x.recip()
}

impl Raster {
    pub fn new(w: usize, h: usize) -> Self {
        Self {
            w,
            h,
            a: vec![0.0; w * h + 4],
        }
    }

    pub fn draw_line(&mut self, p0: &Point, p1: &Point) {
        //println!("draw_line {} {}", p0, p1);
        if p0.y == p1.y {
            return;
        }
        let (dir, p0, p1) = if p0.y < p1.y {
            (1.0, p0, p1)
        } else {
            (-1.0, p1, p0)
        };
        let dxdy = (p1.x - p0.x) / (p1.y - p0.y);
        let mut x = p0.x;
        let y0 = p0.y as usize; // note: implicit max of 0 because usize (TODO: really true?)
        if p0.y < 0.0 {
            x -= p0.y * dxdy;
        }
        for y in y0..min(self.h, p1.y.ceil() as usize) {
            let linestart = y * self.w;
            let dy = ((y + 1) as f32).min(p1.y) - (y as f32).max(p0.y);
            let xnext = x + dxdy * dy;
            let d = dy * dir;
            let (x0, x1) = if x < xnext { (x, xnext) } else { (xnext, x) };
            let x0floor = x0.floor();
            let x0i = x0floor as i32;
            let x1ceil = x1.ceil();
            let x1i = x1ceil as i32;
            if x1i <= x0i + 1 {
                let xmf = 0.5 * (x + xnext) - x0floor;
                self.a[linestart + x0i as usize] += d - d * xmf;
                self.a[linestart + (x0i + 1) as usize] += d * xmf;
            } else {
                let s = recip(x1 - x0);
                let x0f = x0 - x0floor;
                let a0 = 0.5 * s * (1.0 - x0f) * (1.0 - x0f);
                let x1f = x1 - x1ceil + 1.0;
                let am = 0.5 * s * x1f * x1f;
                self.a[linestart + x0i as usize] += d * a0;
                if x1i == x0i + 2 {
                    self.a[linestart + (x0i + 1) as usize] += d * (1.0 - a0 - am);
                } else {
                    let a1 = s * (1.5 - x0f);
                    self.a[linestart + (x0i + 1) as usize] += d * (a1 - a0);
                    for xi in x0i + 2..x1i - 1 {
                        self.a[linestart + xi as usize] += d * s;
                    }
                    let a2 = a1 + (x1i - x0i - 3) as f32 * s;
                    self.a[linestart + (x1i - 1) as usize] += d * (1.0 - a2 - am);
                }
                self.a[linestart + x1i as usize] += d * am;
            }
            x = xnext;
        }
    }

    pub fn draw_quad(&mut self, p0: &Point, p1: &Point, p2: &Point) {
        //println!("draw_quad {} {} {}", p0, p1, p2);
        let devx = p0.x - 2.0 * p1.x + p2.x;
        let devy = p0.y - 2.0 * p1.y + p2.y;
        let devsq = devx * devx + devy * devy;
        if devsq < 0.333 {
            self.draw_line(p0, p2);
            return;
        }
        let tol = 3.0;
        let n = 1 + (tol * (devx * devx + devy * devy)).sqrt().sqrt().floor() as usize;
        //println!("n = {}", n);
        let mut p = *p0;
        let nrecip = recip(n as f32);
        let mut t = 0.0;
        for _i in 0..n - 1 {
            t += nrecip;
            let pn = Point::lerp(t, &Point::lerp(t, p0, p1), &Point::lerp(t, p1, p2));
            self.draw_line(&p, &pn);
            p = pn;
        }
        self.draw_line(&p, p2);
    }

    /*
    fn get_bitmap_fancy(&self) -> Vec<u8> {
        let mut acc = 0.0;
        // This would translate really well to SIMD
        self.a[0..self.w * self.h].iter().map(|&a| {
            acc += a;
            (255.0 * acc.abs().min(1.0)) as u8
            //(255.5 * (0.5 + 0.4 * acc)) as u8
        }).collect()
    }
    */

    pub fn get_bitmap(&self) -> Vec<u8> {
        accumulate(&self.a[0..self.w * self.h])
    }
}