//! A highly optimized version of SeaHash.

use core::slice;

use diffuse;

/// Read a buffer smaller than 8 bytes into an integer in little-endian.
///
/// This assumes that `buf.len() < 8`. If this is not satisfied, the behavior is unspecified.
#[inline(always)]
fn read_int(buf: &[u8]) -> u64 {
    // Because we want to make sure that it is register allocated, we fetch this into a variable.
    // It will likely make no difference anyway, though.
    let ptr = buf.as_ptr();

    unsafe {
        // Break it down to reads of integers with widths in total spanning the buffer. This minimizes
        // the number of reads
        match buf.len() {
            // u8.
            1 => *ptr as u64,
            // u16.
            2 => (*(ptr as *const u16)).to_le() as u64,
            // u16 + u8.
            3 => {
                let a = (*(ptr as *const u16)).to_le() as u64;
                let b = *ptr.offset(2) as u64;

                a | (b << 16)
            },
            // u32.
            4 => (*(ptr as *const u32)).to_le() as u64,
            // u32 + u8.
            5 => {
                let a = (*(ptr as *const u32)).to_le() as u64;
                let b = *ptr.offset(4) as u64;

                a | (b << 32)
            },
            // u32 + u16.
            6 => {
                let a = (*(ptr as *const u32)).to_le() as u64;
                let b = (*(ptr.offset(4) as *const u16)).to_le() as u64;

                a | (b << 32)
            },
            // u32 + u16 + u8.
            7 => {
                let a = (*(ptr as *const u32)).to_le() as u64;
                let b = (*(ptr.offset(4) as *const u16)).to_le() as u64;
                let c = *ptr.offset(6) as u64;

                a | (b << 32) | (c << 48)
            },
            _ => 0,
        }
    }
}

/// Read a little-endian 64-bit integer from some buffer.
#[inline(always)]
unsafe fn read_u64(ptr: *const u8) -> u64 {
    #[cfg(target_pointer_width = "32")]
    {
        (*(ptr as *const u32)).to_le() as u64 | ((*(ptr as *const u32)).to_le() as u64) << 32
    }

    #[cfg(target_pointer_width = "64")]
    {
        (*(ptr as *const u64)).to_le()
    }
}

/// Hash some buffer.
///
/// This is a highly optimized implementation of SeaHash. It implements numerous techniques to
/// improve performance:
///
/// - Register allocation: This makes a great deal out of making sure everything fits into
///   registers such that minimal memory accesses are needed. This works quite successfully on most
///   CPUs, and the only time it reads from memory is when it fetches the data of the buffer.
/// - SIMD reads: Like most other good hash functions, we read 8 bytes a time. This improves things
///   a lot
/// - Independent updates: We make sure very few statements next to each other depends on the
///   other. This means that almost always the CPU will be able to run the instructions in parallel.
/// - Loop unrolling: The hot loop is unrolled such that very little branches (one every 32 bytes)
///   are needed.
///
/// and more.
pub fn hash(buf: &[u8]) -> u64 {
    unsafe {
        // We use 4 different registers to store seperate hash states, because this allows us to update
        // them seperately, and consequently exploiting ILP to update the states in parallel.
        let mut a = 0x16f11fe89b0d677c;
        let mut b = 0xb480a793d8e6c86c;
        let mut c = 0x6fe2e5aaf078ebc9;
        let mut d = 0x14f994a4c5259381;

        // The pointer to the current bytes.
        let mut ptr = buf.as_ptr();
        /// The end of the "main segment", i.e. the biggest buffer s.t. the length is divisible by
        /// 32.
        let end_ptr = buf.as_ptr().offset(buf.len() as isize & !0x1F) as usize;

        while end_ptr > ptr as usize {
            // Read and diffuse the next 4 64-bit little-endian integers from their bytes. Note
            // that we on purpose not use `^=` and co., because it aliases the lvalue, making it
            // harder for LLVM to register allocate (it will have to inline the value behind the
            // pointer, effectively assuming that it is not aliased, which can be hard to prove).

            // Placing these updates inplace can have some negative consequences on especially
            // older architectures, where they can block ILP because they assume the evaluation of
            // the old `byte` is executed, which might trigger the diffusion to run serially.
            // However, not introducing a tmp register makes sure that you don't push from the
            // register to the stack, which comes with a performance hit.
            a = a ^ read_u64(ptr);
            ptr = ptr.offset(8);

            b = b ^ read_u64(ptr);
            ptr = ptr.offset(8);

            c = c ^ read_u64(ptr);
            ptr = ptr.offset(8);

            d = d ^ read_u64(ptr);
            ptr = ptr.offset(8);

            // Diffuse the updated registers. We hope that each of these are executed in parallel.
            a = diffuse(a);
            b = diffuse(b);
            c = diffuse(c);
            d = diffuse(d);
        }

        // Rename the register (we do this to make it easier for LLVM to reallocate the register).
        let mut excessive = end_ptr;
        // Calculate the number of excessive bytes. These are bytes that could not be handled in
        // the loop above.
        excessive = buf.len() as usize + buf.as_ptr() as usize - excessive as usize;
        // Handle the excessive bytes.
        match excessive {
            0 => {},
            1...7 => {
                // 1 or more excessive.

                // Write the last excessive bytes (<8 bytes).
                a = a ^ read_int(slice::from_raw_parts(ptr as *const u8, excessive));

                // Diffuse.
                a = diffuse(a);
            },
            8 => {
                // 8 bytes excessive.

                // Update `a`.
                a = a ^ read_u64(ptr);

                // Diffuse.
                a = diffuse(a);
            },
            9...15 => {
                // More than 8 bytes excessive.

                // Update `a`.
                a = a ^ read_u64(ptr);
                ptr = ptr.offset(8);

                // Write the last excessive bytes (<8 bytes).
                excessive = excessive - 8;
                b = b ^ read_int(slice::from_raw_parts(ptr as *const u8, excessive));

                // Diffuse.
                a = diffuse(a);
                b = diffuse(b);

            },
            16 => {
                // 16 bytes excessive.

                // Update `a`.
                a = a ^ read_u64(ptr);
                ptr = ptr.offset(8);
                // Update `b`.
                b = b ^ read_u64(ptr);

                // Diffuse.
                a = diffuse(a);
                b = diffuse(b);
            },
            17...23 => {
                // 16 bytes or more excessive.

                // Update `a`.
                a = a ^ read_u64(ptr);
                ptr = ptr.offset(8);
                // Update `b`.
                b = b ^ read_u64(ptr);
                ptr = ptr.offset(8);

                // Write the last excessive bytes (<8 bytes).
                excessive = excessive - 16;
                c = c ^ read_int(slice::from_raw_parts(ptr as *const u8, excessive));

                // Diffuse.
                a = diffuse(a);
                b = diffuse(b);
                c = diffuse(c);
            },
            24 => {
                // 24 bytes excessive.

                // Update `a`.
                a = a ^ read_u64(ptr);
                ptr = ptr.offset(8);
                // Update `b`.
                b = b ^ read_u64(ptr);
                ptr = ptr.offset(8);
                // Update `c`.
                c = c ^ read_u64(ptr);

                // Diffuse.
                a = diffuse(a);
                b = diffuse(b);
                c = diffuse(c);
            },
            _ => {
                // More than 24 bytes excessive.

                // Update `a`.
                a = a ^ read_u64(ptr);
                ptr = ptr.offset(8);
                // Update `b`.
                b = b ^ read_u64(ptr);
                ptr = ptr.offset(8);
                // Update `c`.
                c = c ^ read_u64(ptr);
                ptr = ptr.offset(8);

                // Write the last excessive bytes (<8 bytes).
                excessive = excessive - 24;
                d = d ^ read_int(slice::from_raw_parts(ptr as *const u8, excessive));

                // Diffuse.
                a = diffuse(a);
                b = diffuse(b);
                c = diffuse(c);
                d = diffuse(d);

            }
        }

        // XOR the states together. Even though XOR is commutative, it doesn't matter, because the
        // state vector's initial components are mutually distinct, and thus swapping even and odd
        // chunks will affect the result, because it is sensitive to the initial condition.
        a = a ^ b;
        c = c ^ d;
        a = a ^ c;
        // XOR the number of written bytes in order to make the excessive bytes zero-sensitive
        // (without this, two excessive zeros would be equivalent to three excessive zeros). This
        // is know as length padding.
        a = a ^ buf.len() as u64;

        // We diffuse to make the excessive bytes discrete (i.e. small changes shouldn't give small
        // changes in the output).
        diffuse(a)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use reference;

    fn hash_match(a: &[u8]) {
        assert_eq!(hash(a), reference::hash(a));
    }

    #[test]
    fn zero() {
        let arr = [0; 4096];
        for n in 0..4096 {
            hash_match(&arr[0..n]);
        }
    }

    #[test]
    fn seq() {
        let mut buf = [0; 4096];
        for i in 0..4096 {
            buf[i] = i as u8;
        }
        hash_match(&buf);
    }


    #[test]
    fn position_depedent() {
        let mut buf1 = [0; 4098];
        for i in 0..4098 {
            buf1[i] = i as u8;
        }
        let mut buf2 = [0; 4098];
        for i in 0..4098 {
            buf2[i] = i as u8 ^ 1;
        }

        assert!(hash(&buf1) != hash(&buf2));
    }

    #[test]
    fn shakespear() {
        hash_match(b"to be or not to be");
        hash_match(b"love is a wonderful terrible thing");
    }

    #[test]
    fn zero_senitive() {
        assert_ne!(hash(&[1, 2, 3, 4]), hash(&[1, 0, 2, 3, 4]));
        assert_ne!(hash(&[1, 2, 3, 4]), hash(&[1, 0, 0, 2, 3, 4]));
        assert_ne!(hash(&[1, 2, 3, 4]), hash(&[1, 2, 3, 4, 0]));
        assert_ne!(hash(&[1, 2, 3, 4]), hash(&[0, 1, 2, 3, 4]));
        assert_ne!(hash(&[0, 0, 0]), hash(&[0, 0, 0, 0, 0]));
    }

    #[test]
    fn not_equal() {
        assert_ne!(hash(b"to be or not to be "), hash(b"to be or not to be"));
        assert_ne!(hash(b"jkjke"), hash(b"jkjk"));
        assert_ne!(hash(b"ijkjke"), hash(b"ijkjk"));
        assert_ne!(hash(b"iijkjke"), hash(b"iijkjk"));
        assert_ne!(hash(b"iiijkjke"), hash(b"iiijkjk"));
        assert_ne!(hash(b"iiiijkjke"), hash(b"iiiijkjk"));
        assert_ne!(hash(b"iiiiijkjke"), hash(b"iiiiijkjk"));
        assert_ne!(hash(b"iiiiiijkjke"), hash(b"iiiiiijkjk"));
        assert_ne!(hash(b"iiiiiiijkjke"), hash(b"iiiiiiijkjk"));
        assert_ne!(hash(b"iiiiiiiijkjke"), hash(b"iiiiiiiijkjk"));
        assert_ne!(hash(b"ab"), hash(b"bb"));
    }
}