bytecount 0.6.5

count occurrences of a given byte, or the number of UTF-8 code points, in a byte slice, fast
Documentation
extern crate packed_simd;

#[cfg(not(feature = "runtime-dispatch-simd"))]
use core::mem;
#[cfg(feature = "runtime-dispatch-simd")]
use std::mem;

use self::packed_simd::{u8x32, u8x64, FromCast};

/// Straggler mask table: 32 zero bytes followed by 32 bytes of `0xFF`.
///
/// Reading 32 consecutive bytes starting at index `r` (with `r <= 32`) yields
/// `32 - r` zero bytes followed by `r` bytes of `0xFF`, i.e. a mask that keeps
/// only the last `r` lanes of a 32-lane vector.
const MASK: [u8; 64] = {
    let mut table = [0u8; 64];
    // Only the upper half is set; the lower half stays zero.
    let mut idx = 32;
    while idx < 64 {
        table[idx] = 255;
        idx += 1;
    }
    table
};

/// Loads a `u8x64` from the bytes of `slice` that start at `offset`, with no
/// alignment requirement and no bounds checks.
///
/// # Safety
///
/// Both the sub-slicing (`get_unchecked`) and the vector load
/// (`from_slice_unaligned_unchecked`) are unchecked, so the caller must
/// guarantee that at least 64 bytes are available from `offset`, i.e.
/// `offset + 64 <= slice.len()`.
unsafe fn u8x64_from_offset(slice: &[u8], offset: usize) -> u8x64 {
    let tail = slice.get_unchecked(offset..);
    u8x64::from_slice_unaligned_unchecked(tail)
}
/// Loads a `u8x32` from the bytes of `slice` that start at `offset`, with no
/// alignment requirement and no bounds checks.
///
/// # Safety
///
/// Both the sub-slicing (`get_unchecked`) and the vector load
/// (`from_slice_unaligned_unchecked`) are unchecked, so the caller must
/// guarantee that at least 32 bytes are available from `offset`, i.e.
/// `offset + 32 <= slice.len()`.
unsafe fn u8x32_from_offset(slice: &[u8], offset: usize) -> u8x32 {
    let tail = slice.get_unchecked(offset..);
    u8x32::from_slice_unaligned_unchecked(tail)
}

/// Returns the sum of all lanes of `u8s`, widened to `usize`.
///
/// The vector is first spilled into a byte buffer of the same size as the
/// vector, then the bytes are added up.
fn sum_x64(u8s: &u8x64) -> usize {
    let mut lanes = [0; mem::size_of::<u8x64>()];
    u8s.write_to_slice_unaligned(&mut lanes);
    lanes.iter().fold(0usize, |acc, &lane| acc + lane as usize)
}
/// Returns the sum of all lanes of `u8s`, widened to `usize`.
///
/// The vector is first spilled into a byte buffer of the same size as the
/// vector, then the bytes are added up.
fn sum_x32(u8s: &u8x32) -> usize {
    let mut lanes = [0; mem::size_of::<u8x32>()];
    u8s.write_to_slice_unaligned(&mut lanes);
    lanes.iter().fold(0usize, |acc, &lane| acc + lane as usize)
}

/// Counts how many bytes of `haystack` are equal to `needle`.
///
/// The input is consumed in phases: blocks of 255 64-byte vectors, at most
/// one block of 128 64-byte vectors, a run of 32-byte vectors, and finally a
/// masked load of the last 32 bytes when the length is not a multiple of 32.
/// Per-lane `u8` counters are summed into a `usize` after every phase so that
/// they never accumulate more than 255 increments.
///
/// # Panics
///
/// Panics if `haystack` holds fewer than 32 bytes.
pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
    assert!(haystack.len() >= 32);

    let len = haystack.len();
    let wide_needle = u8x64::splat(needle);
    let narrow_needle = u8x32::splat(needle);

    let mut pos = 0;
    let mut total = 0;

    // SAFETY: every 64-byte load is guarded by a check that the whole block
    // it belongs to fits before `len`; every 32-byte load in the third phase
    // is guarded by `at + 32 <= len`; the straggler reads the last 32 bytes,
    // which exist because `len >= 32` was asserted; the `MASK` load starts at
    // an index below 32 in a 64-byte table.
    unsafe {
        // 16320 bytes per block: 255 vectors is the most a `u8` lane can count.
        while pos + 64 * 255 <= len {
            let mut lanes = u8x64::splat(0);
            for step in 0..255 {
                let hits = u8x64_from_offset(haystack, pos + step * 64).eq(wide_needle);
                // A matching lane is all ones after the cast, so the wrapping
                // subtraction increments that lane's counter by one.
                lanes -= u8x64::from_cast(hits);
            }
            pos += 64 * 255;
            total += sum_x64(&lanes);
        }

        // 8192 bytes: a single half-size block, if one still fits.
        if pos + 64 * 128 <= len {
            let mut lanes = u8x64::splat(0);
            for step in 0..128 {
                let hits = u8x64_from_offset(haystack, pos + step * 64).eq(wide_needle);
                lanes -= u8x64::from_cast(hits);
            }
            pos += 64 * 128;
            total += sum_x64(&lanes);
        }

        // Fewer than 8192 bytes remain, i.e. at most 255 whole 32-byte vectors.
        let mut lanes = u8x32::splat(0);
        let mut at = pos;
        while at + 32 <= len {
            let hits = u8x32_from_offset(haystack, at).eq(narrow_needle);
            lanes -= u8x32::from_cast(hits);
            at += 32;
        }
        total += sum_x32(&lanes);

        // Straggler: re-read the final 32 bytes and keep only the lanes that
        // were not covered above. A fresh accumulator is used because the
        // previous one may already hold 255 in a lane.
        let leftover = len % 32;
        if leftover != 0 {
            let hits = u8x32_from_offset(haystack, len - 32).eq(narrow_needle);
            let keep = u8x32_from_offset(&MASK, leftover);
            let mut tail = u8x32::splat(0);
            tail -= u8x32::from_cast(hits) & keep;
            total += sum_x32(&tail);
        }
    }

    total
}

/// Flags the lanes of `u8s` that do not hold a UTF-8 continuation byte.
///
/// A lane is flagged when its two most significant bits differ from `10`;
/// the resulting comparison mask is cast back to a `u8x64`.
fn is_leading_utf8_byte_x64(u8s: u8x64) -> u8x64 {
    let top_two_bits = u8s & u8x64::splat(0b1100_0000);
    let continuation_tag = u8x64::splat(0b1000_0000);
    u8x64::from_cast(top_two_bits.ne(continuation_tag))
}

/// Flags the lanes of `u8s` that do not hold a UTF-8 continuation byte.
///
/// A lane is flagged when its two most significant bits differ from `10`;
/// the resulting comparison mask is cast back to a `u8x32`.
fn is_leading_utf8_byte_x32(u8s: u8x32) -> u8x32 {
    let top_two_bits = u8s & u8x32::splat(0b1100_0000);
    let continuation_tag = u8x32::splat(0b1000_0000);
    u8x32::from_cast(top_two_bits.ne(continuation_tag))
}

/// Counts the bytes of `utf8_chars` that are not UTF-8 continuation bytes
/// (bytes whose top two bits are not `10`).
///
/// The input is consumed in phases: blocks of 255 64-byte vectors, at most
/// one block of 128 64-byte vectors, a run of 32-byte vectors, and finally a
/// masked load of the last 32 bytes when the length is not a multiple of 32.
/// Per-lane `u8` counters are summed into a `usize` after every phase so that
/// they never accumulate more than 255 increments.
///
/// # Panics
///
/// Panics if `utf8_chars` holds fewer than 32 bytes.
pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
    assert!(utf8_chars.len() >= 32);

    let len = utf8_chars.len();

    let mut pos = 0;
    let mut total = 0;

    // SAFETY: every 64-byte load is guarded by a check that the whole block
    // it belongs to fits before `len`; every 32-byte load in the third phase
    // is guarded by `at + 32 <= len`; the straggler reads the last 32 bytes,
    // which exist because `len >= 32` was asserted; the `MASK` load starts at
    // an index below 32 in a 64-byte table.
    unsafe {
        // 16320 bytes per block: 255 vectors is the most a `u8` lane can count.
        while pos + 64 * 255 <= len {
            let mut lanes = u8x64::splat(0);
            for step in 0..255 {
                let chunk = u8x64_from_offset(utf8_chars, pos + step * 64);
                // A flagged lane is all ones, so the wrapping subtraction
                // increments that lane's counter by one.
                lanes -= is_leading_utf8_byte_x64(chunk);
            }
            pos += 64 * 255;
            total += sum_x64(&lanes);
        }

        // 8192 bytes: a single half-size block, if one still fits.
        if pos + 64 * 128 <= len {
            let mut lanes = u8x64::splat(0);
            for step in 0..128 {
                let chunk = u8x64_from_offset(utf8_chars, pos + step * 64);
                lanes -= is_leading_utf8_byte_x64(chunk);
            }
            pos += 64 * 128;
            total += sum_x64(&lanes);
        }

        // Fewer than 8192 bytes remain, i.e. at most 255 whole 32-byte vectors.
        let mut lanes = u8x32::splat(0);
        let mut at = pos;
        while at + 32 <= len {
            let chunk = u8x32_from_offset(utf8_chars, at);
            lanes -= is_leading_utf8_byte_x32(chunk);
            at += 32;
        }
        total += sum_x32(&lanes);

        // Straggler: re-read the final 32 bytes and keep only the lanes that
        // were not covered above. A fresh accumulator is used because the
        // previous one may already hold 255 in a lane.
        let leftover = len % 32;
        if leftover != 0 {
            let flagged = is_leading_utf8_byte_x32(u8x32_from_offset(utf8_chars, len - 32));
            let keep = u8x32_from_offset(&MASK, leftover);
            let mut tail = u8x32::splat(0);
            tail -= flagged & keep;
            total += sum_x32(&tail);
        }
    }

    total
}