origin 0.26.2

Program startup and thread support written in Rust
Documentation
//! The following is derived from src/mem/x86_64.rs in Rust's
//! [compiler_builtins library] at revision
//! cb060052ab7e4bad408c85d44be7e60096e93e38.
//!
//! [compiler_builtins library]: https://github.com/rust-lang/compiler-builtins

// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
//  - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
//  - FSRM - Fast Short REP MOV (Ice Lake and later)
//  - Fast Zero-Length MOVSB (On no current hardware)
//  - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::arch::asm;
use core::mem;

/// Copies `count` bytes from `src` to `dest` with a single `movsb` string
/// instruction under a repeat prefix.
///
/// This variant is only compiled when the `ermsb` target feature is enabled
/// at build time.
///
/// # Safety
///
/// The instruction reads `count` bytes through `src` and writes `count` bytes
/// through `dest`, so both ranges must be valid for those accesses.
/// NOTE(review): the direction flag is not touched here; presumably it is
/// clear on entry so the copy proceeds upward — confirm against the ABI.
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    unsafe {
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "repe movsb (%rsi), (%rdi)",
            // `rcx` carries the repeat count and `rdi`/`rsi` the destination
            // and source cursors. The instruction updates all three, so their
            // final values are declared as outputs and discarded.
            inout("rcx") count => _,
            inout("rdi") dest => _,
            inout("rsi") src => _,
            options(att_syntax, nostack, preserves_flags)
        );
    }
}

/// Copies `count` bytes from `src` to `dest`, advancing both cursors upward.
///
/// The work is split into three `rep` runs whose sizes come from `rep_param`:
/// a byte-wise head, a qword-wise middle, and a byte-wise tail.
///
/// This variant is compiled when the `ermsb` target feature is *not* enabled.
///
/// # Safety
///
/// `src` must be valid for reads of `count` bytes and `dest` must be valid for
/// writes of `count` bytes. NOTE(review): the direction flag is not touched
/// here; presumably it is clear on entry — confirm against the ABI.
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    unsafe {
        let (head_bytes, qwords, tail_bytes) = rep_param(dest, count);

        // Cursors threaded through the asm blocks: each block receives them in
        // `rdi`/`rsi` and hands back the values the instruction left there.
        let mut to = dest;
        let mut from = src;

        // Three separate blocks (rather than one) give the compiler more
        // freedom to reorder instructions around them.
        asm!(
            "rep movsb",
            inout("ecx") head_bytes => _,
            inout("rdi") to,
            inout("rsi") from,
            options(att_syntax, nostack, preserves_flags)
        );
        asm!(
            "rep movsq",
            inout("rcx") qwords => _,
            inout("rdi") to,
            inout("rsi") from,
            options(att_syntax, nostack, preserves_flags)
        );
        // Last run: the final cursor values are not needed any more.
        asm!(
            "rep movsb",
            inout("ecx") tail_bytes => _,
            inout("rdi") to => _,
            inout("rsi") from => _,
            options(att_syntax, nostack, preserves_flags)
        );
    }
}

/// Copies `count` bytes from `src` to `dest`, starting at the last byte and
/// walking downward.
///
/// The copy runs with the direction flag set (`std`) and is split into the
/// three runs computed by `rep_param`, executed in reverse order: the
/// byte-wise tail first, then the qword-wise middle, then the byte-wise head.
/// The direction flag is cleared again (`cld`) before the block ends.
///
/// A `count` of zero is a no-op.
///
/// # Safety
///
/// `src` must be valid for reads of `count` bytes and `dest` must be valid for
/// writes of `count` bytes.
#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    // Nothing to copy. Returning early also keeps `count - 1` below from
    // underflowing and producing an out-of-range pointer.
    if count == 0 {
        return;
    }
    unsafe {
        let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
        // We can't separate this block due to std/cld
        asm!(
            "std",
            // Tail bytes, copied downward from the very last byte.
            "rep movsb",
            // The cursors point at the last byte of the final qword; step back
            // so they point at its first byte for the qword-wise run.
            "sub $7, %rsi",
            "sub $7, %rdi",
            "mov {qword_count}, %rcx",
            "rep movsq",
            "test {pre_byte_count:e}, {pre_byte_count:e}",
            // Undo the 7-byte adjustment so the cursors address single bytes
            // again for the head run.
            "add $7, %rsi",
            "add $7, %rdi",
            "mov {pre_byte_count:e}, %ecx",
            "rep movsb",
            "cld",
            pre_byte_count = in(reg) pre_byte_count,
            qword_count = in(reg) qword_count,
            inout("ecx") byte_count => _,
            inout("rdi") dest.add(count - 1) => _,
            inout("rsi") src.add(count - 1) => _,
            // The direction flag is restored by `cld`, but `sub`/`add`/`test`
            // clobber the arithmetic status flags, so `preserves_flags` must
            // not be claimed here.
            options(att_syntax, nostack)
        );
    }
}

/// Stores the byte `c` into `count` consecutive bytes starting at `dest`,
/// using a single `stosb` string instruction under a repeat prefix.
///
/// This variant is only compiled when the `ermsb` target feature is enabled
/// at build time.
///
/// # Safety
///
/// The instruction writes `count` bytes through `dest`, so that range must be
/// valid for writes. NOTE(review): the direction flag is not touched here;
/// presumably it is clear on entry so the fill proceeds upward — confirm
/// against the ABI.
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    unsafe {
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "repe stosb %al, (%rdi)",
            // `rcx` carries the repeat count, `rdi` the destination cursor and
            // `al` the fill byte; all are declared as in/out operands whose
            // final values are discarded.
            inout("rcx") count => _,
            inout("rdi") dest => _,
            inout("al") c => _,
            options(att_syntax, nostack, preserves_flags)
        )
    }
}

/// Stores the byte `c` into `count` consecutive bytes starting at `dest`.
///
/// The fill is split into three `rep` runs whose sizes come from `rep_param`:
/// a byte-wise head, a qword-wise middle, and a byte-wise tail.
///
/// This variant is compiled when the `ermsb` target feature is *not* enabled.
///
/// # Safety
///
/// `dest` must be valid for writes of `count` bytes. NOTE(review): the
/// direction flag is not touched here; presumably it is clear on entry —
/// confirm against the ABI.
#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    unsafe {
        // Replicate the fill byte into all eight lanes so the same register
        // serves both `stosb` (low byte) and `stosq` (whole qword).
        let pattern = u64::from_ne_bytes([c; 8]);
        let (head_bytes, qwords, tail_bytes) = rep_param(dest, count);

        // Destination cursor threaded through the asm blocks.
        let mut cursor = dest;

        // Three separate blocks (rather than one) give the compiler more
        // freedom to reorder instructions around them.
        asm!(
            "rep stosb",
            inout("ecx") head_bytes => _,
            inout("rdi") cursor,
            in("rax") pattern,
            options(att_syntax, nostack, preserves_flags)
        );
        asm!(
            "rep stosq",
            inout("rcx") qwords => _,
            inout("rdi") cursor,
            in("rax") pattern,
            options(att_syntax, nostack, preserves_flags)
        );
        // Last run: the final cursor value is not needed any more.
        asm!(
            "rep stosb",
            inout("ecx") tail_bytes => _,
            inout("rdi") cursor => _,
            in("rax") pattern,
            options(att_syntax, nostack, preserves_flags)
        );
    }
}

/// Compares the `n` bytes at `a` with the `n` bytes at `b`.
///
/// Returns `0` when the ranges are equal; otherwise returns the first
/// differing byte of `a` minus the corresponding byte of `b`, both widened to
/// `i32`.
///
/// The comparison starts with 16-byte chunks and, on the first mismatch (or
/// for the leftover tail), hands over to progressively narrower chunk sizes
/// until the exact byte is located.
///
/// # Safety
///
/// Both `a` and `b` must be valid for reads of `n` bytes. No alignment is
/// required; wide chunks are read with `read_unaligned`.
#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    unsafe {
        // Scans `len` bytes in `W`-sized chunks. The first unequal chunk, or
        // the sub-chunk remainder at the end, is passed to `narrower`, which
        // re-examines it at the next smaller width `N`.
        #[inline(always)]
        unsafe fn scan<W, N, F>(
            mut lhs: *const W,
            mut rhs: *const W,
            len: usize,
            narrower: F,
        ) -> i32
        where
            W: Clone + Copy + Eq,
            N: Clone + Copy + Eq,
            F: FnOnce(*const N, *const N, usize) -> i32,
        {
            unsafe {
                let width = mem::size_of::<W>();
                // Ensure W is not a ZST.
                assert!(width != 0);

                for _ in 0..len / width {
                    if lhs.read_unaligned() != rhs.read_unaligned() {
                        return narrower(lhs.cast(), rhs.cast(), width);
                    }
                    lhs = lhs.add(1);
                    rhs = rhs.add(1);
                }
                narrower(lhs.cast(), rhs.cast(), len % width)
            }
        }

        // Narrowest level: find the differing byte and report the difference.
        let by_u8 = |lhs: *const u8, rhs: *const u8, len: usize| -> i32 {
            for i in 0..len {
                let (x, y) = (lhs.add(i).read(), rhs.add(i).read());
                if x != y {
                    return i32::from(x) - i32::from(y);
                }
            }
            0
        };
        let by_u16 = |lhs: *const u16, rhs: *const u16, len: usize| scan(lhs, rhs, len, by_u8);
        let by_u32 = |lhs: *const u32, rhs: *const u32, len: usize| scan(lhs, rhs, len, by_u16);
        let by_u64 = |lhs: *const u64, rhs: *const u64, len: usize| scan(lhs, rhs, len, by_u32);
        let by_u128 = |lhs: *const u128, rhs: *const u128, len: usize| scan(lhs, rhs, len, by_u64);
        by_u128(a.cast(), b.cast(), n)
    }
}

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.

/// Returns the number of bytes before the first NUL byte of the C string at
/// `s`.
///
/// The first four bytes are checked one at a time; after that the string is
/// scanned in 16-byte-aligned blocks, each loaded through inline assembly and
/// tested for a zero byte with SSE2 compare/movemask.
///
/// # Safety
///
/// `s` must point to a NUL-terminated byte sequence. Whole aligned 16-byte
/// blocks are loaded, so bytes outside the string itself (before `s` within
/// the first block, and after the terminator within the last) are read too.
#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    unsafe {
        use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

        // Aligned 16-byte load, done in asm rather than through a Rust-level
        // dereference because the block may extend past the string.
        let load_block = |block: *const __m128i| -> __m128i {
            let value;
            asm!(
                "movdqa ({addr}), {dest}",
                addr = in(reg) block,
                dest = out(xmm_reg) value,
                options(att_syntax, nostack),
            );
            value
        };

        // The vector path speeds up long strings but is not cheap by itself,
        // so possibly-short strings get a plain byte loop first.
        let mut n = 0usize;
        while n < 4 {
            if *s == 0 {
                return n;
            }
            s = s.add(1);
            n += 1;
        }

        // Shave off the least significant bits to align the address to a
        // 16-byte boundary. The shaved-off bits are used to correct the first
        // block's result.
        let misalign = s.addr() & 15;
        let mut block = s.with_addr(s.addr() - misalign).cast::<__m128i>();
        let zero = _mm_set1_epi8(0);

        // First block: shift out the mask bits belonging to bytes that lie
        // before `s`.
        let first_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(load_block(block), zero)) >> misalign;
        if first_mask != 0 {
            return n + first_mask.trailing_zeros() as usize;
        }
        n += 16 - misalign;
        block = block.add(1);

        // Remaining blocks are fully inside the scan range.
        loop {
            let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(load_block(block), zero)) as u32;
            if mask != 0 {
                return n + mask.trailing_zeros() as usize;
            }
            n += 16;
            block = block.add(1);
        }
    }
}

// Provided for scenarios like kernel development, where SSE might not
// be available.
/// Returns the number of bytes before the first NUL byte of the C string at
/// `s`, without using SSE.
///
/// Bytes are checked one at a time until the pointer reaches an 8-byte
/// boundary; from there the string is scanned one aligned 8-byte word at a
/// time, each loaded through inline assembly.
///
/// # Safety
///
/// `s` must point to a NUL-terminated byte sequence. Whole aligned 8-byte
/// words are loaded, so bytes after the terminator within the last word are
/// read too.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    unsafe {
        let mut n = 0;

        // Check bytes in steps of one until
        // either a zero byte is discovered or
        // pointer is aligned to an eight byte boundary.

        while s.addr() & 7 != 0 {
            if *s == 0 {
                return n;
            }
            n += 1;
            s = s.add(1);
        }

        // Check bytes in steps of eight until a zero
        // byte is discovered.

        let mut s = s.cast::<u64>();

        loop {
            let mut cs = {
                let r: u64;
                asm!(
                    "mov ({addr}), {dest}",
                    addr = in(reg) s,
                    dest = out(reg) r,
                    options(att_syntax, nostack),
                );
                r
            };
            // Detect if a word has a zero byte, taken from
            // https://graphics.stanford.edu/~seander/bithacks.html
            if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
                // The word contains the terminator: walk its bytes from the
                // lowest upward to find the exact position.
                loop {
                    if cs & 255 == 0 {
                        return n;
                    } else {
                        cs >>= 8;
                        n += 1;
                    }
                }
            } else {
                n += 8;
                s = s.add(1);
            }
        }
    }
}

/// Splits a `rep`-based operation on `count` bytes at `dest` into three runs.
///
/// Returns `(pre_byte_count, qword_count, byte_count)`:
/// * `pre_byte_count` — bytes needed to bring `dest` up to the next 8-byte
///   boundary, capped at `count`;
/// * `qword_count` — whole 8-byte words in what remains;
/// * `byte_count` — leftover bytes (fewer than eight) after those words.
fn rep_param(dest: *mut u8, count: usize) -> (usize, usize, usize) {
    const QWORD: usize = 8;

    // Unaligned writes are still slow on modern processors, so the head run
    // exists purely to align the destination address.
    let misalignment = dest.addr() % QWORD;
    let to_boundary = (QWORD - misalignment) % QWORD;
    let head = usize::min(to_boundary, count);

    let rest = count - head;
    (head, rest / QWORD, rest % QWORD)
}