# This file is part of faster, the SIMD library for humans.
# Copyright 2017 Adam Niederer <adam.niederer@gmail.com>
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Who needs procedural macros when you have code generators?
# Pieces of the output path: generate_vec_patterns writes to {root}/<arch>/{filename}.
root = "../src/arch"
filename = "vec_patterns.rs"
# Kept in a variable because an f-string replacement field could not contain a
# backslash before Python 3.12; see
# https://stackoverflow.com/questions/44780357/how-to-use-newline-n-in-f-string-to-format-output-in-python-3-6
newline = "\n"


def header(imports):
    """Return the preamble placed at the top of a generated Rust file.

    The preamble consists of the licence notice, the "machine generated"
    warning, the fixed ``use`` lines, the caller-supplied ``imports`` (one per
    line) and the ``PART_MASK`` byte table.

    Args:
        imports: Iterable of extra Rust source lines, joined with newlines and
            inserted after the fixed ``use`` lines.

    Returns:
        The preamble text, with leading whitespace stripped and a trailing
        newline after the ``PART_MASK`` definition.
    """
    extra_uses = newline.join(imports)
    # PART_MASK is 64 zero bytes followed by 64 0xFF bytes, written eight per row.
    mask_rows = [", ".join([byte] * 8) for byte in ["0x00"] * 8 + ["0xFF"] * 8]
    mask_table = ",\n".join(mask_rows)
    preamble = f"""
// This file is part of faster, the SIMD library for humans.
// Copyright 2017 Adam Niederer <adam.niederer@gmail.com>
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// THIS FILE IS MACHINE GENERATED.
// See vec_patterns_gen.py for more information.
#![allow(unused_imports)]
use crate::arch::current::vecs::*;
use crate::std::mem::transmute;
use crate::vecs::*;
{extra_uses}
const PART_MASK: [u8; 128] = [{mask_table}];
"""
    return preamble.lstrip()
def generate_vec_patterns(arch, headers, els, vecs, lens, feats, blends, elsz, masks):
    """Write the generated ``impl Pattern`` blocks for one architecture.

    The output goes to ``{root}/{arch}/{filename}``: first the shared preamble
    returned by ``header(headers)``, then one ``impl Pattern for <vec>`` block
    per entry of the parallel per-vector lists. Entry ``i`` of every list
    describes the same vector type, for example::

        vecs[i]   = 'u8x16'            els[i]   = 'u8'
        lens[i]   = 16                 elsz[i]  = 8
        feats[i]  = 'sse4.1'           masks[i] = 'u8'
        blends[i] = '_mm_blendv_epi8'

    Args:
        arch: Sub-directory of ``root`` to write into, e.g. ``"x86"``.
        headers: Extra Rust source lines passed through to ``header``.
        els: Element type name per vector. Not used in the emitted code; only
            its length is checked.
        vecs: Vector type names used in ``impl Pattern for <vec>``.
        lens: Lane count per vector, i.e. how many arguments ``Self::new``
            receives.
        feats: ``target_feature`` name that selects the intrinsic-based
            ``partition`` over the ``match``-based fallback.
        blends: Name of the intrinsic called by the intrinsic-based
            ``partition``.
        elsz: Element size in bits; one hex digit is emitted per four bits in
            the ``ones``/``zeroes`` literals.
        masks: Integer type suffix appended to the ``ones``/``zeroes``
            literals.

    Raises:
        ValueError: If the per-vector lists do not all have the same length.
            Raised before the output file is opened.
    """
    # zip() stops at the shortest input, which would silently drop vector types
    # from the generated file; check the lengths up front instead.
    els, vecs, lens, feats, blends, elsz, masks = (
        list(column) for column in (els, vecs, lens, feats, blends, elsz, masks)
    )
    sizes = {
        "els": len(els),
        "vecs": len(vecs),
        "lens": len(lens),
        "feats": len(feats),
        "blends": len(blends),
        "elsz": len(elsz),
        "masks": len(masks),
    }
    if len(set(sizes.values())) > 1:
        raise ValueError(f"per-vector lists must have equal lengths, got {sizes}")

    with open(f"{root}/{arch}/{filename}", 'w') as f:
        def fprint(x):
            print(x, file=f)

        fprint(header(headers))
        for _el, v, l, ft, b, s, m in zip(els, vecs, lens, feats, blends, elsz, masks):
            # Generate halfs
            fprint(f"impl Pattern for {v} {{")
            fprint(" #[inline(always)]")
            fprint(" fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self {")
            first = ", ".join(["hi"] * (l // 2))
            second = ", ".join(["lo"] * (l // 2))
            fprint(f" Self::new({first}, {second})")
            fprint(" }\n")

            # Generate interleave
            fprint(" #[inline(always)]")
            fprint(" fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self {")
            args = ", ".join(["hi, lo"] * (l // 2))
            fprint(f" Self::new({args})")
            fprint(" }")

            # Generate partition_mask
            fprint(
                "\n"
                "#[inline(always)]\n"
                "fn partition_mask(off: usize) -> Self {\n"
                "debug_assert!(off <= Self::WIDTH);\n"
                "debug_assert!(off * Self::Scalar::SIZE <= 64);\n"
                "Self::load(unsafe { transmute(&PART_MASK[..]) }, 64 / Self::Scalar::SIZE - off)\n"
                "}"
            )

            # Generate the intrinsic-based partition, compiled when the target
            # feature is enabled.
            fprint(
                "\n"
                "#[inline(always)]\n"
                f'#[cfg(target_feature = "{ft}")]\n'
                "fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self {\n"
                "optimized!();\n"
                f"unsafe {{ transmute({b}(transmute(Self::splat(hi)), transmute(Self::splat(lo)), transmute(Self::partition_mask(off)))) }}\n"
                "}\n"
            )

            # Generate partition polyfill: one match arm per possible offset.
            fprint(" #[inline(always)]")
            fprint(f' #[cfg(not(target_feature = "{ft}"))]')
            fprint(" fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self {")
            fprint(" assert!(off <= Self::WIDTH);")
            fprint(" fallback!();")
            fprint(" match off {")
            for i in range(l + 1):
                first = ", ".join(["hi"] * i)
                second = ", ".join(["lo"] * (l - i))
                # At i == 0 or i == l one side is empty; strip the dangling separator.
                args = ", ".join((first, second)).strip(", ")
                fprint(f" {i} => Self::new({args}),")
            fprint(" _ => unreachable!()")
            fprint(" }")
            fprint(" }")

            # Generate ones & zeroes
            all_ones = "F" * (s // 4)
            all_zeroes = "0" * (s // 4)
            fprint(
                "\n"
                "/// Return a vector made entirely of ones.\n"
                "#[inline(always)]\n"
                "fn ones() -> Self {\n"
                f"Self::splat(unsafe {{ transmute(0x{all_ones}{m}) }})\n"
                "}\n"
                "/// Return a vector made entirely of zeroes.\n"
                "#[inline(always)]\n"
                "fn zeroes() -> Self {\n"
                f"Self::splat(unsafe {{ transmute(0x{all_zeroes}{m}) }})\n"
                "}"
            )
            fprint("}\n")
# Down here we do all architecture dependent stuff.
def _generate_arch(arch, imports, vecs, feat_by_width, blend_by_width):
    """Derive the per-vector columns from the vector names and emit one file.

    Each name in ``vecs`` has the form ``<element>x<lanes>`` (e.g. ``"u8x16"``),
    where the element is a one-letter kind followed by its size in bits.

    Args:
        arch: Architecture directory name passed to ``generate_vec_patterns``.
        imports: Extra Rust source lines for the generated file's preamble.
        vecs: Vector type names to generate ``Pattern`` impls for.
        feat_by_width: Maps total vector width in bits (lanes * element bits)
            to the ``target_feature`` name.
        blend_by_width: Maps total vector width in bits to the blend intrinsic
            name.

    Raises:
        KeyError: If a vector's width is missing from either mapping.
    """
    els = [v.split("x")[0] for v in vecs]
    lens = [int(v.split("x")[1]) for v in vecs]
    # "u8" / "i16" / "f32" -> 8 / 16 / 32
    elsz = [int(el[1:]) for el in els]
    # Unsigned integer type with the same bit size as the element.
    masks = ["u" + el[1:] for el in els]
    widths = [lanes * bits for lanes, bits in zip(lens, elsz)]
    feats = [feat_by_width[width] for width in widths]
    blends = [blend_by_width[width] for width in widths]
    # Generate file
    generate_vec_patterns(arch, imports, els, vecs, lens, feats, blends, elsz, masks)


_generate_arch(
    "x86",
    ["use vektor::x86::*;"],
    ["u8x64", "u8x32", "u8x16", "i8x64", "i8x32", "i8x16", "u16x32", "u16x16", "u16x8", "i16x32", "i16x16", "i16x8", "u32x16", "u32x8", "u32x4", "i32x16", "i32x8", "i32x4", "f32x16", "f32x8", "f32x4", "u64x8", "u64x4", "u64x2", "i64x8", "i64x4", "i64x2", "f64x8", "f64x4", "f64x2"],
    {512: "avx512-notyet", 256: "avx2", 128: "sse4.1"},
    {512: "_mm512_mask_mov_epi8", 256: "_mm256_blendv_epi8", 128: "_mm_blendv_epi8"},
)

_generate_arch(
    "unknown",
    [],
    ["u8x16", "i8x16", "u16x8", "i16x8", "u32x4", "i32x4", "f32x4", "u64x2", "i64x2", "f64x2"],
    {128: "__undefined"},
    {128: "__undefined"},
)