stringzilla 4.6.0

Search, hash, sort, fingerprint, and fuzzy-match strings faster via SWAR, SIMD, and GPGPU
/**
 *  @file       stringzillas.cu
 *  @brief      StringZillas library shared code for parallel string operations using CPU & CUDA backends.
 *  @author     Ash Vardanian
 *  @date       March 23, 2025
 */

#if !defined(FU_ENABLE_NUMA)
#define FU_ENABLE_NUMA 0
#endif

#include <stringzillas/stringzillas.h> // StringZillas library header

#include <variant>        // For `std::variant`
#include <cassert>        // For `assert`
#include <cstdlib>        // For `std::malloc` and `std::free`
#include <cstring>        // For `std::memcpy`
#include <memory>         // For `std::unique_ptr` and `std::assume_aligned`
#include <new>            // For `std::launder` and `std::nothrow`
#include <string_view>    // For `std::string_view`
#include <thread>         // For `std::thread::hardware_concurrency`
#include <type_traits>    // For `std::decay_t`
#include <fork_union.hpp> // Fork-join scoped thread pool

#include <stringzillas/fingerprints.hpp> // C++ templates for string processing
#include <stringzillas/similarities.hpp> // C++ templates for string similarity

#if SZ_USE_CUDA
#include <stringzillas/fingerprints.cuh> // Parallel string processing in CUDA
#include <stringzillas/similarities.cuh> // Parallel string similarity in CUDA
#endif

namespace fu = ashvardanian::fork_union;
namespace sz = ashvardanian::stringzilla;
namespace szs = ashvardanian::stringzillas;

using malloc_t = std::allocator<char>;
#if SZ_USE_CUDA
using ualloc_t = szs::unified_alloc_t;
#endif // SZ_USE_CUDA

/** Helper class for `std::visit` to handle multiple callable types in a single variant. */
template <typename... callable_types_>
struct overloaded : callable_types_... {
    using callable_types_::operator()...;
};
template <typename... callable_types_>
overloaded(callable_types_...) -> overloaded<callable_types_...>;
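
// A minimal usage sketch (illustrative only, not part of this file's logic):
// `overloaded` lets `std::visit` dispatch a `std::variant` to per-type lambdas.
//
//     std::variant<int, std::string_view> value = 42;
//     std::visit(overloaded {
//                    [](int) { /* handle integers */ },
//                    [](std::string_view) { /* handle strings */ },
//                },
//                value);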

/** Wraps a `sz_sequence_t` to feel like `std::vector<std::string_view>` in the implementation layer. */
struct sz_sequence_as_cpp_container_t {
    using value_type = std::string_view;
    sz_sequence_t const *sequence_ = nullptr;

    std::size_t size() const noexcept {
        sz_assert_(sequence_ != nullptr && "Sequence must not be null");
        return sequence_->count;
    }
    std::string_view operator[](std::size_t index) const noexcept {
        sz_assert_(sequence_ != nullptr && "Sequence must not be null");
        sz_assert_(index < sequence_->count && "Index out of bounds");
        sz_cptr_t start_ptr = sequence_->get_start(sequence_->handle, index);
        sz_size_t length = sequence_->get_length(sequence_->handle, index);
        return {start_ptr, length};
    }
};

/** Wraps a `sz_sequence_u64tape_t` to feel like `std::vector<std::string_view>` in the implementation layer. */
struct sz_sequence_u64tape_as_cpp_container_t {
    using value_type = std::string_view;
    sz_sequence_u64tape_t const *tape_ = nullptr;

    std::size_t size() const noexcept {
        sz_assert_(tape_ != nullptr && "Tape must not be null");
        return tape_->count;
    }
    std::string_view operator[](std::size_t index) const noexcept {
        sz_assert_(tape_ != nullptr && "Tape must not be null");
        sz_assert_(index < tape_->count && "Index out of bounds");
        return {tape_->data + tape_->offsets[index], tape_->offsets[index + 1] - tape_->offsets[index]};
    }
};

/** Wraps a `sz_sequence_u32tape_t` to feel like `std::vector<std::string_view>` in the implementation layer. */
struct sz_sequence_u32tape_as_cpp_container_t {
    using value_type = std::string_view;
    sz_sequence_u32tape_t const *tape_ = nullptr;

    std::size_t size() const noexcept {
        sz_assert_(tape_ != nullptr && "Tape must not be null");
        return tape_->count;
    }
    std::string_view operator[](std::size_t index) const noexcept {
        sz_assert_(tape_ != nullptr && "Tape must not be null");
        sz_assert_(index < tape_->count && "Index out of bounds");
        return {tape_->data + tape_->offsets[index], tape_->offsets[index + 1] - tape_->offsets[index]};
    }
};

/** Convenience class for slicing a strided fingerprints output. */
template <typename element_type_, sz_size_t row_extent_ = SZ_SIZE_MAX>
struct strided_rows {
    using value_type = element_type_;

  private:
    static constexpr sz_size_t extent_k = row_extent_; // Extent of each row, defaults to SZ_SIZE_MAX

    sz_ptr_t data_ = nullptr;
    sz_size_t stride_bytes_ = 0;
    sz_size_t row_length_ = 0;
    sz_size_t count_ = 0;

  public:
    strided_rows(sz_ptr_t data, sz_size_t row_length, sz_size_t stride_bytes, sz_size_t count) noexcept
        : data_(data), stride_bytes_(stride_bytes), row_length_(row_length), count_(count) {}

    std::size_t size() const noexcept { return count_; }

    template <sz_size_t new_extent_ = extent_k>
    strided_rows<element_type_, new_extent_> shifted(std::ptrdiff_t offset) const noexcept {
        return strided_rows<element_type_, new_extent_>(data_ + offset, row_length_, stride_bytes_, count_);
    }

    sz::span<value_type, extent_k> operator[](std::size_t index) const noexcept {
        sz_assert_(index < count_ && "Index out of bounds");
        return sz::span<value_type, extent_k>(reinterpret_cast<value_type *>(data_ + index * stride_bytes_),
                                              row_length_);
    }
};
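
// A minimal usage sketch (illustrative only): viewing a dense 3x64 matrix of
// `sz_u32_t` values as rows, where consecutive rows sit `sizeof(matrix[0])` bytes apart.
//
//     sz_u32_t matrix[3][64] = {};
//     strided_rows<sz_u32_t> rows {reinterpret_cast<sz_ptr_t>(matrix), /*row_length=*/64,
//                                  /*stride_bytes=*/sizeof(matrix[0]), /*count=*/3};
//     auto second_row = rows[1]; // A 64-element span over `matrix[1]`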

/**
 *  @brief Convenience class for strided pointer arithmetic.
 *  @see https://github.com/ashvardanian/less_slow.cpp/blob/b21507f7143f8175b92d0b2b2d827b3bd4bb081b/less_slow.cpp#L2593-L2641
 */
template <typename value_type_>
class strided_ptr {
    sz_ptr_t data_;
    std::size_t stride_;

  public:
    using value_type = value_type_;
    using pointer = value_type_ *;
    using reference = value_type_ &;
    using difference_type = std::ptrdiff_t;
    using iterator_category = std::random_access_iterator_tag;

    strided_ptr(sz_ptr_t data, std::size_t stride_bytes) : data_(data), stride_(stride_bytes) {
        assert(data_ && "Pointer must not be null, as NULL arithmetic is undefined");
    }
#if defined(__cpp_lib_assume_aligned) // Not available in Apple Clang
    reference operator*() const noexcept {
        return *std::launder(std::assume_aligned<1>(reinterpret_cast<pointer>(data_)));
    }
    reference operator[](difference_type i) const noexcept {
        return *std::launder(std::assume_aligned<1>(reinterpret_cast<pointer>(data_ + i * stride_)));
    }
#else
    reference operator*() const noexcept { return *reinterpret_cast<pointer>(data_); }
    reference operator[](difference_type i) const noexcept { return *reinterpret_cast<pointer>(data_ + i * stride_); }
#endif // defined(__cpp_lib_assume_aligned)

    // clang-format off
    pointer operator->() const noexcept { return &operator*(); }
    strided_ptr &operator++() noexcept { data_ += stride_; return *this; }
    strided_ptr operator++(int) noexcept { strided_ptr temp = *this; ++(*this); return temp; }
    strided_ptr &operator--() noexcept { data_ -= stride_; return *this; }
    strided_ptr operator--(int) noexcept { strided_ptr temp = *this; --(*this); return temp; }
    strided_ptr &operator+=(difference_type offset) noexcept { data_ += offset * stride_; return *this; }
    strided_ptr &operator-=(difference_type offset) noexcept { data_ -= offset * stride_; return *this; }
    strided_ptr operator+(difference_type offset) const noexcept { strided_ptr temp = *this; return temp += offset; }
    strided_ptr operator-(difference_type offset) const noexcept { strided_ptr temp = *this; return temp -= offset; }
    friend difference_type operator-(strided_ptr const &a, strided_ptr const &b) noexcept { assert(a.stride_ == b.stride_); return (a.data_ - b.data_) / static_cast<difference_type>(a.stride_); }
    friend bool operator==(strided_ptr const &a, strided_ptr const &b) noexcept { return a.data_ == b.data_; }
    friend bool operator<(strided_ptr const &a, strided_ptr const &b) noexcept { return a.data_ < b.data_; }
    friend bool operator!=(strided_ptr const &a, strided_ptr const &b) noexcept { return !(a == b); }
    friend bool operator>(strided_ptr const &a, strided_ptr const &b) noexcept { return b < a; }
    friend bool operator<=(strided_ptr const &a, strided_ptr const &b) noexcept { return !(b < a); }
    friend bool operator>=(strided_ptr const &a, strided_ptr const &b) noexcept { return !(a < b); }
    // clang-format on
};
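
// A minimal usage sketch (illustrative only): iterating one member of an
// Array-of-Structures as if it were a contiguous array.
//
//     struct point_t { float x, y; };
//     point_t points[4] = {};
//     strided_ptr<float> ys {reinterpret_cast<sz_ptr_t>(&points[0].y), sizeof(point_t)};
//     for (std::size_t i = 0; i != 4; ++i) ys[i] = 1.0f; // Touches only the `y` fields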

constexpr bool is_gpu_capability(sz_capability_t capability) noexcept {
    return (capability & sz_cap_cuda_k) != 0 || (capability & sz_cap_kepler_k) != 0 ||
           (capability & sz_cap_hopper_k) != 0;
}

inline sz_status_t propagate_error(sz::status_t status, char const **reporter_message,
                                   char const *optional_message = nullptr) noexcept {
    if (!reporter_message) return static_cast<sz_status_t>(status);

    // If the optional message is provided, use it verbatim
    if (optional_message) {
        *reporter_message = optional_message;
        return static_cast<sz_status_t>(status);
    }

    // Otherwise, map the status code to a predefined message
    switch (status) {
    case sz::status_t::success_k: *reporter_message = nullptr; break;
    case sz::status_t::bad_alloc_k: *reporter_message = "Memory allocation failed"; break;
    case sz::status_t::invalid_utf8_k: *reporter_message = "Invalid UTF-8 input"; break;
    case sz::status_t::contains_duplicates_k: *reporter_message = "Input contains duplicates"; break;
    case sz::status_t::overflow_risk_k: *reporter_message = "Overflow risk detected"; break;
    case sz::status_t::unexpected_dimensions_k: *reporter_message = "Input/output size mismatch"; break;
    case sz::status_t::missing_gpu_k: *reporter_message = "GPU device not available or CUDA not initialized"; break;
    case sz::status_t::device_code_mismatch_k: *reporter_message = "Backend and executor mismatch"; break;
    case sz::status_t::device_memory_mismatch_k: *reporter_message = "Use device-reachable or unified memory"; break;
    case sz::status_t::unknown_k: *reporter_message = "Unknown error"; break;
    default: *reporter_message = "Unrecognized error code"; break;
    }

    return static_cast<sz_status_t>(status);
}
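
// A minimal usage sketch (illustrative only) of the error-reporting convention used
// throughout this file: callers pass an optional message slot, and the same numeric
// status code is returned through the C ABI.
//
//     char const *error_message = nullptr;
//     sz_status_t status = propagate_error(sz::status_t::bad_alloc_k, &error_message);
//     // `status` mirrors the C++ code; `error_message` now reads "Memory allocation failed"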

#if SZ_USE_CUDA
inline sz_status_t propagate_error(szs::cuda_status_t cuda_status, char const **reporter_message,
                                   char const *optional_message = nullptr) noexcept {
    if (cuda_status.cuda_error != cudaSuccess) {
        if (reporter_message) *reporter_message = cudaGetErrorString(cuda_status.cuda_error);
        return static_cast<sz_status_t>(cuda_status.status);
    }
    else { return propagate_error(cuda_status.status, reporter_message, optional_message); }
}
#endif

#if SZ_USE_CUDA

/** @brief Redirects to CUDA's unified memory allocator. */
void *sz_memory_allocate_from_unified_(sz_size_t size_bytes, void *handle) {
    sz_unused_(handle);
    return szs::unified_alloc_t {}.allocate(size_bytes);
}

/** @brief Redirects to CUDA's unified memory allocator. */
void sz_memory_free_from_unified_(void *address, sz_size_t size_bytes, void *handle) {
    sz_unused_(handle);
    szs::unified_alloc_t {}.deallocate((char *)address, size_bytes);
}

#endif // SZ_USE_CUDA

struct default_scope_t {};
szs::dummy_executor_t get_executor(default_scope_t const &) noexcept { return {}; }
sz::cpu_specs_t get_specs(default_scope_t const &) noexcept { return {}; }

struct cpu_scope_t {
    std::unique_ptr<fu::basic_pool_t> executor_ptr;
    sz::cpu_specs_t specs;

    cpu_scope_t() = default;
    cpu_scope_t(std::unique_ptr<fu::basic_pool_t> exec_ptr, sz::cpu_specs_t cpu_specs) noexcept
        : executor_ptr(std::move(exec_ptr)), specs(cpu_specs) {}
};
fu::basic_pool_t &get_executor(cpu_scope_t &scope) noexcept { return *scope.executor_ptr; }
sz::cpu_specs_t get_specs(cpu_scope_t const &scope) noexcept { return scope.specs; }

#if SZ_USE_CUDA
struct gpu_scope_t {
    szs::cuda_executor_t executor;
    sz::gpu_specs_t specs;
};
szs::cuda_executor_t &get_executor(gpu_scope_t &scope) noexcept { return scope.executor; }
sz::gpu_specs_t get_specs(gpu_scope_t const &scope) noexcept { return scope.specs; }

/** Cached default GPU context (device 0) to avoid repeated scheduling boilerplate */
struct default_gpu_context_t {
    szs::cuda_status_t status {sz::status_t::unknown_k, cudaSuccess};
    szs::cuda_executor_t executor;
    sz::gpu_specs_t specs;
};

inline default_gpu_context_t &default_gpu_context() {
    static default_gpu_context_t ctx = [] {
        default_gpu_context_t result;
        szs::cuda_status_t specs_status = szs::gpu_specs_fetch(result.specs, 0);
        if (specs_status.status != sz::status_t::success_k) {
            result.status = specs_status;
            return result;
        }
        szs::cuda_status_t exec_status = result.executor.try_scheduling(0);
        result.status = exec_status;
        return result;
    }();
    return ctx;
}
#endif

struct device_scope_t {
#if SZ_USE_CUDA
    std::variant<default_scope_t, cpu_scope_t, gpu_scope_t> variants;
#else
    std::variant<default_scope_t, cpu_scope_t> variants;
#endif

    template <typename... variants_arguments_>
    device_scope_t(variants_arguments_ &&...args) noexcept : variants(std::forward<variants_arguments_>(args)...) {}
};

struct levenshtein_backends_t {

    /**
     *  On each hardware platform we use a different backend for Levenshtein distances,
     *  separately covering:
     *  - Linear or Affine gap costs
     *  - Serial, Ice Lake, CUDA, CUDA Kepler, and CUDA Hopper backends
     */
    std::variant<
#if SZ_USE_ICE
        szs::levenshtein_ice_t, szs::affine_levenshtein_ice_t,
#endif
#if SZ_USE_CUDA
        szs::levenshtein_cuda_t, szs::affine_levenshtein_cuda_t,
#endif
#if SZ_USE_KEPLER
        szs::levenshtein_kepler_t, szs::affine_levenshtein_kepler_t,
#endif
#if SZ_USE_HOPPER
        szs::levenshtein_hopper_t, szs::affine_levenshtein_hopper_t,
#endif
        szs::levenshtein_serial_t, szs::affine_levenshtein_serial_t>
        variants;

    template <typename... variants_arguments_>
    levenshtein_backends_t(variants_arguments_ &&...args) noexcept
        : variants(std::forward<variants_arguments_>(args)...) {}
};

template <typename texts_type_>
sz_status_t szs_levenshtein_distances_for_(                                      //
    szs_levenshtein_distances_t engine_punned, szs_device_scope_t device_punned, //
    texts_type_ const &a_container, texts_type_ const &b_container,              //
    sz_size_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    sz_assert_(device_punned != nullptr && "Device must be initialized");
    sz_assert_(results != nullptr && "Results must not be null");

    // Recover the concrete engine and device types from the opaque pointers
    auto *engine = reinterpret_cast<levenshtein_backends_t *>(engine_punned);
    auto *device = reinterpret_cast<device_scope_t *>(device_punned);

    // Wrap the strided output pointer into a C++-friendly random-access iterator
    auto results_strided = strided_ptr<sz_size_t> {reinterpret_cast<sz_ptr_t>(results), results_stride};

    // Dispatch to whichever Levenshtein backend variant the engine holds
    sz_status_t result = sz_success_k;
    auto variant_logic = [&](auto &engine_variant) {
        using engine_variant_t = std::decay_t<decltype(engine_variant)>;
        constexpr sz_capability_t engine_capability_k = engine_variant_t::capability_k;

        // GPU backends are only compatible with GPU scopes
        if constexpr (is_gpu_capability(engine_capability_k)) {
#if SZ_USE_CUDA
            if (std::holds_alternative<gpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<gpu_scope_t>(device->variants);
                szs::cuda_status_t status = engine_variant(    //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            // Fall back to the cached default GPU context (device 0)
            else if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &ctx = default_gpu_context();
                szs::cuda_status_t status =
                    ctx.status.status != sz::status_t::success_k
                        ? ctx.status
                        : engine_variant( //
                              a_container, b_container, results_strided, ctx.executor, ctx.specs);
                result = propagate_error(status, error_message);
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
#else
            result = propagate_error(sz::status_t::missing_gpu_k, error_message);
#endif // SZ_USE_CUDA
        }
        // CPU backends are only compatible with CPU scopes
        else {
            if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &device_scope = std::get<default_scope_t>(device->variants);
                sz::status_t status = engine_variant(          //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else if (std::holds_alternative<cpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<cpu_scope_t>(device->variants);
                sz::status_t status = engine_variant(          //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
        }
    };

    std::visit(variant_logic, engine->variants);
    return result;
}

struct levenshtein_utf8_backends_t {

    /**
     *  On each hardware platform we use a different backend for Levenshtein UTF-8 distances,
     *  separately covering:
     *  - Serial, Ice Lake, CUDA backends
     */
    std::variant<
#if SZ_USE_ICE
        szs::levenshtein_utf8_ice_t, // ! `szs::affine_levenshtein_utf8_ice_t` won't compile yet
#endif
        szs::levenshtein_utf8_serial_t, szs::affine_levenshtein_utf8_serial_t>
        variants;

    template <typename... variants_arguments_>
    levenshtein_utf8_backends_t(variants_arguments_ &&...args) noexcept
        : variants(std::forward<variants_arguments_>(args)...) {}
};

template <typename texts_type_>
sz_status_t szs_levenshtein_distances_utf8_for_(                                      //
    szs_levenshtein_distances_utf8_t engine_punned, szs_device_scope_t device_punned, //
    texts_type_ const &a_container, texts_type_ const &b_container,                   //
    sz_size_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    sz_assert_(device_punned != nullptr && "Device must be initialized");
    sz_assert_(results != nullptr && "Results must not be null");

    // Recover the concrete engine and device types from the opaque pointers
    auto *engine = reinterpret_cast<levenshtein_utf8_backends_t *>(engine_punned);
    auto *device = reinterpret_cast<device_scope_t *>(device_punned);

    // Wrap the strided output pointer into a C++-friendly random-access iterator
    auto results_strided = strided_ptr<sz_size_t> {reinterpret_cast<sz_ptr_t>(results), results_stride};

    // Dispatch to whichever UTF-8 Levenshtein backend variant the engine holds
    sz_status_t result = sz_success_k;
    auto variant_logic = [&](auto &engine_variant) {
        using engine_variant_t = std::decay_t<decltype(engine_variant)>;
        constexpr sz_capability_t engine_capability_k = engine_variant_t::capability_k;

        // GPU backends are only compatible with GPU scopes
        if constexpr (is_gpu_capability(engine_capability_k)) {
            // No GPU backends for UTF-8 Levenshtein distances yet
            result = propagate_error(sz::status_t::missing_gpu_k, error_message);
        }
        // CPU backends are only compatible with CPU scopes
        else {
            if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &device_scope = std::get<default_scope_t>(device->variants);
                sz::status_t status = engine_variant(          //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else if (std::holds_alternative<cpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<cpu_scope_t>(device->variants);
                sz::status_t status = engine_variant(          //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
        }
    };

    std::visit(variant_logic, engine->variants);
    return result;
}

struct needleman_wunsch_backends_t {

    /**
     *  On each hardware platform we use a different backend for Needleman-Wunsch scores,
     *  separately covering:
     *  - Linear or Affine gap costs
     *  - Serial, Ice Lake, CUDA, CUDA Kepler, and CUDA Hopper backends
     */
    std::variant<
#if SZ_USE_ICE
        szs::needleman_wunsch_ice_t, // ! No affine variant here yet
#endif
#if SZ_USE_CUDA
        szs::needleman_wunsch_cuda_t, szs::affine_needleman_wunsch_cuda_t,
#endif
#if SZ_USE_HOPPER
        szs::needleman_wunsch_hopper_t, szs::affine_needleman_wunsch_hopper_t,
#endif
        szs::needleman_wunsch_serial_t, szs::affine_needleman_wunsch_serial_t>
        variants;

    template <typename... variants_arguments_>
    needleman_wunsch_backends_t(variants_arguments_ &&...args) noexcept
        : variants(std::forward<variants_arguments_>(args)...) {}
};

template <typename texts_type_>
sz_status_t szs_needleman_wunsch_scores_for_(                                      //
    szs_needleman_wunsch_scores_t engine_punned, szs_device_scope_t device_punned, //
    texts_type_ const &a_container, texts_type_ const &b_container,                //
    sz_ssize_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    sz_assert_(device_punned != nullptr && "Device must be initialized");
    sz_assert_(results != nullptr && "Results must not be null");

    // Recover the concrete engine and device types from the opaque pointers
    auto *engine = reinterpret_cast<needleman_wunsch_backends_t *>(engine_punned);
    auto *device = reinterpret_cast<device_scope_t *>(device_punned);

    // Wrap the strided output pointer into a C++-friendly random-access iterator
    auto results_strided = strided_ptr<sz_ssize_t> {reinterpret_cast<sz_ptr_t>(results), results_stride};

    // Dispatch to whichever Needleman-Wunsch backend variant the engine holds
    sz_status_t result = sz_success_k;
    auto variant_logic = [&](auto &engine_variant) {
        using engine_variant_t = std::decay_t<decltype(engine_variant)>;
        constexpr sz_capability_t engine_capability_k = engine_variant_t::capability_k;

        // GPU backends are only compatible with GPU scopes
        if constexpr (is_gpu_capability(engine_capability_k)) {
#if SZ_USE_CUDA
            if (std::holds_alternative<gpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<gpu_scope_t>(device->variants);
                szs::cuda_status_t status = engine_variant(    //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &ctx = default_gpu_context();
                szs::cuda_status_t status =
                    ctx.status.status != sz::status_t::success_k
                        ? ctx.status
                        : engine_variant( //
                              a_container, b_container, results_strided, ctx.executor, ctx.specs);
                result = propagate_error(status, error_message);
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
#else
            result = propagate_error(sz::status_t::missing_gpu_k, error_message); // GPU support is not enabled
#endif // SZ_USE_CUDA
        }
        // CPU backends are only compatible with CPU scopes
        else {
            if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &device_scope = std::get<default_scope_t>(device->variants);
                sz::status_t status = engine_variant(          //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else if (std::holds_alternative<cpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<cpu_scope_t>(device->variants);
                sz::status_t status = engine_variant(          //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
        }
    };

    std::visit(variant_logic, engine->variants);
    return result;
}

struct smith_waterman_backends_t {

    /**
     *  On each hardware platform we use a different backend for Smith-Waterman scores,
     *  separately covering:
     *  - Linear or Affine gap costs
     *  - Serial, Ice Lake, CUDA, CUDA Kepler, and CUDA Hopper backends
     */
    std::variant<
#if SZ_USE_ICE
        szs::smith_waterman_ice_t, // ! No affine variant here yet
#endif
#if SZ_USE_CUDA
        szs::smith_waterman_cuda_t, szs::affine_smith_waterman_cuda_t,
#endif
#if SZ_USE_HOPPER
        szs::smith_waterman_hopper_t, szs::affine_smith_waterman_hopper_t,
#endif
        szs::smith_waterman_serial_t, szs::affine_smith_waterman_serial_t>
        variants;

    template <typename... variants_arguments_>
    smith_waterman_backends_t(variants_arguments_ &&...args) noexcept
        : variants(std::forward<variants_arguments_>(args)...) {}
};

template <typename texts_type_>
sz_status_t szs_smith_waterman_scores_for_(                                      //
    szs_smith_waterman_scores_t engine_punned, szs_device_scope_t device_punned, //
    texts_type_ const &a_container, texts_type_ const &b_container,              //
    sz_ssize_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    sz_assert_(device_punned != nullptr && "Device must be initialized");
    sz_assert_(results != nullptr && "Results must not be null");

    // Recover the concrete engine and device types from the opaque pointers
    auto *engine = reinterpret_cast<smith_waterman_backends_t *>(engine_punned);
    auto *device = reinterpret_cast<device_scope_t *>(device_punned);

    // Wrap the strided output pointer into a C++-friendly random-access iterator
    auto results_strided = strided_ptr<sz_ssize_t> {reinterpret_cast<sz_ptr_t>(results), results_stride};

    // Dispatch to whichever Smith-Waterman backend variant the engine holds
    sz_status_t result = sz_success_k;
    auto variant_logic = [&](auto &engine_variant) {
        using engine_variant_t = std::decay_t<decltype(engine_variant)>;
        constexpr sz_capability_t engine_capability_k = engine_variant_t::capability_k;

        // GPU backends are only compatible with GPU scopes
        if constexpr (is_gpu_capability(engine_capability_k)) {
#if SZ_USE_CUDA
            if (std::holds_alternative<gpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<gpu_scope_t>(device->variants);
                szs::cuda_status_t status = engine_variant(    //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &ctx = default_gpu_context();
                szs::cuda_status_t status =
                    ctx.status.status != sz::status_t::success_k
                        ? ctx.status
                        : engine_variant( //
                              a_container, b_container, results_strided, ctx.executor, ctx.specs);
                result = propagate_error(status, error_message);
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
#else
            result = propagate_error(sz::status_t::missing_gpu_k, error_message); // GPU support is not enabled
#endif // SZ_USE_CUDA
        }
        // CPU backends are only compatible with CPU scopes
        else {
            if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &device_scope = std::get<default_scope_t>(device->variants);
                sz::status_t status = engine_variant(          //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else if (std::holds_alternative<cpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<cpu_scope_t>(device->variants);
                sz::status_t status = engine_variant(          //
                    a_container, b_container, results_strided, //
                    get_executor(device_scope), get_specs(device_scope));
                result = propagate_error(status, error_message);
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
        }
    };

    std::visit(variant_logic, engine->variants);
    return result;
}

template <typename element_type_>
using vec = szs::safe_vector<element_type_, std::allocator<element_type_>>;

static constexpr size_t fingerprint_slice_k = 64;

struct fingerprints_backends_t {
    using fallback_variant_cpus_t = szs::basic_rolling_hashers<szs::floating_rolling_hasher<sz::f64_t>, sz::u32_t>;
#if SZ_USE_CUDA
    using fallback_variant_cuda_t = szs::basic_rolling_hashers<szs::floating_rolling_hasher<sz::f64_t>, sz::u32_t,
                                                               sz::u32_t, ualloc_t, sz_cap_cuda_k>;
#endif // SZ_USE_CUDA

    /**
     *  On each hardware platform, the engine contains a group of rolling hashers.
     *  Each rolling hasher produces `fingerprint_slice_k` fingerprint dimensions.
     */
    std::variant<
#if SZ_USE_HASWELL
        vec<szs::floating_rolling_hashers<sz_cap_haswell_k, fingerprint_slice_k>>,
#endif
#if SZ_USE_SKYLAKE
        vec<szs::floating_rolling_hashers<sz_cap_skylake_k, fingerprint_slice_k>>,
#endif
#if SZ_USE_CUDA
        vec<szs::floating_rolling_hashers<sz_cap_cuda_k, fingerprint_slice_k>>, fallback_variant_cuda_t,
#endif
        vec<szs::floating_rolling_hashers<sz_cap_serial_k, fingerprint_slice_k>>, fallback_variant_cpus_t>
        variants;

    sz_size_t dimensions = 0; // Total number of dimensions across all hashers

    template <typename... variants_arguments_>
    fingerprints_backends_t(variants_arguments_ &&...args) noexcept
        : variants(std::forward<variants_arguments_>(args)...) {}
};
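
// Dimension arithmetic sketch (illustrative only, assuming the engine was configured
// with a multiple of `fingerprint_slice_k` dimensions): each unrolled hasher fills a
// disjoint 64-dimension slice of every output row.
//
//     sz_size_t const dims = 256;                                 // Engine-wide total
//     sz_size_t const hashers_count = dims / fingerprint_slice_k; // 4 slices
//     // Hasher `i` writes output columns [i * 64, (i + 1) * 64)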

template <typename texts_type_>
sz_status_t szs_fingerprints_for_(                                      //
    szs_fingerprints_t engine_punned, szs_device_scope_t device_punned, //
    texts_type_ const &texts_container,                                 //
    sz_u32_t *min_hashes, sz_size_t min_hashes_stride,                  //
    sz_u32_t *min_counts, sz_size_t min_counts_stride, char const **error_message) {

    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    sz_assert_(device_punned != nullptr && "Device must be initialized");
    sz_assert_(min_hashes != nullptr && "Output min_hashes cannot be null");
    sz_assert_(min_counts != nullptr && "Output min_counts cannot be null");

    // Recover the concrete engine and device types from the opaque pointers
    auto *engine = reinterpret_cast<fingerprints_backends_t *>(engine_punned);
    auto *device = reinterpret_cast<device_scope_t *>(device_punned);

    // Extract the output shape shared by all backend variants
    auto const dims = engine->dimensions;
    auto const texts_count = texts_container.size();

    // The simplest case is having non-optimized, non-unrolled fallback hashers.
    sz_status_t result = sz_success_k;
    using fallback_variant_cpus_t = typename fingerprints_backends_t::fallback_variant_cpus_t;
    auto fallback_logic_cpus = [&](fallback_variant_cpus_t &fallback_hashers) {
        auto const min_hashes_rows = //
            strided_rows<sz_u32_t> {reinterpret_cast<sz_ptr_t>(min_hashes), dims, min_hashes_stride, texts_count};
        auto const min_counts_rows = //
            strided_rows<sz_u32_t> {reinterpret_cast<sz_ptr_t>(min_counts), dims, min_counts_stride, texts_count};

        // CPU fallback hashers can only work with CPU-compatible device scopes
        if (std::holds_alternative<default_scope_t>(device->variants)) {
            auto &device_scope = std::get<default_scope_t>(device->variants);
            sz::status_t status = fallback_hashers(                //
                texts_container, min_hashes_rows, min_counts_rows, //
                get_executor(device_scope), get_specs(device_scope));
            result = propagate_error(status, error_message);
        }
        else if (std::holds_alternative<cpu_scope_t>(device->variants)) {
            auto &device_scope = std::get<cpu_scope_t>(device->variants);
            sz::status_t status = fallback_hashers(                //
                texts_container, min_hashes_rows, min_counts_rows, //
                get_executor(device_scope), get_specs(device_scope));
            result = propagate_error(status, error_message);
        }
        else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
    };
#if SZ_USE_CUDA
    using fallback_variant_cuda_t = typename fingerprints_backends_t::fallback_variant_cuda_t;
    auto fallback_logic_gpus = [&](fallback_variant_cuda_t &fallback_hashers) {
        auto const min_hashes_rows = //
            strided_rows<sz_u32_t> {reinterpret_cast<sz_ptr_t>(min_hashes), dims, min_hashes_stride, texts_count};
        auto const min_counts_rows = //
            strided_rows<sz_u32_t> {reinterpret_cast<sz_ptr_t>(min_counts), dims, min_counts_stride, texts_count};

        // GPU fallback hashers can work with a GPU scope, or with the default scope via the cached GPU context
        if (std::holds_alternative<gpu_scope_t>(device->variants)) {
            auto &device_scope = std::get<gpu_scope_t>(device->variants);
            sz::status_t status = fallback_hashers(                //
                texts_container, min_hashes_rows, min_counts_rows, //
                get_executor(device_scope), get_specs(device_scope));
            result = propagate_error(status, error_message);
        }
        else if (std::holds_alternative<default_scope_t>(device->variants)) {
            auto &ctx = default_gpu_context();
            if (ctx.status.status != sz::status_t::success_k) { result = propagate_error(ctx.status, error_message); }
            else {
                sz::status_t status = fallback_hashers( //
                    texts_container, min_hashes_rows, min_counts_rows, ctx.executor, ctx.specs);
                result = propagate_error(status, error_message);
            }
        }
        else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
    };
#endif // SZ_USE_CUDA

    // The unrolled logic is a bit more complex than `fallback_logic_cpus`, but in practice involves
    // just one additional loop level.
    auto unrolled_logic = [&](auto &&unrolled_hashers) {
        using unrolled_hashers_t = std::decay_t<decltype(unrolled_hashers)>;
        using unrolled_hasher_t = typename unrolled_hashers_t::value_type;
        constexpr sz_capability_t engine_capability_k = unrolled_hasher_t::capability_k;
        constexpr size_t bytes_per_slice_k = fingerprint_slice_k * sizeof(sz_u32_t);

        // Each unrolled hasher produces only `fingerprint_slice_k` dimensions, so the
        // output views are sliced per hasher
        auto const min_hashes_rows = //
            strided_rows<sz_u32_t> {reinterpret_cast<sz_ptr_t>(min_hashes), fingerprint_slice_k, min_hashes_stride,
                                    texts_count};
        auto const min_counts_rows = //
            strided_rows<sz_u32_t> {reinterpret_cast<sz_ptr_t>(min_counts), fingerprint_slice_k, min_counts_stride,
                                    texts_count};

        // GPU backends are only compatible with GPU scopes
        if constexpr (is_gpu_capability(engine_capability_k)) {
#if SZ_USE_CUDA
            if (std::holds_alternative<gpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<gpu_scope_t>(device->variants);
                for (std::size_t i = 0; i < unrolled_hashers.size(); ++i) {
                    auto &engine_variant = unrolled_hashers[i];
                    szs::cuda_status_t status = engine_variant(                                       //
                        texts_container,                                                              //
                        min_hashes_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
                        min_counts_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
                        get_executor(device_scope), get_specs(device_scope));
                    result = propagate_error(status, error_message);
                    if (result != sz_success_k) break;
                }
            }
            else if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &ctx = default_gpu_context();
                if (ctx.status.status != sz::status_t::success_k) { result = propagate_error(ctx.status, error_message); }
                else {
                    for (std::size_t i = 0; i < unrolled_hashers.size(); ++i) {
                        auto &engine_variant = unrolled_hashers[i];
                        szs::cuda_status_t status = engine_variant(                                       //
                            texts_container,                                                              //
                            min_hashes_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
                            min_counts_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
                            ctx.executor, ctx.specs);
                        result = propagate_error(status, error_message);
                        if (result != sz_success_k) break;
                    }
                }
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
#else
            result = propagate_error(sz::status_t::missing_gpu_k, error_message); // GPU support is not enabled
#endif // SZ_USE_CUDA
        }
        // CPU backends are only compatible with CPU scopes
        else {
            if (std::holds_alternative<default_scope_t>(device->variants)) {
                auto &device_scope = std::get<default_scope_t>(device->variants);
                for (std::size_t i = 0; i < unrolled_hashers.size(); ++i) {
                    auto &engine_variant = unrolled_hashers[i];
                    sz::status_t status = engine_variant(                                             //
                        texts_container,                                                              //
                        min_hashes_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
                        min_counts_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
                        get_executor(device_scope), get_specs(device_scope));
                    result = propagate_error(status, error_message);
                    if (result != sz_success_k) break;
                }
            }
            else if (std::holds_alternative<cpu_scope_t>(device->variants)) {
                auto &device_scope = std::get<cpu_scope_t>(device->variants);
                for (std::size_t i = 0; i < unrolled_hashers.size(); ++i) {
                    auto &engine_variant = unrolled_hashers[i];
                    sz::status_t status = engine_variant(                                             //
                        texts_container,                                                              //
                        min_hashes_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
                        min_counts_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
                        get_executor(device_scope), get_specs(device_scope));
                    result = propagate_error(status, error_message);
                    if (result != sz_success_k) break;
                }
            }
            else { result = propagate_error(sz::status_t::device_code_mismatch_k, error_message); }
        }
    };

#if SZ_USE_CUDA
    std::visit(overloaded {fallback_logic_cpus, fallback_logic_gpus, unrolled_logic}, engine->variants);
#else
    std::visit(overloaded {fallback_logic_cpus, unrolled_logic}, engine->variants);
#endif
    return result;
}

extern "C" {

#pragma region Metadata

SZ_DYNAMIC int szs_version_major(void) { return STRINGZILLA_H_VERSION_MAJOR; }
SZ_DYNAMIC int szs_version_minor(void) { return STRINGZILLA_H_VERSION_MINOR; }
SZ_DYNAMIC int szs_version_patch(void) { return STRINGZILLA_H_VERSION_PATCH; }

SZ_DYNAMIC sz_capability_t szs_capabilities(void) {
    // Cache the detected capabilities across calls
    static sz_capability_t static_caps = sz_caps_none_k;
    if (static_caps == sz_caps_none_k) {
        sz_capability_t cpu_caps =
            (sz_capability_t)(sz_capabilities_comptime_implementation_() & sz_capabilities_runtime_implementation_());
#if SZ_USE_CUDA
        sz_capability_t gpu_caps = sz_caps_none_k;
        sz::gpu_specs_t first_gpu_specs;
        auto specs_status = szs::gpu_specs_fetch(first_gpu_specs).status;
        if (specs_status == sz::status_t::missing_gpu_k) { static_caps = cpu_caps; return static_caps; } // No GPUs available
        else if (specs_status != sz::status_t::success_k) { return sz_caps_none_k; } // Unexpected CUDA failure
        gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_cuda_k);
        if (first_gpu_specs.sm_code >= 30) gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_kepler_k);
        if (first_gpu_specs.sm_code >= 90) gpu_caps = static_cast<sz_capability_t>(gpu_caps | sz_cap_hopper_k);
        static_caps = static_cast<sz_capability_t>(cpu_caps | gpu_caps);
#else
        static_caps = cpu_caps;
#endif // SZ_USE_CUDA
    }

    return static_caps;
}
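
// A minimal usage sketch (illustrative only): probing for a CUDA-capable backend
// before constructing a GPU device scope.
//
//     sz_capability_t caps = szs_capabilities();
//     if (caps & sz_cap_cuda_k) { /* `szs_device_scope_init_gpu_device` should succeed */ }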

SZ_DYNAMIC sz_status_t sz_memory_allocator_init_unified(sz_memory_allocator_t *alloc, char const **error_message) {
#if SZ_USE_CUDA
    alloc->allocate = &sz_memory_allocate_from_unified_;
    alloc->free = &sz_memory_free_from_unified_;
    alloc->handle = nullptr;
    return propagate_error(sz::status_t::success_k, error_message);
#else
    sz_unused_(alloc); // Suppress unused parameter warning when CUDA is not used
    return propagate_error(sz::status_t::missing_gpu_k, error_message);
#endif
}

#pragma endregion Metadata

#pragma region Device Scopes

SZ_DYNAMIC sz_status_t szs_device_scope_init_default(szs_device_scope_t *scope_punned, char const **error_message) {
    sz_assert_(scope_punned != nullptr && "Scope must not be null");
    auto *scope = new (std::nothrow) device_scope_t {default_scope_t {}};
    if (!scope) return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate device scope");
    *scope_punned = reinterpret_cast<szs_device_scope_t>(scope);
    return propagate_error(sz::status_t::success_k, error_message);
}

SZ_DYNAMIC sz_status_t szs_device_scope_init_cpu_cores(sz_size_t cpu_cores, szs_device_scope_t *scope_punned,
                                                       char const **error_message) {
    sz_assert_(scope_punned != nullptr && "Scope must not be null");

    // If `cpu_cores` is 0, use all available cores
    if (cpu_cores == 0) cpu_cores = std::thread::hardware_concurrency();

    // If `cpu_cores` is 1, redirect to default scope
    if (cpu_cores == 1) return szs_device_scope_init_default(scope_punned, error_message);

    sz::cpu_specs_t specs;
    auto executor = std::make_unique<fu::basic_pool_t>();
    if (!executor->try_spawn(cpu_cores))
        return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to spawn thread pool");

    auto *scope =
        new (std::nothrow) device_scope_t(std::in_place_type_t<cpu_scope_t> {}, std::move(executor), std::move(specs));
    if (!scope) return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate CPU device scope");

    *scope_punned = reinterpret_cast<szs_device_scope_t>(scope);
    return propagate_error(sz::status_t::success_k, error_message);
}

SZ_DYNAMIC sz_status_t szs_device_scope_init_gpu_device(sz_size_t gpu_device, szs_device_scope_t *scope_punned,
                                                        char const **error_message) {
    sz_assert_(scope_punned != nullptr && "Scope must not be null");

#if SZ_USE_CUDA
    sz::gpu_specs_t specs;
    auto specs_status = szs::gpu_specs_fetch(specs, static_cast<int>(gpu_device));
    if (specs_status.status != sz::status_t::success_k) { return propagate_error(specs_status, error_message); }
    szs::cuda_executor_t executor;
    auto executor_status = executor.try_scheduling(static_cast<int>(gpu_device));
    if (executor_status.status != sz::status_t::success_k) return propagate_error(executor_status, error_message);

    auto *scope =
        new (std::nothrow) device_scope_t {gpu_scope_t {.executor = std::move(executor), .specs = std::move(specs)}};
    if (!scope) return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate GPU device scope");
    *scope_punned = reinterpret_cast<szs_device_scope_t>(scope);
    return propagate_error(sz::status_t::success_k, error_message);
#else
    sz_unused_(gpu_device);
    sz_unused_(scope_punned);
    return propagate_error(sz::status_t::missing_gpu_k, error_message, "CUDA support not compiled in");
#endif
}

SZ_DYNAMIC sz_status_t szs_device_scope_get_cpu_cores(szs_device_scope_t scope_punned, sz_size_t *cpu_cores,
                                                      char const **error_message) {
    if (scope_punned == nullptr || cpu_cores == nullptr)
        return propagate_error(sz::status_t::unknown_k, error_message, "Invalid null pointer argument");
    auto *scope = reinterpret_cast<device_scope_t *>(scope_punned);

    if (std::holds_alternative<cpu_scope_t>(scope->variants)) {
        auto &cpu_scope = std::get<cpu_scope_t>(scope->variants);
        if (cpu_scope.executor_ptr) {
            *cpu_cores = cpu_scope.executor_ptr->threads_count();
            return propagate_error(sz::status_t::success_k, error_message);
        }
    }
    // Default scope is single-threaded
    else if (std::holds_alternative<default_scope_t>(scope->variants)) {
        *cpu_cores = 1;
        return propagate_error(sz::status_t::success_k, error_message);
    }

    return propagate_error(sz::status_t::unknown_k, error_message, "Device scope is GPU-only");
}

SZ_DYNAMIC sz_status_t szs_device_scope_get_gpu_device(szs_device_scope_t scope_punned, sz_size_t *gpu_device,
                                                       char const **error_message) {
    if (scope_punned == nullptr || gpu_device == nullptr)
        return propagate_error(sz::status_t::unknown_k, error_message, "Invalid null pointer argument");

#if SZ_USE_CUDA
    auto *scope = reinterpret_cast<device_scope_t *>(scope_punned);
    if (std::holds_alternative<gpu_scope_t>(scope->variants)) {
        auto &gpu_scope = std::get<gpu_scope_t>(scope->variants);
        *gpu_device = static_cast<sz_size_t>(gpu_scope.executor.device_id());
        return propagate_error(sz::status_t::success_k, error_message);
    }
#else
    sz_unused_(scope_punned);
    sz_unused_(gpu_device);
#endif

    return propagate_error(sz::status_t::unknown_k, error_message, "Device scope is CPU-only");
}

SZ_DYNAMIC void szs_device_scope_free(szs_device_scope_t scope_punned) {
    if (scope_punned == nullptr) return;
    auto *scope = reinterpret_cast<device_scope_t *>(scope_punned);
    delete scope;
}

SZ_DYNAMIC sz_status_t szs_device_scope_get_capabilities(szs_device_scope_t scope_punned, sz_capability_t *capabilities,
                                                         char const **error_message) {

    if (scope_punned == nullptr || capabilities == nullptr)
        return propagate_error(sz::status_t::unknown_k, error_message, "Invalid null pointer argument");
    sz_capability_t system_caps = szs_capabilities();

#if SZ_USE_CUDA
    auto *scope = reinterpret_cast<device_scope_t *>(scope_punned);
    if (std::holds_alternative<gpu_scope_t>(scope->variants)) {
        // For GPU scope, intersect system capabilities with CUDA capabilities
        *capabilities = static_cast<sz_capability_t>(system_caps & sz_caps_cuda_k);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif

    // For default and CPU scopes, intersect system capabilities with CPU capabilities
    *capabilities = static_cast<sz_capability_t>(system_caps & sz_caps_cpus_k);
    return propagate_error(sz::status_t::success_k, error_message);
}

#pragma endregion Device Scopes

#pragma region Unified Allocator

SZ_DYNAMIC void *szs_unified_alloc(sz_size_t size_bytes) {
#if SZ_USE_CUDA
    return szs::unified_alloc_t {}.allocate(size_bytes);
#else
    return std::malloc(size_bytes);
#endif
}

SZ_DYNAMIC void szs_unified_free(void *ptr, sz_size_t size_bytes) {
    if (!ptr) return;
#if SZ_USE_CUDA
    szs::unified_alloc_t {}.deallocate(static_cast<char *>(ptr), size_bytes);
#else
    sz_unused_(size_bytes);
    std::free(ptr);
#endif
}
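// A minimal usage sketch: on CUDA builds the returned buffer lives in unified
// memory visible to both host and device, otherwise it is plain heap memory.
// Note that the allocation size must be passed back to the deallocator:
//
//     sz_size_t const size_bytes = 4096;
//     char *buffer = (char *)szs_unified_alloc(size_bytes);
//     if (buffer) {
//         /* ... fill `buffer` on the host, hand it to GPU kernels, etc. ... */
//         szs_unified_free(buffer, size_bytes);
//     }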

#pragma endregion Unified Allocator

#pragma region Levenshtein Distances

SZ_DYNAMIC sz_status_t szs_levenshtein_distances_init(                                             //
    sz_error_cost_t match, sz_error_cost_t mismatch, sz_error_cost_t open, sz_error_cost_t extend, //
    sz_memory_allocator_t const *alloc, sz_capability_t capabilities,                              //
    szs_levenshtein_distances_t *engine_punned, char const **error_message) {

    sz_unused_(alloc); // Custom allocator not yet implemented, using default
    sz_unused_(capabilities); // Optional backends may be compiled out
    sz_assert_(engine_punned != nullptr && *engine_punned == nullptr && "Engine must be uninitialized");

    // If the gap opening and extension costs are identical we can use less memory
    auto const can_use_linear_costs = open == extend;
    auto const substitution_costs = szs::uniform_substitution_costs_t {match, mismatch};
    auto const linear_costs = szs::linear_gap_costs_t {open};
    auto const affine_costs = szs::affine_gap_costs_t {open, extend};

#if SZ_USE_ICE
    bool const can_use_ice = (capabilities & sz_cap_ice_k) == sz_cap_ice_k;
    if (can_use_ice && can_use_linear_costs) {
        auto variant = szs::levenshtein_ice_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::levenshtein_ice_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_ice) {
        auto variant = szs::affine_levenshtein_ice_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::affine_levenshtein_ice_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_ICE

    // Among the GPU backends, prefer the most specialized kernels first: Hopper,
    // then Kepler, then generic CUDA, so broader capability masks don't shadow them
#if SZ_USE_HOPPER
    bool const can_use_hopper = (capabilities & sz_caps_ckh_k) == sz_caps_ckh_k;
    if (can_use_hopper && can_use_linear_costs) {
        auto variant = szs::levenshtein_hopper_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::levenshtein_hopper_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_hopper) {
        auto variant = szs::affine_levenshtein_hopper_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::affine_levenshtein_hopper_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_HOPPER

#if SZ_USE_KEPLER
    bool const can_use_kepler = (capabilities & sz_caps_ck_k) == sz_caps_ck_k;
    if (can_use_kepler && can_use_linear_costs) {
        auto variant = szs::levenshtein_kepler_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::levenshtein_kepler_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_kepler) {
        auto variant = szs::affine_levenshtein_kepler_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::affine_levenshtein_kepler_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_KEPLER

#if SZ_USE_CUDA
    bool const can_use_cuda = (capabilities & sz_cap_cuda_k) == sz_cap_cuda_k;
    if (can_use_cuda && can_use_linear_costs) {
        auto variant = szs::levenshtein_cuda_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::levenshtein_cuda_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_cuda) {
        auto variant = szs::affine_levenshtein_cuda_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::affine_levenshtein_cuda_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_CUDA

    if (can_use_linear_costs) {
        auto variant = szs::levenshtein_serial_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::levenshtein_serial_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else {
        auto variant = szs::affine_levenshtein_serial_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow)
            levenshtein_backends_t(std::in_place_type_t<szs::affine_levenshtein_serial_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
}

SZ_DYNAMIC sz_status_t szs_levenshtein_distances_sequence(                       //
    szs_levenshtein_distances_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_t const *a, sz_sequence_t const *b,                              //
    sz_size_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_as_cpp_container_t {a};
    auto b_container = sz_sequence_as_cpp_container_t {b};
    return szs_levenshtein_distances_for_(                      //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_levenshtein_distances_u32tape(                        //
    szs_levenshtein_distances_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u32tape_t const *a, sz_sequence_u32tape_t const *b,              //
    sz_size_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_u32tape_as_cpp_container_t {a};
    auto b_container = sz_sequence_u32tape_as_cpp_container_t {b};
    return szs_levenshtein_distances_for_(                      //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_levenshtein_distances_u64tape(                        //
    szs_levenshtein_distances_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u64tape_t const *a, sz_sequence_u64tape_t const *b,              //
    sz_size_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_u64tape_as_cpp_container_t {a};
    auto b_container = sz_sequence_u64tape_as_cpp_container_t {b};
    return szs_levenshtein_distances_for_(                      //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC void szs_levenshtein_distances_free(szs_levenshtein_distances_t engine_punned) {
    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    auto *engine = reinterpret_cast<levenshtein_backends_t *>(engine_punned);
    delete engine;
}
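// A minimal lifecycle sketch, assuming `a` and `b` are already-populated
// `sz_sequence_t` views over `n` strings each, and that `results_stride`
// counts bytes between consecutive results. The `0, 1, 1, 1` costs below
// yield the classic unit-cost Levenshtein distance:
//
//     szs_levenshtein_distances_t engine = NULL;
//     szs_device_scope_t device = NULL;
//     char const *error = NULL;
//     szs_device_scope_init_default(&device, &error);
//     szs_levenshtein_distances_init(0, 1, 1, 1, NULL, szs_capabilities(), &engine, &error);
//     sz_size_t *results = (sz_size_t *)malloc(n * sizeof(sz_size_t));
//     szs_levenshtein_distances_sequence(engine, device, &a, &b, results, sizeof(sz_size_t), &error);
//     szs_levenshtein_distances_free(engine);
//     szs_device_scope_free(device);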

#pragma endregion Levenshtein Distances

#pragma region Levenshtein UTF8 Distances

SZ_DYNAMIC sz_status_t szs_levenshtein_distances_utf8_init(                                        //
    sz_error_cost_t match, sz_error_cost_t mismatch, sz_error_cost_t open, sz_error_cost_t extend, //
    sz_memory_allocator_t const *alloc, sz_capability_t capabilities,                              //
    szs_levenshtein_distances_utf8_t *engine_punned, char const **error_message) {

    sz_unused_(alloc);        // Custom allocator not yet implemented, using default
    sz_unused_(capabilities); // Optional backends may be compiled out
    sz_assert_(engine_punned != nullptr && *engine_punned == nullptr && "Engine must be uninitialized");

    // If the gap opening and extension costs are identical we can use less memory
    auto const can_use_linear_costs = open == extend;
    auto const substitution_costs = szs::uniform_substitution_costs_t {match, mismatch};
    auto const linear_costs = szs::linear_gap_costs_t {open};
    auto const affine_costs = szs::affine_gap_costs_t {open, extend};

#if SZ_USE_ICE
    bool const can_use_ice = (capabilities & sz_cap_ice_k) == sz_cap_ice_k;
    if (can_use_ice && can_use_linear_costs) {
        auto variant = szs::levenshtein_utf8_ice_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            levenshtein_utf8_backends_t(std::in_place_type_t<szs::levenshtein_utf8_ice_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate UTF-8 Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_utf8_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_ICE

    // Fall back to the serial backends, as the other engines do
    if (can_use_linear_costs) {
        auto variant = szs::levenshtein_utf8_serial_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            levenshtein_utf8_backends_t(std::in_place_type_t<szs::levenshtein_utf8_serial_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate UTF-8 Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_utf8_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else {
        auto variant = szs::affine_levenshtein_utf8_serial_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow) levenshtein_utf8_backends_t(
            std::in_place_type_t<szs::affine_levenshtein_utf8_serial_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate UTF-8 Levenshtein engine");

        *engine_punned = reinterpret_cast<szs_levenshtein_distances_utf8_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
}

SZ_DYNAMIC sz_status_t szs_levenshtein_distances_utf8_sequence(                       //
    szs_levenshtein_distances_utf8_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_t const *a, sz_sequence_t const *b,                                   //
    sz_size_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_as_cpp_container_t {a};
    auto b_container = sz_sequence_as_cpp_container_t {b};
    return szs_levenshtein_distances_utf8_for_(                 //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_levenshtein_distances_utf8_u32tape(                        //
    szs_levenshtein_distances_utf8_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u32tape_t const *a, sz_sequence_u32tape_t const *b,                   //
    sz_size_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_u32tape_as_cpp_container_t {a};
    auto b_container = sz_sequence_u32tape_as_cpp_container_t {b};
    return szs_levenshtein_distances_utf8_for_(                 //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_levenshtein_distances_utf8_u64tape(                        //
    szs_levenshtein_distances_utf8_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u64tape_t const *a, sz_sequence_u64tape_t const *b,                   //
    sz_size_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_u64tape_as_cpp_container_t {a};
    auto b_container = sz_sequence_u64tape_as_cpp_container_t {b};
    return szs_levenshtein_distances_utf8_for_(                 //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC void szs_levenshtein_distances_utf8_free(szs_levenshtein_distances_utf8_t engine_punned) {
    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    auto *engine = reinterpret_cast<levenshtein_utf8_backends_t *>(engine_punned);
    delete engine;
}
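// The UTF-8 lifecycle mirrors the byte-level one above, with distances
// measured over decoded Unicode code points rather than raw bytes:
//
//     szs_levenshtein_distances_utf8_t engine = NULL;
//     char const *error = NULL;
//     szs_levenshtein_distances_utf8_init(0, 1, 1, 1, NULL, szs_capabilities(), &engine, &error);
//     /* ... szs_levenshtein_distances_utf8_sequence(...) as in the byte-level sketch ... */
//     szs_levenshtein_distances_utf8_free(engine);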

#pragma endregion Levenshtein UTF8 Distances

#pragma region Needleman Wunsch

SZ_DYNAMIC sz_status_t szs_needleman_wunsch_scores_init(                       //
    sz_error_cost_t const *subs, sz_error_cost_t open, sz_error_cost_t extend, //
    sz_memory_allocator_t const *alloc, sz_capability_t capabilities,          //
    szs_needleman_wunsch_scores_t *engine_punned, char const **error_message) {

    sz_unused_(alloc); // Custom allocator not yet implemented, using default
    sz_unused_(capabilities); // Optional backends may be compiled out
    sz_assert_(subs != nullptr && "Substitution costs matrix must not be null");
    sz_assert_(engine_punned != nullptr && *engine_punned == nullptr && "Engine must be uninitialized");

    // If the gap opening and extension costs are identical we can use less memory
    auto const can_use_linear_costs = open == extend;
    auto const linear_costs = szs::linear_gap_costs_t {open};
    auto const affine_costs = szs::affine_gap_costs_t {open, extend};
    auto substitution_costs = szs::error_costs_256x256_t {};
    std::memcpy((void *)&substitution_costs, (void const *)subs, sizeof(substitution_costs));

#if SZ_USE_ICE
    bool const can_use_ice = (capabilities & sz_cap_ice_k) == sz_cap_ice_k;
    if (can_use_ice && can_use_linear_costs) {
        auto variant = szs::needleman_wunsch_ice_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            needleman_wunsch_backends_t(std::in_place_type_t<szs::needleman_wunsch_ice_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Needleman-Wunsch engine");

        *engine_punned = reinterpret_cast<szs_needleman_wunsch_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_ICE

    // Among the GPU backends, prefer the specialized Hopper kernels over generic
    // CUDA, so broader capability masks don't shadow them
#if SZ_USE_HOPPER
    bool const can_use_hopper = (capabilities & sz_caps_ckh_k) == sz_caps_ckh_k;
    if (can_use_hopper && can_use_linear_costs) {
        auto variant = szs::needleman_wunsch_hopper_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            needleman_wunsch_backends_t(std::in_place_type_t<szs::needleman_wunsch_hopper_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Needleman-Wunsch engine");

        *engine_punned = reinterpret_cast<szs_needleman_wunsch_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_hopper) {
        auto variant = szs::affine_needleman_wunsch_hopper_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow) needleman_wunsch_backends_t(
            std::in_place_type_t<szs::affine_needleman_wunsch_hopper_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Needleman-Wunsch engine");

        *engine_punned = reinterpret_cast<szs_needleman_wunsch_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_HOPPER

#if SZ_USE_CUDA
    bool const can_use_cuda = (capabilities & sz_cap_cuda_k) == sz_cap_cuda_k;
    if (can_use_cuda && can_use_linear_costs) {
        auto variant = szs::needleman_wunsch_cuda_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            needleman_wunsch_backends_t(std::in_place_type_t<szs::needleman_wunsch_cuda_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Needleman-Wunsch engine");

        *engine_punned = reinterpret_cast<szs_needleman_wunsch_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_cuda) {
        auto variant = szs::affine_needleman_wunsch_cuda_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow) needleman_wunsch_backends_t(
            std::in_place_type_t<szs::affine_needleman_wunsch_cuda_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Needleman-Wunsch engine");

        *engine_punned = reinterpret_cast<szs_needleman_wunsch_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_CUDA

    if (can_use_linear_costs) {
        auto variant = szs::needleman_wunsch_serial_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            needleman_wunsch_backends_t(std::in_place_type_t<szs::needleman_wunsch_serial_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Needleman-Wunsch engine");

        *engine_punned = reinterpret_cast<szs_needleman_wunsch_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else {
        auto variant = szs::affine_needleman_wunsch_serial_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow) needleman_wunsch_backends_t(
            std::in_place_type_t<szs::affine_needleman_wunsch_serial_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Needleman-Wunsch engine");

        *engine_punned = reinterpret_cast<szs_needleman_wunsch_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
}

SZ_DYNAMIC sz_status_t szs_needleman_wunsch_scores_sequence(                       //
    szs_needleman_wunsch_scores_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_t const *a, sz_sequence_t const *b,                                //
    sz_ssize_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_as_cpp_container_t {a};
    auto b_container = sz_sequence_as_cpp_container_t {b};
    return szs_needleman_wunsch_scores_for_(                    //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_needleman_wunsch_scores_u32tape(                        //
    szs_needleman_wunsch_scores_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u32tape_t const *a, sz_sequence_u32tape_t const *b,                //
    sz_ssize_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_u32tape_as_cpp_container_t {a};
    auto b_container = sz_sequence_u32tape_as_cpp_container_t {b};
    return szs_needleman_wunsch_scores_for_(                    //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_needleman_wunsch_scores_u64tape(                        //
    szs_needleman_wunsch_scores_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u64tape_t const *a, sz_sequence_u64tape_t const *b,                //
    sz_ssize_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_u64tape_as_cpp_container_t {a};
    auto b_container = sz_sequence_u64tape_as_cpp_container_t {b};
    return szs_needleman_wunsch_scores_for_(                    //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC void szs_needleman_wunsch_scores_free(szs_needleman_wunsch_scores_t engine_punned) {
    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    auto *engine = reinterpret_cast<needleman_wunsch_backends_t *>(engine_punned);
    delete engine;
}
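// A minimal setup sketch: the engine expects a dense 256x256 matrix of signed
// substitution costs. The +1/-1 identity scheme and the -2/-1 affine gap costs
// below are illustrative placeholders, not recommended scoring parameters:
//
//     static sz_error_cost_t subs[256 * 256];
//     for (int i = 0; i < 256; ++i)
//         for (int j = 0; j < 256; ++j) subs[i * 256 + j] = i == j ? 1 : -1;
//     szs_needleman_wunsch_scores_t engine = NULL;
//     char const *error = NULL;
//     szs_needleman_wunsch_scores_init(subs, -2, -1, NULL, szs_capabilities(), &engine, &error);
//     /* ... szs_needleman_wunsch_scores_sequence(...) writes signed `sz_ssize_t` scores ... */
//     szs_needleman_wunsch_scores_free(engine);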

#pragma endregion Needleman Wunsch

#pragma region Smith Waterman

SZ_DYNAMIC sz_status_t szs_smith_waterman_scores_init(                         //
    sz_error_cost_t const *subs, sz_error_cost_t open, sz_error_cost_t extend, //
    sz_memory_allocator_t const *alloc, sz_capability_t capabilities,          //
    szs_smith_waterman_scores_t *engine_punned, char const **error_message) {

    sz_unused_(alloc); // Custom allocator not yet implemented, using default
    sz_unused_(capabilities); // Optional backends may be compiled out
    sz_assert_(subs != nullptr && "Substitution costs matrix must not be null");
    sz_assert_(engine_punned != nullptr && *engine_punned == nullptr && "Engine must be uninitialized");

    // If the gap opening and extension costs are identical we can use less memory
    auto const can_use_linear_costs = open == extend;
    auto const linear_costs = szs::linear_gap_costs_t {open};
    auto const affine_costs = szs::affine_gap_costs_t {open, extend};
    auto substitution_costs = szs::error_costs_256x256_t {};
    std::memcpy((void *)&substitution_costs, (void const *)subs, sizeof(substitution_costs));

#if SZ_USE_ICE
    bool const can_use_ice = (capabilities & sz_cap_ice_k) == sz_cap_ice_k;
    if (can_use_ice && can_use_linear_costs) {
        auto variant = szs::smith_waterman_ice_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            smith_waterman_backends_t(std::in_place_type_t<szs::smith_waterman_ice_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Smith-Waterman engine");

        *engine_punned = reinterpret_cast<szs_smith_waterman_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_ICE

    // Among the GPU backends, prefer the specialized Hopper kernels over generic
    // CUDA, so broader capability masks don't shadow them
#if SZ_USE_HOPPER
    bool const can_use_hopper = (capabilities & sz_caps_ckh_k) == sz_caps_ckh_k;
    if (can_use_hopper && can_use_linear_costs) {
        auto variant = szs::smith_waterman_hopper_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            smith_waterman_backends_t(std::in_place_type_t<szs::smith_waterman_hopper_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Smith-Waterman engine");

        *engine_punned = reinterpret_cast<szs_smith_waterman_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_hopper) {
        auto variant = szs::affine_smith_waterman_hopper_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow)
            smith_waterman_backends_t(std::in_place_type_t<szs::affine_smith_waterman_hopper_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Smith-Waterman engine");

        *engine_punned = reinterpret_cast<szs_smith_waterman_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_HOPPER

#if SZ_USE_CUDA
    bool const can_use_cuda = (capabilities & sz_cap_cuda_k) == sz_cap_cuda_k;
    if (can_use_cuda && can_use_linear_costs) {
        auto variant = szs::smith_waterman_cuda_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            smith_waterman_backends_t(std::in_place_type_t<szs::smith_waterman_cuda_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Smith-Waterman engine");

        *engine_punned = reinterpret_cast<szs_smith_waterman_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_cuda) {
        auto variant = szs::affine_smith_waterman_cuda_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow)
            smith_waterman_backends_t(std::in_place_type_t<szs::affine_smith_waterman_cuda_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Smith-Waterman engine");

        *engine_punned = reinterpret_cast<szs_smith_waterman_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_CUDA

    if (can_use_linear_costs) {
        auto variant = szs::smith_waterman_serial_t(substitution_costs, linear_costs);
        auto engine = new (std::nothrow)
            smith_waterman_backends_t(std::in_place_type_t<szs::smith_waterman_serial_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Smith-Waterman engine");

        *engine_punned = reinterpret_cast<szs_smith_waterman_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else {
        auto variant = szs::affine_smith_waterman_serial_t(substitution_costs, affine_costs);
        auto engine = new (std::nothrow)
            smith_waterman_backends_t(std::in_place_type_t<szs::affine_smith_waterman_serial_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message,
                                   "Failed to allocate Smith-Waterman engine");

        *engine_punned = reinterpret_cast<szs_smith_waterman_scores_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
}

SZ_DYNAMIC sz_status_t szs_smith_waterman_scores_sequence(                       //
    szs_smith_waterman_scores_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_t const *a, sz_sequence_t const *b,                              //
    sz_ssize_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_as_cpp_container_t {a};
    auto b_container = sz_sequence_as_cpp_container_t {b};
    return szs_smith_waterman_scores_for_(                      //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_smith_waterman_scores_u32tape(                        //
    szs_smith_waterman_scores_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u32tape_t const *a, sz_sequence_u32tape_t const *b,              //
    sz_ssize_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_u32tape_as_cpp_container_t {a};
    auto b_container = sz_sequence_u32tape_as_cpp_container_t {b};
    return szs_smith_waterman_scores_for_(                      //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_smith_waterman_scores_u64tape(                        //
    szs_smith_waterman_scores_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u64tape_t const *a, sz_sequence_u64tape_t const *b,              //
    sz_ssize_t *results, sz_size_t results_stride, char const **error_message) {

    sz_assert_(a != nullptr && b != nullptr && "Input texts cannot be null");
    auto a_container = sz_sequence_u64tape_as_cpp_container_t {a};
    auto b_container = sz_sequence_u64tape_as_cpp_container_t {b};
    return szs_smith_waterman_scores_for_(                      //
        engine_punned, device_punned, a_container, b_container, //
        results, results_stride, error_message);
}

SZ_DYNAMIC void szs_smith_waterman_scores_free(szs_smith_waterman_scores_t engine_punned) {
    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    auto *engine = reinterpret_cast<smith_waterman_backends_t *>(engine_punned);
    delete engine;
}
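// Smith-Waterman follows the same lifecycle as Needleman-Wunsch above, but
// produces local rather than global alignment scores; with a `subs` matrix
// populated as in the previous sketch:
//
//     szs_smith_waterman_scores_t engine = NULL;
//     char const *error = NULL;
//     szs_smith_waterman_scores_init(subs, -2, -1, NULL, szs_capabilities(), &engine, &error);
//     /* ... szs_smith_waterman_scores_sequence(...) ... */
//     szs_smith_waterman_scores_free(engine);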

#pragma endregion Smith Waterman

#pragma region Fingerprints

SZ_DYNAMIC sz_status_t szs_fingerprints_init(                         //
    sz_size_t dimensions, sz_size_t alphabet_size,                    //
    sz_size_t const *window_widths, sz_size_t window_widths_count,    //
    sz_memory_allocator_t const *alloc, sz_capability_t capabilities, //
    szs_fingerprints_t *engine_punned, char const **error_message) {

    sz_unused_(alloc); // Custom allocator not yet implemented, using default
    sz_unused_(capabilities); // Optional backends may be compiled out
    sz_assert_(engine_punned != nullptr && *engine_punned == nullptr && "Engine must be uninitialized");

    // Use some default window widths if none are provided
    sz_size_t const default_window_widths[] = {3, 4, 5, 7, 9, 11, 15, 31};
    if (!window_widths || window_widths_count == 0) {
        window_widths = default_window_widths;
        window_widths_count = sizeof(default_window_widths) / sizeof(sz_size_t);
    }

    // The faster sliced sketchers apply only when `dimensions` splits evenly across
    // the window widths and each per-width share is a multiple of the fingerprint slice.
    auto const dimensions_per_window_width_min = dimensions / window_widths_count;
    auto const dimensions_per_window_width_max = sz::divide_round_up(dimensions, window_widths_count);
    auto const can_use_sliced_sketchers = (dimensions_per_window_width_min == dimensions_per_window_width_max) &&
                                          (dimensions_per_window_width_min % fingerprint_slice_k == 0);
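    // Example: `dimensions == 1024` with 8 window widths gives 128 dimensions per
    // width, so the sliced sketchers apply whenever `fingerprint_slice_k` divides 128.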
    using fallback_variant_cpus_t = typename fingerprints_backends_t::fallback_variant_cpus_t;

    // Prefer the wider Skylake (AVX-512) sketchers over the Haswell (AVX2) ones,
    // so broader capability masks don't shadow them
#if SZ_USE_SKYLAKE
    bool const can_use_skylake = (capabilities & sz_cap_skylake_k) == sz_cap_skylake_k;
    if (can_use_skylake && can_use_sliced_sketchers) {
        auto const count_hashers = dimensions / fingerprint_slice_k;
        using hasher_t = szs::floating_rolling_hashers<sz_cap_skylake_k, fingerprint_slice_k>;
        vec<hasher_t> hashers;
        if (hashers.try_resize(count_hashers) != sz::status_t::success_k)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate rolling hashers");

        // Populate the hashers with the given window widths
        for (size_t i = 0; i < count_hashers; ++i) {
            auto const window_width = window_widths[i % window_widths_count];
            auto const first_dimension_offset = i * fingerprint_slice_k;
            auto const seed_status = hashers[i].try_seed(window_width, alphabet_size, first_dimension_offset);
            if (seed_status != sz::status_t::success_k)
                return propagate_error(seed_status, error_message, "Failed to seed rolling hashers");
        }

        auto engine =
            new (std::nothrow) fingerprints_backends_t(std::in_place_type_t<vec<hasher_t>>(), std::move(hashers));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Fingerprints engine");
        engine->dimensions = dimensions;
        *engine_punned = reinterpret_cast<szs_fingerprints_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_SKYLAKE

#if SZ_USE_HASWELL
    bool const can_use_haswell = (capabilities & sz_cap_haswell_k) == sz_cap_haswell_k;
    if (can_use_haswell && can_use_sliced_sketchers) {
        auto const count_hashers = dimensions / fingerprint_slice_k;
        using hasher_t = szs::floating_rolling_hashers<sz_cap_haswell_k, fingerprint_slice_k>;
        vec<hasher_t> hashers;
        if (hashers.try_resize(count_hashers) != sz::status_t::success_k)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate rolling hashers");

        // Populate the hashers with the given window widths
        for (size_t i = 0; i < count_hashers; ++i) {
            auto const window_width = window_widths[i % window_widths_count];
            auto const first_dimension_offset = i * fingerprint_slice_k;
            auto const seed_status = hashers[i].try_seed(window_width, alphabet_size, first_dimension_offset);
            if (seed_status != sz::status_t::success_k)
                return propagate_error(seed_status, error_message, "Failed to seed rolling hashers");
        }

        auto engine =
            new (std::nothrow) fingerprints_backends_t(std::in_place_type_t<vec<hasher_t>>(), std::move(hashers));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Fingerprints engine");
        engine->dimensions = dimensions;
        *engine_punned = reinterpret_cast<szs_fingerprints_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_HASWELL

#if SZ_USE_CUDA
    bool const can_use_cuda = (capabilities & sz_cap_cuda_k) == sz_cap_cuda_k;
    if (can_use_cuda && can_use_sliced_sketchers) {
        auto const count_hashers = dimensions / fingerprint_slice_k;
        using hasher_t = szs::floating_rolling_hashers<sz_cap_cuda_k, fingerprint_slice_k>;
        vec<hasher_t> hashers;
        if (hashers.try_resize(count_hashers) != sz::status_t::success_k)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate rolling hashers");

        // Populate the hashers with the given window widths
        for (size_t i = 0; i < count_hashers; ++i) {
            auto const window_width = window_widths[i % window_widths_count];
            auto const first_dimension_offset = i * fingerprint_slice_k;
            auto const seed_status = hashers[i].try_seed(window_width, alphabet_size, first_dimension_offset);
            if (seed_status != sz::status_t::success_k)
                return propagate_error(seed_status, error_message, "Failed to seed rolling hashers");
        }

        auto engine =
            new (std::nothrow) fingerprints_backends_t(std::in_place_type_t<vec<hasher_t>>(), std::move(hashers));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Fingerprints engine");
        engine->dimensions = dimensions;
        *engine_punned = reinterpret_cast<szs_fingerprints_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
    else if (can_use_cuda) {
        using fallback_variant_cuda_t = typename fingerprints_backends_t::fallback_variant_cuda_t;
        auto variant = fallback_variant_cuda_t();
        for (size_t dimension = 0; dimension < dimensions; ++dimension) {
            auto const window_width = window_widths[dimension % window_widths_count];
            auto const extend_status = variant.try_extend(window_width, 1, alphabet_size);
            if (extend_status != sz::status_t::success_k)
                return propagate_error(extend_status, error_message, "Failed to extend rolling hashers");
        }

        auto engine = new (std::nothrow)
            fingerprints_backends_t(std::in_place_type_t<fallback_variant_cuda_t>(), std::move(variant));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Fingerprints engine");

        engine->dimensions = dimensions;
        *engine_punned = reinterpret_cast<szs_fingerprints_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }
#endif // SZ_USE_CUDA

    // Build the vectorized, but serial backend
    if (can_use_sliced_sketchers) {
        auto const count_hashers = dimensions / fingerprint_slice_k;
        using hasher_t = szs::floating_rolling_hashers<sz_cap_serial_k, fingerprint_slice_k>;
        vec<hasher_t> hashers;
        if (hashers.try_resize(count_hashers) != sz::status_t::success_k)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate rolling hashers");

        // Populate the hashers with the given window widths
        for (size_t i = 0; i < count_hashers; ++i) {
            auto const window_width = window_widths[i % window_widths_count];
            auto const first_dimension_offset = i * fingerprint_slice_k;
            auto const seed_status = hashers[i].try_seed(window_width, alphabet_size, first_dimension_offset);
            if (seed_status != sz::status_t::success_k)
                return propagate_error(seed_status, error_message, "Failed to seed rolling hashers");
        }

        auto engine =
            new (std::nothrow) fingerprints_backends_t(std::in_place_type_t<vec<hasher_t>>(), std::move(hashers));
        if (!engine)
            return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Fingerprints engine");
        engine->dimensions = dimensions;
        *engine_punned = reinterpret_cast<szs_fingerprints_t>(engine);
        return propagate_error(sz::status_t::success_k, error_message);
    }

    // Build the fallback variant, interleaving the window widths across dimensions
    auto variant = fallback_variant_cpus_t();
    for (size_t dimension = 0; dimension < dimensions; ++dimension) {
        auto const window_width = window_widths[dimension % window_widths_count];
        auto const extend_status = variant.try_extend(window_width, 1, alphabet_size);
        if (extend_status != sz::status_t::success_k)
            return propagate_error(extend_status, error_message, "Failed to extend rolling hashers");
    }

    auto engine =
        new (std::nothrow) fingerprints_backends_t(std::in_place_type_t<fallback_variant_cpus_t>(), std::move(variant));
    if (!engine)
        return propagate_error(sz::status_t::bad_alloc_k, error_message, "Failed to allocate Fingerprints engine");

    engine->dimensions = dimensions;
    *engine_punned = reinterpret_cast<szs_fingerprints_t>(engine);
    return propagate_error(sz::status_t::success_k, error_message);
}

SZ_DYNAMIC sz_status_t szs_fingerprints_sequence(                       //
    szs_fingerprints_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_t const *texts,                                         //
    sz_u32_t *min_hashes, sz_size_t min_hashes_stride,                  //
    sz_u32_t *min_counts, sz_size_t min_counts_stride, char const **error_message) {

    sz_assert_(texts != nullptr && "Input texts cannot be null");
    auto texts_container = sz_sequence_as_cpp_container_t {texts};
    return szs_fingerprints_for_(                      //
        engine_punned, device_punned, texts_container, //
        min_hashes, min_hashes_stride, min_counts, min_counts_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_fingerprints_u32tape(                        //
    szs_fingerprints_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u32tape_t const *texts,                                 //
    sz_u32_t *min_hashes, sz_size_t min_hashes_stride,                  //
    sz_u32_t *min_counts, sz_size_t min_counts_stride, char const **error_message) {

    sz_assert_(texts != nullptr && "Input texts cannot be null");
    auto texts_container = sz_sequence_u32tape_as_cpp_container_t {texts};
    return szs_fingerprints_for_(                      //
        engine_punned, device_punned, texts_container, //
        min_hashes, min_hashes_stride, min_counts, min_counts_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_fingerprints_u64tape(                        //
    szs_fingerprints_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u64tape_t const *texts,                                 //
    sz_u32_t *min_hashes, sz_size_t min_hashes_stride,                  //
    sz_u32_t *min_counts, sz_size_t min_counts_stride, char const **error_message) {

    sz_assert_(texts != nullptr && "Input texts cannot be null");
    auto texts_container = sz_sequence_u64tape_as_cpp_container_t {texts};
    return szs_fingerprints_for_(                      //
        engine_punned, device_punned, texts_container, //
        min_hashes, min_hashes_stride, min_counts, min_counts_stride, error_message);
}

SZ_DYNAMIC void szs_fingerprints_free(szs_fingerprints_t engine_punned) {
    sz_assert_(engine_punned != nullptr && "Engine must be initialized");
    auto *engine = reinterpret_cast<fingerprints_backends_t *>(engine_punned);
    delete engine;
}
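// A minimal lifecycle sketch, assuming `texts` is a populated `sz_sequence_t`
// over `n` strings, strides count bytes, and both output matrices are laid out
// row-by-row with one row of `dimensions` entries per text. Passing NULL widths
// picks the default window widths above:
//
//     szs_fingerprints_t engine = NULL;
//     szs_device_scope_t device = NULL;
//     char const *error = NULL;
//     sz_size_t const dims = 256;
//     szs_device_scope_init_default(&device, &error);
//     szs_fingerprints_init(dims, 256, NULL, 0, NULL, szs_capabilities(), &engine, &error);
//     sz_u32_t *hashes = (sz_u32_t *)malloc(n * dims * sizeof(sz_u32_t));
//     sz_u32_t *counts = (sz_u32_t *)malloc(n * dims * sizeof(sz_u32_t));
//     szs_fingerprints_sequence(engine, device, &texts, hashes, dims * sizeof(sz_u32_t),
//                               counts, dims * sizeof(sz_u32_t), &error);
//     szs_fingerprints_free(engine);
//     szs_device_scope_free(device);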

#pragma endregion Fingerprints

#pragma region Fingerprints UTF8

SZ_DYNAMIC sz_status_t szs_fingerprints_utf8_init(                    //
    sz_size_t dimensions, sz_size_t alphabet_size,                    //
    sz_size_t const *window_widths, sz_size_t window_widths_count,    //
    sz_memory_allocator_t const *alloc, sz_capability_t capabilities, //
    szs_fingerprints_utf8_t *engine_punned, char const **error_message) {

    return szs_fingerprints_init( //
        dimensions, alphabet_size, window_widths, window_widths_count, alloc, capabilities, engine_punned,
        error_message);
}

SZ_DYNAMIC sz_status_t szs_fingerprints_utf8_sequence(                       //
    szs_fingerprints_utf8_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_t const *texts,                                              //
    sz_u32_t *min_hashes, sz_size_t min_hashes_stride,                       //
    sz_u32_t *min_counts, sz_size_t min_counts_stride, char const **error_message) {

    return szs_fingerprints_sequence(        //
        engine_punned, device_punned, texts, //
        min_hashes, min_hashes_stride, min_counts, min_counts_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_fingerprints_utf8_u32tape(                        //
    szs_fingerprints_utf8_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u32tape_t const *texts,                                      //
    sz_u32_t *min_hashes, sz_size_t min_hashes_stride,                       //
    sz_u32_t *min_counts, sz_size_t min_counts_stride, char const **error_message) {

    return szs_fingerprints_u32tape(         //
        engine_punned, device_punned, texts, //
        min_hashes, min_hashes_stride, min_counts, min_counts_stride, error_message);
}

SZ_DYNAMIC sz_status_t szs_fingerprints_utf8_u64tape(                        //
    szs_fingerprints_utf8_t engine_punned, szs_device_scope_t device_punned, //
    sz_sequence_u64tape_t const *texts,                                      //
    sz_u32_t *min_hashes, sz_size_t min_hashes_stride,                       //
    sz_u32_t *min_counts, sz_size_t min_counts_stride, char const **error_message) {

    return szs_fingerprints_u64tape(         //
        engine_punned, device_punned, texts, //
        min_hashes, min_hashes_stride, min_counts, min_counts_stride, error_message);
}

SZ_DYNAMIC void szs_fingerprints_utf8_free(szs_fingerprints_utf8_t engine_punned) {
    szs_fingerprints_free(engine_punned);
}

#pragma endregion Fingerprints UTF8
}