stringzilla 4.6.0

/**
 *  @brief  CUDA-accelerated string similarity utilities.
 *  @file   similarities.cuh
 *  @author Ash Vardanian
 *
 *  Unlike the CPU backed, which also has single-pair similarity scores, the CUDA backend focuses @b only on
 *  batch-processing of large collections of strings, generally, assigning a single warp to each string pair:
 *
 *  - `sz::levenshtein_distances` & `sz::levenshtein_distances_utf8` for Levenshtein edit-distances.
 *  - `sz::needleman_wunsch_score` for weighted Needleman-Wunsch global alignment scores.
 *  - `sz::smith_waterman_score` for weighted Smith-Waterman local alignment scores.
 *
 *  Unlike the trivially parallelizable CPU kernels in `stringzilla/similarities.hpp`, the GPU kernels in this file are
 *  designed for batch-processing of large collections of strings, assigning a single warp to each string pair.
 *  Thus, they should be used when hundreds of pairwise comparisons are needed, and the strings are long enough to
 *  amortize the cost of copying them to the GPU.
 *
 *  @section Abstraction layers
 *
 *  Under the hood, each @b dense high-level algorithm, like Levenshtein, NW, or SW, builds on top of a "walker"
 *  template object, which in turn builds on top of an "scorer" template object:
 *
 *  - the "walker" chooses the order in which the DP matrix is evaluated - row-wise or diagonal-wise.
 *  - the "scorer" evaluates the actual DP matrix cells, taking 3+ inputs, for "local" and "global" alignment,
 *    or the "affine local" and "affine global" alignments, differentiating the cost of gap opening & extension.
 *
 *  Those are later wrapped via 2 functions:
 *
 *  - `*_in_cuda` for on-GPU execution - implementing the actual similarity scoring.
 *  - `*_via_cuda` for on-host execution - allocating memory, and dispatching the kernel.
 *
 *  Those are in-turn wrapped into the same-named function objects:
 *
 *  - `levenshtein_distances`: {CUDA and Kepler} for any chars and lengths, {Hopper} for 8-bit and 16-bit lengths.
 *  - `needleman_wunsch_score`.
 */
#ifndef STRINGZILLAS_SIMILARITIES_CUH_
#define STRINGZILLAS_SIMILARITIES_CUH_

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda/pipeline>        // `cuda::pipeline`
#include <cooperative_groups.h> // `cooperative_groups::this_grid()`

#include "stringzillas/types.cuh"
#include "stringzillas/similarities.hpp"

namespace ashvardanian {
namespace stringzillas {

#pragma region - Common Aliases

using ualloc_t = unified_alloc_t;

/**
 *  In @b CUDA:
 *  - for GPUs before Hopper, we can use the @b SIMT model for warp-level parallelism using diagonal "walkers"
 *  - for GPUs after Hopper, we compound that with thread-level @b SIMD via @b DPX instructions for min-max
 */
using levenshtein_cuda_t = levenshtein_distances<char, linear_gap_costs_t, ualloc_t, sz_cap_cuda_k>;
using affine_levenshtein_cuda_t = levenshtein_distances<char, affine_gap_costs_t, ualloc_t, sz_cap_cuda_k>;

using levenshtein_kepler_t = levenshtein_distances<char, linear_gap_costs_t, ualloc_t, sz_caps_ck_k>;
using affine_levenshtein_kepler_t = levenshtein_distances<char, affine_gap_costs_t, ualloc_t, sz_caps_ck_k>;

using levenshtein_hopper_t = levenshtein_distances<char, linear_gap_costs_t, ualloc_t, sz_caps_ckh_k>;
using affine_levenshtein_hopper_t = levenshtein_distances<char, affine_gap_costs_t, ualloc_t, sz_caps_ckh_k>;

using needleman_wunsch_cuda_t =
    needleman_wunsch_scores<char, error_costs_256x256_t, linear_gap_costs_t, ualloc_t, sz_cap_cuda_k>;
using smith_waterman_cuda_t =
    smith_waterman_scores<char, error_costs_256x256_t, linear_gap_costs_t, ualloc_t, sz_cap_cuda_k>;

using affine_needleman_wunsch_cuda_t =
    needleman_wunsch_scores<char, error_costs_256x256_t, affine_gap_costs_t, ualloc_t, sz_cap_cuda_k>;
using affine_smith_waterman_cuda_t =
    smith_waterman_scores<char, error_costs_256x256_t, affine_gap_costs_t, ualloc_t, sz_cap_cuda_k>;

using needleman_wunsch_hopper_t =
    needleman_wunsch_scores<char, error_costs_256x256_t, linear_gap_costs_t, ualloc_t, sz_caps_ckh_k>;
using smith_waterman_hopper_t =
    smith_waterman_scores<char, error_costs_256x256_t, linear_gap_costs_t, ualloc_t, sz_caps_ckh_k>;

using affine_needleman_wunsch_hopper_t =
    needleman_wunsch_scores<char, error_costs_256x256_t, affine_gap_costs_t, ualloc_t, sz_caps_ckh_k>;
using affine_smith_waterman_hopper_t =
    smith_waterman_scores<char, error_costs_256x256_t, affine_gap_costs_t, ualloc_t, sz_caps_ckh_k>;

#pragma endregion - Common Aliases

#pragma region - Common Helpers

/**
 *  @brief  Dispatches min or max operation based on the compile-time objective.
 */
template <sz_similarity_objective_t objective_, typename scalar_type_>
__forceinline__ __device__ scalar_type_ pick_best_(scalar_type_ a, scalar_type_ b) noexcept {
    if constexpr (objective_ == sz_minimize_distance_k) { return std::min(a, b); }
    else { return std::max(a, b); }
}

template <sz_similarity_objective_t objective_, typename scalar_type_>
__forceinline__ __device__ scalar_type_ pick_best_in_warp_(scalar_type_ x) noexcept {
    // The `__shfl_down_sync` replaces `__shfl_down`
    // https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
    x = pick_best_<objective_, scalar_type_>(__shfl_down_sync(0xffffffff, x, 16), x);
    x = pick_best_<objective_, scalar_type_>(__shfl_down_sync(0xffffffff, x, 8), x);
    x = pick_best_<objective_, scalar_type_>(__shfl_down_sync(0xffffffff, x, 4), x);
    x = pick_best_<objective_, scalar_type_>(__shfl_down_sync(0xffffffff, x, 2), x);
    x = pick_best_<objective_, scalar_type_>(__shfl_down_sync(0xffffffff, x, 1), x);
    return x;
}

/**
 *  @brief  Loads data with a hint, that it's frequently accessed and immutable throughout the kernel.
 *  @see    https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#read-only-data-cache-load-function
 *  @see    https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-5-x
 */
template <typename scalar_type_>
__forceinline__ __device__ scalar_type_ load_immutable_(scalar_type_ const *ptr) noexcept {
    // The `__ldg` intrinsic translates into the `ld.global.nc` PTX instruction.
    // It reads a value from global memory and caches it in the non-coherent cache.
    // return __ldg(ptr);
    return *ptr;
}

/**
 *  @brief  Loads data with a cache hint, that it will not be accessed again.
 *  @see    https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#load-functions-using-cache-hints
 *  @see    https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators
 */
template <typename scalar_type_>
__forceinline__ __device__ scalar_type_ load_last_use_(scalar_type_ const *ptr) noexcept {
    // return __ldlu(ptr);
    return *ptr;
}

#pragma endregion - Common Helpers

#pragma region - Algorithm Building Blocks

/**
 *  @brief GPU adaptation of the `tile_scorer` on CUDA, avoiding warp-level shuffles and DPX.
 *  @note Uses 32-bit `unsigned` counter to iterate through the string slices, so it can't be over 4 billion characters.
 */
template <typename first_iterator_type_, typename second_iterator_type_, typename score_type_,
          typename substituter_type_, sz_similarity_objective_t objective_, sz_capability_t capability_>
#if SZ_HAS_CONCEPTS_
    requires pointer_like<first_iterator_type_> && pointer_like<second_iterator_type_> && score_like<score_type_> &&
             substituter_like<substituter_type_>
#endif
struct tile_scorer<first_iterator_type_, second_iterator_type_, score_type_, substituter_type_, linear_gap_costs_t,
                   objective_, sz_similarity_global_k, capability_, std::enable_if_t<capability_ == sz_cap_cuda_k>> {

    using first_iterator_t = first_iterator_type_;
    using second_iterator_t = second_iterator_type_;
    using score_t = score_type_;
    using substituter_t = substituter_type_;
    using gap_costs_t = linear_gap_costs_t;

    static constexpr sz_similarity_objective_t objective_k = objective_;
    static constexpr sz_similarity_locality_t locality_k = sz_similarity_global_k;
    static constexpr sz_capability_t capability_k = capability_;

    using first_char_t = typename std::iterator_traits<first_iterator_t>::value_type;
    using second_char_t = typename std::iterator_traits<second_iterator_t>::value_type;
    static_assert(is_same_type<first_char_t, second_char_t>::value, "String characters must be of the same type.");
    using char_t = remove_cvref<first_char_t>;

    using cuda_warp_scorer_t = tile_scorer<first_iterator_t, second_iterator_t, score_t, substituter_t,
                                           linear_gap_costs_t, objective_k, sz_similarity_global_k, capability_k>;

  protected:
    substituter_t substituter_ {};
    linear_gap_costs_t gap_costs_ {};
    score_t final_score_ {0};

  public:
    __forceinline__ __device__ tile_scorer(substituter_t subs, linear_gap_costs_t gaps) noexcept
        : substituter_(subs), gap_costs_(gaps) {}

    /**
     *  @brief Initializes a boundary value within a certain diagonal.
     *  @note Should only be called for the diagonals outside of the bottom-right triangle.
     *  @note Should only be called for the top row and left column of the matrix.
     */
    __forceinline__ __device__ void init_score(score_t &cell, size_t diagonal_index) const noexcept {
        cell = gap_costs_.open_or_extend * diagonal_index;
    }

    /**
     *  @brief Extract the final result of the scoring operation which will be always in the bottom-right corner.
     */
    __forceinline__ __device__ score_t score() const noexcept { return final_score_; }

    /**
     *  @brief Computes one diagonal of the DP matrix, using the results of the previous 2x diagonals.
     *  @param first_slice The first string, unlike the CPU variant @b NOT reversed.
     *  @param second_slice The second string.
     *
     *  @param tasks_offset The offset of the first character to compare from each string.
     *  @param tasks_step The step size for the next character to compare from each string.
     *  @param tasks_count The total number of characters to compare from input slices.
     *
     *  @tparam index_type_ @b `unsigned` is recommended if the strings are under 4 billion characters.
     */
    template <typename index_type_>
    __forceinline__ __device__ void operator()(                                                      //
        first_iterator_t first_slice, second_iterator_t second_slice,                                //
        index_type_ const tasks_offset, index_type_ const tasks_step, index_type_ const tasks_count, //
        score_t const *scores_pre_substitution, score_t const *scores_pre_insertion,                 //
        score_t const *scores_pre_deletion, score_t *scores_new) noexcept {

        // Make sure we are called for an anti-diagonal traversal order
        score_t const gap_costs = gap_costs_.open_or_extend;
        sz_assert_(scores_pre_insertion + 1 == scores_pre_deletion);

        // ? One weird observation, is that even though we can avoid fetching `pre_insertion`
        // ? from shared memory on each cycle, by slicing the work differently between the threads,
        // ? and allowing them to reuse the previous `pre_deletion` as the new `pre_insertion`,
        // ? that code ends up being slower than the one below.
        for (index_type_ i = tasks_offset; i < tasks_count; i += tasks_step) {
            score_t pre_substitution = load_last_use_(scores_pre_substitution + i);
            score_t pre_insertion = scores_pre_insertion[i];
            score_t pre_deletion = scores_pre_deletion[i];
            char first_char = load_immutable_(first_slice + tasks_count - i - 1);
            char second_char = load_immutable_(second_slice + i);

            error_cost_t cost_of_substitution = substituter_(first_char, second_char);
            score_t if_substitution = pre_substitution + cost_of_substitution;
            score_t if_deletion_or_insertion = pick_best_<objective_k>(pre_deletion, pre_insertion) + gap_costs;
            score_t cell_score = pick_best_<objective_k>(if_deletion_or_insertion, if_substitution);
            scores_new[i] = cell_score;
        }

        // The last element of the last chunk is the result of the global alignment.
        if (tasks_offset == 0) final_score_ = scores_new[0];
    }
};

/**
 *  @brief GPU adaptation of the `local_scorer` on CUDA, avoiding warp-level shuffles and DPX.
 *  @note Uses 32-bit `unsigned` counter to iterate through the string slices, so it can't be over 4 billion characters.
 */
template <typename first_iterator_type_, typename second_iterator_type_, typename score_type_,
          typename substituter_type_, sz_similarity_objective_t objective_, sz_capability_t capability_>
#if SZ_HAS_CONCEPTS_
    requires pointer_like<first_iterator_type_> && pointer_like<second_iterator_type_> && score_like<score_type_> &&
             substituter_like<substituter_type_>
#endif
struct tile_scorer<first_iterator_type_, second_iterator_type_, score_type_, substituter_type_, linear_gap_costs_t,
                   objective_, sz_similarity_local_k, capability_, std::enable_if_t<capability_ == sz_cap_cuda_k>> {

    using first_iterator_t = first_iterator_type_;
    using second_iterator_t = second_iterator_type_;
    using score_t = score_type_;
    using substituter_t = substituter_type_;

    static constexpr sz_similarity_objective_t objective_k = objective_;
    static constexpr sz_similarity_locality_t locality_k = sz_similarity_local_k;
    static constexpr sz_capability_t capability_k = capability_;

    using first_char_t = typename std::iterator_traits<first_iterator_t>::value_type;
    using second_char_t = typename std::iterator_traits<second_iterator_t>::value_type;
    static_assert(is_same_type<first_char_t, second_char_t>::value, "String characters must be of the same type.");
    using char_t = remove_cvref<first_char_t>;

    using cuda_warp_scorer_t = tile_scorer<first_iterator_t, second_iterator_t, score_t, substituter_t,
                                           linear_gap_costs_t, objective_k, sz_similarity_local_k, capability_k>;

  protected:
    substituter_t substituter_ {};
    linear_gap_costs_t gap_costs_ {};
    score_t final_score_ {0};

  public:
    __forceinline__ __device__ tile_scorer(substituter_t subs, linear_gap_costs_t gaps) noexcept
        : substituter_(subs), gap_costs_(gaps) {}

    /**
     *  @brief Initializes a boundary value within a certain diagonal.
     *  @note Should only be called for the diagonals outside of the bottom-right triangle.
     *  @note Should only be called for the top row and left column of the matrix.
     */
    __forceinline__ __device__ void init_score(score_t &cell, size_t diagonal_index) const noexcept { cell = 0; }

    /**
     *  @brief Extract the final result of the scoring operation which will be always in the bottom-right corner.
     */
    __forceinline__ __device__ score_t score() const noexcept { return final_score_; }

    /**
     *  @brief Computes one diagonal of the DP matrix, using the results of the previous 2x diagonals.
     *  @param first_slice The first string, unlike the CPU variant @b NOT reversed.
     *  @param second_slice The second string.
     *
     *  @param tasks_offset The offset of the first character to compare from each string.
     *  @param tasks_step The step size for the next character to compare from each string.
     *  @param tasks_count The total number of characters to compare from input slices.
     *
     *  @tparam index_type_ @b `unsigned` is recommended if the strings are under 4 billion characters.
     */
    template <typename index_type_>
    __forceinline__ __device__ void operator()(                                                      //
        first_iterator_t first_slice, second_iterator_t second_slice,                                //
        index_type_ const tasks_offset, index_type_ const tasks_step, index_type_ const tasks_count, //
        score_t const *scores_pre_substitution, score_t const *scores_pre_insertion,                 //
        score_t const *scores_pre_deletion, score_t *scores_new) noexcept {

        // Make sure we are called for an anti-diagonal traversal order
        error_cost_t const gap_cost = gap_costs_.open_or_extend;
        sz_assert_(scores_pre_insertion + 1 == scores_pre_deletion);

        // ? One weird observation, is that even though we can avoid fetching `pre_insertion`
        // ? from shared memory on each cycle, by slicing the work differently between the threads,
        // ? and allowing them to reuse the previous `pre_deletion` as the new `pre_insertion`,
        // ? that code ends up being slower than the one below.
        for (index_type_ i = tasks_offset; i < tasks_count; i += tasks_step) {
            score_t pre_substitution = load_last_use_(scores_pre_substitution + i);
            score_t pre_insertion = scores_pre_insertion[i];
            score_t pre_deletion = scores_pre_deletion[i];
            char first_char = load_immutable_(first_slice + tasks_count - i - 1);
            char second_char = load_immutable_(second_slice + i);

            error_cost_t cost_of_substitution = substituter_(first_char, second_char);
            score_t if_substitution = pre_substitution + cost_of_substitution;
            score_t if_deletion_or_insertion = pick_best_<objective_k>(pre_deletion, pre_insertion) + gap_cost;
            score_t if_substitution_or_reset = pick_best_<objective_k, score_t>(if_substitution, 0);
            score_t cell_score = pick_best_<objective_k>(if_deletion_or_insertion, if_substitution_or_reset);
            scores_new[i] = cell_score;

            // Update the global maximum score if this cell beats it.
            final_score_ = pick_best_<objective_k>(final_score_, cell_score);
        }

        // ! Don't forget to pick the best among the best scores per thread.
        final_score_ = pick_best_in_warp_<objective_k>(final_score_);
    }
};

/**
 *  @brief GPU adaptation of the `tile_scorer` on CUDA, avoiding warp-level shuffles and DPX.
 *  @note Uses 32-bit `unsigned` counter to iterate through the string slices, so it can't be over 4 billion characters.
 */
template <typename first_iterator_type_, typename second_iterator_type_, typename score_type_,
          typename substituter_type_, sz_similarity_objective_t objective_, sz_capability_t capability_>
#if SZ_HAS_CONCEPTS_
    requires pointer_like<first_iterator_type_> && pointer_like<second_iterator_type_> && score_like<score_type_> &&
             substituter_like<substituter_type_>
#endif
struct tile_scorer<first_iterator_type_, second_iterator_type_, score_type_, substituter_type_, affine_gap_costs_t,
                   objective_, sz_similarity_global_k, capability_, std::enable_if_t<capability_ == sz_cap_cuda_k>> {

    using first_iterator_t = first_iterator_type_;
    using second_iterator_t = second_iterator_type_;
    using score_t = score_type_;
    using substituter_t = substituter_type_;
    using gap_costs_t = affine_gap_costs_t;

    static constexpr sz_similarity_objective_t objective_k = objective_;
    static constexpr sz_similarity_locality_t locality_k = sz_similarity_global_k;
    static constexpr sz_capability_t capability_k = capability_;

    using first_char_t = typename std::iterator_traits<first_iterator_t>::value_type;
    using second_char_t = typename std::iterator_traits<second_iterator_t>::value_type;
    static_assert(is_same_type<first_char_t, second_char_t>::value, "String characters must be of the same type.");
    using char_t = remove_cvref<first_char_t>;

    using cuda_warp_scorer_t = tile_scorer<first_iterator_t, second_iterator_t, score_t, substituter_t,
                                           affine_gap_costs_t, objective_k, sz_similarity_global_k, capability_k>;

  protected:
    substituter_t substituter_ {};
    affine_gap_costs_t gap_costs_ {};
    score_t final_score_ {0};

  public:
    __forceinline__ __device__ tile_scorer(substituter_t subs, affine_gap_costs_t gaps) noexcept
        : substituter_(subs), gap_costs_(gaps) {}

    /**
     *  @brief Initializes a boundary value within a certain diagonal.
     *  @note Should only be called for the diagonals outside of the bottom-right triangle.
     *  @note Should only be called for the top row and left column of the matrix.
     */
    __forceinline__ __device__ void init_score(score_t &cell, size_t diagonal_index) const noexcept {
        cell = diagonal_index ? gap_costs_.open + gap_costs_.extend * (diagonal_index - 1) : 0;
    }

    __forceinline__ __device__ void init_gap(score_t &cell, size_t diagonal_index) const noexcept {
        // Make sure the initial value of the gap is not smaller in magnitude than the primary.
        // The supplementary matrices are initialized with values of higher magnitude,
        // which is equivalent to discarding them. That's better than using `SIZE_MAX`
        // as subsequent additions won't overflow.
        cell = (gap_costs_.open + gap_costs_.extend) +
               (diagonal_index ? gap_costs_.open + gap_costs_.extend * (diagonal_index - 1) : 0);
    }

    /**
     *  @brief Extract the final result of the scoring operation which will be always in the bottom-right corner.
     */
    __forceinline__ __device__ score_t score() const noexcept { return final_score_; }

    /**
     *  @brief Computes one diagonal of the DP matrix, using the results of the previous 2x diagonals.
     *  @param first_slice The first string, unlike the CPU variant @b NOT reversed.
     *  @param second_slice The second string.
     *
     *  @param tasks_offset The offset of the first character to compare from each string.
     *  @param tasks_step The step size for the next character to compare from each string.
     *  @param tasks_count The total number of characters to compare from input slices.
     *
     *  @tparam index_type_ @b `unsigned` is recommended if the strings are under 4 billion characters.
     */
    template <typename index_type_>
    __forceinline__ __device__ void operator()(                                                      //
        first_iterator_t first_slice, second_iterator_t second_slice,                                //
        index_type_ const tasks_offset, index_type_ const tasks_step, index_type_ const tasks_count, //
        score_t const *scores_pre_substitution,                                                      //
        score_t const *scores_pre_insertion,                                                         //
        score_t const *scores_pre_deletion,                                                          //
        score_t const *scores_running_insertions,                                                    //
        score_t const *scores_running_deletions,                                                     //
        score_t *scores_new,                                                                         //
        score_t *scores_new_insertions,                                                              //
        score_t *scores_new_deletions) noexcept {

        // Make sure we are called for an anti-diagonal traversal order
        sz_assert_(scores_pre_insertion + 1 == scores_pre_deletion);

        // ? One weird observation, is that even though we can avoid fetching `pre_insertion`
        // ? from shared memory on each cycle, by slicing the work differently between the threads,
        // ? and allowing them to reuse the previous `pre_deletion` as the new `pre_insertion`,
        // ? that code ends up being slower than the one below.
        for (index_type_ i = tasks_offset; i < tasks_count; i += tasks_step) {
            score_t pre_substitution = load_last_use_(scores_pre_substitution + i);
            score_t pre_insertion_opening = scores_pre_insertion[i];
            score_t pre_deletion_opening = scores_pre_deletion[i];
            score_t pre_insertion_expansion = scores_running_insertions[i];
            score_t pre_deletion_expansion = scores_running_deletions[i];
            char first_char = load_immutable_(first_slice + tasks_count - i - 1);
            char second_char = load_immutable_(second_slice + i);

            error_cost_t cost_of_substitution = substituter_(first_char, second_char);
            score_t if_substitution = pre_substitution + cost_of_substitution;
            score_t if_insertion = min_or_max<objective_k>(pre_insertion_opening + gap_costs_.open,
                                                           pre_insertion_expansion + gap_costs_.extend);
            score_t if_deletion = min_or_max<objective_k>(pre_deletion_opening + gap_costs_.open,
                                                          pre_deletion_expansion + gap_costs_.extend);
            score_t if_deletion_or_insertion = min_or_max<objective_k>(if_deletion, if_insertion);
            score_t cell_score = pick_best_<objective_k>(if_deletion_or_insertion, if_substitution);

            // Export results.
            scores_new[i] = cell_score;
            scores_new_insertions[i] = if_insertion;
            scores_new_deletions[i] = if_deletion;
        }

        // The last element of the last chunk is the result of the global alignment.
        if (tasks_offset == 0) final_score_ = scores_new[0];
    }
};

/**
 *  @brief GPU adaptation of the `local_scorer` on CUDA, avoiding warp-level shuffles and DPX.
 *  @note Uses 32-bit `unsigned` counter to iterate through the string slices, so it can't be over 4 billion characters.
 */
template <typename first_iterator_type_, typename second_iterator_type_, typename score_type_,
          typename substituter_type_, sz_similarity_objective_t objective_, sz_capability_t capability_>
#if SZ_HAS_CONCEPTS_
    requires pointer_like<first_iterator_type_> && pointer_like<second_iterator_type_> && score_like<score_type_> &&
             substituter_like<substituter_type_>
#endif
struct tile_scorer<first_iterator_type_, second_iterator_type_, score_type_, substituter_type_, affine_gap_costs_t,
                   objective_, sz_similarity_local_k, capability_, std::enable_if_t<capability_ == sz_cap_cuda_k>> {

    using first_iterator_t = first_iterator_type_;
    using second_iterator_t = second_iterator_type_;
    using score_t = score_type_;
    using substituter_t = substituter_type_;

    static constexpr sz_similarity_objective_t objective_k = objective_;
    static constexpr sz_similarity_locality_t locality_k = sz_similarity_local_k;
    static constexpr sz_capability_t capability_k = capability_;

    using first_char_t = typename std::iterator_traits<first_iterator_t>::value_type;
    using second_char_t = typename std::iterator_traits<second_iterator_t>::value_type;
    static_assert(is_same_type<first_char_t, second_char_t>::value, "String characters must be of the same type.");
    using char_t = remove_cvref<first_char_t>;

    using cuda_warp_scorer_t = tile_scorer<first_iterator_t, second_iterator_t, score_t, substituter_t,
                                           affine_gap_costs_t, objective_k, sz_similarity_local_k, capability_k>;

  protected:
    substituter_t substituter_ {};
    affine_gap_costs_t gap_costs_ {};
    score_t final_score_ {0};

  public:
    __forceinline__ __device__ tile_scorer(substituter_t subs, affine_gap_costs_t gaps) noexcept
        : substituter_(subs), gap_costs_(gaps) {}

    /**
     *  @brief Initializes a boundary value within a certain diagonal.
     *  @note Should only be called for the diagonals outside of the bottom-right triangle.
     *  @note Should only be called for the top row and left column of the matrix.
     */
    __forceinline__ __device__ void init_score(score_t &cell, size_t diagonal_index) const noexcept { cell = 0; }
    __forceinline__ __device__ void init_gap(score_t &cell, size_t /* diagonal_index */) const noexcept {
        // Make sure the initial value of the gap is not smaller in magnitude than the primary.
        // The supplementary matrices are initialized with values of higher magnitude,
        // which is equivalent to discarding them. That's better than using `SIZE_MAX`
        // as subsequent additions won't overflow.
        cell = gap_costs_.open + gap_costs_.extend;
    }

    /**
     *  @brief Extract the final result of the scoring operation which will be always in the bottom-right corner.
     */
    __forceinline__ __device__ score_t score() const noexcept { return final_score_; }

    /**
     *  @brief Computes one diagonal of the DP matrix, using the results of the previous 2x diagonals.
     *  @param first_slice The first string, unlike the CPU variant @b NOT reversed.
     *  @param second_slice The second string.
     *
     *  @param tasks_offset The offset of the first character to compare from each string.
     *  @param tasks_step The step size for the next character to compare from each string.
     *  @param tasks_count The total number of characters to compare from input slices.
     *
     *  @tparam index_type_ @b `unsigned` is recommended if the strings are under 4 billion characters.
     */
    template <typename index_type_>
    __forceinline__ __device__ void operator()(                                                      //
        first_iterator_t first_slice, second_iterator_t second_slice,                                //
        index_type_ const tasks_offset, index_type_ const tasks_step, index_type_ const tasks_count, //
        score_t const *scores_pre_substitution,                                                      //
        score_t const *scores_pre_insertion,                                                         //
        score_t const *scores_pre_deletion,                                                          //
        score_t const *scores_running_insertions,                                                    //
        score_t const *scores_running_deletions,                                                     //
        score_t *scores_new,                                                                         //
        score_t *scores_new_insertions,                                                              //
        score_t *scores_new_deletions) noexcept {

        // Make sure we are called for an anti-diagonal traversal order
        sz_assert_(scores_pre_insertion + 1 == scores_pre_deletion);

        // ? One weird observation, is that even though we can avoid fetching `pre_insertion`
        // ? from shared memory on each cycle, by slicing the work differently between the threads,
        // ? and allowing them to reuse the previous `pre_deletion` as the new `pre_insertion`,
        // ? that code ends up being slower than the one below.
        for (index_type_ i = tasks_offset; i < tasks_count; i += tasks_step) {
            score_t pre_substitution = load_last_use_(scores_pre_substitution + i);
            score_t pre_insertion_opening = scores_pre_insertion[i];
            score_t pre_deletion_opening = scores_pre_deletion[i];
            score_t pre_insertion_expansion = scores_running_insertions[i];
            score_t pre_deletion_expansion = scores_running_deletions[i];
            char first_char = load_immutable_(first_slice + tasks_count - i - 1);
            char second_char = load_immutable_(second_slice + i);

            error_cost_t cost_of_substitution = substituter_(first_char, second_char);
            score_t if_substitution = pre_substitution + cost_of_substitution;
            score_t if_deletion = min_or_max<objective_k>(pre_deletion_opening + gap_costs_.open,
                                                          pre_deletion_expansion + gap_costs_.extend);
            score_t if_insertion = min_or_max<objective_k>(pre_insertion_opening + gap_costs_.open,
                                                           pre_insertion_expansion + gap_costs_.extend);
            score_t if_deletion_or_insertion = min_or_max<objective_k>(if_deletion, if_insertion);
            score_t if_substitution_or_reset = pick_best_<objective_k, score_t>(if_substitution, 0);
            score_t cell_score = pick_best_<objective_k>(if_deletion_or_insertion, if_substitution_or_reset);

            // Export results.
            scores_new[i] = cell_score;
            scores_new_insertions[i] = if_insertion;
            scores_new_deletions[i] = if_deletion;

            // Update the global maximum score if this cell beats it.
            final_score_ = pick_best_<objective_k>(final_score_, cell_score);
        }

        // ! Don't forget to pick the best among the best scores per thread.
        final_score_ = pick_best_in_warp_<objective_k>(final_score_);
    }
};

/*  On Kepler and newer GPUs we benefit from the following:
 *  - processing 4x 8-bit values or 2x 16-bit values at a time, packed as 32-bit words.
 *  - warp-level exchange primitives for fast reduction of the best score.
 */

#if SZ_USE_KEPLER

/**
 *  @brief GPU adaptation of the `tile_scorer` - Minimizes Global Levenshtein distance with linear gap costs.
 *  @note Requires Kepler generation GPUs to handle 4x `u8` scores at a time.
 *
 *  Relies on following instruction families to output 4x @b `u8` scores per call:
 *  - @b `prmt` to shuffle bytes in 32 bit registers.
 *  - @b `vmax4,vmin4,vadd4` video-processing instructions.
 */
template <>
struct tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, linear_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>
    : public tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, linear_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
    using kepler_warp_scorer_t =
        tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, linear_gap_costs_t,
                    sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>;

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,          // ! Unlike CPU, uses `unsigned`
        u8_t const *scores_pre_substitution, //
        u8_t const *scores_pre_insertion,    //
        u8_t const *scores_pre_deletion,     //
        u8_t *scores_new) noexcept {

        u8_t const match_cost = this->substituter_.match;
        u8_t const mismatch_cost = this->substituter_.mismatch;
        u8_t const gap_cost = this->gap_costs_.open_or_extend;

        sz_u32_vec_t match_cost_vec, mismatch_cost_vec, gap_cost_vec, equality_vec;
        match_cost_vec.u32 = match_cost * 0x01010101u;       // ! 4x `u8` match costs
        mismatch_cost_vec.u32 = mismatch_cost * 0x01010101u; // ! 4x `u8` mismatch costs
        gap_cost_vec.u32 = gap_cost * 0x01010101u;           // ! 4x `u8` gap costs

        // The hardest part of this kernel is dealing with unaligned loads!
        // We want to minimize single-byte processing in favor of 4-byte SIMD loads and min/max operations.
        // Assuming we are reading consecutive values from a buffer, in every cycle, most likely, we will be
        // dealing with most values being unaligned!
        sz_u32_vec_t pre_substitution_vec, pre_insertion_vec, pre_deletion_vec;
        sz_u32_vec_t first_vec, second_vec;
        sz_u32_vec_t cost_of_substitution_vec, if_substitution_vec, if_deletion_or_insertion_vec;
        sz_u32_vec_t cell_score_vec;

        // ! As we are processing 4 bytes per loop, and have at least 32 threads per block (32 * 4 = 128),
        // ! and deal with strings only under 256 bytes, this loop will fire at most twice per input.
        for (unsigned i = tasks_offset * 4; i < tasks_count; i += tasks_step * 4) { // ! it's OK to spill beyond bounds
            pre_substitution_vec = sz_u32_load_unaligned(scores_pre_substitution + i);
            pre_insertion_vec = sz_u32_load_unaligned(scores_pre_insertion + i);
            pre_deletion_vec = sz_u32_load_unaligned(scores_pre_deletion + i);
            first_vec = sz_u32_load_unaligned(first_slice + tasks_count - i - 4); // ! this may be OOB
            second_vec = sz_u32_load_unaligned(second_slice + i);                 // ! this may be OOB, but padded
            first_vec.u32 = __nv_bswap32(first_vec.u32); // ! reverse the order of bytes in the first vector

            // Equality comparison will output 0xFF for each matching byte.
            equality_vec.u32 = __vcmpeq4(first_vec.u32, second_vec.u32);
            cost_of_substitution_vec.u32 =                //
                (equality_vec.u32 & match_cost_vec.u32) + //
                (~equality_vec.u32 & mismatch_cost_vec.u32);
            if_substitution_vec.u32 = __vaddus4(pre_substitution_vec.u32, cost_of_substitution_vec.u32);
            if_deletion_or_insertion_vec.u32 =
                __vaddus4(__vminu4(pre_deletion_vec.u32, pre_insertion_vec.u32), gap_cost_vec.u32);
            cell_score_vec.u32 = __vminu4(if_deletion_or_insertion_vec.u32, if_substitution_vec.u32);

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i + 0] = cell_score_vec.u8s[0];
            scores_new[i + 1] = cell_score_vec.u8s[1];
            scores_new[i + 2] = cell_score_vec.u8s[2];
            scores_new[i + 3] = cell_score_vec.u8s[3];
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if (tasks_offset == 0) this->final_score_ = scores_new[0];
    }
};

/**
 *  @brief GPU adaptation of the `tile_scorer` - Minimizes Global Levenshtein distance with linear gap costs.
 *  @note Requires Kepler generation GPUs to handle 2x `u16` scores at a time.
 *
 *  Relies on following instruction families to output 2x @b `u16` scores per call:
 *  - @b `prmt` to shuffle bytes in 32 bit registers.
 *  - @b `vmax2,vmin2,vadd2` video-processing instructions.
 */
template <>
struct tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, linear_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>
    : public tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, linear_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
    using kepler_warp_scorer_t =
        tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, linear_gap_costs_t,
                    sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>;

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,           // ! Unlike CPU, uses `unsigned`
        u16_t const *scores_pre_substitution, //
        u16_t const *scores_pre_insertion,    //
        u16_t const *scores_pre_deletion,     //
        u16_t *scores_new) noexcept {

        u16_t const match_cost = this->substituter_.match;
        u16_t const mismatch_cost = this->substituter_.mismatch;
        u16_t const gap_cost = this->gap_costs_.open_or_extend;

        sz_u32_vec_t match_cost_vec, mismatch_cost_vec, gap_cost_vec, equality_vec;
        match_cost_vec.u32 = match_cost * 0x00010001;       // ! 2x `u16` match costs
        mismatch_cost_vec.u32 = mismatch_cost * 0x00010001; // ! 2x `u16` mismatch costs
        gap_cost_vec.u32 = gap_cost * 0x00010001;           // ! 2x `u16` gap costs

        // The hardest part of this kernel is dealing with unaligned loads!
        // We want to minimize single-byte processing in favor of 2-byte SIMD loads and min/max operations.
        // Assuming we are reading consecutive values from a buffer, in every cycle, most likely, we will be
        // dealing with most values being unaligned!
        sz_u32_vec_t pre_substitution_vec, pre_insertion_vec, pre_deletion_vec;
        sz_u32_vec_t first_vec, second_vec;
        sz_u32_vec_t cost_of_substitution_vec, if_substitution_vec, if_deletion_or_insertion_vec;
        sz_u32_vec_t cell_score_vec;

        // ! As we are processing 2 bytes per loop, and have at least 32 threads per block (32 * 2 = 64),
        // ! and deal with strings only under 64k bytes, this loop will fire at most 1K times per input
        for (unsigned i = tasks_offset * 2; i < tasks_count; i += tasks_step * 2) { // ! it's OK to spill beyond bounds
            pre_substitution_vec.u16s[0] = scores_pre_substitution[i + 0];
            pre_substitution_vec.u16s[1] = scores_pre_substitution[i + 1];
            pre_insertion_vec.u16s[0] = scores_pre_insertion[i + 0];
            pre_insertion_vec.u16s[1] = scores_pre_insertion[i + 1];
            pre_deletion_vec.u16s[0] = scores_pre_deletion[i + 0];
            pre_deletion_vec.u16s[1] = scores_pre_deletion[i + 1];
            first_vec.u16s[0] = load_immutable_(first_slice + tasks_count - i - 1);
            first_vec.u16s[1] = load_immutable_(first_slice + tasks_count - i - 2); // ! this may be OOB
            second_vec.u16s[0] = load_immutable_(second_slice + i + 0);
            second_vec.u16s[1] = load_immutable_(second_slice + i + 1); // ! this may be OOB, but padded

            // Equality comparison will output 0xFFFF for each matching byte-pair.
            equality_vec.u32 = __vcmpeq2(first_vec.u32, second_vec.u32);
            cost_of_substitution_vec.u32 =                //
                (equality_vec.u32 & match_cost_vec.u32) + //
                (~equality_vec.u32 & mismatch_cost_vec.u32);
            if_substitution_vec.u32 = __vaddus2(pre_substitution_vec.u32, cost_of_substitution_vec.u32);
            if_deletion_or_insertion_vec.u32 =
                __vaddus2(__vminu2(pre_deletion_vec.u32, pre_insertion_vec.u32), gap_cost_vec.u32);
            cell_score_vec.u32 = __vminu2(if_deletion_or_insertion_vec.u32, if_substitution_vec.u32);

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i + 0] = cell_score_vec.u16s[0];
            scores_new[i + 1] = cell_score_vec.u16s[1];
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if (tasks_offset == 0) this->final_score_ = scores_new[0];
    }
};

template <>
struct tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, linear_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>
    : public tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, linear_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
    using kepler_warp_scorer_t =
        tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, linear_gap_costs_t,
                    sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>;
};

template <>
struct tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, linear_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>
    : public tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, linear_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
    using kepler_warp_scorer_t =
        tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, linear_gap_costs_t,
                    sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>;
};

/**
 *  @brief GPU adaptation of the `tile_scorer` - Minimizes Global Levenshtein distance with affine gap costs.
 *  @note Requires Kepler generation GPUs to handle 4x `u8` scores at a time.
 *
 *  Relies on following instruction families to output 4x @b `u8` scores per call:
 *  - @b `prmt` to shuffle bytes in 32 bit registers.
 *  - @b `vmax4,vmin4,vadd4` video-processing instructions.
 */
template <>
struct tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, affine_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>
    : public tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, affine_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
    using kepler_warp_scorer_t =
        tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, affine_gap_costs_t,
                    sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>;

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,            // ! Unlike CPU, uses `unsigned`
        u8_t const *scores_pre_substitution,   //
        u8_t const *scores_pre_insertion,      //
        u8_t const *scores_pre_deletion,       //
        u8_t const *scores_running_insertions, //
        u8_t const *scores_running_deletions,  //
        u8_t *scores_new,                      //
        u8_t *scores_new_insertions,           //
        u8_t *scores_new_deletions) noexcept {

        u8_t const match_cost = this->substituter_.match;
        u8_t const mismatch_cost = this->substituter_.mismatch;
        u8_t const gap_open_cost = this->gap_costs_.open;
        u8_t const gap_extend_cost = this->gap_costs_.extend;
        sz_u32_vec_t match_cost_vec, mismatch_cost_vec, gap_open_cost_vec, gap_extend_cost_vec, equality_vec;
        match_cost_vec.u32 = match_cost * 0x01010101u;           // ! 4x `u8` match costs
        mismatch_cost_vec.u32 = mismatch_cost * 0x01010101u;     // ! 4x `u8` mismatch costs
        gap_open_cost_vec.u32 = gap_open_cost * 0x01010101u;     // ! 4x `u8` gap costs
        gap_extend_cost_vec.u32 = gap_extend_cost * 0x01010101u; // ! 4x `u8` gap costs

        // The hardest part of this kernel is dealing with unaligned loads!
        // We want to minimize single-byte processing in favor of 4-byte SIMD loads and min/max operations.
        // Assuming we are reading consecutive values from a buffer, in every cycle, most likely, we will be
        // dealing with most values being unaligned!
        sz_u32_vec_t pre_substitution_vec, pre_insertion_opening_vec, pre_deletion_opening_vec;
        sz_u32_vec_t pre_insertion_expansion_vec, pre_deletion_expansion_vec;
        sz_u32_vec_t first_vec, second_vec;
        sz_u32_vec_t cost_of_substitution_vec, if_substitution_vec, if_insertion_vec, if_deletion_vec;
        sz_u32_vec_t cell_score_vec;

        // ! As we are processing 4 bytes per loop, and have at least 32 threads per block (32 * 4 = 128),
        // ! and deal with strings only under 256 bytes, this loop will fire at most twice per input.
        for (unsigned i = tasks_offset * 4; i < tasks_count; i += tasks_step * 4) { // ! it's OK to spill beyond bounds
            pre_substitution_vec = sz_u32_load_unaligned(scores_pre_substitution + i);
            pre_insertion_opening_vec = sz_u32_load_unaligned(scores_pre_insertion + i);
            pre_deletion_opening_vec = sz_u32_load_unaligned(scores_pre_deletion + i);
            pre_insertion_expansion_vec = sz_u32_load_unaligned(scores_running_insertions + i);
            pre_deletion_expansion_vec = sz_u32_load_unaligned(scores_running_deletions + i);
            first_vec = sz_u32_load_unaligned(first_slice + tasks_count - i - 4); // ! this may be OOB
            second_vec = sz_u32_load_unaligned(second_slice + i);                 // ! this may be OOB, but padded
            first_vec.u32 = __nv_bswap32(first_vec.u32); // ! reverse the order of bytes in the first vector

            // Equality comparison will output 0xFF for each matching byte.
            equality_vec.u32 = __vcmpeq4(first_vec.u32, second_vec.u32);
            cost_of_substitution_vec.u32 =                //
                (equality_vec.u32 & match_cost_vec.u32) + //
                (~equality_vec.u32 & mismatch_cost_vec.u32);
            if_substitution_vec.u32 = __vaddus4(pre_substitution_vec.u32, cost_of_substitution_vec.u32);
            if_insertion_vec.u32 = __vminu4(__vaddus4(pre_insertion_opening_vec.u32, gap_open_cost_vec.u32),
                                            __vaddus4(pre_insertion_expansion_vec.u32, gap_extend_cost_vec.u32));
            if_deletion_vec.u32 = __vminu4(__vaddus4(pre_deletion_opening_vec.u32, gap_open_cost_vec.u32),
                                           __vaddus4(pre_deletion_expansion_vec.u32, gap_extend_cost_vec.u32));
            cell_score_vec.u32 = __vminu4(if_substitution_vec.u32, __vminu4(if_insertion_vec.u32, if_deletion_vec.u32));

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i + 0] = cell_score_vec.u8s[0];
            scores_new[i + 1] = cell_score_vec.u8s[1];
            scores_new[i + 2] = cell_score_vec.u8s[2];
            scores_new[i + 3] = cell_score_vec.u8s[3];
            scores_new_insertions[i + 0] = if_insertion_vec.u8s[0];
            scores_new_insertions[i + 1] = if_insertion_vec.u8s[1];
            scores_new_insertions[i + 2] = if_insertion_vec.u8s[2];
            scores_new_insertions[i + 3] = if_insertion_vec.u8s[3];
            scores_new_deletions[i + 0] = if_deletion_vec.u8s[0];
            scores_new_deletions[i + 1] = if_deletion_vec.u8s[1];
            scores_new_deletions[i + 2] = if_deletion_vec.u8s[2];
            scores_new_deletions[i + 3] = if_deletion_vec.u8s[3];
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if (tasks_offset == 0) this->final_score_ = scores_new[0];
    }
};

/**
 *  @brief GPU adaptation of the `tile_scorer` - Minimizes Global Levenshtein distance with affine gap costs.
 *  @note Requires Kepler generation GPUs to handle 2x `u16` scores at a time.
 *
 *  Relies on following instruction families to output 2x @b `u16` scores per call:
 *  - @b `prmt` to shuffle bytes in 32 bit registers.
 *  - @b `vmax2,vmin2,vadd2` video-processing instructions.
 */
template <>
struct tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, affine_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>
    : public tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, affine_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
    using kepler_warp_scorer_t =
        tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, affine_gap_costs_t,
                    sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>;

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,             // ! Unlike CPU, uses `unsigned`
        u16_t const *scores_pre_substitution,   //
        u16_t const *scores_pre_insertion,      //
        u16_t const *scores_pre_deletion,       //
        u16_t const *scores_running_insertions, //
        u16_t const *scores_running_deletions,  //
        u16_t *scores_new,                      //
        u16_t *scores_new_insertions,           //
        u16_t *scores_new_deletions) noexcept {

        u16_t const match_cost = this->substituter_.match;
        u16_t const mismatch_cost = this->substituter_.mismatch;
        u16_t const gap_open_cost = this->gap_costs_.open;
        u16_t const gap_extend_cost = this->gap_costs_.extend;

        sz_u32_vec_t match_cost_vec, mismatch_cost_vec, gap_open_cost_vec, gap_extend_cost_vec, equality_vec;
        match_cost_vec.u32 = match_cost * 0x00010001;           // ! 2x `u16` match costs
        mismatch_cost_vec.u32 = mismatch_cost * 0x00010001;     // ! 2x `u16` mismatch costs
        gap_open_cost_vec.u32 = gap_open_cost * 0x00010001;     // ! 2x `u16` gap costs
        gap_extend_cost_vec.u32 = gap_extend_cost * 0x00010001; // ! 2x `u16` gap costs

        // The hardest part of this kernel is dealing with unaligned loads!
        // We want to minimize single-byte processing in favor of 2-byte SIMD loads and min/max operations.
        // Assuming we are reading consecutive values from a buffer, in every cycle, most likely, we will be
        // dealing with most values being unaligned!
        sz_u32_vec_t pre_substitution_vec, pre_insertion_opening_vec, pre_deletion_opening_vec;
        sz_u32_vec_t pre_insertion_expansion_vec, pre_deletion_expansion_vec;
        sz_u32_vec_t first_vec, second_vec;
        sz_u32_vec_t cost_of_substitution_vec, if_substitution_vec, if_insertion_vec, if_deletion_vec;
        sz_u32_vec_t cell_score_vec;

        // ! As we are processing 2 bytes per loop, and have at least 32 threads per block (32 * 2 = 64),
        // ! and deal with strings only under 64k bytes, this loop will fire at most 1K times per input
        for (unsigned i = tasks_offset * 2; i < tasks_count; i += tasks_step * 2) { // ! it's OK to spill beyond bounds
            pre_substitution_vec.u16s[0] = scores_pre_substitution[i + 0];
            pre_substitution_vec.u16s[1] = scores_pre_substitution[i + 1];
            pre_insertion_opening_vec.u16s[0] = scores_pre_insertion[i + 0];
            pre_insertion_opening_vec.u16s[1] = scores_pre_insertion[i + 1];
            pre_deletion_opening_vec.u16s[0] = scores_pre_deletion[i + 0];
            pre_deletion_opening_vec.u16s[1] = scores_pre_deletion[i + 1];
            pre_insertion_expansion_vec.u16s[0] = scores_running_insertions[i + 0];
            pre_insertion_expansion_vec.u16s[1] = scores_running_insertions[i + 1];
            pre_deletion_expansion_vec.u16s[0] = scores_running_deletions[i + 0];
            pre_deletion_expansion_vec.u16s[1] = scores_running_deletions[i + 1];
            first_vec.u16s[0] = load_immutable_(first_slice + tasks_count - i - 1);
            first_vec.u16s[1] = load_immutable_(first_slice + tasks_count - i - 2); // ! this may be OOB
            second_vec.u16s[0] = load_immutable_(second_slice + i + 0);
            second_vec.u16s[1] = load_immutable_(second_slice + i + 1); // ! this may be OOB, but padded

            // Equality comparison will output 0xFFFF for each matching byte-pair.
            equality_vec.u32 = __vcmpeq2(first_vec.u32, second_vec.u32);
            cost_of_substitution_vec.u32 =                //
                (equality_vec.u32 & match_cost_vec.u32) + //
                (~equality_vec.u32 & mismatch_cost_vec.u32);
            if_substitution_vec.u32 = __vaddus2(pre_substitution_vec.u32, cost_of_substitution_vec.u32);
            if_insertion_vec.u32 = __vminu2(__vaddus2(pre_insertion_opening_vec.u32, gap_open_cost_vec.u32),
                                            __vaddus2(pre_insertion_expansion_vec.u32, gap_extend_cost_vec.u32));
            if_deletion_vec.u32 = __vminu2(__vaddus2(pre_deletion_opening_vec.u32, gap_open_cost_vec.u32),
                                           __vaddus2(pre_deletion_expansion_vec.u32, gap_extend_cost_vec.u32));
            cell_score_vec.u32 = __vminu2(if_substitution_vec.u32, __vminu2(if_insertion_vec.u32, if_deletion_vec.u32));

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i + 0] = cell_score_vec.u16s[0];
            scores_new[i + 1] = cell_score_vec.u16s[1];
            scores_new_insertions[i + 0] = if_insertion_vec.u16s[0];
            scores_new_insertions[i + 1] = if_insertion_vec.u16s[1];
            scores_new_deletions[i + 0] = if_deletion_vec.u16s[0];
            scores_new_deletions[i + 1] = if_deletion_vec.u16s[1];
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if (tasks_offset == 0) this->final_score_ = scores_new[0];
    }
};

template <>
struct tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, affine_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>
    : public tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, affine_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {
    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
};

template <>
struct tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, affine_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k>
    : public tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, affine_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {
    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
};

#endif

/*  Hopper-generation optimizations are quite different from Kepler.
 *  Our Kepler optimizations are mostly designed for 8-bit and 16-bit scalars packed as 32-bit words,
 *  while Hopper optimizations are designed for 16-bit and 32-bit scalars, grouping chains of add/min/max
 *  operations using DPX instructions.
 */

#if SZ_USE_HOPPER

template <>
struct tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, linear_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, linear_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k> {
    using kepler_warp_scorer_t::tile_scorer; // Make the constructors visible
};

/**
 *  @brief GPU adaptation of the `tile_scorer` - Minimizes Global Levenshtein distance with linear gap costs.
 *  @note Requires Hopper generation GPUs to handle 2x `u16` scores at a time.
 */
template <>
struct tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, linear_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, linear_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {
    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,           // ! Unlike CPU, uses `unsigned`
        u16_t const *scores_pre_substitution, //
        u16_t const *scores_pre_insertion,    //
        u16_t const *scores_pre_deletion,     //
        u16_t *scores_new) noexcept {

        u16_t const match_cost = this->substituter_.match;
        u16_t const mismatch_cost = this->substituter_.mismatch;
        u16_t const gap_cost = this->gap_costs_.open_or_extend;

        sz_u32_vec_t match_cost_vec, mismatch_cost_vec, gap_cost_vec, equality_vec;
        match_cost_vec.u32 = match_cost * 0x00010001;       // ! 2x `u16` match costs
        mismatch_cost_vec.u32 = mismatch_cost * 0x00010001; // ! 2x `u16` mismatch costs
        gap_cost_vec.u32 = gap_cost * 0x00010001;           // ! 2x `u16` gap costs

        // The hardest part of this kernel is dealing with unaligned loads!
        // We want to minimize single-byte processing in favor of 2-byte SIMD loads and min/max operations.
        // Assuming we are reading consecutive values from a buffer, in every cycle, most likely, we will be
        // dealing with most values being unaligned!
        sz_u32_vec_t pre_substitution_vec, pre_insertion_vec, pre_deletion_vec;
        sz_u32_vec_t first_vec, second_vec;
        sz_u32_vec_t cost_of_substitution_vec, if_substitution_vec, if_insertion_vec, if_deletion_vec;
        sz_u32_vec_t cell_score_vec;

        // ! As we are processing 2 bytes per loop, and have at least 32 threads per block (32 * 2 = 64),
        // ! and deal with strings only under 64k bytes, this loop will fire at most 1K times per input
        for (unsigned i = tasks_offset * 2; i < tasks_count; i += tasks_step * 2) { // ! it's OK to spill beyond bounds
            pre_substitution_vec.u16s[0] = scores_pre_substitution[i + 0];
            pre_substitution_vec.u16s[1] = scores_pre_substitution[i + 1];
            pre_insertion_vec.u16s[0] = scores_pre_insertion[i + 0];
            pre_insertion_vec.u16s[1] = scores_pre_insertion[i + 1];
            pre_deletion_vec.u16s[0] = scores_pre_deletion[i + 0];
            pre_deletion_vec.u16s[1] = scores_pre_deletion[i + 1];
            first_vec.u16s[0] = load_immutable_(first_slice + tasks_count - i - 1);
            first_vec.u16s[1] = load_immutable_(first_slice + tasks_count - i - 2); // ! this may be OOB
            second_vec.u16s[0] = load_immutable_(second_slice + i + 0);
            second_vec.u16s[1] = load_immutable_(second_slice + i + 1); // ! this may be OOB, but padded

            // Equality comparison will output 0xFFFF for each matching byte-pair.
            equality_vec.u32 = __vcmpeq2(first_vec.u32, second_vec.u32);
            cost_of_substitution_vec.u32 =                //
                (equality_vec.u32 & match_cost_vec.u32) + //
                (~equality_vec.u32 & mismatch_cost_vec.u32);
            if_substitution_vec.u32 = __vaddus2(pre_substitution_vec.u32, cost_of_substitution_vec.u32);
            if_insertion_vec.u32 = __vaddus2(pre_insertion_vec.u32, gap_cost_vec.u32);
            if_deletion_vec.u32 = __vaddus2(pre_deletion_vec.u32, gap_cost_vec.u32);
            cell_score_vec.u32 = __vimin3_u16x2(if_substitution_vec.u32, if_insertion_vec.u32, if_deletion_vec.u32);

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i + 0] = cell_score_vec.u16s[0];
            scores_new[i + 1] = cell_score_vec.u16s[1];
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if (tasks_offset == 0) this->final_score_ = scores_new[0];
    }
};

template <>
struct tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, linear_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, linear_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
};

template <>
struct tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, linear_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, linear_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
};

template <>
struct tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, affine_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, u8_t, uniform_substitution_costs_t, affine_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ck_k> {
    using kepler_warp_scorer_t::tile_scorer; // Make the constructors visible
};

/**
 *  @brief GPU adaptation of the `tile_scorer` - Minimizes Global Levenshtein distance with affine gap costs.
 *  @note Requires Hopper generation GPUs to handle 2x `u8` scores at a time.
 */
template <>
struct tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, affine_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, u16_t, uniform_substitution_costs_t, affine_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,             // ! Unlike CPU, uses `unsigned`
        u16_t const *scores_pre_substitution,   //
        u16_t const *scores_pre_insertion,      //
        u16_t const *scores_pre_deletion,       //
        u16_t const *scores_running_insertions, //
        u16_t const *scores_running_deletions,  //
        u16_t *scores_new,                      //
        u16_t *scores_new_insertions,           //
        u16_t *scores_new_deletions) noexcept {

        u16_t const match_cost = this->substituter_.match;
        u16_t const mismatch_cost = this->substituter_.mismatch;
        u16_t const gap_open_cost = this->gap_costs_.open;
        u16_t const gap_extend_cost = this->gap_costs_.extend;

        sz_u32_vec_t match_cost_vec, mismatch_cost_vec, gap_open_cost_vec, gap_extend_cost_vec, equality_vec;
        match_cost_vec.u32 = match_cost * 0x00010001;           // ! 2x `u16` match costs
        mismatch_cost_vec.u32 = mismatch_cost * 0x00010001;     // ! 2x `u16` mismatch costs
        gap_open_cost_vec.u32 = gap_open_cost * 0x00010001;     // ! 2x `u16` gap costs
        gap_extend_cost_vec.u32 = gap_extend_cost * 0x00010001; // ! 2x `u16` gap costs

        // The hardest part of this kernel is dealing with unaligned loads!
        // We want to minimize single-byte processing in favor of 2-byte SIMD loads and min/max operations.
        // Assuming we are reading consecutive values from a buffer, in every cycle, most likely, we will be
        // dealing with most values being unaligned!
        sz_u32_vec_t pre_substitution_vec, pre_insertion_opening_vec, pre_deletion_opening_vec;
        sz_u32_vec_t pre_insertion_expansion_vec, pre_deletion_expansion_vec;
        sz_u32_vec_t first_vec, second_vec;
        sz_u32_vec_t cost_of_substitution_vec, if_substitution_vec, if_insertion_vec, if_deletion_vec;
        sz_u32_vec_t cell_score_vec;

        // ! As we are processing 2 bytes per loop, and have at least 32 threads per block (32 * 2 = 64),
        // ! and deal with strings only under 64k bytes, this loop will fire at most 1K times per input
        for (unsigned i = tasks_offset * 2; i < tasks_count; i += tasks_step * 2) { // ! it's OK to spill beyond bounds
            pre_substitution_vec.u16s[0] = scores_pre_substitution[i + 0];
            pre_substitution_vec.u16s[1] = scores_pre_substitution[i + 1];
            pre_insertion_opening_vec.u16s[0] = scores_pre_insertion[i + 0];
            pre_insertion_opening_vec.u16s[1] = scores_pre_insertion[i + 1];
            pre_deletion_opening_vec.u16s[0] = scores_pre_deletion[i + 0];
            pre_deletion_opening_vec.u16s[1] = scores_pre_deletion[i + 1];
            pre_insertion_expansion_vec.u16s[0] = scores_running_insertions[i + 0];
            pre_insertion_expansion_vec.u16s[1] = scores_running_insertions[i + 1];
            pre_deletion_expansion_vec.u16s[0] = scores_running_deletions[i + 0];
            pre_deletion_expansion_vec.u16s[1] = scores_running_deletions[i + 1];
            first_vec.u16s[0] = load_immutable_(first_slice + tasks_count - i - 1);
            first_vec.u16s[1] = load_immutable_(first_slice + tasks_count - i - 2); // ! this may be OOB
            second_vec.u16s[0] = load_immutable_(second_slice + i + 0);
            second_vec.u16s[1] = load_immutable_(second_slice + i + 1); // ! this may be OOB, but padded

            // Equality comparison will output 0xFFFF for each matching byte-pair.
            equality_vec.u32 = __vcmpeq2(first_vec.u32, second_vec.u32);
            cost_of_substitution_vec.u32 =                //
                (equality_vec.u32 & match_cost_vec.u32) + //
                (~equality_vec.u32 & mismatch_cost_vec.u32);
            if_substitution_vec.u32 = __vaddus2(pre_substitution_vec.u32, cost_of_substitution_vec.u32);
            if_insertion_vec.u32 = //
                __viaddmin_u16x2(pre_insertion_opening_vec.u32, gap_open_cost_vec.u32,
                                 __vaddus2(pre_insertion_expansion_vec.u32, gap_extend_cost_vec.u32));
            if_deletion_vec.u32 = //
                __viaddmin_u16x2(pre_deletion_opening_vec.u32, gap_open_cost_vec.u32,
                                 __vaddus2(pre_deletion_expansion_vec.u32, gap_extend_cost_vec.u32));
            cell_score_vec.u32 = __vimin3_u16x2(if_substitution_vec.u32, if_insertion_vec.u32, if_deletion_vec.u32);

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i + 0] = cell_score_vec.u16s[0];
            scores_new[i + 1] = cell_score_vec.u16s[1];
            scores_new_insertions[i + 0] = if_insertion_vec.u16s[0];
            scores_new_insertions[i + 1] = if_insertion_vec.u16s[1];
            scores_new_deletions[i + 0] = if_deletion_vec.u16s[0];
            scores_new_deletions[i + 1] = if_deletion_vec.u16s[1];
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if (tasks_offset == 0) this->final_score_ = scores_new[0];
    }
};

template <>
struct tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, affine_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, u32_t, uniform_substitution_costs_t, affine_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
};

template <>
struct tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, affine_gap_costs_t,
                   sz_minimize_distance_k, sz_similarity_global_k, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, u64_t, uniform_substitution_costs_t, affine_gap_costs_t,
                         sz_minimize_distance_k, sz_similarity_global_k, sz_cap_cuda_k> {

    using cuda_warp_scorer_t::tile_scorer; // Make the constructors visible
};

#endif

/**
 *  @brief  String similarity scoring algorithm evaluating a @b single Dynamic Programming matrix
 *          @b three skewed (reverse) diagonals at a time on a GPU, leveraging CUDA for parallelization.
 *          Unlike the `_levenshtein_in_cuda_warp` is designed to take one pair of very-longs string,
 *          ideally @b Tens-of-Megabytes in size or more.
 *
 *  @param[in] shorter_string The shorter string in the pair for score calculation.
 *  @param[in] longer_string The longer string in the pair for score calculation.
 *  @param[out] result_ptr Output address of the score for the pair of strings.
 *
 *  The ideal plan is:
 *  - Use cooperative groups abstractions for grid level synchronization between iterations.
 *  - Keep 3 diagonals in shared memory, but not the texts, as depending on the diagonal, different
 *    characters will be needed. Asynchronous copy of the characters from global memory to shared memory
 *    will help hide that latency.
 *  - Each block of threads takes its own slice of those 3 diagonals, constrained by the amount of shared
 *    memory available on the device.
 *  - Every neighboring pair of diagonals has a different length, and boundary elements of each slice need
 *    to be exchanged through shared memory.
 *
 *  The current starter plan is much simpler:
 *  - Keep everything in global memory - the strings and the diagonals.
 *  - Execute the naive algorithm, expecting the hardware to handle coalescing the memory accesses.
 */
template <                                                       //
    typename char_type_ = char,                                  //
    typename index_type_ = unsigned,                             //
    typename score_type_ = size_t,                               //
    typename final_score_type_ = size_t,                         //
    typename substituter_type_ = uniform_substitution_costs_t,   //
    sz_similarity_objective_t objective_ = sz_maximize_score_k,  //
    sz_similarity_locality_t locality_ = sz_similarity_global_k, //
    sz_capability_t capability_ = sz_cap_cuda_k                  //
    >
__global__ void linear_score_across_cuda_device_(              //
    char_type_ const *shorter_ptr, index_type_ shorter_length, //
    char_type_ const *longer_ptr, index_type_ longer_length,   //
    final_score_type_ *result_ptr, score_type_ *diagonals_ptr, //
    substituter_type_ const substituter, linear_gap_costs_t const gap_costs) {

    namespace cg = cooperative_groups;

    sz_assert_(shorter_length > 0);
    sz_assert_(longer_length > 0);
    sz_assert_(shorter_length <= longer_length);
    using char_t = char_type_;
    using index_t = index_type_;
    using score_t = score_type_;
    using final_score_t = final_score_type_;

    static constexpr sz_capability_t capability_k = capability_;
    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_similarity_objective_t objective_k = objective_;

    // Pre-load the substituter and gap costs.
    using substituter_t = substituter_type_;
    using gap_costs_t = linear_gap_costs_t;
    static_assert(std::is_trivially_copyable<substituter_t>::value, "Substituter must be trivially copyable.");
    static_assert(std::is_trivially_copyable<gap_costs_t>::value, "Gap costs must be trivially copyable.");

    using cuda_warp_scorer_t = tile_scorer<char_t const *, char_t const *, score_t, substituter_t, gap_costs_t,
                                           objective_k, locality_k, capability_k>;

    // Only one thread will be initializing the top row and left column and outputting the result.
    bool const is_main_thread = blockIdx.x == 0 && threadIdx.x == 0;

    // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register.
    // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`.
    index_t const shorter_dim = shorter_length + 1, longer_dim = longer_length + 1;

    // Let's say we are dealing with 3 and 5 letter words.
    // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim).
    // It will have:
    // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3.
    // - 2 diagonals of fixed length, at positions: 4, 5.
    // - 3 diagonals of decreasing length, at positions: 6, 7, 8.
    index_t const diagonals_count = shorter_dim + longer_dim - 1;
    index_t const max_diagonal_length = shorter_length + 1;

    // The next few pointers will be swapped around.
    score_t *previous_scores = diagonals_ptr;
    score_t *current_scores = diagonals_ptr + max_diagonal_length;
    score_t *next_scores = diagonals_ptr + 2 * max_diagonal_length;

    // Initialize the first two diagonals:
    cuda_warp_scorer_t diagonal_aligner {substituter, gap_costs};
    if (is_main_thread) {
        diagonal_aligner.init_score(previous_scores[0], 0);
        diagonal_aligner.init_score(current_scores[0], 1);
        diagonal_aligner.init_score(current_scores[1], 1);
    }

    cg::grid_group grid = cg::this_grid();

    // We skip diagonals 0 and 1, as they are trivial.
    // We will start with diagonal 2, which has length 3, with the first and last elements being preset,
    // so we are effectively computing just one value, as will be marked by a single set bit in
    // the `next_diagonal_mask` on the very first iteration.
    index_t next_diagonal_index = 2;
    index_t const global_thread_index = threadIdx.x + blockIdx.x * blockDim.x;
    index_t const global_thread_step = blockDim.x * gridDim.x;

    // Progress through the upper-left triangle of the Levenshtein matrix.
    for (; next_diagonal_index < shorter_dim; ++next_diagonal_index) {

        index_t const next_diagonal_length = next_diagonal_index + 1;
        diagonal_aligner(                            //
            shorter_ptr,                             // first sequence of characters
            longer_ptr,                              // second sequence of characters
            global_thread_index, global_thread_step, //
            (index_t)(next_diagonal_length - 2),     // number of elements to compute with the `diagonal_aligner`
            previous_scores,                         // costs pre substitution
            current_scores, current_scores + 1,      // costs pre insertion/deletion
            next_scores + 1);                        // ! notice unaligned write destination

        // Don't forget to populate the first row and the first column of the Levenshtein matrix.
        if (is_main_thread) {
            diagonal_aligner.init_score(next_scores[0], next_diagonal_index);
            diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index);
        }
        // Guarantee that all the writes have finished, before progressing to the next diagonal.
        grid.sync();

        // Perform a circular rotation of those buffers, to reuse the memory.
        rotate_three(previous_scores, current_scores, next_scores);
    }

    __shared__ cuda::pipeline_shared_state<cuda::thread_scope_system, 2> memcpy_pipeline_state;
    auto memcpy_pipeline = cuda::make_pipeline(grid, &memcpy_pipeline_state);

    // Now let's handle the anti-diagonal band of the matrix, between the top and bottom-right triangles.
    for (; next_diagonal_index < longer_dim; ++next_diagonal_index) {

        index_t const next_diagonal_length = shorter_dim;
        diagonal_aligner(                                   //
            shorter_ptr,                                    // first sequence of characters
            longer_ptr + next_diagonal_index - shorter_dim, // second sequence of characters
            global_thread_index, global_thread_step,        //
            (index_t)(next_diagonal_length - 1),            // number of elements to compute with the `diagonal_aligner`
            previous_scores,                                // costs pre substitution
            current_scores, current_scores + 1,             // costs pre insertion/deletion
            next_scores);

        // Don't forget to populate the first row of the Levenshtein matrix.
        if (is_main_thread) diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index);

        // Guarantee that all the writes have finished, before progressing to the next diagonal.
        grid.sync();

        // ! In the central anti-diagonal band, we can't just set the `current_scores + 1` to `previous_scores`
        // ! for the circular shift, as we will end up spilling outside of the diagonal a few iterations later.
        // ! Assuming in-place `memmove` is tricky on the GPU, so we will copy the data.
        memcpy_pipeline.producer_acquire();
        cuda::memcpy_async(grid, (void *)previous_scores, (void const *)(current_scores + 1),
                           (next_diagonal_length - 1) * sizeof(score_t), memcpy_pipeline);
        cuda::memcpy_async(grid, (void *)current_scores, (void const *)(next_scores),
                           (next_diagonal_length) * sizeof(score_t), memcpy_pipeline);
        memcpy_pipeline.producer_commit();
        memcpy_pipeline.consumer_wait();
        memcpy_pipeline.consumer_release();
    }

    // Now let's handle the bottom-right triangle of the matrix.
    for (; next_diagonal_index < diagonals_count; ++next_diagonal_index) {

        index_t const next_diagonal_length = diagonals_count - next_diagonal_index;
        diagonal_aligner(                                   //
            shorter_ptr + next_diagonal_index - longer_dim, // first sequence of characters
            longer_ptr + next_diagonal_index - shorter_dim, // second sequence of characters
            global_thread_index, global_thread_step,        //
            next_diagonal_length,                           // number of elements to compute with the `diagonal_aligner`
            previous_scores,                                // costs pre substitution
            current_scores, current_scores + 1,             // costs pre insertion/deletion
            next_scores);

        // Guarantee that all the writes have finished, before progressing to the next diagonal.
        grid.sync();

        // Perform a circular rotation of those buffers, to reuse the memory.
        rotate_three(previous_scores, current_scores, next_scores);

        // ! Drop the first entry among the current scores.
        // ! Assuming every next diagonal is shorter by one element,
        // ! we don't need a full-blown `sz_move` to shift the array by one element.
        previous_scores++;
    }

    // Export one result per each block.
    if (is_main_thread) *result_ptr = static_cast<final_score_t>(diagonal_aligner.score());
}

/**
 *  @brief  String similarity scoring algorithm evaluating a @b single Dynamic Programming matrix
 *          @b three skewed (reverse) diagonals at a time on a GPU, leveraging CUDA for parallelization.
 *          Unlike the `_levenshtein_in_cuda_warp` is designed to take one pair of very-longs string,
 *          ideally @b Tens-of-Megabytes in size or more.
 *
 *  @param[in] shorter_string The shorter string in the pair for score calculation.
 *  @param[in] longer_string The longer string in the pair for score calculation.
 *  @param[out] result_ptr Output address of the score for the pair of strings.
 */
template <                                                       //
    typename char_type_ = char,                                  //
    typename index_type_ = unsigned,                             //
    typename score_type_ = size_t,                               //
    typename final_score_type_ = size_t,                         //
    typename substituter_type_ = uniform_substitution_costs_t,   //
    sz_similarity_objective_t objective_ = sz_maximize_score_k,  //
    sz_similarity_locality_t locality_ = sz_similarity_global_k, //
    sz_capability_t capability_ = sz_cap_cuda_k                  //
    >
__global__ void affine_score_across_cuda_device(               //
    char_type_ const *shorter_ptr, index_type_ shorter_length, //
    char_type_ const *longer_ptr, index_type_ longer_length,   //
    final_score_type_ *result_ptr, score_type_ *diagonals_ptr, //
    substituter_type_ const substituter, affine_gap_costs_t const gap_costs) {

    namespace cg = cooperative_groups;

    sz_assert_(shorter_length > 0);
    sz_assert_(longer_length > 0);
    sz_assert_(shorter_length <= longer_length);
    using char_t = char_type_;
    using index_t = index_type_;
    using score_t = score_type_;
    using final_score_t = final_score_type_;

    static constexpr sz_capability_t capability_k = capability_;
    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_similarity_objective_t objective_k = objective_;

    // Pre-load the substituter and gap costs.
    using substituter_t = substituter_type_;
    using gap_costs_t = affine_gap_costs_t;
    static_assert(std::is_trivially_copyable<substituter_t>::value, "Substituter must be trivially copyable.");
    static_assert(std::is_trivially_copyable<gap_costs_t>::value, "Gap costs must be trivially copyable.");

    using cuda_warp_scorer_t = tile_scorer<char_t const *, char_t const *, score_t, substituter_t, gap_costs_t,
                                           objective_k, locality_k, capability_k>;

    // Only one thread will be initializing the top row and left column and outputting the result.
    bool const is_main_thread = blockIdx.x == 0 && threadIdx.x == 0; // ! Differs for warp-wide

    // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register.
    // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`.
    index_t const shorter_dim = shorter_length + 1, longer_dim = longer_length + 1;

    // Let's say we are dealing with 3 and 5 letter words.
    // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim).
    // It will have:
    // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3.
    // - 2 diagonals of fixed length, at positions: 4, 5.
    // - 3 diagonals of decreasing length, at positions: 6, 7, 8.
    index_t const diagonals_count = shorter_dim + longer_dim - 1;
    index_t const max_diagonal_length = shorter_length + 1;

    // The next few pointers will be swapped around.
    score_t *previous_scores = diagonals_ptr;
    score_t *current_scores = diagonals_ptr + max_diagonal_length;
    score_t *next_scores = diagonals_ptr + 2 * max_diagonal_length;
    score_t *current_inserts = diagonals_ptr + 3 * max_diagonal_length;
    score_t *next_inserts = diagonals_ptr + 4 * max_diagonal_length;
    score_t *current_deletes = diagonals_ptr + 5 * max_diagonal_length;
    score_t *next_deletes = diagonals_ptr + 6 * max_diagonal_length;

    // Initialize the first two diagonals:
    cuda_warp_scorer_t diagonal_aligner {substituter, gap_costs};
    if (is_main_thread) {
        diagonal_aligner.init_score(previous_scores[0], 0);
        diagonal_aligner.init_score(current_scores[0], 1);
        diagonal_aligner.init_score(current_scores[1], 1);
        diagonal_aligner.init_gap(current_inserts[0], 1);
        diagonal_aligner.init_gap(current_deletes[1], 1);
    }

    cg::grid_group grid = cg::this_grid();

    // We skip diagonals 0 and 1, as they are trivial.
    // We will start with diagonal 2, which has length 3, with the first and last elements being preset,
    // so we are effectively computing just one value, as will be marked by a single set bit in
    // the `next_diagonal_mask` on the very first iteration.
    index_t next_diagonal_index = 2;
    index_t const global_thread_index = threadIdx.x + blockIdx.x * blockDim.x;
    index_t const global_thread_step = blockDim.x * gridDim.x;

    // Progress through the upper-left triangle of the Levenshtein matrix.
    for (; next_diagonal_index < shorter_dim; ++next_diagonal_index) {

        index_t const next_diagonal_length = next_diagonal_index + 1;
        diagonal_aligner(                            //
            shorter_ptr,                             // first sequence of characters
            longer_ptr,                              // second sequence of characters
            global_thread_index, global_thread_step, //
            (index_t)(next_diagonal_length - 2),     // number of elements to compute with the `diagonal_aligner`
            previous_scores,                         // costs pre substitution
            current_scores, current_scores + 1,      // costs pre insertion/deletion opening
            current_inserts, current_deletes + 1,    // costs pre insertion/deletion extension
            next_scores + 1,                         // ! notice unaligned write destination
            next_inserts + 1, next_deletes + 1       // ! notice unaligned write destination
        );

        // Don't forget to populate the first row and the first column of the Levenshtein matrix.
        if (is_main_thread) {
            diagonal_aligner.init_score(next_scores[0], next_diagonal_index);
            diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index);
            diagonal_aligner.init_gap(next_inserts[0], next_diagonal_index);
            diagonal_aligner.init_gap(next_deletes[next_diagonal_length - 1], next_diagonal_index);
        }
        // Guarantee that all the writes have finished, before progressing to the next diagonal.
        grid.sync();

        // Perform a circular rotation of those buffers, to reuse the memory.
        rotate_three(previous_scores, current_scores, next_scores);
        trivial_swap(current_inserts, next_inserts);
        trivial_swap(current_deletes, next_deletes);
    }

    __shared__ cuda::pipeline_shared_state<cuda::thread_scope_system, 2> memcpy_pipeline_state;
    auto memcpy_pipeline = cuda::make_pipeline(grid, &memcpy_pipeline_state);

    // Now let's handle the anti-diagonal band of the matrix, between the top and bottom-right triangles.
    for (; next_diagonal_index < longer_dim; ++next_diagonal_index) {

        index_t const next_diagonal_length = shorter_dim;
        diagonal_aligner(                                   //
            shorter_ptr,                                    // first sequence of characters
            longer_ptr + next_diagonal_index - shorter_dim, // second sequence of characters
            global_thread_index, global_thread_step,        //
            (index_t)(next_diagonal_length - 1),            // number of elements to compute with the `diagonal_aligner`
            previous_scores,                                // costs pre substitution
            current_scores, current_scores + 1,             // costs pre insertion/deletion opening
            current_inserts, current_deletes + 1,           // costs pre insertion/deletion extension
            next_scores,                                    // updated similarity scores
            next_inserts, next_deletes                      // updated insertion/deletion extensions
        );

        // Don't forget to populate the first row of the Levenshtein matrix.
        if (is_main_thread) {
            diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index);
            diagonal_aligner.init_gap(next_deletes[next_diagonal_length - 1], next_diagonal_index);
        }

        trivial_swap(current_inserts, next_inserts);
        trivial_swap(current_deletes, next_deletes);

        // Guarantee that all the writes have finished, before progressing to the next diagonal.
        grid.sync();

        // ! In the central anti-diagonal band, we can't just set the `current_scores + 1` to `previous_scores`
        // ! for the circular shift, as we will end up spilling outside of the diagonal a few iterations later.
        // ! Assuming in-place `memmove` is tricky on the GPU, so we will copy the data.
        memcpy_pipeline.producer_acquire();
        cuda::memcpy_async(grid, (void *)previous_scores, (void const *)(current_scores + 1),
                           (next_diagonal_length - 1) * sizeof(score_t), memcpy_pipeline);
        cuda::memcpy_async(grid, (void *)current_scores, (void const *)(next_scores),
                           (next_diagonal_length) * sizeof(score_t), memcpy_pipeline);
        memcpy_pipeline.producer_commit();
        memcpy_pipeline.consumer_wait();
        memcpy_pipeline.consumer_release();
    }

    // Now let's handle the bottom-right triangle of the matrix.
    for (; next_diagonal_index < diagonals_count; ++next_diagonal_index) {

        index_t const next_diagonal_length = diagonals_count - next_diagonal_index;
        diagonal_aligner(                                   //
            shorter_ptr + next_diagonal_index - longer_dim, // first sequence of characters
            longer_ptr + next_diagonal_index - shorter_dim, // second sequence of characters
            global_thread_index, global_thread_step,        //
            next_diagonal_length,                           // number of elements to compute with the `diagonal_aligner`
            previous_scores,                                // costs pre substitution
            current_scores, current_scores + 1,             // costs pre insertion/deletion opening
            current_inserts, current_deletes + 1,           // costs pre insertion/deletion extension
            next_scores,                                    // updated similarity scores
            next_inserts, next_deletes                      // updated insertion/deletion extensions
        );

        // Guarantee that all the writes have finished, before progressing to the next diagonal.
        grid.sync();

        // Perform a circular rotation of those buffers, to reuse the memory.
        rotate_three(previous_scores, current_scores, next_scores);
        trivial_swap(current_inserts, next_inserts);
        trivial_swap(current_deletes, next_deletes);

        // ! Drop the first entry among the current scores.
        // ! Assuming every next diagonal is shorter by one element,
        // ! we don't need a full-blown `sz_move` to shift the array by one element.
        previous_scores++;
    }

    // Export one result per each block.
    if (is_main_thread) *result_ptr = static_cast<final_score_t>(diagonal_aligner.score());
}

/**
 *  @brief  Levenshtein edit distances algorithm evaluating the Dynamic Programming matrix
 *          @b three skewed (reverse) diagonals at a time on a GPU, leveraging CUDA for parallelization.
 *          Each pair of strings gets its own @b "block" of CUDA threads forming one @b warp and shared memory.
 *
 *  @param[in] tasks Tasks containing the strings and output locations.
 *  @param[in] tasks_count The number of tasks to process.
 *  @param[in] substituter The substitution costs.
 *  @param[in] gap_costs The @b linear gap costs.
 */
template < //
    typename task_type_,
    typename char_type_ = char,                                  //
    typename index_type_ = unsigned,                             //
    typename score_type_ = size_t,                               //
    typename substituter_type_ = uniform_substitution_costs_t,   //
    sz_similarity_objective_t objective_ = sz_maximize_score_k,  //
    sz_similarity_locality_t locality_ = sz_similarity_global_k, //
    sz_capability_t capability_ = sz_cap_cuda_k                  //
    >
__global__ void linear_score_on_each_cuda_warp_(                             //
    task_type_ *tasks, size_t tasks_count,                                   //
    substituter_type_ const substituter, linear_gap_costs_t const gap_costs, //
    unsigned const shared_memory_size) {

    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
    using task_t = task_type_;
    using char_t = char_type_;
    using index_t = index_type_;
    using score_t = score_type_;

    static constexpr sz_capability_t capability_k = capability_;
    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_similarity_objective_t objective_k = objective_;

    // Pre-load the substituter and gap costs.
    using substituter_t = substituter_type_;
    using gap_costs_t = linear_gap_costs_t;
    static_assert(std::is_trivially_copyable<substituter_t>::value, "Substituter must be trivially copyable.");
    static_assert(std::is_trivially_copyable<gap_costs_t>::value, "Gap costs must be trivially copyable.");

    using cuda_warp_scorer_t = tile_scorer<char_t const *, char_t const *, score_t, substituter_t, gap_costs_t,
                                           objective_k, locality_k, capability_k>;

    // We may have multiple warps operating in the same block.
    unsigned const warp_size = warpSize;
    size_t const global_thread_index = static_cast<unsigned>(blockIdx.x * blockDim.x + threadIdx.x);
    size_t const global_warp_index = static_cast<unsigned>(global_thread_index / warp_size);
    size_t const warps_per_block = static_cast<unsigned>(blockDim.x / warp_size);
    size_t const warps_per_device = static_cast<unsigned>(gridDim.x * warps_per_block);
    unsigned const thread_in_warp_index = static_cast<unsigned>(global_thread_index % warp_size);

    // Allocating shared memory is handled on the host side.
    extern __shared__ char shared_memory_for_block[];
    char *const shared_memory_for_warp =
        shared_memory_for_block + (global_warp_index % warps_per_block) * (shared_memory_size / warps_per_block);

    // Only one thread will be initializing the top row and left column and outputting the result.
    bool const is_main_thread = thread_in_warp_index == 0;

    // We are computing N edit distances for N pairs of strings. Not a cartesian product!
    // Each block/warp may end up receiving a different number of strings.
    for (size_t task_idx = global_warp_index; task_idx < tasks_count; task_idx += warps_per_device) {
        task_t &task = tasks[task_idx];
        char_t const *shorter_global = task.shorter_ptr;
        char_t const *longer_global = task.longer_ptr;
        size_t const shorter_length = task.shorter_length;
        size_t const longer_length = task.longer_length;
        auto &result_ref = task.result;

        // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register.
        // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`.
        unsigned const shorter_dim = static_cast<unsigned>(shorter_length + 1);
        unsigned const longer_dim = static_cast<unsigned>(longer_length + 1);

        // Let's say we are dealing with 3 and 5 letter words.
        // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim).
        // It will have:
        // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3.
        // - 2 diagonals of fixed length, at positions: 4, 5.
        // - 3 diagonals of decreasing length, at positions: 6, 7, 8.
        unsigned const diagonals_count = shorter_dim + longer_dim - 1;
        unsigned const max_diagonal_length = shorter_length + 1;
        unsigned const bytes_per_diagonal = round_up_to_multiple<unsigned>(max_diagonal_length * sizeof(score_t), 4);

        // The next few pointers will be swapped around.
        score_t *previous_scores = reinterpret_cast<score_t *>(shared_memory_for_warp);
        score_t *current_scores = reinterpret_cast<score_t *>(shared_memory_for_warp + bytes_per_diagonal);
        score_t *next_scores = reinterpret_cast<score_t *>(shared_memory_for_warp + 2 * bytes_per_diagonal);
        char_t *const longer = reinterpret_cast<char_t *>(shared_memory_for_warp + 3 * bytes_per_diagonal);
        char_t *const shorter = longer + longer_length;

        // Each thread in the warp will be loading it's own set of strided characters into shared memory.
        for (unsigned i = thread_in_warp_index; i < longer_length; i += warp_size) longer[i] = longer_global[i];
        for (unsigned i = thread_in_warp_index; i < shorter_length; i += warp_size) shorter[i] = shorter_global[i];

        // Initialize the first two diagonals:
        cuda_warp_scorer_t diagonal_aligner {substituter, gap_costs};
        if (is_main_thread) {
            diagonal_aligner.init_score(previous_scores[0], 0);
            diagonal_aligner.init_score(current_scores[0], 1);
            diagonal_aligner.init_score(current_scores[1], 1);
        }

        // Make sure the shared memory is fully loaded.
        __syncwarp();

        // We skip diagonals 0 and 1, as they are trivial.
        // We will start with diagonal 2, which has length 3, with the first and last elements being preset,
        // so we are effectively computing just one value, as will be marked by a single set bit in
        // the `next_diagonal_mask` on the very first iteration.
        unsigned next_diagonal_index = 2;

        // Progress through the upper-left triangle of the Levenshtein matrix.
        for (; next_diagonal_index < shorter_dim; ++next_diagonal_index) {

            unsigned const next_diagonal_length = next_diagonal_index + 1;
            diagonal_aligner(                       //
                shorter,                            // first sequence of characters
                longer,                             // second sequence of characters
                thread_in_warp_index, warp_size,    //
                next_diagonal_length - 2,           // number of elements to compute with the `diagonal_aligner`
                previous_scores,                    // costs pre substitution
                current_scores, current_scores + 1, // costs pre insertion/deletion
                next_scores + 1);                   // ! notice unaligned write destination

            // Don't forget to populate the first row and the first column of the Levenshtein matrix.
            if (is_main_thread) {
                diagonal_aligner.init_score(next_scores[0], next_diagonal_index);
                diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index);
            }
            __syncwarp();

            // Perform a circular rotation of those buffers, to reuse the memory.
            rotate_three(previous_scores, current_scores, next_scores);
        }

        // Now let's handle the anti-diagonal band of the matrix, between the top and bottom-right triangles.
        for (; next_diagonal_index < longer_dim; ++next_diagonal_index) {

            unsigned const next_diagonal_length = shorter_dim;
            diagonal_aligner(                               //
                shorter,                                    // first sequence of characters
                longer + next_diagonal_index - shorter_dim, // second sequence of characters
                thread_in_warp_index, warp_size,            //
                next_diagonal_length - 1,                   // number of elements to compute with the `diagonal_aligner`
                previous_scores,                            // costs pre substitution
                current_scores, current_scores + 1,         // costs pre insertion/deletion
                next_scores);

            // Don't forget to populate the first row of the Levenshtein matrix.
            if (is_main_thread) diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index);

            __syncwarp();
            // ! In the central anti-diagonal band, we can't just set the `current_scores + 1` to `previous_scores`
            // ! for the circular shift, as we will end up spilling outside of the diagonal a few iterations later.
            // ! Assuming in-place `memmove` is tricky on the GPU, so we will copy the data.
            for (size_t i = thread_in_warp_index; i + 1 < next_diagonal_length; i += warp_size)
                previous_scores[i] = current_scores[i + 1];
            __syncwarp();
            for (size_t i = thread_in_warp_index; i < next_diagonal_length; i += warp_size)
                current_scores[i] = next_scores[i];
            __syncwarp();
        }

        // Now let's handle the bottom-right triangle of the matrix.
        for (; next_diagonal_index < diagonals_count; ++next_diagonal_index) {

            unsigned const next_diagonal_length = diagonals_count - next_diagonal_index;
            diagonal_aligner(                               //
                shorter + next_diagonal_index - longer_dim, // first sequence of characters
                longer + next_diagonal_index - shorter_dim, // second sequence of characters
                thread_in_warp_index, warp_size,            //
                next_diagonal_length,                       // number of elements to compute with the `diagonal_aligner`
                previous_scores,                            // costs pre substitution
                current_scores, current_scores + 1,         // costs pre insertion/deletion
                next_scores);

            // Perform a circular rotation of those buffers, to reuse the memory.
            rotate_three(previous_scores, current_scores, next_scores);

            // ! Drop the first entry among the current scores.
            // ! Assuming every next diagonal is shorter by one element,
            // ! we don't need a full-blown `sz_move` to shift the array by one element.
            previous_scores++;
            __syncwarp();
        }

        // Export one result per each block.
        if (is_main_thread) result_ref = diagonal_aligner.score();
    }
}

/**
 *  @brief  Levenshtein edit distances algorithm evaluating the Dynamic Programming matrix
 *          @b three skewed (reverse) diagonals at a time on a GPU, leveraging CUDA for parallelization.
 *          Each pair of strings gets its own @b "block" of CUDA threads forming one @b warp and shared memory.
 *
 *  @param[in] tasks Tasks containing the strings and output locations.
 *  @param[in] tasks_count The number of tasks to process.
 *  @param[in] substituter The substitution costs.
 *  @param[in] gap_costs The @b affine gap costs.
 */
template < //
    typename task_type_,
    typename char_type_ = char,                                  //
    typename index_type_ = unsigned,                             //
    typename score_type_ = size_t,                               //
    typename substituter_type_ = uniform_substitution_costs_t,   //
    sz_similarity_objective_t objective_ = sz_maximize_score_k,  //
    sz_similarity_locality_t locality_ = sz_similarity_global_k, //
    sz_capability_t capability_ = sz_cap_cuda_k                  //
    >
__global__ void affine_score_on_each_cuda_warp_(                             //
    task_type_ *tasks, size_t tasks_count,                                   //
    substituter_type_ const substituter, affine_gap_costs_t const gap_costs, //
    unsigned const shared_memory_size) {

    // Simplify usage in higher-level libraries, where wrapping custom allocators may be troublesome.
    using task_t = task_type_;
    using char_t = char_type_;
    using index_t = index_type_;
    using score_t = score_type_;

    static constexpr sz_capability_t capability_k = capability_;
    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_similarity_objective_t objective_k = objective_;

    // Pre-load the substituter and gap costs.
    using substituter_t = substituter_type_;
    using gap_costs_t = affine_gap_costs_t;
    static_assert(std::is_trivially_copyable<substituter_t>::value, "Substituter must be trivially copyable.");
    static_assert(std::is_trivially_copyable<gap_costs_t>::value, "Gap costs must be trivially copyable.");

    using cuda_warp_scorer_t = tile_scorer<char_t const *, char_t const *, score_t, substituter_t, gap_costs_t,
                                           objective_k, locality_k, capability_k>;

    // We may have multiple warps operating in the same block.
    unsigned const warp_size = warpSize;
    size_t const global_thread_index = static_cast<unsigned>(blockIdx.x * blockDim.x + threadIdx.x);
    size_t const global_warp_index = static_cast<unsigned>(global_thread_index / warp_size);
    size_t const warps_per_block = static_cast<unsigned>(blockDim.x / warp_size);
    size_t const warps_per_device = static_cast<unsigned>(gridDim.x * warps_per_block);
    unsigned const thread_in_warp_index = static_cast<unsigned>(global_thread_index % warp_size);

    // Allocating shared memory is handled on the host side.
    extern __shared__ char shared_memory_for_block[];
    char *const shared_memory_for_warp =
        shared_memory_for_block + (global_warp_index % warps_per_block) * (shared_memory_size / warps_per_block);

    // Only one thread will be initializing the top row and left column and outputting the result.
    bool const is_main_thread = thread_in_warp_index == 0;

    // We are computing N edit distances for N pairs of strings. Not a cartesian product!
    // Each block/warp may end up receiving a different number of strings.
    for (size_t task_idx = global_warp_index; task_idx < tasks_count; task_idx += warps_per_device) {
        task_t &task = tasks[task_idx];
        char_t const *shorter_global = task.shorter_ptr;
        char_t const *longer_global = task.longer_ptr;
        size_t const shorter_length = task.shorter_length;
        size_t const longer_length = task.longer_length;
        auto &result_ref = task.result;

        // We are going to store 3 diagonals of the matrix, assuming each would fit into a single ZMM register.
        // The length of the longest (main) diagonal would be `shorter_dim = (shorter_length + 1)`.
        unsigned const shorter_dim = static_cast<unsigned>(shorter_length + 1);
        unsigned const longer_dim = static_cast<unsigned>(longer_length + 1);

        // Let's say we are dealing with 3 and 5 letter words.
        // The matrix will have size 4 x 6, parameterized as (shorter_dim x longer_dim).
        // It will have:
        // - 4 diagonals of increasing length, at positions: 0, 1, 2, 3.
        // - 2 diagonals of fixed length, at positions: 4, 5.
        // - 3 diagonals of decreasing length, at positions: 6, 7, 8.
        unsigned const diagonals_count = shorter_dim + longer_dim - 1;
        unsigned const max_diagonal_length = shorter_length + 1;
        unsigned const bytes_per_diagonal = round_up_to_multiple<unsigned>(max_diagonal_length * sizeof(score_t), 4);

        // The next few pointers will be swapped around.
        score_t *previous_scores = reinterpret_cast<score_t *>(shared_memory_for_warp);
        score_t *current_scores = reinterpret_cast<score_t *>(shared_memory_for_warp + bytes_per_diagonal);
        score_t *next_scores = reinterpret_cast<score_t *>(shared_memory_for_warp + 2 * bytes_per_diagonal);
        score_t *current_inserts = reinterpret_cast<score_t *>(shared_memory_for_warp + 3 * bytes_per_diagonal);
        score_t *next_inserts = reinterpret_cast<score_t *>(shared_memory_for_warp + 4 * bytes_per_diagonal);
        score_t *current_deletes = reinterpret_cast<score_t *>(shared_memory_for_warp + 5 * bytes_per_diagonal);
        score_t *next_deletes = reinterpret_cast<score_t *>(shared_memory_for_warp + 6 * bytes_per_diagonal);
        char_t *const longer = reinterpret_cast<char_t *>(shared_memory_for_warp + 7 * bytes_per_diagonal);
        char_t *const shorter = longer + longer_length;

        // Each thread in the warp will be loading it's own set of strided characters into shared memory.
        for (unsigned i = thread_in_warp_index; i < longer_length; i += warp_size) longer[i] = longer_global[i];
        for (unsigned i = thread_in_warp_index; i < shorter_length; i += warp_size) shorter[i] = shorter_global[i];

        // Initialize the first two diagonals:
        cuda_warp_scorer_t diagonal_aligner {substituter, gap_costs};
        if (is_main_thread) {
            diagonal_aligner.init_score(previous_scores[0], 0);
            diagonal_aligner.init_score(current_scores[0], 1);
            diagonal_aligner.init_score(current_scores[1], 1);
            diagonal_aligner.init_gap(current_inserts[0], 1);
            diagonal_aligner.init_gap(current_deletes[1], 1);
        }

        // Make sure the shared memory is fully loaded.
        __syncwarp();

        // We skip diagonals 0 and 1, as they are trivial.
        // We will start with diagonal 2, which has length 3, with the first and last elements being preset,
        // so we are effectively computing just one value, as will be marked by a single set bit in
        // the `next_diagonal_mask` on the very first iteration.
        unsigned next_diagonal_index = 2;

        // Progress through the upper-left triangle of the Levenshtein matrix.
        for (; next_diagonal_index < shorter_dim; ++next_diagonal_index) {

            unsigned const next_diagonal_length = next_diagonal_index + 1;
            diagonal_aligner(                         //
                shorter,                              // first sequence of characters
                longer,                               // second sequence of characters
                thread_in_warp_index, warp_size,      //
                next_diagonal_length - 2,             // number of elements to compute with the `diagonal_aligner`
                previous_scores,                      // costs pre substitution
                current_scores, current_scores + 1,   // costs pre insertion/deletion opening
                current_inserts, current_deletes + 1, // costs pre insertion/deletion extension
                next_scores + 1,                      // ! notice unaligned write destination
                next_inserts + 1, next_deletes + 1    // ! notice unaligned write destination
            );

            // Don't forget to populate the first row and the first column of the Levenshtein matrix.
            if (is_main_thread) {
                diagonal_aligner.init_score(next_scores[0], next_diagonal_index);
                diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index);
                diagonal_aligner.init_gap(next_inserts[0], next_diagonal_index);
                diagonal_aligner.init_gap(next_deletes[next_diagonal_length - 1], next_diagonal_index);
            }
            __syncwarp();

            // Perform a circular rotation of those buffers, to reuse the memory.
            rotate_three(previous_scores, current_scores, next_scores);
            trivial_swap(current_inserts, next_inserts);
            trivial_swap(current_deletes, next_deletes);
        }

        // Now let's handle the anti-diagonal band of the matrix, between the top and bottom-right triangles.
        for (; next_diagonal_index < longer_dim; ++next_diagonal_index) {

            unsigned const next_diagonal_length = shorter_dim;
            diagonal_aligner(                               //
                shorter,                                    // first sequence of characters
                longer + next_diagonal_index - shorter_dim, // second sequence of characters
                thread_in_warp_index, warp_size,            //
                next_diagonal_length - 1,                   // number of elements to compute with the `diagonal_aligner`
                previous_scores,                            // costs pre substitution
                current_scores, current_scores + 1,         // costs pre insertion/deletion opening
                current_inserts, current_deletes + 1,       // costs pre insertion/deletion extension
                next_scores,                                // updated similarity scores
                next_inserts, next_deletes                  // updated insertion/deletion extensions
            );

            // Don't forget to populate the first row of the Levenshtein matrix.
            if (is_main_thread) {
                diagonal_aligner.init_score(next_scores[next_diagonal_length - 1], next_diagonal_index);
                diagonal_aligner.init_gap(next_deletes[next_diagonal_length - 1], next_diagonal_index);
            }

            trivial_swap(current_inserts, next_inserts);
            trivial_swap(current_deletes, next_deletes);

            __syncwarp();
            // ! In the central anti-diagonal band, we can't just set the `current_scores + 1` to `previous_scores`
            // ! for the circular shift, as we will end up spilling outside of the diagonal a few iterations later.
            // ! Assuming in-place `memmove` is tricky on the GPU, so we will copy the data.
            for (size_t i = thread_in_warp_index; i + 1 < next_diagonal_length; i += warp_size)
                previous_scores[i] = current_scores[i + 1];
            __syncwarp();
            for (size_t i = thread_in_warp_index; i < next_diagonal_length; i += warp_size)
                current_scores[i] = next_scores[i];
            __syncwarp();
        }

        // Now let's handle the bottom-right triangle of the matrix.
        for (; next_diagonal_index < diagonals_count; ++next_diagonal_index) {

            unsigned const next_diagonal_length = diagonals_count - next_diagonal_index;
            diagonal_aligner(                               //
                shorter + next_diagonal_index - longer_dim, // first sequence of characters
                longer + next_diagonal_index - shorter_dim, // second sequence of characters
                thread_in_warp_index, warp_size,            //
                next_diagonal_length,                       // number of elements to compute with the `diagonal_aligner`
                previous_scores,                            // costs pre substitution
                current_scores, current_scores + 1,         // costs pre insertion/deletion opening
                current_inserts, current_deletes + 1,       // costs pre insertion/deletion extension
                next_scores,                                // updated similarity scores
                next_inserts, next_deletes                  // updated insertion/deletion extensions
            );

            // Perform a circular rotation of those buffers, to reuse the memory.
            rotate_three(previous_scores, current_scores, next_scores);
            trivial_swap(current_inserts, next_inserts);
            trivial_swap(current_deletes, next_deletes);

            // ! Drop the first entry among the current scores.
            // ! Assuming every next diagonal is shorter by one element,
            // ! we don't need a full-blown `sz_move` to shift the array by one element.
            previous_scores++;
            __syncwarp();
        }

        // Export one result per each block.
        if (is_main_thread) result_ref = diagonal_aligner.score();
    }
}

#pragma endregion

#pragma region - Levenshtein Distance in CUDA

/**
 *  @brief Wraps a single task for the CUDA-based @b byte-level "similarity" kernels.
 *  @note Used to allow sorting/grouping inputs to differentiate device-wide and warp-wide tasks.
 */
template <typename char_type_>
struct cuda_similarity_task_ {
    using char_t = char_type_;

    char_t const *shorter_ptr = nullptr;
    size_t shorter_length = 0;
    char_t const *longer_ptr = nullptr;
    size_t longer_length = 0;
    size_t memory_requirement = 0;
    size_t original_index = 0;
    size_t result = std::numeric_limits<size_t>::max();       // ? Signal that we are not done yet.
    bytes_per_cell_t bytes_per_cell = eight_bytes_per_cell_k; // ? Worst case, need the most memory per scalar.
    warp_tasks_density_t density = warps_working_together_k;  // ? Worst case, we are not using shared memory.

    constexpr cuda_similarity_task_() = default;
    constexpr cuda_similarity_task_(                  //
        char_t const *first_ptr, size_t first_length, //
        char_t const *second_ptr, size_t second_length) noexcept {
        if (first_length < second_length)
            shorter_ptr = first_ptr, shorter_length = first_length, longer_ptr = second_ptr,
            longer_length = second_length;
        else
            shorter_ptr = second_ptr, shorter_length = second_length, longer_ptr = first_ptr,
            longer_length = first_length;
    }

    constexpr size_t max_diagonal_length() const noexcept { return sz_max_of_two(shorter_length, longer_length) + 1; }
};

/**
 *  @brief  Dispatches baseline Levenshtein edit distance algorithm to the GPU.
 *          Before starting the kernels, bins them by size to maximize the number of blocks
 *          per grid that can run simultaneously, while fitting into the shared memory.
 */
template <typename char_type_, typename gap_costs_type_, typename allocator_type_, sz_capability_t capability_>
struct levenshtein_distances<char_type_, gap_costs_type_, allocator_type_, capability_,
                             std::enable_if_t<capability_ & sz_cap_cuda_k>> {

    using char_t = char_type_;
    using gap_costs_t = gap_costs_type_;
    using allocator_t = allocator_type_;
    using scores_allocator_t = typename std::allocator_traits<allocator_t>::template rebind_alloc<size_t>;
    static constexpr sz_capability_t capability_k = capability_;

    using task_t = cuda_similarity_task_<char_t>;
    using tasks_allocator_t = typename std::allocator_traits<allocator_t>::template rebind_alloc<task_t>;

    uniform_substitution_costs_t substituter_ {};
    gap_costs_t gap_costs_ {};
    allocator_t alloc_ {};

    levenshtein_distances(uniform_substitution_costs_t subs = {}, gap_costs_t gaps = {},
                          allocator_t const &alloc = {}) noexcept
        : substituter_(subs), gap_costs_(gaps), alloc_(alloc) {}

    template <typename first_strings_type_, typename second_strings_type_, typename results_type_>
#if SZ_HAS_CONCEPTS_
        requires indexed_results_like<results_type_>
#endif
    cuda_status_t operator()(                                                                 //
        first_strings_type_ const &first_strings, second_strings_type_ const &second_strings, //
        results_type_ &&results,                                                              //
        cuda_executor_t executor = {}, gpu_specs_t specs = {}) const noexcept {

        constexpr bool is_affine_k = is_same_type<gap_costs_t, affine_gap_costs_t>::value;
        constexpr size_t count_diagonals_k = is_affine_k ? 7 : 3;

        // Preallocate the events for GPU timing.
        cudaEvent_t start_event, stop_event;
        cudaEventCreate(&start_event, cudaEventBlockingSync);
        cudaEventCreate(&stop_event, cudaEventBlockingSync);

        using final_score_t = typename indexed_results_type<results_type_>::type;
        safe_vector<task_t, tasks_allocator_t> tasks(alloc_);
        if (tasks.try_resize(first_strings.size()) == status_t::bad_alloc_k) return {status_t::bad_alloc_k};

        // Record the start event
        cudaError_t start_event_error = cudaEventRecord(start_event, executor.stream());
        if (start_event_error != cudaSuccess) return {status_t::unknown_k, start_event_error};

        // Export all the tasks and sort them by decreasing memory requirement.
        size_t count_empty_tasks = 0;
        using similarity_memory_requirements_t = similarity_memory_requirements<size_t, false>;
        for (size_t i = 0; i < first_strings.size(); ++i) {
            // Ensure inputs are device-accessible (Unified/Device memory)
            if (!is_device_accessible_memory((void const *)first_strings[i].data()) ||
                !is_device_accessible_memory((void const *)second_strings[i].data()))
                return {status_t::device_memory_mismatch_k, cudaSuccess};
            task_t task(                                            //
                first_strings[i].data(), first_strings[i].length(), //
                second_strings[i].data(), second_strings[i].length());
            similarity_memory_requirements_t requirement(                                  //
                task.shorter_length, task.longer_length,                                   //
                gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
                sizeof(char_t), 4);

            task.original_index = i;
            task.memory_requirement = requirement.total;
            task.bytes_per_cell = requirement.bytes_per_cell;
            task.density = warp_tasks_density(requirement.total, specs);
            if (task.density == infinite_warps_per_multiprocessor_k) {
                if constexpr (!is_affine_k) { task.result = task.longer_length * gap_costs_.open_or_extend; }
                else if (!task.longer_length) { task.result = 0; }
                else { task.result = (task.longer_length - 1) * gap_costs_.extend + gap_costs_.open; }
                count_empty_tasks++;
            }
            tasks[i] = task;
        }

        auto [device_level_tasks, warp_level_tasks, empty_tasks] =
            warp_tasks_grouping<task_t>({tasks.data(), tasks.size()}, specs);

        if (device_level_tasks.size()) {
            auto device_level_u16_kernel =
                is_affine_k //
                    ? (void *)&affine_score_across_cuda_device<char_t, u16_t, u16_t, final_score_t,
                                                               uniform_substitution_costs_t, sz_minimize_distance_k,
                                                               sz_similarity_global_k, capability_k>
                    : (void *)&linear_score_across_cuda_device_<char_t, u16_t, u16_t, final_score_t,
                                                                uniform_substitution_costs_t, sz_minimize_distance_k,
                                                                sz_similarity_global_k, capability_k>;
            auto device_level_u32_kernel =
                is_affine_k //
                    ? (void *)&affine_score_across_cuda_device<char_t, u32_t, u32_t, final_score_t,
                                                               uniform_substitution_costs_t, sz_minimize_distance_k,
                                                               sz_similarity_global_k, capability_k>
                    : (void *)&linear_score_across_cuda_device_<char_t, u32_t, u32_t, final_score_t,
                                                                uniform_substitution_costs_t, sz_minimize_distance_k,
                                                                sz_similarity_global_k, capability_k>;
            auto device_level_u64_kernel =
                is_affine_k //
                    ? (void *)&affine_score_across_cuda_device<char_t, u64_t, u64_t, final_score_t,
                                                               uniform_substitution_costs_t, sz_minimize_distance_k,
                                                               sz_similarity_global_k, capability_k>
                    : (void *)&linear_score_across_cuda_device_<char_t, u64_t, u64_t, final_score_t,
                                                                uniform_substitution_costs_t, sz_minimize_distance_k,
                                                                sz_similarity_global_k, capability_k>;
            void *device_level_kernel_args[8];

            // On very large inputs we can't fit the diagonals in shared memory, and use the global one.
            safe_vector<u64_t, scores_allocator_t> diagonals_u64_buffer(alloc_);
            task_t const &largest_task = device_level_tasks[0];
            sz_assert_(largest_task.max_diagonal_length() >= device_level_tasks.back().max_diagonal_length());
            if (diagonals_u64_buffer.try_resize(largest_task.max_diagonal_length() * count_diagonals_k) ==
                status_t::bad_alloc_k)
                return {status_t::bad_alloc_k};

            // Individually submit each task to the GPU.
            void *const diagonals_buffer_ptr = (void *)diagonals_u64_buffer.data();
            for (size_t i = 0; i < device_level_tasks.size(); ++i) {
                task_t const &task = device_level_tasks[i];
                device_level_kernel_args[0] = (void *)(&task.shorter_ptr);
                device_level_kernel_args[1] = (void *)(&task.shorter_length);
                device_level_kernel_args[2] = (void *)(&task.longer_ptr);
                device_level_kernel_args[3] = (void *)(&task.longer_length);
                device_level_kernel_args[4] = (void *)(&task.result);
                device_level_kernel_args[5] = (void *)(&diagonals_buffer_ptr);
                device_level_kernel_args[6] = (void *)(&substituter_);
                device_level_kernel_args[7] = (void *)(&gap_costs_);

                // Pick the smallest fitting type for the diagonals.
                void *device_level_kernel = reinterpret_cast<void *>(device_level_u16_kernel);
                if (task.bytes_per_cell >= sizeof(u32_t))
                    device_level_kernel = reinterpret_cast<void *>(device_level_u32_kernel);
                if (task.bytes_per_cell >= sizeof(u64_t))
                    device_level_kernel = reinterpret_cast<void *>(device_level_u64_kernel);

                // TODO: We can be wiser about the dimensions of this grid.
                unsigned const random_block_size = 128;
                unsigned const random_blocks_per_multiprocessor = 32;
                cudaError_t launch_error = cudaLaunchCooperativeKernel(                       //
                    reinterpret_cast<void *>(device_level_kernel),                            // Kernel function pointer
                    dim3(random_blocks_per_multiprocessor * specs.streaming_multiprocessors), // Grid dimensions
                    dim3(random_block_size),                                                  // Block dimensions
                    device_level_kernel_args, // Array of kernel argument pointers
                    0,                        // Shared memory per block (in bytes)
                    executor.stream());       // CUDA stream
                if (launch_error != cudaSuccess)
                    if (launch_error == cudaErrorMemoryAllocation) { return {status_t::bad_alloc_k, launch_error}; }
                    else { return {status_t::unknown_k, launch_error}; }
            }
        }

        // Now process remaining warp-level tasks, checking warp densities in reverse order.
        // From the highest possible number of warps per multiprocessor to the lowest.
        if (warp_level_tasks.size()) {
            auto warp_level_u8_kernel =
                is_affine_k
                    ? (void *)&affine_score_on_each_cuda_warp_<task_t, char_t, u8_t, u8_t, uniform_substitution_costs_t,
                                                               sz_minimize_distance_k, sz_similarity_global_k,
                                                               capability_k>
                    : (void *)&linear_score_on_each_cuda_warp_<task_t, char_t, u8_t, u8_t, uniform_substitution_costs_t,
                                                               sz_minimize_distance_k, sz_similarity_global_k,
                                                               capability_k>;
            auto warp_level_u16_kernel =
                is_affine_k
                    ? (void *)&affine_score_on_each_cuda_warp_<task_t, char_t, u16_t, u16_t,
                                                               uniform_substitution_costs_t, sz_minimize_distance_k,
                                                               sz_similarity_global_k, capability_k>
                    : (void *)&linear_score_on_each_cuda_warp_<task_t, char_t, u16_t, u16_t,
                                                               uniform_substitution_costs_t, sz_minimize_distance_k,
                                                               sz_similarity_global_k, capability_k>;
            void *warp_level_kernel_args[5];

            cuda_status_t result;
            auto const task_size_equality = [](task_t const &lhs, task_t const &rhs) {
                return lhs.bytes_per_cell == rhs.bytes_per_cell && lhs.density == rhs.density;
            };
            auto const task_group_callback = [&](task_t const *tasks_begin, task_t const *tasks_end) {
                // Check if we need to stop processing.
                if (result.status != status_t::success_k) return;

                // Make sure all tasks can be handled by the same kernel template.
                task_t const &first_task = *tasks_begin;
                sz_assert_(std::all_of(tasks_begin, tasks_end, [&](task_t const &task) {
                    return task.bytes_per_cell == first_task.bytes_per_cell && task.density == first_task.density;
                }));

                // Find the task in the batch that requires the most memory.
                task_t const &indicative_task =
                    *std::max_element(tasks_begin, tasks_end, [](task_t const &lhs, task_t const &rhs) {
                        return lhs.memory_requirement < rhs.memory_requirement;
                    });

                // Pick the smallest fitting type for the diagonals.
                void *warp_level_kernel = reinterpret_cast<void *>(warp_level_u8_kernel);
                if (indicative_task.bytes_per_cell >= sizeof(u16_t))
                    warp_level_kernel = reinterpret_cast<void *>(warp_level_u16_kernel);

                // Even if we can fit more warps per block we sometimes should not.
                auto const [optimal_density, speculative_factor] =
                    speculation_friendly_density(indicative_task.density);

                // Update the selected kernels properties.
                unsigned const shared_memory_per_block =
                    static_cast<unsigned>(indicative_task.memory_requirement * optimal_density);
                sz_assert_(shared_memory_per_block > 0);
                sz_assert_(shared_memory_per_block < specs.shared_memory_per_multiprocessor());
                cudaError_t attribute_error = cudaFuncSetAttribute(
                    warp_level_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_per_block);
                if (attribute_error != cudaSuccess) {
                    result = {status_t::unknown_k, attribute_error};
                    return;
                }

                size_t const count_tasks = tasks_end - tasks_begin;
                warp_level_kernel_args[0] = (void *)(&tasks_begin);
                warp_level_kernel_args[1] = (void *)(&count_tasks);
                warp_level_kernel_args[2] = (void *)(&substituter_);
                warp_level_kernel_args[3] = (void *)(&gap_costs_);
                warp_level_kernel_args[4] = (void *)(&shared_memory_per_block);

                // Warp-level algorithm clearly aligns with the warp size.
                unsigned const threads_per_block = static_cast<unsigned>(specs.warp_size * optimal_density);
                cudaError_t launch_error = cudaLaunchKernel(                    //
                    reinterpret_cast<void *>(warp_level_kernel),                // Kernel function pointer
                    dim3(specs.streaming_multiprocessors * speculative_factor), // Grid dimensions
                    dim3(threads_per_block),                                    // Block dimensions
                    warp_level_kernel_args,                                     // Array of kernel argument pointers
                    shared_memory_per_block,                                    // Shared memory per block (in bytes)
                    executor.stream());                                         // CUDA stream
                if (launch_error != cudaSuccess) {
                    result = {launch_error == cudaErrorMemoryAllocation ? status_t::bad_alloc_k : status_t::unknown_k,
                              launch_error};
                    return;
                }

                // Wait until everything completes, as on the next iteration we will update the properties again.
                cudaError_t execution_error = cudaStreamSynchronize(executor.stream());
                if (execution_error != cudaSuccess) {
                    result = {status_t::unknown_k, execution_error};
                    return;
                }
            };
            group_by(warp_level_tasks.begin(), warp_level_tasks.end(), task_size_equality, task_group_callback);
            if (result.status != status_t::success_k) return result;
        }

        // Calculate the duration:
        cudaError_t stop_event_error = cudaEventRecord(stop_event, executor.stream());
        if (stop_event_error != cudaSuccess) return {status_t::unknown_k, stop_event_error};
        float execution_milliseconds = 0;
        cudaEventElapsedTime(&execution_milliseconds, start_event, stop_event);

        // Now that everything went well, export the results back into the `results` array.
        for (size_t i = 0; i < tasks.size(); ++i) {
            task_t const &task = tasks[i];
            results[task.original_index] = task.result;
        }
        return {status_t::success_k, cudaSuccess, execution_milliseconds};
    }
};

#pragma endregion

#pragma region - Needleman Wunsch and Smith Waterman Scores in CUDA

/**
 *  @brief  Convenience buffer of the size matching the size of the CUDA constant memory,
 *          used to cheaper store and access the substitution costs for the characters.
 *  @see    CUDA constant memory docs: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#constant
 */
__constant__ char error_costs_in_cuda_constant_memory_[256 * 256];

struct error_costs_256x256_in_cuda_constant_memory_t {
    __host__ error_cost_t magnitude() const noexcept { return 0; }
    __forceinline__ __host__ __device__ error_cost_t operator()(char a, char b) const noexcept {
#if defined(__CUDA_ARCH__)
        return error_costs_in_cuda_constant_memory_[static_cast<u8_t>(a) * 256 + static_cast<u8_t>(b)];
#else
        sz_unused_(a && b);
        return 0;
#endif
    }
};

#if SZ_USE_HOPPER

/**
 *  @brief GPU adaptation of the `tile_scorer` - Maximizes Global or Local score with linear gap costs.
 *  @note Requires Hopper generation GPUs to handle 2x `i16` scores at a time.
 */
template <sz_similarity_locality_t locality_>
struct tile_scorer<char const *, char const *, sz_i16_t, error_costs_256x256_in_cuda_constant_memory_t,
                   linear_gap_costs_t, sz_maximize_score_k, locality_, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, sz_i16_t, error_costs_256x256_in_cuda_constant_memory_t,
                         linear_gap_costs_t, sz_maximize_score_k, locality_, sz_cap_cuda_k> {

    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_similarity_objective_t objective_k = sz_maximize_score_k;

    using tile_scorer<char const *, char const *, sz_i16_t, error_costs_256x256_in_cuda_constant_memory_t,
                      linear_gap_costs_t, sz_maximize_score_k, locality_,
                      sz_cap_cuda_k>::tile_scorer; // Make the constructors visible

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,              // ! Unlike CPU, uses `unsigned`
        sz_i16_t const *scores_pre_substitution, //
        sz_i16_t const *scores_pre_insertion,    //
        sz_i16_t const *scores_pre_deletion,     //
        sz_i16_t *scores_new) noexcept {

        error_costs_256x256_in_cuda_constant_memory_t substituter;
        sz_i16_t const gap_cost = this->gap_costs_.open_or_extend;
        sz_u32_vec_t gap_cost_vec;
        gap_cost_vec.i16s[0] = gap_cost_vec.i16s[1] = gap_cost;

        // The hardest part of this kernel is dealing with unaligned loads!
        // We want to minimize single-byte processing in favor of 2-byte SIMD loads and min/max operations.
        // Assuming we are reading consecutive values from a buffer, in every cycle, most likely, we will be
        // dealing with most values being unaligned!
        sz_u32_vec_t pre_substitution_vec, pre_insertion_vec, pre_deletion_vec;
        sz_u32_vec_t first_vec, second_vec;
        sz_u32_vec_t cost_of_substitution_vec, if_deletion_or_insertion_vec;
        sz_u32_vec_t cell_score_vec, final_score_vec;
        final_score_vec.i16s[0] = final_score_vec.i16s[1] = 0;

        // ! As we are processing 2 bytes per loop, and have at least 32 threads per block (32 * 2 = 64),
        // ! and deal with strings only under 64k bytes, this loop will fire at most 1K times per input
        for (unsigned i = tasks_offset * 2; i < tasks_count; i += tasks_step * 2) { // ! it's OK to spill beyond bounds
            pre_substitution_vec.i16s[0] = load_last_use_(scores_pre_substitution + i + 0);
            pre_substitution_vec.i16s[1] = load_last_use_(scores_pre_substitution + i + 1);
            pre_insertion_vec.i16s[0] = scores_pre_insertion[i + 0];
            pre_insertion_vec.i16s[1] = scores_pre_insertion[i + 1];
            pre_deletion_vec.i16s[0] = scores_pre_deletion[i + 0];
            pre_deletion_vec.i16s[1] = scores_pre_deletion[i + 1];
            first_vec.u16s[0] = load_immutable_(first_slice + tasks_count - i - 1);
            first_vec.u16s[1] = load_immutable_(first_slice + tasks_count - i - 2); // ! this may be OOB
            second_vec.u16s[0] = load_immutable_(second_slice + i + 0);
            second_vec.u16s[1] = load_immutable_(second_slice + i + 1); // ! this may be OOB, but padded

            cost_of_substitution_vec.i16s[0] = substituter(first_vec.u16s[0], second_vec.u16s[0]);
            cost_of_substitution_vec.i16s[1] = substituter(first_vec.u16s[1], second_vec.u16s[1]);
            if_deletion_or_insertion_vec.u32 =
                __vaddss2(__vmaxs2(pre_insertion_vec.u32, pre_deletion_vec.u32), gap_cost_vec.u32);

            // For local scoring we should use the ReLU variants of 3-way `max`.
            if constexpr (locality_k == sz_similarity_global_k) {
                cell_score_vec.u32 = __viaddmax_s16x2(pre_substitution_vec.u32, cost_of_substitution_vec.u32,
                                                      if_deletion_or_insertion_vec.u32);
                sz_unused_(final_score_vec);
            }
            else {
                cell_score_vec.u32 = __viaddmax_s16x2_relu(pre_substitution_vec.u32, cost_of_substitution_vec.u32,
                                                           if_deletion_or_insertion_vec.u32);
                // In the last iteration of the loop the second half-word contains noise,
                // so we have to discard it from affecting the final score.
                bool const is_tail = i + 1 == tasks_count;
                final_score_vec.i16s[0] = (std::max)(cell_score_vec.i16s[0], final_score_vec.i16s[0]);
                final_score_vec.i16s[1] = (std::max)(cell_score_vec.i16s[1 - is_tail], final_score_vec.i16s[1]);
            }

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i + 0] = cell_score_vec.i16s[0];
            scores_new[i + 1] = cell_score_vec.i16s[1];
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if constexpr (locality_k == sz_similarity_global_k) {
            if (tasks_offset == 0) this->final_score_ = scores_new[0];
        }
        else { // Or the best score for local alignment.
            this->final_score_ = __vimax3_s32(this->final_score_, final_score_vec.i16s[0], final_score_vec.i16s[1]);
            // On Hopper we can use specialized warp reductions for up-to 32-bit values:
            // this->final_score_ = pick_best_in_warp_<sz_maximize_score_k>(this->final_score_);
            this->final_score_ = __reduce_max_sync(0xFFFFFFFF, this->final_score_);
        }
    }
};

template <sz_similarity_locality_t locality_>
struct tile_scorer<char const *, char const *, sz_i32_t, error_costs_256x256_in_cuda_constant_memory_t,
                   linear_gap_costs_t, sz_maximize_score_k, locality_, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, sz_i32_t, error_costs_256x256_in_cuda_constant_memory_t,
                         linear_gap_costs_t, sz_maximize_score_k, locality_, sz_cap_cuda_k> {

    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_similarity_objective_t objective_k = sz_maximize_score_k;

    using tile_scorer<char const *, char const *, sz_i32_t, error_costs_256x256_in_cuda_constant_memory_t,
                      linear_gap_costs_t, sz_maximize_score_k, locality_,
                      sz_cap_cuda_k>::tile_scorer; // Make the constructors visible

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,              // ! Unlike CPU, uses `unsigned`
        sz_i32_t const *scores_pre_substitution, //
        sz_i32_t const *scores_pre_insertion,    //
        sz_i32_t const *scores_pre_deletion,     //
        sz_i32_t *scores_new) noexcept {

        // Make sure we are called for an anti-diagonal traversal order
        sz_assert_(scores_pre_insertion + 1 == scores_pre_deletion);
        error_costs_256x256_in_cuda_constant_memory_t substituter;
        sz_i32_t const gap_costs = this->gap_costs_.open_or_extend;
        sz_i32_t final_score = 0;

        for (unsigned i = tasks_offset; i < tasks_count; i += tasks_step) {
            sz_i32_t pre_substitution = load_last_use_(scores_pre_substitution + i);
            sz_i32_t pre_insertion = scores_pre_insertion[i];
            sz_i32_t pre_deletion = scores_pre_deletion[i];
            char first_char = load_immutable_(first_slice + tasks_count - i - 1);
            char second_char = load_immutable_(second_slice + i);

            error_cost_t cost_of_substitution = substituter(first_char, second_char);
            sz_i32_t if_deletion_or_insertion = (std::max)(pre_deletion, pre_insertion) + gap_costs;
            sz_i32_t cell_score;

            // For local scoring we should use the ReLU variants of 3-way `max`.
            if constexpr (locality_k == sz_similarity_global_k) {
                cell_score = __viaddmax_s32(pre_substitution, cost_of_substitution, if_deletion_or_insertion);
                sz_unused_(final_score);
            }
            else {
                cell_score = __viaddmax_s32_relu(pre_substitution, cost_of_substitution, if_deletion_or_insertion);
                final_score = (std::max)(cell_score, final_score);
            }

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i] = cell_score;
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if constexpr (locality_k == sz_similarity_global_k) {
            if (tasks_offset == 0) this->final_score_ = scores_new[0];
        }
        else { // Or the best score for local alignment.
            this->final_score_ = (std::max)(this->final_score_, final_score);
            // On Hopper we can use specialized warp reductions for up-to 32-bit values:
            // this->final_score_ = pick_best_in_warp_<sz_maximize_score_k>(this->final_score_);
            this->final_score_ = __reduce_max_sync(0xFFFFFFFF, this->final_score_);
        }
    }
};

/**
 *  @brief GPU adaptation of the `tile_scorer` - Maximizes Global or Local score with affine gap costs.
 *  @note Requires Hopper generation GPUs to handle 2x `i16` scores at a time.
 */
template <sz_similarity_locality_t locality_>
struct tile_scorer<char const *, char const *, sz_i16_t, error_costs_256x256_in_cuda_constant_memory_t,
                   affine_gap_costs_t, sz_maximize_score_k, locality_, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, sz_i16_t, error_costs_256x256_in_cuda_constant_memory_t,
                         affine_gap_costs_t, sz_maximize_score_k, locality_, sz_cap_cuda_k> {

    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_similarity_objective_t objective_k = sz_maximize_score_k;

    using tile_scorer<char const *, char const *, sz_i16_t, error_costs_256x256_in_cuda_constant_memory_t,
                      affine_gap_costs_t, sz_maximize_score_k, locality_,
                      sz_cap_cuda_k>::tile_scorer; // Make the constructors visible

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,                // ! Unlike CPU, uses `unsigned`
        sz_i16_t const *scores_pre_substitution,   //
        sz_i16_t const *scores_pre_insertion,      //
        sz_i16_t const *scores_pre_deletion,       //
        sz_i16_t const *scores_running_insertions, //
        sz_i16_t const *scores_running_deletions,  //
        sz_i16_t *scores_new,                      //
        sz_i16_t *scores_new_insertions,           //
        sz_i16_t *scores_new_deletions) noexcept {

        error_costs_256x256_in_cuda_constant_memory_t substituter;
        sz_i16_t const gap_open_cost = this->gap_costs_.open;
        sz_i16_t const gap_extend_cost = this->gap_costs_.extend;
        sz_u32_vec_t gap_open_cost_vec, gap_extend_cost_vec;
        gap_open_cost_vec.i16s[0] = gap_open_cost_vec.i16s[1] = gap_open_cost;
        gap_extend_cost_vec.i16s[0] = gap_extend_cost_vec.i16s[1] = gap_extend_cost;

        // The hardest part of this kernel is dealing with unaligned loads!
        // We want to minimize single-byte processing in favor of 2-byte SIMD loads and min/max operations.
        // Assuming we are reading consecutive values from a buffer, in every cycle, most likely, we will be
        // dealing with most values being unaligned!
        sz_u32_vec_t pre_substitution_vec, pre_insertion_opening_vec, pre_deletion_opening_vec;
        sz_u32_vec_t pre_insertion_expansion_vec, pre_deletion_expansion_vec;
        sz_u32_vec_t first_vec, second_vec;
        sz_u32_vec_t cost_of_substitution_vec, if_substitution_vec, if_insertion_vec, if_deletion_vec;
        sz_u32_vec_t cell_score_vec, final_score_vec;
        final_score_vec.i16s[0] = final_score_vec.i16s[1] = 0;

        // ! As we are processing 2 bytes per loop, and have at least 32 threads per block (32 * 2 = 64),
        // ! and deal with strings only under 64k bytes, this loop will fire at most 1K times per input
        for (unsigned i = tasks_offset * 2; i < tasks_count; i += tasks_step * 2) { // ! it's OK to spill beyond bounds
            pre_substitution_vec.i16s[0] = load_last_use_(scores_pre_substitution + i + 0);
            pre_substitution_vec.i16s[1] = load_last_use_(scores_pre_substitution + i + 1);
            pre_insertion_opening_vec.i16s[0] = scores_pre_insertion[i + 0];
            pre_insertion_opening_vec.i16s[1] = scores_pre_insertion[i + 1];
            pre_deletion_opening_vec.i16s[0] = scores_pre_deletion[i + 0];
            pre_deletion_opening_vec.i16s[1] = scores_pre_deletion[i + 1];
            pre_insertion_expansion_vec.i16s[0] = scores_running_insertions[i + 0];
            pre_insertion_expansion_vec.i16s[1] = scores_running_insertions[i + 1];
            pre_deletion_expansion_vec.i16s[0] = scores_running_deletions[i + 0];
            pre_deletion_expansion_vec.i16s[1] = scores_running_deletions[i + 1];
            first_vec.u16s[0] = load_immutable_(first_slice + tasks_count - i - 1);
            first_vec.u16s[1] = load_immutable_(first_slice + tasks_count - i - 2); // ! this may be OOB
            second_vec.u16s[0] = load_immutable_(second_slice + i + 0);
            second_vec.u16s[1] = load_immutable_(second_slice + i + 1); // ! this may be OOB, but padded

            cost_of_substitution_vec.i16s[0] = substituter(first_vec.u16s[0], second_vec.u16s[0]);
            cost_of_substitution_vec.i16s[1] = substituter(first_vec.u16s[1], second_vec.u16s[1]);
            if_substitution_vec.u32 = __vaddss2(pre_substitution_vec.u32, cost_of_substitution_vec.u32);
            if_insertion_vec.u32 = //
                __viaddmax_s16x2(pre_insertion_opening_vec.u32, gap_open_cost_vec.u32,
                                 __vaddss2(pre_insertion_expansion_vec.u32, gap_extend_cost_vec.u32));
            if_deletion_vec.u32 = //
                __viaddmax_s16x2(pre_deletion_opening_vec.u32, gap_open_cost_vec.u32,
                                 __vaddss2(pre_deletion_expansion_vec.u32, gap_extend_cost_vec.u32));

            // For local scoring we should use the ReLU variants of 3-way `max`.
            if constexpr (locality_k == sz_similarity_global_k) {
                cell_score_vec.u32 = __vimax3_s16x2(if_substitution_vec.u32, if_insertion_vec.u32, if_deletion_vec.u32);
                sz_unused_(final_score_vec);
            }
            else {
                cell_score_vec.u32 =
                    __vimax3_s16x2_relu(if_substitution_vec.u32, if_insertion_vec.u32, if_deletion_vec.u32);
                // In the last iteration of the loop the second half-word contains noise,
                // so we have to discard it from affecting the final score.
                bool const is_tail = i + 1 == tasks_count;
                final_score_vec.i16s[0] = (std::max)(cell_score_vec.i16s[0], final_score_vec.i16s[0]);
                final_score_vec.i16s[1] = (std::max)(cell_score_vec.i16s[1 - is_tail], final_score_vec.i16s[1]);
            }

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i + 0] = cell_score_vec.i16s[0];
            scores_new[i + 1] = cell_score_vec.i16s[1];
            scores_new_insertions[i + 0] = if_insertion_vec.i16s[0];
            scores_new_insertions[i + 1] = if_insertion_vec.i16s[1];
            scores_new_deletions[i + 0] = if_deletion_vec.i16s[0];
            scores_new_deletions[i + 1] = if_deletion_vec.i16s[1];
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if constexpr (locality_k == sz_similarity_global_k) {
            if (tasks_offset == 0) this->final_score_ = scores_new[0];
        }
        else { // Or the best score for local alignment.
            this->final_score_ = __vimax3_s32(this->final_score_, final_score_vec.i16s[0], final_score_vec.i16s[1]);
            // On Hopper we can use specialized warp reductions for up-to 32-bit values:
            // this->final_score_ = pick_best_in_warp_<sz_maximize_score_k>(this->final_score_);
            this->final_score_ = __reduce_max_sync(0xFFFFFFFF, this->final_score_);
        }
    }
};

template <sz_similarity_locality_t locality_>
struct tile_scorer<char const *, char const *, sz_i32_t, error_costs_256x256_in_cuda_constant_memory_t,
                   affine_gap_costs_t, sz_maximize_score_k, locality_, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, sz_i32_t, error_costs_256x256_in_cuda_constant_memory_t,
                         affine_gap_costs_t, sz_maximize_score_k, locality_, sz_cap_cuda_k> {

    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_similarity_objective_t objective_k = sz_maximize_score_k;

    using tile_scorer<char const *, char const *, sz_i32_t, error_costs_256x256_in_cuda_constant_memory_t,
                      affine_gap_costs_t, sz_maximize_score_k, locality_,
                      sz_cap_cuda_k>::tile_scorer; // Make the constructors visible

    __forceinline__ __device__ void operator()(            //
        char const *first_slice, char const *second_slice, //
        unsigned const tasks_offset, unsigned const tasks_step,
        unsigned const tasks_count,                // ! Unlike CPU, uses `unsigned`
        sz_i32_t const *scores_pre_substitution,   //
        sz_i32_t const *scores_pre_insertion,      //
        sz_i32_t const *scores_pre_deletion,       //
        sz_i32_t const *scores_running_insertions, //
        sz_i32_t const *scores_running_deletions,  //
        sz_i32_t *scores_new,                      //
        sz_i32_t *scores_new_insertions,           //
        sz_i32_t *scores_new_deletions) noexcept {

        // Make sure we are called for an anti-diagonal traversal order
        sz_assert_(scores_pre_insertion + 1 == scores_pre_deletion);
        sz_i32_t const gap_open_cost = this->gap_costs_.open;
        sz_i32_t const gap_extend_cost = this->gap_costs_.extend;
        error_costs_256x256_in_cuda_constant_memory_t substituter;
        sz_i32_t final_score = 0;

        for (unsigned i = tasks_offset; i < tasks_count; i += tasks_step) {
            sz_i32_t pre_substitution = load_last_use_(scores_pre_substitution + i);
            sz_i32_t pre_insertion_opening = scores_pre_insertion[i];
            sz_i32_t pre_deletion_opening = scores_pre_deletion[i];
            sz_i32_t pre_insertion_expansion = scores_running_insertions[i];
            sz_i32_t pre_deletion_expansion = scores_running_deletions[i];
            char first_char = load_immutable_(first_slice + tasks_count - i - 1);
            char second_char = load_immutable_(second_slice + i);

            error_cost_t cost_of_substitution = substituter(first_char, second_char);
            sz_i32_t if_substitution = pre_substitution + cost_of_substitution;
            sz_i32_t if_insertion =
                __viaddmax_s32(pre_insertion_opening, gap_open_cost, pre_insertion_expansion + gap_extend_cost);
            sz_i32_t if_deletion =
                __viaddmax_s32(pre_deletion_opening, gap_open_cost, pre_deletion_expansion + gap_extend_cost);
            sz_i32_t cell_score;

            // For local scoring we should use the ReLU variants of 3-way `max`.
            if constexpr (locality_k == sz_similarity_global_k) {
                cell_score = __vimax3_s32(if_substitution, if_insertion, if_deletion);
                sz_unused_(final_score);
            }
            else {
                cell_score = __vimax3_s32_relu(if_substitution, if_insertion, if_deletion);
                final_score = (std::max)(cell_score, final_score);
            }

            // When walking through the top-left triangle of the matrix, our output addresses are misaligned.
            scores_new[i] = cell_score;
            scores_new_insertions[i] = if_insertion;
            scores_new_deletions[i] = if_deletion;
        }

        // Extract the bottom-right corner of the matrix, which is the result of the global alignment.
        if constexpr (locality_k == sz_similarity_global_k) {
            if (tasks_offset == 0) this->final_score_ = scores_new[0];
        }
        else { // Or the best score for local alignment.
            this->final_score_ = (std::max)(this->final_score_, final_score);
            // On Hopper we can use specialized warp reductions for up-to 32-bit values:
            // this->final_score_ = pick_best_in_warp_<sz_maximize_score_k>(this->final_score_);
            this->final_score_ = __reduce_max_sync(0xFFFFFFFF, this->final_score_);
        }
    }
};

template <sz_similarity_locality_t locality_>
struct tile_scorer<char const *, char const *, sz_i64_t, error_costs_256x256_in_cuda_constant_memory_t,
                   linear_gap_costs_t, sz_maximize_score_k, locality_, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, sz_i64_t, error_costs_256x256_in_cuda_constant_memory_t,
                         linear_gap_costs_t, sz_maximize_score_k, locality_, sz_cap_cuda_k> {

    using tile_scorer<char const *, char const *, sz_i64_t, error_costs_256x256_in_cuda_constant_memory_t,
                      linear_gap_costs_t, sz_maximize_score_k, locality_,
                      sz_cap_cuda_k>::tile_scorer; // Make the constructors visible
};

template <sz_similarity_locality_t locality_>
struct tile_scorer<char const *, char const *, sz_i64_t, error_costs_256x256_in_cuda_constant_memory_t,
                   affine_gap_costs_t, sz_maximize_score_k, locality_, sz_caps_ckh_k>
    : public tile_scorer<char const *, char const *, sz_i64_t, error_costs_256x256_in_cuda_constant_memory_t,
                         affine_gap_costs_t, sz_maximize_score_k, locality_, sz_cap_cuda_k> {

    using tile_scorer<char const *, char const *, sz_i64_t, error_costs_256x256_in_cuda_constant_memory_t,
                      affine_gap_costs_t, sz_maximize_score_k, locality_,
                      sz_cap_cuda_k>::tile_scorer; // Make the constructors visible
};

#endif

/**
 *  @brief  Dispatches baseline NW or SW scoring algorithm to the GPU.
 *          Before starting the kernels, bins them by size to maximize the number of blocks
 *          per grid that can run simultaneously, while fitting into the shared memory.
 *          Unlike the Levenshtein distances, also places byte-level @b `error_costs_256x256_t`
 *          substitution costs into the CUDA @b constant memory, addressing it via
 *          @b `error_costs_256x256_in_cuda_constant_memory_t` struct.
 */
template <typename gap_costs_type_, typename allocator_type_, sz_similarity_locality_t locality_,
          sz_capability_t capability_>
struct cuda_nw_or_sw_byte_level_scores_ {

    using char_t = char;
    using substituter_t = error_costs_256x256_t;
    using gap_costs_t = gap_costs_type_;
    using allocator_t = allocator_type_;
    using scores_allocator_t = typename std::allocator_traits<allocator_t>::template rebind_alloc<size_t>;
    static constexpr sz_similarity_locality_t locality_k = locality_;
    static constexpr sz_capability_t capability_k = capability_;

    using task_t = cuda_similarity_task_<char_t>;
    using tasks_allocator_t = typename std::allocator_traits<allocator_t>::template rebind_alloc<task_t>;

    error_costs_256x256_t substituter_ {};
    gap_costs_t gap_costs_ {};
    allocator_t alloc_ {};

    cuda_nw_or_sw_byte_level_scores_(error_costs_256x256_t subs = {}, gap_costs_t gaps = {},
                                     allocator_t const &alloc = allocator_t {}) noexcept
        : substituter_(subs), gap_costs_(gaps), alloc_(alloc) {}

    template <typename first_strings_type_, typename second_strings_type_, typename results_type_>
#if SZ_HAS_CONCEPTS_
        requires indexed_results_like<results_type_>
#endif
    cuda_status_t operator()(                                                                 //
        first_strings_type_ const &first_strings, second_strings_type_ const &second_strings, //
        results_type_ &&results,                                                              //
        cuda_executor_t executor = {}, gpu_specs_t specs = {}) const noexcept {

        constexpr bool is_local_k = locality_k == sz_similarity_local_k;
        constexpr bool is_affine_k = is_same_type<gap_costs_t, affine_gap_costs_t>::value;
        constexpr size_t count_diagonals_k = is_affine_k ? 7 : 3;

        // Preallocate the events for GPU timing.
        cudaEvent_t start_event, stop_event;
        cudaEventCreate(&start_event, cudaEventBlockingSync);
        cudaEventCreate(&stop_event, cudaEventBlockingSync);

        using final_score_t = typename indexed_results_type<results_type_>::type;
        safe_vector<task_t, tasks_allocator_t> tasks(alloc_);
        if (tasks.try_resize(first_strings.size()) == status_t::bad_alloc_k) return {status_t::bad_alloc_k};

        // Record the start event
        cudaError_t start_event_error = cudaEventRecord(start_event, executor.stream());
        if (start_event_error != cudaSuccess) return {status_t::unknown_k, start_event_error};

        // Enqueue the transfer of the substituter to the constant memory:
        cudaError_t copy_error =
            cudaMemcpyToSymbolAsync(error_costs_in_cuda_constant_memory_, (void const *)&substituter_,
                                    sizeof(substituter_t), 0, cudaMemcpyHostToDevice, executor.stream());
        if (copy_error != cudaSuccess) return {status_t::unknown_k, copy_error};

        // Export all the tasks and sort them by decreasing memory requirement.
        size_t count_empty_tasks = 0;
        using similarity_memory_requirements_t = similarity_memory_requirements<size_t, true>;
        for (size_t i = 0; i < first_strings.size(); ++i) {
            // Ensure inputs are device-accessible (Unified/Device memory)
            if (!is_device_accessible_memory((void const *)first_strings[i].data()) ||
                !is_device_accessible_memory((void const *)second_strings[i].data()))
                return {status_t::device_memory_mismatch_k, cudaSuccess};
            task_t task(                                            //
                first_strings[i].data(), first_strings[i].length(), //
                second_strings[i].data(), second_strings[i].length());
            similarity_memory_requirements_t requirement(                                  //
                task.shorter_length, task.longer_length,                                   //
                gap_type<gap_costs_t>(), substituter_.magnitude(), gap_costs_.magnitude(), //
                sizeof(char_t), 4, two_bytes_per_cell_k);

            task.original_index = i;
            task.memory_requirement = requirement.total;
            task.bytes_per_cell = requirement.bytes_per_cell;
            task.density = warp_tasks_density(requirement.total, specs);
            if (task.density == infinite_warps_per_multiprocessor_k) {
                if constexpr (is_local_k) { task.result = 0; }
                else if constexpr (!is_affine_k) { task.result = task.longer_length * gap_costs_.open_or_extend; }
                else if (!task.longer_length) { task.result = 0; }
                else { task.result = (task.longer_length - 1) * gap_costs_.extend + gap_costs_.open; }
                count_empty_tasks++;
            }
            tasks[i] = task;
        }

        auto [device_level_tasks, warp_level_tasks, empty_tasks] =
            warp_tasks_grouping<task_t>({tasks.data(), tasks.size()}, specs);

        if (device_level_tasks.size()) {
            auto device_level_i32_kernel =
                is_affine_k //
                    ? (void *)&affine_score_across_cuda_device<char_t, u32_t, sz_i32_t, final_score_t,
                                                               error_costs_256x256_in_cuda_constant_memory_t,
                                                               sz_maximize_score_k, locality_k, capability_k>
                    : (void *)&linear_score_across_cuda_device_<char_t, u32_t, sz_i32_t, final_score_t,
                                                                error_costs_256x256_in_cuda_constant_memory_t,
                                                                sz_maximize_score_k, locality_k, capability_k>;
            auto device_level_i64_kernel =
                is_affine_k //
                    ? (void *)&affine_score_across_cuda_device<char_t, u64_t, sz_i64_t, final_score_t,
                                                               error_costs_256x256_in_cuda_constant_memory_t,
                                                               sz_maximize_score_k, locality_k, capability_k>
                    : (void *)&linear_score_across_cuda_device_<char_t, u64_t, sz_i64_t, final_score_t,
                                                                error_costs_256x256_in_cuda_constant_memory_t,
                                                                sz_maximize_score_k, locality_k, capability_k>;
            void *device_level_kernel_args[8];

            // On very large inputs we can't fit the diagonals in shared memory, and use the global one.
            safe_vector<u64_t, scores_allocator_t> diagonals_u64_buffer(alloc_);
            task_t const &largest_task = device_level_tasks[0];
            sz_assert_(largest_task.max_diagonal_length() >= device_level_tasks.back().max_diagonal_length());
            if (diagonals_u64_buffer.try_resize(largest_task.max_diagonal_length() * count_diagonals_k) ==
                status_t::bad_alloc_k)
                return {status_t::bad_alloc_k};

            // Individually submit each task to the GPU.
            void *const diagonals_buffer_ptr = (void *)diagonals_u64_buffer.data();
            for (size_t i = 0; i < device_level_tasks.size(); ++i) {
                task_t const &task = device_level_tasks[i];
                device_level_kernel_args[0] = (void *)(&task.shorter_ptr);
                device_level_kernel_args[1] = (void *)(&task.shorter_length);
                device_level_kernel_args[2] = (void *)(&task.longer_ptr);
                device_level_kernel_args[3] = (void *)(&task.longer_length);
                device_level_kernel_args[4] = (void *)(&task.result);
                device_level_kernel_args[5] = (void *)(&diagonals_buffer_ptr);
                device_level_kernel_args[6] = (void *)(&substituter_);
                device_level_kernel_args[7] = (void *)(&gap_costs_);

                // Pick the smallest fitting type for the diagonals.
                void *device_level_kernel = reinterpret_cast<void *>(device_level_i32_kernel);
                if (task.bytes_per_cell >= sizeof(sz_i64_t))
                    device_level_kernel = reinterpret_cast<void *>(device_level_i64_kernel);

                // TODO: We can be wiser about the dimensions of this grid.
                unsigned const random_block_size = 128;
                unsigned const random_blocks_per_multiprocessor = 32;
                cudaError_t launch_error = cudaLaunchCooperativeKernel(                       //
                    reinterpret_cast<void *>(device_level_kernel),                            // Kernel function pointer
                    dim3(random_blocks_per_multiprocessor * specs.streaming_multiprocessors), // Grid dimensions
                    dim3(random_block_size),                                                  // Block dimensions
                    device_level_kernel_args, // Array of kernel argument pointers
                    0,                        // Shared memory per block (in bytes)
                    executor.stream());       // CUDA stream
                if (launch_error != cudaSuccess)
                    if (launch_error == cudaErrorMemoryAllocation) { return {status_t::bad_alloc_k, launch_error}; }
                    else { return {status_t::unknown_k, launch_error}; }
            }
        }

        // Now process remaining warp-level tasks, checking warp densities in reverse order.
        // From the highest possible number of warps per multiprocessor to the lowest.
        if (warp_level_tasks.size()) {
            auto warp_level_i16_kernel =
                is_affine_k ? (void *)&affine_score_on_each_cuda_warp_<task_t, char_t, u16_t, sz_i16_t,
                                                                       error_costs_256x256_in_cuda_constant_memory_t,
                                                                       sz_maximize_score_k, locality_k, capability_k>
                            : (void *)&linear_score_on_each_cuda_warp_<task_t, char_t, u16_t, sz_i16_t,
                                                                       error_costs_256x256_in_cuda_constant_memory_t,
                                                                       sz_maximize_score_k, locality_k, capability_k>;
            auto warp_level_i32_kernel =
                is_affine_k ? (void *)&affine_score_on_each_cuda_warp_<task_t, char_t, u32_t, sz_i32_t,
                                                                       error_costs_256x256_in_cuda_constant_memory_t,
                                                                       sz_maximize_score_k, locality_k, capability_k>
                            : (void *)&linear_score_on_each_cuda_warp_<task_t, char_t, u32_t, sz_i32_t,
                                                                       error_costs_256x256_in_cuda_constant_memory_t,
                                                                       sz_maximize_score_k, locality_k, capability_k>;
            void *warp_level_kernel_args[5];

            cuda_status_t result;
            auto const task_size_equality = [](task_t const &lhs, task_t const &rhs) {
                return lhs.bytes_per_cell == rhs.bytes_per_cell && lhs.density == rhs.density;
            };
            auto const task_group_callback = [&](task_t const *tasks_begin, task_t const *tasks_end) {
                // Check if we need to stop processing.
                if (result.status != status_t::success_k) return;

                // Make sure all tasks can be handled by the same kernel template.
                task_t const &first_task = *tasks_begin;
                sz_assert_(std::all_of(tasks_begin, tasks_end, [&](task_t const &task) {
                    return task.bytes_per_cell == first_task.bytes_per_cell && task.density == first_task.density;
                }));

                // Find the task in the batch that requires the most memory.
                task_t const &indicative_task =
                    *std::max_element(tasks_begin, tasks_end, [](task_t const &lhs, task_t const &rhs) {
                        return lhs.memory_requirement < rhs.memory_requirement;
                    });

                // Pick the smallest fitting type for the diagonals.
                void *warp_level_kernel = reinterpret_cast<void *>(warp_level_i16_kernel);
                if (indicative_task.bytes_per_cell >= sizeof(sz_i32_t))
                    warp_level_kernel = reinterpret_cast<void *>(warp_level_i32_kernel);

                // Even if we can fit more warps per block we sometimes should not.
                auto const [optimal_density, speculative_factor] =
                    speculation_friendly_density(indicative_task.density);

                // Update the selected kernels properties.
                unsigned const shared_memory_per_block =
                    static_cast<unsigned>(indicative_task.memory_requirement * optimal_density);
                sz_assert_(shared_memory_per_block > 0);
                sz_assert_(shared_memory_per_block < specs.shared_memory_per_multiprocessor());
                cudaError_t attribute_error = cudaFuncSetAttribute(
                    warp_level_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_per_block);
                if (attribute_error != cudaSuccess) {
                    result = {status_t::unknown_k, attribute_error};
                    return;
                }

                size_t const count_tasks = tasks_end - tasks_begin;
                warp_level_kernel_args[0] = (void *)(&tasks_begin);
                warp_level_kernel_args[1] = (void *)(&count_tasks);
                warp_level_kernel_args[2] = (void *)(&substituter_);
                warp_level_kernel_args[3] = (void *)(&gap_costs_);
                warp_level_kernel_args[4] = (void *)(&shared_memory_per_block);

                // Warp-level algorithm clearly aligns with the warp size.
                unsigned const threads_per_block = static_cast<unsigned>(specs.warp_size * optimal_density);
                cudaError_t launch_error = cudaLaunchKernel(                    //
                    reinterpret_cast<void *>(warp_level_kernel),                // Kernel function pointer
                    dim3(specs.streaming_multiprocessors * speculative_factor), // Grid dimensions
                    dim3(threads_per_block),                                    // Block dimensions
                    warp_level_kernel_args,                                     // Array of kernel argument pointers
                    shared_memory_per_block,                                    // Shared memory per block (in bytes)
                    executor.stream());                                         // CUDA stream
                if (launch_error != cudaSuccess) {
                    result = {launch_error == cudaErrorMemoryAllocation ? status_t::bad_alloc_k : status_t::unknown_k,
                              launch_error};
                    return;
                }

                // Wait until everything completes, as on the next iteration we will update the properties again.
                cudaError_t execution_error = cudaStreamSynchronize(executor.stream());
                if (execution_error != cudaSuccess) {
                    result = {status_t::unknown_k, execution_error};
                    return;
                }
            };
            group_by(warp_level_tasks.begin(), warp_level_tasks.end(), task_size_equality, task_group_callback);
            if (result.status != status_t::success_k) return result;
        }

        // Calculate the duration:
        cudaError_t stop_event_error = cudaEventRecord(stop_event, executor.stream());
        if (stop_event_error != cudaSuccess) return {status_t::unknown_k, stop_event_error};
        float execution_milliseconds = 0;
        cudaEventElapsedTime(&execution_milliseconds, start_event, stop_event);

        // Now that everything went well, export the results back into the `results` array.
        for (size_t task_index = 0; task_index < tasks.size(); ++task_index) {
            task_t const &task = tasks[task_index];
            results[task.original_index] = task.result;
        }
        return {status_t::success_k, cudaSuccess, execution_milliseconds};
    }
};

/** @brief Dispatches baseline Needleman Wunsch algorithm to the GPU. */
template <typename gap_costs_type_, typename allocator_type_, sz_capability_t capability_>
struct needleman_wunsch_scores<char, error_costs_256x256_t, gap_costs_type_, allocator_type_, capability_,
                               std::enable_if_t<capability_ & sz_cap_cuda_k>>
    : public cuda_nw_or_sw_byte_level_scores_<gap_costs_type_, allocator_type_, sz_similarity_global_k, capability_> {

    using cuda_nw_or_sw_byte_level_scores_<gap_costs_type_, allocator_type_, sz_similarity_global_k,
                                           capability_>::cuda_nw_or_sw_byte_level_scores_;
};

/** @brief Dispatches baseline Smith Waterman algorithm to the GPU. */
template <typename gap_costs_type_, typename allocator_type_, sz_capability_t capability_>
struct smith_waterman_scores<char, error_costs_256x256_t, gap_costs_type_, allocator_type_, capability_,
                             std::enable_if_t<capability_ & sz_cap_cuda_k>>
    : public cuda_nw_or_sw_byte_level_scores_<gap_costs_type_, allocator_type_, sz_similarity_local_k, capability_> {

    using cuda_nw_or_sw_byte_level_scores_<gap_costs_type_, allocator_type_, sz_similarity_local_k,
                                           capability_>::cuda_nw_or_sw_byte_level_scores_;
};

#pragma endregion

} // namespace stringzillas
} // namespace ashvardanian

#endif // STRINGZILLAS_SIMILARITIES_CUH_