#include "dashem.h"
#include <string.h>
#include <stdio.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
/* Branch-prediction hints. On GCC/Clang they expand to __builtin_expect with
 * the condition normalised to 0/1; on other compilers they are plain
 * pass-throughs of the expression. */
#if defined(__GNUC__) || defined(__clang__)
#define LIKELY(x) __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
/* Compiler-specific spellings:
 *   DASHEM_UNUSED        - "unused" attribute (expands to nothing on MSVC)
 *   DASHEM_ALWAYS_INLINE - forced-inline function specifier
 *   DASHEM_POPCOUNT      - population count via __popcnt / __builtin_popcount
 *   DASHEM_POPCOUNTLL    - population count via __popcnt64 / __builtin_popcountll
 */
#if defined(_MSC_VER)
#define DASHEM_UNUSED
#define DASHEM_ALWAYS_INLINE __forceinline
#define DASHEM_POPCOUNT(x) __popcnt(x)
#define DASHEM_POPCOUNTLL(x) __popcnt64(x)
#else
#define DASHEM_UNUSED __attribute__((unused))
#define DASHEM_ALWAYS_INLINE __attribute__((always_inline)) inline
#define DASHEM_POPCOUNT(x) __builtin_popcount(x)
#define DASHEM_POPCOUNTLL(x) __builtin_popcountll(x)
#endif
/* DASHEM_STATIC_ASSERT(cond, msg): compile-time assertion.
 * With C11 or later it maps to _Static_assert. Otherwise it declares a typedef
 * of a char array whose size is -1 when cond is false, which makes the
 * declaration ill-formed; msg is not used in that fallback. The typedef name
 * is made unique per source line by pasting __LINE__ onto "dashem_sa_". */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
#define DASHEM_STATIC_ASSERT(cond, msg) _Static_assert((cond), msg)
#else
/* Two-level concatenation so that __LINE__ is expanded before pasting. */
#define DASHEM_CONCAT_IMPL(a, b) a ## b
#define DASHEM_CONCAT(a, b) DASHEM_CONCAT_IMPL(a, b)
#if defined(_MSC_VER)
#define DASHEM_STATIC_ASSERT(cond, msg) \
typedef char DASHEM_CONCAT(dashem_sa_, __LINE__)[(cond) ? 1 : -1]
#else
#define DASHEM_STATIC_ASSERT(cond, msg) \
typedef char DASHEM_CONCAT(dashem_sa_, __LINE__)[(cond) ? 1 : -1] __attribute__((unused))
#endif
#endif
/* The removal routines in this file compare against the literal bytes
 * 0xE2 0x80 0x94; these assertions check that the DASHEM_EM_DASH_BYTE*
 * constants have those same values. */
DASHEM_STATIC_ASSERT(DASHEM_EM_DASH_BYTE1 == 0xE2, "Em-dash byte 1 must be 0xE2");
DASHEM_STATIC_ASSERT(DASHEM_EM_DASH_BYTE2 == 0x80, "Em-dash byte 2 must be 0x80");
DASHEM_STATIC_ASSERT(DASHEM_EM_DASH_BYTE3 == 0x94, "Em-dash byte 3 must be 0x94");
/*
 * Count-trailing-zeros helpers.
 *
 *   dashem_ctz(v)   - index (0..31) of the lowest set bit of a 32-bit value
 *   dashem_ctzll(v) - index (0..63) of the lowest set bit of a 64-bit value
 *
 * Zero input: the MSVC and portable variants return the operand width
 * (32 / 64). The GCC/Clang variant maps straight onto __builtin_ctz /
 * __builtin_ctzll, whose result is undefined for 0, so portable callers must
 * still test the mask for zero before calling.
 */
#ifdef _MSC_VER
#include <intrin.h>
static inline int dashem_ctz(uint32_t v) {
    unsigned long r;
    /* _BitScanForward returns 0 and leaves r unspecified when no bit is set;
     * do not read r in that case. */
    if (!_BitScanForward(&r, v)) {
        return 32;
    }
    return (int)r;
}
static inline int dashem_ctzll(uint64_t v) {
    unsigned long r;
#if defined(_M_X64) || defined(_M_ARM64)
    if (!_BitScanForward64(&r, v)) {
        return 64;
    }
    return (int)r;
#else
    /* No 64-bit scan on 32-bit targets: scan the low half, then the high half. */
    if (_BitScanForward(&r, (uint32_t)v)) {
        return (int)r;
    }
    if (_BitScanForward(&r, (uint32_t)(v >> 32))) {
        return (int)r + 32;
    }
    return 64;
#endif
}
#elif defined(__GNUC__) || defined(__clang__)
#define dashem_ctz(x) __builtin_ctz(x)
#define dashem_ctzll(x) __builtin_ctzll(x)
#else
/* Portable fallback: binary search over the low bits. */
static inline int dashem_ctz(uint32_t v) {
    if (v == 0) return 32;
    int count = 0;
    if ((v & 0xFFFF) == 0) { count += 16; v >>= 16; }
    if ((v & 0xFF) == 0) { count += 8; v >>= 8; }
    if ((v & 0xF) == 0) { count += 4; v >>= 4; }
    if ((v & 0x3) == 0) { count += 2; v >>= 2; }
    if ((v & 0x1) == 0) { count += 1; }
    return count;
}
static inline int dashem_ctzll(uint64_t v) {
    if (v == 0) return 64;
    if ((uint32_t)v != 0) return dashem_ctz((uint32_t)v);
    return 32 + dashem_ctz((uint32_t)(v >> 32));
}
#endif
/*
 * __detect_cpu_features: query the running CPU and return a bitmask of
 * DASHEM_CPU_* flags. DASHEM_CPU_SCALAR is always set.
 *
 * x86 feature bits used (CPUID):
 *   leaf 1:            EDX[26] SSE2, ECX[20] SSE4.2, ECX[28] AVX
 *   leaf 7, subleaf 0: EBX[5] AVX2, EBX[8] BMI2, EBX[16] AVX-512F,
 *                      ECX[6] AVX-512 VBMI2
 *
 * NOTE(review): only the CPUID bits are checked; operating-system support for
 * the AVX / AVX-512 register state (OSXSAVE + XGETBV) is not verified here.
 */
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86))
#include <cpuid.h>
static uint32_t __detect_cpu_features(void) {
    uint32_t features = DASHEM_CPU_SCALAR;
    uint32_t eax, ebx, ecx, edx;
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        if (edx & (1U << 26)) features |= DASHEM_CPU_SSE2;
        /* SSE4.2 is ECX bit 20; bit 0 is SSE3. */
        if (ecx & (1U << 20)) features |= DASHEM_CPU_SSE42;
        if (ecx & (1U << 28)) features |= DASHEM_CPU_AVX;
    }
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        if (ebx & (1U << 5)) features |= DASHEM_CPU_AVX2;
        if (ebx & (1U << 8)) features |= DASHEM_CPU_BMI2;
        if (ebx & (1U << 16)) features |= DASHEM_CPU_AVX512F;
        if (ecx & (1U << 6)) features |= DASHEM_CPU_AVX512VBMI2;
    }
    return features;
}
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
#include <intrin.h>
static uint32_t __detect_cpu_features(void) {
    uint32_t features = DASHEM_CPU_SCALAR;
    int cpuid_info[4] = {0};
    /* Leaf 0 reports the highest supported basic leaf in EAX; querying a leaf
     * above it can return unrelated data, so gate each query on it. */
    __cpuid(cpuid_info, 0);
    const int max_leaf = cpuid_info[0];
    if (max_leaf >= 1) {
        __cpuid(cpuid_info, 1);
        if (cpuid_info[3] & (1U << 26)) features |= DASHEM_CPU_SSE2;
        /* SSE4.2 is ECX bit 20; bit 0 is SSE3. */
        if (cpuid_info[2] & (1U << 20)) features |= DASHEM_CPU_SSE42;
        if (cpuid_info[2] & (1U << 28)) features |= DASHEM_CPU_AVX;
    }
    if (max_leaf >= 7) {
        __cpuidex(cpuid_info, 7, 0);
        if (cpuid_info[1] & (1U << 5)) features |= DASHEM_CPU_AVX2;
        if (cpuid_info[1] & (1U << 8)) features |= DASHEM_CPU_BMI2;
        if (cpuid_info[1] & (1U << 16)) features |= DASHEM_CPU_AVX512F;
        if (cpuid_info[2] & (1U << 6)) features |= DASHEM_CPU_AVX512VBMI2;
    }
    return features;
}
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
/* NEON availability is decided at compile time on this path. */
static uint32_t __detect_cpu_features(void) {
    return DASHEM_CPU_SCALAR | DASHEM_CPU_NEON;
}
#else
/* Unknown platform: scalar only. */
static uint32_t __detect_cpu_features(void) {
    return DASHEM_CPU_SCALAR;
}
#endif
/* Cached feature bitmask and a flag recording whether detection has already
 * run; both are written by dashem_detect_cpu_features(). */
static uint32_t g_cpu_features = 0;
static int g_features_detected = 0;
/* Common signature of every em-dash removal implementation in this file:
 * (input, input_len) is the source buffer, (output, output_capacity) the
 * destination, and *output_len receives the number of bytes written.
 * The implementations visible here return 0 on success and -1 when
 * output_capacity < input_len. */
typedef int (*dashem_remove_fn)(
const char * restrict input,
size_t input_len,
char * restrict output,
size_t output_capacity,
size_t * restrict output_len
);
/* Pointer to the selected implementation; starts out NULL. The code that
 * assigns it is not visible in this part of the file. */
static dashem_remove_fn g_dashem_remove_impl = NULL;
/* Forward declarations of the implementations. Each SIMD variant is only
 * declared when the matching target macro is defined by the compiler. */
static int dashem_remove_scalar(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
);
#if defined(__AVX2__)
static int dashem_remove_avx2(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
);
static int dashem_remove_avx2_unrolled(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
);
#endif
#if defined(__AVX512F__)
static int dashem_remove_avx512(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
);
#endif
#if defined(__SSE4_2__)
static int dashem_remove_sse42(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
);
#endif
#if defined(__BMI2__)
static int dashem_remove_bmi2(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
);
#endif
#if defined(__ARM_NEON)
static int dashem_remove_neon(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
);
#endif
/*
 * Return the DASHEM_CPU_* feature bitmask for the running CPU.
 * The probe runs on the first call only; later calls return the cached value.
 */
uint32_t dashem_detect_cpu_features(void) {
    if (g_features_detected) {
        return g_cpu_features;
    }
    /* First call: probe once and remember the answer. */
    g_cpu_features = __detect_cpu_features();
    g_features_detected = 1;
    return g_cpu_features;
}
/* Additional forward declarations needed by dashem_init_impl() and by the
 * AVX2 variants defined further down. dashem_remove_avx2 and
 * dashem_remove_bmi2 are also declared earlier in the file; repeating a
 * compatible declaration is harmless. */
#if defined(__AVX512VBMI2__) && defined(__AVX512BW__)
static int dashem_remove_avx512_compress(const char*, size_t, char*, size_t, size_t*);
#endif
#if defined(__AVX2__)
static int dashem_remove_avx2(const char*, size_t, char*, size_t, size_t*);
static int dashem_remove_avx2_twopass(const char*, size_t, char*, size_t, size_t*);
static int dashem_remove_avx2_pshufb(const char*, size_t, char*, size_t, size_t*);
#endif
#if defined(__BMI2__)
static int dashem_remove_bmi2(const char*, size_t, char*, size_t, size_t*);
#endif
/*
 * Choose the removal routine for this build and this CPU.
 *
 * Priority, highest first: AVX-512 compress, AVX-512F, AVX2, BMI2, SSE4.2,
 * NEON, scalar. A candidate is considered only when it was compiled in
 * (target macro defined) and the runtime feature bit is set.
 *
 * The candidates are visited from lowest to highest priority and each
 * eligible one overrides the previous pick, so the last assignment that
 * fires is the highest-priority eligible routine.
 */
static dashem_remove_fn dashem_init_impl(void) {
    const uint32_t cpu = dashem_detect_cpu_features();
    dashem_remove_fn chosen = dashem_remove_scalar;
#if defined(__ARM_NEON)
    if (cpu & DASHEM_CPU_NEON) {
        chosen = dashem_remove_neon;
    }
#endif
#if defined(__SSE4_2__)
    if (cpu & DASHEM_CPU_SSE42) {
        chosen = dashem_remove_sse42;
    }
#endif
#if defined(__BMI2__)
    if (cpu & DASHEM_CPU_BMI2) {
        chosen = dashem_remove_bmi2;
    }
#endif
#if defined(__AVX2__)
    if (cpu & DASHEM_CPU_AVX2) {
        chosen = dashem_remove_avx2;
    }
#endif
#if defined(__AVX512F__)
    if (cpu & DASHEM_CPU_AVX512F) {
        chosen = dashem_remove_avx512;
    }
#endif
#if defined(__AVX512VBMI2__) && defined(__AVX512BW__)
    if (cpu & DASHEM_CPU_AVX512VBMI2) {
        chosen = dashem_remove_avx512_compress;
    }
#endif
    return chosen;
}
/*
 * Portable em-dash remover (no SIMD).
 *
 * Copies input to output, dropping every UTF-8 em-dash (bytes E2 80 94).
 *
 * input, input_len       source bytes (not required to be NUL-terminated)
 * output                 destination buffer; must not overlap input
 * output_capacity        size of output; must be >= input_len
 * output_len             receives the number of bytes written
 *
 * Returns 0 on success, -1 if output_capacity < input_len (nothing written).
 *
 * The scan uses memchr to jump to the next 0xE2 lead byte, so it does not
 * depend on the host byte order. The earlier word-at-a-time scan derived a
 * memory offset from the lowest set bit of a 64-bit load, which is only
 * correct on little-endian machines.
 */
static int dashem_remove_scalar(
    const char *input,
    size_t input_len,
    char *output,
    size_t output_capacity,
    size_t *output_len
) {
    if (output_capacity < input_len) {
        return -1;
    }
    const unsigned char *in_ptr = (const unsigned char *)input;
    unsigned char *out_ptr = (unsigned char *)output;
    size_t out_idx = 0;
    size_t i = 0;

    while (i < input_len) {
        const size_t left = input_len - i;
        const unsigned char *lead = memchr(in_ptr + i, 0xE2, left);
        /* Bytes before the next 0xE2 (or all remaining bytes) are copied as-is. */
        const size_t run = (lead != NULL) ? (size_t)(lead - (in_ptr + i)) : left;
        if (run > 0) {
            memcpy(out_ptr + out_idx, in_ptr + i, run);
            out_idx += run;
            i += run;
        }
        if (lead == NULL) {
            break;
        }
        /* in_ptr[i] is 0xE2 here: drop a full em-dash, keep anything else. */
        if (input_len - i >= 3 &&
            in_ptr[i + 1] == 0x80 &&
            in_ptr[i + 2] == 0x94) {
            i += 3;
        } else {
            out_ptr[out_idx++] = in_ptr[i++];
        }
    }
    *output_len = out_idx;
    return 0;
}
/*
 * Byte-at-a-time em-dash remover.
 *
 * Copies input to output, skipping every E2 80 94 sequence.
 * Returns -1 (writing nothing) when output_capacity < input_len; otherwise
 * stores the produced length in *output_len and returns 0.
 */
static inline int dashem_remove_fast_small(
    const char *input,
    size_t input_len,
    char *output,
    size_t output_capacity,
    size_t *output_len
) {
    if (output_capacity < input_len) {
        return -1;
    }
    const unsigned char *src = (const unsigned char *)input;
    unsigned char *dst = (unsigned char *)output;
    size_t rd = 0;
    size_t wr = 0;
    while (rd < input_len) {
        /* An em-dash needs three bytes; check the room before peeking ahead. */
        if (rd + 3 <= input_len &&
            src[rd] == 0xE2 &&
            src[rd + 1] == 0x80 &&
            src[rd + 2] == 0x94) {
            rd += 3;
            continue;
        }
        dst[wr++] = src[rd++];
    }
    *output_len = wr;
    return 0;
}
#if defined(__AVX2__)
#include <immintrin.h>
/*
 * Two-pass AVX2 em-dash remover.
 *
 * Pass 1 counts em-dashes (E2 80 94). If there are none, the input is copied
 * to the output with a single memcpy. Otherwise pass 2 walks the input again
 * and copies everything except the em-dash bytes.
 *
 * Returns -1 when output_capacity < input_len; otherwise writes the result
 * length to *output_len and returns 0.
 */
static int dashem_remove_avx2_twopass(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
) {
if (output_capacity < input_len) {
return -1;
}
const unsigned char *in_ptr = (const unsigned char *)input;
unsigned char *out_ptr = (unsigned char *)output;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverflow"
const __m256i pattern_0xe2 = _mm256_set1_epi8((char)0xE2);
const __m256i pattern_0x80 = _mm256_set1_epi8((char)0x80);
const __m256i pattern_0x94 = _mm256_set1_epi8((char)0x94);
#pragma GCC diagnostic pop
size_t em_dash_count = 0;
size_t i = 0;
/* Pass 1. The three 32-byte loads below touch bytes i .. i+33, hence the
 * "i + 34" bound. */
while (i + 34 <= input_len) {
if (i + 256 < input_len) {
_mm_prefetch(input + i + 256, _MM_HINT_T1);
_mm_prefetch(input + i + 320, _MM_HINT_T1);
}
/* v1 and v2 are the same window shifted by one and two bytes, so bit k of
 * the combined mask is set when bytes i+k, i+k+1, i+k+2 are E2 80 94. */
__m256i v0 = _mm256_loadu_si256((__m256i *)(input + i));
__m256i v1 = _mm256_loadu_si256((__m256i *)(input + i + 1));
__m256i v2 = _mm256_loadu_si256((__m256i *)(input + i + 2));
__m256i cmp0 = _mm256_cmpeq_epi8(v0, pattern_0xe2);
__m256i cmp1 = _mm256_cmpeq_epi8(v1, pattern_0x80);
__m256i cmp2 = _mm256_cmpeq_epi8(v2, pattern_0x94);
__m256i full_match = _mm256_and_si256(cmp0, _mm256_and_si256(cmp1, cmp2));
uint32_t em_dash_mask = _mm256_movemask_epi8(full_match);
if (em_dash_mask != 0) {
/* Count one em-dash per set bit, clearing the bit and the two that follow
 * it (guarded so the shift count stays below 32). */
while (em_dash_mask != 0) {
int match_offset = dashem_ctz(em_dash_mask);
em_dash_count++;
em_dash_mask &= ~(1u << match_offset);
if (match_offset + 1 < 32) {
em_dash_mask &= ~(1u << (match_offset + 1));
}
if (match_offset + 2 < 32) {
em_dash_mask &= ~(1u << (match_offset + 2));
}
}
}
i += 32;
}
/* Scalar count over whatever the vector loop did not cover. */
while (i + 3 <= input_len) {
if (in_ptr[i] == 0xE2 && in_ptr[i + 1] == 0x80 && in_ptr[i + 2] == 0x94) {
em_dash_count++;
i += 3;
} else {
i++;
}
}
/* Nothing to remove: plain copy. */
if (em_dash_count == 0) {
memcpy(output, input, input_len);
*output_len = input_len;
return 0;
}
/* Pass 2: copy while skipping em-dashes. */
size_t out_idx = 0;
i = 0;
while (i + 34 <= input_len) {
if (i + 256 < input_len) {
_mm_prefetch(input + i + 256, _MM_HINT_T1);
_mm_prefetch(input + i + 320, _MM_HINT_T1);
}
if (out_idx + 128 < output_capacity) {
_mm_prefetch(output + out_idx + 128, _MM_HINT_T1);
}
__m256i v0 = _mm256_loadu_si256((__m256i *)(input + i));
__m256i v1 = _mm256_loadu_si256((__m256i *)(input + i + 1));
__m256i v2 = _mm256_loadu_si256((__m256i *)(input + i + 2));
__m256i cmp0 = _mm256_cmpeq_epi8(v0, pattern_0xe2);
__m256i cmp1 = _mm256_cmpeq_epi8(v1, pattern_0x80);
__m256i cmp2 = _mm256_cmpeq_epi8(v2, pattern_0x94);
__m256i full_match = _mm256_and_si256(cmp0, _mm256_and_si256(cmp1, cmp2));
uint32_t em_dash_mask = _mm256_movemask_epi8(full_match);
/* No em-dash starts in this window: store all 32 bytes at once. */
if (em_dash_mask == 0) {
_mm256_storeu_si256((__m256i *)(out_ptr + out_idx), v0);
out_idx += 32;
i += 32;
continue;
}
/* Otherwise walk the window byte by byte; bit 0 of em_dash_mask always
 * corresponds to position i because the mask is shifted in step with i.
 * An em-dash near the end of the window can leave i one or two bytes past
 * chunk_end; the outer loop simply continues from there. */
size_t chunk_end = i + 32;
while (i < chunk_end) {
if ((em_dash_mask & 1) && i + 3 <= input_len &&
in_ptr[i] == 0xE2 && in_ptr[i + 1] == 0x80 && in_ptr[i + 2] == 0x94) {
i += 3;
em_dash_mask >>= 3;
if (i >= chunk_end) break;
} else {
out_ptr[out_idx++] = in_ptr[i++];
em_dash_mask >>= 1;
}
}
}
/* Scalar tail for the remaining bytes. */
while (i < input_len) {
if (i + 3 <= input_len &&
in_ptr[i] == 0xE2 && in_ptr[i + 1] == 0x80 && in_ptr[i + 2] == 0x94) {
i += 3;
} else {
out_ptr[out_idx++] = in_ptr[i++];
}
}
*output_len = out_idx;
return 0;
}
/*
 * AVX2 em-dash remover that compacts each 32-byte window with two 16-byte
 * PSHUFB shuffles.
 *
 * For a window containing at least one em-dash (E2 80 94), the indices of the
 * bytes to keep are collected into two shuffle tables (low and high 128-bit
 * halves), each half is shuffled, and the kept prefix of each half is
 * appended to the output.
 *
 * Returns -1 when output_capacity < input_len; otherwise writes the result
 * length to *output_len and returns 0.
 *
 * An em-dash that starts at lane 30 or 31 extends one or two bytes into the
 * next window. The read position therefore advances by the number of bytes
 * actually consumed (32, 33 or 34), not by a fixed 32, so the continuation
 * bytes are not copied by the next iteration.
 */
static int dashem_remove_avx2_pshufb(
    const char *input,
    size_t input_len,
    char *output,
    size_t output_capacity,
    size_t *output_len
) {
    if (output_capacity < input_len) {
        return -1;
    }
    const unsigned char *in_ptr = (const unsigned char *)input;
    unsigned char *out_ptr = (unsigned char *)output;
    size_t out_idx = 0;
    size_t i = 0;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverflow"
    const __m256i pattern_0xe2 = _mm256_set1_epi8((char)0xE2);
    const __m256i pattern_0x80 = _mm256_set1_epi8((char)0x80);
    const __m256i pattern_0x94 = _mm256_set1_epi8((char)0x94);
#pragma GCC diagnostic pop

    /* The three loads touch bytes i .. i+33, hence the "i + 34" bound. */
    while (i + 34 <= input_len) {
        if (i + 256 < input_len) {
            _mm_prefetch(input + i + 256, _MM_HINT_T1);
            _mm_prefetch(input + i + 320, _MM_HINT_T1);
        }
        __m256i v0 = _mm256_loadu_si256((const __m256i *)(input + i));
        __m256i v1 = _mm256_loadu_si256((const __m256i *)(input + i + 1));
        __m256i v2 = _mm256_loadu_si256((const __m256i *)(input + i + 2));
        __m256i cmp0 = _mm256_cmpeq_epi8(v0, pattern_0xe2);
        __m256i cmp1 = _mm256_cmpeq_epi8(v1, pattern_0x80);
        __m256i cmp2 = _mm256_cmpeq_epi8(v2, pattern_0x94);
        __m256i full_match = _mm256_and_si256(cmp0, _mm256_and_si256(cmp1, cmp2));
        uint32_t em_dash_mask = (uint32_t)_mm256_movemask_epi8(full_match);
        if (em_dash_mask == 0) {
            _mm256_storeu_si256((__m256i *)(out_ptr + out_idx), v0);
            out_idx += 32;
            i += 32;
            continue;
        }

        /* 0x80 in a PSHUFB control byte yields zero; unused slots stay 0x80. */
        uint8_t shuffle_lo[16];
        uint8_t shuffle_hi[16];
        memset(shuffle_lo, 0x80, sizeof shuffle_lo);
        memset(shuffle_hi, 0x80, sizeof shuffle_hi);
        int count_lo = 0;
        int count_hi = 0;

        /* lane ends at 32, 33 or 34: the number of input bytes consumed. */
        int lane = 0;
        while (lane < 32) {
            if ((em_dash_mask & (1u << lane)) != 0) {
                lane += 3;              /* skip the whole em-dash */
            } else if (lane < 16) {
                shuffle_lo[count_lo++] = (uint8_t)lane;
                lane++;
            } else {
                shuffle_hi[count_hi++] = (uint8_t)(lane - 16);
                lane++;
            }
        }

        __m128i mask_lo = _mm_loadu_si128((const __m128i *)shuffle_lo);
        __m128i mask_hi = _mm_loadu_si128((const __m128i *)shuffle_hi);
        __m128i v0_lo = _mm256_castsi256_si128(v0);
        __m128i v0_hi = _mm256_extracti128_si256(v0, 1);
        __m128i compacted_lo = _mm_shuffle_epi8(v0_lo, mask_lo);
        __m128i compacted_hi = _mm_shuffle_epi8(v0_hi, mask_hi);
        if (count_lo > 0) {
            memcpy(out_ptr + out_idx, &compacted_lo, (size_t)count_lo);
            out_idx += (size_t)count_lo;
        }
        if (count_hi > 0) {
            memcpy(out_ptr + out_idx, &compacted_hi, (size_t)count_hi);
            out_idx += (size_t)count_hi;
        }
        i += (size_t)lane;
    }

    /* Scalar tail. */
    while (i < input_len) {
        if (i + 3 <= input_len &&
            in_ptr[i] == 0xE2 && in_ptr[i + 1] == 0x80 && in_ptr[i + 2] == 0x94) {
            i += 3;
        } else {
            out_ptr[out_idx++] = in_ptr[i++];
        }
    }
    *output_len = out_idx;
    return 0;
}
/*
 * AVX2 em-dash remover (the variant returned by dashem_init_impl for AVX2).
 *
 * Processes the input in 32-byte windows. Windows without a 0xE2 byte, or
 * without a full E2 80 94 sequence, are stored unchanged. Windows with one or
 * two em-dashes are copied as gaps around the matches; windows with more are
 * copied byte by byte from a keep-mask.
 *
 * Returns -1 when output_capacity < input_len; otherwise writes the result
 * length to *output_len and returns 0.
 */
static int dashem_remove_avx2(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
) {
if (output_capacity < input_len) {
return -1;
}
size_t out_idx = 0;
size_t i = 0;
const unsigned char *in_ptr = (const unsigned char *)input;
unsigned char *out_ptr = (unsigned char *)output;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverflow"
const __m256i pat_e2 = _mm256_set1_epi8((char)0xE2);
const __m256i pat_80 = _mm256_set1_epi8((char)0x80);
const __m256i pat_94 = _mm256_set1_epi8((char)0x94);
#pragma GCC diagnostic pop
/* The loads at i, i+1 and i+2 touch bytes i .. i+33, hence "i + 34". */
while (i + 34 <= input_len) {
__m256i v0 = _mm256_loadu_si256((const __m256i *)(input + i));
__m256i cmp0 = _mm256_cmpeq_epi8(v0, pat_e2);
uint32_t e2_mask = (uint32_t)_mm256_movemask_epi8(cmp0);
/* Fast path: no 0xE2 lead byte at all, so no em-dash can start here. */
if (LIKELY(e2_mask == 0)) {
_mm256_storeu_si256((__m256i *)(out_ptr + out_idx), v0);
out_idx += 32;
i += 32;
continue;
}
/* Only now load the shifted windows and test the two continuation bytes. */
__m256i v1 = _mm256_loadu_si256((const __m256i *)(input + i + 1));
__m256i v2 = _mm256_loadu_si256((const __m256i *)(input + i + 2));
__m256i cmp1 = _mm256_cmpeq_epi8(v1, pat_80);
__m256i cmp2 = _mm256_cmpeq_epi8(v2, pat_94);
uint32_t mask = (uint32_t)_mm256_movemask_epi8(
_mm256_and_si256(cmp0, _mm256_and_si256(cmp1, cmp2)));
/* 0xE2 bytes present but none starts an em-dash. */
if (mask == 0) {
_mm256_storeu_si256((__m256i *)(out_ptr + out_idx), v0);
out_idx += 32;
i += 32;
continue;
}
int match_count = DASHEM_POPCOUNT(mask);
if (LIKELY(match_count <= 2)) {
/* Few matches: memcpy the gaps between them. wp is the next input
 * position that still has to be copied. */
size_t wp = i;
while (mask != 0) {
int bit = dashem_ctz(mask);
size_t match_pos = i + bit;
if (match_pos > wp) {
size_t gap = match_pos - wp;
memcpy(out_ptr + out_idx, input + wp, gap);
out_idx += gap;
}
wp = match_pos + 3;
mask &= ~(1u << bit);
if (bit + 1 < 32) mask &= ~(1u << (bit + 1));
if (bit + 2 < 32) mask &= ~(1u << (bit + 2));
}
size_t chunk_end = i + 32;
if (wp <= chunk_end) {
if (wp < chunk_end) {
memcpy(out_ptr + out_idx, input + wp, chunk_end - wp);
out_idx += chunk_end - wp;
}
i = chunk_end;
} else {
/* The last em-dash ran past the window; resume right after it. */
i = wp;
}
} else {
/* Many matches: mark each match start and the two bytes after it for
 * removal, then copy the surviving bytes one at a time. */
uint32_t remove = mask | (mask << 1) | (mask << 2);
uint32_t keep = ~remove;
while (keep != 0) {
int pos = dashem_ctz(keep);
out_ptr[out_idx++] = in_ptr[i + pos];
keep &= keep - 1;
}
/* A match starting at lane 31 or 30 also consumes 2 or 1 bytes of the
 * next window. */
if (mask & 0x80000000u) {
i += 34;
} else if (mask & 0x40000000u) {
i += 33;
} else {
i += 32;
}
}
}
/* Scalar tail for the remaining bytes. */
while (i < input_len) {
if (i + 3 <= input_len &&
in_ptr[i] == 0xE2 &&
in_ptr[i + 1] == 0x80 &&
in_ptr[i + 2] == 0x94) {
i += 3;
} else {
out_ptr[out_idx++] = in_ptr[i++];
}
}
*output_len = out_idx;
return 0;
}
/*
 * AVX2 em-dash remover, unrolled to 64 bytes per iteration.
 *
 * The main loop scans two adjacent 32-byte windows and merges their match
 * masks into one 64-bit mask (bit k set when bytes i+k .. i+k+2 are
 * E2 80 94). A second loop handles a remaining 32-byte window, and a scalar
 * loop handles the tail.
 *
 * Returns -1 when output_capacity < input_len; otherwise writes the result
 * length to *output_len and returns 0.
 *
 * Notes on the match walk:
 *  - Matches are visited by clearing the lowest set bit (mask &= mask - 1),
 *    which avoids shifting by a count that can reach the operand width.
 *  - write_pos is the next input byte still to be copied. When the last
 *    em-dash of a window extends past the window end, write_pos ends up one
 *    or two bytes beyond it and the next iteration resumes from write_pos,
 *    so the continuation bytes are never copied.
 */
static int dashem_remove_avx2_unrolled(
    const char *input,
    size_t input_len,
    char *output,
    size_t output_capacity,
    size_t *output_len
) {
    if (output_capacity < input_len) {
        return -1;
    }
    size_t out_idx = 0;
    size_t i = 0;
    const unsigned char *in_ptr = (const unsigned char *)input;
    unsigned char *out_ptr = (unsigned char *)output;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverflow"
    const __m256i pattern_0xe2 = _mm256_set1_epi8((char)0xE2);
    const __m256i pattern_0x80 = _mm256_set1_epi8((char)0x80);
    const __m256i pattern_0x94 = _mm256_set1_epi8((char)0x94);
#pragma GCC diagnostic pop

    /* The widest load below starts at i+34 and covers 32 bytes (up to i+65). */
    while (i + 66 <= input_len) {
        if (i + 128 < input_len) {
            _mm_prefetch(input + i + 128, _MM_HINT_T0);
            _mm_prefetch(input + i + 160, _MM_HINT_T0);
        }
        __m256i v0_a = _mm256_loadu_si256((const __m256i *)(input + i));
        __m256i v1_a = _mm256_loadu_si256((const __m256i *)(input + i + 1));
        __m256i v2_a = _mm256_loadu_si256((const __m256i *)(input + i + 2));
        __m256i v0_b = _mm256_loadu_si256((const __m256i *)(input + i + 32));
        __m256i v1_b = _mm256_loadu_si256((const __m256i *)(input + i + 33));
        __m256i v2_b = _mm256_loadu_si256((const __m256i *)(input + i + 34));
        __m256i match_a = _mm256_and_si256(
            _mm256_cmpeq_epi8(v0_a, pattern_0xe2),
            _mm256_and_si256(_mm256_cmpeq_epi8(v1_a, pattern_0x80),
                             _mm256_cmpeq_epi8(v2_a, pattern_0x94)));
        __m256i match_b = _mm256_and_si256(
            _mm256_cmpeq_epi8(v0_b, pattern_0xe2),
            _mm256_and_si256(_mm256_cmpeq_epi8(v1_b, pattern_0x80),
                             _mm256_cmpeq_epi8(v2_b, pattern_0x94)));
        uint32_t mask_a = (uint32_t)_mm256_movemask_epi8(match_a);
        uint32_t mask_b = (uint32_t)_mm256_movemask_epi8(match_b);
        uint64_t mask = (uint64_t)mask_a | ((uint64_t)mask_b << 32);

        if (mask == 0) {
            memcpy(out_ptr + out_idx, input + i, 64);
            out_idx += 64;
            i += 64;
            continue;
        }

        const size_t chunk_end = i + 64;
        size_t write_pos = i;
        while (mask != 0) {
            size_t match_pos = i + (size_t)dashem_ctzll(mask);
            if (match_pos > write_pos) {
                size_t copy_len = match_pos - write_pos;
                memcpy(out_ptr + out_idx, input + write_pos, copy_len);
                out_idx += copy_len;
            }
            write_pos = match_pos + 3;
            mask &= mask - 1;
        }
        if (write_pos < chunk_end) {
            size_t remaining = chunk_end - write_pos;
            memcpy(out_ptr + out_idx, input + write_pos, remaining);
            out_idx += remaining;
            write_pos = chunk_end;
        }
        /* write_pos is chunk_end, or up to 2 bytes past it for a straddling
         * em-dash; still <= input_len because of the loop bound. */
        i = write_pos;
    }

    /* At most one more 32-byte window (loads touch bytes i .. i+33). */
    while (i + 34 <= input_len) {
        __m256i v0 = _mm256_loadu_si256((const __m256i *)(input + i));
        __m256i v1 = _mm256_loadu_si256((const __m256i *)(input + i + 1));
        __m256i v2 = _mm256_loadu_si256((const __m256i *)(input + i + 2));
        __m256i full_match = _mm256_and_si256(
            _mm256_cmpeq_epi8(v0, pattern_0xe2),
            _mm256_and_si256(_mm256_cmpeq_epi8(v1, pattern_0x80),
                             _mm256_cmpeq_epi8(v2, pattern_0x94)));
        uint32_t em_dash_mask = (uint32_t)_mm256_movemask_epi8(full_match);
        if (em_dash_mask == 0) {
            memcpy(out_ptr + out_idx, input + i, 32);
            out_idx += 32;
            i += 32;
            continue;
        }
        const size_t chunk_end = i + 32;
        size_t write_pos = i;
        while (em_dash_mask != 0) {
            size_t match_pos = i + (size_t)dashem_ctz(em_dash_mask);
            if (match_pos > write_pos) {
                size_t copy_len = match_pos - write_pos;
                memcpy(out_ptr + out_idx, input + write_pos, copy_len);
                out_idx += copy_len;
            }
            write_pos = match_pos + 3;
            em_dash_mask &= em_dash_mask - 1;
        }
        if (write_pos < chunk_end) {
            size_t remaining = chunk_end - write_pos;
            memcpy(out_ptr + out_idx, input + write_pos, remaining);
            out_idx += remaining;
            write_pos = chunk_end;
        }
        i = write_pos;
    }

    /* Scalar tail. */
    while (i < input_len) {
        if (i + 3 <= input_len &&
            in_ptr[i] == 0xE2 &&
            in_ptr[i + 1] == 0x80 &&
            in_ptr[i + 2] == 0x94) {
            i += 3;
        } else {
            out_ptr[out_idx++] = in_ptr[i++];
        }
    }
    *output_len = out_idx;
    return 0;
}
#endif
#if defined(__AVX512F__)
#include <immintrin.h>
/*
 * AVX-512 em-dash remover: 64-byte windows, then 32-byte AVX2 windows, then a
 * scalar tail.
 *
 * Returns -1 when output_capacity < input_len; otherwise writes the result
 * length to *output_len and returns 0.
 *
 * Bounds: each window is loaded three times, at i, i+1 and i+2, so a window
 * of width W reads bytes i .. i+W+1. The loop conditions therefore require
 * i + W + 2 <= input_len; shorter remainders fall through to the next stage.
 *
 * Match walk: matches are visited by clearing the lowest set bit, which
 * avoids shifting by a count that can reach the operand width. write_pos is
 * the next input byte still to be copied; if the last em-dash of a window
 * extends past the window end, the next iteration resumes from write_pos so
 * the continuation bytes are not copied.
 *
 * NOTE(review): _mm512_cmpeq_epu8_mask is an AVX-512BW intrinsic, while this
 * function is compiled under __AVX512F__ and dispatched on the AVX512F
 * feature bit only - confirm the intended build/dispatch requirements.
 */
static int dashem_remove_avx512(
    const char *input,
    size_t input_len,
    char *output,
    size_t output_capacity,
    size_t *output_len
) {
    if (output_capacity < input_len) {
        return -1;
    }
    size_t out_idx = 0;
    size_t i = 0;
    const unsigned char *in_ptr = (const unsigned char *)input;
    unsigned char *out_ptr = (unsigned char *)output;
    const __m512i pattern_0xe2 = _mm512_set1_epi8((char)0xE2);
    const __m512i pattern_0x80 = _mm512_set1_epi8((char)0x80);
    const __m512i pattern_0x94 = _mm512_set1_epi8((char)0x94);
    const __m256i pattern256_0xe2 = _mm256_set1_epi8((char)0xE2);
    const __m256i pattern256_0x80 = _mm256_set1_epi8((char)0x80);
    const __m256i pattern256_0x94 = _mm256_set1_epi8((char)0x94);

    while (i + 66 <= input_len) {
        if (i + 128 < input_len) {
            _mm_prefetch(input + i + 128, _MM_HINT_T0);
        }
        __m512i v0 = _mm512_loadu_si512((const void *)(input + i));
        __m512i v1 = _mm512_loadu_si512((const void *)(input + i + 1));
        __m512i v2 = _mm512_loadu_si512((const void *)(input + i + 2));
        __mmask64 cmp0 = _mm512_cmpeq_epu8_mask(v0, pattern_0xe2);
        __mmask64 cmp1 = _mm512_cmpeq_epu8_mask(v1, pattern_0x80);
        __mmask64 cmp2 = _mm512_cmpeq_epu8_mask(v2, pattern_0x94);
        uint64_t match_mask = cmp0 & cmp1 & cmp2;
        if (match_mask == 0) {
            memcpy(out_ptr + out_idx, input + i, 64);
            out_idx += 64;
            i += 64;
            continue;
        }
        const size_t chunk_end = i + 64;
        size_t write_pos = i;
        while (match_mask != 0) {
            size_t match_pos = i + (size_t)dashem_ctzll(match_mask);
            if (match_pos > write_pos) {
                size_t copy_len = match_pos - write_pos;
                memcpy(out_ptr + out_idx, input + write_pos, copy_len);
                out_idx += copy_len;
            }
            write_pos = match_pos + 3;
            match_mask &= match_mask - 1;
        }
        if (write_pos < chunk_end) {
            size_t remaining = chunk_end - write_pos;
            memcpy(out_ptr + out_idx, input + write_pos, remaining);
            out_idx += remaining;
            write_pos = chunk_end;
        }
        i = write_pos;
    }

    while (i + 34 <= input_len) {
        __m256i v0 = _mm256_loadu_si256((const __m256i *)(input + i));
        __m256i v1 = _mm256_loadu_si256((const __m256i *)(input + i + 1));
        __m256i v2 = _mm256_loadu_si256((const __m256i *)(input + i + 2));
        __m256i full_match = _mm256_and_si256(
            _mm256_cmpeq_epi8(v0, pattern256_0xe2),
            _mm256_and_si256(_mm256_cmpeq_epi8(v1, pattern256_0x80),
                             _mm256_cmpeq_epi8(v2, pattern256_0x94)));
        uint32_t em_dash_mask = (uint32_t)_mm256_movemask_epi8(full_match);
        if (em_dash_mask == 0) {
            memcpy(out_ptr + out_idx, input + i, 32);
            out_idx += 32;
            i += 32;
            continue;
        }
        const size_t chunk_end = i + 32;
        size_t write_pos = i;
        while (em_dash_mask != 0) {
            size_t match_pos = i + (size_t)dashem_ctz(em_dash_mask);
            if (match_pos > write_pos) {
                size_t copy_len = match_pos - write_pos;
                memcpy(out_ptr + out_idx, input + write_pos, copy_len);
                out_idx += copy_len;
            }
            write_pos = match_pos + 3;
            em_dash_mask &= em_dash_mask - 1;
        }
        if (write_pos < chunk_end) {
            size_t remaining = chunk_end - write_pos;
            memcpy(out_ptr + out_idx, input + write_pos, remaining);
            out_idx += remaining;
            write_pos = chunk_end;
        }
        i = write_pos;
    }

    /* Scalar tail. */
    while (i < input_len) {
        if (i + 3 <= input_len &&
            in_ptr[i] == 0xE2 &&
            in_ptr[i + 1] == 0x80 &&
            in_ptr[i + 2] == 0x94) {
            i += 3;
        } else {
            out_ptr[out_idx++] = in_ptr[i++];
        }
    }
    *output_len = out_idx;
    return 0;
}
#if defined(__AVX512VBMI2__) && defined(__AVX512BW__)
static int dashem_remove_avx512_compress(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
) {
if (output_capacity < input_len) {
return -1;
}
size_t out_idx = 0;
size_t i = 0;
const unsigned char *in_ptr = (const unsigned char *)input;
unsigned char *out_ptr = (unsigned char *)output;
const __m512i pattern_e2 = _mm512_set1_epi8((char)0xE2);
const __m512i pattern_80 = _mm512_set1_epi8((char)0x80);
const __m512i pattern_94 = _mm512_set1_epi8((char)0x94);
while (i + 64 <= input_len) {
if (i + 128 < input_len) {
_mm_prefetch(input + i + 128, _MM_HINT_T0);
}
__m512i v0 = _mm512_loadu_si512((__m512i *)(input + i));
__m512i v1 = _mm512_loadu_si512((__m512i *)(input + i + 1));
__m512i v2 = _mm512_loadu_si512((__m512i *)(input + i + 2));
__mmask64 match_e2 = _mm512_cmpeq_epi8_mask(v0, pattern_e2);
__mmask64 match_80 = _mm512_cmpeq_epi8_mask(v1, pattern_80);
__mmask64 match_94 = _mm512_cmpeq_epi8_mask(v2, pattern_94);
__mmask64 em_dash_start = match_e2 & match_80 & match_94;
const __mmask64 process_mask = 0x1FFFFFFFFFFFFFFFULL;
if ((em_dash_start & process_mask) == 0) {
_mm512_mask_storeu_epi8(out_ptr + out_idx, process_mask, v0);
out_idx += 61;
i += 61;
continue;
}
em_dash_start &= process_mask;
__mmask64 keep_mask = ~em_dash_start & process_mask;
__mmask64 em_dash_byte2 = em_dash_start << 1;
__mmask64 em_dash_byte3 = em_dash_start << 2;
keep_mask &= ~em_dash_byte2;
keep_mask &= ~em_dash_byte3;
_mm512_mask_compressstoreu_epi8(out_ptr + out_idx, keep_mask, v0);
out_idx += DASHEM_POPCOUNTLL(keep_mask);
i += 61;
}
while (i < input_len) {
if (i + 3 <= input_len &&
in_ptr[i] == 0xE2 &&
in_ptr[i + 1] == 0x80 &&
in_ptr[i + 2] == 0x94) {
i += 3;
} else {
out_ptr[out_idx++] = in_ptr[i++];
}
}
*output_len = out_idx;
return 0;
}
#endif
#endif
#if defined(__SSE4_2__)
#include <nmmintrin.h>
/*
 * 128-bit SSE em-dash remover.
 *
 * Processes 16-byte windows. A window without any 0xE2 byte, or without a
 * full E2 80 94 sequence, is stored unchanged; otherwise the gaps between
 * matches are copied with memcpy.
 *
 * Returns -1 when output_capacity < input_len; otherwise writes the result
 * length to *output_len and returns 0.
 *
 * Window boundary: an em-dash that starts at lane 14 or 15 ends one or two
 * bytes past the window. write_pos (the next input byte still to be copied)
 * then lies beyond chunk_end, and the next iteration resumes from write_pos
 * so the continuation bytes are not copied.
 */
static int dashem_remove_sse42(
    const char *input,
    size_t input_len,
    char *output,
    size_t output_capacity,
    size_t *output_len
) {
    if (output_capacity < input_len) {
        return -1;
    }
    size_t out_idx = 0;
    size_t i = 0;
    const unsigned char *in_ptr = (const unsigned char *)input;
    unsigned char *out_ptr = (unsigned char *)output;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverflow"
    const __m128i pattern_0xe2 = _mm_set1_epi8((char)0xE2);
    const __m128i pattern_0x80 = _mm_set1_epi8((char)0x80);
    const __m128i pattern_0x94 = _mm_set1_epi8((char)0x94);
#pragma GCC diagnostic pop

    /* Loads at i, i+1 and i+2 touch bytes i .. i+17, hence "i + 18". */
    while (i + 18 <= input_len) {
        __m128i v0 = _mm_loadu_si128((const __m128i *)(input + i));
        __m128i cmp0 = _mm_cmpeq_epi8(v0, pattern_0xe2);
        uint32_t e2_mask = (uint32_t)_mm_movemask_epi8(cmp0);
        /* Fast path: no lead byte, nothing can match. */
        if (LIKELY(e2_mask == 0)) {
            _mm_storeu_si128((__m128i *)(out_ptr + out_idx), v0);
            out_idx += 16;
            i += 16;
            continue;
        }
        __m128i v1 = _mm_loadu_si128((const __m128i *)(input + i + 1));
        __m128i v2 = _mm_loadu_si128((const __m128i *)(input + i + 2));
        __m128i cmp1 = _mm_cmpeq_epi8(v1, pattern_0x80);
        __m128i cmp2 = _mm_cmpeq_epi8(v2, pattern_0x94);
        __m128i full_match = _mm_and_si128(cmp0, _mm_and_si128(cmp1, cmp2));
        uint32_t em_dash_mask = (uint32_t)_mm_movemask_epi8(full_match);
        if (em_dash_mask == 0) {
            _mm_storeu_si128((__m128i *)(out_ptr + out_idx), v0);
            out_idx += 16;
            i += 16;
            continue;
        }

        const size_t chunk_end = i + 16;
        size_t write_pos = i;
        while (em_dash_mask != 0) {
            size_t match_pos = i + (size_t)dashem_ctz(em_dash_mask);
            if (match_pos > write_pos) {
                size_t copy_len = match_pos - write_pos;
                memcpy(out_ptr + out_idx, input + write_pos, copy_len);
                out_idx += copy_len;
            }
            write_pos = match_pos + 3;
            em_dash_mask &= em_dash_mask - 1;   /* clear the handled match */
        }
        if (write_pos < chunk_end) {
            size_t remaining = chunk_end - write_pos;
            memcpy(out_ptr + out_idx, input + write_pos, remaining);
            out_idx += remaining;
            write_pos = chunk_end;
        }
        /* chunk_end, or up to 2 bytes past it for a straddling em-dash. */
        i = write_pos;
    }

    /* Scalar tail. */
    while (i < input_len) {
        if (i + 3 <= input_len &&
            in_ptr[i] == 0xE2 &&
            in_ptr[i + 1] == 0x80 &&
            in_ptr[i + 2] == 0x94) {
            i += 3;
        } else {
            out_ptr[out_idx++] = in_ptr[i++];
        }
    }
    *output_len = out_idx;
    return 0;
}
#endif
#if defined(__BMI2__)
#include <immintrin.h>
/*
 * BMI2 em-dash remover: 8 bytes per iteration, compacted with PEXT.
 *
 * For each 8-byte chunk a byte-granular keep mask is built (0xFF per kept
 * byte, 0x00 per dropped byte); _pext_u64 then packs the kept bytes into the
 * low end of a 64-bit word, and that many bytes are appended to the output.
 *
 * Returns -1 when output_capacity < input_len; otherwise writes the result
 * length to *output_len and returns 0.
 *
 * skip_until records the end of an em-dash (E2 80 94) that starts in one
 * chunk and finishes in the next, so its continuation bytes are dropped from
 * the following chunk. It is also applied before the scalar tail: when the
 * straddling em-dash belongs to the last full chunk, the tail must start
 * after it.
 */
static int dashem_remove_bmi2(
    const char *input,
    size_t input_len,
    char *output,
    size_t output_capacity,
    size_t *output_len
) {
    if (output_capacity < input_len) {
        return -1;
    }
    size_t out_idx = 0;
    size_t i = 0;
    size_t skip_until = 0;
    const unsigned char *in_ptr = (const unsigned char *)input;
    unsigned char *out_ptr = (unsigned char *)output;

    while (i + 8 <= input_len) {
        uint64_t chunk;
        memcpy(&chunk, in_ptr + i, 8);
        uint64_t keep_mask = 0xFFFFFFFFFFFFFFFFULL;

        /* Drop leading bytes that finish an em-dash begun in the previous chunk. */
        for (int j = 0; j < 8 && i + j < skip_until; j++) {
            keep_mask &= ~(0xFFULL << (j * 8));
        }

        /* Find em-dashes starting in this chunk (all three bytes must be in range). */
        for (int j = 0; j < 8 && i + j + 2 < input_len; j++) {
            if (i + j < skip_until) {
                continue;
            }
            if (in_ptr[i + j] == 0xE2 &&
                in_ptr[i + j + 1] == 0x80 &&
                in_ptr[i + j + 2] == 0x94) {
                size_t em_dash_end = i + j + 3;
                if (em_dash_end > i + 8) {
                    skip_until = em_dash_end;
                }
                /* Clear the part of the em-dash that lies inside this chunk. */
                for (int k = j; k < 8 && k < j + 3; k++) {
                    keep_mask &= ~(0xFFULL << (k * 8));
                }
                j += 2;
            }
        }

        if (keep_mask == 0xFFFFFFFFFFFFFFFFULL) {
            memcpy(out_ptr + out_idx, &chunk, 8);
            out_idx += 8;
            i += 8;
            continue;
        }

        uint64_t compacted = _pext_u64(chunk, keep_mask);
        /* keep_mask has 8 bits per kept byte. */
        int bytes_kept = DASHEM_POPCOUNTLL(keep_mask) / 8;
        memcpy(out_ptr + out_idx, &compacted, (size_t)bytes_kept);
        out_idx += (size_t)bytes_kept;
        i += 8;
    }

    /* An em-dash straddling the end of the last full chunk has already been
     * dropped up to skip_until (which is <= input_len); do not re-read its
     * continuation bytes in the tail. */
    if (skip_until > i) {
        i = skip_until;
    }

    /* Scalar tail. */
    while (i < input_len) {
        if (i + 3 <= input_len &&
            in_ptr[i] == 0xE2 &&
            in_ptr[i + 1] == 0x80 &&
            in_ptr[i + 2] == 0x94) {
            i += 3;
        } else {
            out_ptr[out_idx++] = in_ptr[i++];
        }
    }
    *output_len = out_idx;
    return 0;
}
#endif
#if defined(__ARM_NEON)
#include <arm_neon.h>
/*
 * NEON em-dash remover.
 *
 * Scans 16-byte windows for the sequence E2 80 94 and copies everything else
 * to the output. write_pos is the next input byte that still has to be
 * copied; it is carried across windows, so an em-dash that starts near the
 * end of one window and ends in the next is skipped as a whole.
 *
 * Returns -1 when output_capacity < input_len; otherwise writes the result
 * length to *output_len and returns 0.
 */
static int dashem_remove_neon(
const char *input,
size_t input_len,
char *output,
size_t output_capacity,
size_t *output_len
) {
if (output_capacity < input_len) {
return -1;
}
size_t out_idx = 0;
size_t i = 0;
size_t write_pos = 0;
const unsigned char *in_ptr = (const unsigned char *)input;
unsigned char *out_ptr = (unsigned char *)output;
const uint8x16_t pattern_0xe2 = vdupq_n_u8(0xE2);
const uint8x16_t pattern_0x80 = vdupq_n_u8(0x80);
const uint8x16_t pattern_0x94 = vdupq_n_u8(0x94);
/* Loads at i, i+1 and i+2 touch bytes i .. i+17, hence "i + 18". */
while (i + 18 <= input_len) {
uint8x16_t v0 = vld1q_u8((const uint8_t *)(input + i));
uint8x16_t v1 = vld1q_u8((const uint8_t *)(input + i + 1));
uint8x16_t v2 = vld1q_u8((const uint8_t *)(input + i + 2));
uint8x16_t cmp0 = vceqq_u8(v0, pattern_0xe2);
uint8x16_t cmp1 = vceqq_u8(v1, pattern_0x80);
uint8x16_t cmp2 = vceqq_u8(v2, pattern_0x94);
/* Lane k is non-zero when bytes i+k, i+k+1, i+k+2 form an em-dash. */
uint8x16_t full_match = vandq_u8(cmp0, vandq_u8(cmp1, cmp2));
uint64_t mask_low = vgetq_lane_u64(vreinterpretq_u64_u8(full_match), 0);
uint64_t mask_high = vgetq_lane_u64(vreinterpretq_u64_u8(full_match), 1);
/* No match in this window: copy from write_pos (which may lie inside the
 * window after a straddling em-dash) up to the window end. */
if (mask_low == 0 && mask_high == 0) {
size_t chunk_end = i + 16;
if (write_pos < chunk_end) {
size_t copy_len = chunk_end - write_pos;
memcpy(out_ptr + out_idx, input + write_pos, copy_len);
out_idx += copy_len;
write_pos = chunk_end;
}
i += 16;
continue;
}
/* Spill the per-lane match flags and walk them in order. */
uint8_t match_bytes[16];
vst1q_u8(match_bytes, full_match);
for (int j = 0; j < 16; j++) {
if (match_bytes[j] != 0) {
size_t match_pos = i + j;
if (match_pos > write_pos) {
size_t copy_len = match_pos - write_pos;
memcpy(out_ptr + out_idx, input + write_pos, copy_len);
out_idx += copy_len;
}
write_pos = match_pos + 3;
/* Skip the two continuation lanes (the loop's j++ adds the third step). */
j += 2;
}
}
size_t chunk_end = i + 16;
if (write_pos < chunk_end) {
size_t remaining = chunk_end - write_pos;
memcpy(out_ptr + out_idx, input + write_pos, remaining);
out_idx += remaining;
}
/* Bring write_pos up to the window end unless an em-dash already pushed it
 * past that point. */
if (write_pos < chunk_end) {
write_pos = chunk_end;
}
i = chunk_end;
}
/* Scalar tail starts at write_pos, which can be ahead of i after a
 * straddling em-dash (and is 0 when the vector loop never ran). */
size_t scalar_start = (i > 0) ? write_pos : 0;
while (scalar_start < input_len) {
if (scalar_start + 3 <= input_len &&
in_ptr[scalar_start] == 0xE2 &&
in_ptr[scalar_start + 1] == 0x80 &&
in_ptr[scalar_start + 2] == 0x94) {
scalar_start += 3;
} else {
out_ptr[out_idx++] = in_ptr[scalar_start++];
}
}
*output_len = out_idx;
return 0;
}
#endif
/* Returns 1 if `c` has the UTF-8 continuation-byte form 10xxxxxx, else 0. */
static inline int is_continuation_byte(unsigned char c) {
    /* The top two bits of a continuation byte are binary 10. */
    return (c >> 6) == 0x2;
}
/*
 * Returns the total length in bytes (1-4) of the UTF-8 sequence introduced by
 * `first_byte`, or 0 if the byte cannot start a well-formed sequence.
 *
 * Besides continuation bytes (0x80-0xBF) and 0xF8-0xFF, this rejects lead
 * bytes that can never occur in well-formed UTF-8: 0xC0 and 0xC1 (they could
 * only encode overlong forms of ASCII) and 0xF5-0xF7 (they would encode code
 * points above U+10FFFF).
 */
static inline int get_utf8_char_len(unsigned char first_byte) {
    if (first_byte < 0x80) {
        return 1;
    }
    if (first_byte >= 0xC2 && first_byte <= 0xDF) {
        return 2;
    }
    if (first_byte >= 0xE0 && first_byte <= 0xEF) {
        return 3;
    }
    if (first_byte >= 0xF0 && first_byte <= 0xF4) {
        return 4;
    }
    return 0;
}
/*
 * Checks whether a structurally valid UTF-8 sequence starts at `ptr`.
 *
 * ptr        first byte of the candidate sequence
 * remaining  number of readable bytes starting at `ptr`
 *
 * Returns 1 if the lead byte is accepted by get_utf8_char_len, the whole
 * sequence fits in `remaining`, and every following byte is a continuation
 * byte; returns 0 otherwise (including when `remaining` is 0).
 */
static inline int validate_utf8_char(const unsigned char *ptr, size_t remaining) {
    /* Do not read *ptr when there is nothing to read; a zero length is rejected below. */
    const int len = (remaining > 0) ? get_utf8_char_len(*ptr) : 0;
    if (UNLIKELY(len == 0 || (size_t)len > remaining)) {
        return 0;
    }
    for (int i = 1; i < len; i++) {
        if (!is_continuation_byte(ptr[i])) {
            return 0;
        }
    }
    return 1;
}
/*
 * Copies one UTF-8 sequence from `input` to `output`, applying `mode` when the
 * sequence is malformed.
 *
 * input            first byte of the sequence
 * remaining        number of readable bytes starting at `input`
 * output           destination for the copied (or replacement) bytes
 * output_capacity  in/out: free bytes in `output`; reduced by the number of
 *                  bytes written
 * mode             DASHEM_UTF8_STRICT fails on malformed input,
 *                  DASHEM_UTF8_SKIP drops the offending byte, any other value
 *                  writes U+FFFD (EF BF BD) in its place
 *
 * Returns the number of INPUT bytes consumed (the sequence length, or 1 for a
 * malformed byte that was skipped or replaced), -1 for malformed input in
 * strict mode, or -2 if `output` has too little room. In replacement mode the
 * return value is 1 although three bytes are written; the number of bytes
 * written is only visible through the change in *output_capacity.
 */
static DASHEM_UNUSED int process_utf8_char(
    const unsigned char *input,
    size_t remaining,
    unsigned char *output,
    size_t *output_capacity,
    dashem_utf8_mode_t mode
) {
    /* An empty window is treated as malformed without reading input[0]. */
    const int char_len = (remaining > 0) ? get_utf8_char_len(input[0]) : 0;

    int malformed = (char_len == 0 || (size_t)char_len > remaining);
    for (int i = 1; !malformed && i < char_len; i++) {
        if (!is_continuation_byte(input[i])) {
            malformed = 1;
        }
    }

    if (UNLIKELY(malformed)) {
        if (mode == DASHEM_UTF8_STRICT) {
            return -1;
        }
        if (mode == DASHEM_UTF8_SKIP) {
            return 1;
        }
        /* Replacement mode: emit U+FFFD and consume a single input byte. */
        if (*output_capacity < 3) {
            return -2;
        }
        output[0] = 0xEF;
        output[1] = 0xBF;
        output[2] = 0xBD;
        *output_capacity -= 3;
        return 1;
    }

    if (*output_capacity < (size_t)char_len) {
        return -2;
    }
    memcpy(output, input, (size_t)char_len);
    *output_capacity -= (size_t)char_len;
    return char_len;
}
/*
 * Removes every UTF-8 em-dash (E2 80 94) from `buffer` in place.
 *
 * buffer      bytes to filter; the result overwrites the start of the same
 *             storage (the function writes through a non-const pointer derived
 *             from `buffer`)
 * input_len   number of bytes in `buffer`
 * output_len  receives the length of the filtered data
 *
 * Always returns 0.
 */
static DASHEM_ALWAYS_INLINE int dashem_remove_insitu(
    const char *buffer,
    size_t input_len,
    size_t *output_len
) {
    const unsigned char *src = (const unsigned char *)buffer;
    unsigned char *dst = (unsigned char *)buffer;

    /*
     * Phase 1: find the length of the leading run that contains no 0xE2 byte.
     * Nothing has been removed yet, so this run is already in its final place
     * and no bytes need to move. Eight bytes are tested per step with the
     * "has zero byte" bit trick applied to word ^ 0xE2...E2.
     * NOTE(review): turning the lowest set bit into a byte index assumes the
     * first byte in memory is the low-order byte of the word - confirm for
     * big-endian targets.
     */
    size_t prefix = 0;
    while (prefix + 10 <= input_len) {
        uint64_t word;
        memcpy(&word, src + prefix, 8);
        const uint64_t diff = word ^ 0xE2E2E2E2E2E2E2E2ULL;
        const uint64_t e2_lanes =
            (diff - 0x0101010101010101ULL) & ~diff & 0x8080808080808080ULL;
        if (UNLIKELY(e2_lanes != 0)) {
            /* Stop at the first 0xE2; the byte loop takes over from there. */
            prefix += (size_t)(dashem_ctzll(e2_lanes) >> 3);
            break;
        }
        prefix += 8;
    }

    /* Phase 2: byte-wise compaction of everything after the clean prefix. */
    size_t rd = prefix;
    size_t wr = prefix;
    while (rd < input_len) {
        const int at_dash = rd + 3 <= input_len &&
                            src[rd] == 0xE2 &&
                            src[rd + 1] == 0x80 &&
                            src[rd + 2] == 0x94;
        if (at_dash) {
            rd += 3;
            continue;
        }
        /* Skip the store while the read and write cursors still coincide. */
        if (wr != rd) {
            dst[wr] = src[rd];
        }
        wr++;
        rd++;
    }

    *output_len = wr;
    return 0;
}
/*
 * Public entry point: removes UTF-8 em-dashes from `input` into `output`.
 *
 * Dispatch order:
 *   1. any NULL pointer argument        -> return -2
 *   2. inputs shorter than 32 bytes     -> dashem_remove_fast_small
 *   3. input and output alias           -> in-place removal
 *   4. output_capacity < input_len      -> return -1
 *   5. otherwise                        -> the implementation held in
 *                                          g_dashem_remove_impl, obtained from
 *                                          dashem_init_impl() on first use
 *
 * NOTE(review): the lazy initialisation of g_dashem_remove_impl is not
 * synchronised here - confirm what the threading expectations are.
 */
int dashem_remove(
    const char * restrict input,
    size_t input_len,
    char * restrict output,
    size_t output_capacity,
    size_t * restrict output_len
) {
    if (input == NULL || output == NULL || output_len == NULL) {
        return -2;
    }

    if (UNLIKELY(input_len < 32)) {
        return dashem_remove_fast_small(input, input_len, output, output_capacity, output_len);
    }

    /* Same buffer for source and destination: compact it in place. */
    if (UNLIKELY((const void *)input == (const void *)output)) {
        return dashem_remove_insitu(input, input_len, output_len);
    }

    if (UNLIKELY(input_len > output_capacity)) {
        return -1;
    }

    if (!g_dashem_remove_impl) {
        g_dashem_remove_impl = dashem_init_impl();
    }
    return g_dashem_remove_impl(input, input_len, output, output_capacity, output_len);
}
/* Returns the library version as a NUL-terminated string with static storage. */
const char* dashem_version(void) {
    static const char version_string[] = "1.1.2";
    return version_string;
}
/*
 * Returns a human-readable name for the widest instruction set that was both
 * enabled at compile time (feature macro defined) and reported by
 * dashem_detect_cpu_features() at run time. Candidates are checked in this
 * order: AVX-512 VBMI2, AVX-512F, AVX2, SSE4.2, NEON. "Scalar" is returned
 * when none of them applies.
 *
 * NOTE(review): BMI2 is not reported here - confirm this matches what
 * dashem_init_impl() can select.
 */
const char* dashem_implementation_name(void) {
    uint32_t features = dashem_detect_cpu_features();
    /* Keeps scalar-only builds free of an unused-variable warning. */
    (void)features;
#if defined(__AVX512VBMI2__) && defined(__AVX512BW__)
    if (features & DASHEM_CPU_AVX512VBMI2) {
        return "AVX-512 VBMI2 (VPCOMPRESSB)";
    }
#endif
#if defined(__AVX512F__)
    if (features & DASHEM_CPU_AVX512F) {
        return "AVX-512F";
    }
#endif
#if defined(__AVX2__)
    if (features & DASHEM_CPU_AVX2) {
        return "AVX2";
    }
#endif
#if defined(__SSE4_2__)
    if (features & DASHEM_CPU_SSE42) {
        return "SSE4.2";
    }
#endif
#if defined(__ARM_NEON)
    if (features & DASHEM_CPU_NEON) {
        return "NEON";
    }
#endif
    return "Scalar";
}
/*
 * Removes UTF-8 em-dashes (E2 80 94) from `input` while checking the structure
 * of every other sequence (lead byte accepted by get_utf8_char_len, sequence
 * fits in the input, following bytes are continuation bytes).
 *
 * input / input_len  bytes to process
 * output             destination buffer
 * output_capacity    size of `output` in bytes
 * output_len         receives the number of bytes written; only set on success
 * utf8_mode          handling of a malformed byte:
 *                      DASHEM_UTF8_STRICT - fail
 *                      DASHEM_UTF8_SKIP   - drop the byte
 *                      any other value    - write U+FFFD (EF BF BD) instead
 *
 * Returns 0 on success; -2 if a pointer argument is NULL or malformed input is
 * met in strict mode; -1 if output_capacity is 0 (checked before anything is
 * read, so this also applies to empty input) or `output` runs out of room.
 */
int dashem_remove_utf8(
    const char *input,
    size_t input_len,
    char *output,
    size_t output_capacity,
    size_t *output_len,
    dashem_utf8_mode_t utf8_mode
) {
    if (input == NULL || output == NULL || output_len == NULL) {
        return -2;
    }
    if (UNLIKELY(output_capacity == 0)) {
        return -1;
    }

    const unsigned char *src = (const unsigned char *)input;
    unsigned char *dst = (unsigned char *)output;
    size_t produced = 0;  /* bytes written to dst */
    size_t pos = 0;       /* read cursor in src */

    while (pos < input_len) {
        if (UNLIKELY(pos + 3 <= input_len &&
                     src[pos] == 0xE2 &&
                     src[pos + 1] == 0x80 &&
                     src[pos + 2] == 0x94)) {
            pos += 3;
            continue;
        }

        const int seq_len = get_utf8_char_len(src[pos]);
        int well_formed = (seq_len > 0 && pos + (size_t)seq_len <= input_len);
        for (int k = 1; well_formed && k < seq_len; k++) {
            if (!is_continuation_byte(src[pos + k])) {
                well_formed = 0;
            }
        }

        if (LIKELY(well_formed)) {
            if (UNLIKELY(output_capacity - produced < (size_t)seq_len)) {
                return -1;
            }
            memcpy(dst + produced, src + pos, (size_t)seq_len);
            produced += (size_t)seq_len;
            pos += (size_t)seq_len;
            continue;
        }

        /* Malformed byte: a bad lead byte, a truncated sequence or a bad continuation. */
        if (utf8_mode == DASHEM_UTF8_STRICT) {
            return -2;
        }
        if (utf8_mode != DASHEM_UTF8_SKIP) {
            if (UNLIKELY(output_capacity - produced < 3)) {
                return -1;
            }
            dst[produced++] = 0xEF;
            dst[produced++] = 0xBF;
            dst[produced++] = 0xBD;
        }
        /* Skip and replace both consume exactly one input byte. */
        pos++;
    }

    *output_len = produced;
    return 0;
}