base64_turbo/lib.rs

1//! # Base64 Turbo
2//!
3//! [![Crates.io](https://img.shields.io/crates/v/base64-turbo.svg)](https://crates.io/crates/base64-turbo)
4//! [![Documentation](https://docs.rs/base64-turbo/badge.svg)](https://docs.rs/base64-turbo)
5//! [![License](https://img.shields.io/github/license/hacer-bark/base64-turbo)](https://github.com/hacer-bark/base64-turbo/blob/main/LICENSE)
6//! [![Kani Verified](https://img.shields.io/github/actions/workflow/status/hacer-bark/base64-turbo/verification.yml?label=Kani%20Verified)](https://github.com/hacer-bark/base64-turbo/actions/workflows/verification.yml)
7//! [![MIRI Verified](https://img.shields.io/github/actions/workflow/status/hacer-bark/base64-turbo/miri.yml?label=MIRI%20Verified)](https://github.com/hacer-bark/base64-turbo/actions/workflows/miri.yml)
8//! [![Logic Tests](https://img.shields.io/github/actions/workflow/status/hacer-bark/base64-turbo/tests.yml?label=Logic%20Tests)](https://github.com/hacer-bark/base64-turbo/actions/workflows/tests.yml)
9//!
10//! A SIMD-accelerated Base64 encoder/decoder for Rust, optimized for high-throughput systems.
11//!
12//! This crate uses runtime CPU detection to dispatch to AVX2, SSE4.1, or (behind a feature flag) AVX512 intrinsics.
13//! It includes a highly optimized scalar fallback for non-SIMD targets and supports `no_std` environments.
14//!
15//! ## Usage
16//!
17//! Add this to your `Cargo.toml`:
18//!
19//! ```toml
20//! [dependencies]
21//! base64-turbo = "0.1"
22//! ```
23//!
24//! ### Basic API (Allocating)
25//!
26//! Standard usage for general applications. Requires the `std` feature (enabled by default).
27//!
28//! ```rust
29//! # #[cfg(feature = "std")]
30//! # {
31//! use base64_turbo::STANDARD;
32//!
33//! let data = b"Hello world";
34//!
35//! // Encode to String
36//! let encoded = STANDARD.encode(data);
37//! assert_eq!(encoded, "SGVsbG8gd29ybGQ=");
38//!
39//! // Decode to Vec<u8>
40//! let decoded = STANDARD.decode(&encoded).unwrap();
41//! assert_eq!(decoded, data);
42//! # }
43//! ```
44//!
45//! ### Zero-Allocation API (Slice-based)
46//!
47//! For low-latency scenarios or `no_std` environments where heap allocation is undesirable.
48//! These methods write directly into a user-provided mutable slice.
49//!
50//! ```rust
51//! use base64_turbo::STANDARD;
52//!
53//! let input = b"Raw bytes";
54//! let mut output = [0u8; 64]; // Pre-allocated stack buffer
55//!
56//! // Returns Result<usize, Error> indicating bytes written
57//! let len = STANDARD.encode_into(input, &mut output).unwrap();
58//!
59//! assert_eq!(&output[..len], b"UmF3IGJ5dGVz");
60//! ```
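//!
//! Decoding into a caller-provided buffer works the same way. A short illustration,
//! reusing the encoded output from above:
//!
//! ```rust
//! use base64_turbo::STANDARD;
//!
//! let mut decoded = [0u8; 64]; // Stack buffer; must hold at least `estimate_decoded_len` bytes
//! let len = STANDARD.decode_into(b"UmF3IGJ5dGVz", &mut decoded).unwrap();
//!
//! assert_eq!(&decoded[..len], b"Raw bytes");
//! ```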
61//!
62//! ## Feature Flags
63//!
64//! This crate is highly configurable via Cargo features:
65//!
66//! | Feature | Default | Description |
67//! |---------|---------|-------------|
68//! | **`std`** | **Yes** | Enables `String` and `Vec` support. Disable this for `no_std` environments. |
69//! | **`simd`** | **Yes** | Enables runtime detection for AVX2 and SSE4.1 intrinsics. If disabled or unsupported by the hardware, the crate automatically falls back to the scalar implementation. |
70//! | **`parallel`** | **No** | Enables [Rayon](https://crates.io/crates/rayon) support. Automatically parallelizes processing for payloads larger than 512KB. Recommended only for massive data ingestion tasks. |
71//! | **`avx512`** | **No** | Enables AVX512 intrinsics. |
72//!
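//! For example, the optional accelerators can be enabled (or the defaults trimmed for
//! `no_std` targets) directly in `Cargo.toml`. An illustrative configuration:
//!
//! ```toml
//! [dependencies]
//! # Opt into Rayon-based parallelism and the AVX512 kernels on top of the defaults.
//! base64-turbo = { version = "0.1", features = ["parallel", "avx512"] }
//!
//! # Or, for no_std targets, drop the default `std` feature:
//! # base64-turbo = { version = "0.1", default-features = false }
//! ```
//!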
73//! ## Safety & Verification
74//!
75//! This crate utilizes `unsafe` code for SIMD intrinsics and pointer arithmetic to achieve maximum performance.
76//!
77//! *   **Formal Verification (Kani):** The scalar (done) and AVX2 (done) paths are mathematically proven to be free of undefined behavior and panics; proofs for SSE4.1 and AVX512 are in progress.
78//! *   **MIRI Tests:** Core SIMD logic and scalar fallbacks are verified with **MIRI** (Undefined Behavior checker) in CI.
79//! *   **Fuzzing:** The codebase is fuzz-tested via `cargo-fuzz`.
80//! *   **Fallback:** Unsupported instruction sets are detected at runtime, ensuring a safe fallback to the scalar code path.
81//! 
82//! **[Learn More](https://github.com/hacer-bark/base64-turbo/blob/main/docs/verification.md)**: Details on our threat model and formal verification strategy.
83
84#![cfg_attr(not(any(feature = "std", test)), no_std)]
85
86#![doc(issue_tracker_base_url = "https://github.com/hacer-bark/base64-turbo/issues/")]
87
88#![deny(unsafe_op_in_unsafe_fn)]
89#![warn(missing_docs)]
90#![warn(rust_2018_idioms)]
91#![warn(unused_qualifications)]
92
93#![cfg_attr(docsrs, feature(doc_cfg))]
94
95#[cfg(feature = "parallel")]
96use rayon::prelude::*;
97
98#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
99#[cfg(feature = "simd")]
100mod simd;
101mod scalar;
102
103// ======================================================================
104// ERROR DEFINITION
105// ======================================================================
106
107/// Errors that can occur during Base64 encoding or decoding operations.
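///
/// # Examples
///
/// Matching on the variants (an illustrative sketch; the allocating `decode` requires the `std` feature):
///
/// ```
/// # #[cfg(feature = "std")]
/// # {
/// use base64_turbo::{Error, STANDARD};
///
/// match STANDARD.decode("not base64!") {
///     Ok(bytes) => println!("decoded {} bytes", bytes.len()),
///     Err(Error::InvalidLength) => eprintln!("length is not divisible by 4"),
///     Err(Error::InvalidCharacter) => eprintln!("input contains a byte outside the alphabet"),
///     Err(Error::BufferTooSmall) => eprintln!("output buffer is too small"),
/// }
/// # }
/// ```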
108#[derive(Debug, Clone, Copy, PartialEq, Eq)]
109pub enum Error {
110    /// The input length is invalid for Base64 decoding.
111    ///
112    /// Base64 encoded data (with padding) must strictly have a length divisible by 4.
113    /// If the input string is truncated or has incorrect padding length, this error is returned.
114    InvalidLength,
115
116    /// An invalid character was encountered during decoding.
117    ///
118    /// This occurs if the input contains bytes that do not belong to the
119    /// selected Base64 alphabet (e.g., symbols not in the standard set) or
120    /// if padding characters (`=`) appear in invalid positions.
121    InvalidCharacter,
122
123    /// The provided output buffer is too small to hold the result.
124    ///
125    /// This error is returned by the zero-allocation APIs (e.g., `encode_into`, `decode_into`)
126    /// when the destination slice passed by the user does not have enough capacity
127    /// to store the encoded or decoded data.
128    BufferTooSmall,
129}
130
131// Standard Display implementation for better error messages
132impl core::fmt::Display for Error {
133    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
134        match self {
135            Error::InvalidLength => write!(f, "Invalid Base64 input length (must be divisible by 4)"),
136            Error::InvalidCharacter => write!(f, "Invalid character found in Base64 input"),
137            Error::BufferTooSmall => write!(f, "Destination buffer is too small"),
138        }
139    }
140}
141
142// Enable std::error::Error trait when the 'std' feature is active
143#[cfg(feature = "std")]
144impl std::error::Error for Error {}
145
146// ======================================================================
147// Internal Tuning Constants (Parallelism)
149// ======================================================================
150
151/// Input chunk size for parallel processing (24 KB).
152///
153/// This size is chosen to fit comfortably within the L1/L2 cache of most modern
154/// CPUs, ensuring that hot loops inside the encoder stay cache-resident.
155#[cfg(feature = "parallel")]
156const ENCODE_CHUNK_SIZE: usize = 24 * 1024;
157
158/// Base64-text chunk size for parallel decoding (32 KB), corresponding to `ENCODE_CHUNK_SIZE`.
159///
160/// Base64 encoding expands data by a factor of 4/3, so a 24 KB raw chunk maps to 32 KB of encoded text.
161#[cfg(feature = "parallel")]
162const DECODE_CHUNK_SIZE: usize = (ENCODE_CHUNK_SIZE / 3) * 4;
163
164/// Threshold to enable Rayon parallelism (512 KB).
165///
166/// For payloads smaller than this, the overhead of context switching and
167/// thread synchronization outweighs the throughput gains of multi-threading.
168#[cfg(feature = "parallel")]
169const PARALLEL_THRESHOLD: usize = 512 * 1024;
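
// Illustrative compile-time sanity check (not part of the tuning logic itself): the encode
// chunk must be a multiple of 3 so chunk boundaries never split a Base64 quantum, and the
// corresponding Base64-text chunk is exactly 32 KB.
#[cfg(feature = "parallel")]
const _: () = assert!(ENCODE_CHUNK_SIZE % 3 == 0 && DECODE_CHUNK_SIZE == 32 * 1024);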
170
171// ======================================================================
172// Internal Lookup Tables
173// ======================================================================
174
175/// The Standard RFC 4648 Base64 Alphabet.
176/// Used for `STANDARD` and `STANDARD_NO_PAD`.
177const STANDARD_ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
178
179/// Reverse lookup table for the Standard alphabet, computed at compile time.
180/// Maps ASCII bytes back to 6-bit indices. 0xFF indicates an invalid character.
181const STANDARD_DECODE_TABLE: [u8; 256] = {
182    let mut table = [0xFF; 256];
183    let mut i = 0;
184    while i < 64 {
185        table[STANDARD_ALPHABET[i] as usize] = i as u8;
186        i += 1;
187    }
188    table
189};
190
191/// The URL-Safe Base64 Alphabet.
192/// Replaces `+` with `-` and `/` with `_`. Used for `URL_SAFE` and `URL_SAFE_NO_PAD`.
193const URL_SAFE_ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
194
195/// Reverse lookup table for the URL-Safe alphabet, computed at compile time.
196/// Maps ASCII bytes back to 6-bit indices. 0xFF indicates an invalid character.
197const URL_SAFE_DECODE_TABLE: [u8; 256] = {
198    let mut table = [0xFF; 256];
199    let mut i = 0;
200    while i < 64 {
201        table[URL_SAFE_ALPHABET[i] as usize] = i as u8;
202        i += 1;
203    }
204    table
205};
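
// Illustrative compile-time spot checks of the reverse tables: 'A' maps to index 0 in both
// alphabets, index 62 is '+' (standard) or '-' (URL-safe), and bytes outside the alphabets
// map to the 0xFF sentinel.
const _: () = assert!(
    STANDARD_DECODE_TABLE[b'A' as usize] == 0
        && STANDARD_DECODE_TABLE[b'+' as usize] == 62
        && URL_SAFE_DECODE_TABLE[b'-' as usize] == 62
        && URL_SAFE_DECODE_TABLE[b'*' as usize] == 0xFF
);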
206
207// ======================================================================
208// Configuration & Types
209// ======================================================================
210
211/// Internal configuration for the Base64 engine.
212///
213/// This struct uses `repr(C)` to ensure predictable memory layout.
214#[repr(C)]
215#[derive(Debug, Clone, Copy)]
216pub(crate) struct Config {
217    /// If true, uses `-` and `_` instead of `+` and `/`.
218    pub url_safe: bool,
219    /// If true, writes `=` padding characters to the output.
220    pub padding: bool,
221}
222
223/// A high-performance, stateless Base64 encoder/decoder.
224///
225/// This struct holds the configuration for encoding/decoding (alphabet choice and padding).
226/// It is designed to be immutable and thread-safe.
227///
228/// # Examples
229///
230/// ```rust
231/// # #[cfg(feature = "std")]
232/// # {
233/// use base64_turbo::STANDARD;
234///
235/// let data = b"Hello world";
236///
237/// // Encode to String
238/// let encoded = STANDARD.encode(data);
239/// assert_eq!(encoded, "SGVsbG8gd29ybGQ=");
240///
241/// // Decode to Result<Vec<u8>, Error>
242/// let decoded = STANDARD.decode(&encoded).unwrap();
243/// assert_eq!(decoded, data);
244/// # }
245/// ```
246#[derive(Debug, Clone, Copy)]
247pub struct Engine {
248    pub(crate) config: Config,
249}
250
251// ======================================================================
252// Pre-defined Engines
253// ======================================================================
254
255/// Standard Base64 (RFC 4648) with padding (`=`).
256///
257/// Uses the `+` and `/` characters. This is the most common configuration.
258pub const STANDARD: Engine = Engine {
259    config: Config {
260        url_safe: false,
261        padding: true,
262    },
263};
264
265/// Standard Base64 (RFC 4648) **without** padding.
266///
267/// Uses the `+` and `/` characters, but omits trailing `=` signs.
268/// Useful for raw data streams or specific protocol requirements.
269pub const STANDARD_NO_PAD: Engine = Engine {
270    config: Config {
271        url_safe: false,
272        padding: false,
273    },
274};
275
276/// URL-Safe Base64 with padding.
277///
278/// Uses `-` and `_` instead of `+` and `/`. Safe for use in filenames and URLs.
279pub const URL_SAFE: Engine = Engine {
280    config: Config {
281        url_safe: true,
282        padding: true,
283    },
284};
285
286/// URL-Safe Base64 **without** padding.
287///
288/// Uses `-` and `_`. Commonly used in JWTs (JSON Web Tokens) and other web standards.
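///
/// # Examples
///
/// A brief illustration of the `-`/`_` substitutions and the omitted padding
/// (requires the `std` feature for the allocating `encode`):
///
/// ```
/// # #[cfg(feature = "std")]
/// # {
/// use base64_turbo::{STANDARD, URL_SAFE_NO_PAD};
///
/// let bytes = [0xFBu8, 0xFF];
///
/// // Standard alphabet with padding vs. URL-safe alphabet without padding.
/// assert_eq!(STANDARD.encode(&bytes), "+/8=");
/// assert_eq!(URL_SAFE_NO_PAD.encode(&bytes), "-_8");
/// # }
/// ```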
289pub const URL_SAFE_NO_PAD: Engine = Engine {
290    config: Config {
291        url_safe: true,
292        padding: false,
293    },
294};
295
296impl Engine {
297    // ======================================================================
298    // Length Calculators
299    // ======================================================================
300
301    /// Calculates the exact buffer size required to encode `input_len` bytes.
302    ///
303    /// This method computes the size based on the current configuration (padding vs. no padding).
304    ///
305    /// # Examples
306    ///
307    /// ```
308    /// use base64_turbo::STANDARD;
309    ///
310    /// assert_eq!(STANDARD.encoded_len(3), 4);
311    /// assert_eq!(STANDARD.encoded_len(1), 4); // With padding
312    /// ```
313    #[inline]
314    #[must_use]
315    pub const fn encoded_len(&self, input_len: usize) -> usize {
316        if self.config.padding {
317            // (n + 2) / 3 * 4
318            input_len.div_ceil(3) * 4
319        } else {
320            // (n * 4 + 2) / 3
321            (input_len * 4).div_ceil(3)
322        }
323    }
324
325    /// Calculates the **maximum** buffer size required to decode `input_len` bytes.
326    ///
327    /// # Note
328    /// This is an upper-bound estimate. The actual number of bytes written during
329    /// decoding will likely be smaller.
330    ///
331    /// You should rely on the `usize` returned by [`decode_into`](Self::decode_into)
332    /// to determine the actual valid slice of the output buffer.
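    ///
    /// # Examples
    ///
    /// A small illustration of the upper-bound behavior:
    ///
    /// ```
    /// use base64_turbo::STANDARD;
    ///
    /// // 8 Base64 characters decode to at most 6 bytes; the estimate is never smaller.
    /// assert!(STANDARD.estimate_decoded_len(8) >= 6);
    /// ```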
333    #[inline]
334    #[must_use]
335    pub const fn estimate_decoded_len(&self, input_len: usize) -> usize {
336        // Conservative estimate: 3 bytes for every 4 chars, plus a safety margin
337        // for unpadded/chunked logic.
338        (input_len / 4 + 1) * 3
339    }
340
341    // ======================================================================
342    // Zero-Allocation APIs
343    // ======================================================================
344
345    /// Encodes `input` into the provided `output` buffer.
346    ///
347    /// This is a "Zero-Allocation" API designed for hot paths. It writes directly
348    /// into the destination slice without creating intermediate `Vec`.
349    ///
350    /// # Parallelism
351    /// If the `parallel` feature is enabled and the input size exceeds the
352    /// internal threshold (default: 512KB), this method automatically uses
353    /// Rayon to process chunks in parallel, saturating memory bandwidth.
354    ///
355    /// # Arguments
356    ///
357    /// * `input`: The binary data to encode.
358    /// * `output`: A mutable slice to write the Base64 string into.
359    ///
360    /// # Returns
361    ///
362    /// * `Ok(usize)`: The actual number of bytes written to `output`.
363    /// * `Err(Error::BufferTooSmall)`: If `output.len()` is less than [`encoded_len`](Self::encoded_len).
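    ///
    /// # Examples
    ///
    /// A minimal stack-buffer encoding round:
    ///
    /// ```
    /// use base64_turbo::STANDARD;
    ///
    /// let mut buf = [0u8; 8];
    /// let n = STANDARD.encode_into(b"Hi", &mut buf).unwrap();
    /// assert_eq!(&buf[..n], b"SGk=");
    /// ```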
364    #[inline]
365    pub fn encode_into<T: AsRef<[u8]> + Sync>(
366        &self,
367        input: T,
368        output: &mut [u8],
369    ) -> Result<usize, Error> {
370        let input = input.as_ref();
371        let len = input.len();
372
373        if len == 0 {
374            return Ok(0);
375        }
376
377        let req_len = Self::encoded_len(self, len);
378        if output.len() < req_len {
379            return Err(Error::BufferTooSmall);
380        }
381
382        // --- Parallel Path ---
383        #[cfg(feature = "parallel")]
384        {
385            if len >= PARALLEL_THRESHOLD {
386                // Split input and output into corresponding chunks
387                let out_slice = &mut output[..req_len];
388
389                // Base64 expands 3 bytes -> 4 chars. 
390                // We chunk based on ENCODE_CHUNK_SIZE (24KB) to stay cache-friendly.
391                out_slice
392                    .par_chunks_mut((ENCODE_CHUNK_SIZE / 3) * 4)
393                    .zip(input.par_chunks(ENCODE_CHUNK_SIZE))
394                    .for_each(|(out_chunk, in_chunk)| {
395                        // Safe: We know the chunk sizes match the expansion ratio logic
396                        Self::encode_dispatch(self, in_chunk, out_chunk.as_mut_ptr());
397                    });
398                
399                return Ok(req_len);
400            }
401        }
402
403        // --- Serial Path ---
404        // Pass the raw pointer to the dispatcher. 
405        // SAFETY: We checked output.len() >= req_len above.
406        Self::encode_dispatch(self, input, output[..req_len].as_mut_ptr());
407
408        Ok(req_len)
409    }
410
411    /// Decodes `input` into the provided `output` buffer.
412    ///
413    /// # Performance
414    /// Like encoding, this method supports automatic parallelization for large payloads.
415    /// It verifies the validity of the Base64 input while decoding.
416    ///
417    /// # Returns
418    ///
419    /// * `Ok(usize)`: The actual number of bytes written to `output`.
420    /// * `Err(Error)`: If the input is invalid or the buffer is too small.
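    ///
    /// # Examples
    ///
    /// A minimal sketch decoding into a stack buffer (which must be at least
    /// [`estimate_decoded_len`](Self::estimate_decoded_len) bytes long):
    ///
    /// ```
    /// use base64_turbo::STANDARD;
    ///
    /// let mut buf = [0u8; 16];
    /// let n = STANDARD.decode_into(b"SGVsbG8=", &mut buf).unwrap();
    /// assert_eq!(&buf[..n], b"Hello");
    /// ```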
421    #[inline]
422    pub fn decode_into<T: AsRef<[u8]> + Sync>(
423        &self,
424        input: T,
425        output: &mut [u8],
426    ) -> Result<usize, Error> {
427        let input = input.as_ref();
428        let len = input.len();
429
430        if len == 0 {
431            return Ok(0);
432        }
433
434        let req_len = Self::estimate_decoded_len(self, len);
435        if output.len() < req_len {
436            return Err(Error::BufferTooSmall);
437        }
438
439        // --- Parallel Path ---
440        #[cfg(feature = "parallel")]
441        {
442            if len >= PARALLEL_THRESHOLD {
443                let out_slice = &mut output[..req_len];
444
445                // Parallel Reduce:
446                // 1. Split input/output into chunks.
447                // 2. Decode chunks independently.
448                // 3. Sum the number of bytes written by each chunk.
449                // 4. Return error if any chunk fails.
450                let real_len = out_slice
451                    .par_chunks_mut((DECODE_CHUNK_SIZE / 4) * 3)
452                    .zip(input.par_chunks(DECODE_CHUNK_SIZE))
453                    .try_fold(
454                        || 0usize,
455                        |acc, (out_chunk, in_chunk)| {
456                            let written = Self::decode_dispatch(self, in_chunk, out_chunk.as_mut_ptr())?;
457                            Ok(acc + written)
458                        },
459                    )
460                    .try_reduce(
461                        || 0usize,
462                        |a, b| Ok(a + b),
463                    )?;
464
465                return Ok(real_len);
466            }
467        }
468
469        // --- Serial Path ---
470        let real_len = Self::decode_dispatch(self, input, output[..req_len].as_mut_ptr())?;
471
472        Ok(real_len)
473    }
474
475    // ========================================================================
476    // Allocating APIs (std)
477    // ========================================================================
478
479    /// Allocates a new `String` and encodes the input data into it.
480    ///
481    /// This is the most convenient method for general usage.
482    ///
483    /// # Examples
484    ///
485    /// ```
486    /// use base64_turbo::STANDARD;
487    /// let b64 = STANDARD.encode(b"hello");
488    /// assert_eq!(b64, "aGVsbG8=");
489    /// ```
490    #[inline]
491    #[cfg(feature = "std")]
492    pub fn encode<T: AsRef<[u8]> + Sync>(&self, input: T) -> String {
493        let input = input.as_ref();
494
495        // 1. Calculate EXACT required size. Base64 encoding is deterministic.
496        let len = Self::encoded_len(self, input.len());
497
498        // 2. Allocate uninitialized buffer
499        let mut out = Vec::with_capacity(len);
500
501        // 3. Set length immediately
502        // SAFETY: We are about to overwrite the entire buffer in `encode_into`.
503        // We require a valid `&mut [u8]` slice for the internal logic (especially Rayon) to work.
504        // Since `encode_into` guarantees it writes exactly `len` bytes or fails (and we panic on fail),
505        // we won't expose uninitialized memory.
506        #[allow(clippy::uninit_vec)]
507        unsafe { out.set_len(len); }
508
509        // 4. Encode
510        // We trust our `encoded_len` math completely.
511        Self::encode_into(self, input, &mut out).expect("Base64 logic error: buffer size mismatch");
512
513        // 5. Convert to String
514        // SAFETY: The Base64 alphabet consists strictly of ASCII characters,
515        // which are valid UTF-8.
516        unsafe { String::from_utf8_unchecked(out) }
517    }
518
519    /// Allocates a new `Vec<u8>` and decodes the input data into it.
520    ///
521    /// # Errors
522    /// Returns `Error` if the input contains invalid characters or has an invalid length.
523    ///
524    /// # Examples
525    ///
526    /// ```
527    /// use base64_turbo::STANDARD;
528    /// let bytes = STANDARD.decode("aGVsbG8=").unwrap();
529    /// assert_eq!(bytes, b"hello");
530    /// ```
531    #[inline]
532    #[cfg(feature = "std")]
533    pub fn decode<T: AsRef<[u8]> + Sync>(&self, input: T) -> Result<Vec<u8>, Error> {
534        let input = input.as_ref();
535
536        // 1. Calculate MAXIMUM required size (upper bound)
537        let max_len = Self::estimate_decoded_len(self, input.len());
538
539        // 2. Allocate buffer
540        let mut out = Vec::with_capacity(max_len);
541
542        // 3. Set length to MAX
543        // SAFETY: We temporarily expose uninitialized memory to the `decode_into` function
544        // so it can write into the slice. We strictly sanitize the length in step 5.
545        #[allow(clippy::uninit_vec)]
546        unsafe { out.set_len(max_len); }
547
548        // 4. Decode
549        // `decode_into` handles parallel/serial dispatch and returns the `actual_len`.
550        match Self::decode_into(self, input, &mut out) {
551            Ok(actual_len) => {
552                // 5. Shrink to fit the real data
553                // SAFETY: `decode_into` reported it successfully wrote `actual_len` valid bytes.
554                // We truncate the Vec to this length, discarding any trailing garbage/uninitialized memory.
555                unsafe { out.set_len(actual_len); }
556                Ok(out)
557            }
558            Err(e) => {
559                // SAFETY: If an error occurred, we force the length to 0.
560                // This prevents the caller from accidentally inspecting uninitialized memory
561                // if they were to (incorrectly) reuse the Vec from a partial result.
562                unsafe { out.set_len(0); }
563                Err(e)
564            }
565        }
566    }
567
568    // ========================================================================
569    // Internal Dispatchers
570    // ========================================================================
571
572    #[inline(always)]
573    fn encode_dispatch(&self, input: &[u8], dst: *mut u8) {
574        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
575        #[cfg(feature = "simd")]
576        {
577            let len = input.len();
578
579            #[cfg(feature = "avx512")]
580            // Smart degrade: If len < 64, don't bother checking AVX512 features or setting up the ZMM registers
581            if len >= 64 
582                && std::is_x86_feature_detected!("avx512f") 
583                && std::is_x86_feature_detected!("avx512bw") 
584            {
585                unsafe { simd::encode_slice_avx512(&self.config, input, dst); }
586                return;
587            }
588
589            // Smart degrade: If len < 32, skip AVX2.
590            if len >= 32 && std::is_x86_feature_detected!("avx2") {
591                unsafe { simd::encode_slice_avx2(&self.config, input, dst); }
592                return;
593            }
594
595            // Smart degrade: If len < 16, skip SSE4.1 and go straight to scalar.
596            if len >= 16 && std::is_x86_feature_detected!("sse4.1")  {
597                unsafe { simd::encode_slice_simd(&self.config, input, dst); }
598                return;
599            }
600        }
601
602        // Fallback: Scalar / Non-x86 / Short inputs
603        // Safety: Pointers verified by caller
604        unsafe { scalar::encode_slice_unsafe(&self.config, input, dst); }
605    }
606
607    #[inline(always)]
608    fn decode_dispatch(&self, input: &[u8], dst: *mut u8) -> Result<usize, Error> {
609        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
610        #[cfg(feature = "simd")]
611        {
612            let len = input.len();
613
614            #[cfg(feature = "avx512")]
615            // Smart degrade: Don't enter AVX512 path if we don't have a full vector of input.
616            if len >= 64 
617                && std::is_x86_feature_detected!("avx512f") 
618                && std::is_x86_feature_detected!("avx512bw") 
619            {
620                return unsafe { simd::decode_slice_avx512(&self.config, input, dst) };
621            }
622
623            // Smart degrade: Fallback to AVX2 if len is between 32 and 64, or if AVX512 is missing.
624            if len >= 32 && std::is_x86_feature_detected!("avx2") {
625                return unsafe { simd::decode_slice_avx2(&self.config, input, dst) };
626            }
627
628            // Smart degrade: Fallback to SSE4.1 if len is between 16 and 32, or if AVX2 is unavailable.
629            if len >= 16 && std::is_x86_feature_detected!("sse4.1")  {
630                return unsafe { simd::decode_slice_simd(&self.config, input, dst) };
631            }
632        }
633
634        // Fallback: Scalar / Non-x86 / Short inputs
635        // Safety: Pointers verified by caller
636        unsafe { scalar::decode_slice_unsafe(&self.config, input, dst) }
637    }
638}
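
// A minimal round-trip sketch for quick local verification; it exercises only the public
// API defined above (the crate's full test, Kani, and MIRI harnesses run in CI, see the
// workflow badges in the module docs).
#[cfg(all(test, feature = "std"))]
mod roundtrip_sanity {
    use super::*;

    #[test]
    fn all_engines_roundtrip() {
        // Every byte value, so both alphabets and the padding logic are exercised.
        let data: Vec<u8> = (0u8..=255).collect();
        for engine in [STANDARD, STANDARD_NO_PAD, URL_SAFE, URL_SAFE_NO_PAD] {
            let encoded = engine.encode(&data);
            let decoded = engine.decode(&encoded).expect("decoding freshly encoded data");
            assert_eq!(decoded, data);
        }
    }
}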