base64_turbo/lib.rs
1//! # Base64 Turbo
2//!
//! [crates.io](https://crates.io/crates/base64-turbo)
//! [docs.rs](https://docs.rs/base64-turbo)
//! [License](https://github.com/hacer-bark/base64-turbo/blob/main/LICENSE)
//! [Verification CI](https://github.com/hacer-bark/base64-turbo/actions/workflows/verification.yml)
//! [Miri CI](https://github.com/hacer-bark/base64-turbo/actions/workflows/miri.yml)
//! [Tests CI](https://github.com/hacer-bark/base64-turbo/actions/workflows/tests.yml)
9//!
10//! A SIMD-accelerated Base64 encoder/decoder for Rust, optimized for high-throughput systems.
11//!
12//! This crate provides runtime CPU detection to utilize AVX2, SSE4.1, or AVX512 (via feature flag) intrinsics.
13//! It includes a highly optimized scalar fallback for non-SIMD targets and supports `no_std` environments.
14//!
15//! ## Usage
16//!
17//! Add this to your `Cargo.toml`:
18//!
19//! ```toml
20//! [dependencies]
21//! base64-turbo = "0.1"
22//! ```
23//!
24//! ### Basic API (Allocating)
25//!
26//! Standard usage for general applications. Requires the `std` feature (enabled by default).
27//!
28//! ```rust
29//! # #[cfg(feature = "std")]
30//! # {
31//! use base64_turbo::STANDARD;
32//!
33//! let data = b"Hello world";
34//!
35//! // Encode to String
36//! let encoded = STANDARD.encode(data);
37//! assert_eq!(encoded, "SGVsbG8gd29ybGQ=");
38//!
39//! // Decode to Vec<u8>
40//! let decoded = STANDARD.decode(&encoded).unwrap();
41//! assert_eq!(decoded, data);
42//! # }
43//! ```
44//!
45//! ### Zero-Allocation API (Slice-based)
46//!
47//! For low-latency scenarios or `no_std` environments where heap allocation is undesirable.
48//! These methods write directly into a user-provided mutable slice.
49//!
50//! ```rust
51//! use base64_turbo::STANDARD;
52//!
53//! let input = b"Raw bytes";
54//! let mut output = [0u8; 64]; // Pre-allocated stack buffer
55//!
56//! // Returns Result<usize, Error> indicating bytes written
57//! let len = STANDARD.encode_into(input, &mut output).unwrap();
58//!
59//! assert_eq!(&output[..len], b"UmF3IGJ5dGVz");
60//! ```
61//!
62//! ## Feature Flags
63//!
64//! This crate is highly configurable via Cargo features:
65//!
66//! | Feature | Default | Description |
67//! |---------|---------|-------------|
68//! | **`std`** | **Yes** | Enables `String` and `Vec` support. Disable this for `no_std` environments. |
//! | **`simd`** | **Yes** | Enables runtime detection for AVX2 and SSE4.1 intrinsics. If disabled or unsupported by the hardware, the crate automatically falls back to scalar logic. |
70//! | **`parallel`** | **No** | Enables [Rayon](https://crates.io/crates/rayon) support. Automatically parallelizes processing for payloads larger than 512KB. Recommended only for massive data ingestion tasks. |
71//! | **`avx512`** | **No** | Enables AVX512 intrinsics. |
72//!
73//! ## Safety & Verification
74//!
75//! This crate utilizes `unsafe` code for SIMD intrinsics and pointer arithmetic to achieve maximum performance.
76//!
//! * **Formal Verification (Kani):** Scalar (Done), SSE4.1 (In Progress), AVX2 (Done), AVX512 (In Progress) code is mathematically proven to be free of undefined behavior and panics.
78//! * **MIRI Tests:** Core SIMD logic and scalar fallbacks are verified with **MIRI** (Undefined Behavior checker) in CI.
79//! * **Fuzzing:** The codebase is fuzz-tested via `cargo-fuzz`.
80//! * **Fallback:** Invalid or unsupported hardware instruction sets are detected at runtime, ensuring safe fallback to scalar code.
81//!
82//! **[Learn More](https://github.com/hacer-bark/base64-turbo/blob/main/docs/verification.md)**: Details on our threat model and formal verification strategy.
83
84#![cfg_attr(not(any(feature = "std", test)), no_std)]
85
86#![doc(issue_tracker_base_url = "https://github.com/hacer-bark/base64-turbo/issues/")]
87
88#![deny(unsafe_op_in_unsafe_fn)]
89#![warn(missing_docs)]
90#![warn(rust_2018_idioms)]
91#![warn(unused_qualifications)]
92
93#![cfg_attr(docsrs, feature(doc_cfg))]
94
95#[cfg(feature = "parallel")]
96use rayon::prelude::*;
97
98#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
99#[cfg(feature = "simd")]
100mod simd;
101mod scalar;
102
103// ======================================================================
104// ERROR DEFINITION
105// ======================================================================
106
/// Errors that can occur during Base64 encoding or decoding operations.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Error {
    /// The input length is invalid for Base64 decoding.
    ///
    /// Padded Base64 data must have a length that is a multiple of 4.
    /// Truncated input or input with a malformed padding length produces this error.
    InvalidLength,

    /// An invalid character was encountered during decoding.
    ///
    /// Returned when the input contains bytes outside the selected Base64
    /// alphabet (e.g., symbols not in the standard set) or when padding
    /// characters (`=`) appear in invalid positions.
    InvalidCharacter,

    /// The provided output buffer is too small to hold the result.
    ///
    /// Returned by the zero-allocation APIs (e.g., `encode_into`, `decode_into`)
    /// when the caller-supplied destination slice does not have enough capacity
    /// to store the encoded or decoded data.
    BufferTooSmall,
}

impl Error {
    /// Static human-readable description for each variant.
    fn message(&self) -> &'static str {
        match self {
            Self::InvalidLength => "Invalid Base64 input length (must be divisible by 4)",
            Self::InvalidCharacter => "Invalid character found in Base64 input",
            Self::BufferTooSmall => "Destination buffer is too small",
        }
    }
}

// `core::fmt::Display` keeps error messages available in `no_std` builds.
impl core::fmt::Display for Error {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str(self.message())
    }
}

// Integrate with the std error ecosystem when the 'std' feature is active.
#[cfg(feature = "std")]
impl std::error::Error for Error {}
145
146// ======================================================================
147// Internal
148// Tuning Constants (Parallelism)
149// ======================================================================
150
/// Input chunk size for parallel processing (24 KB).
///
/// This size is chosen to fit comfortably within the L1/L2 cache of most modern
/// CPUs, ensuring that hot loops inside the encoder stay cache-resident.
/// 24 KB is also divisible by 3, so every chunk encodes to a whole number of
/// 4-character Base64 groups and chunks can be encoded independently.
#[cfg(feature = "parallel")]
const ENCODE_CHUNK_SIZE: usize = 24 * 1024;

/// Encoded-side chunk size corresponding to `ENCODE_CHUNK_SIZE` (32 KB).
///
/// Base64 encoding expands data by 4/3, so 24 KB of raw input produces 32 KB of
/// Base64 text. Despite the "output" phrasing one might expect from the name,
/// this constant is used to chunk the *input* (Base64 text) during parallel
/// decoding; each such chunk decodes into at most `ENCODE_CHUNK_SIZE` raw bytes.
#[cfg(feature = "parallel")]
const DECODE_CHUNK_SIZE: usize = (ENCODE_CHUNK_SIZE / 3) * 4;

/// Threshold to enable Rayon parallelism (512 KB).
///
/// For payloads smaller than this, the overhead of context switching and
/// thread synchronization outweighs the throughput gains of multi-threading.
#[cfg(feature = "parallel")]
const PARALLEL_THRESHOLD: usize = 512 * 1024;
170
171// ======================================================================
172// Internal Lookup Tables
173// ======================================================================
174
/// The Standard RFC 4648 Base64 Alphabet.
/// Used for `STANDARD` and `STANDARD_NO_PAD`.
const STANDARD_ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/// The URL-Safe Base64 Alphabet.
/// Replaces `+` with `-` and `/` with `_`. Used for `URL_SAFE` and `URL_SAFE_NO_PAD`.
const URL_SAFE_ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

/// Builds a compile-time reverse lookup table for a 64-character alphabet.
///
/// The result maps each ASCII byte of `alphabet` back to its 6-bit index;
/// every other byte maps to `0xFF`, the sentinel for "invalid character".
/// Shared by both alphabets to avoid duplicating the const-eval loop.
/// A `while` loop is used because `for` is not allowed in `const fn`.
const fn build_decode_table(alphabet: &[u8; 64]) -> [u8; 256] {
    let mut table = [0xFF; 256];
    let mut i = 0;
    while i < 64 {
        table[alphabet[i] as usize] = i as u8;
        i += 1;
    }
    table
}

/// Computed compile-time reverse lookup table for the Standard alphabet.
/// Maps ASCII bytes back to 6-bit indices. 0xFF indicates an invalid character.
const STANDARD_DECODE_TABLE: [u8; 256] = build_decode_table(STANDARD_ALPHABET);

/// Computed compile-time reverse lookup table for the URL-Safe alphabet.
/// Maps ASCII bytes back to 6-bit indices. 0xFF indicates an invalid character.
const URL_SAFE_DECODE_TABLE: [u8; 256] = build_decode_table(URL_SAFE_ALPHABET);
206
207// ======================================================================
208// Configuration & Types
209// ======================================================================
210
/// Internal configuration for the Base64 engine.
///
/// This struct uses `repr(C)` to ensure predictable memory layout.
/// NOTE(review): `Config` is passed by reference into the `simd`/`scalar`
/// backends; presumably `repr(C)` pins the field order for those unsafe
/// paths — confirm whether anything actually depends on the layout.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub(crate) struct Config {
    /// If true, uses `-` and `_` instead of `+` and `/`.
    pub url_safe: bool,
    /// If true, writes `=` padding characters to the output.
    pub padding: bool,
}
222
/// A high-performance, stateless Base64 encoder/decoder.
///
/// This struct holds the configuration for encoding/decoding (alphabet choice and padding).
/// It is designed to be immutable and thread-safe.
///
/// `Engine` is `Copy` and holds only two booleans; outside this crate it is
/// obtained through the predefined constants ([`STANDARD`], [`STANDARD_NO_PAD`],
/// [`URL_SAFE`], [`URL_SAFE_NO_PAD`]) because `config` is `pub(crate)`.
///
/// # Examples
///
/// ```rust
/// # #[cfg(feature = "std")]
/// # {
/// use base64_turbo::STANDARD;
///
/// let data = b"Hello world";
///
/// // Encode to String
/// let encoded = STANDARD.encode(data);
/// assert_eq!(encoded, "SGVsbG8gd29ybGQ=");
///
/// // Decode to Result<Vec<u8>, Error>
/// let decoded = STANDARD.decode(&encoded).unwrap();
/// assert_eq!(decoded, data);
/// # }
/// ```
#[derive(Debug, Clone, Copy)]
pub struct Engine {
    // Alphabet + padding configuration; read by every encode/decode path.
    pub(crate) config: Config,
}
250
251// ======================================================================
252// Pre-defined Engines
253// ======================================================================
254
255/// Standard Base64 (RFC 4648) with padding (`=`).
256///
257/// Uses the `+` and `/` characters. This is the most common configuration.
258pub const STANDARD: Engine = Engine {
259 config: Config {
260 url_safe: false,
261 padding: true,
262 },
263};
264
265/// Standard Base64 (RFC 4648) **without** padding.
266///
267/// Uses the `+` and `/` characters, but omits trailing `=` signs.
268/// Useful for raw data streams or specific protocol requirements.
269pub const STANDARD_NO_PAD: Engine = Engine {
270 config: Config {
271 url_safe: false,
272 padding: false,
273 },
274};
275
276/// URL-Safe Base64 with padding.
277///
278/// Uses `-` and `_` instead of `+` and `/`. Safe for use in filenames and URLs.
279pub const URL_SAFE: Engine = Engine {
280 config: Config {
281 url_safe: true,
282 padding: true,
283 },
284};
285
286/// URL-Safe Base64 **without** padding.
287///
288/// Uses `-` and `_`. Commonly used in JWTs (JSON Web Tokens) and other web standards.
289pub const URL_SAFE_NO_PAD: Engine = Engine {
290 config: Config {
291 url_safe: true,
292 padding: false,
293 },
294};
295
296impl Engine {
297 // ======================================================================
298 // Length Calculators
299 // ======================================================================
300
301 /// Calculates the exact buffer size required to encode `input_len` bytes.
302 ///
303 /// This method computes the size based on the current configuration (padding vs. no padding).
304 ///
305 /// # Examples
306 ///
307 /// ```
308 /// use base64_turbo::STANDARD;
309 ///
310 /// assert_eq!(STANDARD.encoded_len(3), 4);
311 /// assert_eq!(STANDARD.encoded_len(1), 4); // With padding
312 /// ```
313 #[inline]
314 #[must_use]
315 pub const fn encoded_len(&self, input_len: usize) -> usize {
316 if self.config.padding {
317 // (n + 2) / 3 * 4
318 input_len.div_ceil(3) * 4
319 } else {
320 // (n * 4 + 2) / 3
321 (input_len * 4).div_ceil(3)
322 }
323 }
324
    /// Calculates the **maximum** buffer size required to decode `input_len` bytes.
    ///
    /// # Note
    /// This is an upper-bound estimate. The actual number of bytes written during
    /// decoding will likely be smaller.
    ///
    /// You should rely on the `usize` returned by [`decode_into`](Self::decode_into)
    /// to determine the actual valid slice of the output buffer.
    #[inline]
    #[must_use]
    pub const fn estimate_decoded_len(&self, input_len: usize) -> usize {
        // Conservative estimate: 3 bytes for every 4 chars, plus a safety margin
        // for unpadded/chunked logic.
        // The unconditional `+ 1` group (3 extra bytes) covers unpadded inputs whose
        // length is not a multiple of 4 (a 1..=3 char remainder decodes to 1..=2
        // bytes), at the cost of over-requiring 3 bytes for exact multiples of 4.
        (input_len / 4 + 1) * 3
    }
340
341 // ======================================================================
342 // Zero-Allocation APIs
343 // ======================================================================
344
    /// Encodes `input` into the provided `output` buffer.
    ///
    /// This is a "Zero-Allocation" API designed for hot paths. It writes directly
    /// into the destination slice without creating intermediate `Vec`.
    ///
    /// # Parallelism
    /// If the `parallel` feature is enabled and the input size exceeds the
    /// internal threshold (default: 512KB), this method automatically uses
    /// Rayon to process chunks in parallel, saturating memory bandwidth.
    ///
    /// # Arguments
    ///
    /// * `input`: The binary data to encode.
    /// * `output`: A mutable slice to write the Base64 string into.
    ///
    /// # Returns
    ///
    /// * `Ok(usize)`: The actual number of bytes written to `output`.
    /// * `Err(Error::BufferTooSmall)`: If `output.len()` is less than [`encoded_len`](Self::encoded_len).
    #[inline]
    pub fn encode_into<T: AsRef<[u8]> + Sync>(
        &self,
        input: T,
        output: &mut [u8],
    ) -> Result<usize, Error> {
        let input = input.as_ref();
        let len = input.len();

        // Empty input encodes to nothing; bail out before any buffer math.
        if len == 0 {
            return Ok(0);
        }

        let req_len = Self::encoded_len(self, len);
        if output.len() < req_len {
            return Err(Error::BufferTooSmall);
        }

        // --- Parallel Path ---
        #[cfg(feature = "parallel")]
        {
            if len >= PARALLEL_THRESHOLD {
                // Split input and output into corresponding chunks
                let out_slice = &mut output[..req_len];

                // Base64 expands 3 bytes -> 4 chars.
                // We chunk based on ENCODE_CHUNK_SIZE (24KB) to stay cache-friendly.
                // ENCODE_CHUNK_SIZE is divisible by 3, so every non-final input
                // chunk maps exactly onto a full-size output chunk, and the final
                // (possibly short) input chunk pairs with the final output chunk —
                // the two `par_chunks` iterators therefore stay in lockstep.
                out_slice
                    .par_chunks_mut((ENCODE_CHUNK_SIZE / 3) * 4)
                    .zip(input.par_chunks(ENCODE_CHUNK_SIZE))
                    .for_each(|(out_chunk, in_chunk)| {
                        // Safe: We know the chunk sizes match the expansion ratio logic
                        Self::encode_dispatch(self, in_chunk, out_chunk.as_mut_ptr());
                    });

                return Ok(req_len);
            }
        }

        // --- Serial Path ---
        // Pass the raw pointer to the dispatcher.
        // SAFETY: We checked output.len() >= req_len above, so the pointer is
        // valid for exactly the number of bytes the encoder will write.
        Self::encode_dispatch(self, input, output[..req_len].as_mut_ptr());

        Ok(req_len)
    }
410
    /// Decodes `input` into the provided `output` buffer.
    ///
    /// # Performance
    /// Like encoding, this method supports automatic parallelization for large payloads.
    /// It verifies the validity of the Base64 input while decoding.
    ///
    /// # Buffer sizing
    /// `output` must hold at least [`estimate_decoded_len`](Self::estimate_decoded_len)
    /// bytes. That estimate is an upper bound, so `BufferTooSmall` can be returned
    /// even when the *actual* decoded size would have fit.
    ///
    /// # Returns
    ///
    /// * `Ok(usize)`: The actual number of bytes written to `output`.
    /// * `Err(Error)`: If the input is invalid or the buffer is too small.
    #[inline]
    pub fn decode_into<T: AsRef<[u8]> + Sync>(
        &self,
        input: T,
        output: &mut [u8],
    ) -> Result<usize, Error> {
        let input = input.as_ref();
        let len = input.len();

        // Empty input decodes to nothing.
        if len == 0 {
            return Ok(0);
        }

        let req_len = Self::estimate_decoded_len(self, len);
        if output.len() < req_len {
            return Err(Error::BufferTooSmall);
        }

        // --- Parallel Path ---
        #[cfg(feature = "parallel")]
        {
            if len >= PARALLEL_THRESHOLD {
                let out_slice = &mut output[..req_len];

                // Parallel Reduce:
                // 1. Split input/output into chunks. DECODE_CHUNK_SIZE is a
                //    multiple of 4, so every non-final input chunk is a whole
                //    number of Base64 quads pairing with a (DECODE_CHUNK_SIZE/4)*3
                //    byte output chunk.
                // 2. Decode chunks independently.
                // 3. Sum the number of bytes written by each chunk.
                // 4. Return error if any chunk fails.
                //
                // NOTE(review): summing per-chunk counts assumes every non-final
                // chunk decodes to its full output size. A `=` inside a non-final
                // chunk would shorten that chunk's output, leaving a gap before the
                // next chunk's bytes while still returning `Ok`. Confirm
                // `decode_dispatch` rejects interior padding; otherwise input that
                // the serial path rejects could be accepted here with garbage gaps.
                let real_len = out_slice
                    .par_chunks_mut((DECODE_CHUNK_SIZE / 4) * 3)
                    .zip(input.par_chunks(DECODE_CHUNK_SIZE))
                    .try_fold(
                        || 0usize,
                        |acc, (out_chunk, in_chunk)| {
                            let written = Self::decode_dispatch(self, in_chunk, out_chunk.as_mut_ptr())?;
                            Ok(acc + written)
                        },
                    )
                    .try_reduce(
                        || 0usize,
                        |a, b| Ok(a + b),
                    )?;

                return Ok(real_len);
            }
        }

        // --- Serial Path ---
        // SAFETY: `output[..req_len]` was bounds-checked above, so the pointer is
        // valid for the maximum number of bytes the decoder can write.
        let real_len = Self::decode_dispatch(self, input, output[..req_len].as_mut_ptr())?;

        Ok(real_len)
    }
474
475 // ========================================================================
476 // Allocating APIs (std)
477 // ========================================================================
478
479 /// Allocates a new `String` and encodes the input data into it.
480 ///
481 /// This is the most convenient method for general usage.
482 ///
483 /// # Examples
484 ///
485 /// ```
486 /// use base64_turbo::STANDARD;
487 /// let b64 = STANDARD.encode(b"hello");
488 /// assert_eq!(b64, "aGVsbG8=");
489 /// ```
490 #[inline]
491 #[cfg(feature = "std")]
492 pub fn encode<T: AsRef<[u8]> + Sync>(&self, input: T) -> String {
493 let input = input.as_ref();
494
495 // 1. Calculate EXACT required size. Base64 encoding is deterministic.
496 let len = Self::encoded_len(self, input.len());
497
498 // 2. Allocate uninitialized buffer
499 let mut out = Vec::with_capacity(len);
500
501 // 3. Set length immediately
502 // SAFETY: We are about to overwrite the entire buffer in `encode_into`.
503 // We require a valid `&mut [u8]` slice for the internal logic (especially Rayon) to work.
504 // Since `encode_into` guarantees it writes exactly `len` bytes or fails (and we panic on fail),
505 // we won't expose uninitialized memory.
506 #[allow(clippy::uninit_vec)]
507 unsafe { out.set_len(len); }
508
509 // 4. Encode
510 // We trust our `encoded_len` math completely.
511 Self::encode_into(self, input, &mut out).expect("Base64 logic error: buffer size mismatch");
512
513 // 5. Convert to String
514 // SAFETY: The Base64 alphabet consists strictly of ASCII characters,
515 // which are valid UTF-8.
516 unsafe { String::from_utf8_unchecked(out) }
517 }
518
519 /// Allocates a new `Vec<u8>` and decodes the input data into it.
520 ///
521 /// # Errors
522 /// Returns `Error` if the input contains invalid characters or has an invalid length.
523 ///
524 /// # Examples
525 ///
526 /// ```
527 /// use base64_turbo::STANDARD;
528 /// let bytes = STANDARD.decode("aGVsbG8=").unwrap();
529 /// assert_eq!(bytes, b"hello");
530 /// ```
531 #[inline]
532 #[cfg(feature = "std")]
533 pub fn decode<T: AsRef<[u8]> + Sync>(&self, input: T) -> Result<Vec<u8>, Error> {
534 let input = input.as_ref();
535
536 // 1. Calculate MAXIMUM required size (upper bound)
537 let max_len = Self::estimate_decoded_len(self, input.len());
538
539 // 2. Allocate buffer
540 let mut out = Vec::with_capacity(max_len);
541
542 // 3. Set length to MAX
543 // SAFETY: We temporarily expose uninitialized memory to the `decode_into` function
544 // so it can write into the slice. We strictly sanitize the length in step 5.
545 #[allow(clippy::uninit_vec)]
546 unsafe { out.set_len(max_len); }
547
548 // 4. Decode
549 // `decode_into` handles parallel/serial dispatch and returns the `actual_len`.
550 match Self::decode_into(self, input, &mut out) {
551 Ok(actual_len) => {
552 // 5. Shrink to fit the real data
553 // SAFETY: `decode_into` reported it successfully wrote `actual_len` valid bytes.
554 // We truncate the Vec to this length, discarding any trailing garbage/uninitialized memory.
555 unsafe { out.set_len(actual_len); }
556 Ok(out)
557 }
558 Err(e) => {
559 // SAFETY: If an error occurred, we force the length to 0.
560 // This prevents the caller from accidentally inspecting uninitialized memory
561 // if they were to (incorrectly) reuse the Vec from a partial result.
562 unsafe { out.set_len(0); }
563 Err(e)
564 }
565 }
566 }
567
568 // ========================================================================
569 // Internal Dispatchers
570 // ========================================================================
571
    /// Internal dispatcher: routes encoding to the widest SIMD backend the CPU
    /// supports at runtime, degrading to scalar code for short inputs or when a
    /// feature is unavailable. Writes the encoded output through `dst`.
    ///
    /// The caller must guarantee `dst` is valid for at least
    /// `self.encoded_len(input.len())` writable bytes.
    ///
    /// NOTE(review): `std::is_x86_feature_detected!` requires `std`. If the
    /// `simd` feature can be enabled in a `no_std` build, this will not compile;
    /// confirm that Cargo.toml makes `simd` imply `std` (or use a compile-time
    /// `cfg(target_feature)` check instead).
    #[inline(always)]
    fn encode_dispatch(&self, input: &[u8], dst: *mut u8) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "simd")]
        {
            let len = input.len();

            #[cfg(feature = "avx512")]
            // Smart degrade: If len < 64, don't bother checking AVX512 features or setting up ZMM register
            if len >= 64
                && std::is_x86_feature_detected!("avx512f")
                && std::is_x86_feature_detected!("avx512bw")
            {
                unsafe { simd::encode_slice_avx512(&self.config, input, dst); }
                return;
            }

            // Smart degrade: If len < 32, skip AVX2.
            if len >= 32 && std::is_x86_feature_detected!("avx2") {
                unsafe { simd::encode_slice_avx2(&self.config, input, dst); }
                return;
            }

            // Smart degrade: If len < 16, skip SSE4.1 and go straight to scalar.
            if len >= 16 && std::is_x86_feature_detected!("sse4.1") {
                unsafe { simd::encode_slice_simd(&self.config, input, dst); }
                return;
            }
        }

        // Fallback: Scalar / Non-x86 / Short inputs
        // Safety: Pointers verified by caller
        unsafe { scalar::encode_slice_unsafe(&self.config, input, dst); }
    }
606
    /// Internal dispatcher: routes decoding to the widest SIMD backend the CPU
    /// supports at runtime, degrading to scalar code for short inputs or when a
    /// feature is unavailable. Returns the number of bytes written through `dst`,
    /// or a validation error.
    ///
    /// The caller must guarantee `dst` is valid for at least
    /// `self.estimate_decoded_len(input.len())` writable bytes.
    ///
    /// NOTE(review): as with `encode_dispatch`, `std::is_x86_feature_detected!`
    /// requires `std`; confirm the `simd` feature cannot be enabled in a
    /// `no_std` build.
    #[inline(always)]
    fn decode_dispatch(&self, input: &[u8], dst: *mut u8) -> Result<usize, Error> {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "simd")]
        {
            let len = input.len();

            #[cfg(feature = "avx512")]
            // Smart degrade: Don't enter AVX512 path if we don't have a full vector of input.
            if len >= 64
                && std::is_x86_feature_detected!("avx512f")
                && std::is_x86_feature_detected!("avx512bw")
            {
                return unsafe { simd::decode_slice_avx512(&self.config, input, dst) };
            }

            // Smart degrade: Fallback to AVX2 if len is between 32 and 64, or if AVX512 is missing.
            if len >= 32 && std::is_x86_feature_detected!("avx2") {
                return unsafe { simd::decode_slice_avx2(&self.config, input, dst) };
            }

            // Smart degrade: Fallback to SSE4.1 if len is between 16 and 32.
            if len >= 16 && std::is_x86_feature_detected!("sse4.1") {
                return unsafe { simd::decode_slice_simd(&self.config, input, dst) };
            }
        }

        // Fallback: Scalar / Non-x86 / Short inputs
        // Safety: Pointers verified by caller
        unsafe { scalar::decode_slice_unsafe(&self.config, input, dst) }
    }
638}