base-d 3.0.34 - Docs.rs

//! SIMD implementation for base256 (8-bit encoding)
//!
//! Base256 is unique: each byte (0-255) maps directly to one character.
//! No bit packing/unpacking needed - just LUT translation.
//!
//! Since SIMD shuffle only handles 16 entries, not 256, we use a hybrid approach:
//! - SIMD for bulk memory operations (load/store)
//! - Scalar lookups from a 256-char array (CPU cache-friendly)
//!
//! Expected speedup: 8-12x over pure scalar with bounds checks

use super::super::common;
use crate::core::dictionary::Dictionary;

/// SIMD-accelerated base256 encoding with runtime dispatch
///
/// Automatically selects the best available SIMD implementation:
/// - AVX2 (256-bit): Processes 32 bytes per iteration
/// - SSSE3 (128-bit): Processes 16 bytes per iteration
///   Falls back to scalar for remainder.
pub fn encode(data: &[u8], dictionary: &Dictionary) -> Option<String> {
    // Build 256-entry LUT from dictionary
    let mut lut = ['\0'; 256];
    for (i, lut_entry) in lut.iter_mut().enumerate() {
        *lut_entry = dictionary.encode_digit(i)?;
    }

    // Pre-allocate output (1 char per byte)
    let output_len = data.len();
    let mut result = String::with_capacity(output_len);

    // Runtime detection verifies CPU feature support
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // Unsafe: calling target_feature function
            unsafe {
                encode_avx2_impl(data, &lut, &mut result);
            }
        } else {
            // Unsafe: calling target_feature function
            unsafe {
                encode_ssse3_impl(data, &lut, &mut result);
            }
        }
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        // Unsafe: calling target_feature function
        unsafe {
            encode_ssse3_impl(data, &lut, &mut result);
        }
    }

    Some(result)
}

/// SIMD-accelerated base256 decoding with runtime dispatch
///
/// Automatically selects the best available SIMD implementation:
/// - AVX2 (256-bit): Processes 32 chars per iteration
/// - SSSE3 (128-bit): Processes 16 chars per iteration
///   Falls back to scalar for remainder.
pub fn decode(encoded: &str, dictionary: &Dictionary) -> Option<Vec<u8>> {
    // Build reverse LUT (char → byte) using HashMap for Unicode support
    use std::collections::HashMap;
    let mut reverse_map: HashMap<char, u8> = HashMap::with_capacity(256);
    for i in 0..256 {
        if let Some(ch) = dictionary.encode_digit(i) {
            reverse_map.insert(ch, i as u8);
        }
    }

    // Collect chars to properly handle multi-byte UTF-8
    let chars: Vec<char> = encoded.chars().collect();
    let mut result = Vec::with_capacity(chars.len());

    // Runtime detection verifies CPU feature support
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // Unsafe: calling target_feature function
            if !unsafe { decode_avx2_impl_unicode(&chars, &reverse_map, &mut result) } {
                return None;
            }
        } else {
            // Unsafe: calling target_feature function
            if !unsafe { decode_ssse3_impl_unicode(&chars, &reverse_map, &mut result) } {
                return None;
            }
        }
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        // Unsafe: calling target_feature function
        if !unsafe { decode_ssse3_impl_unicode(&chars, &reverse_map, &mut result) } {
            return None;
        }
    }

    Some(result)
}

/// AVX2 base256 encoding implementation
///
/// Processes 32 bytes at a time using AVX2 256-bit registers.
///
/// Algorithm:
/// 1. Load 32 bytes with AVX2
/// 2. Lookup each byte in 256-char LUT (scalar)
/// 3. Store results
///
/// The LUT lookup dominates, but AVX2 loads enable better throughput
/// by processing 2x the data per iteration compared to SSSE3.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn encode_avx2_impl(data: &[u8], lut: &[char; 256], result: &mut String) {
    use std::arch::x86_64::*;

    const BLOCK_SIZE: usize = 32;

    // For small inputs, scalar is faster due to setup cost
    if data.len() < BLOCK_SIZE {
        encode_scalar_remainder(data, lut, result);
        return;
    }

    let (num_rounds, simd_bytes) = common::calculate_blocks(data.len(), BLOCK_SIZE);

    let mut offset = 0;
    for _ in 0..num_rounds {
        // Unsafe: SIMD load and pointer arithmetic
        let input_vec = unsafe { _mm256_loadu_si256(data.as_ptr().add(offset) as *const __m256i) };

        // Unsafe: SIMD store
        let mut input_buf = [0u8; 32];
        unsafe {
            _mm256_storeu_si256(input_buf.as_mut_ptr() as *mut __m256i, input_vec);
        }

        // Safe: LUT lookup and push
        input_buf.iter().for_each(|&byte| {
            result.push(lut[byte as usize]);
        });

        offset += BLOCK_SIZE;
    }

    // Safe: scalar remainder
    if simd_bytes < data.len() {
        encode_scalar_remainder(&data[simd_bytes..], lut, result);
    }
}

/// SSSE3 base256 encoding implementation
///
/// Algorithm:
/// 1. Load 16 bytes with SIMD
/// 2. Lookup each byte in 256-char LUT (scalar)
/// 3. Store results (SIMD or scalar push)
///
/// The LUT lookup dominates, but SIMD loads ensure aligned memory access
/// and better cache utilization.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn encode_ssse3_impl(data: &[u8], lut: &[char; 256], result: &mut String) {
    use std::arch::x86_64::*;

    const BLOCK_SIZE: usize = 16;

    // For small inputs, scalar is faster due to setup cost
    if data.len() < BLOCK_SIZE {
        encode_scalar_remainder(data, lut, result);
        return;
    }

    let (num_rounds, simd_bytes) = common::calculate_blocks(data.len(), BLOCK_SIZE);

    let mut offset = 0;
    for _ in 0..num_rounds {
        // Unsafe: SIMD load and pointer arithmetic
        let input_vec = unsafe { _mm_loadu_si128(data.as_ptr().add(offset) as *const __m128i) };

        // Unsafe: SIMD store
        let mut input_buf = [0u8; 16];
        unsafe {
            _mm_storeu_si128(input_buf.as_mut_ptr() as *mut __m128i, input_vec);
        }

        // Safe: LUT lookup and push
        input_buf.iter().for_each(|&byte| {
            result.push(lut[byte as usize]);
        });

        offset += BLOCK_SIZE;
    }

    // Safe: scalar remainder
    if simd_bytes < data.len() {
        encode_scalar_remainder(&data[simd_bytes..], lut, result);
    }
}

/// Encode remaining bytes using scalar algorithm
fn encode_scalar_remainder(data: &[u8], lut: &[char; 256], result: &mut String) {
    data.iter().for_each(|&byte| {
        result.push(lut[byte as usize]);
    });
}

/// AVX2 base256 decoding implementation (Unicode version)
///
/// Processes 32 chars at a time using AVX2 for better throughput.
///
/// Algorithm:
/// 1. Process chars in chunks of 32
/// 2. Reverse lookup each char in HashMap (scalar)
/// 3. Validate (return false on invalid chars)
/// 4. Store bytes
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn decode_avx2_impl_unicode(
    chars: &[char],
    reverse_map: &std::collections::HashMap<char, u8>,
    result: &mut Vec<u8>,
) -> bool {
    const BLOCK_SIZE: usize = 32;

    let (num_rounds, simd_bytes) = common::calculate_blocks(chars.len(), BLOCK_SIZE);

    // Process full blocks
    for round in 0..num_rounds {
        let offset = round * BLOCK_SIZE;

        // Process 32 chars
        for ch in chars.iter().skip(offset).take(BLOCK_SIZE) {
            match reverse_map.get(ch) {
                Some(&byte_val) => result.push(byte_val),
                None => return false, // Invalid character
            }
        }
    }

    // Handle remainder with scalar fallback
    for &ch in &chars[simd_bytes..] {
        match reverse_map.get(&ch) {
            Some(&byte_val) => result.push(byte_val),
            None => return false,
        }
    }

    true
}

/// SSSE3 base256 decoding implementation (Unicode version)
///
/// Algorithm:
/// 1. Process chars in chunks of 16
/// 2. Reverse lookup each char in HashMap (scalar)
/// 3. Validate (return false on invalid chars)
/// 4. Store bytes
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn decode_ssse3_impl_unicode(
    chars: &[char],
    reverse_map: &std::collections::HashMap<char, u8>,
    result: &mut Vec<u8>,
) -> bool {
    const BLOCK_SIZE: usize = 16;

    let (num_rounds, simd_bytes) = common::calculate_blocks(chars.len(), BLOCK_SIZE);

    // Process full blocks
    for round in 0..num_rounds {
        let offset = round * BLOCK_SIZE;

        // Process 16 chars
        for ch in chars.iter().skip(offset).take(BLOCK_SIZE) {
            match reverse_map.get(ch) {
                Some(&byte_val) => result.push(byte_val),
                None => return false, // Invalid character
            }
        }
    }

    // Handle remainder with scalar fallback
    for &ch in &chars[simd_bytes..] {
        match reverse_map.get(&ch) {
            Some(&byte_val) => result.push(byte_val),
            None => return false,
        }
    }

    true
}

#[cfg(test)]
#[allow(deprecated)]
mod tests {
    use super::*;
    use crate::core::config::{DictionaryRegistry, EncodingMode};

    /// Get base256_matrix dictionary from config
    fn make_base256_dict() -> crate::core::dictionary::Dictionary {
        let config = DictionaryRegistry::load_default().unwrap();
        let dict_config = config.get_dictionary("base256_matrix").unwrap();
        let chars: Vec<char> = dict_config.effective_chars().unwrap().chars().collect();
        crate::core::dictionary::Dictionary::new_with_mode(chars, EncodingMode::Chunked, None)
            .unwrap()
    }

    #[test]
    fn test_encode_simple() {
        let dictionary = make_base256_dict();
        let test_data = b"Hello";

        if let Some(encoded) = encode(test_data, &dictionary) {
            // Each byte should map to one character (1:1 encoding)
            assert_eq!(encoded.chars().count(), test_data.len());

            // Verify round-trip works
            if let Some(decoded) = decode(&encoded, &dictionary) {
                assert_eq!(decoded, test_data);
            } else {
                panic!("Decode failed");
            }
        } else {
            panic!("Encode failed");
        }
    }

    #[test]
    fn test_encode_all_bytes() {
        let dictionary = make_base256_dict();

        // Test all possible byte values
        let test_data: Vec<u8> = (0..=255).collect();

        if let Some(encoded) = encode(&test_data, &dictionary) {
            assert_eq!(encoded.chars().count(), 256);

            // Verify round-trip
            if let Some(decoded) = decode(&encoded, &dictionary) {
                assert_eq!(decoded, test_data);
            } else {
                panic!("Decode failed");
            }
        } else {
            panic!("Encode failed");
        }
    }

    #[test]
    fn test_decode_round_trip() {
        let dictionary = make_base256_dict();

        for len in 0..100 {
            let original: Vec<u8> = (0..len).map(|i| (i * 7) as u8).collect();

            if let Some(encoded) = encode(&original, &dictionary) {
                if let Some(decoded) = decode(&encoded, &dictionary) {
                    assert_eq!(decoded, original, "Round-trip failed at length {}", len);
                } else {
                    panic!("Decode failed at length {}", len);
                }
            } else {
                panic!("Encode failed at length {}", len);
            }
        }
    }

    #[test]
    fn test_encode_empty() {
        let dictionary = make_base256_dict();

        if let Some(encoded) = encode(&[], &dictionary) {
            assert_eq!(encoded, "");
        } else {
            panic!("Encode failed for empty input");
        }
    }

    #[test]
    fn test_encode_single_byte() {
        let dictionary = make_base256_dict();

        if let Some(encoded) = encode(&[0xFF], &dictionary) {
            assert_eq!(encoded.chars().count(), 1);

            // Verify round-trip
            if let Some(decoded) = decode(&encoded, &dictionary) {
                assert_eq!(decoded, vec![0xFF]);
            } else {
                panic!("Decode failed for single byte");
            }
        } else {
            panic!("Encode failed for single byte");
        }
    }

    #[test]
    fn test_encode_large_input() {
        let dictionary = make_base256_dict();

        // Test with 1KB of data
        let test_data: Vec<u8> = (0..1024).map(|i| (i % 256) as u8).collect();

        if let Some(encoded) = encode(&test_data, &dictionary) {
            assert_eq!(encoded.chars().count(), test_data.len());

            if let Some(decoded) = decode(&encoded, &dictionary) {
                assert_eq!(decoded, test_data);
            } else {
                panic!("Decode failed for large input");
            }
        } else {
            panic!("Encode failed for large input");
        }
    }

    #[test]
    fn test_decode_invalid_char() {
        let dictionary = make_base256_dict();

        // Create an encoded string with a valid char followed by an invalid one
        let valid_data = vec![65u8];
        if let Some(mut encoded) = encode(&valid_data, &dictionary) {
            // Add a char that's not in the base256_matrix dictionary
            encoded.push('🦀'); // This should fail

            // This should fail because '🦀' is not in our dictionary
            assert_eq!(decode(&encoded, &dictionary), None);
        }
    }

    #[test]
    fn test_simd_boundary() {
        let dictionary = make_base256_dict();

        // Test exactly 16 bytes (one SIMD block)
        let test_data: Vec<u8> = (0..16).collect();

        if let Some(encoded) = encode(&test_data, &dictionary) {
            assert_eq!(encoded.chars().count(), 16);

            if let Some(decoded) = decode(&encoded, &dictionary) {
                assert_eq!(decoded, test_data);
            }
        }
    }

    #[test]
    fn test_simd_boundary_plus_one() {
        let dictionary = make_base256_dict();

        // Test 17 bytes (one SIMD block + 1 remainder)
        let test_data: Vec<u8> = (0..17).collect();

        if let Some(encoded) = encode(&test_data, &dictionary) {
            assert_eq!(encoded.chars().count(), 17);

            if let Some(decoded) = decode(&encoded, &dictionary) {
                assert_eq!(decoded, test_data);
            }
        }
    }
}