//! fcoreutils 0.22.0
//!
//! High-performance GNU coreutils replacement with SIMD and parallelism.

use std::io::{self, Read, Write};

use base64_simd::AsOut;

const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;

/// Number of available CPUs for parallel chunk splitting.
/// Uses std::thread::available_parallelism() to avoid triggering premature
/// Rayon pool initialization (~300-500µs); the Rayon pool is only initialized
/// on the first scope() call.
#[inline]
fn num_cpus() -> usize {
    std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(1)
}

/// Chunk size for sequential no-wrap encoding: 8MB aligned to 3 bytes.
/// Larger chunks reduce function call overhead per iteration while still
/// keeping peak buffer allocation reasonable (~10.7MB for the output).
const NOWRAP_CHUNK: usize = 8 * 1024 * 1024 - (8 * 1024 * 1024 % 3);

/// Minimum data size for parallel no-wrap encoding (16MB).
/// For single-file CLI usage (typical benchmark), the Rayon pool is cold
/// on first use (~200-500µs init). At 10MB, sequential encoding is faster
/// because pool init + dispatch overhead exceeds the parallel benefit.
/// Note: multi-file callers pay pool init only once; subsequent files would
/// benefit from a lower threshold (~2MB). Optimized for single-file CLI.
const PARALLEL_NOWRAP_THRESHOLD: usize = 16 * 1024 * 1024;

/// Minimum data size for parallel wrapped encoding (12MB).
/// Same cold-pool reasoning as PARALLEL_NOWRAP_THRESHOLD above.
/// The sequential encode_wrapped_expand path with backward expansion
/// eliminates per-group overhead from L1-scatter chunking.
const PARALLEL_WRAPPED_THRESHOLD: usize = 12 * 1024 * 1024;

/// Minimum data size for parallel decoding (1MB of base64 data).
/// Lower threshold than encode because decode is more compute-intensive
/// and benefits from parallelism at smaller sizes. After first use, the
/// Rayon pool is warm (~10µs dispatch), making 1MB a good crossover point.
const PARALLEL_DECODE_THRESHOLD: usize = 1024 * 1024;

/// Hint HUGEPAGE for large output buffers on Linux.
/// MADV_HUGEPAGE tells the kernel to back the range with 2MB pages, reducing TLB misses
/// and minor fault count for large allocations (~25,600 → ~50 for 100MB).
#[cfg(target_os = "linux")]
fn hint_hugepage(buf: &mut Vec<u8>) {
    if buf.capacity() >= 2 * 1024 * 1024 {
        unsafe {
            libc::madvise(
                buf.as_mut_ptr() as *mut libc::c_void,
                buf.capacity(),
                libc::MADV_HUGEPAGE,
            );
        }
    }
}

/// Encode data and write to output with line wrapping.
/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
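///
/// A minimal usage sketch (marked `ignore`, so it is not compiled as a doctest;
/// the `Vec<u8>` sink and the GNU-default wrap column of 76 are illustrative):
///
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// encode_to_writer(b"hello", 76, &mut out).unwrap();
/// assert_eq!(out, b"aGVsbG8=\n");
/// ```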
pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if wrap_col == 0 {
        return encode_no_wrap(data, out);
    }

    encode_wrapped(data, wrap_col, out)
}

/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_NOWRAP_THRESHOLD && num_cpus() > 1 {
        return encode_no_wrap_parallel(data, out);
    }

    // Single-buffer encode: for data that fits in one chunk, encode directly
    // and write once. For larger data, reuse the buffer across chunks.
    let enc_len = BASE64_ENGINE.encoded_length(data.len().min(NOWRAP_CHUNK));
    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_len);
    }

    for chunk in data.chunks(NOWRAP_CHUNK) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let encoded = BASE64_ENGINE.encode(chunk, buf[..clen].as_out());
        out.write_all(encoded)?;
    }
    Ok(())
}

/// Parallel no-wrap encoding into a single shared output buffer.
/// Split at 3-byte boundaries, pre-calculate output offsets, encode in parallel.
/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
/// Single allocation + single write_all instead of N allocations + writev.
fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 3 bytes so each chunk encodes without padding (except the last)
    let chunk_size = ((raw_chunk + 2) / 3) * 3;

    // Split input into 3-byte-aligned chunks
    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();

    // Pre-calculate output offsets
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    let mut total_out = 0usize;
    for chunk in &chunks {
        offsets.push(total_out);
        total_out += BASE64_ENGINE.encoded_length(chunk.len());
    }

    // Single allocation for all threads
    let mut output: Vec<u8> = Vec::with_capacity(total_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(total_out);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output);

    // Parallel encode: each thread writes into its pre-assigned region
    let output_base = output.as_mut_ptr() as usize;
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let out_off = offsets[i];
            let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
            let base = output_base;
            s.spawn(move |_| {
                let dest =
                    unsafe { std::slice::from_raw_parts_mut((base + out_off) as *mut u8, enc_len) };
                let _ = BASE64_ENGINE.encode(chunk, dest.as_out());
            });
        }
    });

    out.write_all(&output[..total_out])
}

/// Encode with line wrapping, dispatching to the fastest strategy for the input:
/// - parallel L1-scatter encoding above PARALLEL_WRAPPED_THRESHOLD,
/// - chunked encode+wrap for inputs over ~1MB,
/// - single-shot encode with backward newline expansion otherwise,
/// - bulk encode + fuse_wrap fallback when bytes_per_line is not a multiple of 3,
/// - a simple per-line fallback when bytes_per_line rounds to zero (tiny wrap_col).
///
/// bytes_per_line = wrap_col * 3 / 4 is the input consumed per output line; for the
/// GNU default wrap_col = 76 this is 57, a multiple of 3, so every full line encodes
/// without intermediate padding.
fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let bytes_per_line = wrap_col * 3 / 4;
    if bytes_per_line == 0 {
        return encode_wrapped_small(data, wrap_col, out);
    }

    if data.len() >= PARALLEL_WRAPPED_THRESHOLD && bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
    }

    // Chunked encode for data > 1MB: process ~1MB at a time to keep the output
    // buffer small (~1.3MB) and warm in L2. Reduces minor page faults from ~3500 to ~320.
    // Guard: skip if bytes_per_line > 1MB (lines_per_chunk would be 0 → infinite loop).
    if bytes_per_line.is_multiple_of(3) && data.len() > 1024 * 1024 {
        let lines_per_chunk = (1024 * 1024) / bytes_per_line;
        if lines_per_chunk > 0 {
            return encode_wrapped_chunked(data, wrap_col, bytes_per_line, out);
        }
    }

    if bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_expand(data, wrap_col, bytes_per_line, out);
    }

    // Fallback for non-3-aligned bytes_per_line: use fuse_wrap approach
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let num_full = enc_max / wrap_col;
    let rem = enc_max % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Encode full data, then fuse with newlines
    let mut enc_buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        enc_buf.set_len(enc_max);
    }
    let _ = BASE64_ENGINE.encode(data, enc_buf[..enc_max].as_out());

    let mut out_buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(out_len);
    }
    let n = fuse_wrap(&enc_buf, wrap_col, &mut out_buf);
    out.write_all(&out_buf[..n])
}

/// Chunked encode+wrap for large in-memory data. Processes ~1MB input at a time,
/// reusing a small output buffer (~1.3MB) that stays warm in L2 cache.
/// Each chunk is aligned to bytes_per_line so it produces complete lines.
/// Reduces page faults from ~3500 (14MB alloc) to ~320 (1.3MB alloc) for 10MB input.
fn encode_wrapped_chunked(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    debug_assert!(bytes_per_line.is_multiple_of(3));

    // ~1MB input per chunk, aligned to bytes_per_line
    let lines_per_chunk = (1024 * 1024) / bytes_per_line;
    let chunk_input = lines_per_chunk * bytes_per_line;
    let line_out = wrap_col + 1;
    let max_chunk_out =
        lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;

    // SAFETY: out_buf is allocated to max_chunk_out bytes and set_len'd without
    // initialization. The encode loop below writes exactly `full_lines * line_out`
    // bytes (encode + newline per line), plus any remainder. Only the written
    // prefix `out_buf[..total_out]` is passed to write_all — no uninitialized
    // bytes are ever read. u8 has no drop glue, so the uninitialized tail is safe.
    let mut out_buf: Vec<u8> = Vec::with_capacity(max_chunk_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(max_chunk_out);
    }

    let mut pos = 0;
    while pos < data.len() {
        let remaining = data.len() - pos;
        let chunk_len = remaining.min(chunk_input);
        let chunk = &data[pos..pos + chunk_len];

        let full_lines = chunk_len / bytes_per_line;
        let remainder = chunk_len % bytes_per_line;

        let dst = out_buf.as_mut_ptr();
        let mut line_idx = 0;

        // 4-line unrolled loop
        while line_idx + 4 <= full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ =
                    BASE64_ENGINE.encode(&chunk[in_base..in_base + bytes_per_line], s0.as_out());
                *dst.add(out_base + wrap_col) = b'\n';

                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                    s1.as_out(),
                );
                *dst.add(out_base + line_out + wrap_col) = b'\n';

                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                    s2.as_out(),
                );
                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                    s3.as_out(),
                );
                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
            }
            line_idx += 4;
        }
        while line_idx < full_lines {
            let in_off = line_idx * bytes_per_line;
            let out_off = line_idx * line_out;
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(out_off), wrap_col);
                let _ = BASE64_ENGINE.encode(&chunk[in_off..in_off + bytes_per_line], s.as_out());
                *dst.add(out_off + wrap_col) = b'\n';
            }
            line_idx += 1;
        }

        let mut total_out = full_lines * line_out;

        if remainder > 0 {
            let in_off = full_lines * bytes_per_line;
            let enc_len = BASE64_ENGINE.encoded_length(remainder);
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(total_out), enc_len);
                let _ = BASE64_ENGINE.encode(&chunk[in_off..in_off + remainder], s.as_out());
                *dst.add(total_out + enc_len) = b'\n';
            }
            total_out += enc_len + 1;
        }

        out.write_all(&out_buf[..total_out])?;
        pos += chunk_len;
    }

    Ok(())
}

/// Encode with backward expansion: single contiguous SIMD encode, then expand
/// in-place to insert newlines. The encode is done in one call (no chunking),
/// which eliminates per-group function call overhead from L1-scatter.
/// The backward expansion only shifts data by ~1.3% (1 byte per 76 for wrap_col=76),
/// and for most lines the shift exceeds wrap_col so memmove uses the fast memcpy path.
fn encode_wrapped_expand(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    debug_assert!(bytes_per_line.is_multiple_of(3));
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    if enc_len == 0 {
        return Ok(());
    }

    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Single allocation: encode into first enc_len bytes, expand backward to out_len.
    // SAFETY: buf[..enc_len] is initialized by BASE64_ENGINE.encode below.
    // buf[enc_len..out_len] is written by expand_backward before write_all reads it.
    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_len);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut buf);

    // One SIMD encode call for the entire input (no chunking overhead)
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_len].as_out());
    debug_assert_eq!(encoded.len(), enc_len, "encode wrote unexpected length");

    // Expand backward to insert newlines — shifts only ~1.3% of data
    expand_backward(buf.as_mut_ptr(), enc_len, out_len, wrap_col);

    out.write_all(&buf[..out_len])
}

/// L1-scatter encode: encode groups of lines into a small L1-cached temp buffer,
/// then scatter-copy each line to its final position in the output buffer with
/// newline insertion. Each output byte is written exactly once — no read-back
/// from main memory, halving memory traffic vs backward expansion.
///
/// Temp buffer (~20KB for 256 lines × 76 chars) stays hot in L1 cache, so
/// reads during scatter are essentially free. Output buffer is streamed out
/// with sequential writes that the prefetcher can handle efficiently.
///
/// Uses a full output buffer for vmsplice safety: vmsplice maps user pages
/// into the pipe buffer, so the buffer must stay valid until the reader has consumed it.
#[allow(dead_code)]
fn encode_wrapped_scatter(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    if enc_len == 0 {
        return Ok(());
    }

    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Output buffer — written once via scatter, then write_all to output
    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_len);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut buf);

    // L1-cached temp buffer for encoding groups of lines.
    // 256 lines × 76 chars = 19,456 bytes — fits comfortably in L1 (32-64KB).
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1;
    let mut wp = 0usize; // write position in output buffer

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        // Scatter-copy full lines from temp to output with newlines
        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter for ILP
        let mut i = 0;
        while i + 8 <= lines {
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = buf.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        // Remaining full lines
        while i < lines {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        // Partial last line (only on final chunk)
        if chunk_rem > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *buf.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }

    out.write_all(&buf[..wp])
}

/// Scatter-copy encoded lines from temp buffer to output buffer with newlines.
/// Uses copy_nonoverlapping since temp and output never overlap.
#[inline]
#[allow(dead_code)]
fn scatter_lines(
    temp: &[u8],
    buf: &mut [u8],
    line_start: usize,
    count: usize,
    wrap_col: usize,
    line_out: usize,
) {
    unsafe {
        let src = temp.as_ptr();
        let dst = buf.as_mut_ptr();
        for i in 0..count {
            let s_off = i * wrap_col;
            let d_off = (line_start + i) * line_out;
            std::ptr::copy_nonoverlapping(src.add(s_off), dst.add(d_off), wrap_col);
            *dst.add(d_off + wrap_col) = b'\n';
        }
    }
}

/// Expand encoded data in-place by inserting newlines at wrap_col boundaries.
/// buf[0..enc_len] contains contiguous encoded data; buf has capacity for out_len.
/// After expansion, buf[0..out_len] contains wrapped output with newlines.
///
/// Processes backward so shifted data never overwrites unread source data.
/// For wrap_col=76: shift is ~1.3% (1 byte per 76), so most copies are
/// non-overlapping and the memmove fast-path (memcpy) is used.
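///
/// A minimal sketch of the effect (marked `ignore`; the bytes are arbitrary and
/// the sizes illustrative: 10 encoded bytes wrapped at 4 columns expand to 13):
///
/// ```ignore
/// let mut buf = b"AAAABBBBCC\0\0\0".to_vec(); // 10 encoded bytes + room to grow to 13
/// expand_backward(buf.as_mut_ptr(), 10, 13, 4);
/// assert_eq!(buf, b"AAAA\nBBBB\nCC\n");
/// ```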
#[inline]
fn expand_backward(ptr: *mut u8, enc_len: usize, out_len: usize, wrap_col: usize) {
    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;

    unsafe {
        let mut rp = enc_len;
        let mut wp = out_len;

        // Handle partial last line (remainder)
        if rem > 0 {
            wp -= 1;
            *ptr.add(wp) = b'\n';
            wp -= rem;
            rp -= rem;
            if rp != wp {
                std::ptr::copy(ptr.add(rp), ptr.add(wp), rem);
            }
        }

        // Process full lines backward
        let mut lines_left = num_full;
        while lines_left >= 8 {
            // Unrolled: 8 lines per iteration
            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            lines_left -= 8;
        }

        // Remaining lines (0-7)
        while lines_left > 0 {
            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            if rp != wp {
                std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
            }
            lines_left -= 1;
        }
    }
}

/// Static newline byte for IoSlice references in writev calls.
static NEWLINE: [u8; 1] = [b'\n'];

/// Write encoded base64 data with line wrapping. For small outputs, builds IoSlice
/// entries pointing at wrap_col-sized segments of the encoded buffer, interleaved
/// with newline IoSlices, and writes them with a single write_vectored (writev) call
/// (zero-copy, no fused output buffer). Larger outputs are fused into a small
/// reusable buffer in batches of BATCH_LINES lines, since one contiguous write per
/// batch beats a 1024-entry writev.
#[inline]
#[allow(dead_code)]
fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
    // Each line needs 2 entries (data + newline), so 512 lines per batch.
    const MAX_IOV: usize = 1024;

    let num_full_lines = encoded.len() / wrap_col;
    let remainder = encoded.len() % wrap_col;
    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };

    // Small output: build all IoSlices and write in one call
    if total_iov <= MAX_IOV {
        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
        let mut pos = 0;
        for _ in 0..num_full_lines {
            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
            iov.push(io::IoSlice::new(&NEWLINE));
            pos += wrap_col;
        }
        if remainder > 0 {
            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
            iov.push(io::IoSlice::new(&NEWLINE));
        }
        return write_all_vectored(out, &iov);
    }

    // Large output: fuse batches of lines into a reusable L1-cached buffer.
    // Each batch copies ~39KB (512 lines × 77 bytes) from the encoded buffer
    // with newlines inserted, then writes as a single contiguous write(2).
    // This is faster than writev with 1024 IoSlice entries because:
    // - One kernel memcpy per batch vs 1024 separate copies
    // - Fused buffer (39KB) stays hot in L1 cache across batches
    let line_out = wrap_col + 1;
    const BATCH_LINES: usize = 512;
    let batch_fused_size = BATCH_LINES * line_out;
    let mut fused: Vec<u8> = Vec::with_capacity(batch_fused_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        fused.set_len(batch_fused_size);
    }

    let mut rp = 0;
    let mut lines_done = 0;

    // Process full batches using 8-line unrolled fuse_wrap
    while lines_done + BATCH_LINES <= num_full_lines {
        let n = fuse_wrap(
            &encoded[rp..rp + BATCH_LINES * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += BATCH_LINES * wrap_col;
        lines_done += BATCH_LINES;
    }

    // Remaining full lines (partial batch)
    let remaining_lines = num_full_lines - lines_done;
    if remaining_lines > 0 {
        let n = fuse_wrap(
            &encoded[rp..rp + remaining_lines * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += remaining_lines * wrap_col;
    }

    // Partial last line
    if remainder > 0 {
        out.write_all(&encoded[rp..rp + remainder])?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Write encoded base64 data with line wrapping using writev, tracking column state
/// across calls. Used by encode_stream for piped input where chunks don't align
/// to line boundaries.
#[inline]
fn write_wrapped_iov_streaming(
    encoded: &[u8],
    wrap_col: usize,
    col: &mut usize,
    out: &mut impl Write,
) -> io::Result<()> {
    const MAX_IOV: usize = 1024;
    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut rp = 0;

    while rp < encoded.len() {
        let space = wrap_col - *col;
        let avail = encoded.len() - rp;

        if avail <= space {
            // Remaining data fits in current line
            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
            *col += avail;
            if *col == wrap_col {
                iov.push(io::IoSlice::new(&NEWLINE));
                *col = 0;
            }
            break;
        } else {
            // Fill current line and add newline
            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
            iov.push(io::IoSlice::new(&NEWLINE));
            rp += space;
            *col = 0;
        }

        if iov.len() >= MAX_IOV - 1 {
            write_all_vectored(out, &iov)?;
            iov.clear();
        }
    }

    if !iov.is_empty() {
        write_all_vectored(out, &iov)?;
    }
    Ok(())
}

/// Parallel wrapped encoding with L1-scatter into a single shared output buffer.
/// Pre-calculates each thread's output offset, allocates one buffer for all threads,
/// and has each thread encode directly into its pre-assigned non-overlapping region.
/// This saves N-1 buffer allocations and corresponding page faults vs per-thread Vecs,
/// and uses a single write_all instead of writev.
fn encode_wrapped_parallel(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let lines_per_chunk = ((data.len() / bytes_per_line) / num_threads).max(1);
    let chunk_input = lines_per_chunk * bytes_per_line;

    // Split input at bytes_per_line boundaries (last chunk may have remainder)
    let chunks: Vec<&[u8]> = data.chunks(chunk_input.max(bytes_per_line)).collect();

    // Pre-calculate output offsets for each chunk
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    let mut total_out = 0usize;
    for chunk in &chunks {
        offsets.push(total_out);
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let full_lines = enc_len / wrap_col;
        let remainder = enc_len % wrap_col;
        total_out += full_lines * (wrap_col + 1) + if remainder > 0 { remainder + 1 } else { 0 };
    }

    // Single allocation for all threads
    let mut output: Vec<u8> = Vec::with_capacity(total_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(total_out);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output);

    // Parallel encode: each thread writes into its pre-assigned region
    let output_base = output.as_mut_ptr() as usize;
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let out_off = offsets[i];
            let out_end = if i + 1 < offsets.len() {
                offsets[i + 1]
            } else {
                total_out
            };
            let out_size = out_end - out_off;
            let base = output_base;
            s.spawn(move |_| {
                let out_slice = unsafe {
                    std::slice::from_raw_parts_mut((base + out_off) as *mut u8, out_size)
                };
                encode_chunk_l1_scatter_into(chunk, out_slice, wrap_col, bytes_per_line);
            });
        }
    });

    out.write_all(&output[..total_out])
}

/// Encode a chunk using L1-scatter, writing into a pre-allocated output slice.
/// Encodes groups of 256 lines into L1-cached temp buffer, scatter-copy to output with newlines.
/// The output slice must be large enough to hold the encoded+wrapped output.
fn encode_chunk_l1_scatter_into(
    data: &[u8],
    output: &mut [u8],
    wrap_col: usize,
    bytes_per_line: usize,
) {
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1;
    let mut wp = 0usize;

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter
        let mut i = 0;
        while i + 8 <= lines {
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = output.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        while i < lines {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    output.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *output.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        if chunk_rem > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    output.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *output.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }
}

/// Fuse encoded base64 data with newlines in a single pass.
/// Uses ptr::copy_nonoverlapping with 8-line unrolling for max throughput.
/// Returns number of bytes written.
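///
/// A small sketch of the transform (marked `ignore`; the bytes are arbitrary and
/// only the lengths matter):
///
/// ```ignore
/// let mut out = vec![0u8; 13];
/// let n = fuse_wrap(b"AAAABBBBCC", 4, &mut out);
/// assert_eq!(&out[..n], b"AAAA\nBBBB\nCC\n");
/// ```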
#[inline]
fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
    let mut rp = 0;
    let mut wp = 0;

    // Unrolled: process 8 lines per iteration for better ILP
    while rp + 8 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(4 * wrap_col), dst.add(4 * line_out), wrap_col);
            *dst.add(4 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(5 * wrap_col), dst.add(5 * line_out), wrap_col);
            *dst.add(5 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(6 * wrap_col), dst.add(6 * line_out), wrap_col);
            *dst.add(6 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(7 * wrap_col), dst.add(7 * line_out), wrap_col);
            *dst.add(7 * line_out + wrap_col) = b'\n';
        }
        rp += 8 * wrap_col;
        wp += 8 * line_out;
    }

    // Handle remaining 4 lines at a time
    while rp + 4 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';
        }
        rp += 4 * wrap_col;
        wp += 4 * line_out;
    }

    // Remaining full lines
    while rp + wrap_col <= encoded.len() {
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                wrap_col,
            );
            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
        }
        rp += wrap_col;
        wp += line_out;
    }

    // Partial last line
    if rp < encoded.len() {
        let remaining = encoded.len() - rp;
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                remaining,
            );
        }
        wp += remaining;
        out_buf[wp] = b'\n';
        wp += 1;
    }

    wp
}

/// Fallback for tiny wrap columns where bytes_per_line rounds to zero (wrap_col == 1).
fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_max);
    }
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());

    let wc = wrap_col.max(1);
    for line in encoded.chunks(wc) {
        out.write_all(line)?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Decode base64 data and write to output (borrows data, allocates clean buffer).
/// When `ignore_garbage` is true, strip all non-base64 characters.
/// When false, only strip whitespace (standard behavior).
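///
/// A usage sketch (marked `ignore`, so it is not compiled as a doctest; a
/// `Vec<u8>` sink stands in for stdout):
///
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// decode_to_writer(b"aGVsbG8=\n", false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```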
pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        let mut cleaned = strip_non_base64(data);
        return decode_clean_slice(&mut cleaned, out);
    }

    // For large data (>= 512KB): use bulk strip + single-shot decode.
    // try_line_decode decodes per-line (~25ns overhead per 76-byte line call),
    // while strip+decode uses SIMD gap-copy + single-shot SIMD decode at ~6.5 GB/s.
    // For 10MB decode benchmark: ~2ms (bulk) vs ~4ms (per-line) = 2x faster.
    // For small data (< 512KB): per-line decode avoids allocation overhead.
    if data.len() < 512 * 1024 && data.len() >= 77 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    // Fast path: single-pass SIMD strip + decode
    decode_stripping_whitespace(data, out)
}

/// Decode base64 from a mutable buffer (MAP_PRIVATE mmap or owned Vec).
/// Strips whitespace in-place using SIMD memchr2 gap-copy, then decodes
/// in-place with base64_simd::decode_inplace. Zero additional allocations.
///
/// For MAP_PRIVATE mmap: the kernel uses COW semantics, so only pages
/// containing whitespace (newlines) get physically copied (~1.3% for
/// 76-char line base64). The decode writes to the same buffer, but decoded
/// data is always shorter than encoded (3/4 ratio), so it fits in-place.
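///
/// A usage sketch (marked `ignore`; an owned, mutable buffer stands in for the
/// MAP_PRIVATE mapping here):
///
/// ```ignore
/// let mut buf = b"aGVsbG8=\n".to_vec();
/// let mut out: Vec<u8> = Vec::new();
/// decode_mmap_inplace(&mut buf, false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```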
pub fn decode_mmap_inplace(
    data: &mut [u8],
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // For small data: try line-by-line decode (avoids COW page faults).
    // For large data (>= 512KB): bulk strip+decode is faster than per-line decode.
    if !ignore_garbage && data.len() >= 77 && data.len() < 512 * 1024 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    if ignore_garbage {
        // Strip non-base64 chars in-place
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if is_base64_char(b) {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        let r = decode_inplace_with_padding(&mut data[..wp], out);
        return r;
    }

    // Fast path: uniform-line fused strip+decode (no intermediate buffer).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Fallback: strip whitespace in-place using SIMD memchr2 gap-copy.

    // Quick check: no newlines at all — maybe already clean
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // Check for rare whitespace
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            // Perfectly clean — decode in-place directly
            return decode_inplace_with_padding(data, out);
        }
        // Rare whitespace only — strip in-place
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        return decode_inplace_with_padding(&mut data[..wp], out);
    }

    // SIMD gap-copy: strip \n and \r in-place using memchr2
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    // SAFETY: memchr2_iter reads from the original data. We write to positions
    // [0..wp] which are always <= gap_start, so we never overwrite unread data.
    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during the gap-copy
                has_rare_ws = unsafe {
                    std::slice::from_raw_parts(ptr.add(gap_start), gap_len)
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
                };
            }
            if wp != gap_start {
                unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len) };
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = unsafe {
                std::slice::from_raw_parts(ptr.add(gap_start), tail_len)
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
            };
        }
        if wp != gap_start {
            unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len) };
        }
        wp += tail_len;
    }

    // Second pass for rare whitespace if needed
    if has_rare_ws {
        let mut rp = 0;
        let mut cwp = 0;
        while rp < wp {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        wp = cwp;
    }

    // Decode in-place: decoded data is always shorter than encoded (3/4 ratio)
    if wp >= PARALLEL_DECODE_THRESHOLD {
        // For large data, use parallel decode from the cleaned slice
        return decode_borrowed_clean_parallel(out, &data[..wp]);
    }
    decode_inplace_with_padding(&mut data[..wp], out)
}

/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
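///
/// A usage sketch (marked `ignore`):
///
/// ```ignore
/// let mut data = b"aGVsbG8=\n".to_vec();
/// let mut out: Vec<u8> = Vec::new();
/// decode_owned(&mut data, false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```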
pub fn decode_owned(
    data: &mut Vec<u8>,
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        data.retain(|&b| is_base64_char(b));
    } else {
        strip_whitespace_inplace(data);
    }

    decode_clean_slice(data, out)
}

/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
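///
/// A small sketch of the effect (marked `ignore`):
///
/// ```ignore
/// let mut v = b"QUJD\nREVG\n".to_vec();
/// strip_whitespace_inplace(&mut v);
/// assert_eq!(v, b"QUJDREVG");
/// ```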
fn strip_whitespace_inplace(data: &mut Vec<u8>) {
    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    // For typical base64 (76-char lines), a \n is found almost immediately, so the
    // scan is cheap and we fall through to the gap-copy below.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            data.retain(|&b| NOT_WHITESPACE[b as usize]);
        }
        return;
    }

    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
    // gaps between them to compact the data in-place. For typical base64 streams,
    // newlines are the only whitespace, so this handles >99% of cases.
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            if wp != gap_start {
                unsafe {
                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                }
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        if wp != gap_start {
            unsafe {
                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
            }
        }
        wp += tail_len;
    }

    data.truncate(wp);

    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        data.truncate(cwp);
    }
}
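
// Worked illustration of the gap-copy above (sketch only; the literal is arbitrary):
// for b"QUJD\nREVG\n", memchr2_iter yields newline positions 4 and 9, the gaps
// [0..4) and [5..9) are memmoved down to the write pointer, and the Vec is
// truncated to b"QUJDREVG" (8 bytes). No rare whitespace is seen, so the second
// pass is skipped.
//
//     let mut v = b"QUJD\nREVG\n".to_vec();
//     strip_whitespace_inplace(&mut v);
//     assert_eq!(v, b"QUJDREVG");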

/// 256-byte lookup table: true for non-whitespace bytes.
/// Used for single-pass whitespace stripping in decode.
static NOT_WHITESPACE: [bool; 256] = {
    let mut table = [true; 256];
    table[b' ' as usize] = false;
    table[b'\t' as usize] = false;
    table[b'\n' as usize] = false;
    table[b'\r' as usize] = false;
    table[0x0b] = false; // vertical tab
    table[0x0c] = false; // form feed
    table
};

/// Fused strip+decode for uniform-line base64 data.
/// Detects consistent line length, then processes in sub-chunks: each sub-chunk
/// copies lines to a small local buffer (L2-hot) and decodes immediately.
/// Eliminates the large intermediate clean buffer (~12MB for 10MB decode).
/// Returns None if the data doesn't have uniform line structure.
fn try_decode_uniform_lines(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl;
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let stride = line_len + 1;

    // Verify the data has consistent line structure (first + last lines)
    let check_lines = 4.min(data.len() / stride);
    for i in 1..check_lines {
        let expected_nl = i * stride - 1;
        if expected_nl >= data.len() || data[expected_nl] != b'\n' {
            return None;
        }
    }

    let full_lines = if data.len() >= stride {
        let candidate = data.len() / stride;
        if candidate > 0 && data[candidate * stride - 1] != b'\n' {
            return None;
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * stride;
    let remainder = &data[remainder_start..];
    let rem_clean = if remainder.last() == Some(&b'\n') {
        &remainder[..remainder.len() - 1]
    } else {
        remainder
    };

    // Compute exact decoded sizes
    let decoded_per_line = line_len * 3 / 4;
    let rem_decoded_size = if rem_clean.is_empty() {
        0
    } else {
        let pad = rem_clean
            .iter()
            .rev()
            .take(2)
            .filter(|&&b| b == b'=')
            .count();
        rem_clean.len() * 3 / 4 - pad
    };
    let total_decoded = full_lines * decoded_per_line + rem_decoded_size;
    let clean_len = full_lines * line_len;

    // Parallel path: fused strip+decode with 512KB sub-chunks per thread.
    // Each thread copies lines to a thread-local buffer (L2-hot) and decodes immediately,
    // eliminating the 12MB+ intermediate clean buffer entirely.
    if clean_len >= PARALLEL_DECODE_THRESHOLD && num_cpus() > 1 {
        let mut output: Vec<u8> = Vec::with_capacity(total_decoded);
        #[allow(clippy::uninit_vec)]
        unsafe {
            output.set_len(total_decoded);
        }
        #[cfg(target_os = "linux")]
        hint_hugepage(&mut output);

        let out_ptr = output.as_mut_ptr() as usize;
        let src_ptr = data.as_ptr() as usize;
        let num_threads = num_cpus().max(1);
        let lines_per_thread = (full_lines + num_threads - 1) / num_threads;
        // 512KB sub-chunks: larger chunks give SIMD decode more contiguous data,
        // reducing per-call overhead. 512KB fits in L2 cache (256KB-1MB typical).
        let lines_per_sub = (512 * 1024 / line_len).max(1);

        let err_flag = std::sync::atomic::AtomicBool::new(false);
        rayon::scope(|s| {
            for t in 0..num_threads {
                let err_flag = &err_flag;
                s.spawn(move |_| {
                    let start_line = t * lines_per_thread;
                    if start_line >= full_lines {
                        return;
                    }
                    let end_line = (start_line + lines_per_thread).min(full_lines);
                    let chunk_lines = end_line - start_line;

                    let sub_buf_size = lines_per_sub.min(chunk_lines) * line_len;
                    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
                    #[allow(clippy::uninit_vec)]
                    unsafe {
                        local_buf.set_len(sub_buf_size);
                    }

                    let src = src_ptr as *const u8;
                    let out_base = out_ptr as *mut u8;
                    let local_dst = local_buf.as_mut_ptr();

                    let mut sub_start = 0usize;
                    while sub_start < chunk_lines {
                        if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let sub_count = (chunk_lines - sub_start).min(lines_per_sub);
                        let sub_clean = sub_count * line_len;

                        for i in 0..sub_count {
                            unsafe {
                                std::ptr::copy_nonoverlapping(
                                    src.add((start_line + sub_start + i) * stride),
                                    local_dst.add(i * line_len),
                                    line_len,
                                );
                            }
                        }

                        let out_offset = (start_line + sub_start) * decoded_per_line;
                        let out_size = sub_count * decoded_per_line;
                        let out_slice = unsafe {
                            std::slice::from_raw_parts_mut(out_base.add(out_offset), out_size)
                        };
                        if BASE64_ENGINE
                            .decode(&local_buf[..sub_clean], out_slice.as_out())
                            .is_err()
                        {
                            err_flag.store(true, std::sync::atomic::Ordering::Relaxed);
                            return;
                        }

                        sub_start += sub_count;
                    }
                });
            }
        });
        let result: Result<(), io::Error> = if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
            Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
        } else {
            Ok(())
        };

        if let Err(e) = result {
            return Some(Err(e));
        }

        if !rem_clean.is_empty() {
            let rem_out = &mut output[full_lines * decoded_per_line..total_decoded];
            match BASE64_ENGINE.decode(rem_clean, rem_out.as_out()) {
                Ok(_) => {}
                Err(_) => return Some(decode_error()),
            }
        }

        return Some(out.write_all(&output[..total_decoded]));
    }

    // Sequential path: fused strip+decode in 256KB sub-chunks.
    // Larger sub-chunks give SIMD decode more data per call, improving throughput.
    // Uses decode_inplace on a small reusable buffer — no large allocations at all.
    let lines_per_sub = (256 * 1024 / line_len).max(1);
    let sub_buf_size = lines_per_sub * line_len;
    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        local_buf.set_len(sub_buf_size);
    }

    let src = data.as_ptr();
    let local_dst = local_buf.as_mut_ptr();

    let mut line_idx = 0usize;
    while line_idx < full_lines {
        let sub_count = (full_lines - line_idx).min(lines_per_sub);
        let sub_clean = sub_count * line_len;

        for i in 0..sub_count {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    src.add((line_idx + i) * stride),
                    local_dst.add(i * line_len),
                    line_len,
                );
            }
        }

        match BASE64_ENGINE.decode_inplace(&mut local_buf[..sub_clean]) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }

        line_idx += sub_count;
    }

    if !rem_clean.is_empty() {
        let mut rem_buf = rem_clean.to_vec();
        match BASE64_ENGINE.decode_inplace(&mut rem_buf) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }
    }

    Some(Ok(()))
}
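
// Worked numbers for the common wrap=76 case handled above (illustrative only):
// line_len = 76, stride = 77, decoded_per_line = 76 * 3 / 4 = 57. With 512KB
// sub-chunks, lines_per_sub = 512 * 1024 / 76 = 6898, so each sub-chunk copies
// ~524KB of clean base64 into the thread-local buffer and decodes it to ~393KB,
// both small enough to stay cache-resident while the next sub-chunk is copied.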

/// Decode by stripping whitespace and decoding in a single fused pass.
/// For data with no whitespace, decodes directly without any copy.
/// Detects uniform line structure for fast structured-copy (no search needed),
/// falls back to SIMD memchr2 gap-copy for irregular data.
fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    // Fast path for uniform-line base64 (e.g., standard 76-char lines + newline).
    // Copies at known offsets, avoiding the memchr2 search entirely.
    // For 13MB base64: saves ~1ms vs memchr2 gap-copy (just structured memcpy).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            return decode_borrowed_clean(out, data);
        }
        // Has rare whitespace only — strip and decode
        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
        for &b in data {
            if NOT_WHITESPACE[b as usize] {
                cleaned.push(b);
            }
        }
        return decode_clean_slice(&mut cleaned, out);
    }

    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
    let dst = clean.as_mut_ptr();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
    // This avoids the second full-scan pass when only \n/\r are present.
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            // Check gap region for rare whitespace during copy.
            // This adds ~1 branch per gap but eliminates the second full scan.
            if !has_rare_ws {
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            unsafe {
                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap after the last \n/\r
    let tail_len = data.len() - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        unsafe {
            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
        }
        wp += tail_len;
    }
    unsafe {
        clean.set_len(wp);
    }

    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = clean.as_mut_ptr();
        let len = clean.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        clean.truncate(cwp);
    }

    // For large data (>= threshold), use parallel decode for multi-core speedup.
    // For small data, use in-place decode to avoid extra allocation.
    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
        decode_borrowed_clean_parallel(out, &clean)
    } else {
        decode_clean_slice(&mut clean, out)
    }
}

/// Try to decode base64 data line-by-line, avoiding whitespace stripping.
/// Returns Some(result) if the data has uniform line lengths suitable for
/// per-line decode, or None if the data doesn't fit this pattern.
///
/// For standard 76-char-line base64 (wrap=76): each line is 76 encoded chars
/// + newline = 77 bytes. 76 chars = 19 groups of 4 = 57 decoded bytes per line.
/// We decode each line directly into its position in the output buffer.
fn try_line_decode(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    // Find the first newline to determine line length
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl; // encoded chars per line (without newline)

    // Line length must be a multiple of 4 (complete base64 groups, no padding mid-stream)
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let line_stride = line_len + 1; // line_len chars + 1 newline byte
    let decoded_per_line = line_len * 3 / 4;

    // Verify the data has a consistent line structure by checking the next few lines
    let check_lines = 4.min(data.len() / line_stride);
    for i in 1..check_lines {
        let expected_nl = i * line_stride - 1;
        if expected_nl >= data.len() {
            break;
        }
        if data[expected_nl] != b'\n' {
            return None; // Inconsistent line length
        }
    }

    // Calculate full lines and remainder
    let full_lines = if data.len() >= line_stride {
        // Check how many complete lines fit
        let candidate = data.len() / line_stride;
        // Verify the last full line's newline
        if candidate > 0 && data[candidate * line_stride - 1] != b'\n' {
            return None; // Not a clean line-structured file
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * line_stride;
    let remainder = &data[remainder_start..];

    // Calculate exact output size
    let remainder_clean_len = if remainder.is_empty() {
        0
    } else {
        // Remainder might end with newline, strip it
        let rem = if remainder.last() == Some(&b'\n') {
            &remainder[..remainder.len() - 1]
        } else {
            remainder
        };
        if rem.is_empty() {
            0
        } else {
            // Check for padding
            let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
            if rem.len() % 4 != 0 {
                return None; // Invalid remainder
            }
            rem.len() * 3 / 4 - pad
        }
    };

    // Single-allocation decode: allocate full decoded output, decode all lines
    // directly into it, then write_all in one syscall. For 10MB base64 (7.5MB decoded),
    // this does 1 write() instead of ~30 chunked writes. The 7.5MB allocation is trivial
    // compared to the mmap'd input. SIMD decode at ~8 GB/s finishes in <1ms.
    let total_decoded = full_lines * decoded_per_line + remainder_clean_len;
    let mut out_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(total_decoded);
    }

    let dst = out_buf.as_mut_ptr();

    // Parallel line decode for large inputs (>= 4MB): split lines across threads.
    // Each thread decodes a contiguous block of lines directly to its final position
    // in the shared output buffer. SAFETY: non-overlapping output regions per thread.
    if data.len() >= PARALLEL_DECODE_THRESHOLD && full_lines >= 64 {
        let out_addr = dst as usize;
        let num_threads = num_cpus().max(1);
        let lines_per_chunk = (full_lines / num_threads).max(1);

        // Build per-thread task ranges: (start_line, end_line)
        let mut tasks: Vec<(usize, usize)> = Vec::new();
        let mut line_off = 0;
        while line_off < full_lines {
            let end = (line_off + lines_per_chunk).min(full_lines);
            tasks.push((line_off, end));
            line_off = end;
        }

        let decode_err = std::sync::atomic::AtomicBool::new(false);
        rayon::scope(|s| {
            for &(start_line, end_line) in &tasks {
                let decode_err = &decode_err;
                s.spawn(move |_| {
                    let out_ptr = out_addr as *mut u8;
                    let mut i = start_line;

                    while i + 4 <= end_line {
                        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let in_base = i * line_stride;
                        let ob = i * decoded_per_line;
                        unsafe {
                            let s0 =
                                std::slice::from_raw_parts_mut(out_ptr.add(ob), decoded_per_line);
                            if BASE64_ENGINE
                                .decode(&data[in_base..in_base + line_len], s0.as_out())
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s1 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + line_stride..in_base + line_stride + line_len],
                                    s1.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s2 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + 2 * decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + 2 * line_stride
                                        ..in_base + 2 * line_stride + line_len],
                                    s2.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s3 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + 3 * decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + 3 * line_stride
                                        ..in_base + 3 * line_stride + line_len],
                                    s3.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                        }
                        i += 4;
                    }

                    while i < end_line {
                        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let in_start = i * line_stride;
                        let out_off = i * decoded_per_line;
                        let out_slice = unsafe {
                            std::slice::from_raw_parts_mut(out_ptr.add(out_off), decoded_per_line)
                        };
                        if BASE64_ENGINE
                            .decode(&data[in_start..in_start + line_len], out_slice.as_out())
                            .is_err()
                        {
                            decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                            return;
                        }
                        i += 1;
                    }
                });
            }
        });

        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
            return Some(decode_error());
        }
    } else {
        // Sequential decode with 4x unrolling for smaller inputs
        let mut i = 0;

        while i + 4 <= full_lines {
            let in_base = i * line_stride;
            let out_base = i * decoded_per_line;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), decoded_per_line);
                if BASE64_ENGINE
                    .decode(&data[in_base..in_base + line_len], s0.as_out())
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s1 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + line_stride..in_base + line_stride + line_len],
                        s1.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s2 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + 2 * decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + 2 * line_stride..in_base + 2 * line_stride + line_len],
                        s2.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s3 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + 3 * decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + 3 * line_stride..in_base + 3 * line_stride + line_len],
                        s3.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }
            }
            i += 4;
        }

        while i < full_lines {
            let in_start = i * line_stride;
            let in_end = in_start + line_len;
            let out_off = i * decoded_per_line;
            let out_slice =
                unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), decoded_per_line) };
            match BASE64_ENGINE.decode(&data[in_start..in_end], out_slice.as_out()) {
                Ok(_) => {}
                Err(_) => return Some(decode_error()),
            }
            i += 1;
        }
    }

    // Decode remainder
    if remainder_clean_len > 0 {
        let rem = if remainder.last() == Some(&b'\n') {
            &remainder[..remainder.len() - 1]
        } else {
            remainder
        };
        let out_off = full_lines * decoded_per_line;
        let out_slice =
            unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), remainder_clean_len) };
        match BASE64_ENGINE.decode(rem, out_slice.as_out()) {
            Ok(_) => {}
            Err(_) => return Some(decode_error()),
        }
    }

    // Single write_all for the entire decoded output
    Some(out.write_all(&out_buf[..total_decoded]))
}
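
// Worked offsets for the standard wrap=76 case (a sketch of the indexing above):
// line i starts at input offset i*77 (76 encoded bytes plus a newline) and decodes
// into output offsets [i*57, (i+1)*57), so threads can decode disjoint line ranges
// straight into the shared buffer. A 77,000-byte input (1000 full lines) decodes
// to exactly 57,000 bytes.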

/// Decode a clean (no whitespace) buffer in-place with SIMD.
fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    decode_inplace_with_padding(data, out)
}

/// Cold error path — keeps hot decode path tight by moving error construction out of line.
#[cold]
#[inline(never)]
fn decode_error() -> io::Result<()> {
    Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
}

/// Decode in-place with padding fallback for truncated input.
/// GNU base64 accepts missing padding at end of stream, so if decode fails
/// and the length mod 4 is 2 or 3, retry with padding added.
fn decode_inplace_with_padding(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
    match BASE64_ENGINE.decode_inplace(data) {
        Ok(decoded) => out.write_all(decoded),
        Err(_) => {
            let remainder = data.len() % 4;
            if remainder == 2 || remainder == 3 {
                let has_existing_padding = memchr::memchr(b'=', data).is_some();
                let mut padded = Vec::with_capacity(data.len() + (4 - remainder));
                padded.extend_from_slice(data);
                padded.extend(std::iter::repeat_n(b'=', 4 - remainder));
                if let Ok(decoded) = BASE64_ENGINE.decode_inplace(&mut padded) {
                    out.write_all(decoded)?;
                    if has_existing_padding {
                        return decode_error();
                    }
                    return Ok(());
                }
            }
            decode_error()
        }
    }
}
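
// Padding-fallback sketch (inputs are illustrative): b"QUJDRA" has length 6
// (6 % 4 == 2), so the first decode_inplace call fails; the retry appends "=="
// to form b"QUJDRA==", which decodes to b"ABCD" and is accepted, matching GNU
// base64's tolerance for missing end-of-stream padding. If the truncated input
// already contained '=', the decoded bytes are still written but an error is
// reported, mirroring GNU behavior.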

/// Decode clean base64 data (no whitespace) from a borrowed slice.
fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    // Parallel decode for large data: split at 4-byte boundaries,
    // decode each chunk independently (base64 is context-free per 4-char group).
    if data.len() >= PARALLEL_DECODE_THRESHOLD {
        return decode_borrowed_clean_parallel(out, data);
    }
    // If input has truncated padding, pad it first (GNU base64 accepts missing padding).
    let remainder = data.len() % 4;
    if remainder == 2 || remainder == 3 {
        // If input already has '=' but length mod 4 != 0, the padding is
        // wrong/truncated. GNU base64 still decodes but reports error.
        let has_existing_padding = memchr::memchr(b'=', data).is_some();
        let mut padded = Vec::with_capacity(data.len() + (4 - remainder));
        padded.extend_from_slice(data);
        padded.extend(std::iter::repeat_n(b'=', 4 - remainder));
        let result = decode_borrowed_clean(out, &padded);
        if has_existing_padding && result.is_ok() {
            return decode_error();
        }
        return result;
    }
    // Pre-allocate exact output size to avoid decode_to_vec's reallocation.
    // Decoded size = data.len() * 3 / 4 minus padding.
    let pad = data.iter().rev().take(2).filter(|&&b| b == b'=').count();
    let decoded_size = data.len() * 3 / 4 - pad;
    let mut buf: Vec<u8> = Vec::with_capacity(decoded_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(decoded_size);
    }
    match BASE64_ENGINE.decode(data, buf[..decoded_size].as_out()) {
        Ok(decoded) => {
            out.write_all(decoded)?;
            Ok(())
        }
        Err(_) => decode_error(),
    }
}
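
// Exact-size arithmetic used above (illustrative): b"aGVsbG8=" has length 8 with
// one trailing '=', so decoded_size = 8 * 3 / 4 - 1 = 5 and the output buffer
// holds exactly b"hello" with no reallocation.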

/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel.
/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
/// upfront, so each thread decodes directly to its final position. No compaction needed.
fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
    let chunk_size = ((raw_chunk + 3) / 4) * 4;

    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();

    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    offsets.push(0);
    let mut total_decoded = 0usize;
    for (i, chunk) in chunks.iter().enumerate() {
        let decoded_size = if i == chunks.len() - 1 {
            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
            chunk.len() * 3 / 4 - pad
        } else {
            chunk.len() * 3 / 4
        };
        total_decoded += decoded_size;
        offsets.push(total_decoded);
    }

    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output_buf.set_len(total_decoded);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output_buf);

    // Parallel decode: each thread decodes directly into its exact final position.
    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
    let out_addr = output_buf.as_mut_ptr() as usize;
    let err_flag = std::sync::atomic::AtomicBool::new(false);
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let offset = offsets[i];
            let expected_size = offsets[i + 1] - offset;
            let err_flag = &err_flag;
            s.spawn(move |_| {
                if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
                    return;
                }
                // SAFETY: each thread writes to non-overlapping region
                let out_slice = unsafe {
                    std::slice::from_raw_parts_mut((out_addr as *mut u8).add(offset), expected_size)
                };
                if BASE64_ENGINE.decode(chunk, out_slice.as_out()).is_err() {
                    err_flag.store(true, std::sync::atomic::Ordering::Relaxed);
                }
            });
        }
    });

    if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"));
    }

    out.write_all(&output_buf[..total_decoded])
}
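
// Chunking sketch for the parallel path above (sizes are illustrative): a clean
// 8 MiB input on 4 threads gives raw_chunk = 2 MiB, already a multiple of 4, so
// each chunk decodes independently to 1.5 MiB. Only the last chunk can carry '='
// padding, so only its decoded size subtracts the pad count; the prefix sums in
// `offsets` then give every thread its exact output position upfront.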

/// Strip non-base64 characters (for -i / --ignore-garbage).
fn strip_non_base64(data: &[u8]) -> Vec<u8> {
    data.iter()
        .copied()
        .filter(|&b| is_base64_char(b))
        .collect()
}

/// Check if a byte is a valid base64 alphabet character or padding.
#[inline]
fn is_base64_char(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'='
}

/// Stream-encode from a reader to a writer. Used for stdin processing.
/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
pub fn encode_stream(
    reader: &mut impl Read,
    wrap_col: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    if wrap_col == 0 {
        return encode_stream_nowrap(reader, writer);
    }
    encode_stream_wrapped(reader, wrap_col, writer)
}
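
// A minimal usage sketch for encode_stream, assuming the GNU default wrap column
// of 76. The helper name and the in-memory reader/writer are illustrative only.
#[allow(dead_code)]
fn example_encode_stream() -> io::Result<()> {
    let mut reader: &[u8] = b"hello";
    let mut encoded: Vec<u8> = Vec::new();
    encode_stream(&mut reader, 76, &mut encoded)?;
    assert_eq!(encoded.as_slice(), &b"aGVsbG8=\n"[..]);
    Ok(())
}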

/// Streaming encode with NO line wrapping — optimized fast path.
/// Read size is 24MB (divisible by 3): encoded output = 24MB * 4/3 = 32MB.
/// With 24MB reads, a typical 10-18MB input is consumed in a single read() call,
/// and the encoded output goes out in 1-2 write() calls.
fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    // 24MB read buffer, divisible by 3: each chunk encodes to whole 4-char groups,
    // so no partial group carries over between reads.
    const NOWRAP_READ: usize = 24 * 1024 * 1024; // exactly divisible by 3

    // SAFETY: buf bytes are written by read_full before being processed.
    // encode_buf bytes are written by encode before being read.
    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(NOWRAP_READ);
    }
    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        encode_buf.set_len(encode_buf_size);
    }

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let enc_len = BASE64_ENGINE.encoded_length(n);
        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
        writer.write_all(encoded)?;
    }
    Ok(())
}

/// Streaming encode WITH line wrapping.
/// For the common case (wrap_col divides evenly into 3-byte input groups),
/// uses fuse_wrap to build a contiguous output buffer with newlines interleaved,
/// then writes it in a single write() call. This eliminates the overhead of
/// many writev() syscalls (one per ~512 lines via IoSlice).
///
/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
fn encode_stream_wrapped(
    reader: &mut impl Read,
    wrap_col: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    let bytes_per_line = wrap_col * 3 / 4;
    // For the common case (76-col wrapping, bytes_per_line=57 which is divisible by 3),
    // align the read buffer to bytes_per_line boundaries so each chunk produces
    // complete lines with no column carry-over between chunks.
    if bytes_per_line > 0 && bytes_per_line.is_multiple_of(3) {
        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
    }

    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
    const STREAM_READ: usize = 12 * 1024 * 1024;
    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(STREAM_READ);
    }
    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        encode_buf.set_len(encode_buf_size);
    }

    let mut col = 0usize;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let enc_len = BASE64_ENGINE.encoded_length(n);
        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());

        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
    }

    if col > 0 {
        writer.write_all(b"\n")?;
    }

    Ok(())
}
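
// Dispatch sketch (wrap columns are illustrative): wrap_col = 76 gives
// bytes_per_line = 76 * 3 / 4 = 57, which is divisible by 3, so every chunk ends
// on a line boundary and the fused path below applies. wrap_col = 10 gives
// bytes_per_line = 7, which is not divisible by 3, so the IoSlice/writev fallback
// with column carry-over is used instead.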

/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
/// encode each line directly into its final position with newline appended.
/// Eliminates the two-pass encode-then-fuse_wrap approach.
/// For 76-col wrapping (bytes_per_line=57): 12MB / 57 = ~220K complete lines per chunk.
/// Output = 220K * 77 bytes = ~17MB, one write() syscall per chunk.
fn encode_stream_wrapped_fused(
    reader: &mut impl Read,
    wrap_col: usize,
    bytes_per_line: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Align read size to bytes_per_line for complete output lines per chunk.
    // ~220K lines * 57 bytes = ~12MB input, ~17MB output per chunk.
    // Smaller chunks have better cache behavior for the 4-line unrolled encode loop.
    let lines_per_chunk = (12 * 1024 * 1024) / bytes_per_line;
    let read_size = lines_per_chunk * bytes_per_line;
    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline

    // SAFETY: buf bytes are written by read_full before being processed.
    // out_buf bytes are written by encode before being read.
    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(read_size);
    }
    // Output buffer: enough for all lines + remainder
    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(max_output);
    }

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let full_lines = n / bytes_per_line;
        let remainder = n % bytes_per_line;

        // Encode each input line directly into its final output position.
        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
        // This eliminates the separate encode + fuse_wrap copy entirely.
        let dst = out_buf.as_mut_ptr();
        let mut line_idx = 0;

        // 4-line unrolled loop for better ILP
        while line_idx + 4 <= full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
                *dst.add(out_base + wrap_col) = b'\n';

                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                    s1.as_out(),
                );
                *dst.add(out_base + line_out + wrap_col) = b'\n';

                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                    s2.as_out(),
                );
                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                    s3.as_out(),
                );
                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
            }
            line_idx += 4;
        }

        // Remaining full lines
        while line_idx < full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
                *dst.add(out_base + wrap_col) = b'\n';
            }
            line_idx += 1;
        }

        let mut wp = full_lines * line_out;

        // Handle remainder (partial last line of this chunk)
        if remainder > 0 {
            let enc_len = BASE64_ENGINE.encoded_length(remainder);
            let line_input = &buf[full_lines * bytes_per_line..n];
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
                *dst.add(wp + enc_len) = b'\n';
            }
            wp += enc_len + 1;
        }

        writer.write_all(&out_buf[..wp])?;
    }

    Ok(())
}

/// Stream-decode from a reader to a writer. Used for stdin processing.
/// In-place strip + decode: read chunk -> strip whitespace in-place in read buffer
/// -> decode in-place -> write. Eliminates separate clean buffer allocation (saves 32MB).
/// Uses 32MB read buffer for maximum pipe throughput — read_full retries to
/// fill the entire buffer from the pipe, and 32MB means even large inputs
/// (up to ~24MB after base64 encoding of 18MB raw) are read in a single syscall batch.
pub fn decode_stream(
    reader: &mut impl Read,
    ignore_garbage: bool,
    writer: &mut impl Write,
) -> io::Result<()> {
    const READ_CHUNK: usize = 32 * 1024 * 1024;
    // SAFETY: buf bytes are written by read_full before being processed.
    // The extra 4 bytes accommodate carry-over from previous chunk.
    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(READ_CHUNK + 4);
    }
    let mut carry = [0u8; 4];
    let mut carry_len = 0usize;

    loop {
        // Copy carry bytes to start of buffer, read new data after them
        if carry_len > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
            }
        }
        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
        if n == 0 {
            break;
        }
        let total_raw = carry_len + n;

        // Strip whitespace in-place in the buffer itself.
        // This eliminates the separate clean buffer allocation (saves 32MB).
        let clean_len = if ignore_garbage {
            // Scalar filter for ignore_garbage mode (rare path)
            let ptr = buf.as_mut_ptr();
            let mut wp = 0usize;
            for i in 0..total_raw {
                let b = unsafe { *ptr.add(i) };
                if is_base64_char(b) {
                    unsafe { *ptr.add(wp) = b };
                    wp += 1;
                }
            }
            wp
        } else {
            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
            // so we process ~76 bytes per memchr hit.
            let ptr = buf.as_mut_ptr();
            let data = &buf[..total_raw];
            let mut wp = 0usize;
            let mut gap_start = 0usize;
            let mut has_rare_ws = false;

            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
                let gap_len = pos - gap_start;
                if gap_len > 0 {
                    if !has_rare_ws {
                        has_rare_ws = data[gap_start..pos]
                            .iter()
                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                    }
                    if wp != gap_start {
                        unsafe {
                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                        }
                    }
                    wp += gap_len;
                }
                gap_start = pos + 1;
            }
            let tail_len = total_raw - gap_start;
            if tail_len > 0 {
                if !has_rare_ws {
                    has_rare_ws = data[gap_start..total_raw]
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                }
                if wp != gap_start {
                    unsafe {
                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
                    }
                }
                wp += tail_len;
            }

            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
            if has_rare_ws {
                let mut rp = 0;
                let mut cwp = 0;
                while rp < wp {
                    let b = unsafe { *ptr.add(rp) };
                    if NOT_WHITESPACE[b as usize] {
                        unsafe { *ptr.add(cwp) = b };
                        cwp += 1;
                    }
                    rp += 1;
                }
                cwp
            } else {
                wp
            }
        };

        carry_len = 0;
        let is_last = n < READ_CHUNK;

        if is_last {
            // Last chunk: decode everything (including padding)
            decode_clean_slice(&mut buf[..clean_len], writer)?;
        } else {
            // Save incomplete base64 quadruplet for next iteration
            let decode_len = (clean_len / 4) * 4;
            let leftover = clean_len - decode_len;
            if leftover > 0 {
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        buf.as_ptr().add(decode_len),
                        carry.as_mut_ptr(),
                        leftover,
                    );
                }
                carry_len = leftover;
            }
            if decode_len > 0 {
                decode_clean_slice(&mut buf[..decode_len], writer)?;
            }
        }
    }

    // Handle any remaining carry-over bytes
    if carry_len > 0 {
        let mut carry_buf = carry[..carry_len].to_vec();
        decode_clean_slice(&mut carry_buf, writer)?;
    }

    Ok(())
}
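
// A minimal usage sketch for decode_stream, assuming line-wrapped input arriving
// through any Read source (a byte slice stands in for stdin here). The helper
// name and literals are illustrative only.
#[allow(dead_code)]
fn example_decode_stream() -> io::Result<()> {
    let mut reader: &[u8] = b"aGVsbG8s\nIHdvcmxkIQ==\n";
    let mut decoded: Vec<u8> = Vec::new();
    decode_stream(&mut reader, /* ignore_garbage */ false, &mut decoded)?;
    assert_eq!(decoded.as_slice(), &b"hello, world!"[..]);
    Ok(())
}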

/// Write all IoSlice entries using write_vectored (writev syscall).
/// Hot path: single write_vectored succeeds fully (common on Linux pipes/files).
/// Cold path: partial write handled out-of-line to keep hot path tight.
#[inline(always)]
fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    let total: usize = slices.iter().map(|s| s.len()).sum();
    let written = out.write_vectored(slices)?;
    if written >= total {
        return Ok(());
    }
    if written == 0 {
        return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
    }
    write_all_vectored_slow(out, slices, written)
}

/// Handle partial write (cold path, never inlined).
#[cold]
#[inline(never)]
fn write_all_vectored_slow(
    out: &mut impl Write,
    slices: &[io::IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for slice in slices {
        let len = slice.len();
        if skip >= len {
            skip -= len;
            continue;
        }
        out.write_all(&slice[skip..])?;
        skip = 0;
    }
    Ok(())
}

/// Read as many bytes as possible into buf, retrying on partial reads.
/// Fast path: regular file reads usually return the full buffer on the first call,
/// avoiding the loop overhead entirely.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    // Fast path: first read() usually fills the entire buffer for regular files
    let n = reader.read(buf)?;
    if n == buf.len() || n == 0 {
        return Ok(n);
    }
    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
    let mut total = n;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}