// coreutils_rs/base64/core.rs — base64 encoding core.
use std::io::{self, Read, Write};

use base64_simd::AsOut;
4
/// Shared SIMD base64 engine (base64_simd::STANDARD — the standard alphabet),
/// referenced by every encode path in this module.
const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;
6
/// Best-effort count of available CPUs, used to size parallel chunk splits.
///
/// Queries `std::thread::available_parallelism()` directly rather than asking
/// rayon, so merely sizing the work does not force the rayon pool to
/// initialize early (~300-500µs; the pool inits on the first `scope()` call).
/// Falls back to 1 when the count cannot be determined.
#[inline]
fn num_cpus() -> usize {
    std::thread::available_parallelism().map_or(1, |n| n.get())
}
16
/// Chunk size for the sequential no-wrap encoder: just under 8 MiB, rounded
/// down to a multiple of 3 so every chunk encodes without '=' padding.
/// Large chunks keep per-iteration call overhead low while capping the reused
/// output buffer at roughly 10.7 MB.
const NOWRAP_CHUNK: usize = (8 * 1024 * 1024 / 3) * 3;

/// Inputs of at least 16 MiB take the parallel no-wrap path.
/// On a single-file CLI run the rayon pool is cold (~200-500µs to spin up),
/// so below this size sequential encoding wins: pool init plus dispatch costs
/// more than the parallel speedup. Multi-file callers amortize the init and
/// would profit from a lower cutoff (~2MB); this value is tuned for the
/// single-file case.
const PARALLEL_NOWRAP_THRESHOLD: usize = 16 * 1024 * 1024;

/// Inputs of at least 12 MiB take the parallel wrapped path; same cold-pool
/// trade-off as PARALLEL_NOWRAP_THRESHOLD. Below it, the sequential
/// encode_wrapped_expand path (bulk encode + backward newline expansion)
/// avoids the per-group overhead of L1-scatter chunking.
const PARALLEL_WRAPPED_THRESHOLD: usize = 12 * 1024 * 1024;

/// Base64 inputs of at least 1 MiB take the parallel decode path.
/// Decoding is heavier per byte than encoding, and by decode time the rayon
/// pool is typically warm (~10µs dispatch), so parallelism pays off much
/// earlier than on the encode side.
const PARALLEL_DECODE_THRESHOLD: usize = 1024 * 1024;
41
42/// Hint HUGEPAGE for large output buffers on Linux.
43/// MADV_HUGEPAGE tells kernel to use 2MB pages, reducing TLB misses
44/// and minor fault count for large allocations (~25,600 → ~50 for 100MB).
45#[cfg(target_os = "linux")]
46fn hint_hugepage(buf: &mut Vec<u8>) {
47    if buf.capacity() >= 2 * 1024 * 1024 {
48        unsafe {
49            libc::madvise(
50                buf.as_mut_ptr() as *mut libc::c_void,
51                buf.capacity(),
52                libc::MADV_HUGEPAGE,
53            );
54        }
55    }
56}
57
58/// Encode data and write to output with line wrapping.
59/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
60pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
61    if data.is_empty() {
62        return Ok(());
63    }
64
65    if wrap_col == 0 {
66        return encode_no_wrap(data, out);
67    }
68
69    encode_wrapped(data, wrap_col, out)
70}
71
72/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
73fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
74    if data.len() >= PARALLEL_NOWRAP_THRESHOLD && num_cpus() > 1 {
75        return encode_no_wrap_parallel(data, out);
76    }
77
78    // Single-buffer encode: for data that fits in one chunk, encode directly
79    // and write once. For larger data, reuse the buffer across chunks.
80    let enc_len = BASE64_ENGINE.encoded_length(data.len().min(NOWRAP_CHUNK));
81    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
82    #[allow(clippy::uninit_vec)]
83    unsafe {
84        buf.set_len(enc_len);
85    }
86
87    for chunk in data.chunks(NOWRAP_CHUNK) {
88        let clen = BASE64_ENGINE.encoded_length(chunk.len());
89        let encoded = BASE64_ENGINE.encode(chunk, buf[..clen].as_out());
90        out.write_all(encoded)?;
91    }
92    Ok(())
93}
94
/// Parallel no-wrap encoding into a single shared output buffer.
/// Split at 3-byte boundaries, pre-calculate output offsets, encode in parallel.
/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
/// Single allocation + single write_all instead of N allocations + writev.
fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 3 bytes so each chunk encodes without padding (except the last)
    let chunk_size = ((raw_chunk + 2) / 3) * 3;

    // Split input into 3-byte-aligned chunks. `.max(3)` guards the
    // data.len() < num_threads case where chunk_size would be 0
    // (slice::chunks panics on a zero chunk size).
    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();

    // Pre-calculate output offsets: chunk i's encoded bytes start at
    // offsets[i]. Non-final chunks are multiples of 3, so their encoded
    // length is exact — no '=' padding appears mid-stream.
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    let mut total_out = 0usize;
    for chunk in &chunks {
        offsets.push(total_out);
        total_out += BASE64_ENGINE.encoded_length(chunk.len());
    }

    // Single allocation for all threads.
    // NOTE(review): set_len over uninitialized capacity relies on every byte
    // of output[..total_out] being overwritten by encode() below; the regions
    // are disjoint and fully covered, but this is the clippy::uninit_vec
    // pattern — worth a miri pass if this code is touched.
    let mut output: Vec<u8> = Vec::with_capacity(total_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(total_out);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output);

    // Parallel encode: each thread writes into its pre-assigned region.
    // The base pointer is passed as usize so the spawned closures are Send;
    // this is sound only because the [out_off, out_off + enc_len) ranges are
    // pairwise disjoint and `output` outlives the scope.
    let output_base = output.as_mut_ptr() as usize;
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let out_off = offsets[i];
            let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
            let base = output_base;
            s.spawn(move |_| {
                // SAFETY: disjoint sub-range of `output`, valid for the whole
                // scope (rayon::scope joins all spawned tasks before returning).
                let dest =
                    unsafe { std::slice::from_raw_parts_mut((base + out_off) as *mut u8, enc_len) };
                let _ = BASE64_ENGINE.encode(chunk, dest.as_out());
            });
        }
    });

    out.write_all(&output[..total_out])
}
142
143/// Encode with line wrapping using forward scatter from L1-cached temp buffer.
144/// Encodes groups of lines into a small temp buffer (fits in L1 cache), then
145/// scatter-copies wrap_col-byte chunks from temp to output with newlines.
146///
147/// This is faster than bulk encode + backward expansion because:
148/// - Temp buffer reads hit L1 cache (essentially free bandwidth)
149/// - Output buffer is written once (no double-write from backward memmove)
150/// - Forward access pattern is prefetcher-friendly
151fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
152    let bytes_per_line = wrap_col * 3 / 4;
153    if bytes_per_line == 0 {
154        return encode_wrapped_small(data, wrap_col, out);
155    }
156
157    if data.len() >= PARALLEL_WRAPPED_THRESHOLD && bytes_per_line.is_multiple_of(3) {
158        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
159    }
160
161    if bytes_per_line.is_multiple_of(3) {
162        return encode_wrapped_expand(data, wrap_col, bytes_per_line, out);
163    }
164
165    // Fallback for non-3-aligned bytes_per_line: use fuse_wrap approach
166    let enc_max = BASE64_ENGINE.encoded_length(data.len());
167    let num_full = enc_max / wrap_col;
168    let rem = enc_max % wrap_col;
169    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };
170
171    // Encode full data, then fuse with newlines
172    let mut enc_buf: Vec<u8> = Vec::with_capacity(enc_max);
173    #[allow(clippy::uninit_vec)]
174    unsafe {
175        enc_buf.set_len(enc_max);
176    }
177    let _ = BASE64_ENGINE.encode(data, enc_buf[..enc_max].as_out());
178
179    let mut out_buf: Vec<u8> = Vec::with_capacity(out_len);
180    #[allow(clippy::uninit_vec)]
181    unsafe {
182        out_buf.set_len(out_len);
183    }
184    let n = fuse_wrap(&enc_buf, wrap_col, &mut out_buf);
185    out.write_all(&out_buf[..n])
186}
187
/// Encode with backward expansion: single contiguous SIMD encode, then expand
/// in-place to insert newlines. The encode is done in one call (no chunking),
/// which eliminates per-group function call overhead from L1-scatter.
/// The backward expansion only shifts data by ~1.3% (1 byte per 76 for wrap_col=76),
/// and for most lines the shift exceeds wrap_col so memmove uses the fast memcpy path.
///
/// `bytes_per_line` is only checked by the debug assertion; the wrapping
/// itself is driven purely by wrap_col positions in the encoded text.
fn encode_wrapped_expand(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    debug_assert!(bytes_per_line.is_multiple_of(3));
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    if enc_len == 0 {
        return Ok(());
    }

    // Wrapped size: one '\n' per full line, plus '\n' after a partial tail.
    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Single allocation: encode into first enc_len bytes, expand backward to out_len.
    // SAFETY: buf[..enc_len] is initialized by BASE64_ENGINE.encode below.
    // buf[enc_len..out_len] is written by expand_backward before write_all reads it.
    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_len);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut buf);

    // One SIMD encode call for the entire input (no chunking overhead)
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_len].as_out());
    debug_assert_eq!(encoded.len(), enc_len, "encode wrote unexpected length");

    // Expand backward to insert newlines — shifts only ~1.3% of data
    expand_backward(buf.as_mut_ptr(), enc_len, out_len, wrap_col);

    out.write_all(&buf[..out_len])
}
229
/// L1-scatter encode: encode groups of lines into a small L1-cached temp buffer,
/// then scatter-copy each line to its final position in the output buffer with
/// newline insertion. Each output byte is written exactly once — no read-back
/// from main memory, halving memory traffic vs backward expansion.
///
/// Temp buffer (~20KB for 256 lines × 76 chars) stays hot in L1 cache, so
/// reads during scatter are essentially free. Output buffer is streamed out
/// with sequential writes that the prefetcher can handle efficiently.
///
/// Uses a full output buffer for vmsplice safety: vmsplice maps user pages
/// into the pipe buffer, so the buffer must stay valid until the reader consumes.
///
/// NOTE(review): every non-final group must encode to a whole number of
/// wrap_col lines (clen % wrap_col == 0, which holds when wrap_col % 4 == 0);
/// otherwise the `chunk_rem` branch fires mid-stream, emitting a spurious
/// short line and pushing `wp` past the pre-computed `out_len`. Confirm the
/// caller's gating before reviving this dead code.
#[allow(dead_code)]
fn encode_wrapped_scatter(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    if enc_len == 0 {
        return Ok(());
    }

    // Wrapped size: one '\n' per full line, plus '\n' after a partial tail.
    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Output buffer — written once via scatter, then write_all to output
    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_len);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut buf);

    // L1-cached temp buffer for encoding groups of lines.
    // 256 lines × 76 chars = 19,456 bytes — fits comfortably in L1 (32-64KB).
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1; // wrap_col data bytes + trailing '\n'
    let mut wp = 0usize; // write position in output buffer

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        // Scatter-copy full lines from temp to output with newlines
        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter for ILP
        let mut i = 0;
        while i + 8 <= lines {
            // SAFETY: src spans 8 full lines inside temp[..clen]; dst spans
            // 8 * line_out bytes inside buf[..out_len]; temp and buf are
            // distinct allocations, so copy_nonoverlapping is sound.
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = buf.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        // Remaining full lines
        while i < lines {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        // Partial last line (only on final chunk)
        if chunk_rem > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *buf.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }

    out.write_all(&buf[..wp])
}
367
/// Copy `count` encoded lines of `wrap_col` bytes each from `temp` into
/// `buf`, appending a '\n' after every line. Line `i` of `temp` lands at
/// output slot `line_start + i`, each slot being `line_out` bytes wide.
/// `temp` and `buf` are separate buffers, so the copies never overlap.
#[inline]
#[allow(dead_code)]
fn scatter_lines(
    temp: &[u8],
    buf: &mut [u8],
    line_start: usize,
    count: usize,
    wrap_col: usize,
    line_out: usize,
) {
    for line in 0..count {
        let src = &temp[line * wrap_col..(line + 1) * wrap_col];
        let dst = (line_start + line) * line_out;
        buf[dst..dst + wrap_col].copy_from_slice(src);
        buf[dst + wrap_col] = b'\n';
    }
}
391
/// Expand encoded data in-place by inserting newlines at wrap_col boundaries.
/// buf[0..enc_len] contains contiguous encoded data; buf has capacity for out_len.
/// After expansion, buf[0..out_len] contains wrapped output with newlines.
///
/// Processes backward so shifted data never overwrites unread source data.
/// For wrap_col=76: shift is ~1.3% (1 byte per 76), so most copies are
/// non-overlapping and the memmove fast-path (memcpy) is used.
///
/// Caller contract: `ptr` must be valid for writes of `out_len` bytes with
/// the first `enc_len` bytes initialized, and `out_len` must equal enc_len
/// plus one byte per (full or partial) wrap_col line — the pointer walk below
/// relies on rp and wp meeting exactly at 0.
#[inline]
fn expand_backward(ptr: *mut u8, enc_len: usize, out_len: usize, wrap_col: usize) {
    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;

    unsafe {
        // rp reads from the packed encoded region, wp writes the wrapped
        // layout; both walk down to 0 together.
        let mut rp = enc_len;
        let mut wp = out_len;

        // Handle partial last line (remainder)
        if rem > 0 {
            wp -= 1;
            *ptr.add(wp) = b'\n';
            wp -= rem;
            rp -= rem;
            if rp != wp {
                // Overlap is possible — ptr::copy is memmove.
                std::ptr::copy(ptr.add(rp), ptr.add(wp), rem);
            }
        }

        // Process full lines backward, 8 per iteration for ILP.
        let mut lines_left = num_full;
        while lines_left >= 8 {
            // Unrolled: 8 lines per iteration (each stanza: place '\n',
            // then shift one wrap_col-sized line down by the running offset)
            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            lines_left -= 8;
        }

        // Remaining lines (0-7); near the start rp may equal wp, in which
        // case the shift has shrunk to zero and the copy can be skipped.
        while lines_left > 0 {
            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            if rp != wp {
                std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
            }
            lines_left -= 1;
        }
    }
}
487
/// Single newline byte; the writev paths build `IoSlice`s borrowing from this
/// static so no per-line newline buffer is ever allocated.
static NEWLINE: [u8; 1] = [b'\n'];
490
/// Write encoded base64 data with line wrapping.
///
/// Small outputs (≤ MAX_IOV entries) are written zero-copy via a single
/// write_vectored (writev) call: IoSlice entries pointing at wrap_col-sized
/// segments of the encoded buffer, interleaved with newline IoSlices.
/// Larger outputs are NOT zero-copy — they are fused batch-by-batch into a
/// small reusable buffer and written with plain write_all (see below for why
/// that beats a 1024-entry writev).
#[inline]
#[allow(dead_code)]
fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
    // Each line needs 2 entries (data + newline), so 512 lines per batch.
    const MAX_IOV: usize = 1024;

    let num_full_lines = encoded.len() / wrap_col;
    let remainder = encoded.len() % wrap_col;
    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };

    // Small output: build all IoSlices and write in one call
    if total_iov <= MAX_IOV {
        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
        let mut pos = 0;
        for _ in 0..num_full_lines {
            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
            iov.push(io::IoSlice::new(&NEWLINE));
            pos += wrap_col;
        }
        if remainder > 0 {
            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
            iov.push(io::IoSlice::new(&NEWLINE));
        }
        return write_all_vectored(out, &iov);
    }

    // Large output: fuse batches of lines into a reusable L1-cached buffer.
    // Each batch copies ~39KB (512 lines × 77 bytes) from the encoded buffer
    // with newlines inserted, then writes as a single contiguous write(2).
    // This is faster than writev with 1024 IoSlice entries because:
    // - One kernel memcpy per batch vs 1024 separate copies
    // - Fused buffer (39KB) stays hot in L1 cache across batches
    let line_out = wrap_col + 1;
    const BATCH_LINES: usize = 512;
    let batch_fused_size = BATCH_LINES * line_out;
    // SAFETY: fuse_wrap fully writes fused[..n] before each read in write_all.
    let mut fused: Vec<u8> = Vec::with_capacity(batch_fused_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        fused.set_len(batch_fused_size);
    }

    let mut rp = 0; // read position into `encoded`
    let mut lines_done = 0;

    // Process full batches using 8-line unrolled fuse_wrap
    while lines_done + BATCH_LINES <= num_full_lines {
        let n = fuse_wrap(
            &encoded[rp..rp + BATCH_LINES * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += BATCH_LINES * wrap_col;
        lines_done += BATCH_LINES;
    }

    // Remaining full lines (partial batch)
    let remaining_lines = num_full_lines - lines_done;
    if remaining_lines > 0 {
        let n = fuse_wrap(
            &encoded[rp..rp + remaining_lines * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += remaining_lines * wrap_col;
    }

    // Partial last line
    if remainder > 0 {
        out.write_all(&encoded[rp..rp + remainder])?;
        out.write_all(b"\n")?;
    }
    Ok(())
}
571
572/// Write encoded base64 data with line wrapping using writev, tracking column state
573/// across calls. Used by encode_stream for piped input where chunks don't align
574/// to line boundaries.
575#[inline]
576fn write_wrapped_iov_streaming(
577    encoded: &[u8],
578    wrap_col: usize,
579    col: &mut usize,
580    out: &mut impl Write,
581) -> io::Result<()> {
582    const MAX_IOV: usize = 1024;
583    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
584    let mut rp = 0;
585
586    while rp < encoded.len() {
587        let space = wrap_col - *col;
588        let avail = encoded.len() - rp;
589
590        if avail <= space {
591            // Remaining data fits in current line
592            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
593            *col += avail;
594            if *col == wrap_col {
595                iov.push(io::IoSlice::new(&NEWLINE));
596                *col = 0;
597            }
598            break;
599        } else {
600            // Fill current line and add newline
601            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
602            iov.push(io::IoSlice::new(&NEWLINE));
603            rp += space;
604            *col = 0;
605        }
606
607        if iov.len() >= MAX_IOV - 1 {
608            write_all_vectored(out, &iov)?;
609            iov.clear();
610        }
611    }
612
613    if !iov.is_empty() {
614        write_all_vectored(out, &iov)?;
615    }
616    Ok(())
617}
618
/// Parallel wrapped encoding with L1-scatter into a single shared output buffer.
/// Pre-calculates each thread's output offset, allocates one buffer for all threads,
/// and has each thread encode directly into its pre-assigned non-overlapping region.
/// This saves N-1 buffer allocations and corresponding page faults vs per-thread Vecs,
/// and uses a single write_all instead of writev.
///
/// NOTE(review): splitting at bytes_per_line boundaries only preserves the
/// wrapped layout when each non-final chunk's encoded length is a whole
/// multiple of wrap_col — true when wrap_col % 4 == 0, but NOT for every
/// bytes_per_line % 3 == 0 (e.g. wrap_col = 13 → bytes_per_line = 9 encodes
/// 12-char "lines" against a 13-char wrap). In that case per-chunk output
/// sizes are mis-estimated and the scatter overruns its region — verify the
/// caller gates this path on wrap_col % 4 == 0.
fn encode_wrapped_parallel(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let lines_per_chunk = ((data.len() / bytes_per_line) / num_threads).max(1);
    let chunk_input = lines_per_chunk * bytes_per_line;

    // Split input at bytes_per_line boundaries (last chunk may have remainder)
    let chunks: Vec<&[u8]> = data.chunks(chunk_input.max(bytes_per_line)).collect();

    // Pre-calculate output offsets for each chunk (wrapped size: one '\n'
    // per full line plus one after a partial trailing line).
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    let mut total_out = 0usize;
    for chunk in &chunks {
        offsets.push(total_out);
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let full_lines = enc_len / wrap_col;
        let remainder = enc_len % wrap_col;
        total_out += full_lines * (wrap_col + 1) + if remainder > 0 { remainder + 1 } else { 0 };
    }

    // Single allocation for all threads; every byte of output[..total_out]
    // is overwritten by the scatter before write_all reads it.
    let mut output: Vec<u8> = Vec::with_capacity(total_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(total_out);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output);

    // Parallel encode: each thread writes into its pre-assigned region.
    // The base pointer travels as usize so the closures are Send; sound only
    // because the [out_off, out_end) ranges are disjoint and `output`
    // outlives the scope (rayon::scope joins before returning).
    let output_base = output.as_mut_ptr() as usize;
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let out_off = offsets[i];
            let out_end = if i + 1 < offsets.len() {
                offsets[i + 1]
            } else {
                total_out
            };
            let out_size = out_end - out_off;
            let base = output_base;
            s.spawn(move |_| {
                // SAFETY: disjoint sub-slice of `output`, valid for the scope.
                let out_slice = unsafe {
                    std::slice::from_raw_parts_mut((base + out_off) as *mut u8, out_size)
                };
                encode_chunk_l1_scatter_into(chunk, out_slice, wrap_col, bytes_per_line);
            });
        }
    });

    out.write_all(&output[..total_out])
}
680
/// Encode a chunk using L1-scatter, writing into a pre-allocated output slice.
/// Encodes groups of 256 lines into L1-cached temp buffer, scatter-copy to output with newlines.
/// The output slice must be large enough to hold the encoded+wrapped output.
///
/// NOTE(review): additionally requires every non-final 256-line group to
/// encode to a whole number of wrap_col lines (clen % wrap_col == 0, which
/// holds when wrap_col % 4 == 0). Otherwise `chunk_rem` fires on non-final
/// groups, inserting extra newlines and pushing `wp` beyond the caller's
/// pre-sized `output` slice — confirm call-site gating.
fn encode_chunk_l1_scatter_into(
    data: &[u8],
    output: &mut [u8],
    wrap_col: usize,
    bytes_per_line: usize,
) {
    // 256 lines × 76 chars ≈ 19.5KB of temp — stays hot in L1.
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1; // wrap_col data bytes + trailing '\n'
    let mut wp = 0usize; // write position within `output`

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter
        let mut i = 0;
        while i + 8 <= lines {
            // SAFETY: src spans 8 full lines of temp[..clen]; dst spans
            // 8 * line_out bytes of `output`; distinct buffers, so
            // copy_nonoverlapping is sound given the size precondition above.
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = output.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        // Remaining full lines
        while i < lines {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    output.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *output.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        // Partial trailing line (expected only on the final group)
        if chunk_rem > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    output.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *output.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }
}
784
/// Interleave newline separators into already-encoded base64 output.
/// Copies `wrap_col`-byte lines from `encoded` into `out_buf`, appending a
/// single `\n` after each line (including after a trailing partial line).
/// Lines are processed in batches of 8, then 4, then 1; each batch body is
/// a constant-bound loop the optimizer fully unrolls, matching a manually
/// unrolled implementation. Returns the number of bytes written.
///
/// Caller contract: `wrap_col` must be non-zero and `out_buf` must hold
/// every data byte plus one newline per (possibly partial) line.
#[inline]
fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
    let line_out = wrap_col + 1; // bytes emitted per full line (data + '\n')
    let total = encoded.len();
    let mut rp = 0usize; // read cursor into `encoded`
    let mut wp = 0usize; // write cursor into `out_buf`

    // Batched copies: drain as many 8-line groups as possible, then 4-line
    // groups, then single full lines. The inner `0..batch` loops have
    // compile-time-constant trip counts after unrolling of this outer
    // 3-element array, preserving the ILP of the hand-unrolled original.
    for &batch in &[8usize, 4, 1] {
        while rp + batch * wrap_col <= total {
            // SAFETY: the loop condition guarantees `rp + batch * wrap_col`
            // bytes exist at `src`; the caller sizes `out_buf` for all data
            // bytes plus one newline per line, so every store through `dst`
            // lands in bounds.
            unsafe {
                let src = encoded.as_ptr().add(rp);
                let dst = out_buf.as_mut_ptr().add(wp);
                for k in 0..batch {
                    std::ptr::copy_nonoverlapping(
                        src.add(k * wrap_col),
                        dst.add(k * line_out),
                        wrap_col,
                    );
                    *dst.add(k * line_out + wrap_col) = b'\n';
                }
            }
            rp += batch * wrap_col;
            wp += batch * line_out;
        }
    }

    // Trailing partial line: copy leftover bytes, then terminate with '\n'.
    if rp < total {
        let remaining = total - rp;
        // SAFETY: exactly `remaining` unread bytes exist at `encoded[rp..]`,
        // and the caller-provided `out_buf` accommodates them (the newline
        // store below is bounds-checked).
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                remaining,
            );
        }
        wp += remaining;
        out_buf[wp] = b'\n';
        wp += 1;
    }

    wp
}
881
882/// Fallback for very small wrap columns (< 4 chars).
883fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
884    let enc_max = BASE64_ENGINE.encoded_length(data.len());
885    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
886    #[allow(clippy::uninit_vec)]
887    unsafe {
888        buf.set_len(enc_max);
889    }
890    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());
891
892    let wc = wrap_col.max(1);
893    for line in encoded.chunks(wc) {
894        out.write_all(line)?;
895        out.write_all(b"\n")?;
896    }
897    Ok(())
898}
899
900/// Decode base64 data and write to output (borrows data, allocates clean buffer).
901/// When `ignore_garbage` is true, strip all non-base64 characters.
902/// When false, only strip whitespace (standard behavior).
903pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
904    if data.is_empty() {
905        return Ok(());
906    }
907
908    if ignore_garbage {
909        let mut cleaned = strip_non_base64(data);
910        return decode_clean_slice(&mut cleaned, out);
911    }
912
913    // For large data (>= 512KB): use bulk strip + single-shot decode.
914    // try_line_decode decodes per-line (~25ns overhead per 76-byte line call),
915    // while strip+decode uses SIMD gap-copy + single-shot SIMD decode at ~6.5 GB/s.
916    // For 10MB decode benchmark: ~2ms (bulk) vs ~4ms (per-line) = 2x faster.
917    // For small data (< 512KB): per-line decode avoids allocation overhead.
918    if data.len() < 512 * 1024 && data.len() >= 77 {
919        if let Some(result) = try_line_decode(data, out) {
920            return result;
921        }
922    }
923
924    // Fast path: single-pass SIMD strip + decode
925    decode_stripping_whitespace(data, out)
926}
927
/// Decode base64 from a mutable buffer (MAP_PRIVATE mmap or owned Vec).
/// Strips whitespace in-place using SIMD memchr2 gap-copy, then decodes
/// in-place with base64_simd::decode_inplace. Zero additional allocations.
///
/// For MAP_PRIVATE mmap: the kernel uses COW semantics, so only pages
/// containing whitespace (newlines) get physically copied (~1.3% for
/// 76-char line base64). The decode writes to the same buffer, but decoded
/// data is always shorter than encoded (3/4 ratio), so it fits in-place.
///
/// # Errors
/// Propagates writer I/O errors; the decode helpers return an invalid-data
/// error when the stripped payload is not valid base64.
pub fn decode_mmap_inplace(
    data: &mut [u8],
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    // Empty input decodes to empty output.
    if data.is_empty() {
        return Ok(());
    }

    // For small data: try line-by-line decode (avoids COW page faults).
    // For large data (>= 512KB): bulk strip+decode is faster than per-line decode.
    // (77 = one standard 76-char line plus its newline — the smallest input
    // that can exhibit line structure.)
    if !ignore_garbage && data.len() >= 77 && data.len() < 512 * 1024 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    if ignore_garbage {
        // Strip non-base64 chars in-place with a scalar read/write cursor pair.
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            // SAFETY: rp < len, so the read is in bounds.
            let b = unsafe { *ptr.add(rp) };
            if is_base64_char(b) {
                // SAFETY: wp <= rp < len, so the write is in bounds and never
                // clobbers a byte that has not been read yet.
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        let r = decode_inplace_with_padding(&mut data[..wp], out);
        return r;
    }

    // Fast path: uniform-line fused strip+decode (no intermediate buffer).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Fallback: strip whitespace in-place using SIMD memchr2 gap-copy.

    // Quick check: no newlines at all — maybe already clean
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // Check for rare whitespace (space, tab, VT, FF)
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            // Perfectly clean — decode in-place directly
            return decode_inplace_with_padding(data, out);
        }
        // Rare whitespace only — strip in-place (scalar cursor-pair compaction)
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            // SAFETY: rp < len for the read; wp <= rp keeps the write in
            // bounds and behind the read cursor.
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        return decode_inplace_with_padding(&mut data[..wp], out);
    }

    // SIMD gap-copy: strip \n and \r in-place using memchr2
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    // SAFETY: memchr2_iter reads from the original data. We write to positions
    // [0..wp] which are always <= gap_start, so we never overwrite unread data.
    // ptr::copy (memmove) is used because source and destination can overlap.
    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during the gap-copy
                // (amortized ~1 branch per 77 bytes on typical input).
                has_rare_ws = unsafe {
                    std::slice::from_raw_parts(ptr.add(gap_start), gap_len)
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
                };
            }
            if wp != gap_start {
                unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len) };
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Final gap: everything after the last \n/\r.
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            // SAFETY: gap_start + tail_len == len, so the view is in bounds.
            has_rare_ws = unsafe {
                std::slice::from_raw_parts(ptr.add(gap_start), tail_len)
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
            };
        }
        if wp != gap_start {
            // SAFETY: wp < gap_start here; overlapping ranges handled by ptr::copy.
            unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len) };
        }
        wp += tail_len;
    }

    // Second pass for rare whitespace if needed (skipped for typical input)
    if has_rare_ws {
        let mut rp = 0;
        let mut cwp = 0;
        while rp < wp {
            // SAFETY: rp < wp <= len for the read; cwp <= rp for the write.
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        wp = cwp;
    }

    // Decode in-place: decoded data is always shorter than encoded (3/4 ratio)
    if wp >= PARALLEL_DECODE_THRESHOLD {
        // For large data, use parallel decode from the cleaned slice
        return decode_borrowed_clean_parallel(out, &data[..wp]);
    }
    decode_inplace_with_padding(&mut data[..wp], out)
}
1067
1068/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
1069pub fn decode_owned(
1070    data: &mut Vec<u8>,
1071    ignore_garbage: bool,
1072    out: &mut impl Write,
1073) -> io::Result<()> {
1074    if data.is_empty() {
1075        return Ok(());
1076    }
1077
1078    if ignore_garbage {
1079        data.retain(|&b| is_base64_char(b));
1080    } else {
1081        strip_whitespace_inplace(data);
1082    }
1083
1084    decode_clean_slice(data, out)
1085}
1086
1087/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
1088/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
1089/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
1090/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
1091fn strip_whitespace_inplace(data: &mut Vec<u8>) {
1092    // Quick check: skip stripping if no \n or \r in the data.
1093    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
1094    // For typical base64 (76-char lines), we'll find \n immediately and skip this.
1095    if memchr::memchr2(b'\n', b'\r', data).is_none() {
1096        // No newlines/CR — check for rare whitespace only
1097        if data
1098            .iter()
1099            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
1100        {
1101            data.retain(|&b| NOT_WHITESPACE[b as usize]);
1102        }
1103        return;
1104    }
1105
1106    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
1107    // gaps between them to compact the data in-place. For typical base64 streams,
1108    // newlines are the only whitespace, so this handles >99% of cases.
1109    let ptr = data.as_mut_ptr();
1110    let len = data.len();
1111    let mut wp = 0usize;
1112    let mut gap_start = 0usize;
1113    let mut has_rare_ws = false;
1114
1115    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
1116        let gap_len = pos - gap_start;
1117        if gap_len > 0 {
1118            if !has_rare_ws {
1119                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
1120                has_rare_ws = data[gap_start..pos]
1121                    .iter()
1122                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1123            }
1124            if wp != gap_start {
1125                unsafe {
1126                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
1127                }
1128            }
1129            wp += gap_len;
1130        }
1131        gap_start = pos + 1;
1132    }
1133    // Copy the final gap
1134    let tail_len = len - gap_start;
1135    if tail_len > 0 {
1136        if !has_rare_ws {
1137            has_rare_ws = data[gap_start..]
1138                .iter()
1139                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1140        }
1141        if wp != gap_start {
1142            unsafe {
1143                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
1144            }
1145        }
1146        wp += tail_len;
1147    }
1148
1149    data.truncate(wp);
1150
1151    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
1152    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
1153    if has_rare_ws {
1154        let ptr = data.as_mut_ptr();
1155        let len = data.len();
1156        let mut rp = 0;
1157        let mut cwp = 0;
1158        while rp < len {
1159            let b = unsafe { *ptr.add(rp) };
1160            if NOT_WHITESPACE[b as usize] {
1161                unsafe { *ptr.add(cwp) = b };
1162                cwp += 1;
1163            }
1164            rp += 1;
1165        }
1166        data.truncate(cwp);
1167    }
1168}
1169
/// 256-entry lookup table: `true` for bytes that are NOT whitespace.
/// Indexed by raw byte value during single-pass whitespace stripping.
/// Exactly six entries are `false`: space, tab, LF, CR, VT (0x0b), FF (0x0c).
static NOT_WHITESPACE: [bool; 256] = {
    let mut table = [true; 256];
    // Const-eval loop marking each whitespace byte as "strip me".
    let ws = [b' ', b'\t', b'\n', b'\r', 0x0b, 0x0c];
    let mut i = 0;
    while i < ws.len() {
        table[ws[i] as usize] = false;
        i += 1;
    }
    table
};
1182
1183/// Fused strip+decode for uniform-line base64 data.
1184/// Detects consistent line length, then processes in sub-chunks: each sub-chunk
1185/// copies lines to a small local buffer (L2-hot) and decodes immediately.
1186/// Eliminates the large intermediate clean buffer (~12MB for 10MB decode).
1187/// Returns None if the data doesn't have uniform line structure.
1188fn try_decode_uniform_lines(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
1189    let first_nl = memchr::memchr(b'\n', data)?;
1190    let line_len = first_nl;
1191    if line_len == 0 || line_len % 4 != 0 {
1192        return None;
1193    }
1194
1195    let stride = line_len + 1;
1196
1197    // Verify the data has consistent line structure (first + last lines)
1198    let check_lines = 4.min(data.len() / stride);
1199    for i in 1..check_lines {
1200        let expected_nl = i * stride - 1;
1201        if expected_nl >= data.len() || data[expected_nl] != b'\n' {
1202            return None;
1203        }
1204    }
1205
1206    let full_lines = if data.len() >= stride {
1207        let candidate = data.len() / stride;
1208        if candidate > 0 && data[candidate * stride - 1] != b'\n' {
1209            return None;
1210        }
1211        candidate
1212    } else {
1213        0
1214    };
1215
1216    let remainder_start = full_lines * stride;
1217    let remainder = &data[remainder_start..];
1218    let rem_clean = if remainder.last() == Some(&b'\n') {
1219        &remainder[..remainder.len() - 1]
1220    } else {
1221        remainder
1222    };
1223
1224    // Compute exact decoded sizes
1225    let decoded_per_line = line_len * 3 / 4;
1226    let rem_decoded_size = if rem_clean.is_empty() {
1227        0
1228    } else {
1229        let pad = rem_clean
1230            .iter()
1231            .rev()
1232            .take(2)
1233            .filter(|&&b| b == b'=')
1234            .count();
1235        rem_clean.len() * 3 / 4 - pad
1236    };
1237    let total_decoded = full_lines * decoded_per_line + rem_decoded_size;
1238    let clean_len = full_lines * line_len;
1239
1240    // Parallel path: fused strip+decode with 128KB sub-chunks per thread.
1241    // Each thread copies lines to a thread-local buffer (L2-hot) and decodes immediately,
1242    // eliminating the 12MB+ intermediate clean buffer entirely.
1243    if clean_len >= PARALLEL_DECODE_THRESHOLD && num_cpus() > 1 {
1244        let mut output: Vec<u8> = Vec::with_capacity(total_decoded);
1245        #[allow(clippy::uninit_vec)]
1246        unsafe {
1247            output.set_len(total_decoded);
1248        }
1249        #[cfg(target_os = "linux")]
1250        hint_hugepage(&mut output);
1251
1252        let out_ptr = output.as_mut_ptr() as usize;
1253        let src_ptr = data.as_ptr() as usize;
1254        let num_threads = num_cpus().max(1);
1255        let lines_per_thread = (full_lines + num_threads - 1) / num_threads;
1256        // 512KB sub-chunks: larger chunks give SIMD decode more contiguous data,
1257        // reducing per-call overhead. 512KB fits in L2 cache (256KB-1MB typical).
1258        let lines_per_sub = (512 * 1024 / line_len).max(1);
1259
1260        let err_flag = std::sync::atomic::AtomicBool::new(false);
1261        rayon::scope(|s| {
1262            for t in 0..num_threads {
1263                let err_flag = &err_flag;
1264                s.spawn(move |_| {
1265                    let start_line = t * lines_per_thread;
1266                    if start_line >= full_lines {
1267                        return;
1268                    }
1269                    let end_line = (start_line + lines_per_thread).min(full_lines);
1270                    let chunk_lines = end_line - start_line;
1271
1272                    let sub_buf_size = lines_per_sub.min(chunk_lines) * line_len;
1273                    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
1274                    #[allow(clippy::uninit_vec)]
1275                    unsafe {
1276                        local_buf.set_len(sub_buf_size);
1277                    }
1278
1279                    let src = src_ptr as *const u8;
1280                    let out_base = out_ptr as *mut u8;
1281                    let local_dst = local_buf.as_mut_ptr();
1282
1283                    let mut sub_start = 0usize;
1284                    while sub_start < chunk_lines {
1285                        if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
1286                            return;
1287                        }
1288                        let sub_count = (chunk_lines - sub_start).min(lines_per_sub);
1289                        let sub_clean = sub_count * line_len;
1290
1291                        for i in 0..sub_count {
1292                            unsafe {
1293                                std::ptr::copy_nonoverlapping(
1294                                    src.add((start_line + sub_start + i) * stride),
1295                                    local_dst.add(i * line_len),
1296                                    line_len,
1297                                );
1298                            }
1299                        }
1300
1301                        let out_offset = (start_line + sub_start) * decoded_per_line;
1302                        let out_size = sub_count * decoded_per_line;
1303                        let out_slice = unsafe {
1304                            std::slice::from_raw_parts_mut(out_base.add(out_offset), out_size)
1305                        };
1306                        if BASE64_ENGINE
1307                            .decode(&local_buf[..sub_clean], out_slice.as_out())
1308                            .is_err()
1309                        {
1310                            err_flag.store(true, std::sync::atomic::Ordering::Relaxed);
1311                            return;
1312                        }
1313
1314                        sub_start += sub_count;
1315                    }
1316                });
1317            }
1318        });
1319        let result: Result<(), io::Error> = if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
1320            Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
1321        } else {
1322            Ok(())
1323        };
1324
1325        if let Err(e) = result {
1326            return Some(Err(e));
1327        }
1328
1329        if !rem_clean.is_empty() {
1330            let rem_out = &mut output[full_lines * decoded_per_line..total_decoded];
1331            match BASE64_ENGINE.decode(rem_clean, rem_out.as_out()) {
1332                Ok(_) => {}
1333                Err(_) => return Some(decode_error()),
1334            }
1335        }
1336
1337        return Some(out.write_all(&output[..total_decoded]));
1338    }
1339
1340    // Sequential path: fused strip+decode in 256KB sub-chunks.
1341    // Larger sub-chunks give SIMD decode more data per call, improving throughput.
1342    // Uses decode_inplace on a small reusable buffer — no large allocations at all.
1343    let lines_per_sub = (256 * 1024 / line_len).max(1);
1344    let sub_buf_size = lines_per_sub * line_len;
1345    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
1346    #[allow(clippy::uninit_vec)]
1347    unsafe {
1348        local_buf.set_len(sub_buf_size);
1349    }
1350
1351    let src = data.as_ptr();
1352    let local_dst = local_buf.as_mut_ptr();
1353
1354    let mut line_idx = 0usize;
1355    while line_idx < full_lines {
1356        let sub_count = (full_lines - line_idx).min(lines_per_sub);
1357        let sub_clean = sub_count * line_len;
1358
1359        for i in 0..sub_count {
1360            unsafe {
1361                std::ptr::copy_nonoverlapping(
1362                    src.add((line_idx + i) * stride),
1363                    local_dst.add(i * line_len),
1364                    line_len,
1365                );
1366            }
1367        }
1368
1369        match BASE64_ENGINE.decode_inplace(&mut local_buf[..sub_clean]) {
1370            Ok(decoded) => {
1371                if let Err(e) = out.write_all(decoded) {
1372                    return Some(Err(e));
1373                }
1374            }
1375            Err(_) => return Some(decode_error()),
1376        }
1377
1378        line_idx += sub_count;
1379    }
1380
1381    if !rem_clean.is_empty() {
1382        let mut rem_buf = rem_clean.to_vec();
1383        match BASE64_ENGINE.decode_inplace(&mut rem_buf) {
1384            Ok(decoded) => {
1385                if let Err(e) = out.write_all(decoded) {
1386                    return Some(Err(e));
1387                }
1388            }
1389            Err(_) => return Some(decode_error()),
1390        }
1391    }
1392
1393    Some(Ok(()))
1394}
1395
1396/// Decode by stripping whitespace and decoding in a single fused pass.
1397/// For data with no whitespace, decodes directly without any copy.
1398/// Detects uniform line structure for fast structured-copy (no search needed),
1399/// falls back to SIMD memchr2 gap-copy for irregular data.
1400fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
1401    // Fast path for uniform-line base64 (e.g., standard 76-char lines + newline).
1402    // Copies at known offsets, avoiding the memchr2 search entirely.
1403    // For 13MB base64: saves ~1ms vs memchr2 gap-copy (just structured memcpy).
1404    if data.len() >= 77 {
1405        if let Some(result) = try_decode_uniform_lines(data, out) {
1406            return result;
1407        }
1408    }
1409
1410    // Quick check: skip stripping if no \n or \r in the data.
1411    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
1412    if memchr::memchr2(b'\n', b'\r', data).is_none() {
1413        // No newlines/CR — check for rare whitespace only
1414        if !data
1415            .iter()
1416            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
1417        {
1418            return decode_borrowed_clean(out, data);
1419        }
1420        // Has rare whitespace only — strip and decode
1421        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
1422        for &b in data {
1423            if NOT_WHITESPACE[b as usize] {
1424                cleaned.push(b);
1425            }
1426        }
1427        return decode_clean_slice(&mut cleaned, out);
1428    }
1429
1430    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
1431    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
1432    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
1433    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
1434    let dst = clean.as_mut_ptr();
1435    let mut wp = 0usize;
1436    let mut gap_start = 0usize;
1437    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
1438    // This avoids the second full-scan pass when only \n/\r are present.
1439    let mut has_rare_ws = false;
1440
1441    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
1442        let gap_len = pos - gap_start;
1443        if gap_len > 0 {
1444            // Check gap region for rare whitespace during copy.
1445            // This adds ~1 branch per gap but eliminates the second full scan.
1446            if !has_rare_ws {
1447                has_rare_ws = data[gap_start..pos]
1448                    .iter()
1449                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1450            }
1451            unsafe {
1452                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
1453            }
1454            wp += gap_len;
1455        }
1456        gap_start = pos + 1;
1457    }
1458    // Copy the final gap after the last \n/\r
1459    let tail_len = data.len() - gap_start;
1460    if tail_len > 0 {
1461        if !has_rare_ws {
1462            has_rare_ws = data[gap_start..]
1463                .iter()
1464                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1465        }
1466        unsafe {
1467            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
1468        }
1469        wp += tail_len;
1470    }
1471    unsafe {
1472        clean.set_len(wp);
1473    }
1474
1475    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
1476    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
1477    if has_rare_ws {
1478        let ptr = clean.as_mut_ptr();
1479        let len = clean.len();
1480        let mut rp = 0;
1481        let mut cwp = 0;
1482        while rp < len {
1483            let b = unsafe { *ptr.add(rp) };
1484            if NOT_WHITESPACE[b as usize] {
1485                unsafe { *ptr.add(cwp) = b };
1486                cwp += 1;
1487            }
1488            rp += 1;
1489        }
1490        clean.truncate(cwp);
1491    }
1492
1493    // For large data (>= threshold), use parallel decode for multi-core speedup.
1494    // For small data, use in-place decode to avoid extra allocation.
1495    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
1496        decode_borrowed_clean_parallel(out, &clean)
1497    } else {
1498        decode_clean_slice(&mut clean, out)
1499    }
1500}
1501
1502/// Try to decode base64 data line-by-line, avoiding whitespace stripping.
1503/// Returns Some(result) if the data has uniform line lengths suitable for
1504/// per-line decode, or None if the data doesn't fit this pattern.
1505///
1506/// For standard 76-char-line base64 (wrap=76): each line is 76 encoded chars
1507/// + newline = 77 bytes. 76 chars = 19 groups of 4 = 57 decoded bytes per line.
1508/// We decode each line directly into its position in the output buffer.
1509fn try_line_decode(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
1510    // Find the first newline to determine line length
1511    let first_nl = memchr::memchr(b'\n', data)?;
1512    let line_len = first_nl; // encoded chars per line (without newline)
1513
1514    // Line length must be a multiple of 4 (complete base64 groups, no padding mid-stream)
1515    if line_len == 0 || line_len % 4 != 0 {
1516        return None;
1517    }
1518
1519    let line_stride = line_len + 1; // line_len chars + 1 newline byte
1520    let decoded_per_line = line_len * 3 / 4;
1521
1522    // Verify the data has a consistent line structure by checking the next few lines
1523    let check_lines = 4.min(data.len() / line_stride);
1524    for i in 1..check_lines {
1525        let expected_nl = i * line_stride - 1;
1526        if expected_nl >= data.len() {
1527            break;
1528        }
1529        if data[expected_nl] != b'\n' {
1530            return None; // Inconsistent line length
1531        }
1532    }
1533
1534    // Calculate full lines and remainder
1535    let full_lines = if data.len() >= line_stride {
1536        // Check how many complete lines fit
1537        let candidate = data.len() / line_stride;
1538        // Verify the last full line's newline
1539        if candidate > 0 && data[candidate * line_stride - 1] != b'\n' {
1540            return None; // Not a clean line-structured file
1541        }
1542        candidate
1543    } else {
1544        0
1545    };
1546
1547    let remainder_start = full_lines * line_stride;
1548    let remainder = &data[remainder_start..];
1549
1550    // Calculate exact output size
1551    let remainder_clean_len = if remainder.is_empty() {
1552        0
1553    } else {
1554        // Remainder might end with newline, strip it
1555        let rem = if remainder.last() == Some(&b'\n') {
1556            &remainder[..remainder.len() - 1]
1557        } else {
1558            remainder
1559        };
1560        if rem.is_empty() {
1561            0
1562        } else {
1563            // Check for padding
1564            let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
1565            if rem.len() % 4 != 0 {
1566                return None; // Invalid remainder
1567            }
1568            rem.len() * 3 / 4 - pad
1569        }
1570    };
1571
1572    // Single-allocation decode: allocate full decoded output, decode all lines
1573    // directly into it, then write_all in one syscall. For 10MB base64 (7.5MB decoded),
1574    // this does 1 write() instead of ~30 chunked writes. The 7.5MB allocation is trivial
1575    // compared to the mmap'd input. SIMD decode at ~8 GB/s finishes in <1ms.
1576    let total_decoded = full_lines * decoded_per_line + remainder_clean_len;
1577    let mut out_buf: Vec<u8> = Vec::with_capacity(total_decoded);
1578    #[allow(clippy::uninit_vec)]
1579    unsafe {
1580        out_buf.set_len(total_decoded);
1581    }
1582
1583    let dst = out_buf.as_mut_ptr();
1584
1585    // Parallel line decode for large inputs (>= 4MB): split lines across threads.
1586    // Each thread decodes a contiguous block of lines directly to its final position
1587    // in the shared output buffer. SAFETY: non-overlapping output regions per thread.
1588    if data.len() >= PARALLEL_DECODE_THRESHOLD && full_lines >= 64 {
1589        let out_addr = dst as usize;
1590        let num_threads = num_cpus().max(1);
1591        let lines_per_chunk = (full_lines / num_threads).max(1);
1592
1593        // Build per-thread task ranges: (start_line, end_line)
1594        let mut tasks: Vec<(usize, usize)> = Vec::new();
1595        let mut line_off = 0;
1596        while line_off < full_lines {
1597            let end = (line_off + lines_per_chunk).min(full_lines);
1598            tasks.push((line_off, end));
1599            line_off = end;
1600        }
1601
1602        let decode_err = std::sync::atomic::AtomicBool::new(false);
1603        rayon::scope(|s| {
1604            for &(start_line, end_line) in &tasks {
1605                let decode_err = &decode_err;
1606                s.spawn(move |_| {
1607                    let out_ptr = out_addr as *mut u8;
1608                    let mut i = start_line;
1609
1610                    while i + 4 <= end_line {
1611                        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
1612                            return;
1613                        }
1614                        let in_base = i * line_stride;
1615                        let ob = i * decoded_per_line;
1616                        unsafe {
1617                            let s0 =
1618                                std::slice::from_raw_parts_mut(out_ptr.add(ob), decoded_per_line);
1619                            if BASE64_ENGINE
1620                                .decode(&data[in_base..in_base + line_len], s0.as_out())
1621                                .is_err()
1622                            {
1623                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
1624                                return;
1625                            }
1626                            let s1 = std::slice::from_raw_parts_mut(
1627                                out_ptr.add(ob + decoded_per_line),
1628                                decoded_per_line,
1629                            );
1630                            if BASE64_ENGINE
1631                                .decode(
1632                                    &data[in_base + line_stride..in_base + line_stride + line_len],
1633                                    s1.as_out(),
1634                                )
1635                                .is_err()
1636                            {
1637                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
1638                                return;
1639                            }
1640                            let s2 = std::slice::from_raw_parts_mut(
1641                                out_ptr.add(ob + 2 * decoded_per_line),
1642                                decoded_per_line,
1643                            );
1644                            if BASE64_ENGINE
1645                                .decode(
1646                                    &data[in_base + 2 * line_stride
1647                                        ..in_base + 2 * line_stride + line_len],
1648                                    s2.as_out(),
1649                                )
1650                                .is_err()
1651                            {
1652                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
1653                                return;
1654                            }
1655                            let s3 = std::slice::from_raw_parts_mut(
1656                                out_ptr.add(ob + 3 * decoded_per_line),
1657                                decoded_per_line,
1658                            );
1659                            if BASE64_ENGINE
1660                                .decode(
1661                                    &data[in_base + 3 * line_stride
1662                                        ..in_base + 3 * line_stride + line_len],
1663                                    s3.as_out(),
1664                                )
1665                                .is_err()
1666                            {
1667                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
1668                                return;
1669                            }
1670                        }
1671                        i += 4;
1672                    }
1673
1674                    while i < end_line {
1675                        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
1676                            return;
1677                        }
1678                        let in_start = i * line_stride;
1679                        let out_off = i * decoded_per_line;
1680                        let out_slice = unsafe {
1681                            std::slice::from_raw_parts_mut(out_ptr.add(out_off), decoded_per_line)
1682                        };
1683                        if BASE64_ENGINE
1684                            .decode(&data[in_start..in_start + line_len], out_slice.as_out())
1685                            .is_err()
1686                        {
1687                            decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
1688                            return;
1689                        }
1690                        i += 1;
1691                    }
1692                });
1693            }
1694        });
1695
1696        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
1697            return Some(decode_error());
1698        }
1699    } else {
1700        // Sequential decode with 4x unrolling for smaller inputs
1701        let mut i = 0;
1702
1703        while i + 4 <= full_lines {
1704            let in_base = i * line_stride;
1705            let out_base = i * decoded_per_line;
1706            unsafe {
1707                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), decoded_per_line);
1708                if BASE64_ENGINE
1709                    .decode(&data[in_base..in_base + line_len], s0.as_out())
1710                    .is_err()
1711                {
1712                    return Some(decode_error());
1713                }
1714
1715                let s1 = std::slice::from_raw_parts_mut(
1716                    dst.add(out_base + decoded_per_line),
1717                    decoded_per_line,
1718                );
1719                if BASE64_ENGINE
1720                    .decode(
1721                        &data[in_base + line_stride..in_base + line_stride + line_len],
1722                        s1.as_out(),
1723                    )
1724                    .is_err()
1725                {
1726                    return Some(decode_error());
1727                }
1728
1729                let s2 = std::slice::from_raw_parts_mut(
1730                    dst.add(out_base + 2 * decoded_per_line),
1731                    decoded_per_line,
1732                );
1733                if BASE64_ENGINE
1734                    .decode(
1735                        &data[in_base + 2 * line_stride..in_base + 2 * line_stride + line_len],
1736                        s2.as_out(),
1737                    )
1738                    .is_err()
1739                {
1740                    return Some(decode_error());
1741                }
1742
1743                let s3 = std::slice::from_raw_parts_mut(
1744                    dst.add(out_base + 3 * decoded_per_line),
1745                    decoded_per_line,
1746                );
1747                if BASE64_ENGINE
1748                    .decode(
1749                        &data[in_base + 3 * line_stride..in_base + 3 * line_stride + line_len],
1750                        s3.as_out(),
1751                    )
1752                    .is_err()
1753                {
1754                    return Some(decode_error());
1755                }
1756            }
1757            i += 4;
1758        }
1759
1760        while i < full_lines {
1761            let in_start = i * line_stride;
1762            let in_end = in_start + line_len;
1763            let out_off = i * decoded_per_line;
1764            let out_slice =
1765                unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), decoded_per_line) };
1766            match BASE64_ENGINE.decode(&data[in_start..in_end], out_slice.as_out()) {
1767                Ok(_) => {}
1768                Err(_) => return Some(decode_error()),
1769            }
1770            i += 1;
1771        }
1772    }
1773
1774    // Decode remainder
1775    if remainder_clean_len > 0 {
1776        let rem = if remainder.last() == Some(&b'\n') {
1777            &remainder[..remainder.len() - 1]
1778        } else {
1779            remainder
1780        };
1781        let out_off = full_lines * decoded_per_line;
1782        let out_slice =
1783            unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), remainder_clean_len) };
1784        match BASE64_ENGINE.decode(rem, out_slice.as_out()) {
1785            Ok(_) => {}
1786            Err(_) => return Some(decode_error()),
1787        }
1788    }
1789
1790    // Single write_all for the entire decoded output
1791    Some(out.write_all(&out_buf[..total_decoded]))
1792}
1793
1794/// Decode a clean (no whitespace) buffer in-place with SIMD.
1795fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
1796    if data.is_empty() {
1797        return Ok(());
1798    }
1799    decode_inplace_with_padding(data, out)
1800}
1801
/// Cold error path — constructing the `io::Error` out of line keeps the hot
/// decode loops free of error-formatting code.
#[cold]
#[inline(never)]
fn decode_error() -> io::Result<()> {
    let err = io::Error::new(io::ErrorKind::InvalidData, "invalid input");
    Err(err)
}
1808
1809/// Decode in-place with padding fallback for truncated input.
1810/// GNU base64 accepts missing padding at end of stream, so if decode fails
1811/// and the length mod 4 is 2 or 3, retry with padding added.
1812fn decode_inplace_with_padding(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
1813    match BASE64_ENGINE.decode_inplace(data) {
1814        Ok(decoded) => out.write_all(decoded),
1815        Err(_) => {
1816            let remainder = data.len() % 4;
1817            if remainder == 2 || remainder == 3 {
1818                let has_existing_padding = memchr::memchr(b'=', data).is_some();
1819                let mut padded = Vec::with_capacity(data.len() + (4 - remainder));
1820                padded.extend_from_slice(data);
1821                padded.extend(std::iter::repeat_n(b'=', 4 - remainder));
1822                if let Ok(decoded) = BASE64_ENGINE.decode_inplace(&mut padded) {
1823                    out.write_all(decoded)?;
1824                    if has_existing_padding {
1825                        return decode_error();
1826                    }
1827                    return Ok(());
1828                }
1829            }
1830            decode_error()
1831        }
1832    }
1833}
1834
1835/// Decode clean base64 data (no whitespace) from a borrowed slice.
1836fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1837    if data.is_empty() {
1838        return Ok(());
1839    }
1840    // Parallel decode for large data: split at 4-byte boundaries,
1841    // decode each chunk independently (base64 is context-free per 4-char group).
1842    if data.len() >= PARALLEL_DECODE_THRESHOLD {
1843        return decode_borrowed_clean_parallel(out, data);
1844    }
1845    // If input has truncated padding, pad it first (GNU base64 accepts missing padding).
1846    let remainder = data.len() % 4;
1847    if remainder == 2 || remainder == 3 {
1848        // If input already has '=' but length mod 4 != 0, the padding is
1849        // wrong/truncated. GNU base64 still decodes but reports error.
1850        let has_existing_padding = memchr::memchr(b'=', data).is_some();
1851        let mut padded = Vec::with_capacity(data.len() + (4 - remainder));
1852        padded.extend_from_slice(data);
1853        padded.extend(std::iter::repeat_n(b'=', 4 - remainder));
1854        let result = decode_borrowed_clean(out, &padded);
1855        if has_existing_padding && result.is_ok() {
1856            return decode_error();
1857        }
1858        return result;
1859    }
1860    // Pre-allocate exact output size to avoid decode_to_vec's reallocation.
1861    // Decoded size = data.len() * 3 / 4 minus padding.
1862    let pad = data.iter().rev().take(2).filter(|&&b| b == b'=').count();
1863    let decoded_size = data.len() * 3 / 4 - pad;
1864    let mut buf: Vec<u8> = Vec::with_capacity(decoded_size);
1865    #[allow(clippy::uninit_vec)]
1866    unsafe {
1867        buf.set_len(decoded_size);
1868    }
1869    match BASE64_ENGINE.decode(data, buf[..decoded_size].as_out()) {
1870        Ok(decoded) => {
1871            out.write_all(decoded)?;
1872            Ok(())
1873        }
1874        Err(_) => decode_error(),
1875    }
1876}
1877
/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel.
/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
/// upfront, so each thread decodes directly to its final position. No compaction needed.
///
/// NOTE(review): assumes `data.len()` is a multiple of 4 (or ends in proper
/// '=' padding). An unpadded tail (len % 4 == 2 or 3) leaves the final chunk
/// with a non-multiple-of-4 length, which the strict decoder rejects —
/// confirm callers pad truncated input before dispatching here.
fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    // One chunk per CPU; every 4-char base64 group decodes independently,
    // so chunks can be decoded with no shared state.
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
    let chunk_size = ((raw_chunk + 3) / 4) * 4;

    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();

    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
    // Only the last chunk can carry '=' padding, which shrinks its decoded size.
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    offsets.push(0);
    let mut total_decoded = 0usize;
    for (i, chunk) in chunks.iter().enumerate() {
        let decoded_size = if i == chunks.len() - 1 {
            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
            chunk.len() * 3 / 4 - pad
        } else {
            chunk.len() * 3 / 4
        };
        total_decoded += decoded_size;
        offsets.push(total_decoded);
    }

    // SAFETY: every byte of output_buf is written by exactly one thread's decode
    // before the buffer is read; on failure err_flag short-circuits and the
    // (possibly partially written) buffer is discarded without being read.
    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output_buf.set_len(total_decoded);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output_buf);

    // Parallel decode: each thread decodes directly into its exact final position.
    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
    // The pointer is smuggled as usize because raw pointers are not Send.
    let out_addr = output_buf.as_mut_ptr() as usize;
    let err_flag = std::sync::atomic::AtomicBool::new(false);
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let offset = offsets[i];
            let expected_size = offsets[i + 1] - offset;
            let err_flag = &err_flag;
            s.spawn(move |_| {
                // Best-effort early exit once a sibling chunk has failed.
                if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
                    return;
                }
                // SAFETY: each thread writes to non-overlapping region
                let out_slice = unsafe {
                    std::slice::from_raw_parts_mut((out_addr as *mut u8).add(offset), expected_size)
                };
                if BASE64_ENGINE.decode(chunk, out_slice.as_out()).is_err() {
                    err_flag.store(true, std::sync::atomic::Ordering::Relaxed);
                }
            });
        }
    });

    if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"));
    }

    out.write_all(&output_buf[..total_decoded])
}
1942
/// Strip non-base64 characters (for -i / --ignore-garbage).
///
/// Keeps ASCII alphanumerics plus '+', '/', and '=' — the standard base64
/// alphabet and its padding byte — and drops everything else.
fn strip_non_base64(data: &[u8]) -> Vec<u8> {
    let mut kept = Vec::with_capacity(data.len());
    for &b in data {
        if b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'=' {
            kept.push(b);
        }
    }
    kept
}
1950
/// Check if a byte is a valid base64 alphabet character or padding.
#[inline]
fn is_base64_char(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'=')
}
1956
1957/// Stream-encode from a reader to a writer. Used for stdin processing.
1958/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
1959pub fn encode_stream(
1960    reader: &mut impl Read,
1961    wrap_col: usize,
1962    writer: &mut impl Write,
1963) -> io::Result<()> {
1964    if wrap_col == 0 {
1965        return encode_stream_nowrap(reader, writer);
1966    }
1967    encode_stream_wrapped(reader, wrap_col, writer)
1968}
1969
1970/// Streaming encode with NO line wrapping — optimized fast path.
1971/// Read size is 24MB (divisible by 3): encoded output = 24MB * 4/3 = 32MB.
1972/// 24MB reads mean 10-18MB input is consumed in a single read() call,
1973/// and the encoded output writes in 1-2 write() calls.
1974fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1975    // 24MB aligned to 3 bytes: 24MB reads handle up to 24MB input in one pass.
1976    const NOWRAP_READ: usize = 24 * 1024 * 1024; // exactly divisible by 3
1977
1978    // SAFETY: buf bytes are written by read_full before being processed.
1979    // encode_buf bytes are written by encode before being read.
1980    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
1981    #[allow(clippy::uninit_vec)]
1982    unsafe {
1983        buf.set_len(NOWRAP_READ);
1984    }
1985    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
1986    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1987    #[allow(clippy::uninit_vec)]
1988    unsafe {
1989        encode_buf.set_len(encode_buf_size);
1990    }
1991
1992    loop {
1993        let n = read_full(reader, &mut buf)?;
1994        if n == 0 {
1995            break;
1996        }
1997        let enc_len = BASE64_ENGINE.encoded_length(n);
1998        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1999        writer.write_all(encoded)?;
2000    }
2001    Ok(())
2002}
2003
2004/// Streaming encode WITH line wrapping.
2005/// For the common case (wrap_col divides evenly into 3-byte input groups),
2006/// uses fuse_wrap to build a contiguous output buffer with newlines interleaved,
2007/// then writes it in a single write() call. This eliminates the overhead of
2008/// many writev() syscalls (one per ~512 lines via IoSlice).
2009///
2010/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
2011fn encode_stream_wrapped(
2012    reader: &mut impl Read,
2013    wrap_col: usize,
2014    writer: &mut impl Write,
2015) -> io::Result<()> {
2016    let bytes_per_line = wrap_col * 3 / 4;
2017    // For the common case (76-col wrapping, bytes_per_line=57 which is divisible by 3),
2018    // align the read buffer to bytes_per_line boundaries so each chunk produces
2019    // complete lines with no column carry-over between chunks.
2020    if bytes_per_line > 0 && bytes_per_line.is_multiple_of(3) {
2021        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
2022    }
2023
2024    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
2025    const STREAM_READ: usize = 12 * 1024 * 1024;
2026    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
2027    #[allow(clippy::uninit_vec)]
2028    unsafe {
2029        buf.set_len(STREAM_READ);
2030    }
2031    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
2032    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
2033    #[allow(clippy::uninit_vec)]
2034    unsafe {
2035        encode_buf.set_len(encode_buf_size);
2036    }
2037
2038    let mut col = 0usize;
2039
2040    loop {
2041        let n = read_full(reader, &mut buf)?;
2042        if n == 0 {
2043            break;
2044        }
2045        let enc_len = BASE64_ENGINE.encoded_length(n);
2046        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
2047
2048        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
2049    }
2050
2051    if col > 0 {
2052        writer.write_all(b"\n")?;
2053    }
2054
2055    Ok(())
2056}
2057
/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
/// encode each line directly into its final position with newline appended.
/// Eliminates the two-pass encode-then-fuse_wrap approach.
/// For 76-col wrapping (bytes_per_line=57): 24MB / 57 ≈ 441K complete lines per chunk.
/// Output ≈ 441K * 77 bytes ≈ 32MB, one write() syscall per chunk.
///
/// Precondition (guaranteed by the caller): bytes_per_line > 0 and divisible
/// by 3, so each input line encodes to exactly wrap_col bytes with no padding
/// and no base64 state crossing line boundaries.
fn encode_stream_wrapped_fused(
    reader: &mut impl Read,
    wrap_col: usize,
    bytes_per_line: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Align read size to bytes_per_line for complete output lines per chunk.
    // ~441K lines * 57 bytes ≈ 24MB input, ≈ 32MB output.
    let lines_per_chunk = (24 * 1024 * 1024) / bytes_per_line;
    let read_size = lines_per_chunk * bytes_per_line;
    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline

    // SAFETY: buf bytes are written by read_full before being processed.
    // out_buf bytes are written by encode before being read.
    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(read_size);
    }
    // Output buffer: enough for all lines + remainder
    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(max_output);
    }

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let full_lines = n / bytes_per_line;
        let remainder = n % bytes_per_line;

        // Encode each input line directly into its final output position.
        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
        // This eliminates the separate encode + fuse_wrap copy entirely.
        // SAFETY (all raw writes below): line_idx stays < full_lines <= lines_per_chunk,
        // so every write lands below lines_per_chunk * line_out < max_output, and
        // per-line output regions are disjoint.
        let dst = out_buf.as_mut_ptr();
        let mut line_idx = 0;

        // 4-line unrolled loop for better ILP
        while line_idx + 4 <= full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
                *dst.add(out_base + wrap_col) = b'\n';

                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                    s1.as_out(),
                );
                *dst.add(out_base + line_out + wrap_col) = b'\n';

                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                    s2.as_out(),
                );
                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                    s3.as_out(),
                );
                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
            }
            line_idx += 4;
        }

        // Remaining full lines
        while line_idx < full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
                *dst.add(out_base + wrap_col) = b'\n';
            }
            line_idx += 1;
        }

        // Write position after all full lines.
        let mut wp = full_lines * line_out;

        // Handle remainder (partial last line of this chunk).
        // A remainder can only occur on the final chunk: read_full returns
        // fewer than read_size bytes only at EOF.
        if remainder > 0 {
            let enc_len = BASE64_ENGINE.encoded_length(remainder);
            let line_input = &buf[full_lines * bytes_per_line..n];
            unsafe {
                // SAFETY: wp + enc_len + 1 <= full_lines*line_out +
                // encoded_length(bytes_per_line) + 1 <= max_output.
                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
                *dst.add(wp + enc_len) = b'\n';
            }
            wp += enc_len + 1;
        }

        writer.write_all(&out_buf[..wp])?;
    }

    Ok(())
}
2169
/// Stream-decode from a reader to a writer. Used for stdin processing.
/// In-place strip + decode: read chunk -> strip whitespace in-place in read buffer
/// -> decode in-place -> write. Eliminates a separate clean buffer (which would
/// cost another ~32MB allocation and a full copy).
/// Uses 32MB read buffer for maximum pipe throughput — read_full retries to
/// fill the entire buffer from the pipe, and 32MB means even large inputs
/// (up to ~24MB after base64 encoding of 18MB raw) are read in a single syscall batch.
pub fn decode_stream(
    reader: &mut impl Read,
    ignore_garbage: bool,
    writer: &mut impl Write,
) -> io::Result<()> {
    const READ_CHUNK: usize = 32 * 1024 * 1024;
    // SAFETY: buf bytes are written by read_full before being processed.
    // The extra 4 bytes accommodate carry-over from previous chunk.
    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(READ_CHUNK + 4);
    }
    // Incomplete base64 quadruplet (at most 3 bytes) carried between chunks.
    let mut carry = [0u8; 4];
    let mut carry_len = 0usize;

    loop {
        // Copy carry bytes to start of buffer, read new data after them
        if carry_len > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
            }
        }
        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
        if n == 0 {
            break;
        }
        let total_raw = carry_len + n;

        // Strip whitespace in-place in the buffer itself, compacting kept
        // bytes toward the front; `clean_len` is the compacted length.
        let clean_len = if ignore_garbage {
            // Scalar filter for ignore_garbage mode (rare path): keep only
            // base64 alphabet bytes and '='.
            let ptr = buf.as_mut_ptr();
            let mut wp = 0usize;
            for i in 0..total_raw {
                let b = unsafe { *ptr.add(i) };
                if is_base64_char(b) {
                    unsafe { *ptr.add(wp) = b };
                    wp += 1;
                }
            }
            wp
        } else {
            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
            // so we process ~76 bytes per memchr hit. While scanning, note
            // whether any rarer whitespace (space/tab/VT/FF) appears so the
            // second pass below runs only when actually needed.
            let ptr = buf.as_mut_ptr();
            let data = &buf[..total_raw];
            let mut wp = 0usize;
            let mut gap_start = 0usize;
            let mut has_rare_ws = false;

            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
                let gap_len = pos - gap_start;
                if gap_len > 0 {
                    if !has_rare_ws {
                        has_rare_ws = data[gap_start..pos]
                            .iter()
                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                    }
                    // Shift the gap left only once compaction has started
                    // (wp != gap_start); ranges may overlap, hence ptr::copy.
                    if wp != gap_start {
                        unsafe {
                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                        }
                    }
                    wp += gap_len;
                }
                gap_start = pos + 1;
            }
            // Tail after the last newline.
            let tail_len = total_raw - gap_start;
            if tail_len > 0 {
                if !has_rare_ws {
                    has_rare_ws = data[gap_start..total_raw]
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                }
                if wp != gap_start {
                    unsafe {
                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
                    }
                }
                wp += tail_len;
            }

            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
            // NOT_WHITESPACE is a byte-indexed lookup table defined elsewhere in
            // this file; per its name and use here, it is true for bytes to keep.
            if has_rare_ws {
                let mut rp = 0;
                let mut cwp = 0;
                while rp < wp {
                    let b = unsafe { *ptr.add(rp) };
                    if NOT_WHITESPACE[b as usize] {
                        unsafe { *ptr.add(cwp) = b };
                        cwp += 1;
                    }
                    rp += 1;
                }
                cwp
            } else {
                wp
            }
        };

        carry_len = 0;
        // read_full returns fewer than READ_CHUNK bytes only at EOF, so a
        // short read marks the final chunk of the stream.
        let is_last = n < READ_CHUNK;

        if is_last {
            // Last chunk: decode everything (including padding)
            decode_clean_slice(&mut buf[..clean_len], writer)?;
        } else {
            // Save incomplete base64 quadruplet for next iteration
            let decode_len = (clean_len / 4) * 4;
            let leftover = clean_len - decode_len;
            if leftover > 0 {
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        buf.as_ptr().add(decode_len),
                        carry.as_mut_ptr(),
                        leftover,
                    );
                }
                carry_len = leftover;
            }
            if decode_len > 0 {
                decode_clean_slice(&mut buf[..decode_len], writer)?;
            }
        }
    }

    // Handle any remaining carry-over bytes (stream ended exactly on a full
    // read). A 1-byte tail is invalid base64 and errors inside the decoder.
    if carry_len > 0 {
        let mut carry_buf = carry[..carry_len].to_vec();
        decode_clean_slice(&mut carry_buf, writer)?;
    }

    Ok(())
}
2313
2314/// Write all IoSlice entries using write_vectored (writev syscall).
2315/// Hot path: single write_vectored succeeds fully (common on Linux pipes/files).
2316/// Cold path: partial write handled out-of-line to keep hot path tight.
2317#[inline(always)]
2318fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
2319    if slices.is_empty() {
2320        return Ok(());
2321    }
2322    let total: usize = slices.iter().map(|s| s.len()).sum();
2323    let written = out.write_vectored(slices)?;
2324    if written >= total {
2325        return Ok(());
2326    }
2327    if written == 0 {
2328        return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
2329    }
2330    write_all_vectored_slow(out, slices, written)
2331}
2332
/// Handle partial write (cold path, never inlined).
///
/// `skip` bytes were already written by the caller's vectored write; skip
/// past them and write_all the rest of each slice in order.
#[cold]
#[inline(never)]
fn write_all_vectored_slow(
    out: &mut impl Write,
    slices: &[io::IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for s in slices {
        let n = s.len();
        if skip >= n {
            // This slice was fully written already.
            skip -= n;
        } else {
            out.write_all(&s[skip..])?;
            skip = 0;
        }
    }
    Ok(())
}
2352
/// Read as many bytes as possible into buf, retrying on partial reads.
///
/// Returns the number of bytes read; a value smaller than `buf.len()` means
/// EOF was reached. For regular files the first read() usually fills the
/// whole buffer and the loop exits after one iteration; pipes and slow
/// devices take the retry iterations.
///
/// Fix: `ErrorKind::Interrupted` is now retried on the very first read as
/// well — previously only the retry loop handled EINTR, so a signal arriving
/// before any data transferred surfaced as an error.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0usize;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // EOF: report whatever was accumulated.
            Ok(0) => break,
            Ok(n) => total += n,
            // Retry on EINTR, consistent with std's read_to_end behavior.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}