// coreutils_rs/base64/core.rs

use std::io::{self, Read, Write};

use base64_simd::AsOut;

const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;

/// Number of available CPUs (cached by the OS). Used when sizing parallel work and
/// choosing thresholds, so encode paths avoid triggering Rayon's thread-pool initialization.
#[inline]
fn num_cpus() -> usize {
    std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(1)
}

/// Chunk size for sequential no-wrap encoding: 8MB aligned to 3 bytes.
/// Larger chunks reduce function call overhead per iteration while still
/// keeping peak buffer allocation reasonable (~10.7MB for the output).
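/// Concretely (illustrative arithmetic): 8 MiB = 8_388_608 bytes, 8_388_608 % 3 = 2,
/// so the chunk is 8_388_606 input bytes, which encodes to 11_184_808 output bytes (~10.7 MiB).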
const NOWRAP_CHUNK: usize = 8 * 1024 * 1024 - (8 * 1024 * 1024 % 3);

/// Minimum data size for parallel no-wrap encoding (4MB).
/// For 1-2MB input, thread creation (~200µs for 4 threads) + per-thread
/// buffer allocation page faults (~0.3ms) exceed the parallel encoding
/// benefit. At 4MB+, the ~2x parallel speedup amortizes overhead.
const PARALLEL_NOWRAP_THRESHOLD: usize = 4 * 1024 * 1024;

/// Minimum data size for parallel wrapped encoding (2MB).
/// Wrapped parallel uses N threads for SIMD encoding, providing ~Nx
/// speedup. Per-thread buffers (~2.5MB each for 10MB input) page-fault
/// concurrently, and std::thread::scope avoids Rayon pool init (~300µs).
const PARALLEL_WRAPPED_THRESHOLD: usize = 2 * 1024 * 1024;

/// Minimum data size for parallel decoding (2MB of base64 data).
/// Lower threshold lets parallel decode kick in earlier for medium files.
const PARALLEL_DECODE_THRESHOLD: usize = 2 * 1024 * 1024;
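
// Path-selection summary (informal sketch of how the thresholds above are used
// by the functions below):
//   encode, wrap == 0: len >= PARALLEL_NOWRAP_THRESHOLD and > 1 CPU     -> parallel no-wrap
//   encode, wrap  > 0: len >= PARALLEL_WRAPPED_THRESHOLD and the line's
//                      input size is a multiple of 3                    -> parallel wrapped
//   decode           : cleaned length >= PARALLEL_DECODE_THRESHOLD
//                      (plus per-path checks)                           -> parallel decode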

/// Encode data and write to output with line wrapping.
/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
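///
/// A minimal usage sketch (any `io::Write` sink works, e.g. a `Vec<u8>`):
/// ```ignore
/// let mut out = Vec::new();
/// encode_to_writer(b"hello", 76, &mut out)?;
/// assert_eq!(out, b"aGVsbG8=\n");
/// ```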
pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if wrap_col == 0 {
        return encode_no_wrap(data, out);
    }

    encode_wrapped(data, wrap_col, out)
}

/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_NOWRAP_THRESHOLD && num_cpus() > 1 {
        return encode_no_wrap_parallel(data, out);
    }

    // Single-buffer encode: for data that fits in one chunk, encode directly
    // and write once. For larger data, reuse the buffer across chunks.
    let enc_len = BASE64_ENGINE.encoded_length(data.len().min(NOWRAP_CHUNK));
    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_len);
    }

    for chunk in data.chunks(NOWRAP_CHUNK) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let encoded = BASE64_ENGINE.encode(chunk, buf[..clen].as_out());
        out.write_all(encoded)?;
    }
    Ok(())
}

/// Parallel no-wrap encoding: split at 3-byte boundaries, encode chunks in parallel.
/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
///
/// Uses std::thread::scope instead of Rayon to avoid pool initialization overhead (~300µs).
/// Each scoped thread allocates its own output buffer and encodes independently.
/// Output uses writev to combine all per-thread buffers in a single syscall.
fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 3 bytes so each chunk encodes without padding (except the last)
    let chunk_size = ((raw_chunk + 2) / 3) * 3;

    // Split input into 3-byte-aligned chunks
    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();

    // Each scoped thread allocates its own output buffer and encodes independently.
    let results: Vec<Vec<u8>> = std::thread::scope(|s| {
        let handles: Vec<_> = chunks
            .iter()
            .map(|chunk| {
                // `move` is required so the spawned thread owns its copy of the
                // chunk reference instead of borrowing the short-lived `chunk` binding.
                s.spawn(move || {
                    let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
                    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
                    #[allow(clippy::uninit_vec)]
                    unsafe {
                        buf.set_len(enc_len);
                    }
                    // HUGEPAGE on per-thread buffer reduces page faults
                    #[cfg(target_os = "linux")]
                    if enc_len >= 2 * 1024 * 1024 {
                        unsafe {
                            libc::madvise(
                                buf.as_mut_ptr() as *mut libc::c_void,
                                enc_len,
                                libc::MADV_HUGEPAGE,
                            );
                        }
                    }
                    let _ = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
                    buf
                })
            })
            .collect();
        handles.into_iter().map(|h| h.join().unwrap()).collect()
    });

    // Single writev for all chunks in order
    let slices: Vec<io::IoSlice> = results.iter().map(|r| io::IoSlice::new(r)).collect();
    write_all_vectored(out, &slices)
}

/// Encode with line wrapping using forward scatter from L1-cached temp buffer.
/// Encodes groups of lines into a small temp buffer (fits in L1 cache), then
/// scatter-copies wrap_col-byte chunks from temp to output with newlines.
///
/// This is faster than bulk encode + backward expansion because:
/// - Temp buffer reads hit L1 cache (essentially free bandwidth)
/// - Output buffer is written once (no double-write from backward memmove)
/// - Forward access pattern is prefetcher-friendly
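///
/// Output layout for an illustrative wrap_col = 4 (bytes_per_line = 3):
/// ```text
/// input  : "abcdef"        (6 bytes)
/// encoded: "YWJjZGVm"      (8 chars)
/// output : "YWJj\nZGVm\n"  (wrap_col chars per line, '\n' after every line)
/// ```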
fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let bytes_per_line = wrap_col * 3 / 4;
    if bytes_per_line == 0 {
        return encode_wrapped_small(data, wrap_col, out);
    }

    if data.len() >= PARALLEL_WRAPPED_THRESHOLD && bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
    }

    if bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_scatter(data, wrap_col, bytes_per_line, out);
    }

    // Fallback for non-3-aligned bytes_per_line: use fuse_wrap approach
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let num_full = enc_max / wrap_col;
    let rem = enc_max % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Encode full data, then fuse with newlines
    let mut enc_buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        enc_buf.set_len(enc_max);
    }
    let _ = BASE64_ENGINE.encode(data, enc_buf[..enc_max].as_out());

    let mut out_buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(out_len);
    }
    let n = fuse_wrap(&enc_buf, wrap_col, &mut out_buf);
    out.write_all(&out_buf[..n])
}

/// L1-scatter encode: encode groups of lines into a small L1-cached temp buffer,
/// then scatter-copy each line to its final position in the output buffer with
/// newline insertion. Each output byte is written exactly once — no read-back
/// from main memory, halving memory traffic vs backward expansion.
///
/// Temp buffer (~20KB for 256 lines × 76 chars) stays hot in L1 cache, so
/// reads during scatter are essentially free. Output buffer is streamed out
/// with sequential writes that the prefetcher can handle efficiently.
fn encode_wrapped_scatter(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    if enc_len == 0 {
        return Ok(());
    }

    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Output buffer — written once via scatter, then write_all to output
    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_len);
    }
    #[cfg(target_os = "linux")]
    if out_len >= 2 * 1024 * 1024 {
        unsafe {
            libc::madvise(
                buf.as_mut_ptr() as *mut libc::c_void,
                out_len,
                libc::MADV_HUGEPAGE,
            );
        }
    }

    // L1-cached temp buffer for encoding groups of lines.
    // 256 lines × 76 chars = 19,456 bytes — fits comfortably in L1 (32-64KB).
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1;
    let mut wp = 0usize; // write position in output buffer

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        // Scatter-copy full lines from temp to output with newlines
        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter for ILP
        let mut i = 0;
        while i + 8 <= lines {
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = buf.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        // Remaining full lines
        while i < lines {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        // Partial last line (only on final chunk)
        if chunk_rem > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *buf.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }

    out.write_all(&buf[..wp])
}

/// Scatter-copy encoded lines from temp buffer to output buffer with newlines.
/// Uses copy_nonoverlapping since temp and output never overlap.
#[inline]
#[allow(dead_code)]
fn scatter_lines(
    temp: &[u8],
    buf: &mut [u8],
    line_start: usize,
    count: usize,
    wrap_col: usize,
    line_out: usize,
) {
    unsafe {
        let src = temp.as_ptr();
        let dst = buf.as_mut_ptr();
        for i in 0..count {
            let s_off = i * wrap_col;
            let d_off = (line_start + i) * line_out;
            std::ptr::copy_nonoverlapping(src.add(s_off), dst.add(d_off), wrap_col);
            *dst.add(d_off + wrap_col) = b'\n';
        }
    }
}

/// Expand encoded data in-place by inserting newlines at wrap_col boundaries.
/// buf[0..enc_len] contains contiguous encoded data; buf has capacity for out_len.
/// After expansion, buf[0..out_len] contains wrapped output with newlines.
///
/// Processes backward so shifted data never overwrites unread source data.
/// For wrap_col=76: shift is ~1.3% (1 byte per 76), so most copies are
/// non-overlapping and the memmove fast-path (memcpy) is used.
#[inline]
#[allow(dead_code)]
fn expand_backward(ptr: *mut u8, enc_len: usize, out_len: usize, wrap_col: usize) {
    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;

    unsafe {
        let mut rp = enc_len;
        let mut wp = out_len;

        // Handle partial last line (remainder)
        if rem > 0 {
            wp -= 1;
            *ptr.add(wp) = b'\n';
            wp -= rem;
            rp -= rem;
            if rp != wp {
                std::ptr::copy(ptr.add(rp), ptr.add(wp), rem);
            }
        }

        // Process full lines backward
        let mut lines_left = num_full;
        while lines_left >= 8 {
            // Unrolled: 8 lines per iteration
            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            lines_left -= 8;
        }

        // Remaining lines (0-7)
        while lines_left > 0 {
            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            if rp != wp {
                std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
            }
            lines_left -= 1;
        }
    }
}

/// Static newline byte for IoSlice references in writev calls.
static NEWLINE: [u8; 1] = [b'\n'];

/// Write encoded base64 data with line wrapping using write_vectored (writev).
/// Builds IoSlice entries pointing at wrap_col-sized segments of the encoded buffer,
/// interleaved with newline IoSlices, then writes in batches of MAX_IOV entries.
/// The small-output path is zero-copy; larger outputs are fused into an L1-sized batch buffer.
#[inline]
#[allow(dead_code)]
fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
    // Each line needs 2 entries (data + newline), so 512 lines per batch.
    const MAX_IOV: usize = 1024;

    let num_full_lines = encoded.len() / wrap_col;
    let remainder = encoded.len() % wrap_col;
    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };

    // Small output: build all IoSlices and write in one call
    if total_iov <= MAX_IOV {
        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
        let mut pos = 0;
        for _ in 0..num_full_lines {
            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
            iov.push(io::IoSlice::new(&NEWLINE));
            pos += wrap_col;
        }
        if remainder > 0 {
            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
            iov.push(io::IoSlice::new(&NEWLINE));
        }
        return write_all_vectored(out, &iov);
    }

    // Large output: fuse batches of lines into a reusable L1-cached buffer.
    // Each batch copies ~39KB (512 lines × 77 bytes) from the encoded buffer
    // with newlines inserted, then writes as a single contiguous write(2).
    // This is faster than writev with 1024 IoSlice entries because:
    // - One kernel memcpy per batch vs 1024 separate copies
    // - Fused buffer (39KB) stays hot in L1 cache across batches
    let line_out = wrap_col + 1;
    const BATCH_LINES: usize = 512;
    let batch_fused_size = BATCH_LINES * line_out;
    let mut fused: Vec<u8> = Vec::with_capacity(batch_fused_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        fused.set_len(batch_fused_size);
    }

    let mut rp = 0;
    let mut lines_done = 0;

    // Process full batches using 8-line unrolled fuse_wrap
    while lines_done + BATCH_LINES <= num_full_lines {
        let n = fuse_wrap(
            &encoded[rp..rp + BATCH_LINES * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += BATCH_LINES * wrap_col;
        lines_done += BATCH_LINES;
    }

    // Remaining full lines (partial batch)
    let remaining_lines = num_full_lines - lines_done;
    if remaining_lines > 0 {
        let n = fuse_wrap(
            &encoded[rp..rp + remaining_lines * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += remaining_lines * wrap_col;
    }

    // Partial last line
    if remainder > 0 {
        out.write_all(&encoded[rp..rp + remainder])?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Write encoded base64 data with line wrapping using writev, tracking column state
/// across calls. Used by encode_stream for piped input where chunks don't align
/// to line boundaries.
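///
/// Illustrative behaviour: with wrap_col = 4 and `*col == 2`, an `encoded` of
/// b"abcdef" emits "ab\ncdef\n" (the first two chars close the open line) and
/// leaves `*col == 0` for the next call.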
#[inline]
fn write_wrapped_iov_streaming(
    encoded: &[u8],
    wrap_col: usize,
    col: &mut usize,
    out: &mut impl Write,
) -> io::Result<()> {
    const MAX_IOV: usize = 1024;
    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut rp = 0;

    while rp < encoded.len() {
        let space = wrap_col - *col;
        let avail = encoded.len() - rp;

        if avail <= space {
            // Remaining data fits in current line
            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
            *col += avail;
            if *col == wrap_col {
                iov.push(io::IoSlice::new(&NEWLINE));
                *col = 0;
            }
            break;
        } else {
            // Fill current line and add newline
            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
            iov.push(io::IoSlice::new(&NEWLINE));
            rp += space;
            *col = 0;
        }

        if iov.len() >= MAX_IOV - 1 {
            write_all_vectored(out, &iov)?;
            iov.clear();
        }
    }

    if !iov.is_empty() {
        write_all_vectored(out, &iov)?;
    }
    Ok(())
}

/// Parallel wrapped encoding with L1-scatter: each thread encodes groups of
/// lines into a small L1-cached temp buffer, then scatter-copies to its output
/// buffer with newlines. Each output byte is written exactly once per thread.
///
/// Main thread combines all per-thread output buffers with a single writev call.
fn encode_wrapped_parallel(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let lines_per_chunk = ((data.len() / bytes_per_line) / num_threads).max(1);
    let chunk_input = lines_per_chunk * bytes_per_line;

    // Split input at bytes_per_line boundaries (last chunk may have remainder)
    let chunks: Vec<&[u8]> = data.chunks(chunk_input.max(bytes_per_line)).collect();

    // Each thread: L1-scatter encode into output buffer
    let output_chunks: Vec<Vec<u8>> = std::thread::scope(|s| {
        let handles: Vec<_> = chunks
            .iter()
            .map(|chunk| s.spawn(move || encode_chunk_l1_scatter(chunk, wrap_col, bytes_per_line)))
            .collect();
        handles.into_iter().map(|h| h.join().unwrap()).collect()
    });

    // Single writev combining all thread outputs in order
    let slices: Vec<io::IoSlice> = output_chunks.iter().map(|c| io::IoSlice::new(c)).collect();
    write_all_vectored(out, &slices)
}

/// Encode a chunk of data using L1-scatter approach: encode groups of lines
/// into an L1-cached temp buffer, scatter-copy to output with newlines.
/// Returns the ready-to-write output buffer.
fn encode_chunk_l1_scatter(data: &[u8], wrap_col: usize, bytes_per_line: usize) -> Vec<u8> {
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    let full_lines = enc_len / wrap_col;
    let remainder = enc_len % wrap_col;
    let out_size = full_lines * (wrap_col + 1) + if remainder > 0 { remainder + 1 } else { 0 };

    let mut output: Vec<u8> = Vec::with_capacity(out_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(out_size);
    }
    #[cfg(target_os = "linux")]
    if out_size >= 2 * 1024 * 1024 {
        unsafe {
            libc::madvise(
                output.as_mut_ptr() as *mut libc::c_void,
                out_size,
                libc::MADV_HUGEPAGE,
            );
        }
    }

    // L1-cached temp buffer: 256 lines × wrap_col fits in L1 cache
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1;
    let mut wp = 0usize;

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter
        let mut i = 0;
        while i + 8 <= lines {
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = output.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        while i < lines {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    output.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *output.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        if chunk_rem > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    output.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *output.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }

    unsafe { output.set_len(wp) };
    output
}

/// Fuse encoded base64 data with newlines in a single pass.
/// Uses ptr::copy_nonoverlapping with 8-line unrolling for max throughput.
/// Returns number of bytes written.
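///
/// Illustrative call: `fuse_wrap(b"YWJjZGVm", 4, &mut buf)` with `buf.len() >= 10`
/// writes `b"YWJj\nZGVm\n"` into `buf` and returns 10.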
#[inline]
fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
    let mut rp = 0;
    let mut wp = 0;

    // Unrolled: process 8 lines per iteration for better ILP
    while rp + 8 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(4 * wrap_col), dst.add(4 * line_out), wrap_col);
            *dst.add(4 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(5 * wrap_col), dst.add(5 * line_out), wrap_col);
            *dst.add(5 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(6 * wrap_col), dst.add(6 * line_out), wrap_col);
            *dst.add(6 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(7 * wrap_col), dst.add(7 * line_out), wrap_col);
            *dst.add(7 * line_out + wrap_col) = b'\n';
        }
        rp += 8 * wrap_col;
        wp += 8 * line_out;
    }

    // Handle remaining 4 lines at a time
    while rp + 4 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';
        }
        rp += 4 * wrap_col;
        wp += 4 * line_out;
    }

    // Remaining full lines
    while rp + wrap_col <= encoded.len() {
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                wrap_col,
            );
            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
        }
        rp += wrap_col;
        wp += line_out;
    }

    // Partial last line
    if rp < encoded.len() {
        let remaining = encoded.len() - rp;
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                remaining,
            );
        }
        wp += remaining;
        out_buf[wp] = b'\n';
        wp += 1;
    }

    wp
}

/// Fallback for wrap columns too small to hold even one 3-byte input group per line
/// (bytes_per_line == 0).
fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_max);
    }
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());

    let wc = wrap_col.max(1);
    for line in encoded.chunks(wc) {
        out.write_all(line)?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Decode base64 data and write to output (borrows data, allocates clean buffer).
/// When `ignore_garbage` is true, strip all non-base64 characters.
/// When false, only strip whitespace (standard behavior).
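///
/// A minimal usage sketch:
/// ```ignore
/// let mut out = Vec::new();
/// decode_to_writer(b"aGVsbG8=\n", false, &mut out)?;
/// assert_eq!(out, b"hello");
/// ```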
pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        let mut cleaned = strip_non_base64(data);
        return decode_clean_slice(&mut cleaned, out);
    }

    // For large data (>= 512KB): use bulk strip + single-shot decode.
    // try_line_decode decodes per-line (~25ns overhead per 76-byte line call),
    // while strip+decode uses SIMD gap-copy + single-shot SIMD decode at ~6.5 GB/s.
    // For 10MB decode benchmark: ~2ms (bulk) vs ~4ms (per-line) = 2x faster.
    // For small data (< 512KB): per-line decode avoids allocation overhead.
    if data.len() < 512 * 1024 && data.len() >= 77 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    // Fast path: single-pass SIMD strip + decode
    decode_stripping_whitespace(data, out)
}

/// Decode base64 from a mutable buffer (MAP_PRIVATE mmap or owned Vec).
/// Strips whitespace in-place using SIMD memchr2 gap-copy, then decodes
/// in-place with base64_simd::decode_inplace. Zero additional allocations.
///
/// For MAP_PRIVATE mmap: the kernel uses COW semantics, so only pages
/// containing whitespace (newlines) get physically copied (~1.3% for
/// 76-char line base64). The decode writes to the same buffer, but decoded
/// data is always shorter than encoded (3/4 ratio), so it fits in-place.
pub fn decode_mmap_inplace(
    data: &mut [u8],
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // For small data: try line-by-line decode (avoids COW page faults).
    // For large data (>= 512KB): bulk strip+decode is faster than per-line decode.
    if !ignore_garbage && data.len() >= 77 && data.len() < 512 * 1024 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    if ignore_garbage {
        // Strip non-base64 chars in-place
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if is_base64_char(b) {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
            Ok(decoded) => return out.write_all(decoded),
            Err(_) => return decode_error(),
        }
    }

    // Fast path: uniform-line fused strip+decode (no intermediate buffer).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Fallback: strip whitespace in-place using SIMD memchr2 gap-copy.

    // Quick check: no newlines at all — maybe already clean
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // Check for rare whitespace
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            // Perfectly clean — decode in-place directly
            match BASE64_ENGINE.decode_inplace(data) {
                Ok(decoded) => return out.write_all(decoded),
                Err(_) => return decode_error(),
            }
        }
        // Rare whitespace only — strip in-place
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
            Ok(decoded) => return out.write_all(decoded),
            Err(_) => return decode_error(),
        }
    }

    // SIMD gap-copy: strip \n and \r in-place using memchr2
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    // SAFETY: memchr2_iter reads from the original data. We write to positions
    // [0..wp] which are always <= gap_start, so we never overwrite unread data.
    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during the gap-copy
                has_rare_ws = unsafe {
                    std::slice::from_raw_parts(ptr.add(gap_start), gap_len)
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
                };
            }
            if wp != gap_start {
                unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len) };
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = unsafe {
                std::slice::from_raw_parts(ptr.add(gap_start), tail_len)
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
            };
        }
        if wp != gap_start {
            unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len) };
        }
        wp += tail_len;
    }

    // Second pass for rare whitespace if needed
    if has_rare_ws {
        let mut rp = 0;
        let mut cwp = 0;
        while rp < wp {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        wp = cwp;
    }

    // Decode in-place: decoded data is always shorter than encoded (3/4 ratio)
    if wp >= PARALLEL_DECODE_THRESHOLD {
        // For large data, use parallel decode from the cleaned slice
        return decode_borrowed_clean_parallel(out, &data[..wp]);
    }
    match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
        Ok(decoded) => out.write_all(decoded),
        Err(_) => decode_error(),
    }
}

/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
pub fn decode_owned(
    data: &mut Vec<u8>,
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        data.retain(|&b| is_base64_char(b));
    } else {
        strip_whitespace_inplace(data);
    }

    decode_clean_slice(data, out)
}

/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
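///
/// Gap-copy illustration for `b"AAAA\nBBBB\nCC"`: memchr2 hits offsets 4 and 9,
/// the gaps [0..4), [5..9) and the tail [10..12) are shifted left in turn, and
/// the Vec is truncated to `b"AAAABBBBCC"` (10 bytes).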
fn strip_whitespace_inplace(data: &mut Vec<u8>) {
    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    // For typical base64 (76-char lines), we'll find \n immediately and skip this.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            data.retain(|&b| NOT_WHITESPACE[b as usize]);
        }
        return;
    }

    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
    // gaps between them to compact the data in-place. For typical base64 streams,
    // newlines are the only whitespace, so this handles >99% of cases.
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            if wp != gap_start {
                unsafe {
                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                }
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        if wp != gap_start {
            unsafe {
                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
            }
        }
        wp += tail_len;
    }

    data.truncate(wp);

    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        data.truncate(cwp);
    }
}

/// 256-byte lookup table: true for non-whitespace bytes.
/// Used for single-pass whitespace stripping in decode.
static NOT_WHITESPACE: [bool; 256] = {
    let mut table = [true; 256];
    table[b' ' as usize] = false;
    table[b'\t' as usize] = false;
    table[b'\n' as usize] = false;
    table[b'\r' as usize] = false;
    table[0x0b] = false; // vertical tab
    table[0x0c] = false; // form feed
    table
};

/// Fused strip+decode for uniform-line base64 data.
/// Detects consistent line length, then processes in sub-chunks: each sub-chunk
/// copies lines to a small local buffer (L2-hot) and decodes immediately.
/// Eliminates the large intermediate clean buffer (~12MB for 10MB decode).
/// Returns None if the data doesn't have uniform line structure.
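///
/// Example of the structure this detects (line_len = 8, stride = 9):
/// ```text
/// "QUFBQUFB\n" "QkJCQkJC\n" "Q0ND\n"
///   full line    full line   remainder (decoded separately)
/// ```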
fn try_decode_uniform_lines(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl;
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let stride = line_len + 1;

    // Verify the data has consistent line structure (first + last lines)
    let check_lines = 4.min(data.len() / stride);
    for i in 1..check_lines {
        let expected_nl = i * stride - 1;
        if expected_nl >= data.len() || data[expected_nl] != b'\n' {
            return None;
        }
    }

    let full_lines = if data.len() >= stride {
        let candidate = data.len() / stride;
        if candidate > 0 && data[candidate * stride - 1] != b'\n' {
            return None;
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * stride;
    let remainder = &data[remainder_start..];
    let rem_clean = if remainder.last() == Some(&b'\n') {
        &remainder[..remainder.len() - 1]
    } else {
        remainder
    };

    // Compute exact decoded sizes
    let decoded_per_line = line_len * 3 / 4;
    let rem_decoded_size = if rem_clean.is_empty() {
        0
    } else {
        let pad = rem_clean
            .iter()
            .rev()
            .take(2)
            .filter(|&&b| b == b'=')
            .count();
        rem_clean.len() * 3 / 4 - pad
    };
    let total_decoded = full_lines * decoded_per_line + rem_decoded_size;
    let clean_len = full_lines * line_len;

    // Parallel path: fused strip+decode with 256KB sub-chunks per thread.
    // Each thread copies lines to a thread-local buffer (L2-hot) and decodes immediately,
    // eliminating the 12MB+ intermediate clean buffer entirely.
    if clean_len >= PARALLEL_DECODE_THRESHOLD && num_cpus() > 1 {
        let mut output: Vec<u8> = Vec::with_capacity(total_decoded);
        #[allow(clippy::uninit_vec)]
        unsafe {
            output.set_len(total_decoded);
        }

        let out_ptr = output.as_mut_ptr() as usize;
        let src_ptr = data.as_ptr() as usize;
        let num_threads = num_cpus().max(1);
        let lines_per_thread = (full_lines + num_threads - 1) / num_threads;
        let lines_per_sub = (256 * 1024 / line_len).max(1);

        let result: Result<(), io::Error> = std::thread::scope(|s| {
            let handles: Vec<_> = (0..num_threads)
                .map(|t| {
                    s.spawn(move || -> Result<(), io::Error> {
                        let start_line = t * lines_per_thread;
                        if start_line >= full_lines {
                            return Ok(());
                        }
                        let end_line = (start_line + lines_per_thread).min(full_lines);
                        let chunk_lines = end_line - start_line;

                        let sub_buf_size = lines_per_sub.min(chunk_lines) * line_len;
                        let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
                        #[allow(clippy::uninit_vec)]
                        unsafe {
                            local_buf.set_len(sub_buf_size);
                        }

                        let src = src_ptr as *const u8;
                        let out_base = out_ptr as *mut u8;
                        let local_dst = local_buf.as_mut_ptr();

                        let mut sub_start = 0usize;
                        while sub_start < chunk_lines {
                            let sub_count = (chunk_lines - sub_start).min(lines_per_sub);
                            let sub_clean = sub_count * line_len;

                            for i in 0..sub_count {
                                unsafe {
                                    std::ptr::copy_nonoverlapping(
                                        src.add((start_line + sub_start + i) * stride),
                                        local_dst.add(i * line_len),
                                        line_len,
                                    );
                                }
                            }

                            let out_offset = (start_line + sub_start) * decoded_per_line;
                            let out_size = sub_count * decoded_per_line;
                            let out_slice = unsafe {
                                std::slice::from_raw_parts_mut(out_base.add(out_offset), out_size)
                            };
                            BASE64_ENGINE
                                .decode(&local_buf[..sub_clean], out_slice.as_out())
                                .map_err(|_| {
                                    io::Error::new(io::ErrorKind::InvalidData, "invalid input")
                                })?;

                            sub_start += sub_count;
                        }
                        Ok(())
                    })
                })
                .collect();
            for h in handles {
                h.join().unwrap()?;
            }
            Ok(())
        });

        if let Err(e) = result {
            return Some(Err(e));
        }

        if !rem_clean.is_empty() {
            let rem_out = &mut output[full_lines * decoded_per_line..total_decoded];
            match BASE64_ENGINE.decode(rem_clean, rem_out.as_out()) {
                Ok(_) => {}
                Err(_) => return Some(decode_error()),
            }
        }

        return Some(out.write_all(&output[..total_decoded]));
    }

    // Sequential path: fused strip+decode in 256KB sub-chunks.
    // Larger sub-chunks give SIMD decode more data per call, improving throughput.
    // Uses decode_inplace on a small reusable buffer — no large allocations at all.
    let lines_per_sub = (256 * 1024 / line_len).max(1);
    let sub_buf_size = lines_per_sub * line_len;
    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        local_buf.set_len(sub_buf_size);
    }

    let src = data.as_ptr();
    let local_dst = local_buf.as_mut_ptr();

    let mut line_idx = 0usize;
    while line_idx < full_lines {
        let sub_count = (full_lines - line_idx).min(lines_per_sub);
        let sub_clean = sub_count * line_len;

        for i in 0..sub_count {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    src.add((line_idx + i) * stride),
                    local_dst.add(i * line_len),
                    line_len,
                );
            }
        }

        match BASE64_ENGINE.decode_inplace(&mut local_buf[..sub_clean]) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }

        line_idx += sub_count;
    }

    if !rem_clean.is_empty() {
        let mut rem_buf = rem_clean.to_vec();
        match BASE64_ENGINE.decode_inplace(&mut rem_buf) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }
    }

    Some(Ok(()))
}

/// Decode by stripping whitespace and decoding in a single fused pass.
/// For data with no whitespace, decodes directly without any copy.
/// Detects uniform line structure for fast structured-copy (no search needed),
/// falls back to SIMD memchr2 gap-copy for irregular data.
fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    // Fast path for uniform-line base64 (e.g., standard 76-char lines + newline).
    // Copies at known offsets, avoiding the memchr2 search entirely.
    // For 13MB base64: saves ~1ms vs memchr2 gap-copy (just structured memcpy).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            return decode_borrowed_clean(out, data);
        }
        // Has rare whitespace only — strip and decode
        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
        for &b in data {
            if NOT_WHITESPACE[b as usize] {
                cleaned.push(b);
            }
        }
        return decode_clean_slice(&mut cleaned, out);
    }

    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
    let dst = clean.as_mut_ptr();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
    // This avoids the second full-scan pass when only \n/\r are present.
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            // Check gap region for rare whitespace during copy.
            // This adds ~1 branch per gap but eliminates the second full scan.
            if !has_rare_ws {
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            unsafe {
                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap after the last \n/\r
    let tail_len = data.len() - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        unsafe {
            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
        }
        wp += tail_len;
    }
    unsafe {
        clean.set_len(wp);
    }

    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = clean.as_mut_ptr();
        let len = clean.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        clean.truncate(cwp);
    }

    // For large data (>= threshold), use parallel decode for multi-core speedup.
    // For small data, use in-place decode to avoid extra allocation.
    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
        decode_borrowed_clean_parallel(out, &clean)
    } else {
        decode_clean_slice(&mut clean, out)
    }
}

/// Try to decode base64 data line-by-line, avoiding whitespace stripping.
/// Returns Some(result) if the data has uniform line lengths suitable for
/// per-line decode, or None if the data doesn't fit this pattern.
///
/// For standard 76-char-line base64 (wrap=76): each line is 76 encoded chars
/// + newline = 77 bytes. 76 chars = 19 groups of 4 = 57 decoded bytes per line.
/// We decode each line directly into its position in the output buffer.
fn try_line_decode(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    // Find the first newline to determine line length
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl; // encoded chars per line (without newline)

    // Line length must be a multiple of 4 (complete base64 groups, no padding mid-stream)
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let line_stride = line_len + 1; // line_len chars + 1 newline byte
    let decoded_per_line = line_len * 3 / 4;

    // Verify the data has a consistent line structure by checking the next few lines
    let check_lines = 4.min(data.len() / line_stride);
    for i in 1..check_lines {
        let expected_nl = i * line_stride - 1;
        if expected_nl >= data.len() {
            break;
        }
        if data[expected_nl] != b'\n' {
            return None; // Inconsistent line length
        }
    }

    // Calculate full lines and remainder
    let full_lines = if data.len() >= line_stride {
        // Check how many complete lines fit
        let candidate = data.len() / line_stride;
        // Verify the last full line's newline
        if candidate > 0 && data[candidate * line_stride - 1] != b'\n' {
            return None; // Not a clean line-structured file
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * line_stride;
    let remainder = &data[remainder_start..];

    // Calculate exact output size
    let remainder_clean_len = if remainder.is_empty() {
        0
    } else {
        // Remainder might end with newline, strip it
        let rem = if remainder.last() == Some(&b'\n') {
            &remainder[..remainder.len() - 1]
        } else {
            remainder
        };
        if rem.is_empty() {
            0
        } else {
            // Check for padding
            let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
            if rem.len() % 4 != 0 {
                return None; // Invalid remainder
            }
            rem.len() * 3 / 4 - pad
        }
    };

    // Single-allocation decode: allocate full decoded output, decode all lines
    // directly into it, then write_all in one syscall. For 10MB base64 (7.5MB decoded),
    // this does 1 write() instead of ~30 chunked writes. The 7.5MB allocation is trivial
    // compared to the mmap'd input. SIMD decode at ~8 GB/s finishes in <1ms.
    let total_decoded = full_lines * decoded_per_line + remainder_clean_len;
    let mut out_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(total_decoded);
    }

    let dst = out_buf.as_mut_ptr();

1522    // Parallel line decode for large inputs (>= PARALLEL_DECODE_THRESHOLD, 2MB): split lines across threads.
1523    // Each thread decodes a contiguous block of lines directly to its final position
1524    // in the shared output buffer. SAFETY: non-overlapping output regions per thread.
1525    if data.len() >= PARALLEL_DECODE_THRESHOLD && full_lines >= 64 {
1526        let out_addr = dst as usize;
1527        let num_threads = num_cpus().max(1);
1528        let lines_per_chunk = (full_lines / num_threads).max(1);
1529
1530        // Build per-thread task ranges: (start_line, end_line)
1531        let mut tasks: Vec<(usize, usize)> = Vec::new();
1532        let mut line_off = 0;
1533        while line_off < full_lines {
1534            let end = (line_off + lines_per_chunk).min(full_lines);
1535            tasks.push((line_off, end));
1536            line_off = end;
1537        }
1538
1539        let decode_result: Result<(), io::Error> = std::thread::scope(|s| {
1540            let handles: Vec<_> = tasks
1541                .iter()
1542                .map(|&(start_line, end_line)| {
1543                    s.spawn(move || -> Result<(), io::Error> {
1544                        let out_ptr = out_addr as *mut u8;
1545                        let mut i = start_line;
1546
1547                        while i + 4 <= end_line {
1548                            let in_base = i * line_stride;
1549                            let ob = i * decoded_per_line;
1550                            unsafe {
1551                                let s0 = std::slice::from_raw_parts_mut(
1552                                    out_ptr.add(ob),
1553                                    decoded_per_line,
1554                                );
1555                                if BASE64_ENGINE
1556                                    .decode(&data[in_base..in_base + line_len], s0.as_out())
1557                                    .is_err()
1558                                {
1559                                    return Err(io::Error::new(
1560                                        io::ErrorKind::InvalidData,
1561                                        "invalid input",
1562                                    ));
1563                                }
1564                                let s1 = std::slice::from_raw_parts_mut(
1565                                    out_ptr.add(ob + decoded_per_line),
1566                                    decoded_per_line,
1567                                );
1568                                if BASE64_ENGINE
1569                                    .decode(
1570                                        &data[in_base + line_stride
1571                                            ..in_base + line_stride + line_len],
1572                                        s1.as_out(),
1573                                    )
1574                                    .is_err()
1575                                {
1576                                    return Err(io::Error::new(
1577                                        io::ErrorKind::InvalidData,
1578                                        "invalid input",
1579                                    ));
1580                                }
1581                                let s2 = std::slice::from_raw_parts_mut(
1582                                    out_ptr.add(ob + 2 * decoded_per_line),
1583                                    decoded_per_line,
1584                                );
1585                                if BASE64_ENGINE
1586                                    .decode(
1587                                        &data[in_base + 2 * line_stride
1588                                            ..in_base + 2 * line_stride + line_len],
1589                                        s2.as_out(),
1590                                    )
1591                                    .is_err()
1592                                {
1593                                    return Err(io::Error::new(
1594                                        io::ErrorKind::InvalidData,
1595                                        "invalid input",
1596                                    ));
1597                                }
1598                                let s3 = std::slice::from_raw_parts_mut(
1599                                    out_ptr.add(ob + 3 * decoded_per_line),
1600                                    decoded_per_line,
1601                                );
1602                                if BASE64_ENGINE
1603                                    .decode(
1604                                        &data[in_base + 3 * line_stride
1605                                            ..in_base + 3 * line_stride + line_len],
1606                                        s3.as_out(),
1607                                    )
1608                                    .is_err()
1609                                {
1610                                    return Err(io::Error::new(
1611                                        io::ErrorKind::InvalidData,
1612                                        "invalid input",
1613                                    ));
1614                                }
1615                            }
1616                            i += 4;
1617                        }
1618
1619                        while i < end_line {
1620                            let in_start = i * line_stride;
1621                            let out_off = i * decoded_per_line;
1622                            let out_slice = unsafe {
1623                                std::slice::from_raw_parts_mut(
1624                                    out_ptr.add(out_off),
1625                                    decoded_per_line,
1626                                )
1627                            };
1628                            if BASE64_ENGINE
1629                                .decode(&data[in_start..in_start + line_len], out_slice.as_out())
1630                                .is_err()
1631                            {
1632                                return Err(io::Error::new(
1633                                    io::ErrorKind::InvalidData,
1634                                    "invalid input",
1635                                ));
1636                            }
1637                            i += 1;
1638                        }
1639
1640                        Ok(())
1641                    })
1642                })
1643                .collect();
1644            for h in handles {
1645                h.join().unwrap()?;
1646            }
1647            Ok(())
1648        });
1649
1650        if decode_result.is_err() {
1651            return Some(decode_error());
1652        }
1653    } else {
1654        // Sequential decode with 4x unrolling for smaller inputs
1655        let mut i = 0;
1656
1657        while i + 4 <= full_lines {
1658            let in_base = i * line_stride;
1659            let out_base = i * decoded_per_line;
1660            unsafe {
1661                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), decoded_per_line);
1662                if BASE64_ENGINE
1663                    .decode(&data[in_base..in_base + line_len], s0.as_out())
1664                    .is_err()
1665                {
1666                    return Some(decode_error());
1667                }
1668
1669                let s1 = std::slice::from_raw_parts_mut(
1670                    dst.add(out_base + decoded_per_line),
1671                    decoded_per_line,
1672                );
1673                if BASE64_ENGINE
1674                    .decode(
1675                        &data[in_base + line_stride..in_base + line_stride + line_len],
1676                        s1.as_out(),
1677                    )
1678                    .is_err()
1679                {
1680                    return Some(decode_error());
1681                }
1682
1683                let s2 = std::slice::from_raw_parts_mut(
1684                    dst.add(out_base + 2 * decoded_per_line),
1685                    decoded_per_line,
1686                );
1687                if BASE64_ENGINE
1688                    .decode(
1689                        &data[in_base + 2 * line_stride..in_base + 2 * line_stride + line_len],
1690                        s2.as_out(),
1691                    )
1692                    .is_err()
1693                {
1694                    return Some(decode_error());
1695                }
1696
1697                let s3 = std::slice::from_raw_parts_mut(
1698                    dst.add(out_base + 3 * decoded_per_line),
1699                    decoded_per_line,
1700                );
1701                if BASE64_ENGINE
1702                    .decode(
1703                        &data[in_base + 3 * line_stride..in_base + 3 * line_stride + line_len],
1704                        s3.as_out(),
1705                    )
1706                    .is_err()
1707                {
1708                    return Some(decode_error());
1709                }
1710            }
1711            i += 4;
1712        }
1713
1714        while i < full_lines {
1715            let in_start = i * line_stride;
1716            let in_end = in_start + line_len;
1717            let out_off = i * decoded_per_line;
1718            let out_slice =
1719                unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), decoded_per_line) };
1720            match BASE64_ENGINE.decode(&data[in_start..in_end], out_slice.as_out()) {
1721                Ok(_) => {}
1722                Err(_) => return Some(decode_error()),
1723            }
1724            i += 1;
1725        }
1726    }
1727
1728    // Decode remainder
1729    if remainder_clean_len > 0 {
1730        let rem = if remainder.last() == Some(&b'\n') {
1731            &remainder[..remainder.len() - 1]
1732        } else {
1733            remainder
1734        };
1735        let out_off = full_lines * decoded_per_line;
1736        let out_slice =
1737            unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), remainder_clean_len) };
1738        match BASE64_ENGINE.decode(rem, out_slice.as_out()) {
1739            Ok(_) => {}
1740            Err(_) => return Some(decode_error()),
1741        }
1742    }
1743
1744    // Single write_all for the entire decoded output
1745    Some(out.write_all(&out_buf[..total_decoded]))
1746}
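
// A minimal unit-test sketch (assumption: test-only code is acceptable in this file;
// the module name and sample values are illustrative, not part of the original source).
// It exercises try_line_decode on a tiny hand-built input with uniform 8-char lines:
// 18 plaintext bytes -> 24 encoded chars -> three "XXXXXXXX\n" lines of stride 9 with
// decoded_per_line = 6, so the per-line fast path should reproduce the plaintext.
#[cfg(test)]
mod try_line_decode_tests {
    use super::*;

    #[test]
    fn uniform_lines_decode_to_original_bytes() {
        // base64("abcdefghijklmnopqr") == "YWJjZGVmZ2hpamtsbW5vcHFy", wrapped at 8 chars.
        let wrapped = b"YWJjZGVm\nZ2hpamts\nbW5vcHFy\n";
        let mut out = Vec::new();
        let res = try_line_decode(wrapped, &mut out).expect("uniform lines should take this path");
        res.unwrap();
        assert_eq!(&out[..], &b"abcdefghijklmnopqr"[..]);
    }

    #[test]
    fn non_multiple_of_4_line_length_is_rejected() {
        // 6-char lines are not complete base64 groups, so this path must bail out (None).
        assert!(try_line_decode(b"YWJjZG\nVmZ2hp\n", &mut io::sink()).is_none());
    }
}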
1747
1748/// Decode a clean (no whitespace) buffer in-place with SIMD.
1749fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
1750    if data.is_empty() {
1751        return Ok(());
1752    }
1753    match BASE64_ENGINE.decode_inplace(data) {
1754        Ok(decoded) => out.write_all(decoded),
1755        Err(_) => decode_error(),
1756    }
1757}
1758
1759/// Cold error path — keeps hot decode path tight by moving error construction out of line.
1760#[cold]
1761#[inline(never)]
1762fn decode_error() -> io::Result<()> {
1763    Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
1764}
1765
1766/// Decode clean base64 data (no whitespace) from a borrowed slice.
1767fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1768    if data.is_empty() {
1769        return Ok(());
1770    }
1771    // Parallel decode for large data: split at 4-byte boundaries,
1772    // decode each chunk independently (base64 is context-free per 4-char group).
1773    if data.len() >= PARALLEL_DECODE_THRESHOLD {
1774        return decode_borrowed_clean_parallel(out, data);
1775    }
1776    // Pre-allocate exact output size to avoid decode_to_vec's reallocation.
1777    // Decoded size = data.len() * 3 / 4 minus padding.
1778    let pad = data.iter().rev().take(2).filter(|&&b| b == b'=').count();
1779    let decoded_size = data.len() * 3 / 4 - pad;
1780    let mut buf: Vec<u8> = Vec::with_capacity(decoded_size);
1781    #[allow(clippy::uninit_vec)]
1782    unsafe {
1783        buf.set_len(decoded_size);
1784    }
1785    match BASE64_ENGINE.decode(data, buf[..decoded_size].as_out()) {
1786        Ok(decoded) => {
1787            out.write_all(decoded)?;
1788            Ok(())
1789        }
1790        Err(_) => decode_error(),
1791    }
1792}
1793
1794/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel.
1795/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
1796/// upfront, so each thread decodes directly to its final position. No compaction needed.
1797fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1798    let num_threads = num_cpus().max(1);
1799    let raw_chunk = data.len() / num_threads;
1800    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
1801    let chunk_size = raw_chunk.next_multiple_of(4);
1802
1803    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();
1804
1805    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
1806    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
1807    offsets.push(0);
1808    let mut total_decoded = 0usize;
1809    for (i, chunk) in chunks.iter().enumerate() {
1810        let decoded_size = if i == chunks.len() - 1 {
1811            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
1812            chunk.len() * 3 / 4 - pad
1813        } else {
1814            chunk.len() * 3 / 4
1815        };
1816        total_decoded += decoded_size;
1817        offsets.push(total_decoded);
1818    }
1819
1820    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
1821    #[allow(clippy::uninit_vec)]
1822    unsafe {
1823        output_buf.set_len(total_decoded);
1824    }
1825
1826    // Parallel decode: each thread decodes directly into its exact final position.
1827    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
1828    let out_addr = output_buf.as_mut_ptr() as usize;
1829    let decode_result: Result<(), io::Error> = std::thread::scope(|s| {
1830        let handles: Vec<_> = chunks
1831            .iter()
1832            .enumerate()
1833            .map(|(i, chunk)| {
1834                let offset = offsets[i];
1835                let expected_size = offsets[i + 1] - offset;
1836                s.spawn(move || -> Result<(), io::Error> {
1837                    // SAFETY: each thread writes to non-overlapping region
1838                    let out_slice = unsafe {
1839                        std::slice::from_raw_parts_mut(
1840                            (out_addr as *mut u8).add(offset),
1841                            expected_size,
1842                        )
1843                    };
1844                    let decoded = BASE64_ENGINE
1845                        .decode(chunk, out_slice.as_out())
1846                        .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid input"))?;
1847                    debug_assert_eq!(decoded.len(), expected_size);
1848                    Ok(())
1849                })
1850            })
1851            .collect();
1852        for h in handles {
1853            h.join().unwrap()?;
1854        }
1855        Ok(())
1856    });
1857
1858    decode_result?;
1859
1860    out.write_all(&output_buf[..total_decoded])
1861}
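
// A minimal unit-test sketch (assumption: illustrative test-only code; names and inputs
// are not from the original file). The first case drives the small sequential path of
// decode_borrowed_clean; the second calls decode_borrowed_clean_parallel directly so the
// 4-byte-aligned chunking and upfront offset computation are exercised even on tiny data.
#[cfg(test)]
mod decode_clean_tests {
    use super::*;

    #[test]
    fn borrowed_clean_decodes_padded_input() {
        let mut out = Vec::new();
        decode_borrowed_clean(&mut out, b"aGVsbG8=").unwrap();
        assert_eq!(&out[..], &b"hello"[..]);
    }

    #[test]
    fn parallel_decode_matches_sequential() {
        let mut seq = Vec::new();
        let mut par = Vec::new();
        decode_borrowed_clean(&mut seq, b"aGVsbG8gd29ybGQ=").unwrap();
        decode_borrowed_clean_parallel(&mut par, b"aGVsbG8gd29ybGQ=").unwrap();
        assert_eq!(seq, par);
        assert_eq!(&par[..], &b"hello world"[..]);
    }
}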
1862
1863/// Strip non-base64 characters (for -i / --ignore-garbage).
1864fn strip_non_base64(data: &[u8]) -> Vec<u8> {
1865    data.iter()
1866        .copied()
1867        .filter(|&b| is_base64_char(b))
1868        .collect()
1869}
1870
1871/// Check if a byte is a valid base64 alphabet character or padding.
1872#[inline]
1873fn is_base64_char(b: u8) -> bool {
1874    b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'='
1875}
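
// A small sketch of the -i / --ignore-garbage filtering (assumption: illustrative
// test-only code). strip_non_base64 should keep alphabet characters and '=' padding
// and drop everything that is_base64_char rejects, including whitespace and punctuation.
#[cfg(test)]
mod ignore_garbage_tests {
    use super::*;

    #[test]
    fn garbage_bytes_are_dropped() {
        assert_eq!(strip_non_base64(b"aGVs bG8=\n!"), b"aGVsbG8=".to_vec());
        assert!(is_base64_char(b'+') && is_base64_char(b'/') && is_base64_char(b'='));
        assert!(!is_base64_char(b' ') && !is_base64_char(b'\n') && !is_base64_char(b'-'));
    }
}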
1876
1877/// Stream-encode from a reader to a writer. Used for stdin processing.
1878/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
1879pub fn encode_stream(
1880    reader: &mut impl Read,
1881    wrap_col: usize,
1882    writer: &mut impl Write,
1883) -> io::Result<()> {
1884    if wrap_col == 0 {
1885        return encode_stream_nowrap(reader, writer);
1886    }
1887    encode_stream_wrapped(reader, wrap_col, writer)
1888}
1889
1890/// Streaming encode with NO line wrapping — optimized fast path.
1891/// Read size is 24MB (divisible by 3): encoded output = 24MB * 4/3 = 32MB.
1892/// 24MB reads mean a typical 10-18MB input is consumed in a single read() call,
1893/// and the encoded output goes out in one or two write() calls.
1894fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1895    // 24MB aligned to 3 bytes: 24MB reads handle up to 24MB input in one pass.
1896    const NOWRAP_READ: usize = 24 * 1024 * 1024; // exactly divisible by 3
1897
1898    // SAFETY: buf bytes are written by read_full before being processed.
1899    // encode_buf bytes are written by encode before being read.
1900    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
1901    #[allow(clippy::uninit_vec)]
1902    unsafe {
1903        buf.set_len(NOWRAP_READ);
1904    }
1905    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
1906    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1907    #[allow(clippy::uninit_vec)]
1908    unsafe {
1909        encode_buf.set_len(encode_buf_size);
1910    }
1911
1912    loop {
1913        let n = read_full(reader, &mut buf)?;
1914        if n == 0 {
1915            break;
1916        }
1917        let enc_len = BASE64_ENGINE.encoded_length(n);
1918        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1919        writer.write_all(encoded)?;
1920    }
1921    Ok(())
1922}
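
// A tiny streaming sketch (assumption: illustrative test-only code; note the buffer sizes
// above are unchanged, so this allocates the full 24MB/32MB scratch buffers even for a
// 5-byte input). With wrap_col == 0 the dispatcher takes this no-wrap path and the output
// contains no newlines at all.
#[cfg(test)]
mod encode_stream_nowrap_tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn nowrap_stream_has_no_newlines() {
        let mut reader = Cursor::new(b"hello".to_vec());
        let mut out = Vec::new();
        encode_stream(&mut reader, 0, &mut out).unwrap();
        assert_eq!(&out[..], &b"aGVsbG8="[..]);
    }
}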
1923
1924/// Streaming encode WITH line wrapping.
1925/// For the common case (wrap_col divides evenly into 3-byte input groups),
1926/// dispatches to encode_stream_wrapped_fused, which encodes each line directly into
1927/// a contiguous output buffer with newlines interleaved, then writes it in a single
1928/// write() call, eliminating many writev() syscalls (one per ~512 lines via IoSlice).
1929///
1930/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
1931fn encode_stream_wrapped(
1932    reader: &mut impl Read,
1933    wrap_col: usize,
1934    writer: &mut impl Write,
1935) -> io::Result<()> {
1936    let bytes_per_line = wrap_col * 3 / 4;
1937    // For the common case (76-col wrapping, bytes_per_line=57 which is divisible by 3),
1938    // align the read buffer to bytes_per_line boundaries so each chunk produces
1939    // complete lines with no column carry-over between chunks.
1940    if bytes_per_line > 0 && bytes_per_line.is_multiple_of(3) {
1941        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
1942    }
1943
1944    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
1945    const STREAM_READ: usize = 12 * 1024 * 1024;
1946    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
1947    #[allow(clippy::uninit_vec)]
1948    unsafe {
1949        buf.set_len(STREAM_READ);
1950    }
1951    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
1952    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1953    #[allow(clippy::uninit_vec)]
1954    unsafe {
1955        encode_buf.set_len(encode_buf_size);
1956    }
1957
1958    let mut col = 0usize;
1959
1960    loop {
1961        let n = read_full(reader, &mut buf)?;
1962        if n == 0 {
1963            break;
1964        }
1965        let enc_len = BASE64_ENGINE.encoded_length(n);
1966        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1967
1968        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
1969    }
1970
1971    if col > 0 {
1972        writer.write_all(b"\n")?;
1973    }
1974
1975    Ok(())
1976}
1977
1978/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
1979/// encode each line directly into its final position with newline appended.
1980/// Eliminates the two-pass encode-then-fuse_wrap approach.
1981/// For 76-col wrapping (bytes_per_line=57): 24MB / 57 = ~440K complete lines per chunk.
1982/// Output = 440K * 77 bytes = ~32MB, one write() syscall per chunk.
1983fn encode_stream_wrapped_fused(
1984    reader: &mut impl Read,
1985    wrap_col: usize,
1986    bytes_per_line: usize,
1987    writer: &mut impl Write,
1988) -> io::Result<()> {
1989    // Align read size to bytes_per_line for complete output lines per chunk.
1990    // ~440K lines * 57 bytes ≈ 24MB input, ~32MB output.
1991    let lines_per_chunk = (24 * 1024 * 1024) / bytes_per_line;
1992    let read_size = lines_per_chunk * bytes_per_line;
1993    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline
1994
1995    // SAFETY: buf bytes are written by read_full before being processed.
1996    // out_buf bytes are written by encode before being read.
1997    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
1998    #[allow(clippy::uninit_vec)]
1999    unsafe {
2000        buf.set_len(read_size);
2001    }
2002    // Output buffer: enough for all lines + remainder
2003    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
2004    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
2005    #[allow(clippy::uninit_vec)]
2006    unsafe {
2007        out_buf.set_len(max_output);
2008    }
2009
2010    loop {
2011        let n = read_full(reader, &mut buf)?;
2012        if n == 0 {
2013            break;
2014        }
2015
2016        let full_lines = n / bytes_per_line;
2017        let remainder = n % bytes_per_line;
2018
2019        // Encode each input line directly into its final output position.
2020        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
2021        // This eliminates the separate encode + fuse_wrap copy entirely.
2022        let dst = out_buf.as_mut_ptr();
2023        let mut line_idx = 0;
2024
2025        // 4-line unrolled loop for better ILP
2026        while line_idx + 4 <= full_lines {
2027            let in_base = line_idx * bytes_per_line;
2028            let out_base = line_idx * line_out;
2029            unsafe {
2030                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
2031                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
2032                *dst.add(out_base + wrap_col) = b'\n';
2033
2034                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
2035                let _ = BASE64_ENGINE.encode(
2036                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
2037                    s1.as_out(),
2038                );
2039                *dst.add(out_base + line_out + wrap_col) = b'\n';
2040
2041                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
2042                let _ = BASE64_ENGINE.encode(
2043                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
2044                    s2.as_out(),
2045                );
2046                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
2047
2048                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
2049                let _ = BASE64_ENGINE.encode(
2050                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
2051                    s3.as_out(),
2052                );
2053                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
2054            }
2055            line_idx += 4;
2056        }
2057
2058        // Remaining full lines
2059        while line_idx < full_lines {
2060            let in_base = line_idx * bytes_per_line;
2061            let out_base = line_idx * line_out;
2062            unsafe {
2063                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
2064                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
2065                *dst.add(out_base + wrap_col) = b'\n';
2066            }
2067            line_idx += 1;
2068        }
2069
2070        let mut wp = full_lines * line_out;
2071
2072        // Handle remainder (partial last line of this chunk)
2073        if remainder > 0 {
2074            let enc_len = BASE64_ENGINE.encoded_length(remainder);
2075            let line_input = &buf[full_lines * bytes_per_line..n];
2076            unsafe {
2077                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
2078                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
2079                *dst.add(wp + enc_len) = b'\n';
2080            }
2081            wp += enc_len + 1;
2082        }
2083
2084        writer.write_all(&out_buf[..wp])?;
2085    }
2086
2087    Ok(())
2088}
2089
2090/// Stream-decode from a reader to a writer. Used for stdin processing.
2091/// In-place strip + decode: read chunk -> strip whitespace in-place in read buffer
2092/// -> decode in-place -> write. Eliminates separate clean buffer allocation (saves 32MB).
2093/// Uses 32MB read buffer for maximum pipe throughput — read_full retries to
2094/// fill the entire buffer from the pipe, and 32MB means even large inputs
2095/// (up to ~24MB after base64 encoding of 18MB raw) are read in a single syscall batch.
2096pub fn decode_stream(
2097    reader: &mut impl Read,
2098    ignore_garbage: bool,
2099    writer: &mut impl Write,
2100) -> io::Result<()> {
2101    const READ_CHUNK: usize = 32 * 1024 * 1024;
2102    // SAFETY: buf bytes are written by read_full before being processed.
2103    // The extra 4 bytes leave room for the up-to-3-byte carry-over from the previous chunk.
2104    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
2105    #[allow(clippy::uninit_vec)]
2106    unsafe {
2107        buf.set_len(READ_CHUNK + 4);
2108    }
2109    let mut carry = [0u8; 4];
2110    let mut carry_len = 0usize;
2111
2112    loop {
2113        // Copy carry bytes to start of buffer, read new data after them
2114        if carry_len > 0 {
2115            unsafe {
2116                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
2117            }
2118        }
2119        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
2120        if n == 0 {
2121            break;
2122        }
2123        let total_raw = carry_len + n;
2124
2125        // Strip whitespace in-place in the buffer itself.
2126        // This eliminates the separate clean buffer allocation (saves 32MB).
2127        let clean_len = if ignore_garbage {
2128            // Scalar filter for ignore_garbage mode (rare path)
2129            let ptr = buf.as_mut_ptr();
2130            let mut wp = 0usize;
2131            for i in 0..total_raw {
2132                let b = unsafe { *ptr.add(i) };
2133                if is_base64_char(b) {
2134                    unsafe { *ptr.add(wp) = b };
2135                    wp += 1;
2136                }
2137            }
2138            wp
2139        } else {
2140            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
2141            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
2142            // so we process ~76 bytes per memchr hit.
2143            let ptr = buf.as_mut_ptr();
2144            let data = &buf[..total_raw];
2145            let mut wp = 0usize;
2146            let mut gap_start = 0usize;
2147            let mut has_rare_ws = false;
2148
2149            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
2150                let gap_len = pos - gap_start;
2151                if gap_len > 0 {
2152                    if !has_rare_ws {
2153                        has_rare_ws = data[gap_start..pos]
2154                            .iter()
2155                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
2156                    }
2157                    if wp != gap_start {
2158                        unsafe {
2159                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
2160                        }
2161                    }
2162                    wp += gap_len;
2163                }
2164                gap_start = pos + 1;
2165            }
2166            let tail_len = total_raw - gap_start;
2167            if tail_len > 0 {
2168                if !has_rare_ws {
2169                    has_rare_ws = data[gap_start..total_raw]
2170                        .iter()
2171                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
2172                }
2173                if wp != gap_start {
2174                    unsafe {
2175                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
2176                    }
2177                }
2178                wp += tail_len;
2179            }
2180
2181            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
2182            if has_rare_ws {
2183                let mut rp = 0;
2184                let mut cwp = 0;
2185                while rp < wp {
2186                    let b = unsafe { *ptr.add(rp) };
2187                    if NOT_WHITESPACE[b as usize] {
2188                        unsafe { *ptr.add(cwp) = b };
2189                        cwp += 1;
2190                    }
2191                    rp += 1;
2192                }
2193                cwp
2194            } else {
2195                wp
2196            }
2197        };
2198
2199        carry_len = 0;
2200        let is_last = n < READ_CHUNK;
2201
2202        if is_last {
2203            // Last chunk: decode everything (including padding)
2204            decode_clean_slice(&mut buf[..clean_len], writer)?;
2205        } else {
2206            // Save incomplete base64 quadruplet for next iteration
2207            let decode_len = (clean_len / 4) * 4;
2208            let leftover = clean_len - decode_len;
2209            if leftover > 0 {
2210                unsafe {
2211                    std::ptr::copy_nonoverlapping(
2212                        buf.as_ptr().add(decode_len),
2213                        carry.as_mut_ptr(),
2214                        leftover,
2215                    );
2216                }
2217                carry_len = leftover;
2218            }
2219            if decode_len > 0 {
2220                decode_clean_slice(&mut buf[..decode_len], writer)?;
2221            }
2222        }
2223    }
2224
2225    // Handle any remaining carry-over bytes
2226    if carry_len > 0 {
2227        let mut carry_buf = carry[..carry_len].to_vec();
2228        decode_clean_slice(&mut carry_buf, writer)?;
2229    }
2230
2231    Ok(())
2232}
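
// Round-trip sketch for the streaming paths (assumption: illustrative test-only code).
// wrap_col = 8 gives bytes_per_line = 6, a multiple of 3, so encode_stream takes the
// fused direct-to-position path; decode_stream then strips the newlines in place and
// must recover the original bytes.
#[cfg(test)]
mod stream_round_trip_tests {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn wrapped_encode_then_decode_round_trips() {
        let original = b"abcdefghijklmnopqr".to_vec();

        let mut encoded = Vec::new();
        encode_stream(&mut Cursor::new(original.clone()), 8, &mut encoded).unwrap();
        assert_eq!(&encoded[..], &b"YWJjZGVm\nZ2hpamts\nbW5vcHFy\n"[..]);

        let mut decoded = Vec::new();
        decode_stream(&mut Cursor::new(encoded), false, &mut decoded).unwrap();
        assert_eq!(decoded, original);
    }
}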
2233
2234/// Write all IoSlice entries using write_vectored (writev syscall).
2235/// Hot path: single write_vectored succeeds fully (common on Linux pipes/files).
2236/// Cold path: partial write handled out-of-line to keep hot path tight.
2237#[inline(always)]
2238fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
2239    if slices.is_empty() {
2240        return Ok(());
2241    }
2242    let total: usize = slices.iter().map(|s| s.len()).sum();
2243    let written = out.write_vectored(slices)?;
2244    if written >= total {
2245        return Ok(());
2246    }
2247    if written == 0 {
2248        return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
2249    }
2250    write_all_vectored_slow(out, slices, written)
2251}
2252
2253/// Handle partial write (cold path, never inlined).
2254#[cold]
2255#[inline(never)]
2256fn write_all_vectored_slow(
2257    out: &mut impl Write,
2258    slices: &[io::IoSlice],
2259    mut skip: usize,
2260) -> io::Result<()> {
2261    for slice in slices {
2262        let len = slice.len();
2263        if skip >= len {
2264            skip -= len;
2265            continue;
2266        }
2267        out.write_all(&slice[skip..])?;
2268        skip = 0;
2269    }
2270    Ok(())
2271}
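
// A sketch exercising the partial-write fallback (assumption: illustrative test-only
// code). The writer below accepts only a couple of bytes per call, so write_all_vectored
// must fall through to write_all_vectored_slow and still deliver every byte, in order,
// across both slices.
#[cfg(test)]
mod write_vectored_tests {
    use super::*;

    /// Writer that takes at most 2 bytes per write()/write_vectored() call.
    struct Trickle(Vec<u8>);

    impl Write for Trickle {
        fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
            let n = buf.len().min(2);
            self.0.extend_from_slice(&buf[..n]);
            Ok(n)
        }
        fn write_vectored(&mut self, bufs: &[io::IoSlice<'_>]) -> io::Result<usize> {
            // Take at most 2 bytes from the first non-empty slice.
            for s in bufs {
                if !s.is_empty() {
                    return self.write(s);
                }
            }
            Ok(0)
        }
        fn flush(&mut self) -> io::Result<()> {
            Ok(())
        }
    }

    #[test]
    fn partial_vectored_writes_still_deliver_everything() {
        let mut w = Trickle(Vec::new());
        let slices = [io::IoSlice::new(b"abcd"), io::IoSlice::new(b"efg\n")];
        write_all_vectored(&mut w, &slices).unwrap();
        assert_eq!(&w.0[..], &b"abcdefg\n"[..]);
    }
}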
2272
2273/// Read as many bytes as possible into buf, retrying on partial reads.
2274/// Fast path: regular file reads usually return the full buffer on the first call,
2275/// avoiding the loop overhead entirely.
2276#[inline]
2277fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2278    // Fast path: first read() usually fills the entire buffer for regular files
2279    let n = reader.read(buf)?;
2280    if n == buf.len() || n == 0 {
2281        return Ok(n);
2282    }
2283    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
2284    let mut total = n;
2285    while total < buf.len() {
2286        match reader.read(&mut buf[total..]) {
2287            Ok(0) => break,
2288            Ok(n) => total += n,
2289            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2290            Err(e) => return Err(e),
2291        }
2292    }
2293    Ok(total)
2294}
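
// A sketch of the retry behavior (assumption: illustrative test-only code). The reader
// below returns a single byte per read() call, like a slow pipe, so the fast path cannot
// fill the buffer and read_full must loop until the slice is full or EOF is reached.
#[cfg(test)]
mod read_full_tests {
    use super::*;

    /// Reader that yields at most one byte per read() call.
    struct OneByteReader<'a>(&'a [u8]);

    impl Read for OneByteReader<'_> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            if self.0.is_empty() || buf.is_empty() {
                return Ok(0);
            }
            buf[0] = self.0[0];
            self.0 = &self.0[1..];
            Ok(1)
        }
    }

    #[test]
    fn read_full_retries_until_buffer_is_full() {
        let mut reader = OneByteReader(b"abcdef");
        let mut buf = [0u8; 4];
        assert_eq!(read_full(&mut reader, &mut buf).unwrap(), 4);
        assert_eq!(&buf, b"abcd");

        // A second call drains the rest and stops at EOF.
        let mut tail = [0u8; 4];
        assert_eq!(read_full(&mut reader, &mut tail).unwrap(), 2);
        assert_eq!(&tail[..2], &b"ef"[..]);
    }
}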