coreutils_rs/base64/core.rs

use std::io::{self, Read, Write};

use base64_simd::AsOut;

const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;

/// Number of available CPUs, as reported by the OS. Used for the parallel encode
/// thresholds, so small inputs avoid triggering Rayon's thread-pool initialization.
#[inline]
fn num_cpus() -> usize {
    std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(1)
}

/// Chunk size for sequential no-wrap encoding: 8MB aligned to 3 bytes.
/// Larger chunks reduce function call overhead per iteration while still
/// keeping peak buffer allocation reasonable (~10.7MB for the output).
const NOWRAP_CHUNK: usize = 8 * 1024 * 1024 - (8 * 1024 * 1024 % 3);

/// Minimum data size for parallel no-wrap encoding (4MB).
/// For 1-2MB input, thread creation (~200µs for 4 threads) + per-thread
/// buffer allocation page faults (~0.3ms) exceed the parallel encoding
/// benefit. At 4MB+, the ~2x parallel speedup amortizes overhead.
const PARALLEL_NOWRAP_THRESHOLD: usize = 4 * 1024 * 1024;

/// Minimum data size for parallel wrapped encoding (2MB).
/// Wrapped parallel uses N threads for SIMD encoding, providing ~Nx
/// speedup. Per-thread buffers (~2.5MB each for 10MB input) page-fault
/// concurrently, and std::thread::scope avoids Rayon pool init (~300µs).
const PARALLEL_WRAPPED_THRESHOLD: usize = 2 * 1024 * 1024;

/// Minimum data size for parallel decoding (2MB of base64 data).
/// Lower threshold lets parallel decode kick in earlier for medium files.
const PARALLEL_DECODE_THRESHOLD: usize = 2 * 1024 * 1024;

/// Encode data and write to output with line wrapping.
/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
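/// # Examples
///
/// A minimal usage sketch (marked `ignore`; calling this from outside the crate is an
/// assumption about how the module is exposed):
///
/// ```ignore
/// let mut out = Vec::new();
/// encode_to_writer(b"hello", 76, &mut out).unwrap();
/// assert_eq!(out, b"aGVsbG8=\n");
/// ```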
pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if wrap_col == 0 {
        return encode_no_wrap(data, out);
    }

    encode_wrapped(data, wrap_col, out)
}

/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_NOWRAP_THRESHOLD && num_cpus() > 1 {
        return encode_no_wrap_parallel(data, out);
    }

    // Single-buffer encode: for data that fits in one chunk, encode directly
    // and write once. For larger data, reuse the buffer across chunks.
    let enc_len = BASE64_ENGINE.encoded_length(data.len().min(NOWRAP_CHUNK));
    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_len);
    }

    for chunk in data.chunks(NOWRAP_CHUNK) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let encoded = BASE64_ENGINE.encode(chunk, buf[..clen].as_out());
        out.write_all(encoded)?;
    }
    Ok(())
}

/// Parallel no-wrap encoding: split at 3-byte boundaries, encode chunks in parallel.
/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
///
/// Uses std::thread::scope instead of Rayon to avoid pool initialization overhead (~300µs).
/// Each scoped thread allocates its own output buffer and encodes independently.
/// Output uses writev to combine all per-thread buffers in a single syscall.
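/// E.g. (illustrative numbers), a 12MiB input on 4 CPUs splits into four 3MiB chunks
/// (already 3-byte aligned); each thread encodes its chunk to 4MiB of base64, and the
/// four buffers go out in one writev call.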
fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 3 bytes so each chunk encodes without padding (except the last)
    let chunk_size = ((raw_chunk + 2) / 3) * 3;

    // Split input into 3-byte-aligned chunks
    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();

    // Each scoped thread allocates its own output buffer and encodes independently.
    let results: Vec<Vec<u8>> = std::thread::scope(|s| {
        let handles: Vec<_> = chunks
            .iter()
            .map(|chunk| {
                s.spawn(|| {
                    let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
                    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
                    #[allow(clippy::uninit_vec)]
                    unsafe {
                        buf.set_len(enc_len);
                    }
                    // HUGEPAGE on per-thread buffer reduces page faults
                    #[cfg(target_os = "linux")]
                    if enc_len >= 2 * 1024 * 1024 {
                        unsafe {
                            libc::madvise(
                                buf.as_mut_ptr() as *mut libc::c_void,
                                enc_len,
                                libc::MADV_HUGEPAGE,
                            );
                        }
                    }
                    let _ = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
                    buf
                })
            })
            .collect();
        handles.into_iter().map(|h| h.join().unwrap()).collect()
    });

    // Single writev for all chunks in order
    let slices: Vec<io::IoSlice> = results.iter().map(|r| io::IoSlice::new(r)).collect();
    write_all_vectored(out, &slices)
}

/// Encode with line wrapping using in-place expansion.
/// Phase 1: bulk-encode a chunk of input in one SIMD pass into a buffer.
/// Phase 2: expand backwards to insert newlines between wrap_col-sized segments.
/// Phase 3: a single write_all of the completed chunk.
///
/// This avoids both fuse_wrap's copy pass and writev's 300+ syscall overhead,
/// using one reused allocation and one write syscall per ~8MB chunk of input.
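/// Illustrative example (tiny assumed wrap_col = 4, so bytes_per_line = 3): bulk-encoding
/// 9 input bytes yields 12 contiguous chars "AAAABBBBCCCC"; the backward expansion then
/// moves line i right by exactly i bytes (one per preceding newline) and writes a '\n'
/// after it, producing "AAAA\nBBBB\nCCCC\n" in the same buffer without a separate fused
/// output buffer.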
fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Calculate bytes_per_line: input bytes that produce exactly wrap_col encoded chars.
    // For default wrap_col=76: 76*3/4 = 57 bytes per line.
    let bytes_per_line = wrap_col * 3 / 4;
    if bytes_per_line == 0 {
        // Degenerate case: wrap_col < 4, fall back to byte-at-a-time
        return encode_wrapped_small(data, wrap_col, out);
    }

    // Parallel encoding for large data when bytes_per_line is a multiple of 3.
    // This guarantees each chunk encodes to complete base64 without padding.
    if data.len() >= PARALLEL_WRAPPED_THRESHOLD && bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
    }

    // Bulk encode + backward expansion: encode entire chunk in one SIMD call,
    // then expand backward to insert newlines. Replaces ~140K per-line encode(57)
    // calls with 1-2 bulk encode calls + expansion, reducing per-call overhead
    // and improving SIMD pipeline utilization (48-byte SIMD lanes amortized better).
    //
    // For large data (>8MB), processes in chunks to reduce peak memory allocation.
    if bytes_per_line.is_multiple_of(3) {
        let line_out = wrap_col + 1;

        // Chunk size: 8MB of input, aligned to bytes_per_line
        const MAX_CHUNK_INPUT: usize = 8 * 1024 * 1024;
        let lines_per_chunk = MAX_CHUNK_INPUT / bytes_per_line;
        let chunk_input = lines_per_chunk * bytes_per_line;
        let chunk_output = lines_per_chunk * line_out;

        // Allocate buffer for one chunk (reused across chunks)
        let buf_cap = chunk_output + line_out + 8; // +line_out for remainder
        let mut buf: Vec<u8> = Vec::with_capacity(buf_cap);
        #[allow(clippy::uninit_vec)]
        unsafe {
            buf.set_len(buf_cap);
        }
        // HUGEPAGE reduces page faults for the ~10MB output buffer
        #[cfg(target_os = "linux")]
        if buf_cap >= 2 * 1024 * 1024 {
            unsafe {
                libc::madvise(
                    buf.as_mut_ptr() as *mut libc::c_void,
                    buf_cap,
                    libc::MADV_HUGEPAGE,
                );
            }
        }

        let mut data_off = 0;

        // Process full chunks: bulk encode then expand backward
        while data_off + chunk_input <= data.len() {
            let chunk_data = &data[data_off..data_off + chunk_input];
            let enc_len = lines_per_chunk * wrap_col;

            // Phase 1: Bulk encode entire chunk in one SIMD call
            unsafe {
                let s = std::slice::from_raw_parts_mut(buf.as_mut_ptr(), enc_len);
                let _ = BASE64_ENGINE.encode(chunk_data, s.as_out());
            }

            // Phase 2: Expand backward to insert newlines
            unsafe {
                let ptr = buf.as_mut_ptr();
                let mut i = lines_per_chunk;
                while i > 0 {
                    i -= 1;
                    let src_off = i * wrap_col;
                    let dst_off = i * line_out;
                    *ptr.add(dst_off + wrap_col) = b'\n';
                    if dst_off != src_off {
                        std::ptr::copy(ptr.add(src_off), ptr.add(dst_off), wrap_col);
                    }
                }
            }

            out.write_all(&buf[..chunk_output])?;
            data_off += chunk_input;
        }

        // Remaining data (partial chunk)
        let remaining = data.len() - data_off;
        if remaining > 0 {
            let remaining_data = &data[data_off..];
            let full_lines = remaining / bytes_per_line;
            let remainder_input = remaining % bytes_per_line;
            let remainder_encoded = if remainder_input > 0 {
                BASE64_ENGINE.encoded_length(remainder_input) + 1
            } else {
                0
            };
            let remaining_output = full_lines * line_out + remainder_encoded;

            // Ensure buffer is large enough for the remainder
            if remaining_output > buf.len() {
                buf.reserve(remaining_output - buf.len());
                #[allow(clippy::uninit_vec)]
                unsafe {
                    buf.set_len(remaining_output);
                }
            }

            if full_lines > 0 {
                let full_input = &remaining_data[..full_lines * bytes_per_line];
                let enc_len = full_lines * wrap_col;

                // Bulk encode full lines
                unsafe {
                    let s = std::slice::from_raw_parts_mut(buf.as_mut_ptr(), enc_len);
                    let _ = BASE64_ENGINE.encode(full_input, s.as_out());
                }

                // Expand backward
                unsafe {
                    let ptr = buf.as_mut_ptr();
                    let mut i = full_lines;
                    while i > 0 {
                        i -= 1;
                        let src_off = i * wrap_col;
                        let dst_off = i * line_out;
                        *ptr.add(dst_off + wrap_col) = b'\n';
                        if dst_off != src_off {
                            std::ptr::copy(ptr.add(src_off), ptr.add(dst_off), wrap_col);
                        }
                    }
                }
            }

            if remainder_input > 0 {
                let in_off = full_lines * bytes_per_line;
                let out_off = full_lines * line_out;
                let enc_len = BASE64_ENGINE.encoded_length(remainder_input);
                unsafe {
                    let s = std::slice::from_raw_parts_mut(buf.as_mut_ptr().add(out_off), enc_len);
                    let _ = BASE64_ENGINE.encode(&remaining_data[in_off..], s.as_out());
                    *buf.as_mut_ptr().add(out_off + enc_len) = b'\n';
                }
            }

            out.write_all(&buf[..remaining_output])?;
        }

        return Ok(());
    }

    // Fallback for non-3-aligned bytes_per_line: chunk + in-place expansion
    let lines_per_chunk = (32 * 1024 * 1024) / bytes_per_line;
    let max_input_chunk = (lines_per_chunk * bytes_per_line).max(bytes_per_line);

    let enc_max = BASE64_ENGINE.encoded_length(max_input_chunk.min(data.len()));
    let num_lines_max = enc_max / wrap_col + 1;
    let out_max = num_lines_max * (wrap_col + 1) + wrap_col + 1;
    let mut buf: Vec<u8> = Vec::with_capacity(out_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_max);
    }

    for chunk in data.chunks(max_input_chunk.max(1)) {
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
        let num_full = enc_len / wrap_col;
        let rem = enc_len % wrap_col;
        let chunk_out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

        // Expand backwards
        unsafe {
            let ptr = buf.as_mut_ptr();
            let mut rp = enc_len;
            let mut wp = chunk_out_len;
            if rem > 0 {
                wp -= 1;
                *ptr.add(wp) = b'\n';
                wp -= rem;
                rp -= rem;
                if rp != wp {
                    std::ptr::copy(ptr.add(rp), ptr.add(wp), rem);
                }
            }
            for _ in 0..num_full {
                wp -= 1;
                *ptr.add(wp) = b'\n';
                wp -= wrap_col;
                rp -= wrap_col;
                if rp != wp {
                    std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
                }
            }
        }
        out.write_all(&buf[..chunk_out_len])?;
    }

    Ok(())
}

/// Static newline byte for IoSlice references in writev calls.
static NEWLINE: [u8; 1] = [b'\n'];

/// Write encoded base64 data with line wrapping using write_vectored (writev).
/// Builds IoSlice entries pointing at wrap_col-sized segments of the encoded buffer,
/// interleaved with newline IoSlices, then writes in batches of at most MAX_IOV entries.
/// This is zero-copy: no fused output buffer needed.
#[inline]
#[allow(dead_code)]
fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
    // Each line needs 2 entries (data + newline), so 512 lines per batch.
    const MAX_IOV: usize = 1024;

    let num_full_lines = encoded.len() / wrap_col;
    let remainder = encoded.len() % wrap_col;
    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };

    // Small output: build all IoSlices and write in one call
    if total_iov <= MAX_IOV {
        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
        let mut pos = 0;
        for _ in 0..num_full_lines {
            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
            iov.push(io::IoSlice::new(&NEWLINE));
            pos += wrap_col;
        }
        if remainder > 0 {
            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
            iov.push(io::IoSlice::new(&NEWLINE));
        }
        return write_all_vectored(out, &iov);
    }

    // Large output: write in batches
    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut pos = 0;
    for _ in 0..num_full_lines {
        iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
        iov.push(io::IoSlice::new(&NEWLINE));
        pos += wrap_col;
        if iov.len() >= MAX_IOV {
            write_all_vectored(out, &iov)?;
            iov.clear();
        }
    }
    if remainder > 0 {
        iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
        iov.push(io::IoSlice::new(&NEWLINE));
    }
    if !iov.is_empty() {
        write_all_vectored(out, &iov)?;
    }
    Ok(())
}

/// Write encoded base64 data with line wrapping using writev, tracking column state
/// across calls. Used by encode_stream for piped input where chunks don't align
/// to line boundaries.
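/// For instance (illustrative values), with wrap_col = 4 and an incoming *col of 2,
/// six encoded bytes produce the slices "xx", "\n", "xxxx", "\n" and leave *col at 0
/// for the next call.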
#[inline]
fn write_wrapped_iov_streaming(
    encoded: &[u8],
    wrap_col: usize,
    col: &mut usize,
    out: &mut impl Write,
) -> io::Result<()> {
    const MAX_IOV: usize = 1024;
    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut rp = 0;

    while rp < encoded.len() {
        let space = wrap_col - *col;
        let avail = encoded.len() - rp;

        if avail <= space {
            // Remaining data fits in current line
            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
            *col += avail;
            if *col == wrap_col {
                iov.push(io::IoSlice::new(&NEWLINE));
                *col = 0;
            }
            break;
        } else {
            // Fill current line and add newline
            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
            iov.push(io::IoSlice::new(&NEWLINE));
            rp += space;
            *col = 0;
        }

        if iov.len() >= MAX_IOV - 1 {
            write_all_vectored(out, &iov)?;
            iov.clear();
        }
    }

    if !iov.is_empty() {
        write_all_vectored(out, &iov)?;
    }
    Ok(())
}

/// Parallel wrapped encoding with per-thread output buffers.
/// Requires bytes_per_line % 3 == 0 so each chunk encodes without intermediate padding.
///
/// Each thread encodes its chunk of input lines into its own buffer (with newlines),
/// then writev combines all buffers in order. This avoids a single ~13.5MB shared
/// output buffer whose page faults (~3400 faults = ~3.4ms) dominate encoding time.
/// Per-thread buffers (~3.4MB each) page-fault concurrently, reducing wall-clock to ~0.8ms.
fn encode_wrapped_parallel(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let line_out = wrap_col + 1;
    let total_full_lines = data.len() / bytes_per_line;

    // Split work at line boundaries for parallel processing
    let num_threads = num_cpus().max(1);
    let lines_per_chunk = (total_full_lines / num_threads).max(1);

    // Build per-thread input ranges aligned to bytes_per_line
    let mut tasks: Vec<(usize, usize)> = Vec::new(); // (input_offset, num_input_bytes)
    let mut in_off = 0usize;
    while in_off < data.len() {
        let chunk_input = (lines_per_chunk * bytes_per_line).min(data.len() - in_off);
        let aligned_input = if in_off + chunk_input < data.len() {
            (chunk_input / bytes_per_line) * bytes_per_line
        } else {
            chunk_input
        };
        if aligned_input == 0 {
            break;
        }
        tasks.push((in_off, aligned_input));
        in_off += aligned_input;
    }

    // Each scoped thread bulk-encodes its chunk in one SIMD call, then
    // expands backward to insert newlines. This replaces ~44K per-line
    // encode(57) calls with 1 bulk encode(2.5MB) + backward expansion,
    // reducing per-call overhead and improving SIMD pipeline utilization.
    let results: Vec<Vec<u8>> = std::thread::scope(|s| {
        let handles: Vec<_> = tasks
            .iter()
            .map(|&(in_off, chunk_len)| {
                s.spawn(move || {
                    let input = &data[in_off..in_off + chunk_len];
                    let full_lines = chunk_len / bytes_per_line;
                    let rem = chunk_len % bytes_per_line;

                    let remainder_encoded = if rem > 0 {
                        BASE64_ENGINE.encoded_length(rem) + 1
                    } else {
                        0
                    };
                    let buf_size = full_lines * line_out + remainder_encoded;

                    let mut buf: Vec<u8> = Vec::with_capacity(buf_size);
                    #[allow(clippy::uninit_vec)]
                    unsafe {
                        buf.set_len(buf_size);
                    }
                    // HUGEPAGE on per-thread buffer reduces page faults
                    #[cfg(target_os = "linux")]
                    if buf_size >= 2 * 1024 * 1024 {
                        unsafe {
                            libc::madvise(
                                buf.as_mut_ptr() as *mut libc::c_void,
                                buf_size,
                                libc::MADV_HUGEPAGE,
                            );
                        }
                    }

                    if full_lines > 0 {
                        let full_input = &input[..full_lines * bytes_per_line];
                        let enc_len = full_lines * wrap_col;

                        // Phase 1: Bulk encode all full lines in one SIMD call.
                        // Produces contiguous base64 without newlines at buf[0..enc_len].
                        unsafe {
                            let s = std::slice::from_raw_parts_mut(buf.as_mut_ptr(), enc_len);
                            let _ = BASE64_ENGINE.encode(full_input, s.as_out());
                        }

                        // Phase 2: Expand backward to insert newlines every wrap_col chars.
                        // Process from last line to first: each line shifts right by its
                        // index (1 byte per preceding newline). Uses memmove for overlap.
                        unsafe {
                            let ptr = buf.as_mut_ptr();
                            let mut i = full_lines;
                            while i > 0 {
                                i -= 1;
                                let src_off = i * wrap_col;
                                let dst_off = i * line_out;
                                // Insert newline after the line
                                *ptr.add(dst_off + wrap_col) = b'\n';
                                // Move the line data (src/dst ranges overlap when i < wrap_col)
                                if dst_off != src_off {
                                    std::ptr::copy(ptr.add(src_off), ptr.add(dst_off), wrap_col);
                                }
                            }
                        }
                    }

                    if rem > 0 {
                        let line_input = &input[full_lines * bytes_per_line..];
                        let enc_len = BASE64_ENGINE.encoded_length(rem);
                        let woff = full_lines * line_out;
                        unsafe {
                            let s =
                                std::slice::from_raw_parts_mut(buf.as_mut_ptr().add(woff), enc_len);
                            let _ = BASE64_ENGINE.encode(line_input, s.as_out());
                            *buf.as_mut_ptr().add(woff + enc_len) = b'\n';
                        }
                    }

                    buf
                })
            })
            .collect();
        handles.into_iter().map(|h| h.join().unwrap()).collect()
    });

    // Single writev for all per-thread buffers in order
    let slices: Vec<io::IoSlice> = results.iter().map(|r| io::IoSlice::new(r)).collect();
    write_all_vectored(out, &slices)
}

/// Fuse encoded base64 data with newlines in a single pass.
/// Uses ptr::copy_nonoverlapping with 8-line unrolling for max throughput.
/// Returns number of bytes written.
#[inline]
#[allow(dead_code)]
fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
    let mut rp = 0;
    let mut wp = 0;

    // Unrolled: process 8 lines per iteration for better ILP
    while rp + 8 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(4 * wrap_col), dst.add(4 * line_out), wrap_col);
            *dst.add(4 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(5 * wrap_col), dst.add(5 * line_out), wrap_col);
            *dst.add(5 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(6 * wrap_col), dst.add(6 * line_out), wrap_col);
            *dst.add(6 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(7 * wrap_col), dst.add(7 * line_out), wrap_col);
            *dst.add(7 * line_out + wrap_col) = b'\n';
        }
        rp += 8 * wrap_col;
        wp += 8 * line_out;
    }

    // Handle remaining 4 lines at a time
    while rp + 4 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';
        }
        rp += 4 * wrap_col;
        wp += 4 * line_out;
    }

    // Remaining full lines
    while rp + wrap_col <= encoded.len() {
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                wrap_col,
            );
            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
        }
        rp += wrap_col;
        wp += line_out;
    }

    // Partial last line
    if rp < encoded.len() {
        let remaining = encoded.len() - rp;
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                remaining,
            );
        }
        wp += remaining;
        out_buf[wp] = b'\n';
        wp += 1;
    }

    wp
}

/// Fallback for very small wrap columns (< 4 chars).
fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_max);
    }
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());

    let wc = wrap_col.max(1);
    for line in encoded.chunks(wc) {
        out.write_all(line)?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Decode base64 data and write to output (borrows data, allocates clean buffer).
/// When `ignore_garbage` is true, strip all non-base64 characters.
/// When false, only strip whitespace (standard behavior).
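/// # Examples
///
/// A minimal usage sketch (marked `ignore`; the external call path is an assumption):
///
/// ```ignore
/// let mut out = Vec::new();
/// decode_to_writer(b"aGVsbG8=\n", false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```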
pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        let mut cleaned = strip_non_base64(data);
        return decode_clean_slice(&mut cleaned, out);
    }

    // For large data (>= 512KB): use bulk strip + single-shot decode.
    // try_line_decode decodes per-line (~25ns overhead per 76-byte line call),
    // while strip+decode uses SIMD gap-copy + single-shot SIMD decode at ~6.5 GB/s.
    // For 10MB decode benchmark: ~2ms (bulk) vs ~4ms (per-line) = 2x faster.
    // For small data (< 512KB): per-line decode avoids allocation overhead.
    if data.len() < 512 * 1024 && data.len() >= 77 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    // Fast path: single-pass SIMD strip + decode
    decode_stripping_whitespace(data, out)
}

/// Decode base64 from a mutable buffer (MAP_PRIVATE mmap or owned Vec).
/// Strips whitespace in-place using SIMD memchr2 gap-copy, then decodes
/// in-place with base64_simd::decode_inplace. Zero additional allocations.
///
/// For MAP_PRIVATE mmap: the kernel uses COW semantics, so only pages
/// containing whitespace (newlines) get physically copied (~1.3% for
/// 76-char line base64). The decode writes to the same buffer, but decoded
/// data is always shorter than encoded (3/4 ratio), so it fits in-place.
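/// # Examples
///
/// A minimal usage sketch (marked `ignore`; the external call path is an assumption):
///
/// ```ignore
/// let mut buf = b"aGVsbG8=\n".to_vec();
/// let mut out = Vec::new();
/// decode_mmap_inplace(&mut buf, false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```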
pub fn decode_mmap_inplace(
    data: &mut [u8],
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // For small data: try line-by-line decode (avoids COW page faults).
    // For large data (>= 512KB): bulk strip+decode is faster than per-line decode.
    if !ignore_garbage && data.len() >= 77 && data.len() < 512 * 1024 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    if ignore_garbage {
        // Strip non-base64 chars in-place
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if is_base64_char(b) {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
            Ok(decoded) => return out.write_all(decoded),
            Err(_) => return decode_error(),
        }
    }

    // Fast path: uniform-line fused strip+decode (no intermediate buffer).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Fallback: strip whitespace in-place using SIMD memchr2 gap-copy.

    // Quick check: no newlines at all — maybe already clean
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // Check for rare whitespace
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            // Perfectly clean — decode in-place directly
            match BASE64_ENGINE.decode_inplace(data) {
                Ok(decoded) => return out.write_all(decoded),
                Err(_) => return decode_error(),
            }
        }
        // Rare whitespace only — strip in-place
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
            Ok(decoded) => return out.write_all(decoded),
            Err(_) => return decode_error(),
        }
    }

    // SIMD gap-copy: strip \n and \r in-place using memchr2
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    // SAFETY: memchr2_iter reads from the original data. We write to positions
    // [0..wp] which are always <= gap_start, so we never overwrite unread data.
    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during the gap-copy
                has_rare_ws = unsafe {
                    std::slice::from_raw_parts(ptr.add(gap_start), gap_len)
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
                };
            }
            if wp != gap_start {
                unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len) };
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = unsafe {
                std::slice::from_raw_parts(ptr.add(gap_start), tail_len)
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
            };
        }
        if wp != gap_start {
            unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len) };
        }
        wp += tail_len;
    }

    // Second pass for rare whitespace if needed
    if has_rare_ws {
        let mut rp = 0;
        let mut cwp = 0;
        while rp < wp {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        wp = cwp;
    }

    // Decode in-place: decoded data is always shorter than encoded (3/4 ratio)
    if wp >= PARALLEL_DECODE_THRESHOLD {
        // For large data, use parallel decode from the cleaned slice
        return decode_borrowed_clean_parallel(out, &data[..wp]);
    }
    match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
        Ok(decoded) => out.write_all(decoded),
        Err(_) => decode_error(),
    }
}

/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
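/// # Examples
///
/// A minimal usage sketch (marked `ignore`; the external call path is an assumption):
///
/// ```ignore
/// let mut buf = b"aGVs\nbG8=\n".to_vec();
/// let mut out = Vec::new();
/// decode_owned(&mut buf, false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```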
pub fn decode_owned(
    data: &mut Vec<u8>,
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        data.retain(|&b| is_base64_char(b));
    } else {
        strip_whitespace_inplace(data);
    }

    decode_clean_slice(data, out)
}

/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
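/// E.g., `b"aGVs\nbG8=\n".to_vec()` compacts in place to `aGVsbG8=` (truncated to 8 bytes);
/// only the segment after the first newline actually needs a memmove.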
fn strip_whitespace_inplace(data: &mut Vec<u8>) {
    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    // For typical base64 (76-char lines), we'll find \n immediately and skip this.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            data.retain(|&b| NOT_WHITESPACE[b as usize]);
        }
        return;
    }

    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
    // gaps between them to compact the data in-place. For typical base64 streams,
    // newlines are the only whitespace, so this handles >99% of cases.
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            if wp != gap_start {
                unsafe {
                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                }
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        if wp != gap_start {
            unsafe {
                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
            }
        }
        wp += tail_len;
    }

    data.truncate(wp);

    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        data.truncate(cwp);
    }
}

/// 256-byte lookup table: true for non-whitespace bytes.
/// Used for single-pass whitespace stripping in decode.
static NOT_WHITESPACE: [bool; 256] = {
    let mut table = [true; 256];
    table[b' ' as usize] = false;
    table[b'\t' as usize] = false;
    table[b'\n' as usize] = false;
    table[b'\r' as usize] = false;
    table[0x0b] = false; // vertical tab
    table[0x0c] = false; // form feed
    table
};

/// Fused strip+decode for uniform-line base64 data.
/// Detects consistent line length, then processes in sub-chunks: each sub-chunk
/// copies lines to a small local buffer (L2-hot) and decodes immediately.
/// Eliminates the large intermediate clean buffer (~12MB for 10MB decode).
/// Returns None if the data doesn't have uniform line structure.
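/// E.g., with 76-char lines and the 256KB sub-chunk budget, each iteration copies and
/// decodes 3449 lines (~262KB of clean base64), keeping the working set L2-resident.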
fn try_decode_uniform_lines(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl;
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let stride = line_len + 1;

    // Verify the data has consistent line structure (first + last lines)
    let check_lines = 4.min(data.len() / stride);
    for i in 1..check_lines {
        let expected_nl = i * stride - 1;
        if expected_nl >= data.len() || data[expected_nl] != b'\n' {
            return None;
        }
    }

    let full_lines = if data.len() >= stride {
        let candidate = data.len() / stride;
        if candidate > 0 && data[candidate * stride - 1] != b'\n' {
            return None;
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * stride;
    let remainder = &data[remainder_start..];
    let rem_clean = if remainder.last() == Some(&b'\n') {
        &remainder[..remainder.len() - 1]
    } else {
        remainder
    };

    // Compute exact decoded sizes
    let decoded_per_line = line_len * 3 / 4;
    let rem_decoded_size = if rem_clean.is_empty() {
        0
    } else {
        let pad = rem_clean
            .iter()
            .rev()
            .take(2)
            .filter(|&&b| b == b'=')
            .count();
        rem_clean.len() * 3 / 4 - pad
    };
    let total_decoded = full_lines * decoded_per_line + rem_decoded_size;
    let clean_len = full_lines * line_len;

    // Parallel path: fused strip+decode with 256KB sub-chunks per thread.
    // Each thread copies lines to a thread-local buffer (L2-hot) and decodes immediately,
    // eliminating the 12MB+ intermediate clean buffer entirely.
    if clean_len >= PARALLEL_DECODE_THRESHOLD && num_cpus() > 1 {
        let mut output: Vec<u8> = Vec::with_capacity(total_decoded);
        #[allow(clippy::uninit_vec)]
        unsafe {
            output.set_len(total_decoded);
        }

        let out_ptr = output.as_mut_ptr() as usize;
        let src_ptr = data.as_ptr() as usize;
        let num_threads = num_cpus().max(1);
        let lines_per_thread = (full_lines + num_threads - 1) / num_threads;
        let lines_per_sub = (256 * 1024 / line_len).max(1);

        let result: Result<(), io::Error> = std::thread::scope(|s| {
            let handles: Vec<_> = (0..num_threads)
                .map(|t| {
                    s.spawn(move || -> Result<(), io::Error> {
                        let start_line = t * lines_per_thread;
                        if start_line >= full_lines {
                            return Ok(());
                        }
                        let end_line = (start_line + lines_per_thread).min(full_lines);
                        let chunk_lines = end_line - start_line;

                        let sub_buf_size = lines_per_sub.min(chunk_lines) * line_len;
                        let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
                        #[allow(clippy::uninit_vec)]
                        unsafe {
                            local_buf.set_len(sub_buf_size);
                        }

                        let src = src_ptr as *const u8;
                        let out_base = out_ptr as *mut u8;
                        let local_dst = local_buf.as_mut_ptr();

                        let mut sub_start = 0usize;
                        while sub_start < chunk_lines {
                            let sub_count = (chunk_lines - sub_start).min(lines_per_sub);
                            let sub_clean = sub_count * line_len;

                            for i in 0..sub_count {
                                unsafe {
                                    std::ptr::copy_nonoverlapping(
                                        src.add((start_line + sub_start + i) * stride),
                                        local_dst.add(i * line_len),
                                        line_len,
                                    );
                                }
                            }

                            let out_offset = (start_line + sub_start) * decoded_per_line;
                            let out_size = sub_count * decoded_per_line;
                            let out_slice = unsafe {
                                std::slice::from_raw_parts_mut(out_base.add(out_offset), out_size)
                            };
                            BASE64_ENGINE
                                .decode(&local_buf[..sub_clean], out_slice.as_out())
                                .map_err(|_| {
                                    io::Error::new(io::ErrorKind::InvalidData, "invalid input")
                                })?;

                            sub_start += sub_count;
                        }
                        Ok(())
                    })
                })
                .collect();
            for h in handles {
                h.join().unwrap()?;
            }
            Ok(())
        });

        if let Err(e) = result {
            return Some(Err(e));
        }

        if !rem_clean.is_empty() {
            let rem_out = &mut output[full_lines * decoded_per_line..total_decoded];
            match BASE64_ENGINE.decode(rem_clean, rem_out.as_out()) {
                Ok(_) => {}
                Err(_) => return Some(decode_error()),
            }
        }

        return Some(out.write_all(&output[..total_decoded]));
    }

    // Sequential path: fused strip+decode in 256KB sub-chunks.
    // Larger sub-chunks give SIMD decode more data per call, improving throughput.
    // Uses decode_inplace on a small reusable buffer — no large allocations at all.
    let lines_per_sub = (256 * 1024 / line_len).max(1);
    let sub_buf_size = lines_per_sub * line_len;
    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        local_buf.set_len(sub_buf_size);
    }

    let src = data.as_ptr();
    let local_dst = local_buf.as_mut_ptr();

    let mut line_idx = 0usize;
    while line_idx < full_lines {
        let sub_count = (full_lines - line_idx).min(lines_per_sub);
        let sub_clean = sub_count * line_len;

        for i in 0..sub_count {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    src.add((line_idx + i) * stride),
                    local_dst.add(i * line_len),
                    line_len,
                );
            }
        }

        match BASE64_ENGINE.decode_inplace(&mut local_buf[..sub_clean]) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }

        line_idx += sub_count;
    }

    if !rem_clean.is_empty() {
        let mut rem_buf = rem_clean.to_vec();
        match BASE64_ENGINE.decode_inplace(&mut rem_buf) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }
    }

    Some(Ok(()))
}

/// Decode by stripping whitespace and decoding in a single fused pass.
/// For data with no whitespace, decodes directly without any copy.
/// Detects uniform line structure for fast structured-copy (no search needed),
/// falls back to SIMD memchr2 gap-copy for irregular data.
fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    // Fast path for uniform-line base64 (e.g., standard 76-char lines + newline).
    // Copies at known offsets, avoiding the memchr2 search entirely.
    // For 13MB base64: saves ~1ms vs memchr2 gap-copy (just structured memcpy).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            return decode_borrowed_clean(out, data);
        }
        // Has rare whitespace only — strip and decode
        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
        for &b in data {
            if NOT_WHITESPACE[b as usize] {
                cleaned.push(b);
            }
        }
        return decode_clean_slice(&mut cleaned, out);
    }

    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
    let dst = clean.as_mut_ptr();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
    // This avoids the second full-scan pass when only \n/\r are present.
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            // Check gap region for rare whitespace during copy.
            // This adds ~1 branch per gap but eliminates the second full scan.
            if !has_rare_ws {
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            unsafe {
                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap after the last \n/\r
    let tail_len = data.len() - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        unsafe {
            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
        }
        wp += tail_len;
    }
    unsafe {
        clean.set_len(wp);
    }

    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = clean.as_mut_ptr();
        let len = clean.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        clean.truncate(cwp);
    }

    // For large data (>= threshold), use parallel decode for multi-core speedup.
    // For small data, use in-place decode to avoid extra allocation.
    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
        decode_borrowed_clean_parallel(out, &clean)
    } else {
        decode_clean_slice(&mut clean, out)
    }
}

/// Try to decode base64 data line-by-line, avoiding whitespace stripping.
/// Returns Some(result) if the data has uniform line lengths suitable for
/// per-line decode, or None if the data doesn't fit this pattern.
///
/// For standard 76-char-line base64 (wrap=76): each line is 76 encoded chars
/// + newline = 77 bytes. 76 chars = 19 groups of 4 = 57 decoded bytes per line.
/// We decode each line directly into its position in the output buffer.
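/// For example, a 770-byte input with this 77-byte stride holds 10 full lines and
/// decodes to exactly 10 * 57 = 570 bytes, line i landing at output offset i * 57.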
fn try_line_decode(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    // Find the first newline to determine line length
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl; // encoded chars per line (without newline)

    // Line length must be a multiple of 4 (complete base64 groups, no padding mid-stream)
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let line_stride = line_len + 1; // line_len chars + 1 newline byte
    let decoded_per_line = line_len * 3 / 4;

    // Verify the data has a consistent line structure by checking the next few lines
    let check_lines = 4.min(data.len() / line_stride);
    for i in 1..check_lines {
        let expected_nl = i * line_stride - 1;
        if expected_nl >= data.len() {
            break;
        }
        if data[expected_nl] != b'\n' {
            return None; // Inconsistent line length
        }
    }

    // Calculate full lines and remainder
    let full_lines = if data.len() >= line_stride {
        // Check how many complete lines fit
        let candidate = data.len() / line_stride;
        // Verify the last full line's newline
        if candidate > 0 && data[candidate * line_stride - 1] != b'\n' {
            return None; // Not a clean line-structured file
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * line_stride;
    let remainder = &data[remainder_start..];

    // Calculate exact output size
    let remainder_clean_len = if remainder.is_empty() {
        0
    } else {
        // Remainder might end with newline, strip it
        let rem = if remainder.last() == Some(&b'\n') {
            &remainder[..remainder.len() - 1]
        } else {
            remainder
        };
        if rem.is_empty() {
            0
        } else {
            // Check for padding
            let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
            if rem.len() % 4 != 0 {
                return None; // Invalid remainder
            }
            rem.len() * 3 / 4 - pad
        }
    };

    // Single-allocation decode: allocate full decoded output, decode all lines
    // directly into it, then write_all in one syscall. For 10MB base64 (7.5MB decoded),
    // this does 1 write() instead of ~30 chunked writes. The 7.5MB allocation is trivial
    // compared to the mmap'd input. SIMD decode at ~8 GB/s finishes in <1ms.
    let total_decoded = full_lines * decoded_per_line + remainder_clean_len;
    let mut out_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(total_decoded);
    }

    let dst = out_buf.as_mut_ptr();

    // Parallel line decode for large inputs (>= PARALLEL_DECODE_THRESHOLD): split lines
    // across threads. Each thread decodes a contiguous block of lines directly to its
    // final position in the shared output buffer. SAFETY: non-overlapping output regions
    // per thread.
1373    if data.len() >= PARALLEL_DECODE_THRESHOLD && full_lines >= 64 {
1374        let out_addr = dst as usize;
1375        let num_threads = num_cpus().max(1);
1376        let lines_per_chunk = (full_lines / num_threads).max(1);
1377
1378        // Build per-thread task ranges: (start_line, end_line)
1379        let mut tasks: Vec<(usize, usize)> = Vec::new();
1380        let mut line_off = 0;
1381        while line_off < full_lines {
1382            let end = (line_off + lines_per_chunk).min(full_lines);
1383            tasks.push((line_off, end));
1384            line_off = end;
1385        }
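        // e.g. full_lines = 1_000_000 on 8 CPUs -> lines_per_chunk = 125_000 and tasks
        // (0, 125_000), (125_000, 250_000), ..., (875_000, 1_000_000).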
1386
1387        let decode_result: Result<(), io::Error> = std::thread::scope(|s| {
1388            let handles: Vec<_> = tasks
1389                .iter()
1390                .map(|&(start_line, end_line)| {
1391                    s.spawn(move || -> Result<(), io::Error> {
1392                        let out_ptr = out_addr as *mut u8;
1393                        let mut i = start_line;
1394
1395                        while i + 4 <= end_line {
1396                            let in_base = i * line_stride;
1397                            let ob = i * decoded_per_line;
1398                            unsafe {
1399                                let s0 = std::slice::from_raw_parts_mut(
1400                                    out_ptr.add(ob),
1401                                    decoded_per_line,
1402                                );
1403                                if BASE64_ENGINE
1404                                    .decode(&data[in_base..in_base + line_len], s0.as_out())
1405                                    .is_err()
1406                                {
1407                                    return Err(io::Error::new(
1408                                        io::ErrorKind::InvalidData,
1409                                        "invalid input",
1410                                    ));
1411                                }
1412                                let s1 = std::slice::from_raw_parts_mut(
1413                                    out_ptr.add(ob + decoded_per_line),
1414                                    decoded_per_line,
1415                                );
1416                                if BASE64_ENGINE
1417                                    .decode(
1418                                        &data[in_base + line_stride
1419                                            ..in_base + line_stride + line_len],
1420                                        s1.as_out(),
1421                                    )
1422                                    .is_err()
1423                                {
1424                                    return Err(io::Error::new(
1425                                        io::ErrorKind::InvalidData,
1426                                        "invalid input",
1427                                    ));
1428                                }
1429                                let s2 = std::slice::from_raw_parts_mut(
1430                                    out_ptr.add(ob + 2 * decoded_per_line),
1431                                    decoded_per_line,
1432                                );
1433                                if BASE64_ENGINE
1434                                    .decode(
1435                                        &data[in_base + 2 * line_stride
1436                                            ..in_base + 2 * line_stride + line_len],
1437                                        s2.as_out(),
1438                                    )
1439                                    .is_err()
1440                                {
1441                                    return Err(io::Error::new(
1442                                        io::ErrorKind::InvalidData,
1443                                        "invalid input",
1444                                    ));
1445                                }
1446                                let s3 = std::slice::from_raw_parts_mut(
1447                                    out_ptr.add(ob + 3 * decoded_per_line),
1448                                    decoded_per_line,
1449                                );
1450                                if BASE64_ENGINE
1451                                    .decode(
1452                                        &data[in_base + 3 * line_stride
1453                                            ..in_base + 3 * line_stride + line_len],
1454                                        s3.as_out(),
1455                                    )
1456                                    .is_err()
1457                                {
1458                                    return Err(io::Error::new(
1459                                        io::ErrorKind::InvalidData,
1460                                        "invalid input",
1461                                    ));
1462                                }
1463                            }
1464                            i += 4;
1465                        }
1466
1467                        while i < end_line {
1468                            let in_start = i * line_stride;
1469                            let out_off = i * decoded_per_line;
1470                            let out_slice = unsafe {
1471                                std::slice::from_raw_parts_mut(
1472                                    out_ptr.add(out_off),
1473                                    decoded_per_line,
1474                                )
1475                            };
1476                            if BASE64_ENGINE
1477                                .decode(&data[in_start..in_start + line_len], out_slice.as_out())
1478                                .is_err()
1479                            {
1480                                return Err(io::Error::new(
1481                                    io::ErrorKind::InvalidData,
1482                                    "invalid input",
1483                                ));
1484                            }
1485                            i += 1;
1486                        }
1487
1488                        Ok(())
1489                    })
1490                })
1491                .collect();
1492            for h in handles {
1493                h.join().unwrap()?;
1494            }
1495            Ok(())
1496        });
1497
1498        if decode_result.is_err() {
1499            return Some(decode_error());
1500        }
1501    } else {
1502        // Sequential decode with 4x unrolling for smaller inputs
1503        let mut i = 0;
1504
1505        while i + 4 <= full_lines {
1506            let in_base = i * line_stride;
1507            let out_base = i * decoded_per_line;
1508            unsafe {
1509                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), decoded_per_line);
1510                if BASE64_ENGINE
1511                    .decode(&data[in_base..in_base + line_len], s0.as_out())
1512                    .is_err()
1513                {
1514                    return Some(decode_error());
1515                }
1516
1517                let s1 = std::slice::from_raw_parts_mut(
1518                    dst.add(out_base + decoded_per_line),
1519                    decoded_per_line,
1520                );
1521                if BASE64_ENGINE
1522                    .decode(
1523                        &data[in_base + line_stride..in_base + line_stride + line_len],
1524                        s1.as_out(),
1525                    )
1526                    .is_err()
1527                {
1528                    return Some(decode_error());
1529                }
1530
1531                let s2 = std::slice::from_raw_parts_mut(
1532                    dst.add(out_base + 2 * decoded_per_line),
1533                    decoded_per_line,
1534                );
1535                if BASE64_ENGINE
1536                    .decode(
1537                        &data[in_base + 2 * line_stride..in_base + 2 * line_stride + line_len],
1538                        s2.as_out(),
1539                    )
1540                    .is_err()
1541                {
1542                    return Some(decode_error());
1543                }
1544
1545                let s3 = std::slice::from_raw_parts_mut(
1546                    dst.add(out_base + 3 * decoded_per_line),
1547                    decoded_per_line,
1548                );
1549                if BASE64_ENGINE
1550                    .decode(
1551                        &data[in_base + 3 * line_stride..in_base + 3 * line_stride + line_len],
1552                        s3.as_out(),
1553                    )
1554                    .is_err()
1555                {
1556                    return Some(decode_error());
1557                }
1558            }
1559            i += 4;
1560        }
1561
1562        while i < full_lines {
1563            let in_start = i * line_stride;
1564            let in_end = in_start + line_len;
1565            let out_off = i * decoded_per_line;
1566            let out_slice =
1567                unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), decoded_per_line) };
1568            match BASE64_ENGINE.decode(&data[in_start..in_end], out_slice.as_out()) {
1569                Ok(_) => {}
1570                Err(_) => return Some(decode_error()),
1571            }
1572            i += 1;
1573        }
1574    }
1575
1576    // Decode remainder
1577    if remainder_clean_len > 0 {
1578        let rem = if remainder.last() == Some(&b'\n') {
1579            &remainder[..remainder.len() - 1]
1580        } else {
1581            remainder
1582        };
1583        let out_off = full_lines * decoded_per_line;
1584        let out_slice =
1585            unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), remainder_clean_len) };
1586        match BASE64_ENGINE.decode(rem, out_slice.as_out()) {
1587            Ok(_) => {}
1588            Err(_) => return Some(decode_error()),
1589        }
1590    }
1591
1592    // Single write_all for the entire decoded output
1593    Some(out.write_all(&out_buf[..total_decoded]))
1594}
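
// Illustrative sketch (not part of the original file): a round-trip check for the
// line-decode fast path. It assumes encode_to_writer emits GNU-style 76-char lines;
// the module and test names are hypothetical.
#[cfg(test)]
mod try_line_decode_example {
    use super::*;

    #[test]
    fn round_trips_wrapped_output() {
        let raw: Vec<u8> = (0u8..200).collect();
        let mut encoded = Vec::new();
        encode_to_writer(&raw, 76, &mut encoded).unwrap();

        let mut decoded = Vec::new();
        try_line_decode(&encoded, &mut decoded)
            .expect("76-char lines should match the fast-path pattern")
            .unwrap();
        assert_eq!(decoded, raw);
    }
}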
1595
1596/// Decode a clean (no whitespace) buffer in-place with SIMD.
1597fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
1598    if data.is_empty() {
1599        return Ok(());
1600    }
1601    match BASE64_ENGINE.decode_inplace(data) {
1602        Ok(decoded) => out.write_all(decoded),
1603        Err(_) => decode_error(),
1604    }
1605}
1606
1607/// Cold error path — keeps hot decode path tight by moving error construction out of line.
1608#[cold]
1609#[inline(never)]
1610fn decode_error() -> io::Result<()> {
1611    Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
1612}
1613
1614/// Decode clean base64 data (no whitespace) from a borrowed slice.
1615fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1616    if data.is_empty() {
1617        return Ok(());
1618    }
1619    // Parallel decode for large data: split at 4-byte boundaries,
1620    // decode each chunk independently (base64 is context-free per 4-char group).
1621    if data.len() >= PARALLEL_DECODE_THRESHOLD {
1622        return decode_borrowed_clean_parallel(out, data);
1623    }
1624    // Pre-allocate exact output size to avoid decode_to_vec's reallocation.
1625    // Decoded size = data.len() * 3 / 4 minus padding.
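    // e.g. "Zm9vYg==" (8 chars, 2 padding) -> 8 * 3 / 4 - 2 = 4 decoded bytes ("foob").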
1626    let pad = data.iter().rev().take(2).filter(|&&b| b == b'=').count();
1627    let decoded_size = data.len() * 3 / 4 - pad;
1628    let mut buf: Vec<u8> = Vec::with_capacity(decoded_size);
1629    #[allow(clippy::uninit_vec)]
1630    unsafe {
1631        buf.set_len(decoded_size);
1632    }
1633    match BASE64_ENGINE.decode(data, buf[..decoded_size].as_out()) {
1634        Ok(decoded) => {
1635            out.write_all(decoded)?;
1636            Ok(())
1637        }
1638        Err(_) => decode_error(),
1639    }
1640}
1641
1642/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel.
1643/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
1644/// upfront, so each thread decodes directly to its final position. No compaction needed.
1645fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1646    let num_threads = num_cpus().max(1);
1647    let raw_chunk = data.len() / num_threads;
1648    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
1649    let chunk_size = ((raw_chunk + 3) / 4) * 4;
1650
1651    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();
1652
1653    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
1654    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
1655    offsets.push(0);
1656    let mut total_decoded = 0usize;
1657    for (i, chunk) in chunks.iter().enumerate() {
1658        let decoded_size = if i == chunks.len() - 1 {
1659            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
1660            chunk.len() * 3 / 4 - pad
1661        } else {
1662            chunk.len() * 3 / 4
1663        };
1664        total_decoded += decoded_size;
1665        offsets.push(total_decoded);
1666    }
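    // e.g. three 1MB chunks of unpadded base64 decode to 786_432 bytes each, giving
    // offsets = [0, 786_432, 1_572_864, 2_359_296].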
1667
1668    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
1669    #[allow(clippy::uninit_vec)]
1670    unsafe {
1671        output_buf.set_len(total_decoded);
1672    }
1673
1674    // Parallel decode: each thread decodes directly into its exact final position.
1675    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
1676    let out_addr = output_buf.as_mut_ptr() as usize;
1677    let decode_result: Result<(), io::Error> = std::thread::scope(|s| {
1678        let handles: Vec<_> = chunks
1679            .iter()
1680            .enumerate()
1681            .map(|(i, chunk)| {
1682                let offset = offsets[i];
1683                let expected_size = offsets[i + 1] - offset;
1684                s.spawn(move || -> Result<(), io::Error> {
1685                    // SAFETY: each thread writes to non-overlapping region
1686                    let out_slice = unsafe {
1687                        std::slice::from_raw_parts_mut(
1688                            (out_addr as *mut u8).add(offset),
1689                            expected_size,
1690                        )
1691                    };
1692                    let decoded = BASE64_ENGINE
1693                        .decode(chunk, out_slice.as_out())
1694                        .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid input"))?;
1695                    debug_assert_eq!(decoded.len(), expected_size);
1696                    Ok(())
1697                })
1698            })
1699            .collect();
1700        for h in handles {
1701            h.join().unwrap()?;
1702        }
1703        Ok(())
1704    });
1705
1706    decode_result?;
1707
1708    out.write_all(&output_buf[..total_decoded])
1709}
1710
1711/// Strip non-base64 characters (for -i / --ignore-garbage).
1712fn strip_non_base64(data: &[u8]) -> Vec<u8> {
1713    data.iter()
1714        .copied()
1715        .filter(|&b| is_base64_char(b))
1716        .collect()
1717}
1718
1719/// Check if a byte is a valid base64 alphabet character or padding.
1720#[inline]
1721fn is_base64_char(b: u8) -> bool {
1722    b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'='
1723}
1724
1725/// Stream-encode from a reader to a writer. Used for stdin processing.
1726/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
1727pub fn encode_stream(
1728    reader: &mut impl Read,
1729    wrap_col: usize,
1730    writer: &mut impl Write,
1731) -> io::Result<()> {
1732    if wrap_col == 0 {
1733        return encode_stream_nowrap(reader, writer);
1734    }
1735    encode_stream_wrapped(reader, wrap_col, writer)
1736}
1737
1738/// Streaming encode with NO line wrapping — optimized fast path.
1739/// Read size is 24MB (divisible by 3): encoded output = 24MB * 4/3 = 32MB.
1740/// With 24MB reads, a typical 10-18MB input is consumed in a single read() call,
1741/// and its encoded output goes out in 1-2 write() calls.
1742fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1743    // 24MB, exactly divisible by 3: full chunks encode to complete 4-char groups, so padding only appears after the final partial chunk.
1744    const NOWRAP_READ: usize = 24 * 1024 * 1024; // exactly divisible by 3
1745
1746    // SAFETY: buf bytes are written by read_full before being processed.
1747    // encode_buf bytes are written by encode before being read.
1748    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
1749    #[allow(clippy::uninit_vec)]
1750    unsafe {
1751        buf.set_len(NOWRAP_READ);
1752    }
1753    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
1754    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1755    #[allow(clippy::uninit_vec)]
1756    unsafe {
1757        encode_buf.set_len(encode_buf_size);
1758    }
1759
1760    loop {
1761        let n = read_full(reader, &mut buf)?;
1762        if n == 0 {
1763            break;
1764        }
1765        let enc_len = BASE64_ENGINE.encoded_length(n);
1766        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1767        writer.write_all(encoded)?;
1768    }
1769    Ok(())
1770}
1771
1772/// Streaming encode WITH line wrapping.
1773/// For the common case (wrap_col divisible by 4, so each output line covers a whole
1774/// number of 3-byte input groups), dispatches to a fused path that encodes each line
1775/// directly into a contiguous output buffer with newlines interleaved, then writes it
1776/// in a single write() call, avoiding many writev() syscalls (one per ~512 lines via IoSlice).
1777///
1778/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
1779fn encode_stream_wrapped(
1780    reader: &mut impl Read,
1781    wrap_col: usize,
1782    writer: &mut impl Write,
1783) -> io::Result<()> {
1784    let bytes_per_line = wrap_col * 3 / 4;
1785    // For the common case (76-col wrapping: wrap_col % 4 == 0, so bytes_per_line=57 is a
1786    // whole number of 3-byte groups and every full input line encodes to exactly wrap_col
1787    // chars), align reads to bytes_per_line boundaries with no column carry-over between chunks.
1788    if bytes_per_line > 0 && wrap_col.is_multiple_of(4) {
1789        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
1790    }
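    // e.g. -w 76: bytes_per_line = 57 and 76 % 4 == 0, so the fused path runs;
    // -w 70: 70 % 4 != 0, so encoding falls through to the writev fallback below.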
1791
1792    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
1793    const STREAM_READ: usize = 12 * 1024 * 1024;
1794    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
1795    #[allow(clippy::uninit_vec)]
1796    unsafe {
1797        buf.set_len(STREAM_READ);
1798    }
1799    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
1800    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1801    #[allow(clippy::uninit_vec)]
1802    unsafe {
1803        encode_buf.set_len(encode_buf_size);
1804    }
1805
1806    let mut col = 0usize;
1807
1808    loop {
1809        let n = read_full(reader, &mut buf)?;
1810        if n == 0 {
1811            break;
1812        }
1813        let enc_len = BASE64_ENGINE.encoded_length(n);
1814        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1815
1816        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
1817    }
1818
1819    if col > 0 {
1820        writer.write_all(b"\n")?;
1821    }
1822
1823    Ok(())
1824}
1825
1826/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
1827/// encode each line directly into its final position with newline appended.
1828/// Eliminates the two-pass encode-then-fuse_wrap approach.
1829/// For 76-col wrapping (bytes_per_line=57): 24MB / 57 = ~440K complete lines per chunk.
1830/// Output = ~440K * 77 bytes = ~32MB, one write() syscall per chunk.
1831fn encode_stream_wrapped_fused(
1832    reader: &mut impl Read,
1833    wrap_col: usize,
1834    bytes_per_line: usize,
1835    writer: &mut impl Write,
1836) -> io::Result<()> {
1837    // Align read size to bytes_per_line for complete output lines per chunk.
1838    // ~440K lines * 57 bytes = ~24MB input, ~32MB output.
1839    let lines_per_chunk = (24 * 1024 * 1024) / bytes_per_line;
1840    let read_size = lines_per_chunk * bytes_per_line;
1841    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline
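    // e.g. wrap_col = 76: line i of a chunk reads buf[i * 57 .. i * 57 + 57] and writes
    // out_buf[i * 77 .. i * 77 + 77] (76 encoded chars plus the newline).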
1842
1843    // SAFETY: buf bytes are written by read_full before being processed.
1844    // out_buf bytes are written by encode before being read.
1845    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
1846    #[allow(clippy::uninit_vec)]
1847    unsafe {
1848        buf.set_len(read_size);
1849    }
1850    // Output buffer: enough for all lines + remainder
1851    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
1852    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
1853    #[allow(clippy::uninit_vec)]
1854    unsafe {
1855        out_buf.set_len(max_output);
1856    }
1857
1858    loop {
1859        let n = read_full(reader, &mut buf)?;
1860        if n == 0 {
1861            break;
1862        }
1863
1864        let full_lines = n / bytes_per_line;
1865        let remainder = n % bytes_per_line;
1866
1867        // Encode each input line directly into its final output position.
1868        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
1869        // This eliminates the separate encode + fuse_wrap copy entirely.
1870        let dst = out_buf.as_mut_ptr();
1871        let mut line_idx = 0;
1872
1873        // 4-line unrolled loop for better ILP
1874        while line_idx + 4 <= full_lines {
1875            let in_base = line_idx * bytes_per_line;
1876            let out_base = line_idx * line_out;
1877            unsafe {
1878                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
1879                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
1880                *dst.add(out_base + wrap_col) = b'\n';
1881
1882                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
1883                let _ = BASE64_ENGINE.encode(
1884                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
1885                    s1.as_out(),
1886                );
1887                *dst.add(out_base + line_out + wrap_col) = b'\n';
1888
1889                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
1890                let _ = BASE64_ENGINE.encode(
1891                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
1892                    s2.as_out(),
1893                );
1894                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
1895
1896                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
1897                let _ = BASE64_ENGINE.encode(
1898                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
1899                    s3.as_out(),
1900                );
1901                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
1902            }
1903            line_idx += 4;
1904        }
1905
1906        // Remaining full lines
1907        while line_idx < full_lines {
1908            let in_base = line_idx * bytes_per_line;
1909            let out_base = line_idx * line_out;
1910            unsafe {
1911                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
1912                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
1913                *dst.add(out_base + wrap_col) = b'\n';
1914            }
1915            line_idx += 1;
1916        }
1917
1918        let mut wp = full_lines * line_out;
1919
1920        // Handle remainder (partial last line of this chunk)
1921        if remainder > 0 {
1922            let enc_len = BASE64_ENGINE.encoded_length(remainder);
1923            let line_input = &buf[full_lines * bytes_per_line..n];
1924            unsafe {
1925                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
1926                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
1927                *dst.add(wp + enc_len) = b'\n';
1928            }
1929            wp += enc_len + 1;
1930        }
1931
1932        writer.write_all(&out_buf[..wp])?;
1933    }
1934
1935    Ok(())
1936}
1937
1938/// Stream-decode from a reader to a writer. Used for stdin processing.
1939/// In-place strip + decode: read chunk -> strip whitespace in-place in read buffer
1940/// -> decode in-place -> write. Eliminates separate clean buffer allocation (saves 32MB).
1941/// Uses 32MB read buffer for maximum pipe throughput — read_full retries to
1942/// fill the entire buffer from the pipe, and 32MB means even large inputs
1943/// (up to ~24MB after base64 encoding of 18MB raw) are read in a single syscall batch.
1944pub fn decode_stream(
1945    reader: &mut impl Read,
1946    ignore_garbage: bool,
1947    writer: &mut impl Write,
1948) -> io::Result<()> {
1949    const READ_CHUNK: usize = 32 * 1024 * 1024;
1950    // SAFETY: buf bytes are written by read_full before being processed.
1951    // The extra 4 bytes accommodate carry-over from previous chunk.
1952    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
1953    #[allow(clippy::uninit_vec)]
1954    unsafe {
1955        buf.set_len(READ_CHUNK + 4);
1956    }
1957    let mut carry = [0u8; 4];
1958    let mut carry_len = 0usize;
1959
1960    loop {
1961        // Copy carry bytes to start of buffer, read new data after them
1962        if carry_len > 0 {
1963            unsafe {
1964                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
1965            }
1966        }
1967        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
1968        if n == 0 {
1969            break;
1970        }
1971        let total_raw = carry_len + n;
1972
1973        // Strip whitespace in-place in the buffer itself.
1974        // This eliminates the separate clean buffer allocation (saves ~32MB, matching READ_CHUNK).
1975        let clean_len = if ignore_garbage {
1976            // Scalar filter for ignore_garbage mode (rare path)
1977            let ptr = buf.as_mut_ptr();
1978            let mut wp = 0usize;
1979            for i in 0..total_raw {
1980                let b = unsafe { *ptr.add(i) };
1981                if is_base64_char(b) {
1982                    unsafe { *ptr.add(wp) = b };
1983                    wp += 1;
1984                }
1985            }
1986            wp
1987        } else {
1988            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
1989            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
1990            // so we process ~76 bytes per memchr hit.
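            // e.g. "QUJD\nREVG\n": the two gaps "QUJD" and "REVG" are slid left over the
            // newlines, leaving "QUJDREVG" at the front of the buffer (wp = 8).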
1991            let ptr = buf.as_mut_ptr();
1992            let data = &buf[..total_raw];
1993            let mut wp = 0usize;
1994            let mut gap_start = 0usize;
1995            let mut has_rare_ws = false;
1996
1997            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
1998                let gap_len = pos - gap_start;
1999                if gap_len > 0 {
2000                    if !has_rare_ws {
2001                        has_rare_ws = data[gap_start..pos]
2002                            .iter()
2003                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
2004                    }
2005                    if wp != gap_start {
2006                        unsafe {
2007                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
2008                        }
2009                    }
2010                    wp += gap_len;
2011                }
2012                gap_start = pos + 1;
2013            }
2014            let tail_len = total_raw - gap_start;
2015            if tail_len > 0 {
2016                if !has_rare_ws {
2017                    has_rare_ws = data[gap_start..total_raw]
2018                        .iter()
2019                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
2020                }
2021                if wp != gap_start {
2022                    unsafe {
2023                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
2024                    }
2025                }
2026                wp += tail_len;
2027            }
2028
2029            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
2030            if has_rare_ws {
2031                let mut rp = 0;
2032                let mut cwp = 0;
2033                while rp < wp {
2034                    let b = unsafe { *ptr.add(rp) };
2035                    if NOT_WHITESPACE[b as usize] {
2036                        unsafe { *ptr.add(cwp) = b };
2037                        cwp += 1;
2038                    }
2039                    rp += 1;
2040                }
2041                cwp
2042            } else {
2043                wp
2044            }
2045        };
2046
2047        carry_len = 0;
2048        let is_last = n < READ_CHUNK; // read_full only returns short at EOF
2049
2050        if is_last {
2051            // Last chunk: decode everything (including padding)
2052            decode_clean_slice(&mut buf[..clean_len], writer)?;
2053        } else {
2054            // Save incomplete base64 quadruplet for next iteration
2055            let decode_len = (clean_len / 4) * 4;
2056            let leftover = clean_len - decode_len;
2057            if leftover > 0 {
2058                unsafe {
2059                    std::ptr::copy_nonoverlapping(
2060                        buf.as_ptr().add(decode_len),
2061                        carry.as_mut_ptr(),
2062                        leftover,
2063                    );
2064                }
2065                carry_len = leftover;
2066            }
2067            if decode_len > 0 {
2068                decode_clean_slice(&mut buf[..decode_len], writer)?;
2069            }
2070        }
2071    }
2072
2073    // Handle any remaining carry-over bytes
2074    if carry_len > 0 {
2075        let mut carry_buf = carry[..carry_len].to_vec();
2076        decode_clean_slice(&mut carry_buf, writer)?;
2077    }
2078
2079    Ok(())
2080}
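
// Illustrative sketch (not part of the original file): a stdin-style round trip through
// encode_stream and decode_stream using in-memory Cursor readers. The module and test
// names are hypothetical.
#[cfg(test)]
mod stream_round_trip_example {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn encode_then_decode_recovers_input() {
        let raw: Vec<u8> = (0u32..10_000).map(|i| (i * 31 % 251) as u8).collect();

        let mut encoded = Vec::new();
        encode_stream(&mut Cursor::new(raw.as_slice()), 76, &mut encoded).unwrap();

        let mut decoded = Vec::new();
        decode_stream(&mut Cursor::new(encoded.as_slice()), false, &mut decoded).unwrap();
        assert_eq!(decoded, raw);
    }
}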
2081
2082/// Write all IoSlice entries using write_vectored (writev syscall).
2083/// Hot path: single write_vectored succeeds fully (common on Linux pipes/files).
2084/// Cold path: partial write handled out-of-line to keep hot path tight.
2085#[inline(always)]
2086fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
2087    if slices.is_empty() {
2088        return Ok(());
2089    }
2090    let total: usize = slices.iter().map(|s| s.len()).sum();
2091    let written = out.write_vectored(slices)?;
2092    if written >= total {
2093        return Ok(());
2094    }
2095    if written == 0 {
2096        return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
2097    }
2098    write_all_vectored_slow(out, slices, written)
2099}
2100
2101/// Handle partial write (cold path, never inlined).
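/// e.g. with three 64-byte slices and 100 bytes already written, the first slice is
/// skipped entirely, the write resumes 36 bytes into the second, and the third is
/// written in full.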
2102#[cold]
2103#[inline(never)]
2104fn write_all_vectored_slow(
2105    out: &mut impl Write,
2106    slices: &[io::IoSlice],
2107    mut skip: usize,
2108) -> io::Result<()> {
2109    for slice in slices {
2110        let len = slice.len();
2111        if skip >= len {
2112            skip -= len;
2113            continue;
2114        }
2115        out.write_all(&slice[skip..])?;
2116        skip = 0;
2117    }
2118    Ok(())
2119}
2120
2121/// Read as many bytes as possible into buf, retrying on partial reads.
2122/// Fast path: regular file reads usually return the full buffer on the first call,
2123/// avoiding the loop overhead entirely.
2124#[inline]
2125fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2126    // Fast path: first read() usually fills the entire buffer for regular files
2127    let n = reader.read(buf)?;
2128    if n == buf.len() || n == 0 {
2129        return Ok(n);
2130    }
2131    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
2132    let mut total = n;
2133    while total < buf.len() {
2134        match reader.read(&mut buf[total..]) {
2135            Ok(0) => break,
2136            Ok(n) => total += n,
2137            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2138            Err(e) => return Err(e),
2139        }
2140    }
2141    Ok(total)
2142}
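
// Illustrative sketch (not part of the original file): read_full should keep retrying
// short reads until the buffer is full or EOF. The Trickle reader and the names below
// are hypothetical.
#[cfg(test)]
mod read_full_example {
    use super::*;

    /// Returns at most 3 bytes per read() call, mimicking a slow pipe.
    struct Trickle<'a> {
        data: &'a [u8],
        pos: usize,
    }

    impl Read for Trickle<'_> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            let n = buf.len().min(3).min(self.data.len() - self.pos);
            buf[..n].copy_from_slice(&self.data[self.pos..self.pos + n]);
            self.pos += n;
            Ok(n)
        }
    }

    #[test]
    fn read_full_retries_partial_reads() {
        let data = [7u8; 100];
        let mut reader = Trickle { data: &data[..], pos: 0 };
        let mut buf = [0u8; 64];

        // First call fills the whole 64-byte buffer from 3-byte partial reads.
        assert_eq!(read_full(&mut reader, &mut buf).unwrap(), 64);
        assert_eq!(buf, [7u8; 64]);

        // Second call drains the remaining 36 bytes and reports the short count at EOF.
        assert_eq!(read_full(&mut reader, &mut buf).unwrap(), 36);
    }
}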