// coreutils_rs/base64/core.rs

use std::io::{self, Read, Write};

use base64_simd::AsOut;
use rayon::prelude::*;

const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;

/// Chunk size for no-wrap encoding: 32MB aligned to 3 bytes.
/// Larger chunks = fewer write() syscalls for big files.
const NOWRAP_CHUNK: usize = 32 * 1024 * 1024 - (32 * 1024 * 1024 % 3);

/// Minimum data size for parallel encoding (16MB).
/// base64_simd SIMD encoding runs at ~8 GB/s per core, processing 10MB
/// in ~1.25ms. Rayon thread pool creation + sync costs ~100-500us.
/// For 10MB benchmark workloads, single-threaded is faster.
const PARALLEL_ENCODE_THRESHOLD: usize = 16 * 1024 * 1024;

/// Minimum data size for parallel decoding (4MB of base64 data).
/// With 2+ cores, parallel decode at 4MB provides ~1.5x speedup:
/// single-core decode at 8GB/s takes ~0.5ms for 4MB, while dual-core
/// takes ~0.25ms + ~0.1ms rayon overhead = ~0.35ms.
const PARALLEL_DECODE_THRESHOLD: usize = 4 * 1024 * 1024;

/// Encode data and write to output with line wrapping.
/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if wrap_col == 0 {
        return encode_no_wrap(data, out);
    }

    encode_wrapped(data, wrap_col, out)
}

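// A minimal sanity-check sketch for the wrap dispatch above (an assumption on my
// part: this file has no test module, so the check is written as an in-file unit
// test). Expected strings are the RFC 4648 vectors for "foobar".
#[cfg(test)]
#[test]
fn encode_to_writer_wrap_dispatch_sketch() {
    // wrap_col = 0 takes the no-wrap path: no newline at all.
    let mut plain = Vec::new();
    encode_to_writer(b"foobar", 0, &mut plain).unwrap();
    assert_eq!(plain, b"Zm9vYmFy".to_vec());

    // wrap_col = 4 takes the wrapped path: every line, including the last, ends in '\n'.
    let mut wrapped = Vec::new();
    encode_to_writer(b"foobar", 4, &mut wrapped).unwrap();
    assert_eq!(wrapped, b"Zm9v\nYmFy\n".to_vec());
}
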
/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_ENCODE_THRESHOLD {
        return encode_no_wrap_parallel(data, out);
    }

    let actual_chunk = NOWRAP_CHUNK.min(data.len());
    let enc_max = BASE64_ENGINE.encoded_length(actual_chunk);
    // SAFETY: encode() writes exactly enc_len bytes before we read them.
    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_max);
    }

    for chunk in data.chunks(NOWRAP_CHUNK) {
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let encoded = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
        out.write_all(encoded)?;
    }
    Ok(())
}

/// Parallel no-wrap encoding: split at 3-byte boundaries, encode chunks in parallel.
/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
/// Uses write_vectored (writev) to send all encoded chunks in a single syscall.
fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    let num_threads = rayon::current_num_threads().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 3 bytes so each chunk encodes without padding (except the last)
    let chunk_size = ((raw_chunk + 2) / 3) * 3;

    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();
    let encoded_chunks: Vec<Vec<u8>> = chunks
        .par_iter()
        .map(|chunk| {
            let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
            let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
            #[allow(clippy::uninit_vec)]
            unsafe {
                buf.set_len(enc_len);
            }
            let _ = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
            buf
        })
        .collect();

    // Use write_vectored to send all chunks in a single syscall
    let iov: Vec<io::IoSlice> = encoded_chunks.iter().map(|c| io::IoSlice::new(c)).collect();
    write_all_vectored(out, &iov)
}

/// Encode with line wrapping — uses writev to interleave encoded segments
/// with newlines without copying data. For each wrap_col-sized segment of
/// encoded output, we create an IoSlice pointing directly at the encode buffer,
/// interleaved with IoSlice entries pointing at a static newline byte.
fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Calculate bytes_per_line: input bytes that produce exactly wrap_col encoded chars.
    // For default wrap_col=76: 76*3/4 = 57 bytes per line.
    let bytes_per_line = wrap_col * 3 / 4;
    if bytes_per_line == 0 {
        // Degenerate case: wrap_col < 4, fall back to byte-at-a-time
        return encode_wrapped_small(data, wrap_col, out);
    }

    // Parallel encoding for large data when bytes_per_line is a multiple of 3.
    // This guarantees each chunk encodes to complete base64 without padding.
    if data.len() >= PARALLEL_ENCODE_THRESHOLD && bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
    }

    // Bulk encode + insert newlines strategy: encode the entire input in one
    // SIMD pass (optimal vectorization), then insert newlines every wrap_col
    // chars via backward memmove. This is faster than per-line encode calls
    // because base64_simd processes the full input in a single vectorized sweep.
    if bytes_per_line.is_multiple_of(3) {
        // Phase 1: Bulk encode all data at once
        let enc_len = BASE64_ENGINE.encoded_length(data.len());
        let num_lines = (enc_len + wrap_col - 1) / wrap_col;
        let total_output = enc_len + num_lines; // +1 newline per line (including last)

        let mut out_buf: Vec<u8> = Vec::with_capacity(total_output);
        #[allow(clippy::uninit_vec)]
        unsafe {
            out_buf.set_len(total_output);
        }

        // Encode entire input into the START of the buffer (enc_len bytes)
        let encoded = BASE64_ENGINE.encode(data, out_buf[..enc_len].as_out());
        let encoded_len = encoded.len();

        // Phase 2: Insert newlines by working backwards.
        // We expand the encoded data from enc_len bytes to total_output bytes
        // by inserting \n every wrap_col chars. Working backwards avoids
        // overwriting data we haven't moved yet.
        let num_full_lines = encoded_len / wrap_col;
        let remainder = encoded_len % wrap_col;

        let ptr = out_buf.as_mut_ptr();
        let mut wp = total_output; // write position (from end)

        // Handle remainder (last partial line + newline)
        if remainder > 0 {
            wp -= 1;
            unsafe { *ptr.add(wp) = b'\n' };
            wp -= remainder;
            let rp = encoded_len - remainder;
            if wp != rp {
                unsafe {
                    std::ptr::copy(ptr.add(rp), ptr.add(wp), remainder);
                }
            }
        }

        // Full lines: work backwards, 4-line unrolled for throughput
        let mut line = num_full_lines;
        while line >= 4 {
            line -= 4;
            let rp = line * wrap_col;
            let owp = wp;
            unsafe {
                // Line 3
                wp = owp - 1;
                *ptr.add(wp) = b'\n';
                wp -= wrap_col;
                std::ptr::copy(ptr.add(rp + 3 * wrap_col), ptr.add(wp), wrap_col);

                // Line 2
                wp -= 1;
                *ptr.add(wp) = b'\n';
                wp -= wrap_col;
                std::ptr::copy(ptr.add(rp + 2 * wrap_col), ptr.add(wp), wrap_col);

                // Line 1
                wp -= 1;
                *ptr.add(wp) = b'\n';
                wp -= wrap_col;
                std::ptr::copy(ptr.add(rp + wrap_col), ptr.add(wp), wrap_col);

                // Line 0
                wp -= 1;
                *ptr.add(wp) = b'\n';
                wp -= wrap_col;
                std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
            }
        }
        while line > 0 {
            line -= 1;
            let rp = line * wrap_col;
            wp -= 1;
            unsafe {
                *ptr.add(wp) = b'\n';
            }
            wp -= wrap_col;
            if wp != rp {
                unsafe {
                    std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
                }
            }
        }

        return out.write_all(&out_buf[..total_output]);
    }

    // Fallback for non-3-aligned bytes_per_line: use writev
    let lines_per_chunk = (32 * 1024 * 1024) / bytes_per_line;
    let max_input_chunk = (lines_per_chunk * bytes_per_line).max(bytes_per_line);
    let input_chunk = max_input_chunk.min(data.len());

    let enc_max = BASE64_ENGINE.encoded_length(input_chunk);
    let mut encode_buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        encode_buf.set_len(enc_max);
    }

    for chunk in data.chunks(max_input_chunk.max(1)) {
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let encoded = BASE64_ENGINE.encode(chunk, encode_buf[..enc_len].as_out());
        write_wrapped_iov(encoded, wrap_col, out)?;
    }

    Ok(())
}

/// Static newline byte for IoSlice references in writev calls.
static NEWLINE: [u8; 1] = [b'\n'];

/// Write encoded base64 data with line wrapping using write_vectored (writev).
/// Builds IoSlice entries pointing at wrap_col-sized segments of the encoded buffer,
/// interleaved with newline IoSlices, then writes in batches of MAX_IOV entries.
/// This is zero-copy: no fused output buffer needed.
#[inline]
fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
    // Each line needs 2 entries (data + newline), so 512 lines per batch.
    const MAX_IOV: usize = 1024;

    let num_full_lines = encoded.len() / wrap_col;
    let remainder = encoded.len() % wrap_col;
    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };

    // Small output: build all IoSlices and write in one call
    if total_iov <= MAX_IOV {
        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
        let mut pos = 0;
        for _ in 0..num_full_lines {
            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
            iov.push(io::IoSlice::new(&NEWLINE));
            pos += wrap_col;
        }
        if remainder > 0 {
            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
            iov.push(io::IoSlice::new(&NEWLINE));
        }
        return write_all_vectored(out, &iov);
    }

    // Large output: write in batches
    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut pos = 0;
    for _ in 0..num_full_lines {
        iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
        iov.push(io::IoSlice::new(&NEWLINE));
        pos += wrap_col;
        if iov.len() >= MAX_IOV {
            write_all_vectored(out, &iov)?;
            iov.clear();
        }
    }
    if remainder > 0 {
        iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
        iov.push(io::IoSlice::new(&NEWLINE));
    }
    if !iov.is_empty() {
        write_all_vectored(out, &iov)?;
    }
    Ok(())
}

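// Hedged sketch of the writev wrapping above: feed pre-encoded bytes through
// write_wrapped_iov and check the interleaved newlines. A Vec<u8> sink works
// here because write_all_vectored falls back to write_all on partial vectored
// writes, so the assembled output is the same regardless of the sink's
// write_vectored behavior.
#[cfg(test)]
#[test]
fn write_wrapped_iov_sketch() {
    let mut out = Vec::new();
    write_wrapped_iov(b"AAAABBBBCC", 4, &mut out).unwrap();
    // Two full 4-byte lines plus a 2-byte remainder, each terminated by '\n'.
    assert_eq!(out, b"AAAA\nBBBB\nCC\n".to_vec());
}
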
/// Write encoded base64 data with line wrapping using writev, tracking column state
/// across calls. Used by encode_stream for piped input where chunks don't align
/// to line boundaries.
#[inline]
fn write_wrapped_iov_streaming(
    encoded: &[u8],
    wrap_col: usize,
    col: &mut usize,
    out: &mut impl Write,
) -> io::Result<()> {
    const MAX_IOV: usize = 1024;
    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut rp = 0;

    while rp < encoded.len() {
        let space = wrap_col - *col;
        let avail = encoded.len() - rp;

        if avail <= space {
            // Remaining data fits in current line
            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
            *col += avail;
            if *col == wrap_col {
                iov.push(io::IoSlice::new(&NEWLINE));
                *col = 0;
            }
            break;
        } else {
            // Fill current line and add newline
            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
            iov.push(io::IoSlice::new(&NEWLINE));
            rp += space;
            *col = 0;
        }

        if iov.len() >= MAX_IOV - 1 {
            write_all_vectored(out, &iov)?;
            iov.clear();
        }
    }

    if !iov.is_empty() {
        write_all_vectored(out, &iov)?;
    }
    Ok(())
}

/// Parallel wrapped encoding: single output buffer, direct-to-position encode+wrap.
/// Requires bytes_per_line % 3 == 0 so each chunk encodes without intermediate padding.
///
/// Pre-calculates exact output size and each thread's write offset, then encodes
/// 57-byte input groups directly to their final position in a shared output buffer.
/// Each thread writes wrap_col encoded bytes + newline per line, so output for line N
/// starts at N * (wrap_col + 1). This eliminates per-chunk heap allocations and
/// the fuse_wrap copy pass entirely.
fn encode_wrapped_parallel(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let line_out = wrap_col + 1; // wrap_col data + 1 newline per line
    let total_full_lines = data.len() / bytes_per_line;
    let remainder_input = data.len() % bytes_per_line;

    // Calculate exact output size
    let remainder_encoded = if remainder_input > 0 {
        BASE64_ENGINE.encoded_length(remainder_input) + 1 // +1 for trailing newline
    } else {
        0
    };
    let total_output = total_full_lines * line_out + remainder_encoded;

    // Pre-allocate single contiguous output buffer
    let mut outbuf: Vec<u8> = Vec::with_capacity(total_output);
    #[allow(clippy::uninit_vec)]
    unsafe {
        outbuf.set_len(total_output);
    }

    // Split work at line boundaries for parallel processing
    let num_threads = rayon::current_num_threads().max(1);
    let lines_per_chunk = (total_full_lines / num_threads).max(1);
    let input_chunk = lines_per_chunk * bytes_per_line;

    // Compute per-chunk metadata: (input_offset, output_offset, num_input_bytes)
    let mut tasks: Vec<(usize, usize, usize)> = Vec::new();
    let mut in_off = 0usize;
    let mut out_off = 0usize;
    while in_off < data.len() {
        let chunk_input = input_chunk.min(data.len() - in_off);
        // Align to bytes_per_line except for the very last chunk
        let aligned_input = if in_off + chunk_input < data.len() {
            (chunk_input / bytes_per_line) * bytes_per_line
        } else {
            chunk_input
        };
        if aligned_input == 0 {
            break;
        }
        let full_lines = aligned_input / bytes_per_line;
        let rem = aligned_input % bytes_per_line;
        let chunk_output = full_lines * line_out
            + if rem > 0 {
                BASE64_ENGINE.encoded_length(rem) + 1
            } else {
                0
            };
        tasks.push((in_off, out_off, aligned_input));
        in_off += aligned_input;
        out_off += chunk_output;
    }

    // Parallel encode: each thread encodes lines directly into the final
    // output buffer, eliminating per-thread buffer allocation and the
    // scatter copy phase entirely. Each 57-byte input line encodes to
    // exactly 76 encoded bytes + 1 newline = 77 bytes at a known offset.
    // base64_simd handles the SIMD encoding even for 57-byte inputs.
    // SAFETY: tasks have non-overlapping output regions.
    let out_addr = outbuf.as_mut_ptr() as usize;

    tasks.par_iter().for_each(|&(in_off, out_off, chunk_len)| {
        let input = &data[in_off..in_off + chunk_len];
        let full_lines = chunk_len / bytes_per_line;
        let rem = chunk_len % bytes_per_line;

        let out_ptr = out_addr as *mut u8;

        // Encode each line directly into its final position in the output buffer.
        // No thread-local buffer needed — each 57-byte input -> 76 encoded bytes
        // written directly at out_off + line_idx * 77.
        if full_lines > 0 {
            let dst = unsafe { out_ptr.add(out_off) };
            let mut line_idx = 0;

            // 4-line unrolled loop for ILP
            while line_idx + 4 <= full_lines {
                let in_base = line_idx * bytes_per_line;
                let out_base = line_idx * line_out;
                unsafe {
                    let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                    let _ = BASE64_ENGINE
                        .encode(&input[in_base..in_base + bytes_per_line], s0.as_out());
                    *dst.add(out_base + wrap_col) = b'\n';

                    let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                    let _ = BASE64_ENGINE.encode(
                        &input[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                        s1.as_out(),
                    );
                    *dst.add(out_base + line_out + wrap_col) = b'\n';

                    let s2 =
                        std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                    let _ = BASE64_ENGINE.encode(
                        &input[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                        s2.as_out(),
                    );
                    *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                    let s3 =
                        std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                    let _ = BASE64_ENGINE.encode(
                        &input[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                        s3.as_out(),
                    );
                    *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
                }
                line_idx += 4;
            }

            // Remaining lines one at a time
            while line_idx < full_lines {
                let in_base = line_idx * bytes_per_line;
                let out_base = line_idx * line_out;
                unsafe {
                    let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                    let _ =
                        BASE64_ENGINE.encode(&input[in_base..in_base + bytes_per_line], s.as_out());
                    *dst.add(out_base + wrap_col) = b'\n';
                }
                line_idx += 1;
            }
        }

        // Handle remainder (last partial line of this chunk)
        if rem > 0 {
            let line_input = &input[full_lines * bytes_per_line..];
            let enc_len = BASE64_ENGINE.encoded_length(rem);
            let woff = out_off + full_lines * line_out;
            // Encode directly into final output position
            let out_slice =
                unsafe { std::slice::from_raw_parts_mut(out_ptr.add(woff), enc_len + 1) };
            let _ = BASE64_ENGINE.encode(line_input, out_slice[..enc_len].as_out());
            out_slice[enc_len] = b'\n';
        }
    });

    out.write_all(&outbuf[..total_output])
}

/// Fuse encoded base64 data with newlines in a single pass.
/// Uses ptr::copy_nonoverlapping with 8-line unrolling for max throughput.
/// Returns number of bytes written.
#[allow(dead_code)]
#[inline]
fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
    let mut rp = 0;
    let mut wp = 0;

    // Unrolled: process 8 lines per iteration for better ILP
    while rp + 8 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(4 * wrap_col), dst.add(4 * line_out), wrap_col);
            *dst.add(4 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(5 * wrap_col), dst.add(5 * line_out), wrap_col);
            *dst.add(5 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(6 * wrap_col), dst.add(6 * line_out), wrap_col);
            *dst.add(6 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(7 * wrap_col), dst.add(7 * line_out), wrap_col);
            *dst.add(7 * line_out + wrap_col) = b'\n';
        }
        rp += 8 * wrap_col;
        wp += 8 * line_out;
    }

    // Handle remaining 4 lines at a time
    while rp + 4 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';
        }
        rp += 4 * wrap_col;
        wp += 4 * line_out;
    }

    // Remaining full lines
    while rp + wrap_col <= encoded.len() {
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                wrap_col,
            );
            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
        }
        rp += wrap_col;
        wp += line_out;
    }

    // Partial last line
    if rp < encoded.len() {
        let remaining = encoded.len() - rp;
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                remaining,
            );
        }
        wp += remaining;
        out_buf[wp] = b'\n';
        wp += 1;
    }

    wp
}

/// Fallback for very small wrap columns (< 4 chars).
fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_max);
    }
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());

    let wc = wrap_col.max(1);
    for line in encoded.chunks(wc) {
        out.write_all(line)?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Decode base64 data and write to output (borrows data, allocates clean buffer).
/// When `ignore_garbage` is true, strip all non-base64 characters.
/// When false, only strip whitespace (standard behavior).
pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        let mut cleaned = strip_non_base64(data);
        return decode_clean_slice(&mut cleaned, out);
    }

    // Fast path: single-pass strip + decode
    decode_stripping_whitespace(data, out)
}

/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
pub fn decode_owned(
    data: &mut Vec<u8>,
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        data.retain(|&b| is_base64_char(b));
    } else {
        strip_whitespace_inplace(data);
    }

    decode_clean_slice(data, out)
}

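// Round-trip sketch for the two decode entry points above (assumed in-file test).
// "Zm9v\nYmFy\n" is simply a wrapped encoding of "foobar", so both paths should
// strip the newlines and recover the original bytes.
#[cfg(test)]
#[test]
fn decode_entry_points_round_trip_sketch() {
    let mut borrowed_out = Vec::new();
    decode_to_writer(b"Zm9v\nYmFy\n", false, &mut borrowed_out).unwrap();
    assert_eq!(borrowed_out, b"foobar".to_vec());

    let mut owned = b"Zm9v\nYmFy\n".to_vec();
    let mut owned_out = Vec::new();
    decode_owned(&mut owned, false, &mut owned_out).unwrap();
    assert_eq!(owned_out, b"foobar".to_vec());
}
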
/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
fn strip_whitespace_inplace(data: &mut Vec<u8>) {
    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    // For typical base64 (76-char lines), we'll find \n immediately and skip this.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            data.retain(|&b| NOT_WHITESPACE[b as usize]);
        }
        return;
    }

    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
    // gaps between them to compact the data in-place. For typical base64 streams,
    // newlines are the only whitespace, so this handles >99% of cases.
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            if wp != gap_start {
                unsafe {
                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                }
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        if wp != gap_start {
            unsafe {
                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
            }
        }
        wp += tail_len;
    }

    data.truncate(wp);

    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        data.truncate(cwp);
    }
}

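// Direct sketch of the in-place strip above (assumed in-file test): the newlines
// and the \r are removed by the memchr2 gap-copy, and the embedded space is
// removed by the rare-whitespace second pass.
#[cfg(test)]
#[test]
fn strip_whitespace_inplace_sketch() {
    let mut data = b"Zm9v\nYm Fy\r\n".to_vec();
    strip_whitespace_inplace(&mut data);
    assert_eq!(data, b"Zm9vYmFy".to_vec());
}
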
/// 256-byte lookup table: true for non-whitespace bytes.
/// Used for single-pass whitespace stripping in decode.
static NOT_WHITESPACE: [bool; 256] = {
    let mut table = [true; 256];
    table[b' ' as usize] = false;
    table[b'\t' as usize] = false;
    table[b'\n' as usize] = false;
    table[b'\r' as usize] = false;
    table[0x0b] = false; // vertical tab
    table[0x0c] = false; // form feed
    table
};

/// Decode by stripping whitespace and decoding in a single fused pass.
/// For data with no whitespace, decodes directly without any copy.
/// Uses memchr2 SIMD gap-copy for \n/\r (the dominant whitespace in base64),
/// then a conditional fallback pass for rare whitespace types (tab, space, VT, FF).
/// Tracks rare whitespace presence during the gap-copy to skip the second scan
/// entirely in the common case (pure \n/\r whitespace only).
fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            return decode_borrowed_clean(out, data);
        }
        // Has rare whitespace only — strip and decode
        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
        for &b in data {
            if NOT_WHITESPACE[b as usize] {
                cleaned.push(b);
            }
        }
        return decode_clean_slice(&mut cleaned, out);
    }

    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
    let dst = clean.as_mut_ptr();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
    // This avoids the second full-scan pass when only \n/\r are present.
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            // Check gap region for rare whitespace during copy.
            // This adds ~1 branch per gap but eliminates the second full scan.
            if !has_rare_ws {
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            unsafe {
                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap after the last \n/\r
    let tail_len = data.len() - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        unsafe {
            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
        }
        wp += tail_len;
    }
    unsafe {
        clean.set_len(wp);
    }

    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = clean.as_mut_ptr();
        let len = clean.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        clean.truncate(cwp);
    }

    // For large data (>= threshold), use parallel decode for multi-core speedup.
    // For small data, use in-place decode to avoid extra allocation.
    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
        decode_borrowed_clean_parallel(out, &clean)
    } else {
        decode_clean_slice(&mut clean, out)
    }
}

/// Decode a clean (no whitespace) buffer in-place with SIMD.
fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    match BASE64_ENGINE.decode_inplace(data) {
        Ok(decoded) => out.write_all(decoded),
        Err(_) => decode_error(),
    }
}

/// Cold error path — keeps hot decode path tight by moving error construction out of line.
#[cold]
#[inline(never)]
fn decode_error() -> io::Result<()> {
    Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
}

/// Decode clean base64 data (no whitespace) from a borrowed slice.
fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    // Parallel decode for large data: split at 4-byte boundaries,
    // decode each chunk independently (base64 is context-free per 4-char group).
    if data.len() >= PARALLEL_DECODE_THRESHOLD {
        return decode_borrowed_clean_parallel(out, data);
    }
    match BASE64_ENGINE.decode_to_vec(data) {
        Ok(decoded) => {
            out.write_all(&decoded)?;
            Ok(())
        }
        Err(_) => decode_error(),
    }
}

/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel via rayon.
/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
/// upfront, so each thread decodes directly to its final position. No compaction needed.
fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    let num_threads = rayon::current_num_threads().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
    let chunk_size = ((raw_chunk + 3) / 4) * 4;

    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();

    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
    // For all chunks except the last, decoded size is exactly chunk.len() * 3 / 4.
    // For the last chunk, account for '=' padding bytes.
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    offsets.push(0);
    let mut total_decoded = 0usize;
    for (i, chunk) in chunks.iter().enumerate() {
        let decoded_size = if i == chunks.len() - 1 {
            // Last chunk: count '=' padding to get exact decoded size
            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
            chunk.len() * 3 / 4 - pad
        } else {
            // Non-last chunks: 4-byte aligned, no padding, exact 3/4 ratio
            chunk.len() * 3 / 4
        };
        total_decoded += decoded_size;
        offsets.push(total_decoded);
    }

    // Pre-allocate contiguous output buffer with exact total size
    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output_buf.set_len(total_decoded);
    }

    // Parallel decode: each thread decodes directly into its exact final position.
    // No compaction pass needed since offsets are computed from exact decoded sizes.
    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
    // Use usize representation of the pointer for Send+Sync compatibility with rayon.
    let out_addr = output_buf.as_mut_ptr() as usize;
    let decode_result: Result<Vec<()>, io::Error> = chunks
        .par_iter()
        .enumerate()
        .map(|(i, chunk)| {
            let offset = offsets[i];
            let expected_size = offsets[i + 1] - offset;
            // SAFETY: each thread writes to non-overlapping region [offset..offset+expected_size]
            let out_slice = unsafe {
                std::slice::from_raw_parts_mut((out_addr as *mut u8).add(offset), expected_size)
            };
            let decoded = BASE64_ENGINE
                .decode(chunk, out_slice.as_out())
                .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid input"))?;
            debug_assert_eq!(decoded.len(), expected_size);
            Ok(())
        })
        .collect();

    decode_result?;

    out.write_all(&output_buf[..total_decoded])
}

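// Sketch of the 3/4-minus-padding arithmetic that the per-chunk offsets above
// rely on, checked against the RFC 4648 vectors. These inputs are far below
// PARALLEL_DECODE_THRESHOLD, so they exercise the scalar path; the arithmetic
// itself is the same one used to size the parallel output regions.
#[cfg(test)]
#[test]
fn decoded_length_padding_arithmetic_sketch() {
    // "Zg==": 4 chars, 2 padding bytes -> 4 * 3 / 4 - 2 = 1 decoded byte ("f").
    let mut out = Vec::new();
    decode_to_writer(b"Zg==", false, &mut out).unwrap();
    assert_eq!(out, b"f".to_vec());

    // "Zm8=": 4 chars, 1 padding byte -> 4 * 3 / 4 - 1 = 2 decoded bytes ("fo").
    let mut out = Vec::new();
    decode_to_writer(b"Zm8=", false, &mut out).unwrap();
    assert_eq!(out, b"fo".to_vec());
}
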
/// Strip non-base64 characters (for -i / --ignore-garbage).
fn strip_non_base64(data: &[u8]) -> Vec<u8> {
    data.iter()
        .copied()
        .filter(|&b| is_base64_char(b))
        .collect()
}

/// Check if a byte is a valid base64 alphabet character or padding.
#[inline]
fn is_base64_char(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'='
}

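// Sketch of the --ignore-garbage filter above (assumed in-file test): anything
// outside the base64 alphabet and padding, here '*' and '!', is dropped before
// decoding.
#[cfg(test)]
#[test]
fn ignore_garbage_filter_sketch() {
    assert!(is_base64_char(b'+'));
    assert!(!is_base64_char(b'*'));

    let mut out = Vec::new();
    decode_to_writer(b"Zm9v*YmFy!", true, &mut out).unwrap();
    assert_eq!(out, b"foobar".to_vec());
}
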
/// Stream-encode from a reader to a writer. Used for stdin processing.
/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
pub fn encode_stream(
    reader: &mut impl Read,
    wrap_col: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    if wrap_col == 0 {
        return encode_stream_nowrap(reader, writer);
    }
    encode_stream_wrapped(reader, wrap_col, writer)
}

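// Streaming sketch for the dispatcher above (assumed in-file test): a byte slice
// serves as the reader because &[u8] implements Read, and the expected strings
// are again the RFC 4648 encoding of "foobar".
#[cfg(test)]
#[test]
fn encode_stream_dispatch_sketch() {
    // Wrapped path: a trailing newline terminates the (only) line.
    let mut input: &[u8] = b"foobar";
    let mut out = Vec::new();
    encode_stream(&mut input, 76, &mut out).unwrap();
    assert_eq!(out, b"Zm9vYmFy\n".to_vec());

    // No-wrap path: no newline is emitted.
    let mut input: &[u8] = b"foobar";
    let mut out = Vec::new();
    encode_stream(&mut input, 0, &mut out).unwrap();
    assert_eq!(out, b"Zm9vYmFy".to_vec());
}
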
/// Streaming encode with NO line wrapping — optimized fast path.
/// Read size is 12MB (divisible by 3): encoded output = 12MB * 4/3 = 16MB.
/// 12MB reads mean 10MB input is consumed in a single read() call,
/// and the 16MB encoded output writes in 1-2 write() calls.
fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    // 12MB aligned to 3 bytes: encoded output = 12MB * 4/3 = 16MB.
    // For 10MB input: 1 read (10MB) instead of 2 reads.
    const NOWRAP_READ: usize = 12 * 1024 * 1024; // exactly divisible by 3

    // SAFETY: buf bytes are written by read_full before being processed.
    // encode_buf bytes are written by encode before being read.
    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(NOWRAP_READ);
    }
    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        encode_buf.set_len(encode_buf_size);
    }

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let enc_len = BASE64_ENGINE.encoded_length(n);
        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
        writer.write_all(encoded)?;
    }
    Ok(())
}

/// Streaming encode WITH line wrapping.
/// For the common case (wrap_col divides evenly into 3-byte input groups),
/// uses the fused direct-to-position path: each line is encoded straight into a
/// contiguous output buffer with newlines interleaved, then written in a single
/// write() call per chunk. This eliminates the overhead of many writev() syscalls
/// (one per ~512 lines via IoSlice).
///
/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
fn encode_stream_wrapped(
    reader: &mut impl Read,
    wrap_col: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    let bytes_per_line = wrap_col * 3 / 4;
    // For the common case (76-col wrapping, bytes_per_line=57 which is divisible by 3),
    // align the read buffer to bytes_per_line boundaries so each chunk produces
    // complete lines with no column carry-over between chunks.
    if bytes_per_line > 0 && bytes_per_line.is_multiple_of(3) {
        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
    }

    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
    const STREAM_READ: usize = 12 * 1024 * 1024;
    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(STREAM_READ);
    }
    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        encode_buf.set_len(encode_buf_size);
    }

    let mut col = 0usize;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let enc_len = BASE64_ENGINE.encoded_length(n);
        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());

        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
    }

    if col > 0 {
        writer.write_all(b"\n")?;
    }

    Ok(())
}

/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
/// encode each line directly into its final position with newline appended.
/// Eliminates the two-pass encode-then-fuse_wrap approach.
/// For 76-col wrapping (bytes_per_line=57): 12MB / 57 = ~210K complete lines per chunk.
/// Output = 210K * 77 bytes = ~16MB, one write() syscall per chunk.
fn encode_stream_wrapped_fused(
    reader: &mut impl Read,
    wrap_col: usize,
    bytes_per_line: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Align read size to bytes_per_line for complete output lines per chunk.
    // ~210K lines * 57 bytes = ~12MB input, ~16MB output.
    let lines_per_chunk = (12 * 1024 * 1024) / bytes_per_line;
    let read_size = lines_per_chunk * bytes_per_line;
    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline

    // SAFETY: buf bytes are written by read_full before being processed.
    // out_buf bytes are written by encode before being read.
    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(read_size);
    }
    // Output buffer: enough for all lines + remainder
    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(max_output);
    }

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let full_lines = n / bytes_per_line;
        let remainder = n % bytes_per_line;

        // Encode each input line directly into its final output position.
        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
        // This eliminates the separate encode + fuse_wrap copy entirely.
        let dst = out_buf.as_mut_ptr();
        let mut line_idx = 0;

        // 4-line unrolled loop for better ILP
        while line_idx + 4 <= full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
                *dst.add(out_base + wrap_col) = b'\n';

                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                    s1.as_out(),
                );
                *dst.add(out_base + line_out + wrap_col) = b'\n';

                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                    s2.as_out(),
                );
                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                    s3.as_out(),
                );
                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
            }
            line_idx += 4;
        }

        // Remaining full lines
        while line_idx < full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
                *dst.add(out_base + wrap_col) = b'\n';
            }
            line_idx += 1;
        }

        let mut wp = full_lines * line_out;

        // Handle remainder (partial last line of this chunk)
        if remainder > 0 {
            let enc_len = BASE64_ENGINE.encoded_length(remainder);
            let line_input = &buf[full_lines * bytes_per_line..n];
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
                *dst.add(wp + enc_len) = b'\n';
            }
            wp += enc_len + 1;
        }

        writer.write_all(&out_buf[..wp])?;
    }

    Ok(())
}

/// Stream-decode from a reader to a writer. Used for stdin processing.
/// In-place strip + decode: read chunk -> strip whitespace in-place in read buffer
/// -> decode in-place -> write. Eliminates separate clean buffer allocation (saves 16MB).
/// Uses 16MB read buffer for maximum pipe throughput — read_full retries to
/// fill the entire buffer from the pipe, and 16MB means the entire 10MB
/// benchmark input is read in a single syscall batch, minimizing overhead.
pub fn decode_stream(
    reader: &mut impl Read,
    ignore_garbage: bool,
    writer: &mut impl Write,
) -> io::Result<()> {
    const READ_CHUNK: usize = 16 * 1024 * 1024;
    // SAFETY: buf bytes are written by read_full before being processed.
    // The extra 4 bytes accommodate carry-over from previous chunk.
    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(READ_CHUNK + 4);
    }
    let mut carry = [0u8; 4];
    let mut carry_len = 0usize;

    loop {
        // Copy carry bytes to start of buffer, read new data after them
        if carry_len > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
            }
        }
        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
        if n == 0 {
            break;
        }
        let total_raw = carry_len + n;

        // Strip whitespace in-place in the buffer itself.
        // This eliminates the separate clean buffer allocation (saves 16MB).
        let clean_len = if ignore_garbage {
            // Scalar filter for ignore_garbage mode (rare path)
            let ptr = buf.as_mut_ptr();
            let mut wp = 0usize;
            for i in 0..total_raw {
                let b = unsafe { *ptr.add(i) };
                if is_base64_char(b) {
                    unsafe { *ptr.add(wp) = b };
                    wp += 1;
                }
            }
            wp
        } else {
            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
            // so we process ~76 bytes per memchr hit.
            let ptr = buf.as_mut_ptr();
            let data = &buf[..total_raw];
            let mut wp = 0usize;
            let mut gap_start = 0usize;
            let mut has_rare_ws = false;

            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
                let gap_len = pos - gap_start;
                if gap_len > 0 {
                    if !has_rare_ws {
                        has_rare_ws = data[gap_start..pos]
                            .iter()
                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                    }
                    if wp != gap_start {
                        unsafe {
                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                        }
                    }
                    wp += gap_len;
                }
                gap_start = pos + 1;
            }
            let tail_len = total_raw - gap_start;
            if tail_len > 0 {
                if !has_rare_ws {
                    has_rare_ws = data[gap_start..total_raw]
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                }
                if wp != gap_start {
                    unsafe {
                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
                    }
                }
                wp += tail_len;
            }

            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
            if has_rare_ws {
                let mut rp = 0;
                let mut cwp = 0;
                while rp < wp {
                    let b = unsafe { *ptr.add(rp) };
                    if NOT_WHITESPACE[b as usize] {
                        unsafe { *ptr.add(cwp) = b };
                        cwp += 1;
                    }
                    rp += 1;
                }
                cwp
            } else {
                wp
            }
        };

        carry_len = 0;
        let is_last = n < READ_CHUNK;

        if is_last {
            // Last chunk: decode everything (including padding)
            decode_clean_slice(&mut buf[..clean_len], writer)?;
        } else {
            // Save incomplete base64 quadruplet for next iteration
            let decode_len = (clean_len / 4) * 4;
            let leftover = clean_len - decode_len;
            if leftover > 0 {
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        buf.as_ptr().add(decode_len),
                        carry.as_mut_ptr(),
                        leftover,
                    );
                }
                carry_len = leftover;
            }
            if decode_len > 0 {
                decode_clean_slice(&mut buf[..decode_len], writer)?;
            }
        }
    }

    // Handle any remaining carry-over bytes
    if carry_len > 0 {
        let mut carry_buf = carry[..carry_len].to_vec();
        decode_clean_slice(&mut carry_buf, writer)?;
    }

    Ok(())
}

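// Streaming decode sketch (assumed in-file test): wrapped base64 read from an
// in-memory reader, whitespace stripped in-place, and decoded in a single chunk
// since the input is far below READ_CHUNK.
#[cfg(test)]
#[test]
fn decode_stream_sketch() {
    let mut input: &[u8] = b"Zm9v\nYmFy\n";
    let mut out = Vec::new();
    decode_stream(&mut input, false, &mut out).unwrap();
    assert_eq!(out, b"foobar".to_vec());
}
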
/// Write all IoSlice entries using write_vectored (writev syscall).
/// Falls back to write_all per slice on partial writes.
fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    let total: usize = slices.iter().map(|s| s.len()).sum();

    // Try write_vectored first — often writes everything in one syscall
    let written = match out.write_vectored(slices) {
        Ok(n) if n >= total => return Ok(()),
        Ok(n) => n,
        Err(e) => return Err(e),
    };

    // Partial write fallback
    let mut skip = written;
    for slice in slices {
        let slen = slice.len();
        if skip >= slen {
            skip -= slen;
            continue;
        }
        if skip > 0 {
            out.write_all(&slice[skip..])?;
            skip = 0;
        } else {
            out.write_all(slice)?;
        }
    }
    Ok(())
}

/// Read as many bytes as possible into buf, retrying on partial reads.
/// Fast path: regular file reads usually return the full buffer on the first call,
/// avoiding the loop overhead entirely.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    // Fast path: first read() usually fills the entire buffer for regular files
    let n = reader.read(buf)?;
    if n == buf.len() || n == 0 {
        return Ok(n);
    }
    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
    let mut total = n;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
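
// Sketch of read_full's contract (assumed in-file test): it only returns short
// at end of input, so an 8-byte buffer over a 5-byte source yields 5.
#[cfg(test)]
#[test]
fn read_full_short_source_sketch() {
    let mut src: &[u8] = b"hello";
    let mut buf = [0u8; 8];
    let n = read_full(&mut src, &mut buf).unwrap();
    assert_eq!(n, 5);
    assert_eq!(&buf[..5], b"hello");
}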