coreutils_rs/base64/core.rs

use std::io::{self, Read, Write};

use base64_simd::AsOut;
use rayon::prelude::*;

const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;

/// Chunk size for no-wrap encoding: 32MB aligned to 3 bytes.
/// Larger chunks = fewer write() syscalls for big files.
const NOWRAP_CHUNK: usize = 32 * 1024 * 1024 - (32 * 1024 * 1024 % 3);

/// Minimum data size for parallel encoding (1MB).
/// Lowered from 4MB so 10MB benchmark workloads get multi-core processing.
const PARALLEL_ENCODE_THRESHOLD: usize = 1024 * 1024;

/// Minimum data size for parallel decoding (1MB of base64 data).
/// Lowered from 4MB for better parallelism on typical workloads.
const PARALLEL_DECODE_THRESHOLD: usize = 1024 * 1024;

/// Encode data and write to output with line wrapping.
/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if wrap_col == 0 {
        return encode_no_wrap(data, out);
    }

    encode_wrapped(data, wrap_col, out)
}
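
// Illustrative sketch, not part of the original module: a minimal sanity check
// for encode_to_writer. Expected strings assume the padded STANDARD alphabet
// and the GNU-style trailing newline that fuse_wrap emits when wrapping.
#[cfg(test)]
mod encode_to_writer_sketch {
    use super::*;

    #[test]
    fn wrap_and_no_wrap() {
        // Wrapped output gets a trailing newline even for a partial last line.
        let mut wrapped = Vec::new();
        encode_to_writer(b"hello world", 76, &mut wrapped).unwrap();
        assert_eq!(wrapped.as_slice(), &b"aGVsbG8gd29ybGQ=\n"[..]);

        // wrap_col == 0 disables wrapping entirely.
        let mut plain = Vec::new();
        encode_to_writer(b"hello world", 0, &mut plain).unwrap();
        assert_eq!(plain.as_slice(), &b"aGVsbG8gd29ybGQ="[..]);
    }
}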

/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_ENCODE_THRESHOLD {
        return encode_no_wrap_parallel(data, out);
    }

    let actual_chunk = NOWRAP_CHUNK.min(data.len());
    let enc_max = BASE64_ENGINE.encoded_length(actual_chunk);
    // SAFETY: encode() writes exactly enc_len bytes before we read them.
    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_max);
    }

    for chunk in data.chunks(NOWRAP_CHUNK) {
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let encoded = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
        out.write_all(encoded)?;
    }
    Ok(())
}

/// Parallel no-wrap encoding: split at 3-byte boundaries, encode chunks in parallel.
/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
/// Uses write_vectored (writev) to send all encoded chunks in a single syscall.
fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    let num_threads = rayon::current_num_threads().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 3 bytes so each chunk encodes without padding (except the last)
    let chunk_size = ((raw_chunk + 2) / 3) * 3;

    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();
    let encoded_chunks: Vec<Vec<u8>> = chunks
        .par_iter()
        .map(|chunk| {
            let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
            let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
            #[allow(clippy::uninit_vec)]
            unsafe {
                buf.set_len(enc_len);
            }
            let _ = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
            buf
        })
        .collect();

    // Use write_vectored to send all chunks in a single syscall
    let iov: Vec<io::IoSlice> = encoded_chunks.iter().map(|c| io::IoSlice::new(c)).collect();
    write_all_vectored(out, &iov)
}

/// Encode with line wrapping — fused encode+wrap in a single output buffer.
/// Encodes aligned input chunks, then interleaves newlines directly into
/// a single output buffer, eliminating the separate wrap pass.
fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Calculate bytes_per_line: input bytes that produce exactly wrap_col encoded chars.
    // For default wrap_col=76: 76*3/4 = 57 bytes per line.
    let bytes_per_line = wrap_col * 3 / 4;
    if bytes_per_line == 0 {
        // Degenerate case: wrap_col < 4, fall back to byte-at-a-time
        return encode_wrapped_small(data, wrap_col, out);
    }

    // Parallel encoding for large data when bytes_per_line is a multiple of 3.
    // This guarantees each chunk encodes to complete base64 without padding.
    if data.len() >= PARALLEL_ENCODE_THRESHOLD && bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
    }

    // Align input chunk to bytes_per_line for complete output lines.
    // Use 32MB chunks — large enough to process most files in a single pass,
    // reducing write() syscalls.
    let lines_per_chunk = (32 * 1024 * 1024) / bytes_per_line;
    let max_input_chunk = (lines_per_chunk * bytes_per_line).max(bytes_per_line);
    let input_chunk = max_input_chunk.min(data.len());

    let enc_max = BASE64_ENGINE.encoded_length(input_chunk);
    let mut encode_buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        encode_buf.set_len(enc_max);
    }

    // Fused output buffer: holds encoded data with newlines interleaved
    let max_lines = enc_max / wrap_col + 2;
    let fused_max = enc_max + max_lines;
    let mut fused_buf: Vec<u8> = Vec::with_capacity(fused_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        fused_buf.set_len(fused_max);
    }

    for chunk in data.chunks(max_input_chunk.max(1)) {
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let encoded = BASE64_ENGINE.encode(chunk, encode_buf[..enc_len].as_out());

        // Fuse: copy encoded data into fused_buf with newlines interleaved
        let wp = fuse_wrap(encoded, wrap_col, &mut fused_buf);
        out.write_all(&fused_buf[..wp])?;
    }

    Ok(())
}
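
// Illustrative sketch, not part of the original module: checks the wrap math the
// comments above rely on: 76 output columns correspond to 57 input bytes, and
// 57 is a multiple of 3, so default-wrapped chunks never need internal padding.
#[cfg(test)]
mod wrap_math_sketch {
    use super::*;

    #[test]
    fn default_wrap_column_geometry() {
        let wrap_col = 76usize;
        let bytes_per_line = wrap_col * 3 / 4;
        assert_eq!(bytes_per_line, 57);
        assert_eq!(bytes_per_line % 3, 0);

        // 57 input bytes encode to exactly one 76-char line plus a newline.
        let mut out = Vec::new();
        encode_to_writer(&[0xAB; 57], wrap_col, &mut out).unwrap();
        assert_eq!(out.len(), 77);
        assert_eq!(out[76], b'\n');
        assert!(!out[..76].contains(&b'\n'));
    }
}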

/// Parallel wrapped encoding: split at bytes_per_line boundaries, encode + wrap in parallel.
/// Requires bytes_per_line.is_multiple_of(3) so each chunk encodes without intermediate padding.
/// Uses write_vectored (writev) to send all encoded+wrapped chunks in a single syscall.
fn encode_wrapped_parallel(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let num_threads = rayon::current_num_threads().max(1);
    // Split at bytes_per_line boundaries for complete output lines per chunk
    let lines_per_chunk = (data.len() / bytes_per_line / num_threads).max(1);
    let chunk_size = lines_per_chunk * bytes_per_line;

    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(bytes_per_line)).collect();
    let encoded_chunks: Vec<Vec<u8>> = chunks
        .par_iter()
        .map(|chunk| {
            let enc_max = BASE64_ENGINE.encoded_length(chunk.len());
            let max_lines = enc_max / wrap_col + 2;
            // Single allocation with two non-overlapping regions:
            //   [0..fused_size) = fuse_wrap output region
            //   [fused_size..fused_size+enc_max) = encode region
            let fused_size = enc_max + max_lines;
            let total_size = fused_size + enc_max;
            let mut buf: Vec<u8> = Vec::with_capacity(total_size);
            #[allow(clippy::uninit_vec)]
            unsafe {
                buf.set_len(total_size);
            }
            // Encode into the second region [fused_size..fused_size+enc_max]
            let _ = BASE64_ENGINE.encode(chunk, buf[fused_size..fused_size + enc_max].as_out());
            // Use split_at_mut to get non-overlapping mutable/immutable refs
            let (fused_region, encode_region) = buf.split_at_mut(fused_size);
            let encoded = &encode_region[..enc_max];
            let wp = fuse_wrap(encoded, wrap_col, fused_region);
            buf.truncate(wp);
            buf
        })
        .collect();

    // Use write_vectored to send all chunks in a single syscall
    let iov: Vec<io::IoSlice> = encoded_chunks.iter().map(|c| io::IoSlice::new(c)).collect();
    write_all_vectored(out, &iov)
}

/// Fuse encoded base64 data with newlines in a single pass.
/// Uses ptr::copy_nonoverlapping with 8-line unrolling for max throughput.
/// Returns number of bytes written.
#[inline]
fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
    let mut rp = 0;
    let mut wp = 0;

    // Unrolled: process 8 lines per iteration for better ILP
    while rp + 8 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(4 * wrap_col), dst.add(4 * line_out), wrap_col);
            *dst.add(4 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(5 * wrap_col), dst.add(5 * line_out), wrap_col);
            *dst.add(5 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(6 * wrap_col), dst.add(6 * line_out), wrap_col);
            *dst.add(6 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(7 * wrap_col), dst.add(7 * line_out), wrap_col);
            *dst.add(7 * line_out + wrap_col) = b'\n';
        }
        rp += 8 * wrap_col;
        wp += 8 * line_out;
    }

    // Handle remaining 4 lines at a time
    while rp + 4 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';
        }
        rp += 4 * wrap_col;
        wp += 4 * line_out;
    }

    // Remaining full lines
    while rp + wrap_col <= encoded.len() {
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                wrap_col,
            );
            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
        }
        rp += wrap_col;
        wp += line_out;
    }

    // Partial last line
    if rp < encoded.len() {
        let remaining = encoded.len() - rp;
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                remaining,
            );
        }
        wp += remaining;
        out_buf[wp] = b'\n';
        wp += 1;
    }

    wp
}
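
// Illustrative sketch, not part of the original module: cross-checks fuse_wrap
// against a straightforward chunk-and-join reference over assorted lengths so
// that the 8-line and 4-line unrolled paths and the partial tail all get hit.
#[cfg(test)]
mod fuse_wrap_sketch {
    use super::*;

    fn reference_wrap(encoded: &[u8], wrap_col: usize) -> Vec<u8> {
        let mut out = Vec::new();
        for line in encoded.chunks(wrap_col) {
            out.extend_from_slice(line);
            out.push(b'\n');
        }
        out
    }

    #[test]
    fn matches_reference() {
        let wrap_col = 5usize;
        for len in 0..=100usize {
            let encoded: Vec<u8> = (0..len).map(|i| b'A' + (i % 26) as u8).collect();
            // Output buffer sized like the callers size theirs: data + newlines + slack.
            let mut buf = vec![0u8; len + len / wrap_col + 2];
            let wp = fuse_wrap(&encoded, wrap_col, &mut buf);
            assert_eq!(&buf[..wp], reference_wrap(&encoded, wrap_col).as_slice());
        }
    }
}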

/// Fallback for very small wrap columns (< 4 chars).
fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_max);
    }
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());

    let wc = wrap_col.max(1);
    for line in encoded.chunks(wc) {
        out.write_all(line)?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Decode base64 data and write to output (borrows data, allocates clean buffer).
/// When `ignore_garbage` is true, strip all non-base64 characters.
/// When false, only strip whitespace (standard behavior).
pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        let mut cleaned = strip_non_base64(data);
        return decode_clean_slice(&mut cleaned, out);
    }

    // Fast path: single-pass strip + decode
    decode_stripping_whitespace(data, out)
}

/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
pub fn decode_owned(
    data: &mut Vec<u8>,
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        data.retain(|&b| is_base64_char(b));
    } else {
        strip_whitespace_inplace(data);
    }

    decode_clean_slice(data, out)
}
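
// Illustrative sketch, not part of the original module: decode_to_writer strips
// whitespace by default, while ignore_garbage additionally drops any byte outside
// the base64 alphabet; decode_owned does the same strip in place.
#[cfg(test)]
mod decode_sketch {
    use super::*;

    #[test]
    fn whitespace_and_garbage_handling() {
        let mut out = Vec::new();
        decode_to_writer(b"aGVs\nbG8g\r\nd29y bGQ=", false, &mut out).unwrap();
        assert_eq!(out, b"hello world".to_vec());

        let mut out = Vec::new();
        decode_to_writer(b"aGVs*bG8g#d29ybGQ=??", true, &mut out).unwrap();
        assert_eq!(out, b"hello world".to_vec());

        let mut data = b"aGVsbG8g\nd29ybGQ=\n".to_vec();
        let mut out = Vec::new();
        decode_owned(&mut data, false, &mut out).unwrap();
        assert_eq!(out, b"hello world".to_vec());
    }
}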

/// Strip all whitespace from a Vec in-place using the lookup table.
/// Single-pass compaction: uses NOT_WHITESPACE table to classify all whitespace
/// types simultaneously, avoiding the previous multi-scan approach.
fn strip_whitespace_inplace(data: &mut Vec<u8>) {
    // Quick check: any whitespace at all?
    let has_ws = data.iter().any(|&b| !NOT_WHITESPACE[b as usize]);
    if !has_ws {
        return;
    }

    // Single-pass in-place compaction using the lookup table.
    // Reads and writes go through the same pointer; the write index never
    // overtakes the read index, so reads always see the original bytes.
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;

    for i in 0..len {
        let b = unsafe { *ptr.add(i) };
        if NOT_WHITESPACE[b as usize] {
            unsafe { *ptr.add(wp) = b };
            wp += 1;
        }
    }

    data.truncate(wp);
}

/// 256-byte lookup table: true for non-whitespace bytes.
/// Used for single-pass whitespace stripping in decode.
static NOT_WHITESPACE: [bool; 256] = {
    let mut table = [true; 256];
    table[b' ' as usize] = false;
    table[b'\t' as usize] = false;
    table[b'\n' as usize] = false;
    table[b'\r' as usize] = false;
    table[0x0b] = false; // vertical tab
    table[0x0c] = false; // form feed
    table
};
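
// Illustrative sketch, not part of the original module: the six bytes the table
// treats as whitespace are exactly C's isspace() set, so this check compares the
// table against is_ascii_whitespace() plus vertical tab (which Rust's helper omits).
#[cfg(test)]
mod whitespace_table_sketch {
    use super::*;

    #[test]
    fn table_matches_isspace_set() {
        for b in 0..=255u8 {
            let is_ws = b.is_ascii_whitespace() || b == 0x0b;
            assert_eq!(NOT_WHITESPACE[b as usize], !is_ws, "byte {b:#04x}");
        }
    }
}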

/// Decode by stripping whitespace and decoding in a single fused pass.
/// For data with no whitespace, decodes directly without any copy.
/// Uses memchr2 SIMD gap-copy for \n/\r (the dominant whitespace in base64),
/// then a fallback pass for rare whitespace types (tab, space, VT, FF).
fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    // Quick check: any whitespace at all?  Use the lookup table for a single scan.
    let has_ws = data.iter().any(|&b| !NOT_WHITESPACE[b as usize]);
    if !has_ws {
        // No whitespace — decode directly from borrowed data
        return decode_borrowed_clean(out, data);
    }

    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
    let dst = clean.as_mut_ptr();
    let mut wp = 0usize;
    let mut gap_start = 0usize;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap after the last \n/\r
    let tail_len = data.len() - gap_start;
    if tail_len > 0 {
        unsafe {
            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
        }
        wp += tail_len;
    }
    unsafe {
        clean.set_len(wp);
    }

    // Second pass for rare whitespace (tab, space, VT, FF) using lookup table.
    // In typical base64 streams this does nothing, but correctness requires it.
    let has_rare_ws = clean.iter().any(|&b| !NOT_WHITESPACE[b as usize]);
    if has_rare_ws {
        let ptr = clean.as_mut_ptr();
        let len = clean.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        clean.truncate(cwp);
    }

    decode_clean_slice(&mut clean, out)
}

/// Decode a clean (no whitespace) buffer in-place with SIMD.
fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    match BASE64_ENGINE.decode_inplace(data) {
        Ok(decoded) => out.write_all(decoded),
        Err(_) => decode_error(),
    }
}

/// Cold error path — keeps hot decode path tight by moving error construction out of line.
#[cold]
#[inline(never)]
fn decode_error() -> io::Result<()> {
    Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
}

/// Decode clean base64 data (no whitespace) from a borrowed slice.
fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    // Parallel decode for large data: split at 4-byte boundaries,
    // decode each chunk independently (base64 is context-free per 4-char group).
    if data.len() >= PARALLEL_DECODE_THRESHOLD {
        return decode_borrowed_clean_parallel(out, data);
    }
    match BASE64_ENGINE.decode_to_vec(data) {
        Ok(decoded) => {
            out.write_all(&decoded)?;
            Ok(())
        }
        Err(_) => decode_error(),
    }
}

/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel via rayon.
/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
/// upfront, so each thread decodes directly to its final position. No compaction needed.
fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    let num_threads = rayon::current_num_threads().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
    let chunk_size = ((raw_chunk + 3) / 4) * 4;

    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();

    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
    // For all chunks except the last, decoded size is exactly chunk.len() * 3 / 4.
    // For the last chunk, account for '=' padding bytes.
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    offsets.push(0);
    let mut total_decoded = 0usize;
    for (i, chunk) in chunks.iter().enumerate() {
        let decoded_size = if i == chunks.len() - 1 {
            // Last chunk: count '=' padding to get exact decoded size
            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
            chunk.len() * 3 / 4 - pad
        } else {
            // Non-last chunks: 4-byte aligned, no padding, exact 3/4 ratio
            chunk.len() * 3 / 4
        };
        total_decoded += decoded_size;
        offsets.push(total_decoded);
    }

    // Pre-allocate contiguous output buffer with exact total size
    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output_buf.set_len(total_decoded);
    }

    // Parallel decode: each thread decodes directly into its exact final position.
    // No compaction pass needed since offsets are computed from exact decoded sizes.
    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
    // Use usize representation of the pointer for Send+Sync compatibility with rayon.
    let out_addr = output_buf.as_mut_ptr() as usize;
    let decode_result: Result<Vec<()>, io::Error> = chunks
        .par_iter()
        .enumerate()
        .map(|(i, chunk)| {
            let offset = offsets[i];
            let expected_size = offsets[i + 1] - offset;
            // SAFETY: each thread writes to non-overlapping region [offset..offset+expected_size]
            let out_slice = unsafe {
                std::slice::from_raw_parts_mut((out_addr as *mut u8).add(offset), expected_size)
            };
            let decoded = BASE64_ENGINE
                .decode(chunk, out_slice.as_out())
                .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid input"))?;
            debug_assert_eq!(decoded.len(), expected_size);
            Ok(())
        })
        .collect();

    decode_result?;

    out.write_all(&output_buf[..total_decoded])
}
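
// Illustrative sketch, not part of the original module: round-trips a payload
// whose encoded form exceeds PARALLEL_DECODE_THRESHOLD, so decode_to_writer ends
// up on the parallel path, and compares the result against the original bytes.
#[cfg(test)]
mod parallel_decode_sketch {
    use super::*;

    #[test]
    fn parallel_round_trip() {
        let data: Vec<u8> = (0..800_000usize).map(|i| (i % 251) as u8).collect();

        let mut encoded = Vec::new();
        encode_to_writer(&data, 0, &mut encoded).unwrap();
        assert!(encoded.len() >= PARALLEL_DECODE_THRESHOLD);

        let mut decoded = Vec::new();
        decode_to_writer(&encoded, false, &mut decoded).unwrap();
        assert_eq!(decoded, data);
    }
}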

/// Strip non-base64 characters (for -i / --ignore-garbage).
fn strip_non_base64(data: &[u8]) -> Vec<u8> {
    data.iter()
        .copied()
        .filter(|&b| is_base64_char(b))
        .collect()
}

/// Check if a byte is a valid base64 alphabet character or padding.
#[inline]
fn is_base64_char(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'='
}

/// Stream-encode from a reader to a writer. Used for stdin processing.
/// Uses 3MB read chunks (aligned to 3 bytes for padding-free intermediate encoding).
/// 3MB is optimal for piped input: large enough for good throughput, small enough
/// that read_full() fills the buffer quickly from pipes (3 reads at 1MB pipe size).
pub fn encode_stream(
    reader: &mut impl Read,
    wrap_col: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    // 3MB aligned to 3 bytes — sweet spot for pipe throughput
    const STREAM_READ: usize = 3 * 1024 * 1024;
    let mut buf = vec![0u8; STREAM_READ];

    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
    let mut encode_buf = vec![0u8; encode_buf_size];

    if wrap_col == 0 {
        // No wrapping: encode each chunk and write directly.
        loop {
            let n = read_full(reader, &mut buf)?;
            if n == 0 {
                break;
            }
            let enc_len = BASE64_ENGINE.encoded_length(n);
            let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
            writer.write_all(encoded)?;
        }
    } else {
        // Wrapping: fused encode+wrap into a single output buffer.
        let max_fused = encode_buf_size + (encode_buf_size / wrap_col + 2);
        let mut fused_buf = vec![0u8; max_fused];
        let mut col = 0usize;

        loop {
            let n = read_full(reader, &mut buf)?;
            if n == 0 {
                break;
            }
            let enc_len = BASE64_ENGINE.encoded_length(n);
            let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());

            // Build fused output in a single buffer, then one write.
            let wp = build_fused_output(encoded, wrap_col, &mut col, &mut fused_buf);
            writer.write_all(&fused_buf[..wp])?;
        }

        if col > 0 {
            writer.write_all(b"\n")?;
        }
    }

    Ok(())
}

/// Build fused encode+wrap output into a pre-allocated buffer.
/// Returns the number of bytes written.
/// Uses unsafe ptr ops to avoid bounds checks in the hot loop.
#[inline]
fn build_fused_output(data: &[u8], wrap_col: usize, col: &mut usize, out_buf: &mut [u8]) -> usize {
    let mut rp = 0;
    let mut wp = 0;
    let len = data.len();
    let src = data.as_ptr();
    let dst = out_buf.as_mut_ptr();

    while rp < len {
        let space = wrap_col - *col;
        let avail = len - rp;

        if avail <= space {
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(rp), dst.add(wp), avail);
            }
            wp += avail;
            *col += avail;
            if *col == wrap_col {
                unsafe { *dst.add(wp) = b'\n' };
                wp += 1;
                *col = 0;
            }
            break;
        } else {
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(rp), dst.add(wp), space);
                *dst.add(wp + space) = b'\n';
            }
            wp += space + 1;
            rp += space;
            *col = 0;
        }
    }

    wp
}
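
// Illustrative sketch, not part of the original module: shows how the column
// counter carries wrap state across successive chunks, which is what lets
// encode_stream wrap correctly across chunk boundaries.
#[cfg(test)]
mod build_fused_output_sketch {
    use super::*;

    #[test]
    fn column_state_spans_chunks() {
        let mut col = 0usize;
        let mut buf = [0u8; 64];

        let wp = build_fused_output(b"ABCDEF", 4, &mut col, &mut buf);
        assert_eq!(&buf[..wp], &b"ABCD\nEF"[..]);
        assert_eq!(col, 2);

        let wp = build_fused_output(b"GHIJ", 4, &mut col, &mut buf);
        assert_eq!(&buf[..wp], &b"GH\nIJ"[..]);
        assert_eq!(col, 2);
    }
}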

/// Stream-decode from a reader to a writer. Used for stdin processing.
/// Fused single-pass: read chunk -> strip whitespace -> decode immediately.
/// Uses 16MB read buffer to reduce syscalls and memchr2-based SIMD whitespace
/// stripping for the common case (only \n and \r whitespace in base64 streams).
pub fn decode_stream(
    reader: &mut impl Read,
    ignore_garbage: bool,
    writer: &mut impl Write,
) -> io::Result<()> {
    const READ_CHUNK: usize = 16 * 1024 * 1024;
    let mut buf = vec![0u8; READ_CHUNK];
    // Pre-allocate clean buffer once and reuse across iterations.
    // Use Vec with set_len for zero-overhead reset instead of clear() + extend().
    let mut clean: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
    let mut carry = [0u8; 4];
    let mut carry_len = 0usize;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        // Copy carry bytes to start of clean buffer (0-3 bytes from previous chunk)
        unsafe {
            std::ptr::copy_nonoverlapping(carry.as_ptr(), clean.as_mut_ptr(), carry_len);
        }

        let chunk = &buf[..n];
        if ignore_garbage {
            // Scalar filter for ignore_garbage mode (rare path)
            let dst = unsafe { clean.as_mut_ptr().add(carry_len) };
            let mut wp = 0usize;
            for &b in chunk {
                if is_base64_char(b) {
                    unsafe { *dst.add(wp) = b };
                    wp += 1;
                }
            }
            unsafe { clean.set_len(carry_len + wp) };
        } else {
            // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy
            // the gaps between them. For typical base64 (76-char lines), newlines
            // are ~1/77 of the data, so we process ~76 bytes per memchr hit
            // instead of 1 byte per scalar iteration.
            let dst = unsafe { clean.as_mut_ptr().add(carry_len) };
            let mut wp = 0usize;
            let mut gap_start = 0usize;

            for pos in memchr::memchr2_iter(b'\n', b'\r', chunk) {
                let gap_len = pos - gap_start;
                if gap_len > 0 {
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            chunk.as_ptr().add(gap_start),
                            dst.add(wp),
                            gap_len,
                        );
                    }
                    wp += gap_len;
                }
                gap_start = pos + 1;
            }
            let tail_len = n - gap_start;
            if tail_len > 0 {
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        chunk.as_ptr().add(gap_start),
                        dst.add(wp),
                        tail_len,
                    );
                }
                wp += tail_len;
            }
            let total_clean = carry_len + wp;
            unsafe { clean.set_len(total_clean) };

            // Second pass for rare whitespace (tab, space, VT, FF) using lookup table.
            // In typical base64 streams this does nothing, but we need correctness.
            let has_rare_ws = clean[carry_len..total_clean]
                .iter()
                .any(|&b| !NOT_WHITESPACE[b as usize]);
            if has_rare_ws {
                let ptr = clean.as_mut_ptr();
                let mut rp = carry_len;
                let mut cwp = carry_len;
                while rp < total_clean {
                    let b = unsafe { *ptr.add(rp) };
                    if NOT_WHITESPACE[b as usize] {
                        unsafe { *ptr.add(cwp) = b };
                        cwp += 1;
                    }
                    rp += 1;
                }
                clean.truncate(cwp);
            }
        }

        carry_len = 0;
        let is_last = n < READ_CHUNK;

        if is_last {
            // Last chunk: decode everything (including padding)
            decode_clean_slice(&mut clean, writer)?;
        } else {
            // Save incomplete base64 quadruplet for next iteration
            let clean_len = clean.len();
            let decode_len = (clean_len / 4) * 4;
            let leftover = clean_len - decode_len;
            if leftover > 0 {
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        clean.as_ptr().add(decode_len),
                        carry.as_mut_ptr(),
                        leftover,
                    );
                }
                carry_len = leftover;
            }
            if decode_len > 0 {
                clean.truncate(decode_len);
                decode_clean_slice(&mut clean, writer)?;
            }
        }
    }

    // Handle any remaining carry-over bytes
    if carry_len > 0 {
        let mut carry_buf = carry[..carry_len].to_vec();
        decode_clean_slice(&mut carry_buf, writer)?;
    }

    Ok(())
}
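
// Illustrative sketch, not part of the original module: encode_stream and
// decode_stream round-trip through in-memory cursors, assuming the default
// 76-column wrap on the encode side.
#[cfg(test)]
mod stream_round_trip_sketch {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn encode_then_decode_stream() {
        let data: Vec<u8> = (0..10_000usize).map(|i| (i * 7 % 256) as u8).collect();

        let mut encoded = Vec::new();
        encode_stream(&mut Cursor::new(&data), 76, &mut encoded).unwrap();
        // Every line of the wrapped output is at most 76 characters.
        assert!(encoded.split(|&b| b == b'\n').all(|line| line.len() <= 76));

        let mut decoded = Vec::new();
        decode_stream(&mut Cursor::new(&encoded), false, &mut decoded).unwrap();
        assert_eq!(decoded, data);
    }
}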

/// Write all IoSlice entries using write_vectored (writev syscall).
/// Falls back to write_all per slice on partial writes.
fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    let total: usize = slices.iter().map(|s| s.len()).sum();

    // Try write_vectored first — often writes everything in one syscall
    let written = match out.write_vectored(slices) {
        Ok(n) if n >= total => return Ok(()),
        Ok(n) => n,
        Err(e) => return Err(e),
    };

    // Partial write fallback
    let mut skip = written;
    for slice in slices {
        let slen = slice.len();
        if skip >= slen {
            skip -= slen;
            continue;
        }
        if skip > 0 {
            out.write_all(&slice[skip..])?;
            skip = 0;
        } else {
            out.write_all(slice)?;
        }
    }
    Ok(())
}
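
// Illustrative sketch, not part of the original module: a writer that accepts at
// most a few bytes per call forces the partial-write fallback; the result must
// still be the exact concatenation of all slices.
#[cfg(test)]
mod write_all_vectored_sketch {
    use super::*;
    use std::io::Write;

    struct TrickleWriter {
        inner: Vec<u8>,
        max_per_call: usize,
    }

    impl Write for TrickleWriter {
        fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
            let n = buf.len().min(self.max_per_call);
            self.inner.extend_from_slice(&buf[..n]);
            Ok(n)
        }
        fn flush(&mut self) -> io::Result<()> {
            Ok(())
        }
    }

    #[test]
    fn partial_writes_are_completed() {
        let mut out = TrickleWriter { inner: Vec::new(), max_per_call: 10 };
        let a = b"hello world!".to_vec();
        let b = b"abc".to_vec();
        let slices = [io::IoSlice::new(&a), io::IoSlice::new(&b)];
        write_all_vectored(&mut out, &slices).unwrap();
        assert_eq!(out.inner, b"hello world!abc".to_vec());
    }
}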

/// Read as many bytes as possible into buf, retrying on partial reads.
/// Fast path: regular file reads usually return the full buffer on the first call,
/// avoiding the loop overhead entirely.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    // Fast path: first read() usually fills the entire buffer for regular files
    let n = reader.read(buf)?;
    if n == buf.len() || n == 0 {
        return Ok(n);
    }
    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
    let mut total = n;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
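
// Illustrative sketch, not part of the original module: a reader that returns at
// most a few bytes per call exercises the retry loop in read_full, which should
// keep filling the buffer until it is full or the source is exhausted.
#[cfg(test)]
mod read_full_sketch {
    use super::*;
    use std::io::Read;

    struct TrickleReader {
        data: Vec<u8>,
        pos: usize,
        max_per_call: usize,
    }

    impl Read for TrickleReader {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            let remaining = self.data.len() - self.pos;
            let n = remaining.min(self.max_per_call).min(buf.len());
            buf[..n].copy_from_slice(&self.data[self.pos..self.pos + n]);
            self.pos += n;
            Ok(n)
        }
    }

    #[test]
    fn fills_buffer_across_short_reads() {
        let data: Vec<u8> = (0..100u8).collect();
        let mut reader = TrickleReader { data: data.clone(), pos: 0, max_per_call: 7 };

        let mut buf = [0u8; 64];
        assert_eq!(read_full(&mut reader, &mut buf).unwrap(), 64);
        assert_eq!(&buf[..], &data[..64]);

        let mut buf = [0u8; 64];
        assert_eq!(read_full(&mut reader, &mut buf).unwrap(), 36);
        assert_eq!(&buf[..36], &data[64..]);
    }
}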