coreutils_rs/base64/core.rs

1use std::io::{self, Read, Write};
2
3use base64_simd::AsOut;
4use rayon::prelude::*;
5
6const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;
7
/// Number of available CPUs, as reported by `std::thread::available_parallelism`.
/// Used in the parallel-threshold checks on the encode paths, which spawn scoped
/// threads directly instead of going through Rayon's thread pool.
10#[inline]
11fn num_cpus() -> usize {
12    std::thread::available_parallelism()
13        .map(|n| n.get())
14        .unwrap_or(1)
15}
16
17/// Chunk size for sequential no-wrap encoding: 4MB aligned to 3 bytes.
/// Smaller chunks reduce peak memory and the page-fault overhead of large buffers.
19/// For 10MB input, 4MB chunks = 5.3MB buffer vs 13.3MB with 32MB chunks,
20/// saving ~2000 page faults (~0.4ms). Subsequent chunks reuse hot pages.
21const NOWRAP_CHUNK: usize = 4 * 1024 * 1024 - (4 * 1024 * 1024 % 3);
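// A minimal compile-time sketch (illustrative, not load-bearing) of the alignment
// claim above: 4 MiB leaves a remainder of 1 when divided by 3, so the subtraction
// lands on 4_194_303, which is a multiple of 3.
const _: () = assert!(NOWRAP_CHUNK % 3 == 0 && NOWRAP_CHUNK == 4_194_303);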
22
23/// Minimum data size for parallel no-wrap encoding (2MB).
/// No-wrap parallel has minimal overhead: split at 3-byte boundaries, encode
/// each chunk independently into a per-thread buffer, and combine the buffers
/// with a single writev. At 2MB+ the 2-4x parallel speedup easily amortizes
/// the thread-spawn cost.
27const PARALLEL_NOWRAP_THRESHOLD: usize = 2 * 1024 * 1024;
28
29/// Minimum data size for parallel wrapped encoding (4MB).
/// Wrapped parallel splits the input at line boundaries across the available
/// cores, giving roughly a 3x speedup for 10MB files (~1ms vs ~4ms encode).
/// Per-thread output-buffer allocation and thread-spawn overhead are amortized
/// by that speedup at 4MB+.
34const PARALLEL_WRAPPED_THRESHOLD: usize = 4 * 1024 * 1024;
35
36/// Minimum data size for parallel decoding (2MB of base64 data).
37/// Lower threshold lets parallel decode kick in earlier for medium files.
38const PARALLEL_DECODE_THRESHOLD: usize = 2 * 1024 * 1024;
39
40/// Encode data and write to output with line wrapping.
41/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
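///
/// A minimal usage sketch (illustrative; marked `ignore` because the module path
/// needed to call it from a doctest is an assumption):
///
/// ```ignore
/// let mut out = Vec::new();
/// encode_to_writer(b"hello world", 76, &mut out).unwrap();
/// assert_eq!(out, b"aGVsbG8gd29ybGQ=\n");
/// ```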
42pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
43    if data.is_empty() {
44        return Ok(());
45    }
46
47    if wrap_col == 0 {
48        return encode_no_wrap(data, out);
49    }
50
51    encode_wrapped(data, wrap_col, out)
52}
53
54/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
55fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
56    if data.len() >= PARALLEL_NOWRAP_THRESHOLD && num_cpus() > 1 {
57        return encode_no_wrap_parallel(data, out);
58    }
59
60    let actual_chunk = NOWRAP_CHUNK.min(data.len());
61    let enc_max = BASE64_ENGINE.encoded_length(actual_chunk);
62    // SAFETY: encode() writes exactly enc_len bytes before we read them.
63    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
64    #[allow(clippy::uninit_vec)]
65    unsafe {
66        buf.set_len(enc_max);
67    }
68
69    for chunk in data.chunks(NOWRAP_CHUNK) {
70        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
71        let encoded = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
72        out.write_all(encoded)?;
73    }
74    Ok(())
75}
76
77/// Parallel no-wrap encoding: split at 3-byte boundaries, encode chunks in parallel.
78/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
79///
80/// Uses std::thread::scope instead of Rayon to avoid pool initialization overhead (~300µs).
81/// Each scoped thread allocates its own output buffer and encodes independently.
82/// Output uses writev to combine all per-thread buffers in a single syscall.
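///
/// A worked example of the split arithmetic (illustrative numbers):
///
/// ```ignore
/// let (len, threads) = (10 * 1024 * 1024, 8);
/// let raw_chunk = len / threads;                // 1_310_720
/// let chunk_size = ((raw_chunk + 2) / 3) * 3;   // 1_310_721, a multiple of 3
/// assert_eq!(chunk_size % 3, 0);
/// ```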
83fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
84    let num_threads = num_cpus().max(1);
85    let raw_chunk = data.len() / num_threads;
86    // Align to 3 bytes so each chunk encodes without padding (except the last)
87    let chunk_size = ((raw_chunk + 2) / 3) * 3;
88
89    // Split input into 3-byte-aligned chunks
90    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();
91
92    // Each scoped thread allocates its own output buffer and encodes independently.
93    let results: Vec<Vec<u8>> = std::thread::scope(|s| {
94        let handles: Vec<_> = chunks
95            .iter()
96            .map(|chunk| {
                s.spawn(move || {
98                    let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
99                    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
100                    #[allow(clippy::uninit_vec)]
101                    unsafe {
102                        buf.set_len(enc_len);
103                    }
104                    let _ = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
105                    buf
106                })
107            })
108            .collect();
109        handles.into_iter().map(|h| h.join().unwrap()).collect()
110    });
111
112    // Single writev for all chunks in order
113    let slices: Vec<io::IoSlice> = results.iter().map(|r| io::IoSlice::new(r)).collect();
114    write_all_vectored(out, &slices)
115}
116
/// Encode with line wrapping.
///
/// Common case (per-line input length divisible by 3, e.g. the default wrap_col = 76):
/// encode each line directly to its final position in the output buffer, newline
/// included, and write each chunk with a single write_all.
/// Fallback (non-3-aligned line length): bulk-encode a chunk in one SIMD pass,
/// then expand backwards in place to insert newlines before writing.
///
/// Both paths avoid fuse_wrap's extra copy pass and writev's 300+ syscall
/// overhead, using one reused buffer and one write syscall per chunk.
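///
/// How the line math works out for two wrap widths (illustrative):
///
/// ```ignore
/// assert_eq!(76 * 3 / 4, 57);  // default wrap: 57 input bytes per 76-char line, 3-aligned
/// assert_eq!(10 * 3 / 4, 7);   // wrap_col = 10: 7 bytes per line, not 3-aligned -> fallback
/// ```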
124fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
125    // Calculate bytes_per_line: input bytes that produce exactly wrap_col encoded chars.
126    // For default wrap_col=76: 76*3/4 = 57 bytes per line.
127    let bytes_per_line = wrap_col * 3 / 4;
128    if bytes_per_line == 0 {
129        // Degenerate case: wrap_col < 4, fall back to byte-at-a-time
130        return encode_wrapped_small(data, wrap_col, out);
131    }
132
133    // Parallel encoding for large data when bytes_per_line is a multiple of 3.
134    // This guarantees each chunk encodes to complete base64 without padding.
135    if data.len() >= PARALLEL_WRAPPED_THRESHOLD && bytes_per_line.is_multiple_of(3) {
136        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
137    }
138
139    // Direct-to-position encode+wrap: encode each line directly to its final position
140    // in the output buffer, eliminating the backward expansion pass entirely.
141    // Each bytes_per_line input bytes encode to exactly wrap_col output bytes + 1 newline.
142    //
143    // For large data (>4MB), processes in chunks to reduce peak memory allocation.
144    // 4MB input → ~5.4MB buffer vs 13.5MB for 10MB all at once, saving ~2000 page
145    // faults (~0.4ms). After the first chunk, pages are hot in TLB so reuse is free.
146    if bytes_per_line.is_multiple_of(3) {
147        let line_out = wrap_col + 1;
148
149        // Chunk size: 4MB of input, aligned to bytes_per_line
150        const MAX_CHUNK_INPUT: usize = 4 * 1024 * 1024;
        // max(1) guards against a wrap_col so large that bytes_per_line exceeds
        // MAX_CHUNK_INPUT, which would otherwise make chunk_input zero.
        let lines_per_chunk = (MAX_CHUNK_INPUT / bytes_per_line).max(1);
152        let chunk_input = lines_per_chunk * bytes_per_line;
153        let chunk_output = lines_per_chunk * line_out;
154
155        // Allocate buffer for one chunk (reused across chunks)
156        let buf_cap = chunk_output + line_out + 8; // +line_out for remainder
157        let mut buf: Vec<u8> = Vec::with_capacity(buf_cap);
158        #[allow(clippy::uninit_vec)]
159        unsafe {
160            buf.set_len(buf_cap);
161        }
162
163        let mut data_off = 0;
164
165        // Process full chunks
166        while data_off + chunk_input <= data.len() {
167            let chunk_data = &data[data_off..data_off + chunk_input];
168            let dst = buf.as_mut_ptr();
169            let mut line_idx = 0;
170
171            // 4-line unrolled loop for ILP
172            while line_idx + 4 <= lines_per_chunk {
173                let in_base = line_idx * bytes_per_line;
174                let out_base = line_idx * line_out;
175                unsafe {
176                    let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
177                    let _ = BASE64_ENGINE
178                        .encode(&chunk_data[in_base..in_base + bytes_per_line], s0.as_out());
179                    *dst.add(out_base + wrap_col) = b'\n';
180
181                    let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
182                    let _ = BASE64_ENGINE.encode(
183                        &chunk_data[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
184                        s1.as_out(),
185                    );
186                    *dst.add(out_base + line_out + wrap_col) = b'\n';
187
188                    let s2 =
189                        std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
190                    let _ = BASE64_ENGINE.encode(
191                        &chunk_data[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
192                        s2.as_out(),
193                    );
194                    *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
195
196                    let s3 =
197                        std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
198                    let _ = BASE64_ENGINE.encode(
199                        &chunk_data[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
200                        s3.as_out(),
201                    );
202                    *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
203                }
204                line_idx += 4;
205            }
206
207            while line_idx < lines_per_chunk {
208                let in_base = line_idx * bytes_per_line;
209                let out_base = line_idx * line_out;
210                unsafe {
211                    let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
212                    let _ = BASE64_ENGINE
213                        .encode(&chunk_data[in_base..in_base + bytes_per_line], s.as_out());
214                    *dst.add(out_base + wrap_col) = b'\n';
215                }
216                line_idx += 1;
217            }
218
219            out.write_all(&buf[..chunk_output])?;
220            data_off += chunk_input;
221        }
222
223        // Remaining data (partial chunk)
224        let remaining = data.len() - data_off;
225        if remaining > 0 {
226            let remaining_data = &data[data_off..];
227            let full_lines = remaining / bytes_per_line;
228            let remainder_input = remaining % bytes_per_line;
229            let remainder_encoded = if remainder_input > 0 {
230                BASE64_ENGINE.encoded_length(remainder_input) + 1
231            } else {
232                0
233            };
234            let remaining_output = full_lines * line_out + remainder_encoded;
235
236            // Ensure buffer is large enough for the remainder
237            if remaining_output > buf.len() {
238                buf.reserve(remaining_output - buf.len());
239                #[allow(clippy::uninit_vec)]
240                unsafe {
241                    buf.set_len(remaining_output);
242                }
243            }
244
245            let dst = buf.as_mut_ptr();
246            let mut line_idx = 0;
247
248            while line_idx + 4 <= full_lines {
249                let in_base = line_idx * bytes_per_line;
250                let out_base = line_idx * line_out;
251                unsafe {
252                    let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
253                    let _ = BASE64_ENGINE.encode(
254                        &remaining_data[in_base..in_base + bytes_per_line],
255                        s0.as_out(),
256                    );
257                    *dst.add(out_base + wrap_col) = b'\n';
258
259                    let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
260                    let _ = BASE64_ENGINE.encode(
261                        &remaining_data[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
262                        s1.as_out(),
263                    );
264                    *dst.add(out_base + line_out + wrap_col) = b'\n';
265
266                    let s2 =
267                        std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
268                    let _ = BASE64_ENGINE.encode(
269                        &remaining_data[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
270                        s2.as_out(),
271                    );
272                    *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
273
274                    let s3 =
275                        std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
276                    let _ = BASE64_ENGINE.encode(
277                        &remaining_data[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
278                        s3.as_out(),
279                    );
280                    *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
281                }
282                line_idx += 4;
283            }
284
285            while line_idx < full_lines {
286                let in_base = line_idx * bytes_per_line;
287                let out_base = line_idx * line_out;
288                unsafe {
289                    let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
290                    let _ = BASE64_ENGINE.encode(
291                        &remaining_data[in_base..in_base + bytes_per_line],
292                        s.as_out(),
293                    );
294                    *dst.add(out_base + wrap_col) = b'\n';
295                }
296                line_idx += 1;
297            }
298
299            if remainder_input > 0 {
300                let in_off = full_lines * bytes_per_line;
301                let out_off = full_lines * line_out;
302                let enc_len = BASE64_ENGINE.encoded_length(remainder_input);
303                unsafe {
304                    let s = std::slice::from_raw_parts_mut(dst.add(out_off), enc_len);
305                    let _ = BASE64_ENGINE.encode(&remaining_data[in_off..], s.as_out());
306                    *dst.add(out_off + enc_len) = b'\n';
307                }
308            }
309
310            out.write_all(&buf[..remaining_output])?;
311        }
312
313        return Ok(());
314    }
315
316    // Fallback for non-3-aligned bytes_per_line: chunk + in-place expansion
317    let lines_per_chunk = (32 * 1024 * 1024) / bytes_per_line;
318    let max_input_chunk = (lines_per_chunk * bytes_per_line).max(bytes_per_line);
319
320    let enc_max = BASE64_ENGINE.encoded_length(max_input_chunk.min(data.len()));
321    let num_lines_max = enc_max / wrap_col + 1;
322    let out_max = num_lines_max * (wrap_col + 1) + wrap_col + 1;
323    let mut buf: Vec<u8> = Vec::with_capacity(out_max);
324    #[allow(clippy::uninit_vec)]
325    unsafe {
326        buf.set_len(out_max);
327    }
328
329    for chunk in data.chunks(max_input_chunk.max(1)) {
330        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
331        let _ = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
332        let num_full = enc_len / wrap_col;
333        let rem = enc_len % wrap_col;
334        let chunk_out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };
335
336        // Expand backwards
337        unsafe {
338            let ptr = buf.as_mut_ptr();
339            let mut rp = enc_len;
340            let mut wp = chunk_out_len;
341            if rem > 0 {
342                wp -= 1;
343                *ptr.add(wp) = b'\n';
344                wp -= rem;
345                rp -= rem;
346                if rp != wp {
347                    std::ptr::copy(ptr.add(rp), ptr.add(wp), rem);
348                }
349            }
350            for _ in 0..num_full {
351                wp -= 1;
352                *ptr.add(wp) = b'\n';
353                wp -= wrap_col;
354                rp -= wrap_col;
355                if rp != wp {
356                    std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
357                }
358            }
359        }
360        out.write_all(&buf[..chunk_out_len])?;
361    }
362
363    Ok(())
364}
365
366/// Static newline byte for IoSlice references in writev calls.
367static NEWLINE: [u8; 1] = [b'\n'];
368
369/// Write encoded base64 data with line wrapping using write_vectored (writev).
370/// Builds IoSlice entries pointing at wrap_col-sized segments of the encoded buffer,
/// interleaved with newline IoSlices, then writes in batches of MAX_IOV entries.
372/// This is zero-copy: no fused output buffer needed.
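///
/// Rough batching math for a 10 MiB input at the default wrap (illustrative):
///
/// ```ignore
/// let encoded = 10 * 1024 * 1024 / 3 * 4;   // ~14 MB of base64
/// let lines = encoded / 76;                 // ~184_000 lines
/// let batches = lines * 2 / 1024;           // ~360 writev calls
/// ```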
373#[inline]
374#[allow(dead_code)]
375fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
376    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
377    // Each line needs 2 entries (data + newline), so 512 lines per batch.
378    const MAX_IOV: usize = 1024;
379
380    let num_full_lines = encoded.len() / wrap_col;
381    let remainder = encoded.len() % wrap_col;
382    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };
383
384    // Small output: build all IoSlices and write in one call
385    if total_iov <= MAX_IOV {
386        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
387        let mut pos = 0;
388        for _ in 0..num_full_lines {
389            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
390            iov.push(io::IoSlice::new(&NEWLINE));
391            pos += wrap_col;
392        }
393        if remainder > 0 {
394            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
395            iov.push(io::IoSlice::new(&NEWLINE));
396        }
397        return write_all_vectored(out, &iov);
398    }
399
400    // Large output: write in batches
401    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
402    let mut pos = 0;
403    for _ in 0..num_full_lines {
404        iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
405        iov.push(io::IoSlice::new(&NEWLINE));
406        pos += wrap_col;
407        if iov.len() >= MAX_IOV {
408            write_all_vectored(out, &iov)?;
409            iov.clear();
410        }
411    }
412    if remainder > 0 {
413        iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
414        iov.push(io::IoSlice::new(&NEWLINE));
415    }
416    if !iov.is_empty() {
417        write_all_vectored(out, &iov)?;
418    }
419    Ok(())
420}
421
422/// Write encoded base64 data with line wrapping using writev, tracking column state
423/// across calls. Used by encode_stream for piped input where chunks don't align
424/// to line boundaries.
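///
/// A small sketch of the column tracking (illustrative values):
///
/// ```ignore
/// // With wrap_col = 4 and col = 3 carried over from a previous call,
/// // writing "ABCDE" emits "A\nBCDE\n" and leaves col = 0.
/// let mut out = Vec::new();
/// let mut col = 3;
/// write_wrapped_iov_streaming(b"ABCDE", 4, &mut col, &mut out).unwrap();
/// assert_eq!(col, 0);
/// ```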
425#[inline]
426fn write_wrapped_iov_streaming(
427    encoded: &[u8],
428    wrap_col: usize,
429    col: &mut usize,
430    out: &mut impl Write,
431) -> io::Result<()> {
432    const MAX_IOV: usize = 1024;
433    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
434    let mut rp = 0;
435
436    while rp < encoded.len() {
437        let space = wrap_col - *col;
438        let avail = encoded.len() - rp;
439
440        if avail <= space {
441            // Remaining data fits in current line
442            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
443            *col += avail;
444            if *col == wrap_col {
445                iov.push(io::IoSlice::new(&NEWLINE));
446                *col = 0;
447            }
448            break;
449        } else {
450            // Fill current line and add newline
451            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
452            iov.push(io::IoSlice::new(&NEWLINE));
453            rp += space;
454            *col = 0;
455        }
456
457        if iov.len() >= MAX_IOV - 1 {
458            write_all_vectored(out, &iov)?;
459            iov.clear();
460        }
461    }
462
463    if !iov.is_empty() {
464        write_all_vectored(out, &iov)?;
465    }
466    Ok(())
467}
468
/// Parallel wrapped encoding with per-thread output buffers.
/// Requires bytes_per_line % 3 == 0 so each chunk encodes without intermediate padding.
///
/// The input is split at line boundaries; each scoped thread encodes its
/// bytes_per_line-sized input groups directly to their final positions in its own
/// buffer (wrap_col encoded bytes + '\n' per line), and a single writev then
/// combines the per-thread buffers in order. This avoids one large shared output
/// buffer (~13.5MB for a 10MB input) whose page faults (~3400 faults = ~3.4ms)
/// would dominate encoding time; per-thread buffers (~3.4MB each) page-fault
/// concurrently, reducing wall-clock to ~0.8ms.
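///
/// Rough per-thread sizing for a 10 MiB input on 4 cores (illustrative):
///
/// ```ignore
/// let per_thread_in = 10 * 1024 * 1024 / 4;      // ~2.6 MiB of input lines
/// let per_thread_out = per_thread_in / 57 * 77;  // ~3.4 MiB once newlines are added
/// ```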
483fn encode_wrapped_parallel(
484    data: &[u8],
485    wrap_col: usize,
486    bytes_per_line: usize,
487    out: &mut impl Write,
488) -> io::Result<()> {
489    let line_out = wrap_col + 1;
490    let total_full_lines = data.len() / bytes_per_line;
491
492    // Split work at line boundaries for parallel processing
493    let num_threads = num_cpus().max(1);
494    let lines_per_chunk = (total_full_lines / num_threads).max(1);
495
496    // Build per-thread input ranges aligned to bytes_per_line
497    let mut tasks: Vec<(usize, usize)> = Vec::new(); // (input_offset, num_input_bytes)
498    let mut in_off = 0usize;
499    while in_off < data.len() {
500        let chunk_input = (lines_per_chunk * bytes_per_line).min(data.len() - in_off);
501        let aligned_input = if in_off + chunk_input < data.len() {
502            (chunk_input / bytes_per_line) * bytes_per_line
503        } else {
504            chunk_input
505        };
506        if aligned_input == 0 {
507            break;
508        }
509        tasks.push((in_off, aligned_input));
510        in_off += aligned_input;
511    }
512
513    // Each scoped thread encodes into its own buffer with interleaved newlines.
514    let results: Vec<Vec<u8>> = std::thread::scope(|s| {
515        let handles: Vec<_> = tasks
516            .iter()
517            .map(|&(in_off, chunk_len)| {
518                s.spawn(move || {
519                    let input = &data[in_off..in_off + chunk_len];
520                    let full_lines = chunk_len / bytes_per_line;
521                    let rem = chunk_len % bytes_per_line;
522
523                    let remainder_encoded = if rem > 0 {
524                        BASE64_ENGINE.encoded_length(rem) + 1
525                    } else {
526                        0
527                    };
528                    let buf_size = full_lines * line_out + remainder_encoded;
529
530                    let mut buf: Vec<u8> = Vec::with_capacity(buf_size);
531                    #[allow(clippy::uninit_vec)]
532                    unsafe {
533                        buf.set_len(buf_size);
534                    }
535
536                    if full_lines > 0 {
537                        let dst = buf.as_mut_ptr();
538                        let mut line_idx = 0;
539
540                        // 4-line unrolled loop for ILP
541                        while line_idx + 4 <= full_lines {
542                            let in_base = line_idx * bytes_per_line;
543                            let out_base = line_idx * line_out;
544                            unsafe {
545                                let s0 =
546                                    std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
547                                let _ = BASE64_ENGINE
548                                    .encode(&input[in_base..in_base + bytes_per_line], s0.as_out());
549                                *dst.add(out_base + wrap_col) = b'\n';
550
551                                let s1 = std::slice::from_raw_parts_mut(
552                                    dst.add(out_base + line_out),
553                                    wrap_col,
554                                );
555                                let _ = BASE64_ENGINE.encode(
556                                    &input[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
557                                    s1.as_out(),
558                                );
559                                *dst.add(out_base + line_out + wrap_col) = b'\n';
560
561                                let s2 = std::slice::from_raw_parts_mut(
562                                    dst.add(out_base + 2 * line_out),
563                                    wrap_col,
564                                );
565                                let _ = BASE64_ENGINE.encode(
566                                    &input[in_base + 2 * bytes_per_line
567                                        ..in_base + 3 * bytes_per_line],
568                                    s2.as_out(),
569                                );
570                                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
571
572                                let s3 = std::slice::from_raw_parts_mut(
573                                    dst.add(out_base + 3 * line_out),
574                                    wrap_col,
575                                );
576                                let _ = BASE64_ENGINE.encode(
577                                    &input[in_base + 3 * bytes_per_line
578                                        ..in_base + 4 * bytes_per_line],
579                                    s3.as_out(),
580                                );
581                                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
582                            }
583                            line_idx += 4;
584                        }
585
586                        while line_idx < full_lines {
587                            let in_base = line_idx * bytes_per_line;
588                            let out_base = line_idx * line_out;
589                            unsafe {
590                                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
591                                let _ = BASE64_ENGINE
592                                    .encode(&input[in_base..in_base + bytes_per_line], s.as_out());
593                                *dst.add(out_base + wrap_col) = b'\n';
594                            }
595                            line_idx += 1;
596                        }
597                    }
598
599                    if rem > 0 {
600                        let line_input = &input[full_lines * bytes_per_line..];
601                        let enc_len = BASE64_ENGINE.encoded_length(rem);
602                        let woff = full_lines * line_out;
603                        unsafe {
604                            let s =
605                                std::slice::from_raw_parts_mut(buf.as_mut_ptr().add(woff), enc_len);
606                            let _ = BASE64_ENGINE.encode(line_input, s.as_out());
607                            *buf.as_mut_ptr().add(woff + enc_len) = b'\n';
608                        }
609                    }
610
611                    buf
612                })
613            })
614            .collect();
615        handles.into_iter().map(|h| h.join().unwrap()).collect()
616    });
617
618    // Single writev for all per-thread buffers in order
619    let slices: Vec<io::IoSlice> = results.iter().map(|r| io::IoSlice::new(r)).collect();
620    write_all_vectored(out, &slices)
621}
622
623/// Fuse encoded base64 data with newlines in a single pass.
624/// Uses ptr::copy_nonoverlapping with 8-line unrolling for max throughput.
625/// Returns number of bytes written.
626#[inline]
627#[allow(dead_code)]
628fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
629    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
630    let mut rp = 0;
631    let mut wp = 0;
632
633    // Unrolled: process 8 lines per iteration for better ILP
634    while rp + 8 * wrap_col <= encoded.len() {
635        unsafe {
636            let src = encoded.as_ptr().add(rp);
637            let dst = out_buf.as_mut_ptr().add(wp);
638
639            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
640            *dst.add(wrap_col) = b'\n';
641
642            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
643            *dst.add(line_out + wrap_col) = b'\n';
644
645            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
646            *dst.add(2 * line_out + wrap_col) = b'\n';
647
648            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
649            *dst.add(3 * line_out + wrap_col) = b'\n';
650
651            std::ptr::copy_nonoverlapping(src.add(4 * wrap_col), dst.add(4 * line_out), wrap_col);
652            *dst.add(4 * line_out + wrap_col) = b'\n';
653
654            std::ptr::copy_nonoverlapping(src.add(5 * wrap_col), dst.add(5 * line_out), wrap_col);
655            *dst.add(5 * line_out + wrap_col) = b'\n';
656
657            std::ptr::copy_nonoverlapping(src.add(6 * wrap_col), dst.add(6 * line_out), wrap_col);
658            *dst.add(6 * line_out + wrap_col) = b'\n';
659
660            std::ptr::copy_nonoverlapping(src.add(7 * wrap_col), dst.add(7 * line_out), wrap_col);
661            *dst.add(7 * line_out + wrap_col) = b'\n';
662        }
663        rp += 8 * wrap_col;
664        wp += 8 * line_out;
665    }
666
667    // Handle remaining 4 lines at a time
668    while rp + 4 * wrap_col <= encoded.len() {
669        unsafe {
670            let src = encoded.as_ptr().add(rp);
671            let dst = out_buf.as_mut_ptr().add(wp);
672
673            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
674            *dst.add(wrap_col) = b'\n';
675
676            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
677            *dst.add(line_out + wrap_col) = b'\n';
678
679            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
680            *dst.add(2 * line_out + wrap_col) = b'\n';
681
682            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
683            *dst.add(3 * line_out + wrap_col) = b'\n';
684        }
685        rp += 4 * wrap_col;
686        wp += 4 * line_out;
687    }
688
689    // Remaining full lines
690    while rp + wrap_col <= encoded.len() {
691        unsafe {
692            std::ptr::copy_nonoverlapping(
693                encoded.as_ptr().add(rp),
694                out_buf.as_mut_ptr().add(wp),
695                wrap_col,
696            );
697            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
698        }
699        rp += wrap_col;
700        wp += line_out;
701    }
702
703    // Partial last line
704    if rp < encoded.len() {
705        let remaining = encoded.len() - rp;
706        unsafe {
707            std::ptr::copy_nonoverlapping(
708                encoded.as_ptr().add(rp),
709                out_buf.as_mut_ptr().add(wp),
710                remaining,
711            );
712        }
713        wp += remaining;
714        out_buf[wp] = b'\n';
715        wp += 1;
716    }
717
718    wp
719}
720
721/// Fallback for very small wrap columns (< 4 chars).
722fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
723    let enc_max = BASE64_ENGINE.encoded_length(data.len());
724    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
725    #[allow(clippy::uninit_vec)]
726    unsafe {
727        buf.set_len(enc_max);
728    }
729    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());
730
731    let wc = wrap_col.max(1);
732    for line in encoded.chunks(wc) {
733        out.write_all(line)?;
734        out.write_all(b"\n")?;
735    }
736    Ok(())
737}
738
739/// Decode base64 data and write to output (borrows data, allocates clean buffer).
740/// When `ignore_garbage` is true, strip all non-base64 characters.
741/// When false, only strip whitespace (standard behavior).
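///
/// A minimal usage sketch (illustrative; marked `ignore` because the doctest
/// module path is an assumption):
///
/// ```ignore
/// let mut out = Vec::new();
/// decode_to_writer(b"aGVsbG8gd29ybGQ=\n", false, &mut out).unwrap();
/// assert_eq!(out, b"hello world");
/// ```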
742pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
743    if data.is_empty() {
744        return Ok(());
745    }
746
747    if ignore_garbage {
748        let mut cleaned = strip_non_base64(data);
749        return decode_clean_slice(&mut cleaned, out);
750    }
751
752    // For large data (>= 512KB): use bulk strip + single-shot decode.
753    // try_line_decode decodes per-line (~25ns overhead per 76-byte line call),
754    // while strip+decode uses SIMD gap-copy + single-shot SIMD decode at ~6.5 GB/s.
755    // For 10MB decode benchmark: ~2ms (bulk) vs ~4ms (per-line) = 2x faster.
756    // For small data (< 512KB): per-line decode avoids allocation overhead.
757    if data.len() < 512 * 1024 && data.len() >= 77 {
758        if let Some(result) = try_line_decode(data, out) {
759            return result;
760        }
761    }
762
763    // Fast path: single-pass SIMD strip + decode
764    decode_stripping_whitespace(data, out)
765}
766
767/// Decode base64 from a mutable buffer (MAP_PRIVATE mmap or owned Vec).
768/// Strips whitespace in-place using SIMD memchr2 gap-copy, then decodes
769/// in-place with base64_simd::decode_inplace. Zero additional allocations.
770///
771/// For MAP_PRIVATE mmap: the kernel uses COW semantics, so only pages
772/// containing whitespace (newlines) get physically copied (~1.3% for
773/// 76-char line base64). The decode writes to the same buffer, but decoded
774/// data is always shorter than encoded (3/4 ratio), so it fits in-place.
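///
/// A minimal usage sketch with an owned buffer (illustrative):
///
/// ```ignore
/// let mut buf = b"aGVsbG8gd29ybGQ=\n".to_vec();
/// let mut out = Vec::new();
/// decode_mmap_inplace(&mut buf, false, &mut out).unwrap();
/// assert_eq!(out, b"hello world");
/// ```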
775pub fn decode_mmap_inplace(
776    data: &mut [u8],
777    ignore_garbage: bool,
778    out: &mut impl Write,
779) -> io::Result<()> {
780    if data.is_empty() {
781        return Ok(());
782    }
783
784    // For small data: try line-by-line decode (avoids COW page faults).
785    // For large data (>= 512KB): bulk strip+decode is faster than per-line decode.
786    if !ignore_garbage && data.len() >= 77 && data.len() < 512 * 1024 {
787        if let Some(result) = try_line_decode(data, out) {
788            return result;
789        }
790    }
791
792    if ignore_garbage {
793        // Strip non-base64 chars in-place
794        let ptr = data.as_mut_ptr();
795        let len = data.len();
796        let mut wp = 0;
797        for rp in 0..len {
798            let b = unsafe { *ptr.add(rp) };
799            if is_base64_char(b) {
800                unsafe { *ptr.add(wp) = b };
801                wp += 1;
802            }
803        }
804        match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
805            Ok(decoded) => return out.write_all(decoded),
806            Err(_) => return decode_error(),
807        }
808    }
809
810    // Fast path: uniform-line fused strip+decode (no intermediate buffer).
811    if data.len() >= 77 {
812        if let Some(result) = try_decode_uniform_lines(data, out) {
813            return result;
814        }
815    }
816
817    // Fallback: strip whitespace in-place using SIMD memchr2 gap-copy.
818
819    // Quick check: no newlines at all — maybe already clean
820    if memchr::memchr2(b'\n', b'\r', data).is_none() {
821        // Check for rare whitespace
822        if !data
823            .iter()
824            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
825        {
826            // Perfectly clean — decode in-place directly
827            match BASE64_ENGINE.decode_inplace(data) {
828                Ok(decoded) => return out.write_all(decoded),
829                Err(_) => return decode_error(),
830            }
831        }
832        // Rare whitespace only — strip in-place
833        let ptr = data.as_mut_ptr();
834        let len = data.len();
835        let mut wp = 0;
836        for rp in 0..len {
837            let b = unsafe { *ptr.add(rp) };
838            if NOT_WHITESPACE[b as usize] {
839                unsafe { *ptr.add(wp) = b };
840                wp += 1;
841            }
842        }
843        match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
844            Ok(decoded) => return out.write_all(decoded),
845            Err(_) => return decode_error(),
846        }
847    }
848
849    // SIMD gap-copy: strip \n and \r in-place using memchr2
850    let ptr = data.as_mut_ptr();
851    let len = data.len();
852    let mut wp = 0usize;
853    let mut gap_start = 0usize;
854    let mut has_rare_ws = false;
855
856    // SAFETY: memchr2_iter reads from the original data. We write to positions
857    // [0..wp] which are always <= gap_start, so we never overwrite unread data.
858    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
859        let gap_len = pos - gap_start;
860        if gap_len > 0 {
861            if !has_rare_ws {
862                // Check for rare whitespace during the gap-copy
863                has_rare_ws = unsafe {
864                    std::slice::from_raw_parts(ptr.add(gap_start), gap_len)
865                        .iter()
866                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
867                };
868            }
869            if wp != gap_start {
870                unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len) };
871            }
872            wp += gap_len;
873        }
874        gap_start = pos + 1;
875    }
876    // Final gap
877    let tail_len = len - gap_start;
878    if tail_len > 0 {
879        if !has_rare_ws {
880            has_rare_ws = unsafe {
881                std::slice::from_raw_parts(ptr.add(gap_start), tail_len)
882                    .iter()
883                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
884            };
885        }
886        if wp != gap_start {
887            unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len) };
888        }
889        wp += tail_len;
890    }
891
892    // Second pass for rare whitespace if needed
893    if has_rare_ws {
894        let mut rp = 0;
895        let mut cwp = 0;
896        while rp < wp {
897            let b = unsafe { *ptr.add(rp) };
898            if NOT_WHITESPACE[b as usize] {
899                unsafe { *ptr.add(cwp) = b };
900                cwp += 1;
901            }
902            rp += 1;
903        }
904        wp = cwp;
905    }
906
907    // Decode in-place: decoded data is always shorter than encoded (3/4 ratio)
908    if wp >= PARALLEL_DECODE_THRESHOLD {
909        // For large data, use parallel decode from the cleaned slice
910        return decode_borrowed_clean_parallel(out, &data[..wp]);
911    }
912    match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
913        Ok(decoded) => out.write_all(decoded),
914        Err(_) => decode_error(),
915    }
916}
917
918/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
919pub fn decode_owned(
920    data: &mut Vec<u8>,
921    ignore_garbage: bool,
922    out: &mut impl Write,
923) -> io::Result<()> {
924    if data.is_empty() {
925        return Ok(());
926    }
927
928    if ignore_garbage {
929        data.retain(|&b| is_base64_char(b));
930    } else {
931        strip_whitespace_inplace(data);
932    }
933
934    decode_clean_slice(data, out)
935}
936
937/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
938/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
939/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
940/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
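///
/// A small sketch of the behaviour (illustrative):
///
/// ```ignore
/// let mut v = b"QUJD\r\nREVG\n".to_vec();
/// strip_whitespace_inplace(&mut v);
/// assert_eq!(v, b"QUJDREVG");
/// ```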
941fn strip_whitespace_inplace(data: &mut Vec<u8>) {
942    // Quick check: skip stripping if no \n or \r in the data.
943    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
944    // For typical base64 (76-char lines), we'll find \n immediately and skip this.
945    if memchr::memchr2(b'\n', b'\r', data).is_none() {
946        // No newlines/CR — check for rare whitespace only
947        if data
948            .iter()
949            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
950        {
951            data.retain(|&b| NOT_WHITESPACE[b as usize]);
952        }
953        return;
954    }
955
956    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
957    // gaps between them to compact the data in-place. For typical base64 streams,
958    // newlines are the only whitespace, so this handles >99% of cases.
959    let ptr = data.as_mut_ptr();
960    let len = data.len();
961    let mut wp = 0usize;
962    let mut gap_start = 0usize;
963    let mut has_rare_ws = false;
964
965    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
966        let gap_len = pos - gap_start;
967        if gap_len > 0 {
968            if !has_rare_ws {
969                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
970                has_rare_ws = data[gap_start..pos]
971                    .iter()
972                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
973            }
974            if wp != gap_start {
975                unsafe {
976                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
977                }
978            }
979            wp += gap_len;
980        }
981        gap_start = pos + 1;
982    }
983    // Copy the final gap
984    let tail_len = len - gap_start;
985    if tail_len > 0 {
986        if !has_rare_ws {
987            has_rare_ws = data[gap_start..]
988                .iter()
989                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
990        }
991        if wp != gap_start {
992            unsafe {
993                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
994            }
995        }
996        wp += tail_len;
997    }
998
999    data.truncate(wp);
1000
1001    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
1002    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
1003    if has_rare_ws {
1004        let ptr = data.as_mut_ptr();
1005        let len = data.len();
1006        let mut rp = 0;
1007        let mut cwp = 0;
1008        while rp < len {
1009            let b = unsafe { *ptr.add(rp) };
1010            if NOT_WHITESPACE[b as usize] {
1011                unsafe { *ptr.add(cwp) = b };
1012                cwp += 1;
1013            }
1014            rp += 1;
1015        }
1016        data.truncate(cwp);
1017    }
1018}
1019
1020/// 256-byte lookup table: true for non-whitespace bytes.
1021/// Used for single-pass whitespace stripping in decode.
1022static NOT_WHITESPACE: [bool; 256] = {
1023    let mut table = [true; 256];
1024    table[b' ' as usize] = false;
1025    table[b'\t' as usize] = false;
1026    table[b'\n' as usize] = false;
1027    table[b'\r' as usize] = false;
1028    table[0x0b] = false; // vertical tab
1029    table[0x0c] = false; // form feed
1030    table
1031};
1032
1033/// Fused strip+decode for uniform-line base64 data.
1034/// Detects consistent line length, then processes in sub-chunks: each sub-chunk
1035/// copies lines to a small local buffer (L2-hot) and decodes immediately.
1036/// Eliminates the large intermediate clean buffer (~12MB for 10MB decode).
1037/// Returns None if the data doesn't have uniform line structure.
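///
/// The layout it detects, for standard 76-char lines (illustrative):
///
/// ```ignore
/// let line_len = 76;                 // must be non-zero and a multiple of 4
/// let stride = line_len + 1;         // 77 bytes per line including '\n'
/// assert_eq!(line_len * 3 / 4, 57);  // decoded bytes per full line
/// ```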
1038fn try_decode_uniform_lines(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
1039    let first_nl = memchr::memchr(b'\n', data)?;
1040    let line_len = first_nl;
1041    if line_len == 0 || line_len % 4 != 0 {
1042        return None;
1043    }
1044
1045    let stride = line_len + 1;
1046
1047    // Verify the data has consistent line structure (first + last lines)
1048    let check_lines = 4.min(data.len() / stride);
1049    for i in 1..check_lines {
1050        let expected_nl = i * stride - 1;
1051        if expected_nl >= data.len() || data[expected_nl] != b'\n' {
1052            return None;
1053        }
1054    }
1055
1056    let full_lines = if data.len() >= stride {
1057        let candidate = data.len() / stride;
1058        if candidate > 0 && data[candidate * stride - 1] != b'\n' {
1059            return None;
1060        }
1061        candidate
1062    } else {
1063        0
1064    };
1065
1066    let remainder_start = full_lines * stride;
1067    let remainder = &data[remainder_start..];
1068    let rem_clean = if remainder.last() == Some(&b'\n') {
1069        &remainder[..remainder.len() - 1]
1070    } else {
1071        remainder
1072    };
1073
1074    // Compute exact decoded sizes
1075    let decoded_per_line = line_len * 3 / 4;
1076    let rem_decoded_size = if rem_clean.is_empty() {
1077        0
1078    } else {
1079        let pad = rem_clean
1080            .iter()
1081            .rev()
1082            .take(2)
1083            .filter(|&&b| b == b'=')
1084            .count();
1085        rem_clean.len() * 3 / 4 - pad
1086    };
1087    let total_decoded = full_lines * decoded_per_line + rem_decoded_size;
1088    let clean_len = full_lines * line_len;
1089
    // Parallel path: fused strip+decode with 256KB sub-chunks per thread.
1091    // Each thread copies lines to a thread-local buffer (L2-hot) and decodes immediately,
1092    // eliminating the 12MB+ intermediate clean buffer entirely.
1093    if clean_len >= PARALLEL_DECODE_THRESHOLD && rayon::current_num_threads() > 1 {
1094        let mut output: Vec<u8> = Vec::with_capacity(total_decoded);
1095        #[allow(clippy::uninit_vec)]
1096        unsafe {
1097            output.set_len(total_decoded);
1098        }
1099
1100        let out_ptr = output.as_mut_ptr() as usize;
1101        let src_ptr = data.as_ptr() as usize;
1102        let num_threads = rayon::current_num_threads().max(1);
1103        let lines_per_thread = (full_lines + num_threads - 1) / num_threads;
1104        let lines_per_sub = (256 * 1024 / line_len).max(1);
1105
1106        let result: Result<Vec<()>, io::Error> = (0..num_threads)
1107            .into_par_iter()
1108            .map(|t| {
1109                let start_line = t * lines_per_thread;
1110                if start_line >= full_lines {
1111                    return Ok(());
1112                }
1113                let end_line = (start_line + lines_per_thread).min(full_lines);
1114                let chunk_lines = end_line - start_line;
1115
1116                let sub_buf_size = lines_per_sub.min(chunk_lines) * line_len;
1117                let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
1118                #[allow(clippy::uninit_vec)]
1119                unsafe {
1120                    local_buf.set_len(sub_buf_size);
1121                }
1122
1123                let src = src_ptr as *const u8;
1124                let out_base = out_ptr as *mut u8;
1125                let local_dst = local_buf.as_mut_ptr();
1126
1127                let mut sub_start = 0usize;
1128                while sub_start < chunk_lines {
1129                    let sub_count = (chunk_lines - sub_start).min(lines_per_sub);
1130                    let sub_clean = sub_count * line_len;
1131
1132                    for i in 0..sub_count {
1133                        unsafe {
1134                            std::ptr::copy_nonoverlapping(
1135                                src.add((start_line + sub_start + i) * stride),
1136                                local_dst.add(i * line_len),
1137                                line_len,
1138                            );
1139                        }
1140                    }
1141
1142                    let out_offset = (start_line + sub_start) * decoded_per_line;
1143                    let out_size = sub_count * decoded_per_line;
1144                    let out_slice = unsafe {
1145                        std::slice::from_raw_parts_mut(out_base.add(out_offset), out_size)
1146                    };
1147                    BASE64_ENGINE
1148                        .decode(&local_buf[..sub_clean], out_slice.as_out())
1149                        .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid input"))?;
1150
1151                    sub_start += sub_count;
1152                }
1153                Ok(())
1154            })
1155            .collect();
1156
1157        if let Err(e) = result {
1158            return Some(Err(e));
1159        }
1160
1161        if !rem_clean.is_empty() {
1162            let rem_out = &mut output[full_lines * decoded_per_line..total_decoded];
1163            match BASE64_ENGINE.decode(rem_clean, rem_out.as_out()) {
1164                Ok(_) => {}
1165                Err(_) => return Some(decode_error()),
1166            }
1167        }
1168
1169        return Some(out.write_all(&output[..total_decoded]));
1170    }
1171
1172    // Sequential path: fused strip+decode in 256KB sub-chunks.
1173    // Larger sub-chunks give SIMD decode more data per call, improving throughput.
1174    // Uses decode_inplace on a small reusable buffer — no large allocations at all.
1175    let lines_per_sub = (256 * 1024 / line_len).max(1);
1176    let sub_buf_size = lines_per_sub * line_len;
1177    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
1178    #[allow(clippy::uninit_vec)]
1179    unsafe {
1180        local_buf.set_len(sub_buf_size);
1181    }
1182
1183    let src = data.as_ptr();
1184    let local_dst = local_buf.as_mut_ptr();
1185
1186    let mut line_idx = 0usize;
1187    while line_idx < full_lines {
1188        let sub_count = (full_lines - line_idx).min(lines_per_sub);
1189        let sub_clean = sub_count * line_len;
1190
1191        for i in 0..sub_count {
1192            unsafe {
1193                std::ptr::copy_nonoverlapping(
1194                    src.add((line_idx + i) * stride),
1195                    local_dst.add(i * line_len),
1196                    line_len,
1197                );
1198            }
1199        }
1200
1201        match BASE64_ENGINE.decode_inplace(&mut local_buf[..sub_clean]) {
1202            Ok(decoded) => {
1203                if let Err(e) = out.write_all(decoded) {
1204                    return Some(Err(e));
1205                }
1206            }
1207            Err(_) => return Some(decode_error()),
1208        }
1209
1210        line_idx += sub_count;
1211    }
1212
1213    if !rem_clean.is_empty() {
1214        let mut rem_buf = rem_clean.to_vec();
1215        match BASE64_ENGINE.decode_inplace(&mut rem_buf) {
1216            Ok(decoded) => {
1217                if let Err(e) = out.write_all(decoded) {
1218                    return Some(Err(e));
1219                }
1220            }
1221            Err(_) => return Some(decode_error()),
1222        }
1223    }
1224
1225    Some(Ok(()))
1226}
1227
1228/// Decode by stripping whitespace and decoding in a single fused pass.
1229/// For data with no whitespace, decodes directly without any copy.
1230/// Detects uniform line structure for fast structured-copy (no search needed),
1231/// falls back to SIMD memchr2 gap-copy for irregular data.
1232fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
1233    // Fast path for uniform-line base64 (e.g., standard 76-char lines + newline).
1234    // Copies at known offsets, avoiding the memchr2 search entirely.
1235    // For 13MB base64: saves ~1ms vs memchr2 gap-copy (just structured memcpy).
1236    if data.len() >= 77 {
1237        if let Some(result) = try_decode_uniform_lines(data, out) {
1238            return result;
1239        }
1240    }
1241
1242    // Quick check: skip stripping if no \n or \r in the data.
1243    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
1244    if memchr::memchr2(b'\n', b'\r', data).is_none() {
1245        // No newlines/CR — check for rare whitespace only
1246        if !data
1247            .iter()
1248            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
1249        {
1250            return decode_borrowed_clean(out, data);
1251        }
1252        // Has rare whitespace only — strip and decode
1253        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
1254        for &b in data {
1255            if NOT_WHITESPACE[b as usize] {
1256                cleaned.push(b);
1257            }
1258        }
1259        return decode_clean_slice(&mut cleaned, out);
1260    }
1261
1262    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
1263    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
1264    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
1265    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
1266    let dst = clean.as_mut_ptr();
1267    let mut wp = 0usize;
1268    let mut gap_start = 0usize;
1269    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
1270    // This avoids the second full-scan pass when only \n/\r are present.
1271    let mut has_rare_ws = false;
1272
1273    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
1274        let gap_len = pos - gap_start;
1275        if gap_len > 0 {
1276            // Check gap region for rare whitespace during copy.
1277            // This adds ~1 branch per gap but eliminates the second full scan.
1278            if !has_rare_ws {
1279                has_rare_ws = data[gap_start..pos]
1280                    .iter()
1281                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1282            }
1283            unsafe {
1284                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
1285            }
1286            wp += gap_len;
1287        }
1288        gap_start = pos + 1;
1289    }
1290    // Copy the final gap after the last \n/\r
1291    let tail_len = data.len() - gap_start;
1292    if tail_len > 0 {
1293        if !has_rare_ws {
1294            has_rare_ws = data[gap_start..]
1295                .iter()
1296                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1297        }
1298        unsafe {
1299            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
1300        }
1301        wp += tail_len;
1302    }
1303    unsafe {
1304        clean.set_len(wp);
1305    }
1306
1307    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
1308    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
1309    if has_rare_ws {
1310        let ptr = clean.as_mut_ptr();
1311        let len = clean.len();
1312        let mut rp = 0;
1313        let mut cwp = 0;
1314        while rp < len {
1315            let b = unsafe { *ptr.add(rp) };
1316            if NOT_WHITESPACE[b as usize] {
1317                unsafe { *ptr.add(cwp) = b };
1318                cwp += 1;
1319            }
1320            rp += 1;
1321        }
1322        clean.truncate(cwp);
1323    }
1324
1325    // For large data (>= threshold), use parallel decode for multi-core speedup.
1326    // For small data, use in-place decode to avoid extra allocation.
1327    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
1328        decode_borrowed_clean_parallel(out, &clean)
1329    } else {
1330        decode_clean_slice(&mut clean, out)
1331    }
1332}
1333
1334/// Try to decode base64 data line-by-line, avoiding whitespace stripping.
1335/// Returns Some(result) if the data has uniform line lengths suitable for
1336/// per-line decode, or None if the data doesn't fit this pattern.
1337///
1338/// For standard 76-char-line base64 (wrap=76): each line is 76 encoded chars
1339/// + newline = 77 bytes. 76 chars = 19 groups of 4 = 57 decoded bytes per line.
1340/// We decode each line directly into its position in the output buffer.
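///
/// Remainder/padding arithmetic used below (illustrative):
///
/// ```ignore
/// let rem: &[u8] = b"Zm8=";  // final partial line, one '=' pad
/// let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
/// assert_eq!(rem.len() * 3 / 4 - pad, 2);  // decodes to 2 bytes ("fo")
/// ```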
1341fn try_line_decode(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
1342    // Find the first newline to determine line length
1343    let first_nl = memchr::memchr(b'\n', data)?;
1344    let line_len = first_nl; // encoded chars per line (without newline)
1345
1346    // Line length must be a multiple of 4 (complete base64 groups, no padding mid-stream)
1347    if line_len == 0 || line_len % 4 != 0 {
1348        return None;
1349    }
1350
1351    let line_stride = line_len + 1; // line_len chars + 1 newline byte
1352    let decoded_per_line = line_len * 3 / 4;
1353
1354    // Verify the data has a consistent line structure by checking the next few lines
1355    let check_lines = 4.min(data.len() / line_stride);
1356    for i in 1..check_lines {
1357        let expected_nl = i * line_stride - 1;
1358        if expected_nl >= data.len() {
1359            break;
1360        }
1361        if data[expected_nl] != b'\n' {
1362            return None; // Inconsistent line length
1363        }
1364    }
1365
1366    // Calculate full lines and remainder
1367    let full_lines = if data.len() >= line_stride {
1368        // Check how many complete lines fit
1369        let candidate = data.len() / line_stride;
1370        // Verify the last full line's newline
1371        if candidate > 0 && data[candidate * line_stride - 1] != b'\n' {
1372            return None; // Not a clean line-structured file
1373        }
1374        candidate
1375    } else {
1376        0
1377    };
1378
1379    let remainder_start = full_lines * line_stride;
1380    let remainder = &data[remainder_start..];
1381
1382    // Calculate exact output size
1383    let remainder_clean_len = if remainder.is_empty() {
1384        0
1385    } else {
1386        // Remainder might end with newline, strip it
1387        let rem = if remainder.last() == Some(&b'\n') {
1388            &remainder[..remainder.len() - 1]
1389        } else {
1390            remainder
1391        };
1392        if rem.is_empty() {
1393            0
1394        } else {
1395            // Check for padding
1396            let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
1397            if rem.len() % 4 != 0 {
1398                return None; // Invalid remainder
1399            }
1400            rem.len() * 3 / 4 - pad
1401        }
1402    };
1403
1404    // Single-allocation decode: allocate full decoded output, decode all lines
1405    // directly into it, then write_all in one syscall. For 10MB base64 (7.5MB decoded),
1406    // this does 1 write() instead of ~30 chunked writes. The 7.5MB allocation is trivial
1407    // compared to the mmap'd input. SIMD decode at ~8 GB/s finishes in <1ms.
1408    let total_decoded = full_lines * decoded_per_line + remainder_clean_len;
1409    let mut out_buf: Vec<u8> = Vec::with_capacity(total_decoded);
1410    #[allow(clippy::uninit_vec)]
1411    unsafe {
1412        out_buf.set_len(total_decoded);
1413    }
1414
1415    let dst = out_buf.as_mut_ptr();
1416
1417    // Parallel line decode for large inputs (>= PARALLEL_DECODE_THRESHOLD, 2MB): split lines across threads.
1418    // Each thread decodes a contiguous block of lines directly to its final position
1419    // in the shared output buffer. SAFETY: non-overlapping output regions per thread.
1420    if data.len() >= PARALLEL_DECODE_THRESHOLD && full_lines >= 64 {
1421        let out_addr = dst as usize;
1422        let num_threads = rayon::current_num_threads().max(1);
1423        let lines_per_chunk = (full_lines / num_threads).max(1);
1424
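        // Illustrative split (hypothetical numbers): with 130,000 full lines on 8 Rayon
        // threads, lines_per_chunk = 16,250 and the ranges built below become
        // [(0, 16250), (16250, 32500), ..., (113750, 130000)], one block per thread.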
1425        // Build per-thread task ranges: (start_line, end_line)
1426        let mut tasks: Vec<(usize, usize)> = Vec::new();
1427        let mut line_off = 0;
1428        while line_off < full_lines {
1429            let end = (line_off + lines_per_chunk).min(full_lines);
1430            tasks.push((line_off, end));
1431            line_off = end;
1432        }
1433
1434        let decode_result: Result<Vec<()>, io::Error> = tasks
1435            .par_iter()
1436            .map(|&(start_line, end_line)| {
1437                let out_ptr = out_addr as *mut u8;
1438                let mut i = start_line;
1439
1440                // 4x unrolled decode within each thread's range
1441                while i + 4 <= end_line {
1442                    let in_base = i * line_stride;
1443                    let ob = i * decoded_per_line;
1444                    unsafe {
1445                        let s0 = std::slice::from_raw_parts_mut(out_ptr.add(ob), decoded_per_line);
1446                        if BASE64_ENGINE
1447                            .decode(&data[in_base..in_base + line_len], s0.as_out())
1448                            .is_err()
1449                        {
1450                            return Err(io::Error::new(
1451                                io::ErrorKind::InvalidData,
1452                                "invalid input",
1453                            ));
1454                        }
1455                        let s1 = std::slice::from_raw_parts_mut(
1456                            out_ptr.add(ob + decoded_per_line),
1457                            decoded_per_line,
1458                        );
1459                        if BASE64_ENGINE
1460                            .decode(
1461                                &data[in_base + line_stride..in_base + line_stride + line_len],
1462                                s1.as_out(),
1463                            )
1464                            .is_err()
1465                        {
1466                            return Err(io::Error::new(
1467                                io::ErrorKind::InvalidData,
1468                                "invalid input",
1469                            ));
1470                        }
1471                        let s2 = std::slice::from_raw_parts_mut(
1472                            out_ptr.add(ob + 2 * decoded_per_line),
1473                            decoded_per_line,
1474                        );
1475                        if BASE64_ENGINE
1476                            .decode(
1477                                &data[in_base + 2 * line_stride
1478                                    ..in_base + 2 * line_stride + line_len],
1479                                s2.as_out(),
1480                            )
1481                            .is_err()
1482                        {
1483                            return Err(io::Error::new(
1484                                io::ErrorKind::InvalidData,
1485                                "invalid input",
1486                            ));
1487                        }
1488                        let s3 = std::slice::from_raw_parts_mut(
1489                            out_ptr.add(ob + 3 * decoded_per_line),
1490                            decoded_per_line,
1491                        );
1492                        if BASE64_ENGINE
1493                            .decode(
1494                                &data[in_base + 3 * line_stride
1495                                    ..in_base + 3 * line_stride + line_len],
1496                                s3.as_out(),
1497                            )
1498                            .is_err()
1499                        {
1500                            return Err(io::Error::new(
1501                                io::ErrorKind::InvalidData,
1502                                "invalid input",
1503                            ));
1504                        }
1505                    }
1506                    i += 4;
1507                }
1508
1509                while i < end_line {
1510                    let in_start = i * line_stride;
1511                    let out_off = i * decoded_per_line;
1512                    let out_slice = unsafe {
1513                        std::slice::from_raw_parts_mut(out_ptr.add(out_off), decoded_per_line)
1514                    };
1515                    if BASE64_ENGINE
1516                        .decode(&data[in_start..in_start + line_len], out_slice.as_out())
1517                        .is_err()
1518                    {
1519                        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"));
1520                    }
1521                    i += 1;
1522                }
1523
1524                Ok(())
1525            })
1526            .collect();
1527
1528        if decode_result.is_err() {
1529            return Some(decode_error());
1530        }
1531    } else {
1532        // Sequential decode with 4x unrolling for smaller inputs
1533        let mut i = 0;
1534
1535        while i + 4 <= full_lines {
1536            let in_base = i * line_stride;
1537            let out_base = i * decoded_per_line;
1538            unsafe {
1539                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), decoded_per_line);
1540                if BASE64_ENGINE
1541                    .decode(&data[in_base..in_base + line_len], s0.as_out())
1542                    .is_err()
1543                {
1544                    return Some(decode_error());
1545                }
1546
1547                let s1 = std::slice::from_raw_parts_mut(
1548                    dst.add(out_base + decoded_per_line),
1549                    decoded_per_line,
1550                );
1551                if BASE64_ENGINE
1552                    .decode(
1553                        &data[in_base + line_stride..in_base + line_stride + line_len],
1554                        s1.as_out(),
1555                    )
1556                    .is_err()
1557                {
1558                    return Some(decode_error());
1559                }
1560
1561                let s2 = std::slice::from_raw_parts_mut(
1562                    dst.add(out_base + 2 * decoded_per_line),
1563                    decoded_per_line,
1564                );
1565                if BASE64_ENGINE
1566                    .decode(
1567                        &data[in_base + 2 * line_stride..in_base + 2 * line_stride + line_len],
1568                        s2.as_out(),
1569                    )
1570                    .is_err()
1571                {
1572                    return Some(decode_error());
1573                }
1574
1575                let s3 = std::slice::from_raw_parts_mut(
1576                    dst.add(out_base + 3 * decoded_per_line),
1577                    decoded_per_line,
1578                );
1579                if BASE64_ENGINE
1580                    .decode(
1581                        &data[in_base + 3 * line_stride..in_base + 3 * line_stride + line_len],
1582                        s3.as_out(),
1583                    )
1584                    .is_err()
1585                {
1586                    return Some(decode_error());
1587                }
1588            }
1589            i += 4;
1590        }
1591
1592        while i < full_lines {
1593            let in_start = i * line_stride;
1594            let in_end = in_start + line_len;
1595            let out_off = i * decoded_per_line;
1596            let out_slice =
1597                unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), decoded_per_line) };
1598            match BASE64_ENGINE.decode(&data[in_start..in_end], out_slice.as_out()) {
1599                Ok(_) => {}
1600                Err(_) => return Some(decode_error()),
1601            }
1602            i += 1;
1603        }
1604    }
1605
1606    // Decode remainder
1607    if remainder_clean_len > 0 {
1608        let rem = if remainder.last() == Some(&b'\n') {
1609            &remainder[..remainder.len() - 1]
1610        } else {
1611            remainder
1612        };
1613        let out_off = full_lines * decoded_per_line;
1614        let out_slice =
1615            unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), remainder_clean_len) };
1616        match BASE64_ENGINE.decode(rem, out_slice.as_out()) {
1617            Ok(_) => {}
1618            Err(_) => return Some(decode_error()),
1619        }
1620    }
1621
1622    // Single write_all for the entire decoded output
1623    Some(out.write_all(&out_buf[..total_decoded]))
1624}
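
// A minimal round-trip sketch (not part of the original file): builds a uniformly
// wrapped stream by hand and checks that try_line_decode takes the per-line path.
#[cfg(test)]
mod line_decode_sketch {
    use super::*;

    #[test]
    fn decodes_uniform_76_char_lines() {
        // 114 input bytes -> exactly two 57-byte groups -> two 76-char lines.
        let raw = vec![0xABu8; 114];
        let mut wrapped = Vec::new();
        for chunk in raw.chunks(57) {
            let mut line = vec![0u8; BASE64_ENGINE.encoded_length(chunk.len())];
            wrapped.extend_from_slice(BASE64_ENGINE.encode(chunk, line[..].as_out()));
            wrapped.push(b'\n');
        }
        let mut decoded = Vec::new();
        try_line_decode(&wrapped, &mut decoded)
            .expect("uniform line structure should be detected")
            .unwrap();
        assert_eq!(decoded, raw);
    }
}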
1625
1626/// Decode a clean (no whitespace) buffer in-place with SIMD.
1627fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
1628    if data.is_empty() {
1629        return Ok(());
1630    }
1631    match BASE64_ENGINE.decode_inplace(data) {
1632        Ok(decoded) => out.write_all(decoded),
1633        Err(_) => decode_error(),
1634    }
1635}
1636
1637/// Cold error path — keeps hot decode path tight by moving error construction out of line.
1638#[cold]
1639#[inline(never)]
1640fn decode_error() -> io::Result<()> {
1641    Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
1642}
1643
1644/// Decode clean base64 data (no whitespace) from a borrowed slice.
1645fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1646    if data.is_empty() {
1647        return Ok(());
1648    }
1649    // Parallel decode for large data: split at 4-byte boundaries,
1650    // decode each chunk independently (base64 is context-free per 4-char group).
1651    if data.len() >= PARALLEL_DECODE_THRESHOLD {
1652        return decode_borrowed_clean_parallel(out, data);
1653    }
1654    // Pre-allocate exact output size to avoid decode_to_vec's reallocation.
1655    // Decoded size = data.len() * 3 / 4 minus padding.
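    // e.g. an 8-char input ending in a single '=' decodes to 8 * 3 / 4 - 1 = 5 bytes.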
1656    let pad = data.iter().rev().take(2).filter(|&&b| b == b'=').count();
1657    let decoded_size = data.len() * 3 / 4 - pad;
1658    let mut buf: Vec<u8> = Vec::with_capacity(decoded_size);
1659    #[allow(clippy::uninit_vec)]
1660    unsafe {
1661        buf.set_len(decoded_size);
1662    }
1663    match BASE64_ENGINE.decode(data, buf[..decoded_size].as_out()) {
1664        Ok(decoded) => {
1665            out.write_all(decoded)?;
1666            Ok(())
1667        }
1668        Err(_) => decode_error(),
1669    }
1670}
1671
1672/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel via rayon.
1673/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
1674/// upfront, so each thread decodes directly to its final position. No compaction needed.
1675fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1676    let num_threads = rayon::current_num_threads().max(1);
1677    let raw_chunk = data.len() / num_threads;
1678    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
1679    let chunk_size = ((raw_chunk + 3) / 4) * 4;
1680
1681    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();
1682
1683    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
1684    // For all chunks except the last, decoded size is exactly chunk.len() * 3 / 4.
1685    // For the last chunk, account for '=' padding bytes.
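    // Illustrative layout (hypothetical sizes): 8MB of clean base64 on 4 threads splits
    // into four 2MB chunks; each decodes to 1.5MB, so offsets becomes
    // [0, 1.5MB, 3MB, 4.5MB, 6MB] and total_decoded = 6MB.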
1686    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
1687    offsets.push(0);
1688    let mut total_decoded = 0usize;
1689    for (i, chunk) in chunks.iter().enumerate() {
1690        let decoded_size = if i == chunks.len() - 1 {
1691            // Last chunk: count '=' padding to get exact decoded size
1692            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
1693            chunk.len() * 3 / 4 - pad
1694        } else {
1695            // Non-last chunks: 4-byte aligned, no padding, exact 3/4 ratio
1696            chunk.len() * 3 / 4
1697        };
1698        total_decoded += decoded_size;
1699        offsets.push(total_decoded);
1700    }
1701
1702    // Pre-allocate contiguous output buffer with exact total size
1703    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
1704    #[allow(clippy::uninit_vec)]
1705    unsafe {
1706        output_buf.set_len(total_decoded);
1707    }
1708
1709    // Parallel decode: each thread decodes directly into its exact final position.
1710    // No compaction pass needed since offsets are computed from exact decoded sizes.
1711    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
1712    // Use usize representation of the pointer for Send+Sync compatibility with rayon.
1713    let out_addr = output_buf.as_mut_ptr() as usize;
1714    let decode_result: Result<Vec<()>, io::Error> = chunks
1715        .par_iter()
1716        .enumerate()
1717        .map(|(i, chunk)| {
1718            let offset = offsets[i];
1719            let expected_size = offsets[i + 1] - offset;
1720            // SAFETY: each thread writes to non-overlapping region [offset..offset+expected_size]
1721            let out_slice = unsafe {
1722                std::slice::from_raw_parts_mut((out_addr as *mut u8).add(offset), expected_size)
1723            };
1724            let decoded = BASE64_ENGINE
1725                .decode(chunk, out_slice.as_out())
1726                .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid input"))?;
1727            debug_assert_eq!(decoded.len(), expected_size);
1728            Ok(())
1729        })
1730        .collect();
1731
1732    decode_result?;
1733
1734    out.write_all(&output_buf[..total_decoded])
1735}
1736
1737/// Strip non-base64 characters (for -i / --ignore-garbage).
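/// e.g. b"SG Vs\n*bG8=" filters down to b"SGVsbG8=" (space, newline, and '*' dropped).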
1738fn strip_non_base64(data: &[u8]) -> Vec<u8> {
1739    data.iter()
1740        .copied()
1741        .filter(|&b| is_base64_char(b))
1742        .collect()
1743}
1744
1745/// Check if a byte is a valid base64 alphabet character or padding.
1746#[inline]
1747fn is_base64_char(b: u8) -> bool {
1748    b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'='
1749}
1750
1751/// Stream-encode from a reader to a writer. Used for stdin processing.
1752/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
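///
/// A minimal usage sketch (illustrative only, not from the original file):
/// ```ignore
/// // Encode stdin to stdout with GNU-style 76-column wrapping.
/// encode_stream(&mut std::io::stdin().lock(), 76, &mut std::io::stdout().lock())?;
/// ```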
1753pub fn encode_stream(
1754    reader: &mut impl Read,
1755    wrap_col: usize,
1756    writer: &mut impl Write,
1757) -> io::Result<()> {
1758    if wrap_col == 0 {
1759        return encode_stream_nowrap(reader, writer);
1760    }
1761    encode_stream_wrapped(reader, wrap_col, writer)
1762}
1763
1764/// Streaming encode with NO line wrapping — optimized fast path.
1765/// Read size is 24MB (divisible by 3): encoded output = 24MB * 4/3 = 32MB.
1766/// With 24MB reads, typical 10-18MB inputs are consumed in a single read() call,
1767/// and the encoded output goes out in 1-2 write() calls.
1768fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1769    // 24MB read buffer, a multiple of 3: inputs up to 24MB are handled in a single pass, and non-final chunks encode without padding.
1770    const NOWRAP_READ: usize = 24 * 1024 * 1024; // exactly divisible by 3
1771
1772    // SAFETY: buf bytes are written by read_full before being processed.
1773    // encode_buf bytes are written by encode before being read.
1774    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
1775    #[allow(clippy::uninit_vec)]
1776    unsafe {
1777        buf.set_len(NOWRAP_READ);
1778    }
1779    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
1780    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1781    #[allow(clippy::uninit_vec)]
1782    unsafe {
1783        encode_buf.set_len(encode_buf_size);
1784    }
1785
1786    loop {
1787        let n = read_full(reader, &mut buf)?;
1788        if n == 0 {
1789            break;
1790        }
1791        let enc_len = BASE64_ENGINE.encoded_length(n);
1792        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1793        writer.write_all(encoded)?;
1794    }
1795    Ok(())
1796}
1797
1798/// Streaming encode WITH line wrapping.
1799/// For the common case (bytes_per_line = wrap_col * 3 / 4 is itself a multiple of 3,
1800/// e.g. 76-col wrapping), encodes each line directly into a contiguous output buffer
1801/// with newlines interleaved, then writes it in one write() call per chunk. This
1802/// eliminates the overhead of many writev() syscalls (one per ~512 lines via IoSlice).
1803///
1804/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
1805fn encode_stream_wrapped(
1806    reader: &mut impl Read,
1807    wrap_col: usize,
1808    writer: &mut impl Write,
1809) -> io::Result<()> {
1810    let bytes_per_line = wrap_col * 3 / 4;
1811    // For the common case (76-col wrapping, bytes_per_line=57 which is divisible by 3),
1812    // align the read buffer to bytes_per_line boundaries so each chunk produces
1813    // complete lines with no column carry-over between chunks.
1814    if bytes_per_line > 0 && bytes_per_line.is_multiple_of(3) {
1815        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
1816    }
1817
1818    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
1819    const STREAM_READ: usize = 12 * 1024 * 1024;
1820    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
1821    #[allow(clippy::uninit_vec)]
1822    unsafe {
1823        buf.set_len(STREAM_READ);
1824    }
1825    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
1826    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1827    #[allow(clippy::uninit_vec)]
1828    unsafe {
1829        encode_buf.set_len(encode_buf_size);
1830    }
1831
1832    let mut col = 0usize;
1833
1834    loop {
1835        let n = read_full(reader, &mut buf)?;
1836        if n == 0 {
1837            break;
1838        }
1839        let enc_len = BASE64_ENGINE.encoded_length(n);
1840        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1841
1842        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
1843    }
1844
1845    if col > 0 {
1846        writer.write_all(b"\n")?;
1847    }
1848
1849    Ok(())
1850}
1851
1852/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
1853/// encode each line directly into its final position with newline appended.
1854/// Eliminates the two-pass encode-then-fuse_wrap approach.
1855/// For 76-col wrapping (bytes_per_line=57): 24MB / 57 = ~441K complete lines per chunk.
1856/// Output = ~441K * 77 bytes = ~32MB, one write() syscall per chunk.
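///
/// Illustrative offsets for the 76-col case: line i reads input bytes [i*57, i*57 + 57)
/// and its 76 encoded bytes land at output [i*77, i*77 + 76), with the '\n' at i*77 + 76.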
1857fn encode_stream_wrapped_fused(
1858    reader: &mut impl Read,
1859    wrap_col: usize,
1860    bytes_per_line: usize,
1861    writer: &mut impl Write,
1862) -> io::Result<()> {
1863    // Align read size to bytes_per_line for complete output lines per chunk.
1864    // For 57-byte lines: ~441K lines * 57 bytes = ~24MB input, ~32MB output.
1865    let lines_per_chunk = (24 * 1024 * 1024) / bytes_per_line;
1866    let read_size = lines_per_chunk * bytes_per_line;
1867    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline
1868
1869    // SAFETY: buf bytes are written by read_full before being processed.
1870    // out_buf bytes are written by encode before being read.
1871    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
1872    #[allow(clippy::uninit_vec)]
1873    unsafe {
1874        buf.set_len(read_size);
1875    }
1876    // Output buffer: enough for all lines + remainder
1877    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
1878    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
1879    #[allow(clippy::uninit_vec)]
1880    unsafe {
1881        out_buf.set_len(max_output);
1882    }
1883
1884    loop {
1885        let n = read_full(reader, &mut buf)?;
1886        if n == 0 {
1887            break;
1888        }
1889
1890        let full_lines = n / bytes_per_line;
1891        let remainder = n % bytes_per_line;
1892
1893        // Encode each input line directly into its final output position.
1894        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
1895        // This eliminates the separate encode + fuse_wrap copy entirely.
1896        let dst = out_buf.as_mut_ptr();
1897        let mut line_idx = 0;
1898
1899        // 4-line unrolled loop for better ILP
1900        while line_idx + 4 <= full_lines {
1901            let in_base = line_idx * bytes_per_line;
1902            let out_base = line_idx * line_out;
1903            unsafe {
1904                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
1905                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
1906                *dst.add(out_base + wrap_col) = b'\n';
1907
1908                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
1909                let _ = BASE64_ENGINE.encode(
1910                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
1911                    s1.as_out(),
1912                );
1913                *dst.add(out_base + line_out + wrap_col) = b'\n';
1914
1915                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
1916                let _ = BASE64_ENGINE.encode(
1917                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
1918                    s2.as_out(),
1919                );
1920                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
1921
1922                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
1923                let _ = BASE64_ENGINE.encode(
1924                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
1925                    s3.as_out(),
1926                );
1927                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
1928            }
1929            line_idx += 4;
1930        }
1931
1932        // Remaining full lines
1933        while line_idx < full_lines {
1934            let in_base = line_idx * bytes_per_line;
1935            let out_base = line_idx * line_out;
1936            unsafe {
1937                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
1938                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
1939                *dst.add(out_base + wrap_col) = b'\n';
1940            }
1941            line_idx += 1;
1942        }
1943
1944        let mut wp = full_lines * line_out;
1945
1946        // Handle remainder (partial last line of this chunk)
1947        if remainder > 0 {
1948            let enc_len = BASE64_ENGINE.encoded_length(remainder);
1949            let line_input = &buf[full_lines * bytes_per_line..n];
1950            unsafe {
1951                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
1952                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
1953                *dst.add(wp + enc_len) = b'\n';
1954            }
1955            wp += enc_len + 1;
1956        }
1957
1958        writer.write_all(&out_buf[..wp])?;
1959    }
1960
1961    Ok(())
1962}
1963
1964/// Stream-decode from a reader to a writer. Used for stdin processing.
1965/// In-place strip + decode: read chunk -> strip whitespace in-place in read buffer
1966/// -> decode in-place -> write. Eliminates separate clean buffer allocation (saves 32MB).
1967/// Uses 32MB read buffer for maximum pipe throughput — read_full retries to
1968/// fill the entire buffer from the pipe, and 32MB means even large inputs
1969/// (up to ~24MB after base64 encoding of 18MB raw) are read in a single syscall batch.
1970pub fn decode_stream(
1971    reader: &mut impl Read,
1972    ignore_garbage: bool,
1973    writer: &mut impl Write,
1974) -> io::Result<()> {
1975    const READ_CHUNK: usize = 32 * 1024 * 1024;
1976    // SAFETY: buf bytes are written by read_full before being processed.
1977    // The extra 4 bytes accommodate the carry-over from the previous chunk (at most 3 bytes of an incomplete quadruplet).
1978    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
1979    #[allow(clippy::uninit_vec)]
1980    unsafe {
1981        buf.set_len(READ_CHUNK + 4);
1982    }
1983    let mut carry = [0u8; 4];
1984    let mut carry_len = 0usize;
1985
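    // Worked example of the carry (illustrative numbers): if a non-final chunk strips
    // down to clean_len = 30 base64 chars, only decode_len = 28 (a multiple of 4) is
    // decoded now; the trailing 2 chars go into `carry` and are placed at the front of
    // the next chunk's buffer before it is stripped and decoded.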
1986    loop {
1987        // Copy carry bytes to start of buffer, read new data after them
1988        if carry_len > 0 {
1989            unsafe {
1990                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
1991            }
1992        }
1993        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
1994        if n == 0 {
1995            break;
1996        }
1997        let total_raw = carry_len + n;
1998
1999        // Strip whitespace in-place in the buffer itself.
2000        // This eliminates the separate clean buffer allocation (saves up to 32MB).
2001        let clean_len = if ignore_garbage {
2002            // Scalar filter for ignore_garbage mode (rare path)
2003            let ptr = buf.as_mut_ptr();
2004            let mut wp = 0usize;
2005            for i in 0..total_raw {
2006                let b = unsafe { *ptr.add(i) };
2007                if is_base64_char(b) {
2008                    unsafe { *ptr.add(wp) = b };
2009                    wp += 1;
2010                }
2011            }
2012            wp
2013        } else {
2014            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
2015            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
2016            // so we process ~76 bytes per memchr hit.
2017            let ptr = buf.as_mut_ptr();
2018            let data = &buf[..total_raw];
2019            let mut wp = 0usize;
2020            let mut gap_start = 0usize;
2021            let mut has_rare_ws = false;
2022
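            // Illustrative gap copy (hypothetical 76-char lines): with newlines at
            // positions 76 and 153, the gaps [0, 76) and [77, 153) are copied down to
            // [0, 76) and [76, 152), and wp ends at 152.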
2023            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
2024                let gap_len = pos - gap_start;
2025                if gap_len > 0 {
2026                    if !has_rare_ws {
2027                        has_rare_ws = data[gap_start..pos]
2028                            .iter()
2029                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
2030                    }
2031                    if wp != gap_start {
2032                        unsafe {
2033                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
2034                        }
2035                    }
2036                    wp += gap_len;
2037                }
2038                gap_start = pos + 1;
2039            }
2040            let tail_len = total_raw - gap_start;
2041            if tail_len > 0 {
2042                if !has_rare_ws {
2043                    has_rare_ws = data[gap_start..total_raw]
2044                        .iter()
2045                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
2046                }
2047                if wp != gap_start {
2048                    unsafe {
2049                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
2050                    }
2051                }
2052                wp += tail_len;
2053            }
2054
2055            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
2056            if has_rare_ws {
2057                let mut rp = 0;
2058                let mut cwp = 0;
2059                while rp < wp {
2060                    let b = unsafe { *ptr.add(rp) };
2061                    if NOT_WHITESPACE[b as usize] {
2062                        unsafe { *ptr.add(cwp) = b };
2063                        cwp += 1;
2064                    }
2065                    rp += 1;
2066                }
2067                cwp
2068            } else {
2069                wp
2070            }
2071        };
2072
2073        carry_len = 0;
2074        let is_last = n < READ_CHUNK;
2075
2076        if is_last {
2077            // Last chunk: decode everything (including padding)
2078            decode_clean_slice(&mut buf[..clean_len], writer)?;
2079        } else {
2080            // Save incomplete base64 quadruplet for next iteration
2081            let decode_len = (clean_len / 4) * 4;
2082            let leftover = clean_len - decode_len;
2083            if leftover > 0 {
2084                unsafe {
2085                    std::ptr::copy_nonoverlapping(
2086                        buf.as_ptr().add(decode_len),
2087                        carry.as_mut_ptr(),
2088                        leftover,
2089                    );
2090                }
2091                carry_len = leftover;
2092            }
2093            if decode_len > 0 {
2094                decode_clean_slice(&mut buf[..decode_len], writer)?;
2095            }
2096        }
2097    }
2098
2099    // Handle any remaining carry-over bytes
2100    if carry_len > 0 {
2101        let mut carry_buf = carry[..carry_len].to_vec();
2102        decode_clean_slice(&mut carry_buf, writer)?;
2103    }
2104
2105    Ok(())
2106}
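
// A minimal end-to-end sketch (not part of the original file): round-trips the two
// public streaming entry points through in-memory readers and writers.
#[cfg(test)]
mod stream_round_trip_sketch {
    use super::*;

    #[test]
    fn wrapped_encode_then_decode_round_trips() {
        let raw: Vec<u8> = (0..100_000u32).map(|i| (i % 251) as u8).collect();
        let mut encoded = Vec::new();
        encode_stream(&mut &raw[..], 76, &mut encoded).unwrap();
        let mut decoded = Vec::new();
        decode_stream(&mut &encoded[..], false, &mut decoded).unwrap();
        assert_eq!(decoded, raw);
    }
}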
2107
2108/// Write all IoSlice entries using write_vectored (writev syscall).
2109/// Hot path: single write_vectored succeeds fully (common on Linux pipes/files).
2110/// Cold path: partial write handled out-of-line to keep hot path tight.
2111#[inline(always)]
2112fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
2113    if slices.is_empty() {
2114        return Ok(());
2115    }
2116    let total: usize = slices.iter().map(|s| s.len()).sum();
2117    let written = out.write_vectored(slices)?;
2118    if written >= total {
2119        return Ok(());
2120    }
2121    if written == 0 {
2122        return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
2123    }
2124    write_all_vectored_slow(out, slices, written)
2125}
2126
2127/// Handle partial write (cold path, never inlined).
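/// Worked example: with slices of 10, 20, and 30 bytes and skip = 25, the first slice
/// is skipped entirely, the last 5 bytes of the second are written, then the third is
/// written in full.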
2128#[cold]
2129#[inline(never)]
2130fn write_all_vectored_slow(
2131    out: &mut impl Write,
2132    slices: &[io::IoSlice],
2133    mut skip: usize,
2134) -> io::Result<()> {
2135    for slice in slices {
2136        let len = slice.len();
2137        if skip >= len {
2138            skip -= len;
2139            continue;
2140        }
2141        out.write_all(&slice[skip..])?;
2142        skip = 0;
2143    }
2144    Ok(())
2145}
2146
2147/// Read as many bytes as possible into buf, retrying on partial reads.
2148/// Fast path: regular file reads usually return the full buffer on the first call,
2149/// avoiding the loop overhead entirely.
2150#[inline]
2151fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2152    // Fast path: first read() usually fills the entire buffer for regular files
2153    let n = reader.read(buf)?;
2154    if n == buf.len() || n == 0 {
2155        return Ok(n);
2156    }
2157    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
2158    let mut total = n;
2159    while total < buf.len() {
2160        match reader.read(&mut buf[total..]) {
2161            Ok(0) => break,
2162            Ok(n) => total += n,
2163            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2164            Err(e) => return Err(e),
2165        }
2166    }
2167    Ok(total)
2168}
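
// A minimal test sketch (not part of the original file): a reader that trickles out at
// most 3 bytes per call stands in for a slow pipe, exercising read_full's retry loop.
#[cfg(test)]
mod read_full_sketch {
    use super::*;

    struct Trickle<'a>(&'a [u8]);

    impl Read for Trickle<'_> {
        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
            let n = self.0.len().min(buf.len()).min(3);
            buf[..n].copy_from_slice(&self.0[..n]);
            self.0 = &self.0[n..];
            Ok(n)
        }
    }

    #[test]
    fn retries_partial_reads_until_full() {
        let mut buf = [0u8; 10];
        let n = read_full(&mut Trickle(&b"abcdefghij"[..]), &mut buf).unwrap();
        assert_eq!(n, 10);
        assert_eq!(&buf, b"abcdefghij");
    }
}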