// coreutils_rs/base64/core.rs — SIMD base64 encoding core (encode + line wrapping).

use std::io::{self, Read, Write};

use base64_simd::AsOut;
/// Shared SIMD base64 codec used by every encode path below
/// (base64_simd's STANDARD profile — standard alphabet; padding behavior
/// per that profile, see base64_simd docs).
const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;
6
/// Number of available CPUs for parallel chunk splitting.
///
/// Queries `std::thread::available_parallelism()` directly rather than asking
/// rayon, so the rayon pool (~300-500µs init) is not spun up just to count
/// cores; the pool initializes lazily on the first `scope()` call.
/// Falls back to 1 when the count is unavailable.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}
16
/// Chunk size for sequential no-wrap encoding: 8MB aligned down to a multiple
/// of 3 bytes (= 8_388_606), so every chunk but the last encodes with no padding.
/// Larger chunks reduce function call overhead per iteration while still
/// keeping peak buffer allocation reasonable (~10.7MB for the output).
const NOWRAP_CHUNK: usize = 8 * 1024 * 1024 - (8 * 1024 * 1024 % 3);

/// Minimum data size for parallel no-wrap encoding (16MB).
/// For single-file CLI usage (typical benchmark), the Rayon pool is cold
/// on first use (~200-500µs init). At 10MB, sequential encoding is faster
/// because pool init + dispatch overhead exceeds the parallel benefit.
/// Note: multi-file callers pay pool init only once; subsequent files would
/// benefit from a lower threshold (~2MB). Optimized for single-file CLI.
const PARALLEL_NOWRAP_THRESHOLD: usize = 16 * 1024 * 1024;

/// Minimum data size for parallel wrapped encoding (12MB).
/// Same cold-pool reasoning as PARALLEL_NOWRAP_THRESHOLD above.
/// The sequential encode_wrapped_expand path with backward expansion
/// eliminates per-group overhead from L1-scatter chunking.
const PARALLEL_WRAPPED_THRESHOLD: usize = 12 * 1024 * 1024;

/// Minimum data size for parallel decoding (1MB of base64 data).
/// Lower threshold than encode because decode is more compute-intensive
/// and benefits from parallelism at smaller sizes. After first use, the
/// Rayon pool is warm (~10µs dispatch), making 1MB a good crossover point.
/// NOTE(review): no decode path is visible in this chunk — presumably used
/// by decode code elsewhere in this file.
const PARALLEL_DECODE_THRESHOLD: usize = 1024 * 1024;
41
42/// Hint HUGEPAGE for large output buffers on Linux.
43/// MADV_HUGEPAGE tells kernel to use 2MB pages, reducing TLB misses
44/// and minor fault count for large allocations (~25,600 → ~50 for 100MB).
45#[cfg(target_os = "linux")]
46fn hint_hugepage(buf: &mut Vec<u8>) {
47    if buf.capacity() >= 2 * 1024 * 1024 {
48        unsafe {
49            libc::madvise(
50                buf.as_mut_ptr() as *mut libc::c_void,
51                buf.capacity(),
52                libc::MADV_HUGEPAGE,
53            );
54        }
55    }
56}
57
58/// Encode data and write to output with line wrapping.
59/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
60pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
61    if data.is_empty() {
62        return Ok(());
63    }
64
65    if wrap_col == 0 {
66        return encode_no_wrap(data, out);
67    }
68
69    encode_wrapped(data, wrap_col, out)
70}
71
72/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
73fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
74    if data.len() >= PARALLEL_NOWRAP_THRESHOLD && num_cpus() > 1 {
75        return encode_no_wrap_parallel(data, out);
76    }
77
78    // Single-buffer encode: for data that fits in one chunk, encode directly
79    // and write once. For larger data, reuse the buffer across chunks.
80    let enc_len = BASE64_ENGINE.encoded_length(data.len().min(NOWRAP_CHUNK));
81    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
82    #[allow(clippy::uninit_vec)]
83    unsafe {
84        buf.set_len(enc_len);
85    }
86
87    for chunk in data.chunks(NOWRAP_CHUNK) {
88        let clen = BASE64_ENGINE.encoded_length(chunk.len());
89        let encoded = BASE64_ENGINE.encode(chunk, buf[..clen].as_out());
90        out.write_all(encoded)?;
91    }
92    Ok(())
93}
94
/// Parallel no-wrap encoding into a single shared output buffer.
/// Split at 3-byte boundaries, pre-calculate output offsets, encode in parallel.
/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
/// Single allocation + single write_all instead of N allocations + writev.
fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 3 bytes so each chunk encodes without padding (except the last)
    let chunk_size = ((raw_chunk + 2) / 3) * 3;

    // Split input into 3-byte-aligned chunks.
    // .max(3) guards against a zero chunk size when data.len() < num_threads.
    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();

    // Pre-calculate output offsets (prefix sum of per-chunk encoded lengths)
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    let mut total_out = 0usize;
    for chunk in &chunks {
        offsets.push(total_out);
        total_out += BASE64_ENGINE.encoded_length(chunk.len());
    }

    // Single allocation for all threads.
    // SAFETY: set_len exposes uninitialized bytes, but every byte of
    // output[..total_out] is overwritten by exactly one spawned encode below
    // before write_all reads it; u8 has no drop glue.
    let mut output: Vec<u8> = Vec::with_capacity(total_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(total_out);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output);

    // Parallel encode: each thread writes into its pre-assigned region.
    // The base pointer is smuggled as usize because raw pointers are not Send.
    let output_base = output.as_mut_ptr() as usize;
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let out_off = offsets[i];
            let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
            let base = output_base;
            s.spawn(move |_| {
                // SAFETY: [out_off, out_off + enc_len) regions are pairwise
                // disjoint (offsets are a prefix sum of exactly these enc_lens)
                // and lie within `output`, which outlives the scope; no two
                // tasks alias the same bytes.
                let dest =
                    unsafe { std::slice::from_raw_parts_mut((base + out_off) as *mut u8, enc_len) };
                let _ = BASE64_ENGINE.encode(chunk, dest.as_out());
            });
        }
    });

    out.write_all(&output[..total_out])
}
142
143/// Encode with line wrapping using forward scatter from L1-cached temp buffer.
144/// Encodes groups of lines into a small temp buffer (fits in L1 cache), then
145/// scatter-copies wrap_col-byte chunks from temp to output with newlines.
146///
147/// This is faster than bulk encode + backward expansion because:
148/// - Temp buffer reads hit L1 cache (essentially free bandwidth)
149/// - Output buffer is written once (no double-write from backward memmove)
150/// - Forward access pattern is prefetcher-friendly
151fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
152    let bytes_per_line = wrap_col * 3 / 4;
153    if bytes_per_line == 0 {
154        return encode_wrapped_small(data, wrap_col, out);
155    }
156
157    if data.len() >= PARALLEL_WRAPPED_THRESHOLD && bytes_per_line.is_multiple_of(3) {
158        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
159    }
160
161    // Chunked encode for data > 1MB: process ~1MB at a time to keep the output
162    // buffer small (~1.3MB) and warm in L2. Reduces minor page faults from ~3500 to ~320.
163    // Guard: skip if bytes_per_line > 1MB (lines_per_chunk would be 0 → infinite loop).
164    if bytes_per_line.is_multiple_of(3) && data.len() > 1024 * 1024 {
165        let lines_per_chunk = (1024 * 1024) / bytes_per_line;
166        if lines_per_chunk > 0 {
167            return encode_wrapped_chunked(data, wrap_col, bytes_per_line, out);
168        }
169    }
170
171    if bytes_per_line.is_multiple_of(3) {
172        return encode_wrapped_expand(data, wrap_col, bytes_per_line, out);
173    }
174
175    // Fallback for non-3-aligned bytes_per_line: use fuse_wrap approach
176    let enc_max = BASE64_ENGINE.encoded_length(data.len());
177    let num_full = enc_max / wrap_col;
178    let rem = enc_max % wrap_col;
179    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };
180
181    // Encode full data, then fuse with newlines
182    let mut enc_buf: Vec<u8> = Vec::with_capacity(enc_max);
183    #[allow(clippy::uninit_vec)]
184    unsafe {
185        enc_buf.set_len(enc_max);
186    }
187    let _ = BASE64_ENGINE.encode(data, enc_buf[..enc_max].as_out());
188
189    let mut out_buf: Vec<u8> = Vec::with_capacity(out_len);
190    #[allow(clippy::uninit_vec)]
191    unsafe {
192        out_buf.set_len(out_len);
193    }
194    let n = fuse_wrap(&enc_buf, wrap_col, &mut out_buf);
195    out.write_all(&out_buf[..n])
196}
197
/// Chunked encode+wrap for large in-memory data. Processes ~1MB input at a time,
/// reusing a small output buffer (~1.3MB) that stays warm in L2 cache.
/// Each chunk is aligned to bytes_per_line so it produces complete lines.
/// Reduces page faults from ~3500 (14MB alloc) to ~320 (1.3MB alloc) for 10MB input.
fn encode_wrapped_chunked(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    debug_assert!(bytes_per_line.is_multiple_of(3));

    // ~1MB input per chunk, aligned to bytes_per_line
    let lines_per_chunk = (1024 * 1024) / bytes_per_line;
    let chunk_input = lines_per_chunk * bytes_per_line;
    let line_out = wrap_col + 1;
    // Worst case: lines_per_chunk full lines plus one padded remainder line
    // and its newline (+2 gives slack for the remainder's trailing '\n').
    let max_chunk_out =
        lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;

    // SAFETY: out_buf is allocated to max_chunk_out bytes and set_len'd without
    // initialization. The encode loop below writes exactly `full_lines * line_out`
    // bytes (encode + newline per line), plus any remainder. Only the written
    // prefix `out_buf[..total_out]` is passed to write_all — no uninitialized
    // bytes are ever read. u8 has no drop glue, so the uninitialized tail is safe.
    let mut out_buf: Vec<u8> = Vec::with_capacity(max_chunk_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(max_chunk_out);
    }

    let mut pos = 0;
    while pos < data.len() {
        let remaining = data.len() - pos;
        let chunk_len = remaining.min(chunk_input);
        let chunk = &data[pos..pos + chunk_len];

        let full_lines = chunk_len / bytes_per_line;
        let remainder = chunk_len % bytes_per_line;

        let dst = out_buf.as_mut_ptr();
        let mut line_idx = 0;

        // 4-line unrolled loop.
        // SAFETY (both loops below): line_idx stays < full_lines, and
        // full_lines <= lines_per_chunk, so every write lands inside
        // out_buf's max_chunk_out bytes; each encode fills its wrap_col
        // slice completely before the '\n' is stored after it.
        while line_idx + 4 <= full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ =
                    BASE64_ENGINE.encode(&chunk[in_base..in_base + bytes_per_line], s0.as_out());
                *dst.add(out_base + wrap_col) = b'\n';

                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                    s1.as_out(),
                );
                *dst.add(out_base + line_out + wrap_col) = b'\n';

                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                    s2.as_out(),
                );
                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                    s3.as_out(),
                );
                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
            }
            line_idx += 4;
        }
        // Remaining 0-3 full lines
        while line_idx < full_lines {
            let in_off = line_idx * bytes_per_line;
            let out_off = line_idx * line_out;
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(out_off), wrap_col);
                let _ = BASE64_ENGINE.encode(&chunk[in_off..in_off + bytes_per_line], s.as_out());
                *dst.add(out_off + wrap_col) = b'\n';
            }
            line_idx += 1;
        }

        let mut total_out = full_lines * line_out;

        // Partial last line — only possible on the final chunk, since
        // chunk_input is a multiple of bytes_per_line.
        if remainder > 0 {
            let in_off = full_lines * bytes_per_line;
            let enc_len = BASE64_ENGINE.encoded_length(remainder);
            // SAFETY: enc_len <= encoded_length(bytes_per_line), so the write
            // ends within max_chunk_out (the +2 slack covers the '\n').
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(total_out), enc_len);
                let _ = BASE64_ENGINE.encode(&chunk[in_off..in_off + remainder], s.as_out());
                *dst.add(total_out + enc_len) = b'\n';
            }
            total_out += enc_len + 1;
        }

        out.write_all(&out_buf[..total_out])?;
        pos += chunk_len;
    }

    Ok(())
}
303
304/// Encode with backward expansion: single contiguous SIMD encode, then expand
305/// in-place to insert newlines. The encode is done in one call (no chunking),
306/// which eliminates per-group function call overhead from L1-scatter.
307/// The backward expansion only shifts data by ~1.3% (1 byte per 76 for wrap_col=76),
308/// and for most lines the shift exceeds wrap_col so memmove uses the fast memcpy path.
309fn encode_wrapped_expand(
310    data: &[u8],
311    wrap_col: usize,
312    bytes_per_line: usize,
313    out: &mut impl Write,
314) -> io::Result<()> {
315    debug_assert!(bytes_per_line.is_multiple_of(3));
316    let enc_len = BASE64_ENGINE.encoded_length(data.len());
317    if enc_len == 0 {
318        return Ok(());
319    }
320
321    let num_full = enc_len / wrap_col;
322    let rem = enc_len % wrap_col;
323    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };
324
325    // Single allocation: encode into first enc_len bytes, expand backward to out_len.
326    // SAFETY: buf[..enc_len] is initialized by BASE64_ENGINE.encode below.
327    // buf[enc_len..out_len] is written by expand_backward before write_all reads it.
328    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
329    #[allow(clippy::uninit_vec)]
330    unsafe {
331        buf.set_len(out_len);
332    }
333    #[cfg(target_os = "linux")]
334    hint_hugepage(&mut buf);
335
336    // One SIMD encode call for the entire input (no chunking overhead)
337    let encoded = BASE64_ENGINE.encode(data, buf[..enc_len].as_out());
338    debug_assert_eq!(encoded.len(), enc_len, "encode wrote unexpected length");
339
340    // Expand backward to insert newlines — shifts only ~1.3% of data
341    expand_backward(buf.as_mut_ptr(), enc_len, out_len, wrap_col);
342
343    out.write_all(&buf[..out_len])
344}
345
/// L1-scatter encode: encode groups of lines into a small L1-cached temp buffer,
/// then scatter-copy each line to its final position in the output buffer with
/// newline insertion. Each output byte is written exactly once — no read-back
/// from main memory, halving memory traffic vs backward expansion.
///
/// Temp buffer (~20KB for 256 lines × 76 chars) stays hot in L1 cache, so
/// reads during scatter are essentially free. Output buffer is streamed out
/// with sequential writes that the prefetcher can handle efficiently.
///
/// Uses a full output buffer for vmsplice safety: vmsplice maps user pages
/// into the pipe buffer, so the buffer must stay valid until the reader consumes.
///
/// NOTE(review): currently unreferenced (dead_code); kept as an alternative
/// sequential strategy to encode_wrapped_expand.
#[allow(dead_code)]
fn encode_wrapped_scatter(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    if enc_len == 0 {
        return Ok(());
    }

    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Output buffer — written once via scatter, then write_all to output.
    // SAFETY: every byte of buf[..wp] is written by the scatter loops below
    // before write_all reads it; wp never exceeds out_len.
    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_len);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut buf);

    // L1-cached temp buffer for encoding groups of lines.
    // 256 lines × 76 chars = 19,456 bytes — fits comfortably in L1 (32-64KB).
    // SAFETY: temp[..clen] is fully overwritten by encode before each read.
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1;
    let mut wp = 0usize; // write position in output buffer

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        // Scatter-copy full lines from temp to output with newlines
        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter for ILP.
        // SAFETY: src reads stay within temp[..clen] (i + 8 <= lines) and
        // dst writes stay within buf[..out_len]; temp and buf never alias,
        // so copy_nonoverlapping's contract holds.
        let mut i = 0;
        while i + 8 <= lines {
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = buf.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        // Remaining full lines (0-7)
        while i < lines {
            // SAFETY: same bounds argument as above, one line at a time.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        // Partial last line (only on final chunk)
        if chunk_rem > 0 {
            // SAFETY: chunk_rem < wrap_col and wp + chunk_rem + 1 <= out_len
            // by construction of out_len.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *buf.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }

    out.write_all(&buf[..wp])
}
483
/// Scatter-copy `count` encoded lines of `wrap_col` bytes each from `temp`
/// into `buf`, appending b'\n' after every line. Line `i` of `temp` lands at
/// output line `line_start + i`, where each output line occupies `line_out`
/// bytes (`wrap_col` data bytes plus the newline).
///
/// Rewritten with safe slice copies: `copy_from_slice` compiles to the same
/// memcpy as the previous raw-pointer version, and an out-of-range
/// `line_start`/`count` now panics with a bounds error instead of being
/// undefined behavior.
#[inline]
#[allow(dead_code)]
fn scatter_lines(
    temp: &[u8],
    buf: &mut [u8],
    line_start: usize,
    count: usize,
    wrap_col: usize,
    line_out: usize,
) {
    for i in 0..count {
        let src = &temp[i * wrap_col..(i + 1) * wrap_col];
        let dst = (line_start + i) * line_out;
        buf[dst..dst + wrap_col].copy_from_slice(src);
        buf[dst + wrap_col] = b'\n';
    }
}
507
/// Expand encoded data in place by inserting a newline after every wrap_col
/// bytes. On entry, ptr[0..enc_len] holds contiguous encoded text and the
/// allocation behind ptr is at least out_len bytes; on return, ptr[0..out_len]
/// holds the wrapped output.
///
/// Lines are moved from the end of the buffer toward the start, so a shifted
/// line never overwrites source bytes that have not been read yet. For
/// wrap_col=76 the total shift is only ~1.3% of the data (1 byte per 76
/// bytes), so most copies are non-overlapping and memmove takes its memcpy
/// fast path.
#[inline]
fn expand_backward(ptr: *mut u8, enc_len: usize, out_len: usize, wrap_col: usize) {
    /// Moves the `len` bytes ending at *rp so they end at *wp - 1 and stores
    /// a b'\n' immediately after them, updating both cursors.
    ///
    /// # Safety
    /// Caller guarantees len <= *rp <= *wp <= out_len and that ptr is valid
    /// for out_len bytes. `std::ptr::copy` tolerates the overlap that occurs
    /// while the shift distance is still smaller than `len`.
    unsafe fn shift_line(ptr: *mut u8, rp: &mut usize, wp: &mut usize, len: usize) {
        *wp -= 1;
        *ptr.add(*wp) = b'\n';
        *rp -= len;
        *wp -= len;
        // Skip the copy once source and destination coincide (shift fully
        // absorbed at the front of the buffer).
        if *rp != *wp {
            std::ptr::copy(ptr.add(*rp), ptr.add(*wp), len);
        }
    }

    let full_lines = enc_len / wrap_col;
    let tail = enc_len % wrap_col;

    let mut rp = enc_len; // read cursor: end of unshifted encoded data
    let mut wp = out_len; // write cursor: end of expanded output

    // SAFETY: rp/wp start at enc_len/out_len and each shift_line consumes one
    // line's worth of both budgets; after the final line rp == wp == 0, so
    // every access stays within ptr[0..out_len].
    unsafe {
        // The partial last line sits at the very end of the output.
        if tail > 0 {
            shift_line(ptr, &mut rp, &mut wp, tail);
        }
        // Then every full line, walking backward to the front of the buffer.
        for _ in 0..full_lines {
            shift_line(ptr, &mut rp, &mut wp, wrap_col);
        }
    }
}
603
/// Static newline byte for IoSlice references in writev calls.
/// The 'static lifetime lets newline IoSlice entries outlive loop iterations
/// without borrowing a local.
static NEWLINE: [u8; 1] = [b'\n'];
606
/// Write encoded base64 data with line wrapping using write_vectored (writev).
/// Builds IoSlice entries pointing at wrap_col-sized segments of the encoded buffer,
/// interleaved with newline IoSlices, then writes in batches.
/// Small outputs are zero-copy; large outputs fall back to fused batch buffers
/// (see below).
///
/// NOTE(review): currently unreferenced (dead_code); relies on sibling
/// helpers fuse_wrap / write_all_vectored defined elsewhere in this file.
#[inline]
#[allow(dead_code)]
fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
    // Each line needs 2 entries (data + newline), so 512 lines per batch.
    const MAX_IOV: usize = 1024;

    let num_full_lines = encoded.len() / wrap_col;
    let remainder = encoded.len() % wrap_col;
    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };

    // Small output: build all IoSlices and write in one call (zero-copy)
    if total_iov <= MAX_IOV {
        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
        let mut pos = 0;
        for _ in 0..num_full_lines {
            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
            iov.push(io::IoSlice::new(&NEWLINE));
            pos += wrap_col;
        }
        if remainder > 0 {
            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
            iov.push(io::IoSlice::new(&NEWLINE));
        }
        return write_all_vectored(out, &iov);
    }

    // Large output: fuse batches of lines into a reusable L1-cached buffer.
    // Each batch copies ~39KB (512 lines × 77 bytes) from the encoded buffer
    // with newlines inserted, then writes as a single contiguous write(2).
    // This is faster than writev with 1024 IoSlice entries because:
    // - One kernel memcpy per batch vs 1024 separate copies
    // - Fused buffer (39KB) stays hot in L1 cache across batches
    let line_out = wrap_col + 1;
    const BATCH_LINES: usize = 512;
    let batch_fused_size = BATCH_LINES * line_out;
    // SAFETY: fuse_wrap reports how many bytes it wrote into `fused`; only
    // that prefix is passed to write_all, so no uninitialized byte is read.
    let mut fused: Vec<u8> = Vec::with_capacity(batch_fused_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        fused.set_len(batch_fused_size);
    }

    let mut rp = 0;
    let mut lines_done = 0;

    // Process full batches using 8-line unrolled fuse_wrap
    while lines_done + BATCH_LINES <= num_full_lines {
        let n = fuse_wrap(
            &encoded[rp..rp + BATCH_LINES * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += BATCH_LINES * wrap_col;
        lines_done += BATCH_LINES;
    }

    // Remaining full lines (partial batch)
    let remaining_lines = num_full_lines - lines_done;
    if remaining_lines > 0 {
        let n = fuse_wrap(
            &encoded[rp..rp + remaining_lines * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += remaining_lines * wrap_col;
    }

    // Partial last line
    if remainder > 0 {
        out.write_all(&encoded[rp..rp + remainder])?;
        out.write_all(b"\n")?;
    }
    Ok(())
}
687
688/// Write encoded base64 data with line wrapping using writev, tracking column state
689/// across calls. Used by encode_stream for piped input where chunks don't align
690/// to line boundaries.
691#[inline]
692fn write_wrapped_iov_streaming(
693    encoded: &[u8],
694    wrap_col: usize,
695    col: &mut usize,
696    out: &mut impl Write,
697) -> io::Result<()> {
698    const MAX_IOV: usize = 1024;
699    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
700    let mut rp = 0;
701
702    while rp < encoded.len() {
703        let space = wrap_col - *col;
704        let avail = encoded.len() - rp;
705
706        if avail <= space {
707            // Remaining data fits in current line
708            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
709            *col += avail;
710            if *col == wrap_col {
711                iov.push(io::IoSlice::new(&NEWLINE));
712                *col = 0;
713            }
714            break;
715        } else {
716            // Fill current line and add newline
717            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
718            iov.push(io::IoSlice::new(&NEWLINE));
719            rp += space;
720            *col = 0;
721        }
722
723        if iov.len() >= MAX_IOV - 1 {
724            write_all_vectored(out, &iov)?;
725            iov.clear();
726        }
727    }
728
729    if !iov.is_empty() {
730        write_all_vectored(out, &iov)?;
731    }
732    Ok(())
733}
734
/// Parallel wrapped encoding with L1-scatter into a single shared output buffer.
/// Pre-calculates each thread's output offset, allocates one buffer for all threads,
/// and has each thread encode directly into its pre-assigned non-overlapping region.
/// This saves N-1 buffer allocations and corresponding page faults vs per-thread Vecs,
/// and uses a single write_all instead of writev.
fn encode_wrapped_parallel(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let lines_per_chunk = ((data.len() / bytes_per_line) / num_threads).max(1);
    let chunk_input = lines_per_chunk * bytes_per_line;

    // Split input at bytes_per_line boundaries (last chunk may have remainder)
    let chunks: Vec<&[u8]> = data.chunks(chunk_input.max(bytes_per_line)).collect();

    // Pre-calculate output offsets for each chunk (prefix sum of each
    // chunk's encoded-and-wrapped length, trailing newline included)
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    let mut total_out = 0usize;
    for chunk in &chunks {
        offsets.push(total_out);
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let full_lines = enc_len / wrap_col;
        let remainder = enc_len % wrap_col;
        total_out += full_lines * (wrap_col + 1) + if remainder > 0 { remainder + 1 } else { 0 };
    }

    // Single allocation for all threads.
    // SAFETY: every byte of output[..total_out] is written by exactly one
    // spawned task below before write_all reads it; u8 has no drop glue.
    let mut output: Vec<u8> = Vec::with_capacity(total_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(total_out);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output);

    // Parallel encode: each thread writes into its pre-assigned region.
    // The base pointer is smuggled as usize because raw pointers are not Send.
    let output_base = output.as_mut_ptr() as usize;
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let out_off = offsets[i];
            let out_end = if i + 1 < offsets.len() {
                offsets[i + 1]
            } else {
                total_out
            };
            let out_size = out_end - out_off;
            let base = output_base;
            s.spawn(move |_| {
                // SAFETY: [out_off, out_end) regions are pairwise disjoint
                // (offsets are a prefix sum of the per-chunk output sizes)
                // and lie within `output`, which outlives the scope.
                let out_slice = unsafe {
                    std::slice::from_raw_parts_mut((base + out_off) as *mut u8, out_size)
                };
                encode_chunk_l1_scatter_into(chunk, out_slice, wrap_col, bytes_per_line);
            });
        }
    });

    out.write_all(&output[..total_out])
}
796
797/// Encode a chunk using L1-scatter, writing into a pre-allocated output slice.
798/// Encodes groups of 256 lines into L1-cached temp buffer, scatter-copy to output with newlines.
799/// The output slice must be large enough to hold the encoded+wrapped output.
800fn encode_chunk_l1_scatter_into(
801    data: &[u8],
802    output: &mut [u8],
803    wrap_col: usize,
804    bytes_per_line: usize,
805) {
806    const GROUP_LINES: usize = 256;
807    let group_input = GROUP_LINES * bytes_per_line;
808    let temp_size = GROUP_LINES * wrap_col;
809    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
810    #[allow(clippy::uninit_vec)]
811    unsafe {
812        temp.set_len(temp_size);
813    }
814
815    let line_out = wrap_col + 1;
816    let mut wp = 0usize;
817
818    for chunk in data.chunks(group_input) {
819        let clen = BASE64_ENGINE.encoded_length(chunk.len());
820        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());
821
822        let lines = clen / wrap_col;
823        let chunk_rem = clen % wrap_col;
824
825        // 8-line unrolled scatter
826        let mut i = 0;
827        while i + 8 <= lines {
828            unsafe {
829                let src = temp.as_ptr().add(i * wrap_col);
830                let dst = output.as_mut_ptr().add(wp);
831                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
832                *dst.add(wrap_col) = b'\n';
833                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
834                *dst.add(line_out + wrap_col) = b'\n';
835                std::ptr::copy_nonoverlapping(
836                    src.add(2 * wrap_col),
837                    dst.add(2 * line_out),
838                    wrap_col,
839                );
840                *dst.add(2 * line_out + wrap_col) = b'\n';
841                std::ptr::copy_nonoverlapping(
842                    src.add(3 * wrap_col),
843                    dst.add(3 * line_out),
844                    wrap_col,
845                );
846                *dst.add(3 * line_out + wrap_col) = b'\n';
847                std::ptr::copy_nonoverlapping(
848                    src.add(4 * wrap_col),
849                    dst.add(4 * line_out),
850                    wrap_col,
851                );
852                *dst.add(4 * line_out + wrap_col) = b'\n';
853                std::ptr::copy_nonoverlapping(
854                    src.add(5 * wrap_col),
855                    dst.add(5 * line_out),
856                    wrap_col,
857                );
858                *dst.add(5 * line_out + wrap_col) = b'\n';
859                std::ptr::copy_nonoverlapping(
860                    src.add(6 * wrap_col),
861                    dst.add(6 * line_out),
862                    wrap_col,
863                );
864                *dst.add(6 * line_out + wrap_col) = b'\n';
865                std::ptr::copy_nonoverlapping(
866                    src.add(7 * wrap_col),
867                    dst.add(7 * line_out),
868                    wrap_col,
869                );
870                *dst.add(7 * line_out + wrap_col) = b'\n';
871            }
872            wp += 8 * line_out;
873            i += 8;
874        }
875        while i < lines {
876            unsafe {
877                std::ptr::copy_nonoverlapping(
878                    temp.as_ptr().add(i * wrap_col),
879                    output.as_mut_ptr().add(wp),
880                    wrap_col,
881                );
882                *output.as_mut_ptr().add(wp + wrap_col) = b'\n';
883            }
884            wp += line_out;
885            i += 1;
886        }
887        if chunk_rem > 0 {
888            unsafe {
889                std::ptr::copy_nonoverlapping(
890                    temp.as_ptr().add(lines * wrap_col),
891                    output.as_mut_ptr().add(wp),
892                    chunk_rem,
893                );
894                *output.as_mut_ptr().add(wp + chunk_rem) = b'\n';
895            }
896            wp += chunk_rem + 1;
897        }
898    }
899}
900
/// Fuse encoded base64 data with newlines in a single pass.
/// Copies `wrap_col`-byte lines into `out_buf`, appending '\n' after each.
/// Fixed-count inner loops (8, then 4 lines per iteration) give the compiler
/// an unrollable shape for ILP. Returns the number of bytes written.
#[inline]
fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
    let total = encoded.len();
    let mut rp = 0usize;
    let mut wp = 0usize;

    // 8 lines per iteration.
    while rp + 8 * wrap_col <= total {
        // SAFETY: loop condition guarantees 8 full lines remain in `encoded`;
        // caller provides `out_buf` sized for the wrapped output.
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);
            for j in 0..8 {
                std::ptr::copy_nonoverlapping(src.add(j * wrap_col), dst.add(j * line_out), wrap_col);
                *dst.add(j * line_out + wrap_col) = b'\n';
            }
        }
        rp += 8 * wrap_col;
        wp += 8 * line_out;
    }

    // 4 lines per iteration for the mid-sized remainder.
    while rp + 4 * wrap_col <= total {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);
            for j in 0..4 {
                std::ptr::copy_nonoverlapping(src.add(j * wrap_col), dst.add(j * line_out), wrap_col);
                *dst.add(j * line_out + wrap_col) = b'\n';
            }
        }
        rp += 4 * wrap_col;
        wp += 4 * line_out;
    }

    // Remaining full lines, one at a time.
    while rp + wrap_col <= total {
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                wrap_col,
            );
            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
        }
        rp += wrap_col;
        wp += line_out;
    }

    // Partial last line, still newline-terminated.
    if rp < total {
        let rem = total - rp;
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                rem,
            );
        }
        wp += rem;
        out_buf[wp] = b'\n';
        wp += 1;
    }

    wp
}
997
998/// Fallback for very small wrap columns (< 4 chars).
999fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
1000    let enc_max = BASE64_ENGINE.encoded_length(data.len());
1001    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
1002    #[allow(clippy::uninit_vec)]
1003    unsafe {
1004        buf.set_len(enc_max);
1005    }
1006    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());
1007
1008    let wc = wrap_col.max(1);
1009    for line in encoded.chunks(wc) {
1010        out.write_all(line)?;
1011        out.write_all(b"\n")?;
1012    }
1013    Ok(())
1014}
1015
1016/// Decode base64 data and write to output (borrows data, allocates clean buffer).
1017/// When `ignore_garbage` is true, strip all non-base64 characters.
1018/// When false, only strip whitespace (standard behavior).
1019pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
1020    if data.is_empty() {
1021        return Ok(());
1022    }
1023
1024    if ignore_garbage {
1025        let mut cleaned = strip_non_base64(data);
1026        return decode_clean_slice(&mut cleaned, out);
1027    }
1028
1029    // For large data (>= 512KB): use bulk strip + single-shot decode.
1030    // try_line_decode decodes per-line (~25ns overhead per 76-byte line call),
1031    // while strip+decode uses SIMD gap-copy + single-shot SIMD decode at ~6.5 GB/s.
1032    // For 10MB decode benchmark: ~2ms (bulk) vs ~4ms (per-line) = 2x faster.
1033    // For small data (< 512KB): per-line decode avoids allocation overhead.
1034    if data.len() < 512 * 1024 && data.len() >= 77 {
1035        if let Some(result) = try_line_decode(data, out) {
1036            return result;
1037        }
1038    }
1039
1040    // Fast path: single-pass SIMD strip + decode
1041    decode_stripping_whitespace(data, out)
1042}
1043
/// Decode base64 from a mutable buffer (MAP_PRIVATE mmap or owned Vec).
/// Strips whitespace in-place using SIMD memchr2 gap-copy, then decodes
/// in-place with base64_simd::decode_inplace. Zero additional allocations.
///
/// For MAP_PRIVATE mmap: the kernel uses COW semantics, so only pages
/// containing whitespace (newlines) get physically copied (~1.3% for
/// 76-char line base64). The decode writes to the same buffer, but decoded
/// data is always shorter than encoded (3/4 ratio), so it fits in-place.
pub fn decode_mmap_inplace(
    data: &mut [u8],
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // For small data: try line-by-line decode (avoids COW page faults).
    // For large data (>= 512KB): bulk strip+decode is faster than per-line decode.
    if !ignore_garbage && data.len() >= 77 && data.len() < 512 * 1024 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    if ignore_garbage {
        // Strip non-base64 chars in-place.
        // Scalar read/write compaction: wp never exceeds rp, so each write
        // lands on an already-read (or current) position — no data is lost.
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            // SAFETY: rp < len and wp <= rp, both in-bounds for `data`.
            let b = unsafe { *ptr.add(rp) };
            if is_base64_char(b) {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        let r = decode_inplace_with_padding(&mut data[..wp], out);
        return r;
    }

    // Fast path: uniform-line fused strip+decode (no intermediate buffer).
    // 77 = one standard 76-char line plus its newline.
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Fallback: strip whitespace in-place using SIMD memchr2 gap-copy.

    // Quick check: no newlines at all — maybe already clean
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // Check for rare whitespace (space, tab, vertical tab, form feed)
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            // Perfectly clean — decode in-place directly
            return decode_inplace_with_padding(data, out);
        }
        // Rare whitespace only — strip in-place (same wp <= rp compaction
        // invariant as the ignore_garbage branch above).
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        return decode_inplace_with_padding(&mut data[..wp], out);
    }

    // SIMD gap-copy: strip \n and \r in-place using memchr2.
    // `gap_start..pos` is the run of non-newline bytes between two hits;
    // each gap is slid left to position `wp` with an overlap-safe copy.
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    // SAFETY: memchr2_iter reads from the original data. We write to positions
    // [0..wp] which are always <= gap_start, so we never overwrite unread data.
    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during the gap-copy
                // (amortized: one scan of bytes we are copying anyway).
                has_rare_ws = unsafe {
                    std::slice::from_raw_parts(ptr.add(gap_start), gap_len)
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
                };
            }
            if wp != gap_start {
                // `copy` (memmove) because src/dst regions may overlap.
                unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len) };
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Final gap after the last \n/\r hit.
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = unsafe {
                std::slice::from_raw_parts(ptr.add(gap_start), tail_len)
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
            };
        }
        if wp != gap_start {
            unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len) };
        }
        wp += tail_len;
    }

    // Second pass for rare whitespace if needed (skipped for typical
    // newline-only base64 streams).
    if has_rare_ws {
        let mut rp = 0;
        let mut cwp = 0;
        while rp < wp {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        wp = cwp;
    }

    // Decode in-place: decoded data is always shorter than encoded (3/4 ratio)
    if wp >= PARALLEL_DECODE_THRESHOLD {
        // For large data, use parallel decode from the cleaned slice
        return decode_borrowed_clean_parallel(out, &data[..wp]);
    }
    decode_inplace_with_padding(&mut data[..wp], out)
}
1183
1184/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
1185pub fn decode_owned(
1186    data: &mut Vec<u8>,
1187    ignore_garbage: bool,
1188    out: &mut impl Write,
1189) -> io::Result<()> {
1190    if data.is_empty() {
1191        return Ok(());
1192    }
1193
1194    if ignore_garbage {
1195        data.retain(|&b| is_base64_char(b));
1196    } else {
1197        strip_whitespace_inplace(data);
1198    }
1199
1200    decode_clean_slice(data, out)
1201}
1202
1203/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
1204/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
1205/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
1206/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
1207fn strip_whitespace_inplace(data: &mut Vec<u8>) {
1208    // Quick check: skip stripping if no \n or \r in the data.
1209    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
1210    // For typical base64 (76-char lines), we'll find \n immediately and skip this.
1211    if memchr::memchr2(b'\n', b'\r', data).is_none() {
1212        // No newlines/CR — check for rare whitespace only
1213        if data
1214            .iter()
1215            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
1216        {
1217            data.retain(|&b| NOT_WHITESPACE[b as usize]);
1218        }
1219        return;
1220    }
1221
1222    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
1223    // gaps between them to compact the data in-place. For typical base64 streams,
1224    // newlines are the only whitespace, so this handles >99% of cases.
1225    let ptr = data.as_mut_ptr();
1226    let len = data.len();
1227    let mut wp = 0usize;
1228    let mut gap_start = 0usize;
1229    let mut has_rare_ws = false;
1230
1231    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
1232        let gap_len = pos - gap_start;
1233        if gap_len > 0 {
1234            if !has_rare_ws {
1235                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
1236                has_rare_ws = data[gap_start..pos]
1237                    .iter()
1238                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1239            }
1240            if wp != gap_start {
1241                unsafe {
1242                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
1243                }
1244            }
1245            wp += gap_len;
1246        }
1247        gap_start = pos + 1;
1248    }
1249    // Copy the final gap
1250    let tail_len = len - gap_start;
1251    if tail_len > 0 {
1252        if !has_rare_ws {
1253            has_rare_ws = data[gap_start..]
1254                .iter()
1255                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1256        }
1257        if wp != gap_start {
1258            unsafe {
1259                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
1260            }
1261        }
1262        wp += tail_len;
1263    }
1264
1265    data.truncate(wp);
1266
1267    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
1268    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
1269    if has_rare_ws {
1270        let ptr = data.as_mut_ptr();
1271        let len = data.len();
1272        let mut rp = 0;
1273        let mut cwp = 0;
1274        while rp < len {
1275            let b = unsafe { *ptr.add(rp) };
1276            if NOT_WHITESPACE[b as usize] {
1277                unsafe { *ptr.add(cwp) = b };
1278                cwp += 1;
1279            }
1280            rp += 1;
1281        }
1282        data.truncate(cwp);
1283    }
1284}
1285
/// 256-byte lookup table: true for non-whitespace bytes.
/// Used for single-pass whitespace stripping in decode.
static NOT_WHITESPACE: [bool; 256] = {
    let mut table = [true; 256];
    // Whitespace set: space, tab, LF, CR, vertical tab (0x0b), form feed (0x0c).
    let ws = [b' ', b'\t', b'\n', b'\r', 0x0b, 0x0c];
    let mut i = 0;
    while i < ws.len() {
        table[ws[i] as usize] = false;
        i += 1;
    }
    table
};
1298
/// Fused strip+decode for uniform-line base64 data.
/// Detects consistent line length, then processes in sub-chunks: each sub-chunk
/// copies lines to a small local buffer (L2-hot) and decodes immediately.
/// Eliminates the large intermediate clean buffer (~12MB for 10MB decode).
/// Returns None if the data doesn't have uniform line structure.
fn try_decode_uniform_lines(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    // Line length is inferred from the first newline; bails (None) if there is
    // no newline at all.
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl;
    // Lines must be whole 4-char base64 groups — padding may only appear in
    // the final (remainder) line.
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let stride = line_len + 1; // line_len chars + 1 newline byte

    // Verify the data has consistent line structure (first + last lines)
    let check_lines = 4.min(data.len() / stride);
    for i in 1..check_lines {
        let expected_nl = i * stride - 1;
        if expected_nl >= data.len() || data[expected_nl] != b'\n' {
            return None;
        }
    }

    // Spot-check the last full line's newline too; a mismatch anywhere means
    // the uniform-line assumption is wrong and we fall back to the caller.
    let full_lines = if data.len() >= stride {
        let candidate = data.len() / stride;
        if candidate > 0 && data[candidate * stride - 1] != b'\n' {
            return None;
        }
        candidate
    } else {
        0
    };

    // Whatever trails the last full line (possibly newline-terminated) is
    // decoded separately at the end.
    let remainder_start = full_lines * stride;
    let remainder = &data[remainder_start..];
    let rem_clean = if remainder.last() == Some(&b'\n') {
        &remainder[..remainder.len() - 1]
    } else {
        remainder
    };

    // Compute exact decoded sizes (each '=' pad char removes one output byte)
    let decoded_per_line = line_len * 3 / 4;
    let rem_decoded_size = if rem_clean.is_empty() {
        0
    } else {
        let pad = rem_clean
            .iter()
            .rev()
            .take(2)
            .filter(|&&b| b == b'=')
            .count();
        rem_clean.len() * 3 / 4 - pad
    };
    let total_decoded = full_lines * decoded_per_line + rem_decoded_size;
    let clean_len = full_lines * line_len;

    // Parallel path: fused strip+decode with 128KB sub-chunks per thread.
    // Each thread copies lines to a thread-local buffer (L2-hot) and decodes immediately,
    // eliminating the 12MB+ intermediate clean buffer entirely.
    if clean_len >= PARALLEL_DECODE_THRESHOLD && num_cpus() > 1 {
        // Output is written via disjoint raw-pointer regions below, so it is
        // allocated uninitialized and fully overwritten before use.
        let mut output: Vec<u8> = Vec::with_capacity(total_decoded);
        #[allow(clippy::uninit_vec)]
        unsafe {
            output.set_len(total_decoded);
        }
        #[cfg(target_os = "linux")]
        hint_hugepage(&mut output);

        // Pointers are smuggled as usize so the closures are Send; each thread
        // reconstructs them and only touches its own line range.
        let out_ptr = output.as_mut_ptr() as usize;
        let src_ptr = data.as_ptr() as usize;
        let num_threads = num_cpus().max(1);
        let lines_per_thread = (full_lines + num_threads - 1) / num_threads;
        // 512KB sub-chunks: larger chunks give SIMD decode more contiguous data,
        // reducing per-call overhead. 512KB fits in L2 cache (256KB-1MB typical).
        let lines_per_sub = (512 * 1024 / line_len).max(1);

        // Shared flag: first decode error makes all threads bail early.
        let err_flag = std::sync::atomic::AtomicBool::new(false);
        rayon::scope(|s| {
            for t in 0..num_threads {
                let err_flag = &err_flag;
                s.spawn(move |_| {
                    let start_line = t * lines_per_thread;
                    if start_line >= full_lines {
                        return;
                    }
                    let end_line = (start_line + lines_per_thread).min(full_lines);
                    let chunk_lines = end_line - start_line;

                    // Thread-local staging buffer; overwritten before each decode.
                    let sub_buf_size = lines_per_sub.min(chunk_lines) * line_len;
                    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
                    #[allow(clippy::uninit_vec)]
                    unsafe {
                        local_buf.set_len(sub_buf_size);
                    }

                    let src = src_ptr as *const u8;
                    let out_base = out_ptr as *mut u8;
                    let local_dst = local_buf.as_mut_ptr();

                    let mut sub_start = 0usize;
                    while sub_start < chunk_lines {
                        if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let sub_count = (chunk_lines - sub_start).min(lines_per_sub);
                        let sub_clean = sub_count * line_len;

                        // Strip: copy each line (without its newline) into the
                        // staging buffer at fixed stride offsets.
                        for i in 0..sub_count {
                            // SAFETY: (start_line + sub_start + i) < full_lines,
                            // so the source line is fully inside `data`.
                            unsafe {
                                std::ptr::copy_nonoverlapping(
                                    src.add((start_line + sub_start + i) * stride),
                                    local_dst.add(i * line_len),
                                    line_len,
                                );
                            }
                        }

                        // Decode straight into this thread's disjoint output region.
                        let out_offset = (start_line + sub_start) * decoded_per_line;
                        let out_size = sub_count * decoded_per_line;
                        // SAFETY: line ranges per thread are disjoint, so the
                        // derived output regions never overlap.
                        let out_slice = unsafe {
                            std::slice::from_raw_parts_mut(out_base.add(out_offset), out_size)
                        };
                        if BASE64_ENGINE
                            .decode(&local_buf[..sub_clean], out_slice.as_out())
                            .is_err()
                        {
                            err_flag.store(true, std::sync::atomic::Ordering::Relaxed);
                            return;
                        }

                        sub_start += sub_count;
                    }
                });
            }
        });
        let result: Result<(), io::Error> = if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
            Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
        } else {
            Ok(())
        };

        if let Err(e) = result {
            return Some(Err(e));
        }

        // Remainder (final partial/padded line) decoded sequentially.
        if !rem_clean.is_empty() {
            let rem_out = &mut output[full_lines * decoded_per_line..total_decoded];
            match BASE64_ENGINE.decode(rem_clean, rem_out.as_out()) {
                Ok(_) => {}
                Err(_) => return Some(decode_error()),
            }
        }

        return Some(out.write_all(&output[..total_decoded]));
    }

    // Sequential path: fused strip+decode in 256KB sub-chunks.
    // Larger sub-chunks give SIMD decode more data per call, improving throughput.
    // Uses decode_inplace on a small reusable buffer — no large allocations at all.
    let lines_per_sub = (256 * 1024 / line_len).max(1);
    let sub_buf_size = lines_per_sub * line_len;
    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        local_buf.set_len(sub_buf_size);
    }

    let src = data.as_ptr();
    let local_dst = local_buf.as_mut_ptr();

    let mut line_idx = 0usize;
    while line_idx < full_lines {
        let sub_count = (full_lines - line_idx).min(lines_per_sub);
        let sub_clean = sub_count * line_len;

        // Strip newlines by copying each line at its stride offset.
        for i in 0..sub_count {
            // SAFETY: (line_idx + i) < full_lines keeps the source in-bounds;
            // local_buf holds lines_per_sub * line_len bytes.
            unsafe {
                std::ptr::copy_nonoverlapping(
                    src.add((line_idx + i) * stride),
                    local_dst.add(i * line_len),
                    line_len,
                );
            }
        }

        // Decode in-place in the staging buffer, then stream out.
        match BASE64_ENGINE.decode_inplace(&mut local_buf[..sub_clean]) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }

        line_idx += sub_count;
    }

    // Remainder line (may carry '=' padding) decoded from its own copy.
    if !rem_clean.is_empty() {
        let mut rem_buf = rem_clean.to_vec();
        match BASE64_ENGINE.decode_inplace(&mut rem_buf) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }
    }

    Some(Ok(()))
}
1511
1512/// Decode by stripping whitespace and decoding in a single fused pass.
1513/// For data with no whitespace, decodes directly without any copy.
1514/// Detects uniform line structure for fast structured-copy (no search needed),
1515/// falls back to SIMD memchr2 gap-copy for irregular data.
1516fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
1517    // Fast path for uniform-line base64 (e.g., standard 76-char lines + newline).
1518    // Copies at known offsets, avoiding the memchr2 search entirely.
1519    // For 13MB base64: saves ~1ms vs memchr2 gap-copy (just structured memcpy).
1520    if data.len() >= 77 {
1521        if let Some(result) = try_decode_uniform_lines(data, out) {
1522            return result;
1523        }
1524    }
1525
1526    // Quick check: skip stripping if no \n or \r in the data.
1527    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
1528    if memchr::memchr2(b'\n', b'\r', data).is_none() {
1529        // No newlines/CR — check for rare whitespace only
1530        if !data
1531            .iter()
1532            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
1533        {
1534            return decode_borrowed_clean(out, data);
1535        }
1536        // Has rare whitespace only — strip and decode
1537        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
1538        for &b in data {
1539            if NOT_WHITESPACE[b as usize] {
1540                cleaned.push(b);
1541            }
1542        }
1543        return decode_clean_slice(&mut cleaned, out);
1544    }
1545
1546    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
1547    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
1548    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
1549    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
1550    let dst = clean.as_mut_ptr();
1551    let mut wp = 0usize;
1552    let mut gap_start = 0usize;
1553    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
1554    // This avoids the second full-scan pass when only \n/\r are present.
1555    let mut has_rare_ws = false;
1556
1557    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
1558        let gap_len = pos - gap_start;
1559        if gap_len > 0 {
1560            // Check gap region for rare whitespace during copy.
1561            // This adds ~1 branch per gap but eliminates the second full scan.
1562            if !has_rare_ws {
1563                has_rare_ws = data[gap_start..pos]
1564                    .iter()
1565                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1566            }
1567            unsafe {
1568                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
1569            }
1570            wp += gap_len;
1571        }
1572        gap_start = pos + 1;
1573    }
1574    // Copy the final gap after the last \n/\r
1575    let tail_len = data.len() - gap_start;
1576    if tail_len > 0 {
1577        if !has_rare_ws {
1578            has_rare_ws = data[gap_start..]
1579                .iter()
1580                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1581        }
1582        unsafe {
1583            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
1584        }
1585        wp += tail_len;
1586    }
1587    unsafe {
1588        clean.set_len(wp);
1589    }
1590
1591    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
1592    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
1593    if has_rare_ws {
1594        let ptr = clean.as_mut_ptr();
1595        let len = clean.len();
1596        let mut rp = 0;
1597        let mut cwp = 0;
1598        while rp < len {
1599            let b = unsafe { *ptr.add(rp) };
1600            if NOT_WHITESPACE[b as usize] {
1601                unsafe { *ptr.add(cwp) = b };
1602                cwp += 1;
1603            }
1604            rp += 1;
1605        }
1606        clean.truncate(cwp);
1607    }
1608
1609    // For large data (>= threshold), use parallel decode for multi-core speedup.
1610    // For small data, use in-place decode to avoid extra allocation.
1611    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
1612        decode_borrowed_clean_parallel(out, &clean)
1613    } else {
1614        decode_clean_slice(&mut clean, out)
1615    }
1616}
1617
/// Try to decode base64 data line-by-line, avoiding whitespace stripping.
/// Returns Some(result) if the data has uniform line lengths suitable for
/// per-line decode, or None if the data doesn't fit this pattern.
///
/// For standard 76-char-line base64 (wrap=76): each line is 76 encoded chars
/// + newline = 77 bytes. 76 chars = 19 groups of 4 = 57 decoded bytes per line.
/// We decode each line directly into its position in the output buffer.
fn try_line_decode(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    // Find the first newline to determine line length.
    // `?` returns None when there is no newline at all (not line-structured).
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl; // encoded chars per line (without newline)

    // Line length must be a multiple of 4 (complete base64 groups, no padding mid-stream)
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let line_stride = line_len + 1; // line_len chars + 1 newline byte
    let decoded_per_line = line_len * 3 / 4;

    // Verify the data has a consistent line structure by checking the next few lines.
    // This is only a cheap heuristic: a structural violation further in is still
    // caught, either by the last-full-line newline check below or by the SIMD
    // decode itself (a stray '\n' inside a presumed line is not in the alphabet).
    let check_lines = 4.min(data.len() / line_stride);
    for i in 1..check_lines {
        let expected_nl = i * line_stride - 1;
        if expected_nl >= data.len() {
            break;
        }
        if data[expected_nl] != b'\n' {
            return None; // Inconsistent line length
        }
    }

    // Calculate full lines and remainder
    let full_lines = if data.len() >= line_stride {
        // Check how many complete lines fit
        let candidate = data.len() / line_stride;
        // Verify the last full line's newline
        if candidate > 0 && data[candidate * line_stride - 1] != b'\n' {
            return None; // Not a clean line-structured file
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * line_stride;
    let remainder = &data[remainder_start..];

    // Calculate exact output size
    let remainder_clean_len = if remainder.is_empty() {
        0
    } else {
        // Remainder might end with newline, strip it
        let rem = if remainder.last() == Some(&b'\n') {
            &remainder[..remainder.len() - 1]
        } else {
            remainder
        };
        if rem.is_empty() {
            0
        } else {
            // Check for padding ('=' can only occupy the last two positions)
            let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
            if rem.len() % 4 != 0 {
                return None; // Invalid remainder
            }
            rem.len() * 3 / 4 - pad
        }
    };

    // Single-allocation decode: allocate full decoded output, decode all lines
    // directly into it, then write_all in one syscall. For 10MB base64 (7.5MB decoded),
    // this does 1 write() instead of ~30 chunked writes. The 7.5MB allocation is trivial
    // compared to the mmap'd input. SIMD decode at ~8 GB/s finishes in <1ms.
    let total_decoded = full_lines * decoded_per_line + remainder_clean_len;
    let mut out_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    // SAFETY: every byte of out_buf is written by a decode below before it is read.
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(total_decoded);
    }

    let dst = out_buf.as_mut_ptr();

    // Parallel line decode for large inputs (>= PARALLEL_DECODE_THRESHOLD, with at
    // least 64 full lines): split lines across threads. Each thread decodes a
    // contiguous block of lines directly to its final position in the shared
    // output buffer. SAFETY: non-overlapping output regions per thread.
    if data.len() >= PARALLEL_DECODE_THRESHOLD && full_lines >= 64 {
        // Smuggle the output pointer across the Send boundary as a plain address.
        let out_addr = dst as usize;
        let num_threads = num_cpus().max(1);
        let lines_per_chunk = (full_lines / num_threads).max(1);

        // Build per-thread task ranges: (start_line, end_line)
        let mut tasks: Vec<(usize, usize)> = Vec::new();
        let mut line_off = 0;
        while line_off < full_lines {
            let end = (line_off + lines_per_chunk).min(full_lines);
            tasks.push((line_off, end));
            line_off = end;
        }

        // First failure sets the flag; other tasks bail out at their next check.
        // Any single bad line makes the whole input invalid, so which line failed
        // doesn't matter.
        let decode_err = std::sync::atomic::AtomicBool::new(false);
        rayon::scope(|s| {
            for &(start_line, end_line) in &tasks {
                let decode_err = &decode_err;
                s.spawn(move |_| {
                    let out_ptr = out_addr as *mut u8;
                    let mut i = start_line;

                    // 4-line unrolled loop for better instruction-level parallelism.
                    while i + 4 <= end_line {
                        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let in_base = i * line_stride;
                        let ob = i * decoded_per_line;
                        // SAFETY: lines [start_line, end_line) map to disjoint
                        // decoded_per_line-sized output regions owned by this task.
                        unsafe {
                            let s0 =
                                std::slice::from_raw_parts_mut(out_ptr.add(ob), decoded_per_line);
                            if BASE64_ENGINE
                                .decode(&data[in_base..in_base + line_len], s0.as_out())
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s1 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + line_stride..in_base + line_stride + line_len],
                                    s1.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s2 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + 2 * decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + 2 * line_stride
                                        ..in_base + 2 * line_stride + line_len],
                                    s2.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s3 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + 3 * decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + 3 * line_stride
                                        ..in_base + 3 * line_stride + line_len],
                                    s3.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                        }
                        i += 4;
                    }

                    // Tail: remaining lines of this task, one at a time.
                    while i < end_line {
                        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let in_start = i * line_stride;
                        let out_off = i * decoded_per_line;
                        // SAFETY: same disjoint-region argument as above.
                        let out_slice = unsafe {
                            std::slice::from_raw_parts_mut(out_ptr.add(out_off), decoded_per_line)
                        };
                        if BASE64_ENGINE
                            .decode(&data[in_start..in_start + line_len], out_slice.as_out())
                            .is_err()
                        {
                            decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                            return;
                        }
                        i += 1;
                    }
                });
            }
        });

        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
            return Some(decode_error());
        }
    } else {
        // Sequential decode with 4x unrolling for smaller inputs
        let mut i = 0;

        while i + 4 <= full_lines {
            let in_base = i * line_stride;
            let out_base = i * decoded_per_line;
            // SAFETY: out_base + 4 * decoded_per_line <= total_decoded; the four
            // slices cover disjoint regions of out_buf.
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), decoded_per_line);
                if BASE64_ENGINE
                    .decode(&data[in_base..in_base + line_len], s0.as_out())
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s1 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + line_stride..in_base + line_stride + line_len],
                        s1.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s2 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + 2 * decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + 2 * line_stride..in_base + 2 * line_stride + line_len],
                        s2.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s3 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + 3 * decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + 3 * line_stride..in_base + 3 * line_stride + line_len],
                        s3.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }
            }
            i += 4;
        }

        while i < full_lines {
            let in_start = i * line_stride;
            let in_end = in_start + line_len;
            let out_off = i * decoded_per_line;
            // SAFETY: out_off + decoded_per_line <= total_decoded.
            let out_slice =
                unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), decoded_per_line) };
            match BASE64_ENGINE.decode(&data[in_start..in_end], out_slice.as_out()) {
                Ok(_) => {}
                Err(_) => return Some(decode_error()),
            }
            i += 1;
        }
    }

    // Decode remainder (trailing partial line, possibly with '=' padding)
    if remainder_clean_len > 0 {
        let rem = if remainder.last() == Some(&b'\n') {
            &remainder[..remainder.len() - 1]
        } else {
            remainder
        };
        let out_off = full_lines * decoded_per_line;
        // SAFETY: out_off + remainder_clean_len == total_decoded exactly.
        let out_slice =
            unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), remainder_clean_len) };
        match BASE64_ENGINE.decode(rem, out_slice.as_out()) {
            Ok(_) => {}
            Err(_) => return Some(decode_error()),
        }
    }

    // Single write_all for the entire decoded output
    Some(out.write_all(&out_buf[..total_decoded]))
}
1909
1910/// Decode a clean (no whitespace) buffer in-place with SIMD.
1911fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
1912    if data.is_empty() {
1913        return Ok(());
1914    }
1915    decode_inplace_with_padding(data, out)
1916}
1917
/// Cold, never-inlined constructor for the shared "invalid input" error.
/// Keeping error construction out of line keeps the hot decode paths tight
/// and lets the compiler lay out failure branches off the fast path.
#[cold]
#[inline(never)]
fn decode_error() -> io::Result<()> {
    let err = io::Error::new(io::ErrorKind::InvalidData, "invalid input");
    Err(err)
}
1924
1925/// Decode in-place with padding fallback for truncated input.
1926/// GNU base64 accepts missing padding at end of stream, so if decode fails
1927/// and the length mod 4 is 2 or 3, retry with padding added.
1928fn decode_inplace_with_padding(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
1929    match BASE64_ENGINE.decode_inplace(data) {
1930        Ok(decoded) => out.write_all(decoded),
1931        Err(_) => {
1932            let remainder = data.len() % 4;
1933            if remainder == 2 || remainder == 3 {
1934                let has_existing_padding = memchr::memchr(b'=', data).is_some();
1935                let mut padded = Vec::with_capacity(data.len() + (4 - remainder));
1936                padded.extend_from_slice(data);
1937                padded.extend(std::iter::repeat_n(b'=', 4 - remainder));
1938                if let Ok(decoded) = BASE64_ENGINE.decode_inplace(&mut padded) {
1939                    out.write_all(decoded)?;
1940                    if has_existing_padding {
1941                        return decode_error();
1942                    }
1943                    return Ok(());
1944                }
1945            }
1946            decode_error()
1947        }
1948    }
1949}
1950
1951/// Decode clean base64 data (no whitespace) from a borrowed slice.
1952fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1953    if data.is_empty() {
1954        return Ok(());
1955    }
1956    // Parallel decode for large data: split at 4-byte boundaries,
1957    // decode each chunk independently (base64 is context-free per 4-char group).
1958    if data.len() >= PARALLEL_DECODE_THRESHOLD {
1959        return decode_borrowed_clean_parallel(out, data);
1960    }
1961    // If input has truncated padding, pad it first (GNU base64 accepts missing padding).
1962    let remainder = data.len() % 4;
1963    if remainder == 2 || remainder == 3 {
1964        // If input already has '=' but length mod 4 != 0, the padding is
1965        // wrong/truncated. GNU base64 still decodes but reports error.
1966        let has_existing_padding = memchr::memchr(b'=', data).is_some();
1967        let mut padded = Vec::with_capacity(data.len() + (4 - remainder));
1968        padded.extend_from_slice(data);
1969        padded.extend(std::iter::repeat_n(b'=', 4 - remainder));
1970        let result = decode_borrowed_clean(out, &padded);
1971        if has_existing_padding && result.is_ok() {
1972            return decode_error();
1973        }
1974        return result;
1975    }
1976    // Pre-allocate exact output size to avoid decode_to_vec's reallocation.
1977    // Decoded size = data.len() * 3 / 4 minus padding.
1978    let pad = data.iter().rev().take(2).filter(|&&b| b == b'=').count();
1979    let decoded_size = data.len() * 3 / 4 - pad;
1980    let mut buf: Vec<u8> = Vec::with_capacity(decoded_size);
1981    #[allow(clippy::uninit_vec)]
1982    unsafe {
1983        buf.set_len(decoded_size);
1984    }
1985    match BASE64_ENGINE.decode(data, buf[..decoded_size].as_out()) {
1986        Ok(decoded) => {
1987            out.write_all(decoded)?;
1988            Ok(())
1989        }
1990        Err(_) => decode_error(),
1991    }
1992}
1993
1994/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel.
1995/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
1996/// upfront, so each thread decodes directly to its final position. No compaction needed.
1997fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1998    let num_threads = num_cpus().max(1);
1999    let raw_chunk = data.len() / num_threads;
2000    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
2001    let chunk_size = ((raw_chunk + 3) / 4) * 4;
2002
2003    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();
2004
2005    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
2006    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
2007    offsets.push(0);
2008    let mut total_decoded = 0usize;
2009    for (i, chunk) in chunks.iter().enumerate() {
2010        let decoded_size = if i == chunks.len() - 1 {
2011            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
2012            chunk.len() * 3 / 4 - pad
2013        } else {
2014            chunk.len() * 3 / 4
2015        };
2016        total_decoded += decoded_size;
2017        offsets.push(total_decoded);
2018    }
2019
2020    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
2021    #[allow(clippy::uninit_vec)]
2022    unsafe {
2023        output_buf.set_len(total_decoded);
2024    }
2025    #[cfg(target_os = "linux")]
2026    hint_hugepage(&mut output_buf);
2027
2028    // Parallel decode: each thread decodes directly into its exact final position.
2029    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
2030    let out_addr = output_buf.as_mut_ptr() as usize;
2031    let err_flag = std::sync::atomic::AtomicBool::new(false);
2032    rayon::scope(|s| {
2033        for (i, chunk) in chunks.iter().enumerate() {
2034            let offset = offsets[i];
2035            let expected_size = offsets[i + 1] - offset;
2036            let err_flag = &err_flag;
2037            s.spawn(move |_| {
2038                if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
2039                    return;
2040                }
2041                // SAFETY: each thread writes to non-overlapping region
2042                let out_slice = unsafe {
2043                    std::slice::from_raw_parts_mut((out_addr as *mut u8).add(offset), expected_size)
2044                };
2045                if BASE64_ENGINE.decode(chunk, out_slice.as_out()).is_err() {
2046                    err_flag.store(true, std::sync::atomic::Ordering::Relaxed);
2047                }
2048            });
2049        }
2050    });
2051
2052    if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
2053        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"));
2054    }
2055
2056    out.write_all(&output_buf[..total_decoded])
2057}
2058
2059/// Strip non-base64 characters (for -i / --ignore-garbage).
2060fn strip_non_base64(data: &[u8]) -> Vec<u8> {
2061    data.iter()
2062        .copied()
2063        .filter(|&b| is_base64_char(b))
2064        .collect()
2065}
2066
/// Check if a byte is in the standard base64 alphabet or is the '=' padding char.
#[inline]
fn is_base64_char(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'+' | b'/' | b'=')
}
2072
2073/// Stream-encode from a reader to a writer. Used for stdin processing.
2074/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
2075pub fn encode_stream(
2076    reader: &mut impl Read,
2077    wrap_col: usize,
2078    writer: &mut impl Write,
2079) -> io::Result<()> {
2080    if wrap_col == 0 {
2081        return encode_stream_nowrap(reader, writer);
2082    }
2083    encode_stream_wrapped(reader, wrap_col, writer)
2084}
2085
2086/// Streaming encode with NO line wrapping — optimized fast path.
2087/// Read size is 24MB (divisible by 3): encoded output = 24MB * 4/3 = 32MB.
2088/// 24MB reads mean 10-18MB input is consumed in a single read() call,
2089/// and the encoded output writes in 1-2 write() calls.
2090fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2091    // 24MB aligned to 3 bytes: 24MB reads handle up to 24MB input in one pass.
2092    const NOWRAP_READ: usize = 24 * 1024 * 1024; // exactly divisible by 3
2093
2094    // SAFETY: buf bytes are written by read_full before being processed.
2095    // encode_buf bytes are written by encode before being read.
2096    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
2097    #[allow(clippy::uninit_vec)]
2098    unsafe {
2099        buf.set_len(NOWRAP_READ);
2100    }
2101    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
2102    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
2103    #[allow(clippy::uninit_vec)]
2104    unsafe {
2105        encode_buf.set_len(encode_buf_size);
2106    }
2107
2108    loop {
2109        let n = read_full(reader, &mut buf)?;
2110        if n == 0 {
2111            break;
2112        }
2113        let enc_len = BASE64_ENGINE.encoded_length(n);
2114        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
2115        writer.write_all(encoded)?;
2116    }
2117    Ok(())
2118}
2119
2120/// Streaming encode WITH line wrapping.
2121/// For the common case (wrap_col divides evenly into 3-byte input groups),
2122/// uses fuse_wrap to build a contiguous output buffer with newlines interleaved,
2123/// then writes it in a single write() call. This eliminates the overhead of
2124/// many writev() syscalls (one per ~512 lines via IoSlice).
2125///
2126/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
2127fn encode_stream_wrapped(
2128    reader: &mut impl Read,
2129    wrap_col: usize,
2130    writer: &mut impl Write,
2131) -> io::Result<()> {
2132    let bytes_per_line = wrap_col * 3 / 4;
2133    // For the common case (76-col wrapping, bytes_per_line=57 which is divisible by 3),
2134    // align the read buffer to bytes_per_line boundaries so each chunk produces
2135    // complete lines with no column carry-over between chunks.
2136    if bytes_per_line > 0 && bytes_per_line.is_multiple_of(3) {
2137        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
2138    }
2139
2140    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
2141    const STREAM_READ: usize = 12 * 1024 * 1024;
2142    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
2143    #[allow(clippy::uninit_vec)]
2144    unsafe {
2145        buf.set_len(STREAM_READ);
2146    }
2147    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
2148    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
2149    #[allow(clippy::uninit_vec)]
2150    unsafe {
2151        encode_buf.set_len(encode_buf_size);
2152    }
2153
2154    let mut col = 0usize;
2155
2156    loop {
2157        let n = read_full(reader, &mut buf)?;
2158        if n == 0 {
2159            break;
2160        }
2161        let enc_len = BASE64_ENGINE.encoded_length(n);
2162        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
2163
2164        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
2165    }
2166
2167    if col > 0 {
2168        writer.write_all(b"\n")?;
2169    }
2170
2171    Ok(())
2172}
2173
/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
/// encode each line directly into its final position with newline appended.
/// Eliminates the two-pass encode-then-fuse_wrap approach.
/// For 76-col wrapping (bytes_per_line=57): 12MB / 57 = ~210K complete lines per chunk.
/// Output = 210K * 77 bytes = ~16MB, one write() syscall per chunk.
///
/// Precondition (guaranteed by the caller): bytes_per_line > 0 and divisible
/// by 3, so each full input line encodes to exactly wrap_col chars.
fn encode_stream_wrapped_fused(
    reader: &mut impl Read,
    wrap_col: usize,
    bytes_per_line: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Align read size to bytes_per_line for complete output lines per chunk.
    // ~210K lines * 57 bytes = ~12MB input, ~16MB output per chunk.
    // Smaller chunks have better cache behavior for the 4-line unrolled encode loop.
    let lines_per_chunk = (12 * 1024 * 1024) / bytes_per_line;
    let read_size = lines_per_chunk * bytes_per_line;
    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline

    // SAFETY: buf bytes are written by read_full before being processed.
    // out_buf bytes are written by encode before being read.
    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(read_size);
    }
    // Output buffer: enough for all lines + remainder (worst-case encoded
    // partial line plus its newline).
    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(max_output);
    }

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let full_lines = n / bytes_per_line;
        // NOTE: a nonzero remainder means a short read; per read_full's contract
        // (it retries until the buffer is full or EOF) this only happens on the
        // final chunk, so no column state needs to carry between chunks.
        let remainder = n % bytes_per_line;

        // Encode each input line directly into its final output position.
        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
        // This eliminates the separate encode + fuse_wrap copy entirely.
        let dst = out_buf.as_mut_ptr();
        let mut line_idx = 0;

        // 4-line unrolled loop for better ILP.
        // SAFETY (all writes below): out_base + 4 * line_out <= full_lines *
        // line_out <= max_output, and s0..s3 cover disjoint regions of out_buf.
        while line_idx + 4 <= full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
                *dst.add(out_base + wrap_col) = b'\n';

                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                    s1.as_out(),
                );
                *dst.add(out_base + line_out + wrap_col) = b'\n';

                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                    s2.as_out(),
                );
                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                    s3.as_out(),
                );
                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
            }
            line_idx += 4;
        }

        // Remaining full lines
        while line_idx < full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            // SAFETY: same bound argument as the unrolled loop, one line at a time.
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
                *dst.add(out_base + wrap_col) = b'\n';
            }
            line_idx += 1;
        }

        let mut wp = full_lines * line_out;

        // Handle remainder (partial last line of this chunk)
        if remainder > 0 {
            let enc_len = BASE64_ENGINE.encoded_length(remainder);
            let line_input = &buf[full_lines * bytes_per_line..n];
            // SAFETY: wp + enc_len + 1 <= max_output (the remainder headroom
            // reserved when sizing out_buf above).
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
                *dst.add(wp + enc_len) = b'\n';
            }
            wp += enc_len + 1;
        }

        writer.write_all(&out_buf[..wp])?;
    }

    Ok(())
}
2286
/// Stream-decode from a reader to a writer. Used for stdin processing.
///
/// In-place strip + decode: read chunk -> strip whitespace in-place in the read
/// buffer -> decode in-place -> write. Eliminates the separate clean-buffer
/// allocation (saves a second READ_CHUNK-sized, i.e. 32MB, buffer).
/// Uses a 32MB read buffer for maximum pipe throughput — read_full retries to
/// fill the entire buffer from the pipe, and 32MB means even large inputs
/// (up to ~24MB after base64 encoding of 18MB raw) are read in a single syscall batch.
///
/// Base64 quadruplets that straddle a chunk boundary are preserved via a small
/// `carry` buffer (at most 3 bytes) and prepended to the next chunk before decoding.
pub fn decode_stream(
    reader: &mut impl Read,
    ignore_garbage: bool,
    writer: &mut impl Write,
) -> io::Result<()> {
    const READ_CHUNK: usize = 32 * 1024 * 1024;
    // SAFETY: buf bytes are written by read_full before being processed.
    // The extra 4 bytes accommodate carry-over from previous chunk.
    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(READ_CHUNK + 4);
    }
    // Incomplete base64 quadruplet left over from the previous chunk
    // (carry_len is always 0..=3, so the 4-byte array never overflows).
    let mut carry = [0u8; 4];
    let mut carry_len = 0usize;

    loop {
        // Copy carry bytes to start of buffer, read new data after them
        if carry_len > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
            }
        }
        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
        if n == 0 {
            break;
        }
        let total_raw = carry_len + n;

        // Strip whitespace in-place in the buffer itself.
        // This eliminates the separate clean buffer allocation (saves a
        // READ_CHUNK-sized, i.e. 32MB, buffer).
        let clean_len = if ignore_garbage {
            // Scalar filter for ignore_garbage mode (rare path): keep only
            // bytes in the base64 alphabet, compacting them to the front.
            let ptr = buf.as_mut_ptr();
            let mut wp = 0usize;
            for i in 0..total_raw {
                let b = unsafe { *ptr.add(i) };
                if is_base64_char(b) {
                    unsafe { *ptr.add(wp) = b };
                    wp += 1;
                }
            }
            wp
        } else {
            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
            // so we process ~76 bytes per memchr hit.
            let ptr = buf.as_mut_ptr();
            let data = &buf[..total_raw];
            let mut wp = 0usize; // write position: compacted length so far
            let mut gap_start = 0usize; // start of the current newline-free run
            let mut has_rare_ws = false;

            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
                let gap_len = pos - gap_start;
                if gap_len > 0 {
                    // While scanning, also detect rare whitespace (space, tab,
                    // VT 0x0b, FF 0x0c) so the second cleanup pass below runs
                    // only when actually needed.
                    if !has_rare_ws {
                        has_rare_ws = data[gap_start..pos]
                            .iter()
                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                    }
                    // ptr::copy (memmove) because the source run and the
                    // compacted destination can overlap within `buf`.
                    if wp != gap_start {
                        unsafe {
                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                        }
                    }
                    wp += gap_len;
                }
                gap_start = pos + 1;
            }
            // Final run after the last \n / \r (or the whole chunk if none).
            let tail_len = total_raw - gap_start;
            if tail_len > 0 {
                if !has_rare_ws {
                    has_rare_ws = data[gap_start..total_raw]
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                }
                if wp != gap_start {
                    unsafe {
                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
                    }
                }
                wp += tail_len;
            }

            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
            if has_rare_ws {
                let mut rp = 0; // read position
                let mut cwp = 0; // compacted write position
                while rp < wp {
                    let b = unsafe { *ptr.add(rp) };
                    if NOT_WHITESPACE[b as usize] {
                        unsafe { *ptr.add(cwp) = b };
                        cwp += 1;
                    }
                    rp += 1;
                }
                cwp
            } else {
                wp
            }
        };

        carry_len = 0;
        // read_full only returns short when the reader hit EOF, so a short
        // read marks the final chunk of the stream.
        let is_last = n < READ_CHUNK;

        if is_last {
            // Last chunk: decode everything (including padding)
            decode_clean_slice(&mut buf[..clean_len], writer)?;
        } else {
            // Save incomplete base64 quadruplet for next iteration
            let decode_len = (clean_len / 4) * 4;
            let leftover = clean_len - decode_len; // 0..=3 bytes
            if leftover > 0 {
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        buf.as_ptr().add(decode_len),
                        carry.as_mut_ptr(),
                        leftover,
                    );
                }
                carry_len = leftover;
            }
            if decode_len > 0 {
                decode_clean_slice(&mut buf[..decode_len], writer)?;
            }
        }
    }

    // Handle any remaining carry-over bytes. NOTE(review): a trailing 1-3 byte
    // tail is not a valid base64 length; presumably decode_clean_slice (defined
    // elsewhere in this file) reports it as an error — confirm.
    if carry_len > 0 {
        let mut carry_buf = carry[..carry_len].to_vec();
        decode_clean_slice(&mut carry_buf, writer)?;
    }

    Ok(())
}
2430
2431/// Write all IoSlice entries using write_vectored (writev syscall).
2432/// Hot path: single write_vectored succeeds fully (common on Linux pipes/files).
2433/// Cold path: partial write handled out-of-line to keep hot path tight.
2434#[inline(always)]
2435fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
2436    if slices.is_empty() {
2437        return Ok(());
2438    }
2439    let total: usize = slices.iter().map(|s| s.len()).sum();
2440    let written = out.write_vectored(slices)?;
2441    if written >= total {
2442        return Ok(());
2443    }
2444    if written == 0 {
2445        return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
2446    }
2447    write_all_vectored_slow(out, slices, written)
2448}
2449
/// Finish a partially-completed vectored write: skip the bytes already
/// written, then flush the remainder of each slice with write_all.
/// Cold path, never inlined, to keep the hot caller's code compact.
#[cold]
#[inline(never)]
fn write_all_vectored_slow(
    out: &mut impl Write,
    slices: &[io::IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for chunk in slices {
        // Slices fully covered by the earlier partial write are skipped whole.
        match skip.checked_sub(chunk.len()) {
            Some(remaining) => skip = remaining,
            None => {
                // First partially-written slice resumes mid-slice; every
                // slice after it is written from its start (skip == 0).
                out.write_all(&chunk[skip..])?;
                skip = 0;
            }
        }
    }
    Ok(())
}
2469
/// Read as many bytes as possible into buf, retrying on partial reads.
///
/// Returns the number of bytes read; a value shorter than `buf.len()` means
/// the reader reached EOF. `ErrorKind::Interrupted` is retried on every read
/// — including the first — matching the contract of `Read::read_exact`; any
/// other error is propagated immediately.
///
/// Fast path: regular file reads usually return the full buffer on the first
/// call, avoiding the retry loop entirely.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    // Fast path: first read() usually fills the entire buffer for regular files.
    // Bug fix: the first read previously used `?` directly, so a single EINTR
    // (ErrorKind::Interrupted) aborted the caller even though the slow path
    // below already retries it. Retry here too.
    let n = loop {
        match reader.read(buf) {
            Ok(n) => break n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    };
    if n == buf.len() || n == 0 {
        return Ok(n);
    }
    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
    let mut total = n;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break, // EOF: return what we have
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}