coreutils_rs/base64/core.rs

1use std::io::{self, Read, Write};
2
3use base64_simd::AsOut;
4use rayon::prelude::*;
5
6const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;
7
8/// Chunk size for no-wrap encoding: 32MB aligned to 3 bytes.
9/// Larger chunks = fewer write() syscalls for big files.
10const NOWRAP_CHUNK: usize = 32 * 1024 * 1024 - (32 * 1024 * 1024 % 3);
11
12/// Minimum data size for parallel encoding (2MB).
13/// For wrapped encoding, the parallel path assigns line-aligned chunks to each thread,
14/// with each thread encoding directly to its position in a shared output buffer.
15/// At 2MB+ the parallel speedup (2-4x on 4+ cores) exceeds rayon overhead (~200us).
16const PARALLEL_ENCODE_THRESHOLD: usize = 2 * 1024 * 1024;
17
18/// Minimum data size for parallel decoding (2MB of base64 data).
19/// At 2MB+ the parallel speedup on multi-core exceeds rayon overhead (~200us).
20/// For 10MB benchmark inputs (~13MB base64), this enables parallel decode.
21const PARALLEL_DECODE_THRESHOLD: usize = 2 * 1024 * 1024;
22
23/// Encode data and write to output with line wrapping.
24/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
25pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
26    if data.is_empty() {
27        return Ok(());
28    }
29
30    if wrap_col == 0 {
31        return encode_no_wrap(data, out);
32    }
33
34    encode_wrapped(data, wrap_col, out)
35}
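
// A minimal usage sketch (illustrative only, not part of this file's test suite):
// encoding a short buffer into a Vec<u8> writer with the GNU default wrap of 76
// columns. Any io::Write sink works the same way.
//
//     let mut out: Vec<u8> = Vec::new();
//     encode_to_writer(b"hello", 76, &mut out)?;
//     assert_eq!(out, b"aGVsbG8=\n");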
36
37/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
38fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
39    if data.len() >= PARALLEL_ENCODE_THRESHOLD && rayon::current_num_threads() > 1 {
40        return encode_no_wrap_parallel(data, out);
41    }
42
43    let actual_chunk = NOWRAP_CHUNK.min(data.len());
44    let enc_max = BASE64_ENGINE.encoded_length(actual_chunk);
45    // SAFETY: encode() writes exactly enc_len bytes before we read them.
46    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
47    #[allow(clippy::uninit_vec)]
48    unsafe {
49        buf.set_len(enc_max);
50    }
51
52    for chunk in data.chunks(NOWRAP_CHUNK) {
53        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
54        let encoded = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
55        out.write_all(encoded)?;
56    }
57    Ok(())
58}
59
60/// Parallel no-wrap encoding: split at 3-byte boundaries, encode chunks in parallel.
61/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
62/// Uses a single shared output buffer with direct-to-position encoding (no per-thread allocs).
63fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
64    let num_threads = rayon::current_num_threads().max(1);
65    let raw_chunk = data.len() / num_threads;
66    // Align to 3 bytes so each chunk encodes without padding (except the last)
67    let chunk_size = ((raw_chunk + 2) / 3) * 3;
68
69    // Pre-compute per-chunk metadata: (input_offset, output_offset, input_len)
70    let mut tasks: Vec<(usize, usize, usize)> = Vec::new();
71    let mut in_off = 0usize;
72    let mut out_off = 0usize;
73    while in_off < data.len() {
74        let chunk_len = chunk_size.max(3).min(data.len() - in_off);
75        let enc_len = BASE64_ENGINE.encoded_length(chunk_len);
76        tasks.push((in_off, out_off, chunk_len));
77        in_off += chunk_len;
78        out_off += enc_len;
79    }
80    let total_output = out_off;
81
82    // Single shared output buffer
83    let mut outbuf: Vec<u8> = Vec::with_capacity(total_output);
84    #[allow(clippy::uninit_vec)]
85    unsafe {
86        outbuf.set_len(total_output);
87    }
88
89    // Parallel encode: each thread encodes directly into its position in the shared buffer.
90    // SAFETY: tasks have non-overlapping output regions.
91    let buf_addr = outbuf.as_mut_ptr() as usize;
92    tasks.par_iter().for_each(|&(in_off, out_off, chunk_len)| {
93        let enc_len = BASE64_ENGINE.encoded_length(chunk_len);
94        let out_slice =
95            unsafe { std::slice::from_raw_parts_mut((buf_addr as *mut u8).add(out_off), enc_len) };
96        let _ = BASE64_ENGINE.encode(&data[in_off..in_off + chunk_len], out_slice.as_out());
97    });
98
99    out.write_all(&outbuf[..total_output])
100}
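
// Worked example of the 3-byte chunk split above (illustrative numbers): 32 input
// bytes on 4 threads give raw_chunk = 8 and chunk_size = 9, so the tasks cover
// 9 + 9 + 9 + 5 input bytes and 12 + 12 + 12 + 8 output bytes; only the final
// 5-byte chunk produces '=' padding.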
101
102/// Encode with line wrapping.
103/// Fast path (bytes_per_line divisible by 3): encode each line directly to its
104/// final position in a single output buffer (in parallel for large inputs), then
105/// emit everything with one write_all.
106/// Fallback (other bytes_per_line values): bulk-encode each chunk in one SIMD
107/// pass, then expand backwards in place to insert newlines between wrap_col-sized
108/// segments. Both paths avoid fuse_wrap's copy pass and per-line writev syscalls.
109fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
110    // Calculate bytes_per_line: input bytes that produce exactly wrap_col encoded chars.
111    // For default wrap_col=76: 76*3/4 = 57 bytes per line.
112    let bytes_per_line = wrap_col * 3 / 4;
113    if bytes_per_line == 0 {
114        // Degenerate case: wrap_col == 1 makes bytes_per_line zero; fall back to the simple per-line writer
115        return encode_wrapped_small(data, wrap_col, out);
116    }
117
118    // Parallel encoding for large data when bytes_per_line is a multiple of 3.
119    // This guarantees each chunk encodes to complete base64 without padding.
120    if data.len() >= PARALLEL_ENCODE_THRESHOLD && bytes_per_line.is_multiple_of(3) {
121        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
122    }
123
124    // Direct-to-position encode+wrap: encode each line directly to its final position
125    // in the output buffer, eliminating the backward expansion pass entirely.
126    // Each bytes_per_line input bytes encode to exactly wrap_col output bytes + 1 newline.
127    if bytes_per_line.is_multiple_of(3) {
128        let line_out = wrap_col + 1;
129        let total_full_lines = data.len() / bytes_per_line;
130        let remainder_input = data.len() % bytes_per_line;
131
132        let remainder_encoded = if remainder_input > 0 {
133            BASE64_ENGINE.encoded_length(remainder_input) + 1
134        } else {
135            0
136        };
137        let total_output = total_full_lines * line_out + remainder_encoded;
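
        // Worked example for the default wrap: wrap_col = 76, bytes_per_line = 57,
        // line_out = 77. A 10 MiB input yields 183,960 full lines (183,960 * 57 =
        // 10,485,720 bytes) plus a 40-byte remainder that encodes to 56 chars + '\n',
        // so total_output = 183,960 * 77 + 57 = 14,164,977 bytes.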
138
139        let mut buf: Vec<u8> = Vec::with_capacity(total_output);
140        #[allow(clippy::uninit_vec)]
141        unsafe {
142            buf.set_len(total_output);
143        }
144
145        let dst = buf.as_mut_ptr();
146        let mut line_idx = 0;
147
148        // 4-line unrolled loop for ILP
149        while line_idx + 4 <= total_full_lines {
150            let in_base = line_idx * bytes_per_line;
151            let out_base = line_idx * line_out;
152            unsafe {
153                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
154                let _ = BASE64_ENGINE.encode(&data[in_base..in_base + bytes_per_line], s0.as_out());
155                *dst.add(out_base + wrap_col) = b'\n';
156
157                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
158                let _ = BASE64_ENGINE.encode(
159                    &data[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
160                    s1.as_out(),
161                );
162                *dst.add(out_base + line_out + wrap_col) = b'\n';
163
164                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
165                let _ = BASE64_ENGINE.encode(
166                    &data[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
167                    s2.as_out(),
168                );
169                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
170
171                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
172                let _ = BASE64_ENGINE.encode(
173                    &data[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
174                    s3.as_out(),
175                );
176                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
177            }
178            line_idx += 4;
179        }
180
181        while line_idx < total_full_lines {
182            let in_base = line_idx * bytes_per_line;
183            let out_base = line_idx * line_out;
184            unsafe {
185                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
186                let _ = BASE64_ENGINE.encode(&data[in_base..in_base + bytes_per_line], s.as_out());
187                *dst.add(out_base + wrap_col) = b'\n';
188            }
189            line_idx += 1;
190        }
191
192        // Handle remainder
193        if remainder_input > 0 {
194            let in_off = total_full_lines * bytes_per_line;
195            let out_off = total_full_lines * line_out;
196            let enc_len = BASE64_ENGINE.encoded_length(remainder_input);
197            unsafe {
198                let s = std::slice::from_raw_parts_mut(dst.add(out_off), enc_len);
199                let _ = BASE64_ENGINE.encode(&data[in_off..], s.as_out());
200                *dst.add(out_off + enc_len) = b'\n';
201            }
202        }
203
204        return out.write_all(&buf[..total_output]);
205    }
206
207    // Fallback for non-3-aligned bytes_per_line: chunk + in-place expansion
208    let lines_per_chunk = (32 * 1024 * 1024) / bytes_per_line;
209    let max_input_chunk = (lines_per_chunk * bytes_per_line).max(bytes_per_line);
210
211    let enc_max = BASE64_ENGINE.encoded_length(max_input_chunk.min(data.len()));
212    let num_lines_max = enc_max / wrap_col + 1;
213    let out_max = num_lines_max * (wrap_col + 1) + wrap_col + 1;
214    let mut buf: Vec<u8> = Vec::with_capacity(out_max);
215    #[allow(clippy::uninit_vec)]
216    unsafe {
217        buf.set_len(out_max);
218    }
219
220    for chunk in data.chunks(max_input_chunk.max(1)) {
221        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
222        let _ = BASE64_ENGINE.encode(chunk, buf[..enc_len].as_out());
223        let num_full = enc_len / wrap_col;
224        let rem = enc_len % wrap_col;
225        let chunk_out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };
226
227        // Expand backwards
228        unsafe {
229            let ptr = buf.as_mut_ptr();
230            let mut rp = enc_len;
231            let mut wp = chunk_out_len;
232            if rem > 0 {
233                wp -= 1;
234                *ptr.add(wp) = b'\n';
235                wp -= rem;
236                rp -= rem;
237                if rp != wp {
238                    std::ptr::copy(ptr.add(rp), ptr.add(wp), rem);
239                }
240            }
241            for _ in 0..num_full {
242                wp -= 1;
243                *ptr.add(wp) = b'\n';
244                wp -= wrap_col;
245                rp -= wrap_col;
246                if rp != wp {
247                    std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
248                }
249            }
250        }
251        out.write_all(&buf[..chunk_out_len])?;
252    }
253
254    Ok(())
255}
256
257/// Static newline byte for IoSlice references in writev calls.
258static NEWLINE: [u8; 1] = [b'\n'];
259
260/// Write encoded base64 data with line wrapping using write_vectored (writev).
261/// Builds IoSlice entries pointing at wrap_col-sized segments of the encoded buffer,
262/// interleaved with newline IoSlices, then writes in batches of MAX_WRITEV_IOV.
263/// This is zero-copy: no fused output buffer needed.
264#[inline]
265#[allow(dead_code)]
266fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
267    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
268    // Each line needs 2 entries (data + newline), so 512 lines per batch.
269    const MAX_IOV: usize = 1024;
270
271    let num_full_lines = encoded.len() / wrap_col;
272    let remainder = encoded.len() % wrap_col;
273    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };
274
275    // Small output: build all IoSlices and write in one call
276    if total_iov <= MAX_IOV {
277        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
278        let mut pos = 0;
279        for _ in 0..num_full_lines {
280            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
281            iov.push(io::IoSlice::new(&NEWLINE));
282            pos += wrap_col;
283        }
284        if remainder > 0 {
285            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
286            iov.push(io::IoSlice::new(&NEWLINE));
287        }
288        return write_all_vectored(out, &iov);
289    }
290
291    // Large output: write in batches
292    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
293    let mut pos = 0;
294    for _ in 0..num_full_lines {
295        iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
296        iov.push(io::IoSlice::new(&NEWLINE));
297        pos += wrap_col;
298        if iov.len() >= MAX_IOV {
299            write_all_vectored(out, &iov)?;
300            iov.clear();
301        }
302    }
303    if remainder > 0 {
304        iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
305        iov.push(io::IoSlice::new(&NEWLINE));
306    }
307    if !iov.is_empty() {
308        write_all_vectored(out, &iov)?;
309    }
310    Ok(())
311}
312
313/// Write encoded base64 data with line wrapping using writev, tracking column state
314/// across calls. Used by encode_stream for piped input where chunks don't align
315/// to line boundaries.
316#[inline]
317fn write_wrapped_iov_streaming(
318    encoded: &[u8],
319    wrap_col: usize,
320    col: &mut usize,
321    out: &mut impl Write,
322) -> io::Result<()> {
323    const MAX_IOV: usize = 1024;
324    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
325    let mut rp = 0;
326
327    while rp < encoded.len() {
328        let space = wrap_col - *col;
329        let avail = encoded.len() - rp;
330
331        if avail <= space {
332            // Remaining data fits in current line
333            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
334            *col += avail;
335            if *col == wrap_col {
336                iov.push(io::IoSlice::new(&NEWLINE));
337                *col = 0;
338            }
339            break;
340        } else {
341            // Fill current line and add newline
342            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
343            iov.push(io::IoSlice::new(&NEWLINE));
344            rp += space;
345            *col = 0;
346        }
347
348        if iov.len() >= MAX_IOV - 1 {
349            write_all_vectored(out, &iov)?;
350            iov.clear();
351        }
352    }
353
354    if !iov.is_empty() {
355        write_all_vectored(out, &iov)?;
356    }
357    Ok(())
358}
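
// Illustration of the cross-call column tracking above: with wrap_col = 76 and
// *col = 0, a call with 100 encoded bytes emits a 76-byte slice, a newline, and a
// 24-byte slice, leaving *col = 24; the next call starts by filling the remaining
// 52 columns of that line before emitting its first newline.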
359
360/// Parallel wrapped encoding: single output buffer, direct-to-position encode+wrap.
361/// Requires bytes_per_line % 3 == 0 so each chunk encodes without intermediate padding.
362///
363/// Pre-calculates exact output size and each thread's write offset, then encodes
364/// 57-byte input groups directly to their final position in a shared output buffer.
365/// Each thread writes wrap_col encoded bytes + newline per line, so output for line N
366/// starts at N * (wrap_col + 1). This eliminates per-chunk heap allocations and
367/// the fuse_wrap copy pass entirely.
368fn encode_wrapped_parallel(
369    data: &[u8],
370    wrap_col: usize,
371    bytes_per_line: usize,
372    out: &mut impl Write,
373) -> io::Result<()> {
374    let line_out = wrap_col + 1; // wrap_col data + 1 newline per line
375    let total_full_lines = data.len() / bytes_per_line;
376    let remainder_input = data.len() % bytes_per_line;
377
378    // Calculate exact output size
379    let remainder_encoded = if remainder_input > 0 {
380        BASE64_ENGINE.encoded_length(remainder_input) + 1 // +1 for trailing newline
381    } else {
382        0
383    };
384    let total_output = total_full_lines * line_out + remainder_encoded;
385
386    // Pre-allocate single contiguous output buffer
387    let mut outbuf: Vec<u8> = Vec::with_capacity(total_output);
388    #[allow(clippy::uninit_vec)]
389    unsafe {
390        outbuf.set_len(total_output);
391    }
392
393    // Split work at line boundaries for parallel processing
394    let num_threads = rayon::current_num_threads().max(1);
395    let lines_per_chunk = (total_full_lines / num_threads).max(1);
396    let input_chunk = lines_per_chunk * bytes_per_line;
397
398    // Compute per-chunk metadata: (input_offset, output_offset, num_input_bytes)
399    let mut tasks: Vec<(usize, usize, usize)> = Vec::new();
400    let mut in_off = 0usize;
401    let mut out_off = 0usize;
402    while in_off < data.len() {
403        let chunk_input = input_chunk.min(data.len() - in_off);
404        // Align to bytes_per_line except for the very last chunk
405        let aligned_input = if in_off + chunk_input < data.len() {
406            (chunk_input / bytes_per_line) * bytes_per_line
407        } else {
408            chunk_input
409        };
410        if aligned_input == 0 {
411            break;
412        }
413        let full_lines = aligned_input / bytes_per_line;
414        let rem = aligned_input % bytes_per_line;
415        let chunk_output = full_lines * line_out
416            + if rem > 0 {
417                BASE64_ENGINE.encoded_length(rem) + 1
418            } else {
419                0
420            };
421        tasks.push((in_off, out_off, aligned_input));
422        in_off += aligned_input;
423        out_off += chunk_output;
424    }
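
    // Worked example (illustrative, default 76-column wrap): 238 input bytes
    // (4 full 57-byte lines plus a 10-byte remainder) on 2 threads give
    // lines_per_chunk = 2, so tasks = [(0, 0, 114), (114, 154, 114), (228, 308, 10)]
    // and the remainder chunk contributes encoded_length(10) + 1 = 17 output bytes.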
425
426    // Parallel encode: each thread encodes lines directly into the final
427    // output buffer, eliminating per-thread buffer allocation and the
428    // scatter copy phase entirely. Each 57-byte input line encodes to
429    // exactly 76 encoded bytes + 1 newline = 77 bytes at a known offset.
430    // base64_simd handles the SIMD encoding even for 57-byte inputs.
431    // SAFETY: tasks have non-overlapping output regions.
432    let out_addr = outbuf.as_mut_ptr() as usize;
433
434    tasks.par_iter().for_each(|&(in_off, out_off, chunk_len)| {
435        let input = &data[in_off..in_off + chunk_len];
436        let full_lines = chunk_len / bytes_per_line;
437        let rem = chunk_len % bytes_per_line;
438
439        let out_ptr = out_addr as *mut u8;
440
441        // Encode each line directly into its final position in the output buffer.
442        // No thread-local buffer needed — each 57-byte input -> 76 encoded bytes
443        // written directly at out_off + line_idx * 77.
444        if full_lines > 0 {
445            let dst = unsafe { out_ptr.add(out_off) };
446            let mut line_idx = 0;
447
448            // 4-line unrolled loop for ILP
449            while line_idx + 4 <= full_lines {
450                let in_base = line_idx * bytes_per_line;
451                let out_base = line_idx * line_out;
452                unsafe {
453                    let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
454                    let _ = BASE64_ENGINE
455                        .encode(&input[in_base..in_base + bytes_per_line], s0.as_out());
456                    *dst.add(out_base + wrap_col) = b'\n';
457
458                    let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
459                    let _ = BASE64_ENGINE.encode(
460                        &input[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
461                        s1.as_out(),
462                    );
463                    *dst.add(out_base + line_out + wrap_col) = b'\n';
464
465                    let s2 =
466                        std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
467                    let _ = BASE64_ENGINE.encode(
468                        &input[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
469                        s2.as_out(),
470                    );
471                    *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
472
473                    let s3 =
474                        std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
475                    let _ = BASE64_ENGINE.encode(
476                        &input[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
477                        s3.as_out(),
478                    );
479                    *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
480                }
481                line_idx += 4;
482            }
483
484            // Remaining lines one at a time
485            while line_idx < full_lines {
486                let in_base = line_idx * bytes_per_line;
487                let out_base = line_idx * line_out;
488                unsafe {
489                    let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
490                    let _ =
491                        BASE64_ENGINE.encode(&input[in_base..in_base + bytes_per_line], s.as_out());
492                    *dst.add(out_base + wrap_col) = b'\n';
493                }
494                line_idx += 1;
495            }
496        }
497
498        // Handle remainder (last partial line of this chunk)
499        if rem > 0 {
500            let line_input = &input[full_lines * bytes_per_line..];
501            let enc_len = BASE64_ENGINE.encoded_length(rem);
502            let woff = out_off + full_lines * line_out;
503            // Encode directly into final output position
504            let out_slice =
505                unsafe { std::slice::from_raw_parts_mut(out_ptr.add(woff), enc_len + 1) };
506            let _ = BASE64_ENGINE.encode(line_input, out_slice[..enc_len].as_out());
507            out_slice[enc_len] = b'\n';
508        }
509    });
510
511    out.write_all(&outbuf[..total_output])
512}
513
514/// Fuse encoded base64 data with newlines in a single pass.
515/// Uses ptr::copy_nonoverlapping with 8-line unrolling for max throughput.
516/// Returns number of bytes written.
517#[inline]
518#[allow(dead_code)]
519fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
520    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
521    let mut rp = 0;
522    let mut wp = 0;
523
524    // Unrolled: process 8 lines per iteration for better ILP
525    while rp + 8 * wrap_col <= encoded.len() {
526        unsafe {
527            let src = encoded.as_ptr().add(rp);
528            let dst = out_buf.as_mut_ptr().add(wp);
529
530            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
531            *dst.add(wrap_col) = b'\n';
532
533            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
534            *dst.add(line_out + wrap_col) = b'\n';
535
536            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
537            *dst.add(2 * line_out + wrap_col) = b'\n';
538
539            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
540            *dst.add(3 * line_out + wrap_col) = b'\n';
541
542            std::ptr::copy_nonoverlapping(src.add(4 * wrap_col), dst.add(4 * line_out), wrap_col);
543            *dst.add(4 * line_out + wrap_col) = b'\n';
544
545            std::ptr::copy_nonoverlapping(src.add(5 * wrap_col), dst.add(5 * line_out), wrap_col);
546            *dst.add(5 * line_out + wrap_col) = b'\n';
547
548            std::ptr::copy_nonoverlapping(src.add(6 * wrap_col), dst.add(6 * line_out), wrap_col);
549            *dst.add(6 * line_out + wrap_col) = b'\n';
550
551            std::ptr::copy_nonoverlapping(src.add(7 * wrap_col), dst.add(7 * line_out), wrap_col);
552            *dst.add(7 * line_out + wrap_col) = b'\n';
553        }
554        rp += 8 * wrap_col;
555        wp += 8 * line_out;
556    }
557
558    // Handle remaining 4 lines at a time
559    while rp + 4 * wrap_col <= encoded.len() {
560        unsafe {
561            let src = encoded.as_ptr().add(rp);
562            let dst = out_buf.as_mut_ptr().add(wp);
563
564            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
565            *dst.add(wrap_col) = b'\n';
566
567            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
568            *dst.add(line_out + wrap_col) = b'\n';
569
570            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
571            *dst.add(2 * line_out + wrap_col) = b'\n';
572
573            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
574            *dst.add(3 * line_out + wrap_col) = b'\n';
575        }
576        rp += 4 * wrap_col;
577        wp += 4 * line_out;
578    }
579
580    // Remaining full lines
581    while rp + wrap_col <= encoded.len() {
582        unsafe {
583            std::ptr::copy_nonoverlapping(
584                encoded.as_ptr().add(rp),
585                out_buf.as_mut_ptr().add(wp),
586                wrap_col,
587            );
588            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
589        }
590        rp += wrap_col;
591        wp += line_out;
592    }
593
594    // Partial last line
595    if rp < encoded.len() {
596        let remaining = encoded.len() - rp;
597        unsafe {
598            std::ptr::copy_nonoverlapping(
599                encoded.as_ptr().add(rp),
600                out_buf.as_mut_ptr().add(wp),
601                remaining,
602            );
603        }
604        wp += remaining;
605        out_buf[wp] = b'\n';
606        wp += 1;
607    }
608
609    wp
610}
611
612/// Fallback for wrap columns too small to map to whole input bytes (wrap_col == 1).
613fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
614    let enc_max = BASE64_ENGINE.encoded_length(data.len());
615    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
616    #[allow(clippy::uninit_vec)]
617    unsafe {
618        buf.set_len(enc_max);
619    }
620    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());
621
622    let wc = wrap_col.max(1);
623    for line in encoded.chunks(wc) {
624        out.write_all(line)?;
625        out.write_all(b"\n")?;
626    }
627    Ok(())
628}
629
630/// Decode base64 data and write to output (borrows data, allocates clean buffer).
631/// When `ignore_garbage` is true, strip all non-base64 characters.
632/// When false, only strip whitespace (standard behavior).
633pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
634    if data.is_empty() {
635        return Ok(());
636    }
637
638    if ignore_garbage {
639        let mut cleaned = strip_non_base64(data);
640        return decode_clean_slice(&mut cleaned, out);
641    }
642
643    // Try line-by-line decode: if data has uniform 76+1 byte lines (76 base64
644    // chars + newline), decode each line directly into the output buffer.
645    // This avoids the whitespace stripping copy entirely.
646    if data.len() >= 77 {
647        if let Some(result) = try_line_decode(data, out) {
648            return result;
649        }
650    }
651
652    // Fast path: single-pass strip + decode
653    decode_stripping_whitespace(data, out)
654}
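
// Illustrative round-trip sketch using the functions defined in this file: decoding
// the wrapped output of encode_to_writer back to the original bytes in the default
// (whitespace-tolerant) mode.
//
//     let mut decoded: Vec<u8> = Vec::new();
//     decode_to_writer(b"aGVsbG8=\n", false, &mut decoded)?;
//     assert_eq!(decoded, b"hello");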
655
656/// Decode base64 from a mutable buffer (MAP_PRIVATE mmap or owned Vec).
657/// Strips whitespace in-place using SIMD memchr2 gap-copy, then decodes
658/// in-place with base64_simd::decode_inplace. Zero additional allocations.
659///
660/// For MAP_PRIVATE mmap: the kernel uses COW semantics, so only pages actually
661/// written during the in-place strip are physically copied (the line-decode fast
662/// path avoids writing to the mapping at all). The decode writes to the same buffer,
663/// but decoded data is always shorter than encoded (3/4 ratio), so it fits in-place.
664pub fn decode_mmap_inplace(
665    data: &mut [u8],
666    ignore_garbage: bool,
667    out: &mut impl Write,
668) -> io::Result<()> {
669    if data.is_empty() {
670        return Ok(());
671    }
672
673    // Try line-by-line decode first: avoids the in-place whitespace strip
674    // and COW page faults entirely. Each line is decoded independently.
675    if !ignore_garbage && data.len() >= 77 {
676        if let Some(result) = try_line_decode(data, out) {
677            return result;
678        }
679    }
680
681    if ignore_garbage {
682        // Strip non-base64 chars in-place
683        let ptr = data.as_mut_ptr();
684        let len = data.len();
685        let mut wp = 0;
686        for rp in 0..len {
687            let b = unsafe { *ptr.add(rp) };
688            if is_base64_char(b) {
689                unsafe { *ptr.add(wp) = b };
690                wp += 1;
691            }
692        }
693        match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
694            Ok(decoded) => return out.write_all(decoded),
695            Err(_) => return decode_error(),
696        }
697    }
698
699    // Fast path: strip whitespace in-place, then decode in-place.
700    // Uses SIMD memchr2 gap-copy for \n/\r (dominant whitespace in base64).
701
702    // Quick check: no newlines at all — maybe already clean
703    if memchr::memchr2(b'\n', b'\r', data).is_none() {
704        // Check for rare whitespace
705        if !data
706            .iter()
707            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
708        {
709            // Perfectly clean — decode in-place directly
710            match BASE64_ENGINE.decode_inplace(data) {
711                Ok(decoded) => return out.write_all(decoded),
712                Err(_) => return decode_error(),
713            }
714        }
715        // Rare whitespace only — strip in-place
716        let ptr = data.as_mut_ptr();
717        let len = data.len();
718        let mut wp = 0;
719        for rp in 0..len {
720            let b = unsafe { *ptr.add(rp) };
721            if NOT_WHITESPACE[b as usize] {
722                unsafe { *ptr.add(wp) = b };
723                wp += 1;
724            }
725        }
726        match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
727            Ok(decoded) => return out.write_all(decoded),
728            Err(_) => return decode_error(),
729        }
730    }
731
732    // SIMD gap-copy: strip \n and \r in-place using memchr2
733    let ptr = data.as_mut_ptr();
734    let len = data.len();
735    let mut wp = 0usize;
736    let mut gap_start = 0usize;
737    let mut has_rare_ws = false;
738
739    // SAFETY: memchr2_iter reads from the original data. We write to positions
740    // [0..wp] which are always <= gap_start, so we never overwrite unread data.
741    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
742        let gap_len = pos - gap_start;
743        if gap_len > 0 {
744            if !has_rare_ws {
745                // Check for rare whitespace during the gap-copy
746                has_rare_ws = unsafe {
747                    std::slice::from_raw_parts(ptr.add(gap_start), gap_len)
748                        .iter()
749                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
750                };
751            }
752            if wp != gap_start {
753                unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len) };
754            }
755            wp += gap_len;
756        }
757        gap_start = pos + 1;
758    }
759    // Final gap
760    let tail_len = len - gap_start;
761    if tail_len > 0 {
762        if !has_rare_ws {
763            has_rare_ws = unsafe {
764                std::slice::from_raw_parts(ptr.add(gap_start), tail_len)
765                    .iter()
766                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
767            };
768        }
769        if wp != gap_start {
770            unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len) };
771        }
772        wp += tail_len;
773    }
774
775    // Second pass for rare whitespace if needed
776    if has_rare_ws {
777        let mut rp = 0;
778        let mut cwp = 0;
779        while rp < wp {
780            let b = unsafe { *ptr.add(rp) };
781            if NOT_WHITESPACE[b as usize] {
782                unsafe { *ptr.add(cwp) = b };
783                cwp += 1;
784            }
785            rp += 1;
786        }
787        wp = cwp;
788    }
789
790    // Decode in-place: decoded data is always shorter than encoded (3/4 ratio)
791    if wp >= PARALLEL_DECODE_THRESHOLD {
792        // For large data, use parallel decode from the cleaned slice
793        return decode_borrowed_clean_parallel(out, &data[..wp]);
794    }
795    match BASE64_ENGINE.decode_inplace(&mut data[..wp]) {
796        Ok(decoded) => out.write_all(decoded),
797        Err(_) => decode_error(),
798    }
799}
800
801/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
802pub fn decode_owned(
803    data: &mut Vec<u8>,
804    ignore_garbage: bool,
805    out: &mut impl Write,
806) -> io::Result<()> {
807    if data.is_empty() {
808        return Ok(());
809    }
810
811    if ignore_garbage {
812        data.retain(|&b| is_base64_char(b));
813    } else {
814        strip_whitespace_inplace(data);
815    }
816
817    decode_clean_slice(data, out)
818}
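
// Sketch of the owned-buffer entry point, e.g. after slurping a whole file into a
// Vec ("input.b64" is a hypothetical path used only for illustration):
//
//     let mut data: Vec<u8> = std::fs::read("input.b64")?;
//     decode_owned(&mut data, false, &mut io::stdout().lock())?;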
819
820/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
821/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
822/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
823/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
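/// Illustration: "QUJD\nREVG\n" has newlines at positions 4 and 9; the gaps "QUJD"
/// and "REVG" are memmoved left to yield "QUJDREVG" and the Vec is truncated to 8 bytes.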
824fn strip_whitespace_inplace(data: &mut Vec<u8>) {
825    // Quick check: skip stripping if no \n or \r in the data.
826    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
827    // For typical base64 (76-char lines), we'll find \n immediately and skip this.
828    if memchr::memchr2(b'\n', b'\r', data).is_none() {
829        // No newlines/CR — check for rare whitespace only
830        if data
831            .iter()
832            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
833        {
834            data.retain(|&b| NOT_WHITESPACE[b as usize]);
835        }
836        return;
837    }
838
839    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
840    // gaps between them to compact the data in-place. For typical base64 streams,
841    // newlines are the only whitespace, so this handles >99% of cases.
842    let ptr = data.as_mut_ptr();
843    let len = data.len();
844    let mut wp = 0usize;
845    let mut gap_start = 0usize;
846    let mut has_rare_ws = false;
847
848    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
849        let gap_len = pos - gap_start;
850        if gap_len > 0 {
851            if !has_rare_ws {
852                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
853                has_rare_ws = data[gap_start..pos]
854                    .iter()
855                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
856            }
857            if wp != gap_start {
858                unsafe {
859                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
860                }
861            }
862            wp += gap_len;
863        }
864        gap_start = pos + 1;
865    }
866    // Copy the final gap
867    let tail_len = len - gap_start;
868    if tail_len > 0 {
869        if !has_rare_ws {
870            has_rare_ws = data[gap_start..]
871                .iter()
872                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
873        }
874        if wp != gap_start {
875            unsafe {
876                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
877            }
878        }
879        wp += tail_len;
880    }
881
882    data.truncate(wp);
883
884    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
885    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
886    if has_rare_ws {
887        let ptr = data.as_mut_ptr();
888        let len = data.len();
889        let mut rp = 0;
890        let mut cwp = 0;
891        while rp < len {
892            let b = unsafe { *ptr.add(rp) };
893            if NOT_WHITESPACE[b as usize] {
894                unsafe { *ptr.add(cwp) = b };
895                cwp += 1;
896            }
897            rp += 1;
898        }
899        data.truncate(cwp);
900    }
901}
902
903/// 256-byte lookup table: true for non-whitespace bytes.
904/// Used for single-pass whitespace stripping in decode.
905static NOT_WHITESPACE: [bool; 256] = {
906    let mut table = [true; 256];
907    table[b' ' as usize] = false;
908    table[b'\t' as usize] = false;
909    table[b'\n' as usize] = false;
910    table[b'\r' as usize] = false;
911    table[0x0b] = false; // vertical tab
912    table[0x0c] = false; // form feed
913    table
914};
915
916/// Decode by stripping whitespace and decoding in a single fused pass.
917/// For data with no whitespace, decodes directly without any copy.
918/// Uses memchr2 SIMD gap-copy for \n/\r (the dominant whitespace in base64),
919/// then a conditional fallback pass for rare whitespace types (tab, space, VT, FF).
920/// Tracks rare whitespace presence during the gap-copy to skip the second scan
921/// entirely in the common case (pure \n/\r whitespace only).
922fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
923    // Quick check: skip stripping if no \n or \r in the data.
924    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
925    if memchr::memchr2(b'\n', b'\r', data).is_none() {
926        // No newlines/CR — check for rare whitespace only
927        if !data
928            .iter()
929            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
930        {
931            return decode_borrowed_clean(out, data);
932        }
933        // Has rare whitespace only — strip and decode
934        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
935        for &b in data {
936            if NOT_WHITESPACE[b as usize] {
937                cleaned.push(b);
938            }
939        }
940        return decode_clean_slice(&mut cleaned, out);
941    }
942
943    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
944    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
945    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
946    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
947    let dst = clean.as_mut_ptr();
948    let mut wp = 0usize;
949    let mut gap_start = 0usize;
950    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
951    // This avoids the second full-scan pass when only \n/\r are present.
952    let mut has_rare_ws = false;
953
954    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
955        let gap_len = pos - gap_start;
956        if gap_len > 0 {
957            // Check gap region for rare whitespace during copy.
958            // This adds ~1 branch per gap but eliminates the second full scan.
959            if !has_rare_ws {
960                has_rare_ws = data[gap_start..pos]
961                    .iter()
962                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
963            }
964            unsafe {
965                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
966            }
967            wp += gap_len;
968        }
969        gap_start = pos + 1;
970    }
971    // Copy the final gap after the last \n/\r
972    let tail_len = data.len() - gap_start;
973    if tail_len > 0 {
974        if !has_rare_ws {
975            has_rare_ws = data[gap_start..]
976                .iter()
977                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
978        }
979        unsafe {
980            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
981        }
982        wp += tail_len;
983    }
984    unsafe {
985        clean.set_len(wp);
986    }
987
988    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
989    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
990    if has_rare_ws {
991        let ptr = clean.as_mut_ptr();
992        let len = clean.len();
993        let mut rp = 0;
994        let mut cwp = 0;
995        while rp < len {
996            let b = unsafe { *ptr.add(rp) };
997            if NOT_WHITESPACE[b as usize] {
998                unsafe { *ptr.add(cwp) = b };
999                cwp += 1;
1000            }
1001            rp += 1;
1002        }
1003        clean.truncate(cwp);
1004    }
1005
1006    // For large data (>= threshold), use parallel decode for multi-core speedup.
1007    // For small data, use in-place decode to avoid extra allocation.
1008    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
1009        decode_borrowed_clean_parallel(out, &clean)
1010    } else {
1011        decode_clean_slice(&mut clean, out)
1012    }
1013}
1014
1015/// Try to decode base64 data line-by-line, avoiding whitespace stripping.
1016/// Returns Some(result) if the data has uniform line lengths suitable for
1017/// per-line decode, or None if the data doesn't fit this pattern.
1018///
1019/// For standard 76-char-line base64 (wrap=76): each line is 76 encoded chars
1020/// + newline = 77 bytes. 76 chars = 19 groups of 4 = 57 decoded bytes per line.
1021/// We decode each line directly into its position in the output buffer.
1022fn try_line_decode(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
1023    // Find the first newline to determine line length
1024    let first_nl = memchr::memchr(b'\n', data)?;
1025    let line_len = first_nl; // encoded chars per line (without newline)
1026
1027    // Line length must be a multiple of 4 (complete base64 groups, no padding mid-stream)
1028    if line_len == 0 || line_len % 4 != 0 {
1029        return None;
1030    }
1031
1032    let line_stride = line_len + 1; // line_len chars + 1 newline byte
1033    let decoded_per_line = line_len * 3 / 4;
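    // e.g. GNU default wrapping: line_len = 76, line_stride = 77, decoded_per_line = 57;
    // line i occupies input bytes [i*77, i*77 + 76) and decodes into output bytes
    // [i*57, (i+1)*57).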
1034
1035    // Verify the data has a consistent line structure by checking the next few lines
1036    let check_lines = 4.min(data.len() / line_stride);
1037    for i in 1..check_lines {
1038        let expected_nl = i * line_stride - 1;
1039        if expected_nl >= data.len() {
1040            break;
1041        }
1042        if data[expected_nl] != b'\n' {
1043            return None; // Inconsistent line length
1044        }
1045    }
1046
1047    // Calculate full lines and remainder
1048    let full_lines = if data.len() >= line_stride {
1049        // Check how many complete lines fit
1050        let candidate = data.len() / line_stride;
1051        // Verify the last full line's newline
1052        if candidate > 0 && data[candidate * line_stride - 1] != b'\n' {
1053            return None; // Not a clean line-structured file
1054        }
1055        candidate
1056    } else {
1057        0
1058    };
1059
1060    let remainder_start = full_lines * line_stride;
1061    let remainder = &data[remainder_start..];
1062
1063    // Calculate exact output size
1064    let remainder_clean_len = if remainder.is_empty() {
1065        0
1066    } else {
1067        // Remainder might end with newline, strip it
1068        let rem = if remainder.last() == Some(&b'\n') {
1069            &remainder[..remainder.len() - 1]
1070        } else {
1071            remainder
1072        };
1073        if rem.is_empty() {
1074            0
1075        } else {
1076            // Check for padding
1077            let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
1078            if rem.len() % 4 != 0 {
1079                return None; // Invalid remainder
1080            }
1081            rem.len() * 3 / 4 - pad
1082        }
1083    };
1084
1085    // Single-allocation decode: allocate full decoded output, decode all lines
1086    // directly into it, then write_all in one syscall. For 10MB base64 (7.5MB decoded),
1087    // this does 1 write() instead of ~30 chunked writes. The 7.5MB allocation is trivial
1088    // compared to the mmap'd input. SIMD decode at ~8 GB/s finishes in <1ms.
1089    let total_decoded = full_lines * decoded_per_line + remainder_clean_len;
1090    let mut out_buf: Vec<u8> = Vec::with_capacity(total_decoded);
1091    #[allow(clippy::uninit_vec)]
1092    unsafe {
1093        out_buf.set_len(total_decoded);
1094    }
1095
1096    let dst = out_buf.as_mut_ptr();
1097
1098    // Parallel line decode for large inputs (>= PARALLEL_DECODE_THRESHOLD): split lines across threads.
1099    // Each thread decodes a contiguous block of lines directly to its final position
1100    // in the shared output buffer. SAFETY: non-overlapping output regions per thread.
1101    if data.len() >= PARALLEL_DECODE_THRESHOLD && full_lines >= 64 {
1102        let out_addr = dst as usize;
1103        let num_threads = rayon::current_num_threads().max(1);
1104        let lines_per_chunk = (full_lines / num_threads).max(1);
1105
1106        // Build per-thread task ranges: (start_line, end_line)
1107        let mut tasks: Vec<(usize, usize)> = Vec::new();
1108        let mut line_off = 0;
1109        while line_off < full_lines {
1110            let end = (line_off + lines_per_chunk).min(full_lines);
1111            tasks.push((line_off, end));
1112            line_off = end;
1113        }
1114
1115        let decode_result: Result<Vec<()>, io::Error> = tasks
1116            .par_iter()
1117            .map(|&(start_line, end_line)| {
1118                let out_ptr = out_addr as *mut u8;
1119                let mut i = start_line;
1120
1121                // 4x unrolled decode within each thread's range
1122                while i + 4 <= end_line {
1123                    let in_base = i * line_stride;
1124                    let ob = i * decoded_per_line;
1125                    unsafe {
1126                        let s0 = std::slice::from_raw_parts_mut(out_ptr.add(ob), decoded_per_line);
1127                        if BASE64_ENGINE
1128                            .decode(&data[in_base..in_base + line_len], s0.as_out())
1129                            .is_err()
1130                        {
1131                            return Err(io::Error::new(
1132                                io::ErrorKind::InvalidData,
1133                                "invalid input",
1134                            ));
1135                        }
1136                        let s1 = std::slice::from_raw_parts_mut(
1137                            out_ptr.add(ob + decoded_per_line),
1138                            decoded_per_line,
1139                        );
1140                        if BASE64_ENGINE
1141                            .decode(
1142                                &data[in_base + line_stride..in_base + line_stride + line_len],
1143                                s1.as_out(),
1144                            )
1145                            .is_err()
1146                        {
1147                            return Err(io::Error::new(
1148                                io::ErrorKind::InvalidData,
1149                                "invalid input",
1150                            ));
1151                        }
1152                        let s2 = std::slice::from_raw_parts_mut(
1153                            out_ptr.add(ob + 2 * decoded_per_line),
1154                            decoded_per_line,
1155                        );
1156                        if BASE64_ENGINE
1157                            .decode(
1158                                &data[in_base + 2 * line_stride
1159                                    ..in_base + 2 * line_stride + line_len],
1160                                s2.as_out(),
1161                            )
1162                            .is_err()
1163                        {
1164                            return Err(io::Error::new(
1165                                io::ErrorKind::InvalidData,
1166                                "invalid input",
1167                            ));
1168                        }
1169                        let s3 = std::slice::from_raw_parts_mut(
1170                            out_ptr.add(ob + 3 * decoded_per_line),
1171                            decoded_per_line,
1172                        );
1173                        if BASE64_ENGINE
1174                            .decode(
1175                                &data[in_base + 3 * line_stride
1176                                    ..in_base + 3 * line_stride + line_len],
1177                                s3.as_out(),
1178                            )
1179                            .is_err()
1180                        {
1181                            return Err(io::Error::new(
1182                                io::ErrorKind::InvalidData,
1183                                "invalid input",
1184                            ));
1185                        }
1186                    }
1187                    i += 4;
1188                }
1189
1190                while i < end_line {
1191                    let in_start = i * line_stride;
1192                    let out_off = i * decoded_per_line;
1193                    let out_slice = unsafe {
1194                        std::slice::from_raw_parts_mut(out_ptr.add(out_off), decoded_per_line)
1195                    };
1196                    if BASE64_ENGINE
1197                        .decode(&data[in_start..in_start + line_len], out_slice.as_out())
1198                        .is_err()
1199                    {
1200                        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"));
1201                    }
1202                    i += 1;
1203                }
1204
1205                Ok(())
1206            })
1207            .collect();
1208
1209        if decode_result.is_err() {
1210            return Some(decode_error());
1211        }
1212    } else {
1213        // Sequential decode with 4x unrolling for smaller inputs
1214        let mut i = 0;
1215
1216        while i + 4 <= full_lines {
1217            let in_base = i * line_stride;
1218            let out_base = i * decoded_per_line;
1219            unsafe {
1220                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), decoded_per_line);
1221                if BASE64_ENGINE
1222                    .decode(&data[in_base..in_base + line_len], s0.as_out())
1223                    .is_err()
1224                {
1225                    return Some(decode_error());
1226                }
1227
1228                let s1 = std::slice::from_raw_parts_mut(
1229                    dst.add(out_base + decoded_per_line),
1230                    decoded_per_line,
1231                );
1232                if BASE64_ENGINE
1233                    .decode(
1234                        &data[in_base + line_stride..in_base + line_stride + line_len],
1235                        s1.as_out(),
1236                    )
1237                    .is_err()
1238                {
1239                    return Some(decode_error());
1240                }
1241
1242                let s2 = std::slice::from_raw_parts_mut(
1243                    dst.add(out_base + 2 * decoded_per_line),
1244                    decoded_per_line,
1245                );
1246                if BASE64_ENGINE
1247                    .decode(
1248                        &data[in_base + 2 * line_stride..in_base + 2 * line_stride + line_len],
1249                        s2.as_out(),
1250                    )
1251                    .is_err()
1252                {
1253                    return Some(decode_error());
1254                }
1255
1256                let s3 = std::slice::from_raw_parts_mut(
1257                    dst.add(out_base + 3 * decoded_per_line),
1258                    decoded_per_line,
1259                );
1260                if BASE64_ENGINE
1261                    .decode(
1262                        &data[in_base + 3 * line_stride..in_base + 3 * line_stride + line_len],
1263                        s3.as_out(),
1264                    )
1265                    .is_err()
1266                {
1267                    return Some(decode_error());
1268                }
1269            }
1270            i += 4;
1271        }
1272
1273        while i < full_lines {
1274            let in_start = i * line_stride;
1275            let in_end = in_start + line_len;
1276            let out_off = i * decoded_per_line;
1277            let out_slice =
1278                unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), decoded_per_line) };
1279            match BASE64_ENGINE.decode(&data[in_start..in_end], out_slice.as_out()) {
1280                Ok(_) => {}
1281                Err(_) => return Some(decode_error()),
1282            }
1283            i += 1;
1284        }
1285    }
1286
1287    // Decode remainder
1288    if remainder_clean_len > 0 {
1289        let rem = if remainder.last() == Some(&b'\n') {
1290            &remainder[..remainder.len() - 1]
1291        } else {
1292            remainder
1293        };
1294        let out_off = full_lines * decoded_per_line;
1295        let out_slice =
1296            unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), remainder_clean_len) };
1297        match BASE64_ENGINE.decode(rem, out_slice.as_out()) {
1298            Ok(_) => {}
1299            Err(_) => return Some(decode_error()),
1300        }
1301    }
1302
1303    // Single write_all for the entire decoded output
1304    Some(out.write_all(&out_buf[..total_decoded]))
1305}
1306
1307/// Decode a clean (no whitespace) buffer in-place with SIMD.
1308fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
1309    if data.is_empty() {
1310        return Ok(());
1311    }
1312    match BASE64_ENGINE.decode_inplace(data) {
1313        Ok(decoded) => out.write_all(decoded),
1314        Err(_) => decode_error(),
1315    }
1316}
1317
1318/// Cold error path — keeps hot decode path tight by moving error construction out of line.
1319#[cold]
1320#[inline(never)]
1321fn decode_error() -> io::Result<()> {
1322    Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
1323}
1324
1325/// Decode clean base64 data (no whitespace) from a borrowed slice.
1326fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1327    if data.is_empty() {
1328        return Ok(());
1329    }
1330    // Parallel decode for large data: split at 4-byte boundaries,
1331    // decode each chunk independently (base64 is context-free per 4-char group).
1332    if data.len() >= PARALLEL_DECODE_THRESHOLD {
1333        return decode_borrowed_clean_parallel(out, data);
1334    }
1335    // Pre-allocate exact output size to avoid decode_to_vec's reallocation.
1336    // Decoded size = data.len() * 3 / 4 minus padding.
1337    let pad = data.iter().rev().take(2).filter(|&&b| b == b'=').count();
1338    let decoded_size = data.len() * 3 / 4 - pad;
1339    let mut buf: Vec<u8> = Vec::with_capacity(decoded_size);
1340    #[allow(clippy::uninit_vec)]
1341    unsafe {
1342        buf.set_len(decoded_size);
1343    }
1344    match BASE64_ENGINE.decode(data, buf[..decoded_size].as_out()) {
1345        Ok(decoded) => {
1346            out.write_all(decoded)?;
1347            Ok(())
1348        }
1349        Err(_) => decode_error(),
1350    }
1351}
1352
1353/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel via rayon.
1354/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
1355/// upfront, so each thread decodes directly to its final position. No compaction needed.
1356fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
1357    let num_threads = rayon::current_num_threads().max(1);
1358    let raw_chunk = data.len() / num_threads;
1359    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
1360    let chunk_size = ((raw_chunk + 3) / 4) * 4;
1361
1362    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();
1363
1364    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
1365    // For all chunks except the last, decoded size is exactly chunk.len() * 3 / 4.
1366    // For the last chunk, account for '=' padding bytes.
1367    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
1368    offsets.push(0);
1369    let mut total_decoded = 0usize;
1370    for (i, chunk) in chunks.iter().enumerate() {
1371        let decoded_size = if i == chunks.len() - 1 {
1372            // Last chunk: count '=' padding to get exact decoded size
1373            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
1374            chunk.len() * 3 / 4 - pad
1375        } else {
1376            // Non-last chunks: 4-byte aligned, no padding, exact 3/4 ratio
1377            chunk.len() * 3 / 4
1378        };
1379        total_decoded += decoded_size;
1380        offsets.push(total_decoded);
1381    }
1382
1383    // Pre-allocate contiguous output buffer with exact total size
1384    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
1385    #[allow(clippy::uninit_vec)]
1386    unsafe {
1387        output_buf.set_len(total_decoded);
1388    }
1389
1390    // Parallel decode: each thread decodes directly into its exact final position.
1391    // No compaction pass needed since offsets are computed from exact decoded sizes.
1392    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
1393    // Use usize representation of the pointer for Send+Sync compatibility with rayon.
1394    let out_addr = output_buf.as_mut_ptr() as usize;
1395    let decode_result: Result<Vec<()>, io::Error> = chunks
1396        .par_iter()
1397        .enumerate()
1398        .map(|(i, chunk)| {
1399            let offset = offsets[i];
1400            let expected_size = offsets[i + 1] - offset;
1401            // SAFETY: each thread writes to non-overlapping region [offset..offset+expected_size]
1402            let out_slice = unsafe {
1403                std::slice::from_raw_parts_mut((out_addr as *mut u8).add(offset), expected_size)
1404            };
1405            let decoded = BASE64_ENGINE
1406                .decode(chunk, out_slice.as_out())
1407                .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "invalid input"))?;
1408            debug_assert_eq!(decoded.len(), expected_size);
1409            Ok(())
1410        })
1411        .collect();
1412
1413    decode_result?;
1414
1415    out.write_all(&output_buf[..total_decoded])
1416}
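
// A minimal sketch of the context-free property the 4-byte split relies on (hypothetical
// test module, not part of the original file; it uses the crate's decode_to_vec helper
// mentioned above): decoding aligned pieces separately and concatenating them matches
// decoding the whole buffer at once.
#[cfg(test)]
mod parallel_split_sketch {
    use super::BASE64_ENGINE;

    #[test]
    fn four_char_groups_decode_independently() {
        let full = b"Zm9vYmFyYmF6"; // "foobarbaz": three 4-char groups
        let whole = BASE64_ENGINE.decode_to_vec(full).unwrap();
        let mut pieces = BASE64_ENGINE.decode_to_vec(&full[..8]).unwrap();
        pieces.extend(BASE64_ENGINE.decode_to_vec(&full[8..]).unwrap());
        assert_eq!(whole, pieces);
    }
}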
1417
1418/// Strip non-base64 characters (for -i / --ignore-garbage).
1419fn strip_non_base64(data: &[u8]) -> Vec<u8> {
1420    data.iter()
1421        .copied()
1422        .filter(|&b| is_base64_char(b))
1423        .collect()
1424}
1425
1426/// Check if a byte is a valid base64 alphabet character or padding.
1427#[inline]
1428fn is_base64_char(b: u8) -> bool {
1429    b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'='
1430}
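
// A tiny sketch of the --ignore-garbage filter above (hypothetical test module, not part
// of the original file): only the base64 alphabet plus '=' survives, so embedded
// punctuation and line endings both disappear.
#[cfg(test)]
mod ignore_garbage_sketch {
    use super::strip_non_base64;

    #[test]
    fn drops_everything_outside_the_alphabet() {
        assert_eq!(strip_non_base64(b"Zm9v|YmFy\r\n"), b"Zm9vYmFy".to_vec());
    }
}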
1431
1432/// Stream-encode from a reader to a writer. Used for stdin processing.
1433/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
1434pub fn encode_stream(
1435    reader: &mut impl Read,
1436    wrap_col: usize,
1437    writer: &mut impl Write,
1438) -> io::Result<()> {
1439    if wrap_col == 0 {
1440        return encode_stream_nowrap(reader, writer);
1441    }
1442    encode_stream_wrapped(reader, wrap_col, writer)
1443}
1444
1445/// Streaming encode with NO line wrapping — optimized fast path.
1446/// Read size is 24MB (divisible by 3): encoded output = 24MB * 4/3 = 32MB.
1447/// A 24MB read means typical 10-18MB inputs are consumed in a single read() call,
1448/// and the encoded output goes out in one or two write() calls.
1449fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
1450    // 24MB read chunk, divisible by 3, so padding only ever appears on the final partial chunk.
1451    const NOWRAP_READ: usize = 24 * 1024 * 1024; // exactly divisible by 3
1452
1453    // SAFETY: buf bytes are written by read_full before being processed.
1454    // encode_buf bytes are written by encode before being read.
1455    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
1456    #[allow(clippy::uninit_vec)]
1457    unsafe {
1458        buf.set_len(NOWRAP_READ);
1459    }
1460    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
1461    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1462    #[allow(clippy::uninit_vec)]
1463    unsafe {
1464        encode_buf.set_len(encode_buf_size);
1465    }
1466
1467    loop {
1468        let n = read_full(reader, &mut buf)?;
1469        if n == 0 {
1470            break;
1471        }
1472        let enc_len = BASE64_ENGINE.encoded_length(n);
1473        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1474        writer.write_all(encoded)?;
1475    }
1476    Ok(())
1477}
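
// A quick sanity check of the sizing above (hypothetical test module, not part of the
// original file): a 3-byte-aligned 24MB read encodes to exactly 32MB, so each chunk fits
// the pre-sized encode buffer with no reallocation.
#[cfg(test)]
mod nowrap_sizing_sketch {
    use super::BASE64_ENGINE;

    #[test]
    fn read_chunk_encodes_to_exactly_32mb() {
        let n = 24 * 1024 * 1024;
        assert_eq!(n % 3, 0);
        assert_eq!(BASE64_ENGINE.encoded_length(n), 32 * 1024 * 1024);
    }
}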
1478
1479/// Streaming encode WITH line wrapping.
1480/// For the common case (wrap_col divides evenly into 3-byte input groups),
1481/// uses fuse_wrap to build a contiguous output buffer with newlines interleaved,
1482/// then writes it in a single write() call. This eliminates the overhead of
1483/// many writev() syscalls (one per ~512 lines via IoSlice).
1484///
1485/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
1486fn encode_stream_wrapped(
1487    reader: &mut impl Read,
1488    wrap_col: usize,
1489    writer: &mut impl Write,
1490) -> io::Result<()> {
1491    let bytes_per_line = wrap_col * 3 / 4;
1492    // For the common case (76-col wrapping, bytes_per_line=57 which is divisible by 3),
1493    // align the read buffer to bytes_per_line boundaries so each chunk produces
1494    // complete lines with no column carry-over between chunks.
1495    if bytes_per_line > 0 && bytes_per_line.is_multiple_of(3) {
1496        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
1497    }
1498
1499    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
1500    const STREAM_READ: usize = 12 * 1024 * 1024;
1501    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
1502    #[allow(clippy::uninit_vec)]
1503    unsafe {
1504        buf.set_len(STREAM_READ);
1505    }
1506    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
1507    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
1508    #[allow(clippy::uninit_vec)]
1509    unsafe {
1510        encode_buf.set_len(encode_buf_size);
1511    }
1512
1513    let mut col = 0usize;
1514
1515    loop {
1516        let n = read_full(reader, &mut buf)?;
1517        if n == 0 {
1518            break;
1519        }
1520        let enc_len = BASE64_ENGINE.encoded_length(n);
1521        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
1522
1523        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
1524    }
1525
1526    if col > 0 {
1527        writer.write_all(b"\n")?;
1528    }
1529
1530    Ok(())
1531}
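
// A small sketch of the dispatch rule above (hypothetical test module, not part of the
// original file): the fused path needs bytes_per_line to be a positive multiple of 3,
// which holds for the GNU default of 76 columns but not for every wrap width.
#[cfg(test)]
mod wrap_dispatch_sketch {
    #[test]
    fn which_columns_take_the_fused_path() {
        let takes_fused_path = |wrap_col: usize| {
            let bytes_per_line = wrap_col * 3 / 4;
            bytes_per_line > 0 && bytes_per_line % 3 == 0
        };
        assert!(takes_fused_path(76)); // 57 input bytes per line
        assert!(takes_fused_path(64)); // 48 input bytes per line
        assert!(!takes_fused_path(10)); // 7 bytes per line -> writev fallback
    }
}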
1532
1533/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
1534/// encode each line directly into its final position with newline appended.
1535/// Eliminates the two-pass encode-then-fuse_wrap approach.
1536/// For 76-col wrapping (bytes_per_line=57): 24MB / 57 = ~440K complete lines per chunk.
1537/// Output = ~440K * 77 bytes = ~32MB, one write() syscall per chunk.
1538fn encode_stream_wrapped_fused(
1539    reader: &mut impl Read,
1540    wrap_col: usize,
1541    bytes_per_line: usize,
1542    writer: &mut impl Write,
1543) -> io::Result<()> {
1544    // Align read size to bytes_per_line for complete output lines per chunk.
1545    // ~440K lines * 57 bytes = ~24MB input, ~32MB output.
1546    let lines_per_chunk = (24 * 1024 * 1024) / bytes_per_line;
1547    let read_size = lines_per_chunk * bytes_per_line;
1548    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline
1549
1550    // SAFETY: buf bytes are written by read_full before being processed.
1551    // out_buf bytes are written by encode before being read.
1552    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
1553    #[allow(clippy::uninit_vec)]
1554    unsafe {
1555        buf.set_len(read_size);
1556    }
1557    // Output buffer: enough for all lines + remainder
1558    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
1559    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
1560    #[allow(clippy::uninit_vec)]
1561    unsafe {
1562        out_buf.set_len(max_output);
1563    }
1564
1565    loop {
1566        let n = read_full(reader, &mut buf)?;
1567        if n == 0 {
1568            break;
1569        }
1570
1571        let full_lines = n / bytes_per_line;
1572        let remainder = n % bytes_per_line;
1573
1574        // Encode each input line directly into its final output position.
1575        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
1576        // This eliminates the separate encode + fuse_wrap copy entirely.
1577        let dst = out_buf.as_mut_ptr();
1578        let mut line_idx = 0;
1579
1580        // 4-line unrolled loop for better ILP
1581        while line_idx + 4 <= full_lines {
1582            let in_base = line_idx * bytes_per_line;
1583            let out_base = line_idx * line_out;
1584            unsafe {
1585                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
1586                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
1587                *dst.add(out_base + wrap_col) = b'\n';
1588
1589                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
1590                let _ = BASE64_ENGINE.encode(
1591                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
1592                    s1.as_out(),
1593                );
1594                *dst.add(out_base + line_out + wrap_col) = b'\n';
1595
1596                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
1597                let _ = BASE64_ENGINE.encode(
1598                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
1599                    s2.as_out(),
1600                );
1601                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';
1602
1603                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
1604                let _ = BASE64_ENGINE.encode(
1605                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
1606                    s3.as_out(),
1607                );
1608                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
1609            }
1610            line_idx += 4;
1611        }
1612
1613        // Remaining full lines
1614        while line_idx < full_lines {
1615            let in_base = line_idx * bytes_per_line;
1616            let out_base = line_idx * line_out;
1617            unsafe {
1618                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
1619                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
1620                *dst.add(out_base + wrap_col) = b'\n';
1621            }
1622            line_idx += 1;
1623        }
1624
1625        let mut wp = full_lines * line_out;
1626
1627        // Handle remainder (partial last line of this chunk)
1628        if remainder > 0 {
1629            let enc_len = BASE64_ENGINE.encoded_length(remainder);
1630            let line_input = &buf[full_lines * bytes_per_line..n];
1631            unsafe {
1632                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
1633                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
1634                *dst.add(wp + enc_len) = b'\n';
1635            }
1636            wp += enc_len + 1;
1637        }
1638
1639        writer.write_all(&out_buf[..wp])?;
1640    }
1641
1642    Ok(())
1643}
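
// A behavioural sketch of the layout above (hypothetical test module, not part of the
// original file; it assumes the crate's encode_to_string convenience helper): encoding
// each 57-byte input line independently and appending '\n' produces the same bytes as
// encoding everything first and wrapping at 76 columns, which is what lets each line be
// written straight to offset line_idx * 77.
#[cfg(test)]
mod fused_layout_sketch {
    use super::BASE64_ENGINE;

    #[test]
    fn per_line_encoding_matches_wrap_after_encode() {
        let input: Vec<u8> = (0u8..114).collect(); // two full 57-byte lines
        let mut fused = String::new();
        for line in input.chunks(57) {
            fused.push_str(&BASE64_ENGINE.encode_to_string(line));
            fused.push('\n');
        }
        // Reference: encode the whole buffer, then insert a newline every 76 chars.
        let encoded = BASE64_ENGINE.encode_to_string(&input);
        let mut reference = String::new();
        for line in encoded.as_bytes().chunks(76) {
            reference.push_str(std::str::from_utf8(line).unwrap());
            reference.push('\n');
        }
        assert_eq!(fused, reference);
    }
}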
1644
1645/// Stream-decode from a reader to a writer. Used for stdin processing.
1646/// In-place strip + decode: read chunk -> strip whitespace in-place in read buffer
1647/// -> decode in-place -> write. Eliminates separate clean buffer allocation (saves 32MB).
1648/// Uses 32MB read buffer for maximum pipe throughput — read_full retries to
1649/// fill the entire buffer from the pipe, and 32MB means even large inputs
1650/// (up to ~24MB after base64 encoding of 18MB raw) are read in a single syscall batch.
1651pub fn decode_stream(
1652    reader: &mut impl Read,
1653    ignore_garbage: bool,
1654    writer: &mut impl Write,
1655) -> io::Result<()> {
1656    const READ_CHUNK: usize = 32 * 1024 * 1024;
1657    // SAFETY: buf bytes are written by read_full before being processed.
1658    // The extra 4 bytes hold the carry-over (at most 3 bytes of an incomplete quadruplet).
1659    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
1660    #[allow(clippy::uninit_vec)]
1661    unsafe {
1662        buf.set_len(READ_CHUNK + 4);
1663    }
1664    let mut carry = [0u8; 4];
1665    let mut carry_len = 0usize;
1666
1667    loop {
1668        // Copy carry bytes to start of buffer, read new data after them
1669        if carry_len > 0 {
1670            unsafe {
1671                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
1672            }
1673        }
1674        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
1675        if n == 0 {
1676            break;
1677        }
1678        let total_raw = carry_len + n;
1679
1680        // Strip whitespace in-place in the buffer itself.
1681        // This eliminates the separate clean buffer allocation (saves 32MB).
1682        let clean_len = if ignore_garbage {
1683            // Scalar filter for ignore_garbage mode (rare path)
1684            let ptr = buf.as_mut_ptr();
1685            let mut wp = 0usize;
1686            for i in 0..total_raw {
1687                let b = unsafe { *ptr.add(i) };
1688                if is_base64_char(b) {
1689                    unsafe { *ptr.add(wp) = b };
1690                    wp += 1;
1691                }
1692            }
1693            wp
1694        } else {
1695            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
1696            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
1697            // so we process ~76 bytes per memchr hit.
1698            let ptr = buf.as_mut_ptr();
1699            let data = &buf[..total_raw];
1700            let mut wp = 0usize;
1701            let mut gap_start = 0usize;
1702            let mut has_rare_ws = false;
1703
1704            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
1705                let gap_len = pos - gap_start;
1706                if gap_len > 0 {
1707                    if !has_rare_ws {
1708                        has_rare_ws = data[gap_start..pos]
1709                            .iter()
1710                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1711                    }
1712                    if wp != gap_start {
1713                        unsafe {
1714                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
1715                        }
1716                    }
1717                    wp += gap_len;
1718                }
1719                gap_start = pos + 1;
1720            }
1721            let tail_len = total_raw - gap_start;
1722            if tail_len > 0 {
1723                if !has_rare_ws {
1724                    has_rare_ws = data[gap_start..total_raw]
1725                        .iter()
1726                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
1727                }
1728                if wp != gap_start {
1729                    unsafe {
1730                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
1731                    }
1732                }
1733                wp += tail_len;
1734            }
1735
1736            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
1737            if has_rare_ws {
1738                let mut rp = 0;
1739                let mut cwp = 0;
1740                while rp < wp {
1741                    let b = unsafe { *ptr.add(rp) };
1742                    if NOT_WHITESPACE[b as usize] {
1743                        unsafe { *ptr.add(cwp) = b };
1744                        cwp += 1;
1745                    }
1746                    rp += 1;
1747                }
1748                cwp
1749            } else {
1750                wp
1751            }
1752        };
1753
1754        carry_len = 0;
1755        let is_last = n < READ_CHUNK;
1756
1757        if is_last {
1758            // Last chunk: decode everything (including padding)
1759            decode_clean_slice(&mut buf[..clean_len], writer)?;
1760        } else {
1761            // Save incomplete base64 quadruplet for next iteration
1762            let decode_len = (clean_len / 4) * 4;
1763            let leftover = clean_len - decode_len;
1764            if leftover > 0 {
1765                unsafe {
1766                    std::ptr::copy_nonoverlapping(
1767                        buf.as_ptr().add(decode_len),
1768                        carry.as_mut_ptr(),
1769                        leftover,
1770                    );
1771                }
1772                carry_len = leftover;
1773            }
1774            if decode_len > 0 {
1775                decode_clean_slice(&mut buf[..decode_len], writer)?;
1776            }
1777        }
1778    }
1779
1780    // Handle any remaining carry-over bytes
1781    if carry_len > 0 {
1782        let mut carry_buf = carry[..carry_len].to_vec();
1783        decode_clean_slice(&mut carry_buf, writer)?;
1784    }
1785
1786    Ok(())
1787}
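
// A sketch of the carry logic above (hypothetical test module, not part of the original
// file; it uses the crate's decode_to_vec helper): when a chunk ends mid-quadruplet,
// decoding only the 4-aligned prefix now and prepending the leftover characters to the
// next chunk yields the same output as decoding the whole stream at once.
#[cfg(test)]
mod decode_carry_sketch {
    use super::BASE64_ENGINE;

    #[test]
    fn split_mid_quadruplet_round_trips() {
        let full = b"Zm9vYmFyYmF6cXV4"; // "foobarbazqux"
        let (chunk, rest) = full.split_at(10); // first "read" ends 2 chars into a quadruplet
        let decode_len = (chunk.len() / 4) * 4; // decode only the aligned prefix now
        let mut out = BASE64_ENGINE.decode_to_vec(&chunk[..decode_len]).unwrap();
        let mut next = chunk[decode_len..].to_vec(); // 2-byte carry
        next.extend_from_slice(rest);
        out.extend(BASE64_ENGINE.decode_to_vec(&next).unwrap());
        assert_eq!(out, BASE64_ENGINE.decode_to_vec(full).unwrap());
    }
}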
1788
1789/// Write all IoSlice entries using write_vectored (writev syscall).
1790/// Falls back to write_all per slice on partial writes.
1791fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
1792    if slices.is_empty() {
1793        return Ok(());
1794    }
1795    let total: usize = slices.iter().map(|s| s.len()).sum();
1796
1797    // Try write_vectored first — often writes everything in one syscall
1798    let written = match out.write_vectored(slices) {
1799        Ok(n) if n >= total => return Ok(()),
1800        Ok(n) => n,
        // Treat EINTR like a zero-byte write; the write_all fallback below retries.
        Err(e) if e.kind() == io::ErrorKind::Interrupted => 0,
1801        Err(e) => return Err(e),
1802    };
1803
1804    // Partial write fallback
1805    let mut skip = written;
1806    for slice in slices {
1807        let slen = slice.len();
1808        if skip >= slen {
1809            skip -= slen;
1810            continue;
1811        }
1812        if skip > 0 {
1813            out.write_all(&slice[skip..])?;
1814            skip = 0;
1815        } else {
1816            out.write_all(slice)?;
1817        }
1818    }
1819    Ok(())
1820}
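
// A fallback sketch (hypothetical test module, not part of the original file): a writer
// that accepts at most 4 bytes per call still receives every byte, exercising the
// partial-write path above.
#[cfg(test)]
mod writev_fallback_sketch {
    use super::write_all_vectored;
    use std::io::{self, IoSlice, Write};

    /// Toy writer that only ever takes 4 bytes at a time.
    struct Trickle(Vec<u8>);

    impl Write for Trickle {
        fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
            let n = buf.len().min(4);
            self.0.extend_from_slice(&buf[..n]);
            Ok(n)
        }
        fn flush(&mut self) -> io::Result<()> {
            Ok(())
        }
    }

    #[test]
    fn partial_writes_still_deliver_everything() {
        let mut w = Trickle(Vec::new());
        let slices = [IoSlice::new(b"hello "), IoSlice::new(b"world")];
        write_all_vectored(&mut w, &slices).unwrap();
        assert_eq!(w.0, b"hello world".to_vec());
    }
}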
1821
1822/// Read as many bytes as possible into buf, retrying on partial reads.
1823/// Fast path: regular file reads usually return the full buffer on the first call,
1824/// avoiding the loop overhead entirely.
1825#[inline]
1826fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
1827    // Fast path: first read() usually fills the entire buffer for regular files
1828    let n = reader.read(buf)?;
1829    if n == buf.len() || n == 0 {
1830        return Ok(n);
1831    }
1832    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
1833    let mut total = n;
1834    while total < buf.len() {
1835        match reader.read(&mut buf[total..]) {
1836            Ok(0) => break,
1837            Ok(n) => total += n,
1838            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
1839            Err(e) => return Err(e),
1840        }
1841    }
1842    Ok(total)
1843}
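
// A usage sketch (hypothetical test module, not part of the original file): chained
// readers return short reads at the seam, and read_full keeps retrying until the buffer
// is full or the input is exhausted.
#[cfg(test)]
mod read_full_sketch {
    use super::read_full;
    use std::io::Read;

    #[test]
    fn retries_across_short_reads() {
        let mut reader = (&b"abc"[..]).chain(&b"defgh"[..]);
        let mut buf = [0u8; 6];
        assert_eq!(read_full(&mut reader, &mut buf).unwrap(), 6);
        assert_eq!(&buf, b"abcdef");
        // The next call drains the remaining two bytes and reports the short length.
        assert_eq!(read_full(&mut reader, &mut buf).unwrap(), 2);
        assert_eq!(&buf[..2], b"gh");
    }
}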