//! fcoreutils 0.22.0
//!
//! High-performance GNU coreutils replacement with SIMD and parallelism.

use std::io::{self, Read, Write};

use base64_simd::AsOut;

const BASE64_ENGINE: &base64_simd::Base64 = &base64_simd::STANDARD;

/// Number of available CPUs for parallel chunk splitting.
/// Uses std::thread::available_parallelism() to avoid triggering premature
/// Rayon pool initialization (~300-500µs); the Rayon pool is only initialized
/// on the first scope() call.
#[inline]
fn num_cpus() -> usize {
    std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(1)
}

/// Chunk size for sequential no-wrap encoding: 8MB aligned to 3 bytes.
/// Larger chunks reduce function call overhead per iteration while still
/// keeping peak buffer allocation reasonable (~10.7MB for the output).
const NOWRAP_CHUNK: usize = 8 * 1024 * 1024 - (8 * 1024 * 1024 % 3);

/// Minimum data size for parallel no-wrap encoding (16MB).
/// For single-file CLI usage (typical benchmark), the Rayon pool is cold
/// on first use (~200-500µs init). At 10MB, sequential encoding is faster
/// because pool init + dispatch overhead exceeds the parallel benefit.
/// Note: multi-file callers pay pool init only once; subsequent files would
/// benefit from a lower threshold (~2MB). Optimized for single-file CLI.
const PARALLEL_NOWRAP_THRESHOLD: usize = 16 * 1024 * 1024;

/// Minimum data size for parallel wrapped encoding (12MB).
/// Same cold-pool reasoning as PARALLEL_NOWRAP_THRESHOLD above.
/// The sequential encode_wrapped_expand path with backward expansion
/// eliminates per-group overhead from L1-scatter chunking.
const PARALLEL_WRAPPED_THRESHOLD: usize = 12 * 1024 * 1024;

/// Minimum data size for parallel decoding (1MB of base64 data).
/// Lower threshold than encode because decode is more compute-intensive
/// and benefits from parallelism at smaller sizes. After first use, the
/// Rayon pool is warm (~10µs dispatch), making 1MB a good crossover point.
const PARALLEL_DECODE_THRESHOLD: usize = 1024 * 1024;

/// Hint HUGEPAGE for large output buffers on Linux.
/// MADV_HUGEPAGE tells the kernel to back the range with 2MB pages, reducing TLB misses
/// and minor fault count for large allocations (~25,600 → ~50 for 100MB).
#[cfg(target_os = "linux")]
fn hint_hugepage(buf: &mut Vec<u8>) {
    if buf.capacity() >= 2 * 1024 * 1024 {
        unsafe {
            libc::madvise(
                buf.as_mut_ptr() as *mut libc::c_void,
                buf.capacity(),
                libc::MADV_HUGEPAGE,
            );
        }
    }
}

/// Encode data and write to output with line wrapping.
/// Uses SIMD encoding with fused encode+wrap for maximum throughput.
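///
/// A minimal usage sketch (marked `ignore`, so it is not compiled as a doctest;
/// the `Vec<u8>` sink and the GNU-default wrap column of 76 are illustrative):
///
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// encode_to_writer(b"hello", 76, &mut out).unwrap();
/// assert_eq!(out, b"aGVsbG8=\n");
/// ```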
pub fn encode_to_writer(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if wrap_col == 0 {
        return encode_no_wrap(data, out);
    }

    encode_wrapped(data, wrap_col, out)
}

/// Encode without wrapping — parallel SIMD encoding for large data, sequential for small.
fn encode_no_wrap(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    if data.len() >= PARALLEL_NOWRAP_THRESHOLD && num_cpus() > 1 {
        return encode_no_wrap_parallel(data, out);
    }

    // Single-buffer encode: for data that fits in one chunk, encode directly
    // and write once. For larger data, reuse the buffer across chunks.
    let enc_len = BASE64_ENGINE.encoded_length(data.len().min(NOWRAP_CHUNK));
    let mut buf: Vec<u8> = Vec::with_capacity(enc_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_len);
    }

    for chunk in data.chunks(NOWRAP_CHUNK) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let encoded = BASE64_ENGINE.encode(chunk, buf[..clen].as_out());
        out.write_all(encoded)?;
    }
    Ok(())
}

/// Parallel no-wrap encoding into a single shared output buffer.
/// Split at 3-byte boundaries, pre-calculate output offsets, encode in parallel.
/// Each chunk except possibly the last is 3-byte aligned, so no padding in intermediate chunks.
/// Single allocation + single write_all instead of N allocations + writev.
fn encode_no_wrap_parallel(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 3 bytes so each chunk encodes without padding (except the last)
    let chunk_size = ((raw_chunk + 2) / 3) * 3;

    // Split input into 3-byte-aligned chunks
    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(3)).collect();

    // Pre-calculate output offsets
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    let mut total_out = 0usize;
    for chunk in &chunks {
        offsets.push(total_out);
        total_out += BASE64_ENGINE.encoded_length(chunk.len());
    }

    // Single allocation for all threads
    let mut output: Vec<u8> = Vec::with_capacity(total_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(total_out);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output);

    // Parallel encode: each thread writes into its pre-assigned region
    let output_base = output.as_mut_ptr() as usize;
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let out_off = offsets[i];
            let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
            let base = output_base;
            s.spawn(move |_| {
                let dest =
                    unsafe { std::slice::from_raw_parts_mut((base + out_off) as *mut u8, enc_len) };
                let _ = BASE64_ENGINE.encode(chunk, dest.as_out());
            });
        }
    });

    out.write_all(&output[..total_out])
}

/// Encode with line wrapping, dispatching to the fastest strategy for the input:
/// - parallel L1-scatter encoding above PARALLEL_WRAPPED_THRESHOLD,
/// - chunked encode+wrap for inputs over ~1MB,
/// - single-shot encode with backward newline expansion otherwise,
/// - bulk encode + fuse_wrap fallback when bytes_per_line is not a multiple of 3,
/// - a simple per-line fallback when bytes_per_line rounds to zero (tiny wrap_col).
///
/// bytes_per_line = wrap_col * 3 / 4 is the input consumed per output line; for the
/// GNU default wrap_col = 76 this is 57, a multiple of 3, so every full line encodes
/// without intermediate padding.
fn encode_wrapped(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let bytes_per_line = wrap_col * 3 / 4;
    if bytes_per_line == 0 {
        return encode_wrapped_small(data, wrap_col, out);
    }

    if data.len() >= PARALLEL_WRAPPED_THRESHOLD && bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_parallel(data, wrap_col, bytes_per_line, out);
    }

    // Chunked encode for data > 1MB: process ~1MB at a time to keep the output
    // buffer small (~1.3MB) and warm in L2. Reduces minor page faults from ~3500 to ~320.
    // Guard: skip if bytes_per_line > 1MB (lines_per_chunk would be 0 → infinite loop).
    if bytes_per_line.is_multiple_of(3) && data.len() > 1024 * 1024 {
        let lines_per_chunk = (1024 * 1024) / bytes_per_line;
        if lines_per_chunk > 0 {
            return encode_wrapped_chunked(data, wrap_col, bytes_per_line, out);
        }
    }

    if bytes_per_line.is_multiple_of(3) {
        return encode_wrapped_expand(data, wrap_col, bytes_per_line, out);
    }

    // Fallback for non-3-aligned bytes_per_line: use fuse_wrap approach
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let num_full = enc_max / wrap_col;
    let rem = enc_max % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Encode full data, then fuse with newlines
    let mut enc_buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        enc_buf.set_len(enc_max);
    }
    let _ = BASE64_ENGINE.encode(data, enc_buf[..enc_max].as_out());

    let mut out_buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(out_len);
    }
    let n = fuse_wrap(&enc_buf, wrap_col, &mut out_buf);
    out.write_all(&out_buf[..n])
}

/// Chunked encode+wrap for large in-memory data. Processes ~1MB input at a time,
/// reusing a small output buffer (~1.3MB) that stays warm in L2 cache.
/// Each chunk is aligned to bytes_per_line so it produces complete lines.
/// Reduces page faults from ~3500 (14MB alloc) to ~320 (1.3MB alloc) for 10MB input.
fn encode_wrapped_chunked(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    debug_assert!(bytes_per_line.is_multiple_of(3));

    // ~1MB input per chunk, aligned to bytes_per_line
    let lines_per_chunk = (1024 * 1024) / bytes_per_line;
    let chunk_input = lines_per_chunk * bytes_per_line;
    let line_out = wrap_col + 1;
    let max_chunk_out =
        lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;

    // SAFETY: out_buf is allocated to max_chunk_out bytes and set_len'd without
    // initialization. The encode loop below writes exactly `full_lines * line_out`
    // bytes (encode + newline per line), plus any remainder. Only the written
    // prefix `out_buf[..total_out]` is passed to write_all — no uninitialized
    // bytes are ever read. u8 has no drop glue, so the uninitialized tail is safe.
    let mut out_buf: Vec<u8> = Vec::with_capacity(max_chunk_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(max_chunk_out);
    }

    let mut pos = 0;
    while pos < data.len() {
        let remaining = data.len() - pos;
        let chunk_len = remaining.min(chunk_input);
        let chunk = &data[pos..pos + chunk_len];

        let full_lines = chunk_len / bytes_per_line;
        let remainder = chunk_len % bytes_per_line;

        let dst = out_buf.as_mut_ptr();
        let mut line_idx = 0;

        // 4-line unrolled loop
        while line_idx + 4 <= full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ =
                    BASE64_ENGINE.encode(&chunk[in_base..in_base + bytes_per_line], s0.as_out());
                *dst.add(out_base + wrap_col) = b'\n';

                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                    s1.as_out(),
                );
                *dst.add(out_base + line_out + wrap_col) = b'\n';

                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                    s2.as_out(),
                );
                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &chunk[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                    s3.as_out(),
                );
                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
            }
            line_idx += 4;
        }
        while line_idx < full_lines {
            let in_off = line_idx * bytes_per_line;
            let out_off = line_idx * line_out;
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(out_off), wrap_col);
                let _ = BASE64_ENGINE.encode(&chunk[in_off..in_off + bytes_per_line], s.as_out());
                *dst.add(out_off + wrap_col) = b'\n';
            }
            line_idx += 1;
        }

        let mut total_out = full_lines * line_out;

        if remainder > 0 {
            let in_off = full_lines * bytes_per_line;
            let enc_len = BASE64_ENGINE.encoded_length(remainder);
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(total_out), enc_len);
                let _ = BASE64_ENGINE.encode(&chunk[in_off..in_off + remainder], s.as_out());
                *dst.add(total_out + enc_len) = b'\n';
            }
            total_out += enc_len + 1;
        }

        out.write_all(&out_buf[..total_out])?;
        pos += chunk_len;
    }

    Ok(())
}

/// Encode with backward expansion: single contiguous SIMD encode, then expand
/// in-place to insert newlines. The encode is done in one call (no chunking),
/// which eliminates per-group function call overhead from L1-scatter.
/// The backward expansion only shifts data by ~1.3% (1 byte per 76 for wrap_col=76),
/// and for most lines the shift exceeds wrap_col so memmove uses the fast memcpy path.
fn encode_wrapped_expand(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    debug_assert!(bytes_per_line.is_multiple_of(3));
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    if enc_len == 0 {
        return Ok(());
    }

    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Single allocation: encode into first enc_len bytes, expand backward to out_len.
    // SAFETY: buf[..enc_len] is initialized by BASE64_ENGINE.encode below.
    // buf[enc_len..out_len] is written by expand_backward before write_all reads it.
    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_len);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut buf);

    // One SIMD encode call for the entire input (no chunking overhead)
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_len].as_out());
    debug_assert_eq!(encoded.len(), enc_len, "encode wrote unexpected length");

    // Expand backward to insert newlines — shifts only ~1.3% of data
    expand_backward(buf.as_mut_ptr(), enc_len, out_len, wrap_col);

    out.write_all(&buf[..out_len])
}

/// L1-scatter encode: encode groups of lines into a small L1-cached temp buffer,
/// then scatter-copy each line to its final position in the output buffer with
/// newline insertion. Each output byte is written exactly once — no read-back
/// from main memory, halving memory traffic vs backward expansion.
///
/// Temp buffer (~20KB for 256 lines × 76 chars) stays hot in L1 cache, so
/// reads during scatter are essentially free. Output buffer is streamed out
/// with sequential writes that the prefetcher can handle efficiently.
///
/// Uses a full output buffer for vmsplice safety: vmsplice maps user pages
/// into the pipe buffer, so the buffer must stay valid until the reader has consumed it.
#[allow(dead_code)]
fn encode_wrapped_scatter(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let enc_len = BASE64_ENGINE.encoded_length(data.len());
    if enc_len == 0 {
        return Ok(());
    }

    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;
    let out_len = num_full * (wrap_col + 1) + if rem > 0 { rem + 1 } else { 0 };

    // Output buffer — written once via scatter, then write_all to output
    let mut buf: Vec<u8> = Vec::with_capacity(out_len);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(out_len);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut buf);

    // L1-cached temp buffer for encoding groups of lines.
    // 256 lines × 76 chars = 19,456 bytes — fits comfortably in L1 (32-64KB).
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1;
    let mut wp = 0usize; // write position in output buffer

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        // Scatter-copy full lines from temp to output with newlines
        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter for ILP
        let mut i = 0;
        while i + 8 <= lines {
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = buf.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        // Remaining full lines
        while i < lines {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        // Partial last line (only on final chunk)
        if chunk_rem > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    buf.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *buf.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }

    out.write_all(&buf[..wp])
}

/// Scatter-copy encoded lines from temp buffer to output buffer with newlines.
/// Uses copy_nonoverlapping since temp and output never overlap.
#[inline]
#[allow(dead_code)]
fn scatter_lines(
    temp: &[u8],
    buf: &mut [u8],
    line_start: usize,
    count: usize,
    wrap_col: usize,
    line_out: usize,
) {
    unsafe {
        let src = temp.as_ptr();
        let dst = buf.as_mut_ptr();
        for i in 0..count {
            let s_off = i * wrap_col;
            let d_off = (line_start + i) * line_out;
            std::ptr::copy_nonoverlapping(src.add(s_off), dst.add(d_off), wrap_col);
            *dst.add(d_off + wrap_col) = b'\n';
        }
    }
}

/// Expand encoded data in-place by inserting newlines at wrap_col boundaries.
/// buf[0..enc_len] contains contiguous encoded data; buf has capacity for out_len.
/// After expansion, buf[0..out_len] contains wrapped output with newlines.
///
/// Processes backward so shifted data never overwrites unread source data.
/// For wrap_col=76: shift is ~1.3% (1 byte per 76), so most copies are
/// non-overlapping and the memmove fast-path (memcpy) is used.
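///
/// A minimal sketch of the effect (marked `ignore`; the bytes are arbitrary and
/// the sizes illustrative: 10 encoded bytes wrapped at 4 columns expand to 13):
///
/// ```ignore
/// let mut buf = b"AAAABBBBCC\0\0\0".to_vec(); // 10 encoded bytes + room to grow to 13
/// expand_backward(buf.as_mut_ptr(), 10, 13, 4);
/// assert_eq!(buf, b"AAAA\nBBBB\nCC\n");
/// ```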
#[inline]
fn expand_backward(ptr: *mut u8, enc_len: usize, out_len: usize, wrap_col: usize) {
    let num_full = enc_len / wrap_col;
    let rem = enc_len % wrap_col;

    unsafe {
        let mut rp = enc_len;
        let mut wp = out_len;

        // Handle partial last line (remainder)
        if rem > 0 {
            wp -= 1;
            *ptr.add(wp) = b'\n';
            wp -= rem;
            rp -= rem;
            if rp != wp {
                std::ptr::copy(ptr.add(rp), ptr.add(wp), rem);
            }
        }

        // Process full lines backward
        let mut lines_left = num_full;
        while lines_left >= 8 {
            // Unrolled: 8 lines per iteration
            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);

            lines_left -= 8;
        }

        // Remaining lines (0-7)
        while lines_left > 0 {
            wp -= 1;
            *ptr.add(wp) = b'\n';
            rp -= wrap_col;
            wp -= wrap_col;
            if rp != wp {
                std::ptr::copy(ptr.add(rp), ptr.add(wp), wrap_col);
            }
            lines_left -= 1;
        }
    }
}

/// Static newline byte for IoSlice references in writev calls.
static NEWLINE: [u8; 1] = [b'\n'];

/// Write encoded base64 data with line wrapping. For small outputs, builds IoSlice
/// entries pointing at wrap_col-sized segments of the encoded buffer, interleaved
/// with newline IoSlices, and writes them with a single write_vectored (writev) call
/// (zero-copy, no fused output buffer). Larger outputs are fused into a small
/// reusable buffer in batches of BATCH_LINES lines, since one contiguous write per
/// batch beats a 1024-entry writev.
#[inline]
#[allow(dead_code)]
fn write_wrapped_iov(encoded: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    // Max IoSlice entries per writev batch. Linux UIO_MAXIOV is 1024.
    // Each line needs 2 entries (data + newline), so 512 lines per batch.
    const MAX_IOV: usize = 1024;

    let num_full_lines = encoded.len() / wrap_col;
    let remainder = encoded.len() % wrap_col;
    let total_iov = num_full_lines * 2 + if remainder > 0 { 2 } else { 0 };

    // Small output: build all IoSlices and write in one call
    if total_iov <= MAX_IOV {
        let mut iov: Vec<io::IoSlice> = Vec::with_capacity(total_iov);
        let mut pos = 0;
        for _ in 0..num_full_lines {
            iov.push(io::IoSlice::new(&encoded[pos..pos + wrap_col]));
            iov.push(io::IoSlice::new(&NEWLINE));
            pos += wrap_col;
        }
        if remainder > 0 {
            iov.push(io::IoSlice::new(&encoded[pos..pos + remainder]));
            iov.push(io::IoSlice::new(&NEWLINE));
        }
        return write_all_vectored(out, &iov);
    }

    // Large output: fuse batches of lines into a reusable L1-cached buffer.
    // Each batch copies ~39KB (512 lines × 77 bytes) from the encoded buffer
    // with newlines inserted, then writes as a single contiguous write(2).
    // This is faster than writev with 1024 IoSlice entries because:
    // - One kernel memcpy per batch vs 1024 separate copies
    // - Fused buffer (39KB) stays hot in L1 cache across batches
    let line_out = wrap_col + 1;
    const BATCH_LINES: usize = 512;
    let batch_fused_size = BATCH_LINES * line_out;
    let mut fused: Vec<u8> = Vec::with_capacity(batch_fused_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        fused.set_len(batch_fused_size);
    }

    let mut rp = 0;
    let mut lines_done = 0;

    // Process full batches using 8-line unrolled fuse_wrap
    while lines_done + BATCH_LINES <= num_full_lines {
        let n = fuse_wrap(
            &encoded[rp..rp + BATCH_LINES * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += BATCH_LINES * wrap_col;
        lines_done += BATCH_LINES;
    }

    // Remaining full lines (partial batch)
    let remaining_lines = num_full_lines - lines_done;
    if remaining_lines > 0 {
        let n = fuse_wrap(
            &encoded[rp..rp + remaining_lines * wrap_col],
            wrap_col,
            &mut fused,
        );
        out.write_all(&fused[..n])?;
        rp += remaining_lines * wrap_col;
    }

    // Partial last line
    if remainder > 0 {
        out.write_all(&encoded[rp..rp + remainder])?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Write encoded base64 data with line wrapping using writev, tracking column state
/// across calls. Used by encode_stream for piped input where chunks don't align
/// to line boundaries.
#[inline]
fn write_wrapped_iov_streaming(
    encoded: &[u8],
    wrap_col: usize,
    col: &mut usize,
    out: &mut impl Write,
) -> io::Result<()> {
    const MAX_IOV: usize = 1024;
    let mut iov: Vec<io::IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut rp = 0;

    while rp < encoded.len() {
        let space = wrap_col - *col;
        let avail = encoded.len() - rp;

        if avail <= space {
            // Remaining data fits in current line
            iov.push(io::IoSlice::new(&encoded[rp..rp + avail]));
            *col += avail;
            if *col == wrap_col {
                iov.push(io::IoSlice::new(&NEWLINE));
                *col = 0;
            }
            break;
        } else {
            // Fill current line and add newline
            iov.push(io::IoSlice::new(&encoded[rp..rp + space]));
            iov.push(io::IoSlice::new(&NEWLINE));
            rp += space;
            *col = 0;
        }

        if iov.len() >= MAX_IOV - 1 {
            write_all_vectored(out, &iov)?;
            iov.clear();
        }
    }

    if !iov.is_empty() {
        write_all_vectored(out, &iov)?;
    }
    Ok(())
}

/// Parallel wrapped encoding with L1-scatter into a single shared output buffer.
/// Pre-calculates each thread's output offset, allocates one buffer for all threads,
/// and has each thread encode directly into its pre-assigned non-overlapping region.
/// This saves N-1 buffer allocations and corresponding page faults vs per-thread Vecs,
/// and uses a single write_all instead of writev.
fn encode_wrapped_parallel(
    data: &[u8],
    wrap_col: usize,
    bytes_per_line: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let lines_per_chunk = ((data.len() / bytes_per_line) / num_threads).max(1);
    let chunk_input = lines_per_chunk * bytes_per_line;

    // Split input at bytes_per_line boundaries (last chunk may have remainder)
    let chunks: Vec<&[u8]> = data.chunks(chunk_input.max(bytes_per_line)).collect();

    // Pre-calculate output offsets for each chunk
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    let mut total_out = 0usize;
    for chunk in &chunks {
        offsets.push(total_out);
        let enc_len = BASE64_ENGINE.encoded_length(chunk.len());
        let full_lines = enc_len / wrap_col;
        let remainder = enc_len % wrap_col;
        total_out += full_lines * (wrap_col + 1) + if remainder > 0 { remainder + 1 } else { 0 };
    }

    // Single allocation for all threads
    let mut output: Vec<u8> = Vec::with_capacity(total_out);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output.set_len(total_out);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output);

    // Parallel encode: each thread writes into its pre-assigned region
    let output_base = output.as_mut_ptr() as usize;
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let out_off = offsets[i];
            let out_end = if i + 1 < offsets.len() {
                offsets[i + 1]
            } else {
                total_out
            };
            let out_size = out_end - out_off;
            let base = output_base;
            s.spawn(move |_| {
                let out_slice = unsafe {
                    std::slice::from_raw_parts_mut((base + out_off) as *mut u8, out_size)
                };
                encode_chunk_l1_scatter_into(chunk, out_slice, wrap_col, bytes_per_line);
            });
        }
    });

    out.write_all(&output[..total_out])
}

/// Encode a chunk using L1-scatter, writing into a pre-allocated output slice.
/// Encodes groups of 256 lines into L1-cached temp buffer, scatter-copy to output with newlines.
/// The output slice must be large enough to hold the encoded+wrapped output.
fn encode_chunk_l1_scatter_into(
    data: &[u8],
    output: &mut [u8],
    wrap_col: usize,
    bytes_per_line: usize,
) {
    const GROUP_LINES: usize = 256;
    let group_input = GROUP_LINES * bytes_per_line;
    let temp_size = GROUP_LINES * wrap_col;
    let mut temp: Vec<u8> = Vec::with_capacity(temp_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        temp.set_len(temp_size);
    }

    let line_out = wrap_col + 1;
    let mut wp = 0usize;

    for chunk in data.chunks(group_input) {
        let clen = BASE64_ENGINE.encoded_length(chunk.len());
        let _ = BASE64_ENGINE.encode(chunk, temp[..clen].as_out());

        let lines = clen / wrap_col;
        let chunk_rem = clen % wrap_col;

        // 8-line unrolled scatter
        let mut i = 0;
        while i + 8 <= lines {
            unsafe {
                let src = temp.as_ptr().add(i * wrap_col);
                let dst = output.as_mut_ptr().add(wp);
                std::ptr::copy_nonoverlapping(src, dst, wrap_col);
                *dst.add(wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
                *dst.add(line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(2 * wrap_col),
                    dst.add(2 * line_out),
                    wrap_col,
                );
                *dst.add(2 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(3 * wrap_col),
                    dst.add(3 * line_out),
                    wrap_col,
                );
                *dst.add(3 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(4 * wrap_col),
                    dst.add(4 * line_out),
                    wrap_col,
                );
                *dst.add(4 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(5 * wrap_col),
                    dst.add(5 * line_out),
                    wrap_col,
                );
                *dst.add(5 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(6 * wrap_col),
                    dst.add(6 * line_out),
                    wrap_col,
                );
                *dst.add(6 * line_out + wrap_col) = b'\n';
                std::ptr::copy_nonoverlapping(
                    src.add(7 * wrap_col),
                    dst.add(7 * line_out),
                    wrap_col,
                );
                *dst.add(7 * line_out + wrap_col) = b'\n';
            }
            wp += 8 * line_out;
            i += 8;
        }
        while i < lines {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(i * wrap_col),
                    output.as_mut_ptr().add(wp),
                    wrap_col,
                );
                *output.as_mut_ptr().add(wp + wrap_col) = b'\n';
            }
            wp += line_out;
            i += 1;
        }
        if chunk_rem > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    temp.as_ptr().add(lines * wrap_col),
                    output.as_mut_ptr().add(wp),
                    chunk_rem,
                );
                *output.as_mut_ptr().add(wp + chunk_rem) = b'\n';
            }
            wp += chunk_rem + 1;
        }
    }
}

/// Fuse encoded base64 data with newlines in a single pass.
/// Uses ptr::copy_nonoverlapping with 8-line unrolling for max throughput.
/// Returns number of bytes written.
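///
/// A small sketch of the transform (marked `ignore`; the bytes are arbitrary and
/// only the lengths matter):
///
/// ```ignore
/// let mut out = vec![0u8; 13];
/// let n = fuse_wrap(b"AAAABBBBCC", 4, &mut out);
/// assert_eq!(&out[..n], b"AAAA\nBBBB\nCC\n");
/// ```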
#[inline]
fn fuse_wrap(encoded: &[u8], wrap_col: usize, out_buf: &mut [u8]) -> usize {
    let line_out = wrap_col + 1; // wrap_col data bytes + 1 newline
    let mut rp = 0;
    let mut wp = 0;

    // Unrolled: process 8 lines per iteration for better ILP
    while rp + 8 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(4 * wrap_col), dst.add(4 * line_out), wrap_col);
            *dst.add(4 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(5 * wrap_col), dst.add(5 * line_out), wrap_col);
            *dst.add(5 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(6 * wrap_col), dst.add(6 * line_out), wrap_col);
            *dst.add(6 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(7 * wrap_col), dst.add(7 * line_out), wrap_col);
            *dst.add(7 * line_out + wrap_col) = b'\n';
        }
        rp += 8 * wrap_col;
        wp += 8 * line_out;
    }

    // Handle remaining 4 lines at a time
    while rp + 4 * wrap_col <= encoded.len() {
        unsafe {
            let src = encoded.as_ptr().add(rp);
            let dst = out_buf.as_mut_ptr().add(wp);

            std::ptr::copy_nonoverlapping(src, dst, wrap_col);
            *dst.add(wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(wrap_col), dst.add(line_out), wrap_col);
            *dst.add(line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(2 * wrap_col), dst.add(2 * line_out), wrap_col);
            *dst.add(2 * line_out + wrap_col) = b'\n';

            std::ptr::copy_nonoverlapping(src.add(3 * wrap_col), dst.add(3 * line_out), wrap_col);
            *dst.add(3 * line_out + wrap_col) = b'\n';
        }
        rp += 4 * wrap_col;
        wp += 4 * line_out;
    }

    // Remaining full lines
    while rp + wrap_col <= encoded.len() {
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                wrap_col,
            );
            *out_buf.as_mut_ptr().add(wp + wrap_col) = b'\n';
        }
        rp += wrap_col;
        wp += line_out;
    }

    // Partial last line
    if rp < encoded.len() {
        let remaining = encoded.len() - rp;
        unsafe {
            std::ptr::copy_nonoverlapping(
                encoded.as_ptr().add(rp),
                out_buf.as_mut_ptr().add(wp),
                remaining,
            );
        }
        wp += remaining;
        out_buf[wp] = b'\n';
        wp += 1;
    }

    wp
}

/// Fallback for tiny wrap columns where bytes_per_line rounds to zero (wrap_col == 1).
fn encode_wrapped_small(data: &[u8], wrap_col: usize, out: &mut impl Write) -> io::Result<()> {
    let enc_max = BASE64_ENGINE.encoded_length(data.len());
    let mut buf: Vec<u8> = Vec::with_capacity(enc_max);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(enc_max);
    }
    let encoded = BASE64_ENGINE.encode(data, buf[..enc_max].as_out());

    let wc = wrap_col.max(1);
    for line in encoded.chunks(wc) {
        out.write_all(line)?;
        out.write_all(b"\n")?;
    }
    Ok(())
}

/// Decode base64 data and write to output (borrows data, allocates clean buffer).
/// When `ignore_garbage` is true, strip all non-base64 characters.
/// When false, only strip whitespace (standard behavior).
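///
/// A usage sketch (marked `ignore`, so it is not compiled as a doctest; a
/// `Vec<u8>` sink stands in for stdout):
///
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// decode_to_writer(b"aGVsbG8=\n", false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```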
pub fn decode_to_writer(data: &[u8], ignore_garbage: bool, out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        let mut cleaned = strip_non_base64(data);
        return decode_clean_slice(&mut cleaned, out);
    }

    // For large data (>= 512KB): use bulk strip + single-shot decode.
    // try_line_decode decodes per-line (~25ns overhead per 76-byte line call),
    // while strip+decode uses SIMD gap-copy + single-shot SIMD decode at ~6.5 GB/s.
    // For 10MB decode benchmark: ~2ms (bulk) vs ~4ms (per-line) = 2x faster.
    // For small data (< 512KB): per-line decode avoids allocation overhead.
    if data.len() < 512 * 1024 && data.len() >= 77 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    // Fast path: single-pass SIMD strip + decode
    decode_stripping_whitespace(data, out)
}

/// Decode base64 from a mutable buffer (MAP_PRIVATE mmap or owned Vec).
/// Strips whitespace in-place using SIMD memchr2 gap-copy, then decodes
/// in-place with base64_simd::decode_inplace. Zero additional allocations.
///
/// For MAP_PRIVATE mmap: the kernel uses COW semantics, so only pages
/// containing whitespace (newlines) get physically copied (~1.3% for
/// 76-char line base64). The decode writes to the same buffer, but decoded
/// data is always shorter than encoded (3/4 ratio), so it fits in-place.
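///
/// A usage sketch (marked `ignore`; an owned, mutable buffer stands in for the
/// MAP_PRIVATE mapping here):
///
/// ```ignore
/// let mut buf = b"aGVsbG8=\n".to_vec();
/// let mut out: Vec<u8> = Vec::new();
/// decode_mmap_inplace(&mut buf, false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```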
pub fn decode_mmap_inplace(
    data: &mut [u8],
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // For small data: try line-by-line decode (avoids COW page faults).
    // For large data (>= 512KB): bulk strip+decode is faster than per-line decode.
    if !ignore_garbage && data.len() >= 77 && data.len() < 512 * 1024 {
        if let Some(result) = try_line_decode(data, out) {
            return result;
        }
    }

    if ignore_garbage {
        // Strip non-base64 chars in-place
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if is_base64_char(b) {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        let r = decode_inplace_with_padding(&mut data[..wp], out);
        return r;
    }

    // Fast path: uniform-line fused strip+decode (no intermediate buffer).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Fallback: strip whitespace in-place using SIMD memchr2 gap-copy.

    // Quick check: no newlines at all — maybe already clean
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // Check for rare whitespace
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            // Perfectly clean — decode in-place directly
            return decode_inplace_with_padding(data, out);
        }
        // Rare whitespace only — strip in-place
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut wp = 0;
        for rp in 0..len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(wp) = b };
                wp += 1;
            }
        }
        return decode_inplace_with_padding(&mut data[..wp], out);
    }

    // SIMD gap-copy: strip \n and \r in-place using memchr2
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    // SAFETY: memchr2_iter reads from the original data. We write to positions
    // [0..wp] which are always <= gap_start, so we never overwrite unread data.
    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during the gap-copy
                has_rare_ws = unsafe {
                    std::slice::from_raw_parts(ptr.add(gap_start), gap_len)
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
                };
            }
            if wp != gap_start {
                unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len) };
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = unsafe {
                std::slice::from_raw_parts(ptr.add(gap_start), tail_len)
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
            };
        }
        if wp != gap_start {
            unsafe { std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len) };
        }
        wp += tail_len;
    }

    // Second pass for rare whitespace if needed
    if has_rare_ws {
        let mut rp = 0;
        let mut cwp = 0;
        while rp < wp {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        wp = cwp;
    }

    // Decode in-place: decoded data is always shorter than encoded (3/4 ratio)
    if wp >= PARALLEL_DECODE_THRESHOLD {
        // For large data, use parallel decode from the cleaned slice
        return decode_borrowed_clean_parallel(out, &data[..wp]);
    }
    decode_inplace_with_padding(&mut data[..wp], out)
}

/// Decode base64 from an owned Vec (in-place whitespace strip + decode).
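///
/// A usage sketch (marked `ignore`):
///
/// ```ignore
/// let mut data = b"aGVsbG8=\n".to_vec();
/// let mut out: Vec<u8> = Vec::new();
/// decode_owned(&mut data, false, &mut out).unwrap();
/// assert_eq!(out, b"hello");
/// ```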
pub fn decode_owned(
    data: &mut Vec<u8>,
    ignore_garbage: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    if ignore_garbage {
        data.retain(|&b| is_base64_char(b));
    } else {
        strip_whitespace_inplace(data);
    }

    decode_clean_slice(data, out)
}

/// Strip all whitespace from a Vec in-place using SIMD memchr2 gap-copy.
/// For typical base64 (76-char lines with \n), newlines are ~1/77 of the data,
/// so SIMD memchr2 skips ~76 bytes per hit instead of checking every byte.
/// Falls back to scalar compaction only for rare whitespace (tab, space, VT, FF).
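///
/// A small sketch of the effect (marked `ignore`):
///
/// ```ignore
/// let mut v = b"QUJD\nREVG\n".to_vec();
/// strip_whitespace_inplace(&mut v);
/// assert_eq!(v, b"QUJDREVG");
/// ```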
fn strip_whitespace_inplace(data: &mut Vec<u8>) {
    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    // For typical base64 (76-char lines), a \n is found almost immediately, so the
    // scan is cheap and we fall through to the gap-copy below.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            data.retain(|&b| NOT_WHITESPACE[b as usize]);
        }
        return;
    }

    // SIMD gap-copy: find \n and \r positions with memchr2, then memmove the
    // gaps between them to compact the data in-place. For typical base64 streams,
    // newlines are the only whitespace, so this handles >99% of cases.
    let ptr = data.as_mut_ptr();
    let len = data.len();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data.as_slice()) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            if !has_rare_ws {
                // Check for rare whitespace during copy (amortized ~1 branch per 77 bytes)
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            if wp != gap_start {
                unsafe {
                    std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                }
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap
    let tail_len = len - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        if wp != gap_start {
            unsafe {
                std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
            }
        }
        wp += tail_len;
    }

    data.truncate(wp);

    // Second pass for rare whitespace (tab, space, VT, FF) — only if detected.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = data.as_mut_ptr();
        let len = data.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        data.truncate(cwp);
    }
}
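
// Worked illustration of the gap-copy above (sketch only; the literal is arbitrary):
// for b"QUJD\nREVG\n", memchr2_iter yields newline positions 4 and 9, the gaps
// [0..4) and [5..9) are memmoved down to the write pointer, and the Vec is
// truncated to b"QUJDREVG" (8 bytes). No rare whitespace is seen, so the second
// pass is skipped.
//
//     let mut v = b"QUJD\nREVG\n".to_vec();
//     strip_whitespace_inplace(&mut v);
//     assert_eq!(v, b"QUJDREVG");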

/// 256-byte lookup table: true for non-whitespace bytes.
/// Used for single-pass whitespace stripping in decode.
static NOT_WHITESPACE: [bool; 256] = {
    let mut table = [true; 256];
    table[b' ' as usize] = false;
    table[b'\t' as usize] = false;
    table[b'\n' as usize] = false;
    table[b'\r' as usize] = false;
    table[0x0b] = false; // vertical tab
    table[0x0c] = false; // form feed
    table
};

/// Fused strip+decode for uniform-line base64 data.
/// Detects consistent line length, then processes in sub-chunks: each sub-chunk
/// copies lines to a small local buffer (L2-hot) and decodes immediately.
/// Eliminates the large intermediate clean buffer (~12MB for 10MB decode).
/// Returns None if the data doesn't have uniform line structure.
fn try_decode_uniform_lines(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl;
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let stride = line_len + 1;

    // Verify the data has consistent line structure (first + last lines)
    let check_lines = 4.min(data.len() / stride);
    for i in 1..check_lines {
        let expected_nl = i * stride - 1;
        if expected_nl >= data.len() || data[expected_nl] != b'\n' {
            return None;
        }
    }

    let full_lines = if data.len() >= stride {
        let candidate = data.len() / stride;
        if candidate > 0 && data[candidate * stride - 1] != b'\n' {
            return None;
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * stride;
    let remainder = &data[remainder_start..];
    let rem_clean = if remainder.last() == Some(&b'\n') {
        &remainder[..remainder.len() - 1]
    } else {
        remainder
    };

    // Compute exact decoded sizes
    let decoded_per_line = line_len * 3 / 4;
    let rem_decoded_size = if rem_clean.is_empty() {
        0
    } else {
        let pad = rem_clean
            .iter()
            .rev()
            .take(2)
            .filter(|&&b| b == b'=')
            .count();
        rem_clean.len() * 3 / 4 - pad
    };
    let total_decoded = full_lines * decoded_per_line + rem_decoded_size;
    let clean_len = full_lines * line_len;

    // Parallel path: fused strip+decode with 512KB sub-chunks per thread.
    // Each thread copies lines to a thread-local buffer (L2-hot) and decodes immediately,
    // eliminating the 12MB+ intermediate clean buffer entirely.
    if clean_len >= PARALLEL_DECODE_THRESHOLD && num_cpus() > 1 {
        let mut output: Vec<u8> = Vec::with_capacity(total_decoded);
        #[allow(clippy::uninit_vec)]
        unsafe {
            output.set_len(total_decoded);
        }
        #[cfg(target_os = "linux")]
        hint_hugepage(&mut output);

        let out_ptr = output.as_mut_ptr() as usize;
        let src_ptr = data.as_ptr() as usize;
        let num_threads = num_cpus().max(1);
        let lines_per_thread = (full_lines + num_threads - 1) / num_threads;
        // 512KB sub-chunks: larger chunks give SIMD decode more contiguous data,
        // reducing per-call overhead. 512KB fits in L2 cache (256KB-1MB typical).
        let lines_per_sub = (512 * 1024 / line_len).max(1);

        let err_flag = std::sync::atomic::AtomicBool::new(false);
        rayon::scope(|s| {
            for t in 0..num_threads {
                let err_flag = &err_flag;
                s.spawn(move |_| {
                    let start_line = t * lines_per_thread;
                    if start_line >= full_lines {
                        return;
                    }
                    let end_line = (start_line + lines_per_thread).min(full_lines);
                    let chunk_lines = end_line - start_line;

                    let sub_buf_size = lines_per_sub.min(chunk_lines) * line_len;
                    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
                    #[allow(clippy::uninit_vec)]
                    unsafe {
                        local_buf.set_len(sub_buf_size);
                    }

                    let src = src_ptr as *const u8;
                    let out_base = out_ptr as *mut u8;
                    let local_dst = local_buf.as_mut_ptr();

                    let mut sub_start = 0usize;
                    while sub_start < chunk_lines {
                        if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let sub_count = (chunk_lines - sub_start).min(lines_per_sub);
                        let sub_clean = sub_count * line_len;

                        for i in 0..sub_count {
                            unsafe {
                                std::ptr::copy_nonoverlapping(
                                    src.add((start_line + sub_start + i) * stride),
                                    local_dst.add(i * line_len),
                                    line_len,
                                );
                            }
                        }

                        let out_offset = (start_line + sub_start) * decoded_per_line;
                        let out_size = sub_count * decoded_per_line;
                        let out_slice = unsafe {
                            std::slice::from_raw_parts_mut(out_base.add(out_offset), out_size)
                        };
                        if BASE64_ENGINE
                            .decode(&local_buf[..sub_clean], out_slice.as_out())
                            .is_err()
                        {
                            err_flag.store(true, std::sync::atomic::Ordering::Relaxed);
                            return;
                        }

                        sub_start += sub_count;
                    }
                });
            }
        });
        let result: Result<(), io::Error> = if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
            Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
        } else {
            Ok(())
        };

        if let Err(e) = result {
            return Some(Err(e));
        }

        if !rem_clean.is_empty() {
            let rem_out = &mut output[full_lines * decoded_per_line..total_decoded];
            match BASE64_ENGINE.decode(rem_clean, rem_out.as_out()) {
                Ok(_) => {}
                Err(_) => return Some(decode_error()),
            }
        }

        return Some(out.write_all(&output[..total_decoded]));
    }

    // Sequential path: fused strip+decode in 256KB sub-chunks.
    // Larger sub-chunks give SIMD decode more data per call, improving throughput.
    // Uses decode_inplace on a small reusable buffer — no large allocations at all.
    let lines_per_sub = (256 * 1024 / line_len).max(1);
    let sub_buf_size = lines_per_sub * line_len;
    let mut local_buf: Vec<u8> = Vec::with_capacity(sub_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        local_buf.set_len(sub_buf_size);
    }

    let src = data.as_ptr();
    let local_dst = local_buf.as_mut_ptr();

    let mut line_idx = 0usize;
    while line_idx < full_lines {
        let sub_count = (full_lines - line_idx).min(lines_per_sub);
        let sub_clean = sub_count * line_len;

        for i in 0..sub_count {
            unsafe {
                std::ptr::copy_nonoverlapping(
                    src.add((line_idx + i) * stride),
                    local_dst.add(i * line_len),
                    line_len,
                );
            }
        }

        match BASE64_ENGINE.decode_inplace(&mut local_buf[..sub_clean]) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }

        line_idx += sub_count;
    }

    if !rem_clean.is_empty() {
        let mut rem_buf = rem_clean.to_vec();
        match BASE64_ENGINE.decode_inplace(&mut rem_buf) {
            Ok(decoded) => {
                if let Err(e) = out.write_all(decoded) {
                    return Some(Err(e));
                }
            }
            Err(_) => return Some(decode_error()),
        }
    }

    Some(Ok(()))
}
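
// Worked numbers for the common wrap=76 case handled above (illustrative only):
// line_len = 76, stride = 77, decoded_per_line = 76 * 3 / 4 = 57. With 512KB
// sub-chunks, lines_per_sub = 512 * 1024 / 76 = 6898, so each sub-chunk copies
// ~524KB of clean base64 into the thread-local buffer and decodes it to ~393KB,
// both small enough to stay cache-resident while the next sub-chunk is copied.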

/// Decode by stripping whitespace and decoding in a single fused pass.
/// For data with no whitespace, decodes directly without any copy.
/// Detects uniform line structure for fast structured-copy (no search needed),
/// falls back to SIMD memchr2 gap-copy for irregular data.
fn decode_stripping_whitespace(data: &[u8], out: &mut impl Write) -> io::Result<()> {
    // Fast path for uniform-line base64 (e.g., standard 76-char lines + newline).
    // Copies at known offsets, avoiding the memchr2 search entirely.
    // For 13MB base64: saves ~1ms vs memchr2 gap-copy (just structured memcpy).
    if data.len() >= 77 {
        if let Some(result) = try_decode_uniform_lines(data, out) {
            return result;
        }
    }

    // Quick check: skip stripping if no \n or \r in the data.
    // Uses SIMD memchr2 for fast scanning (~10 GB/s) instead of per-byte check.
    if memchr::memchr2(b'\n', b'\r', data).is_none() {
        // No newlines/CR — check for rare whitespace only
        if !data
            .iter()
            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c)
        {
            return decode_borrowed_clean(out, data);
        }
        // Has rare whitespace only — strip and decode
        let mut cleaned: Vec<u8> = Vec::with_capacity(data.len());
        for &b in data {
            if NOT_WHITESPACE[b as usize] {
                cleaned.push(b);
            }
        }
        return decode_clean_slice(&mut cleaned, out);
    }

    // SIMD gap-copy: use memchr2 to find \n and \r positions, then copy the
    // gaps between them. For typical base64 (76-char lines), newlines are ~1/77
    // of the data, so we process ~76 bytes per memchr hit instead of 1 per scalar.
    let mut clean: Vec<u8> = Vec::with_capacity(data.len());
    let dst = clean.as_mut_ptr();
    let mut wp = 0usize;
    let mut gap_start = 0usize;
    // Track whether any rare whitespace (tab, space, VT, FF) exists in gap regions.
    // This avoids the second full-scan pass when only \n/\r are present.
    let mut has_rare_ws = false;

    for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
        let gap_len = pos - gap_start;
        if gap_len > 0 {
            // Check gap region for rare whitespace during copy.
            // This adds ~1 branch per gap but eliminates the second full scan.
            if !has_rare_ws {
                has_rare_ws = data[gap_start..pos]
                    .iter()
                    .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
            }
            unsafe {
                std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), gap_len);
            }
            wp += gap_len;
        }
        gap_start = pos + 1;
    }
    // Copy the final gap after the last \n/\r
    let tail_len = data.len() - gap_start;
    if tail_len > 0 {
        if !has_rare_ws {
            has_rare_ws = data[gap_start..]
                .iter()
                .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
        }
        unsafe {
            std::ptr::copy_nonoverlapping(data.as_ptr().add(gap_start), dst.add(wp), tail_len);
        }
        wp += tail_len;
    }
    unsafe {
        clean.set_len(wp);
    }

    // Second pass for rare whitespace (tab, space, VT, FF) — only runs when needed.
    // In typical base64 streams (76-char lines with \n), this is skipped entirely.
    if has_rare_ws {
        let ptr = clean.as_mut_ptr();
        let len = clean.len();
        let mut rp = 0;
        let mut cwp = 0;
        while rp < len {
            let b = unsafe { *ptr.add(rp) };
            if NOT_WHITESPACE[b as usize] {
                unsafe { *ptr.add(cwp) = b };
                cwp += 1;
            }
            rp += 1;
        }
        clean.truncate(cwp);
    }

    // For large data (>= threshold), use parallel decode for multi-core speedup.
    // For small data, use in-place decode to avoid extra allocation.
    if clean.len() >= PARALLEL_DECODE_THRESHOLD {
        decode_borrowed_clean_parallel(out, &clean)
    } else {
        decode_clean_slice(&mut clean, out)
    }
}

/// Try to decode base64 data line-by-line, avoiding whitespace stripping.
/// Returns Some(result) if the data has uniform line lengths suitable for
/// per-line decode, or None if the data doesn't fit this pattern.
///
/// For standard 76-char-line base64 (wrap=76): each line is 76 encoded chars
/// + newline = 77 bytes. 76 chars = 19 groups of 4 = 57 decoded bytes per line.
/// We decode each line directly into its position in the output buffer.
fn try_line_decode(data: &[u8], out: &mut impl Write) -> Option<io::Result<()>> {
    // Find the first newline to determine line length
    let first_nl = memchr::memchr(b'\n', data)?;
    let line_len = first_nl; // encoded chars per line (without newline)

    // Line length must be a multiple of 4 (complete base64 groups, no padding mid-stream)
    if line_len == 0 || line_len % 4 != 0 {
        return None;
    }

    let line_stride = line_len + 1; // line_len chars + 1 newline byte
    let decoded_per_line = line_len * 3 / 4;

    // Verify the data has a consistent line structure by checking the next few lines
    let check_lines = 4.min(data.len() / line_stride);
    for i in 1..check_lines {
        let expected_nl = i * line_stride - 1;
        if expected_nl >= data.len() {
            break;
        }
        if data[expected_nl] != b'\n' {
            return None; // Inconsistent line length
        }
    }

    // Calculate full lines and remainder
    let full_lines = if data.len() >= line_stride {
        // Check how many complete lines fit
        let candidate = data.len() / line_stride;
        // Verify the last full line's newline
        if candidate > 0 && data[candidate * line_stride - 1] != b'\n' {
            return None; // Not a clean line-structured file
        }
        candidate
    } else {
        0
    };

    let remainder_start = full_lines * line_stride;
    let remainder = &data[remainder_start..];

    // Calculate exact output size
    let remainder_clean_len = if remainder.is_empty() {
        0
    } else {
        // Remainder might end with newline, strip it
        let rem = if remainder.last() == Some(&b'\n') {
            &remainder[..remainder.len() - 1]
        } else {
            remainder
        };
        if rem.is_empty() {
            0
        } else {
            // Check for padding
            let pad = rem.iter().rev().take(2).filter(|&&b| b == b'=').count();
            if rem.len() % 4 != 0 {
                return None; // Invalid remainder
            }
            rem.len() * 3 / 4 - pad
        }
    };

    // Single-allocation decode: allocate full decoded output, decode all lines
    // directly into it, then write_all in one syscall. For 10MB base64 (7.5MB decoded),
    // this does 1 write() instead of ~30 chunked writes. The 7.5MB allocation is trivial
    // compared to the mmap'd input. SIMD decode at ~8 GB/s finishes in <1ms.
    let total_decoded = full_lines * decoded_per_line + remainder_clean_len;
    let mut out_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(total_decoded);
    }

    let dst = out_buf.as_mut_ptr();

    // Parallel line decode for large inputs (>= 4MB): split lines across threads.
    // Each thread decodes a contiguous block of lines directly to its final position
    // in the shared output buffer. SAFETY: non-overlapping output regions per thread.
    if data.len() >= PARALLEL_DECODE_THRESHOLD && full_lines >= 64 {
        let out_addr = dst as usize;
        let num_threads = num_cpus().max(1);
        let lines_per_chunk = (full_lines / num_threads).max(1);

        // Build per-thread task ranges: (start_line, end_line)
        let mut tasks: Vec<(usize, usize)> = Vec::new();
        let mut line_off = 0;
        while line_off < full_lines {
            let end = (line_off + lines_per_chunk).min(full_lines);
            tasks.push((line_off, end));
            line_off = end;
        }

        let decode_err = std::sync::atomic::AtomicBool::new(false);
        rayon::scope(|s| {
            for &(start_line, end_line) in &tasks {
                let decode_err = &decode_err;
                s.spawn(move |_| {
                    let out_ptr = out_addr as *mut u8;
                    let mut i = start_line;

                    while i + 4 <= end_line {
                        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let in_base = i * line_stride;
                        let ob = i * decoded_per_line;
                        unsafe {
                            let s0 =
                                std::slice::from_raw_parts_mut(out_ptr.add(ob), decoded_per_line);
                            if BASE64_ENGINE
                                .decode(&data[in_base..in_base + line_len], s0.as_out())
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s1 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + line_stride..in_base + line_stride + line_len],
                                    s1.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s2 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + 2 * decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + 2 * line_stride
                                        ..in_base + 2 * line_stride + line_len],
                                    s2.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                            let s3 = std::slice::from_raw_parts_mut(
                                out_ptr.add(ob + 3 * decoded_per_line),
                                decoded_per_line,
                            );
                            if BASE64_ENGINE
                                .decode(
                                    &data[in_base + 3 * line_stride
                                        ..in_base + 3 * line_stride + line_len],
                                    s3.as_out(),
                                )
                                .is_err()
                            {
                                decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                                return;
                            }
                        }
                        i += 4;
                    }

                    while i < end_line {
                        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
                            return;
                        }
                        let in_start = i * line_stride;
                        let out_off = i * decoded_per_line;
                        let out_slice = unsafe {
                            std::slice::from_raw_parts_mut(out_ptr.add(out_off), decoded_per_line)
                        };
                        if BASE64_ENGINE
                            .decode(&data[in_start..in_start + line_len], out_slice.as_out())
                            .is_err()
                        {
                            decode_err.store(true, std::sync::atomic::Ordering::Relaxed);
                            return;
                        }
                        i += 1;
                    }
                });
            }
        });

        if decode_err.load(std::sync::atomic::Ordering::Relaxed) {
            return Some(decode_error());
        }
    } else {
        // Sequential decode with 4x unrolling for smaller inputs
        let mut i = 0;

        while i + 4 <= full_lines {
            let in_base = i * line_stride;
            let out_base = i * decoded_per_line;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), decoded_per_line);
                if BASE64_ENGINE
                    .decode(&data[in_base..in_base + line_len], s0.as_out())
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s1 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + line_stride..in_base + line_stride + line_len],
                        s1.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s2 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + 2 * decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + 2 * line_stride..in_base + 2 * line_stride + line_len],
                        s2.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }

                let s3 = std::slice::from_raw_parts_mut(
                    dst.add(out_base + 3 * decoded_per_line),
                    decoded_per_line,
                );
                if BASE64_ENGINE
                    .decode(
                        &data[in_base + 3 * line_stride..in_base + 3 * line_stride + line_len],
                        s3.as_out(),
                    )
                    .is_err()
                {
                    return Some(decode_error());
                }
            }
            i += 4;
        }

        while i < full_lines {
            let in_start = i * line_stride;
            let in_end = in_start + line_len;
            let out_off = i * decoded_per_line;
            let out_slice =
                unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), decoded_per_line) };
            match BASE64_ENGINE.decode(&data[in_start..in_end], out_slice.as_out()) {
                Ok(_) => {}
                Err(_) => return Some(decode_error()),
            }
            i += 1;
        }
    }

    // Decode remainder
    if remainder_clean_len > 0 {
        let rem = if remainder.last() == Some(&b'\n') {
            &remainder[..remainder.len() - 1]
        } else {
            remainder
        };
        let out_off = full_lines * decoded_per_line;
        let out_slice =
            unsafe { std::slice::from_raw_parts_mut(dst.add(out_off), remainder_clean_len) };
        match BASE64_ENGINE.decode(rem, out_slice.as_out()) {
            Ok(_) => {}
            Err(_) => return Some(decode_error()),
        }
    }

    // Single write_all for the entire decoded output
    Some(out.write_all(&out_buf[..total_decoded]))
}
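
// Worked offsets for the standard wrap=76 case (a sketch of the indexing above):
// line i starts at input offset i*77 (76 encoded bytes plus a newline) and decodes
// into output offsets [i*57, (i+1)*57), so threads can decode disjoint line ranges
// straight into the shared buffer. A 77,000-byte input (1000 full lines) decodes
// to exactly 57,000 bytes.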

/// Decode a clean (no whitespace) buffer in-place with SIMD.
fn decode_clean_slice(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    decode_inplace_with_padding(data, out)
}

/// Cold error path — keeps hot decode path tight by moving error construction out of line.
#[cold]
#[inline(never)]
fn decode_error() -> io::Result<()> {
    Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"))
}

/// Decode in-place with padding fallback for truncated input.
/// GNU base64 accepts missing padding at end of stream, so if decode fails
/// and the length mod 4 is 2 or 3, retry with padding added.
fn decode_inplace_with_padding(data: &mut [u8], out: &mut impl Write) -> io::Result<()> {
    match BASE64_ENGINE.decode_inplace(data) {
        Ok(decoded) => out.write_all(decoded),
        Err(_) => {
            let remainder = data.len() % 4;
            if remainder == 2 || remainder == 3 {
                let has_existing_padding = memchr::memchr(b'=', data).is_some();
                let mut padded = Vec::with_capacity(data.len() + (4 - remainder));
                padded.extend_from_slice(data);
                padded.extend(std::iter::repeat_n(b'=', 4 - remainder));
                if let Ok(decoded) = BASE64_ENGINE.decode_inplace(&mut padded) {
                    out.write_all(decoded)?;
                    if has_existing_padding {
                        return decode_error();
                    }
                    return Ok(());
                }
            }
            decode_error()
        }
    }
}
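
// Padding-fallback sketch (inputs are illustrative): b"QUJDRA" has length 6
// (6 % 4 == 2), so the first decode_inplace call fails; the retry appends "=="
// to form b"QUJDRA==", which decodes to b"ABCD" and is accepted, matching GNU
// base64's tolerance for missing end-of-stream padding. If the truncated input
// already contained '=', the decoded bytes are still written but an error is
// reported, mirroring GNU behavior.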

/// Decode clean base64 data (no whitespace) from a borrowed slice.
fn decode_borrowed_clean(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }
    // Parallel decode for large data: split at 4-byte boundaries,
    // decode each chunk independently (base64 is context-free per 4-char group).
    if data.len() >= PARALLEL_DECODE_THRESHOLD {
        return decode_borrowed_clean_parallel(out, data);
    }
    // If input has truncated padding, pad it first (GNU base64 accepts missing padding).
    let remainder = data.len() % 4;
    if remainder == 2 || remainder == 3 {
        // If input already has '=' but length mod 4 != 0, the padding is
        // wrong/truncated. GNU base64 still decodes but reports error.
        let has_existing_padding = memchr::memchr(b'=', data).is_some();
        let mut padded = Vec::with_capacity(data.len() + (4 - remainder));
        padded.extend_from_slice(data);
        padded.extend(std::iter::repeat_n(b'=', 4 - remainder));
        let result = decode_borrowed_clean(out, &padded);
        if has_existing_padding && result.is_ok() {
            return decode_error();
        }
        return result;
    }
    // Pre-allocate exact output size to avoid decode_to_vec's reallocation.
    // Decoded size = data.len() * 3 / 4 minus padding.
    let pad = data.iter().rev().take(2).filter(|&&b| b == b'=').count();
    let decoded_size = data.len() * 3 / 4 - pad;
    let mut buf: Vec<u8> = Vec::with_capacity(decoded_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(decoded_size);
    }
    match BASE64_ENGINE.decode(data, buf[..decoded_size].as_out()) {
        Ok(decoded) => {
            out.write_all(decoded)?;
            Ok(())
        }
        Err(_) => decode_error(),
    }
}
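
// Exact-size arithmetic used above (illustrative): b"aGVsbG8=" has length 8 with
// one trailing '=', so decoded_size = 8 * 3 / 4 - 1 = 5 and the output buffer
// holds exactly b"hello" with no reallocation.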

/// Parallel decode: split at 4-byte boundaries, decode chunks in parallel.
/// Pre-allocates a single contiguous output buffer with exact decoded offsets computed
/// upfront, so each thread decodes directly to its final position. No compaction needed.
fn decode_borrowed_clean_parallel(out: &mut impl Write, data: &[u8]) -> io::Result<()> {
    let num_threads = num_cpus().max(1);
    let raw_chunk = data.len() / num_threads;
    // Align to 4 bytes (each 4 base64 chars = 3 decoded bytes, context-free)
    let chunk_size = ((raw_chunk + 3) / 4) * 4;

    let chunks: Vec<&[u8]> = data.chunks(chunk_size.max(4)).collect();

    // Compute exact decoded sizes per chunk upfront to eliminate the compaction pass.
    let mut offsets: Vec<usize> = Vec::with_capacity(chunks.len() + 1);
    offsets.push(0);
    let mut total_decoded = 0usize;
    for (i, chunk) in chunks.iter().enumerate() {
        let decoded_size = if i == chunks.len() - 1 {
            let pad = chunk.iter().rev().take(2).filter(|&&b| b == b'=').count();
            chunk.len() * 3 / 4 - pad
        } else {
            chunk.len() * 3 / 4
        };
        total_decoded += decoded_size;
        offsets.push(total_decoded);
    }

    let mut output_buf: Vec<u8> = Vec::with_capacity(total_decoded);
    #[allow(clippy::uninit_vec)]
    unsafe {
        output_buf.set_len(total_decoded);
    }
    #[cfg(target_os = "linux")]
    hint_hugepage(&mut output_buf);

    // Parallel decode: each thread decodes directly into its exact final position.
    // SAFETY: each thread writes to a non-overlapping region of the output buffer.
    let out_addr = output_buf.as_mut_ptr() as usize;
    let err_flag = std::sync::atomic::AtomicBool::new(false);
    rayon::scope(|s| {
        for (i, chunk) in chunks.iter().enumerate() {
            let offset = offsets[i];
            let expected_size = offsets[i + 1] - offset;
            let err_flag = &err_flag;
            s.spawn(move |_| {
                if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
                    return;
                }
                // SAFETY: each thread writes to non-overlapping region
                let out_slice = unsafe {
                    std::slice::from_raw_parts_mut((out_addr as *mut u8).add(offset), expected_size)
                };
                if BASE64_ENGINE.decode(chunk, out_slice.as_out()).is_err() {
                    err_flag.store(true, std::sync::atomic::Ordering::Relaxed);
                }
            });
        }
    });

    if err_flag.load(std::sync::atomic::Ordering::Relaxed) {
        return Err(io::Error::new(io::ErrorKind::InvalidData, "invalid input"));
    }

    out.write_all(&output_buf[..total_decoded])
}
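
// Chunking sketch for the parallel path above (sizes are illustrative): a clean
// 8 MiB input on 4 threads gives raw_chunk = 2 MiB, already a multiple of 4, so
// each chunk decodes independently to 1.5 MiB. Only the last chunk can carry '='
// padding, so only its decoded size subtracts the pad count; the prefix sums in
// `offsets` then give every thread its exact output position upfront.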

/// Strip non-base64 characters (for -i / --ignore-garbage).
fn strip_non_base64(data: &[u8]) -> Vec<u8> {
    data.iter()
        .copied()
        .filter(|&b| is_base64_char(b))
        .collect()
}

/// Check if a byte is a valid base64 alphabet character or padding.
#[inline]
fn is_base64_char(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'+' || b == b'/' || b == b'='
}

/// Stream-encode from a reader to a writer. Used for stdin processing.
/// Dispatches to specialized paths for wrap_col=0 (no wrap) and wrap_col>0 (wrapping).
pub fn encode_stream(
    reader: &mut impl Read,
    wrap_col: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    if wrap_col == 0 {
        return encode_stream_nowrap(reader, writer);
    }
    encode_stream_wrapped(reader, wrap_col, writer)
}
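
// A minimal usage sketch for encode_stream, assuming the GNU default wrap column
// of 76. The helper name and the in-memory reader/writer are illustrative only.
#[allow(dead_code)]
fn example_encode_stream() -> io::Result<()> {
    let mut reader: &[u8] = b"hello";
    let mut encoded: Vec<u8> = Vec::new();
    encode_stream(&mut reader, 76, &mut encoded)?;
    assert_eq!(encoded.as_slice(), &b"aGVsbG8=\n"[..]);
    Ok(())
}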

/// Streaming encode with NO line wrapping — optimized fast path.
/// Read size is 24MB (divisible by 3): encoded output = 24MB * 4/3 = 32MB.
/// With 24MB reads, a typical 10-18MB input is consumed in a single read() call,
/// and the encoded output goes out in 1-2 write() calls.
fn encode_stream_nowrap(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
    // 24MB read buffer, divisible by 3: each chunk encodes to whole 4-char groups,
    // so no partial group carries over between reads.
    const NOWRAP_READ: usize = 24 * 1024 * 1024; // exactly divisible by 3

    // SAFETY: buf bytes are written by read_full before being processed.
    // encode_buf bytes are written by encode before being read.
    let mut buf: Vec<u8> = Vec::with_capacity(NOWRAP_READ);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(NOWRAP_READ);
    }
    let encode_buf_size = BASE64_ENGINE.encoded_length(NOWRAP_READ);
    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        encode_buf.set_len(encode_buf_size);
    }

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let enc_len = BASE64_ENGINE.encoded_length(n);
        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());
        writer.write_all(encoded)?;
    }
    Ok(())
}

/// Streaming encode WITH line wrapping.
/// For the common case (wrap_col divides evenly into 3-byte input groups),
/// uses fuse_wrap to build a contiguous output buffer with newlines interleaved,
/// then writes it in a single write() call. This eliminates the overhead of
/// many writev() syscalls (one per ~512 lines via IoSlice).
///
/// For non-aligned wrap columns, falls back to the IoSlice/writev approach.
fn encode_stream_wrapped(
    reader: &mut impl Read,
    wrap_col: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    let bytes_per_line = wrap_col * 3 / 4;
    // For the common case (76-col wrapping, bytes_per_line=57 which is divisible by 3),
    // align the read buffer to bytes_per_line boundaries so each chunk produces
    // complete lines with no column carry-over between chunks.
    if bytes_per_line > 0 && bytes_per_line.is_multiple_of(3) {
        return encode_stream_wrapped_fused(reader, wrap_col, bytes_per_line, writer);
    }

    // Fallback: non-aligned wrap columns use IoSlice/writev with column tracking
    const STREAM_READ: usize = 12 * 1024 * 1024;
    let mut buf: Vec<u8> = Vec::with_capacity(STREAM_READ);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(STREAM_READ);
    }
    let encode_buf_size = BASE64_ENGINE.encoded_length(STREAM_READ);
    let mut encode_buf: Vec<u8> = Vec::with_capacity(encode_buf_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        encode_buf.set_len(encode_buf_size);
    }

    let mut col = 0usize;

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }
        let enc_len = BASE64_ENGINE.encoded_length(n);
        let encoded = BASE64_ENGINE.encode(&buf[..n], encode_buf[..enc_len].as_out());

        write_wrapped_iov_streaming(encoded, wrap_col, &mut col, writer)?;
    }

    if col > 0 {
        writer.write_all(b"\n")?;
    }

    Ok(())
}
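
// Dispatch sketch (wrap columns are illustrative): wrap_col = 76 gives
// bytes_per_line = 76 * 3 / 4 = 57, which is divisible by 3, so every chunk ends
// on a line boundary and the fused path below applies. wrap_col = 10 gives
// bytes_per_line = 7, which is not divisible by 3, so the IoSlice/writev fallback
// with column carry-over is used instead.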

/// Direct-to-position encode+wrap streaming: align reads to bytes_per_line boundaries,
/// encode each line directly into its final position with newline appended.
/// Eliminates the two-pass encode-then-fuse_wrap approach.
/// For 76-col wrapping (bytes_per_line=57): 12MB / 57 = ~220K complete lines per chunk.
/// Output = 220K * 77 bytes = ~17MB, one write() syscall per chunk.
fn encode_stream_wrapped_fused(
    reader: &mut impl Read,
    wrap_col: usize,
    bytes_per_line: usize,
    writer: &mut impl Write,
) -> io::Result<()> {
    // Align read size to bytes_per_line for complete output lines per chunk.
    // ~220K lines * 57 bytes = ~12MB input, ~17MB output per chunk.
    // Smaller chunks have better cache behavior for the 4-line unrolled encode loop.
    let lines_per_chunk = (12 * 1024 * 1024) / bytes_per_line;
    let read_size = lines_per_chunk * bytes_per_line;
    let line_out = wrap_col + 1; // wrap_col encoded bytes + 1 newline

    // SAFETY: buf bytes are written by read_full before being processed.
    // out_buf bytes are written by encode before being read.
    let mut buf: Vec<u8> = Vec::with_capacity(read_size);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(read_size);
    }
    // Output buffer: enough for all lines + remainder
    let max_output = lines_per_chunk * line_out + BASE64_ENGINE.encoded_length(bytes_per_line) + 2;
    let mut out_buf: Vec<u8> = Vec::with_capacity(max_output);
    #[allow(clippy::uninit_vec)]
    unsafe {
        out_buf.set_len(max_output);
    }

    loop {
        let n = read_full(reader, &mut buf)?;
        if n == 0 {
            break;
        }

        let full_lines = n / bytes_per_line;
        let remainder = n % bytes_per_line;

        // Encode each input line directly into its final output position.
        // Each 57-byte input line -> 76 encoded bytes + '\n' = 77 bytes at offset line_idx * 77.
        // This eliminates the separate encode + fuse_wrap copy entirely.
        let dst = out_buf.as_mut_ptr();
        let mut line_idx = 0;

        // 4-line unrolled loop for better ILP
        while line_idx + 4 <= full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s0 = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s0.as_out());
                *dst.add(out_base + wrap_col) = b'\n';

                let s1 = std::slice::from_raw_parts_mut(dst.add(out_base + line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + bytes_per_line..in_base + 2 * bytes_per_line],
                    s1.as_out(),
                );
                *dst.add(out_base + line_out + wrap_col) = b'\n';

                let s2 = std::slice::from_raw_parts_mut(dst.add(out_base + 2 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 2 * bytes_per_line..in_base + 3 * bytes_per_line],
                    s2.as_out(),
                );
                *dst.add(out_base + 2 * line_out + wrap_col) = b'\n';

                let s3 = std::slice::from_raw_parts_mut(dst.add(out_base + 3 * line_out), wrap_col);
                let _ = BASE64_ENGINE.encode(
                    &buf[in_base + 3 * bytes_per_line..in_base + 4 * bytes_per_line],
                    s3.as_out(),
                );
                *dst.add(out_base + 3 * line_out + wrap_col) = b'\n';
            }
            line_idx += 4;
        }

        // Remaining full lines
        while line_idx < full_lines {
            let in_base = line_idx * bytes_per_line;
            let out_base = line_idx * line_out;
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(out_base), wrap_col);
                let _ = BASE64_ENGINE.encode(&buf[in_base..in_base + bytes_per_line], s.as_out());
                *dst.add(out_base + wrap_col) = b'\n';
            }
            line_idx += 1;
        }

        let mut wp = full_lines * line_out;

        // Handle remainder (partial last line of this chunk)
        if remainder > 0 {
            let enc_len = BASE64_ENGINE.encoded_length(remainder);
            let line_input = &buf[full_lines * bytes_per_line..n];
            unsafe {
                let s = std::slice::from_raw_parts_mut(dst.add(wp), enc_len);
                let _ = BASE64_ENGINE.encode(line_input, s.as_out());
                *dst.add(wp + enc_len) = b'\n';
            }
            wp += enc_len + 1;
        }

        writer.write_all(&out_buf[..wp])?;
    }

    Ok(())
}

/// Stream-decode from a reader to a writer. Used for stdin processing.
/// In-place strip + decode: read chunk -> strip whitespace in-place in read buffer
/// -> decode in-place -> write. Eliminates separate clean buffer allocation (saves 32MB).
/// Uses 32MB read buffer for maximum pipe throughput — read_full retries to
/// fill the entire buffer from the pipe, and 32MB means even large inputs
/// (up to ~24MB after base64 encoding of 18MB raw) are read in a single syscall batch.
pub fn decode_stream(
    reader: &mut impl Read,
    ignore_garbage: bool,
    writer: &mut impl Write,
) -> io::Result<()> {
    const READ_CHUNK: usize = 32 * 1024 * 1024;
    // SAFETY: buf bytes are written by read_full before being processed.
    // The extra 4 bytes accommodate carry-over from previous chunk.
    let mut buf: Vec<u8> = Vec::with_capacity(READ_CHUNK + 4);
    #[allow(clippy::uninit_vec)]
    unsafe {
        buf.set_len(READ_CHUNK + 4);
    }
    let mut carry = [0u8; 4];
    let mut carry_len = 0usize;

    loop {
        // Copy carry bytes to start of buffer, read new data after them
        if carry_len > 0 {
            unsafe {
                std::ptr::copy_nonoverlapping(carry.as_ptr(), buf.as_mut_ptr(), carry_len);
            }
        }
        let n = read_full(reader, &mut buf[carry_len..carry_len + READ_CHUNK])?;
        if n == 0 {
            break;
        }
        let total_raw = carry_len + n;

        // Strip whitespace in-place in the buffer itself.
        // This eliminates the separate clean buffer allocation (saves 32MB).
        let clean_len = if ignore_garbage {
            // Scalar filter for ignore_garbage mode (rare path)
            let ptr = buf.as_mut_ptr();
            let mut wp = 0usize;
            for i in 0..total_raw {
                let b = unsafe { *ptr.add(i) };
                if is_base64_char(b) {
                    unsafe { *ptr.add(wp) = b };
                    wp += 1;
                }
            }
            wp
        } else {
            // In-place SIMD gap-copy using memchr2 to find \n and \r positions.
            // For typical base64 (76-char lines), newlines are ~1/77 of the data,
            // so we process ~76 bytes per memchr hit.
            let ptr = buf.as_mut_ptr();
            let data = &buf[..total_raw];
            let mut wp = 0usize;
            let mut gap_start = 0usize;
            let mut has_rare_ws = false;

            for pos in memchr::memchr2_iter(b'\n', b'\r', data) {
                let gap_len = pos - gap_start;
                if gap_len > 0 {
                    if !has_rare_ws {
                        has_rare_ws = data[gap_start..pos]
                            .iter()
                            .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                    }
                    if wp != gap_start {
                        unsafe {
                            std::ptr::copy(ptr.add(gap_start), ptr.add(wp), gap_len);
                        }
                    }
                    wp += gap_len;
                }
                gap_start = pos + 1;
            }
            let tail_len = total_raw - gap_start;
            if tail_len > 0 {
                if !has_rare_ws {
                    has_rare_ws = data[gap_start..total_raw]
                        .iter()
                        .any(|&b| b == b' ' || b == b'\t' || b == 0x0b || b == 0x0c);
                }
                if wp != gap_start {
                    unsafe {
                        std::ptr::copy(ptr.add(gap_start), ptr.add(wp), tail_len);
                    }
                }
                wp += tail_len;
            }

            // Second pass for rare whitespace (tab, space, VT, FF) — only when detected.
            if has_rare_ws {
                let mut rp = 0;
                let mut cwp = 0;
                while rp < wp {
                    let b = unsafe { *ptr.add(rp) };
                    if NOT_WHITESPACE[b as usize] {
                        unsafe { *ptr.add(cwp) = b };
                        cwp += 1;
                    }
                    rp += 1;
                }
                cwp
            } else {
                wp
            }
        };

        carry_len = 0;
        let is_last = n < READ_CHUNK;

        if is_last {
            // Last chunk: decode everything (including padding)
            decode_clean_slice(&mut buf[..clean_len], writer)?;
        } else {
            // Save incomplete base64 quadruplet for next iteration
            let decode_len = (clean_len / 4) * 4;
            let leftover = clean_len - decode_len;
            if leftover > 0 {
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        buf.as_ptr().add(decode_len),
                        carry.as_mut_ptr(),
                        leftover,
                    );
                }
                carry_len = leftover;
            }
            if decode_len > 0 {
                decode_clean_slice(&mut buf[..decode_len], writer)?;
            }
        }
    }

    // Handle any remaining carry-over bytes
    if carry_len > 0 {
        let mut carry_buf = carry[..carry_len].to_vec();
        decode_clean_slice(&mut carry_buf, writer)?;
    }

    Ok(())
}
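
// A minimal usage sketch for decode_stream, assuming line-wrapped input arriving
// through any Read source (a byte slice stands in for stdin here). The helper
// name and literals are illustrative only.
#[allow(dead_code)]
fn example_decode_stream() -> io::Result<()> {
    let mut reader: &[u8] = b"aGVsbG8s\nIHdvcmxkIQ==\n";
    let mut decoded: Vec<u8> = Vec::new();
    decode_stream(&mut reader, /* ignore_garbage */ false, &mut decoded)?;
    assert_eq!(decoded.as_slice(), &b"hello, world!"[..]);
    Ok(())
}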

/// Write all IoSlice entries using write_vectored (writev syscall).
/// Hot path: single write_vectored succeeds fully (common on Linux pipes/files).
/// Cold path: partial write handled out-of-line to keep hot path tight.
#[inline(always)]
fn write_all_vectored(out: &mut impl Write, slices: &[io::IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    let total: usize = slices.iter().map(|s| s.len()).sum();
    let written = out.write_vectored(slices)?;
    if written >= total {
        return Ok(());
    }
    if written == 0 {
        return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
    }
    write_all_vectored_slow(out, slices, written)
}

/// Handle partial write (cold path, never inlined).
#[cold]
#[inline(never)]
fn write_all_vectored_slow(
    out: &mut impl Write,
    slices: &[io::IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for slice in slices {
        let len = slice.len();
        if skip >= len {
            skip -= len;
            continue;
        }
        out.write_all(&slice[skip..])?;
        skip = 0;
    }
    Ok(())
}

/// Read as many bytes as possible into buf, retrying on partial reads.
/// Fast path: regular file reads usually return the full buffer on the first call,
/// avoiding the loop overhead entirely.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    // Fast path: first read() usually fills the entire buffer for regular files
    let n = reader.read(buf)?;
    if n == buf.len() || n == 0 {
        return Ok(n);
    }
    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
    let mut total = n;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}