// coreutils_rs/cut/core.rs
1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Minimum file size for parallel processing (8MB).
/// Files above this threshold use rayon parallel chunked processing.
/// 8MB balances the split_for_scope scan overhead against parallel benefits.
const PARALLEL_THRESHOLD: usize = 8 * 1024 * 1024;

/// Max iovec entries per writev call (Linux default).
/// NOTE(review): 1024 matches Linux's UIO_MAXIOV/IOV_MAX; other platforms may
/// differ — confirm if portability beyond Linux matters.
const MAX_IOV: usize = 1024;
11
/// Configuration for cut operations.
pub struct CutConfig<'a> {
    /// Cutting mode (bytes/chars/fields — `CutMode` is declared elsewhere in this module).
    pub mode: CutMode,
    /// Sorted, merged 1-based selection ranges (see `parse_ranges`).
    pub ranges: &'a [Range],
    /// When true, output everything NOT selected by `ranges` (--complement).
    pub complement: bool,
    /// Input field delimiter byte (-d).
    pub delim: u8,
    /// Delimiter emitted between output fields (--output-delimiter).
    pub output_delim: &'a [u8],
    /// When true, drop lines that contain no delimiter (-s / --only-delimited).
    pub suppress_no_delim: bool,
    /// Record terminator byte; b'\n' normally, b'\0' for -z.
    pub line_delim: u8,
}
22
/// A range specification like 1, 3-5, -3, 4-
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based, 0 means "from beginning"
    pub end: usize,   // 1-based, usize::MAX means "to end"
}

/// Parse a LIST specification like "1,3-5,7-" into ranges.
/// Each range is 1-based. Returns sorted, merged ranges.
/// When `no_merge_adjacent` is true, overlapping ranges are still merged but
/// adjacent ranges (e.g., 1-2,3-4) are kept separate. This is needed when
/// `--output-delimiter` is specified for byte/char mode so the delimiter is
/// inserted between originally separate but adjacent ranges.
///
/// # Errors
/// Returns a human-readable message for malformed parts, zero endpoints,
/// decreasing ranges, or an empty list.
pub fn parse_ranges(spec: &str, no_merge_adjacent: bool) -> Result<Vec<Range>, String> {
    let mut ranges = Vec::new();

    for part in spec.split(',') {
        let part = part.trim();
        if part.is_empty() {
            continue;
        }

        if let Some(idx) = part.find('-') {
            let left = &part[..idx];
            let right = &part[idx + 1..];

            // Reject bare "-" (both sides empty)
            if left.is_empty() && right.is_empty() {
                return Err("invalid range with no endpoint: -".to_string());
            }

            // "-N" means "from the beginning", i.e. 1-N.
            let start = if left.is_empty() {
                1
            } else {
                left.parse::<usize>()
                    .map_err(|_| format!("invalid range: '{}'", part))?
            };

            // "N-" means "to the end of the line".
            let end = if right.is_empty() {
                usize::MAX
            } else {
                right
                    .parse::<usize>()
                    .map_err(|_| format!("invalid range: '{}'", part))?
            };

            if start == 0 {
                return Err("fields and positions are numbered from 1".to_string());
            }
            if start > end {
                return Err(format!("invalid decreasing range: '{}'", part));
            }

            ranges.push(Range { start, end });
        } else {
            let n = part
                .parse::<usize>()
                .map_err(|_| format!("invalid field: '{}'", part))?;
            if n == 0 {
                return Err("fields and positions are numbered from 1".to_string());
            }
            ranges.push(Range { start: n, end: n });
        }
    }

    if ranges.is_empty() {
        return Err("you must specify a list of bytes, characters, or fields".to_string());
    }

    // Sort by (start, end). Unstable sort: no allocation, and equal keys mean
    // identical ranges, so stability is irrelevant.
    ranges.sort_unstable_by_key(|r| (r.start, r.end));

    // Merge pass: overlap always merges; adjacency merges only when allowed.
    // (Previously the two policies were duplicated branch bodies.)
    let mut merged: Vec<Range> = Vec::with_capacity(ranges.len());
    merged.push(ranges[0].clone());
    for r in &ranges[1..] {
        let last = merged.last_mut().unwrap();
        let merge_limit = if no_merge_adjacent {
            last.end
        } else {
            // saturating_add: an open-ended range (end == usize::MAX) must not wrap.
            last.end.saturating_add(1)
        };
        if r.start <= merge_limit {
            last.end = last.end.max(r.end);
        } else {
            merged.push(r.clone());
        }
    }

    Ok(merged)
}
116
117/// Check if a 1-based position is in any range.
118/// Ranges must be sorted. Uses early exit since ranges are sorted.
119#[inline(always)]
120fn in_ranges(ranges: &[Range], pos: usize) -> bool {
121    for r in ranges {
122        if pos < r.start {
123            return false;
124        }
125        if pos <= r.end {
126            return true;
127        }
128    }
129    false
130}
131
132/// Pre-compute a 64-bit mask for field selection.
133/// Bit i-1 is set if field i should be output.
134#[inline]
135fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
136    let mut mask: u64 = 0;
137    for i in 1..=64u32 {
138        let in_range = in_ranges(ranges, i as usize);
139        if in_range != complement {
140            mask |= 1u64 << (i - 1);
141        }
142    }
143    mask
144}
145
146/// Check if a field should be selected, using bitset for first 64 fields.
147#[inline(always)]
148fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
149    if field_num <= 64 {
150        (mask >> (field_num - 1)) & 1 == 1
151    } else {
152        in_ranges(ranges, field_num) != complement
153    }
154}
155
156// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
157
/// Append a slice to `buf` without capacity checks or growth.
///
/// # Safety
/// The caller MUST ensure `buf.capacity() - buf.len() >= data.len()`;
/// otherwise this writes past the allocation (undefined behavior).
/// A `debug_assert!` catches violations in debug builds at zero release cost.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    debug_assert!(
        buf.capacity() - buf.len() >= data.len(),
        "buf_extend: insufficient reserved capacity"
    );
    unsafe {
        let len = buf.len();
        // SAFETY: caller guarantees remaining capacity; `data` (shared borrow)
        // cannot alias `buf` (unique borrow), so the regions do not overlap.
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
168
/// Append a single byte to `buf` without capacity checks or growth.
///
/// # Safety
/// The caller MUST ensure `buf.capacity() > buf.len()`; otherwise this writes
/// past the allocation (undefined behavior). A `debug_assert!` catches
/// violations in debug builds at zero release cost.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    debug_assert!(
        buf.capacity() > buf.len(),
        "buf_push: insufficient reserved capacity"
    );
    unsafe {
        let len = buf.len();
        // SAFETY: caller guarantees at least one byte of spare capacity.
        *buf.as_mut_ptr().add(len) = b;
        buf.set_len(len + 1);
    }
}
179
180/// Write multiple IoSlice buffers using write_vectored (writev syscall).
181/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
182/// Cold path (partial write) is out-of-line to keep the hot loop tight.
183#[inline]
184fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
185    if slices.is_empty() {
186        return Ok(());
187    }
188    for batch in slices.chunks(MAX_IOV) {
189        let total: usize = batch.iter().map(|s| s.len()).sum();
190        let written = out.write_vectored(batch)?;
191        if written >= total {
192            continue;
193        }
194        if written == 0 {
195            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
196        }
197        write_ioslices_slow(out, batch, written)?;
198    }
199    Ok(())
200}
201
/// Finish a batch after a partial write_vectored (cold path, never inlined).
/// `skip` is the count of bytes the kernel already consumed across the batch;
/// fully-consumed slices are stepped over, the first partially-consumed slice
/// is resumed mid-buffer, and everything after it is written whole.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for s in slices {
        let n = s.len();
        if skip < n {
            // Emit the unwritten tail of this slice (whole slice when skip == 0).
            out.write_all(&s[skip..])?;
            skip = 0;
        } else {
            // This slice was entirely covered by the partial write.
            skip -= n;
        }
    }
    Ok(())
}
221
222// ── Chunk splitting for parallel processing ──────────────────────────────
223
/// Number of available CPUs for parallel chunk splitting.
/// Queries std::thread::available_parallelism() directly instead of asking
/// rayon, so the rayon pool (~300-500µs init) is not spun up prematurely;
/// the pool initializes lazily on the first scope() call. Falls back to 1
/// when parallelism cannot be determined.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(n) => n.get(),
        Err(_) => 1,
    }
}
233
234/// Split data into chunks for rayon::scope parallel processing.
235/// Uses Rayon's thread count to match the number of worker threads.
236fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
237    let num_threads = num_cpus().max(1);
238    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
239        return vec![data];
240    }
241
242    let chunk_size = data.len() / num_threads;
243    let mut chunks = Vec::with_capacity(num_threads);
244    let mut pos = 0;
245
246    for _ in 0..num_threads - 1 {
247        let target = pos + chunk_size;
248        if target >= data.len() {
249            break;
250        }
251        let boundary = memchr::memchr(line_delim, &data[target..])
252            .map(|p| target + p + 1)
253            .unwrap_or(data.len());
254        if boundary > pos {
255            chunks.push(&data[pos..boundary]);
256        }
257        pos = boundary;
258    }
259
260    if pos < data.len() {
261        chunks.push(&data[pos..]);
262    }
263
264    chunks
265}
266
267// ── Fast path: multi-field non-contiguous extraction ─────────────────────
268
/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
/// Pre-collects delimiter positions per line into a stack-allocated array,
/// then directly indexes into them for each selected field.
/// This is O(max_field) per line instead of O(num_fields * scan_length).
fn process_fields_multi_select(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    // Highest requested field index; bounds the per-line delimiter scan.
    let max_field = ranges.last().map_or(0, |r| r.end);

    if data.len() >= PARALLEL_THRESHOLD {
        // Parallel path: chunks are whole numbers of lines, each processed
        // into its own buffer so no synchronization is needed.
        let chunks = split_for_scope(data, line_delim);
        let n = chunks.len();
        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
        rayon::scope(|s| {
            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                s.spawn(move |_| {
                    // Size heuristic: assume output ≈ 75% of input. If the
                    // guess is low the Vec simply grows; correctness is
                    // unaffected.
                    result.reserve(chunk.len() * 3 / 4);
                    multi_select_chunk(
                        chunk, delim, line_delim, ranges, max_field, suppress, result,
                    );
                });
            }
        });
        // Chunk order matches line order, so emitting the buffers in sequence
        // (skipping empty ones) reconstructs the output with one writev pass.
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        // Sequential path: one buffer, one write_all.
        let mut buf = Vec::with_capacity(data.len() * 3 / 4);
        multi_select_chunk(
            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
        );
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
314
315/// Process a chunk for multi-field extraction using two-level scanning.
316/// Outer memchr(newline) for line boundaries, inner memchr_iter(delim) for delimiter
317/// positions with early exit at max_field. This is faster than memchr2 single-pass
318/// because memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles),
319/// and the inner scan exits early at max_field instead of processing all delimiters.
320fn multi_select_chunk(
321    data: &[u8],
322    delim: u8,
323    line_delim: u8,
324    ranges: &[Range],
325    max_field: usize,
326    suppress: bool,
327    buf: &mut Vec<u8>,
328) {
329    buf.reserve(data.len());
330    let base = data.as_ptr();
331    let mut start = 0;
332    let max_delims = max_field.min(64);
333
334    for end_pos in memchr_iter(line_delim, data) {
335        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
336        multi_select_line_fast(
337            line, delim, line_delim, ranges, max_delims, suppress, buf, start, base,
338        );
339        start = end_pos + 1;
340    }
341    if start < data.len() {
342        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
343        multi_select_line_fast(
344            line, delim, line_delim, ranges, max_delims, suppress, buf, start, base,
345        );
346    }
347}
348
/// Extract selected fields from a single line using delimiter position scanning.
/// Collects delimiter byte offsets into a stack array (early exit at
/// `max_delims`), then indexes it directly for each selected field, writing
/// into `buf` via the unchecked append helpers.
///
/// Contract notes:
/// - The caller must have reserved at least line-length + 1 bytes in `buf`
///   (the chunk-level `reserve(data.len())` provides this) — the unchecked
///   `buf_push`/`buf_extend` calls depend on it.
/// - `ranges` is assumed sorted (produced by parse_ranges), which justifies
///   the `break` once a range starts past `total_fields`.
/// - NOTE(review): `delim_pos` caps at 64 entries. If a field number > 64 is
///   requested on a line with more than 64 delimiters, `field_end` falls back
///   to `len` and the emitted "field" would include all trailing fields.
///   Callers must only route range lists with max field <= 64 here — TODO
///   confirm the dispatcher enforces this.
/// - `_line_abs_start` / `_data_base` are currently unused; kept for
///   call-site compatibility.
#[inline(always)]
fn multi_select_line_fast(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    max_delims: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
    _line_abs_start: usize,
    _data_base: *const u8,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: GNU cut emits an empty record unless -s is active.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let base = line.as_ptr();

    // Collect delimiter positions up to max_delims (early exit).
    let mut delim_pos = [0usize; 64];
    let mut num_delims: usize = 0;

    for pos in memchr_iter(delim, line) {
        if num_delims < max_delims {
            delim_pos[num_delims] = pos;
            num_delims += 1;
            if num_delims >= max_delims {
                break;
            }
        }
    }

    if num_delims == 0 {
        // No delimiter: pass the whole line through unless -s suppresses it.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    let total_fields = num_delims + 1;
    // Tracks whether an output delimiter must precede the next field.
    let mut first_output = true;

    for r in ranges {
        let range_start = r.start;
        let range_end = r.end.min(total_fields);
        // Ranges are sorted: once one starts past the line's field count,
        // none of the remaining ranges can match.
        if range_start > total_fields {
            break;
        }
        for field_num in range_start..=range_end {
            if field_num > total_fields {
                break;
            }

            // Field i (1-based) spans (delim_pos[i-2] + 1)..delim_pos[i-1];
            // field 1 starts at 0 and the last field ends at line end.
            let field_start = if field_num == 1 {
                0
            } else if field_num - 2 < num_delims {
                delim_pos[field_num - 2] + 1
            } else {
                continue;
            };
            let field_end = if field_num <= num_delims {
                delim_pos[field_num - 1]
            } else {
                len
            };

            if !first_output {
                unsafe { buf_push(buf, delim) };
            }
            // SAFETY: field_start <= field_end <= len by construction above,
            // so the raw slice stays inside `line`.
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
                );
            }
            first_output = false;
        }
    }

    unsafe { buf_push(buf, line_delim) };
}
440
441// ── Fast path: field extraction with batched output ──────────────────────
442
443/// Optimized field extraction with early exit and batched output.
444fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
445    let delim = cfg.delim;
446    let line_delim = cfg.line_delim;
447    let ranges = cfg.ranges;
448    let complement = cfg.complement;
449    let output_delim = cfg.output_delim;
450    let suppress = cfg.suppress_no_delim;
451
452    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
453    // That scan was O(N) over the entire file just to check an edge case
454    // (no delimiter in any line). The per-line processing already handles
455    // lines without delimiters correctly, so the scan was pure overhead
456    // for files that DO contain delimiters (the common case).
457
458    // Ultra-fast path: single field extraction (e.g., cut -f5)
459    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
460        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
461    }
462
463    // Fast path: complement of single field or contiguous range with default output delimiter.
464    if complement
465        && ranges.len() == 1
466        && output_delim.len() == 1
467        && output_delim[0] == delim
468        && ranges[0].start == ranges[0].end
469    {
470        return process_complement_single_field(
471            data,
472            delim,
473            line_delim,
474            ranges[0].start,
475            suppress,
476            out,
477        );
478    }
479
480    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
481    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
482    if complement
483        && ranges.len() == 1
484        && ranges[0].start > 1
485        && ranges[0].end < usize::MAX
486        && output_delim.len() == 1
487        && output_delim[0] == delim
488    {
489        return process_complement_range(
490            data,
491            delim,
492            line_delim,
493            ranges[0].start,
494            ranges[0].end,
495            suppress,
496            out,
497        );
498    }
499
500    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
501    if !complement
502        && ranges.len() == 1
503        && ranges[0].start == 1
504        && output_delim.len() == 1
505        && output_delim[0] == delim
506        && ranges[0].end < usize::MAX
507    {
508        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
509    }
510
511    // Fast path: open-ended field range from field N (e.g., cut -f3-)
512    if !complement
513        && ranges.len() == 1
514        && ranges[0].end == usize::MAX
515        && ranges[0].start > 1
516        && output_delim.len() == 1
517        && output_delim[0] == delim
518    {
519        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
520    }
521
522    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
523    if !complement
524        && ranges.len() == 1
525        && ranges[0].start > 1
526        && ranges[0].end < usize::MAX
527        && output_delim.len() == 1
528        && output_delim[0] == delim
529    {
530        return process_fields_mid_range(
531            data,
532            delim,
533            line_delim,
534            ranges[0].start,
535            ranges[0].end,
536            suppress,
537            out,
538        );
539    }
540
541    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
542    // Uses delimiter position caching: find all delimiter positions per line,
543    // then directly index into them for each selected field.
544    // This is faster than the general extract_fields_to_buf which re-checks
545    // is_selected() for every field encountered.
546    if !complement
547        && ranges.len() > 1
548        && ranges.last().map_or(false, |r| r.end < usize::MAX)
549        && output_delim.len() == 1
550        && output_delim[0] == delim
551        && delim != line_delim
552    {
553        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
554    }
555
556    // General field extraction
557    let max_field = if complement {
558        usize::MAX
559    } else {
560        ranges.last().map(|r| r.end).unwrap_or(0)
561    };
562    let field_mask = compute_field_mask(ranges, complement);
563
564    if data.len() >= PARALLEL_THRESHOLD {
565        let chunks = split_for_scope(data, line_delim);
566        let n = chunks.len();
567        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
568        rayon::scope(|s| {
569            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
570                s.spawn(move |_| {
571                    result.reserve(chunk.len() + 1);
572                    process_fields_chunk(
573                        chunk,
574                        delim,
575                        ranges,
576                        output_delim,
577                        suppress,
578                        max_field,
579                        field_mask,
580                        line_delim,
581                        complement,
582                        result,
583                    );
584                });
585            }
586        });
587        let slices: Vec<IoSlice> = results
588            .iter()
589            .filter(|r| !r.is_empty())
590            .map(|r| IoSlice::new(r))
591            .collect();
592        write_ioslices(out, &slices)?;
593    } else {
594        // +1 for potential trailing line_delim when input doesn't end with one
595        let mut buf = Vec::with_capacity(data.len() + 1);
596        process_fields_chunk(
597            data,
598            delim,
599            ranges,
600            output_delim,
601            suppress,
602            max_field,
603            field_mask,
604            line_delim,
605            complement,
606            &mut buf,
607        );
608        if !buf.is_empty() {
609            out.write_all(&buf)?;
610        }
611    }
612    Ok(())
613}
614
615/// Process a chunk of data for general field extraction.
616/// Uses two-level scanning: outer memchr(newline) for line boundaries, inner
617/// memchr_iter(delim) for delimiter positions. This is faster than memchr2 single-pass
618/// because memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles).
619fn process_fields_chunk(
620    data: &[u8],
621    delim: u8,
622    ranges: &[Range],
623    output_delim: &[u8],
624    suppress: bool,
625    max_field: usize,
626    field_mask: u64,
627    line_delim: u8,
628    complement: bool,
629    buf: &mut Vec<u8>,
630) {
631    // Always use two-level approach: outer memchr(newline) + inner memchr_iter(delim).
632    // Even for complement/unbounded ranges, two-level is faster because memchr is
633    // ~30-50% faster per byte than memchr2. The per-line function call overhead
634    // is negligible compared to the SIMD scan savings.
635    if delim != line_delim {
636        buf.reserve(data.len());
637        let mut start = 0;
638        for end_pos in memchr_iter(line_delim, data) {
639            let line = &data[start..end_pos];
640            extract_fields_to_buf(
641                line,
642                delim,
643                ranges,
644                output_delim,
645                suppress,
646                max_field,
647                field_mask,
648                line_delim,
649                buf,
650                complement,
651            );
652            start = end_pos + 1;
653        }
654        if start < data.len() {
655            extract_fields_to_buf(
656                &data[start..],
657                delim,
658                ranges,
659                output_delim,
660                suppress,
661                max_field,
662                field_mask,
663                line_delim,
664                buf,
665                complement,
666            );
667        }
668        return;
669    }
670
671    // Fallback: when delim == line_delim, use the two-level scan approach
672    let mut start = 0;
673    for end_pos in memchr_iter(line_delim, data) {
674        let line = &data[start..end_pos];
675        extract_fields_to_buf(
676            line,
677            delim,
678            ranges,
679            output_delim,
680            suppress,
681            max_field,
682            field_mask,
683            line_delim,
684            buf,
685            complement,
686        );
687        start = end_pos + 1;
688    }
689    if start < data.len() {
690        extract_fields_to_buf(
691            &data[start..],
692            delim,
693            ranges,
694            output_delim,
695            suppress,
696            max_field,
697            field_mask,
698            line_delim,
699            buf,
700            complement,
701        );
702    }
703}
704
705// ── Ultra-fast single field extraction ───────────────────────────────────
706
/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
/// Uses two-level scanning: outer memchr(newline) for line boundaries, inner
/// memchr(delim) for the field delimiter with early exit.
fn process_single_field(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    // `target` is 1-based (parse_ranges rejects 0); convert to 0-based.
    let target_idx = target - 1;

    // Single-field extraction does less work per byte than the general path,
    // so parallelism pays off later: 16MB here vs the 8MB PARALLEL_THRESHOLD.
    const FIELD_PARALLEL_MIN: usize = 16 * 1024 * 1024;

    if delim != line_delim {
        // Field 1 fast path: two-level scan (outer newline + inner first-delim).
        // For field 1, only needs to find the first delimiter per line.
        // Lines without delimiter are tracked as contiguous runs for bulk copy.
        // (Only valid without -s: suppression changes which lines are emitted.)
        if target_idx == 0 && !suppress {
            if data.len() >= FIELD_PARALLEL_MIN {
                return single_field1_parallel(data, delim, line_delim, out);
            }
            // Sequential: two-level scan into buffer, single write_all.
            // Buffer approach is faster than writev for high-delimiter-density
            // data because it produces one contiguous buffer (one write syscall)
            // instead of many IoSlice entries (kernel overhead per iovec).
            let mut buf = Vec::with_capacity(data.len() + 1);
            single_field1_to_buf(data, delim, line_delim, &mut buf);
            if !buf.is_empty() {
                out.write_all(&buf)?;
            }
            return Ok(());
        }

        // Two-level approach for field N: outer newline scan + inner delim scan
        // with early exit at target_idx. Faster than memchr2 single-pass because
        // we only scan delimiters up to target_idx per line (not all of them).
        if data.len() >= FIELD_PARALLEL_MIN {
            // Parallel: line-aligned chunks into private buffers, emitted in
            // chunk (= line) order via one writev pass.
            let chunks = split_for_scope(data, line_delim);
            let n = chunks.len();
            let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
            rayon::scope(|s| {
                for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                    s.spawn(move |_| {
                        // Heuristic: one field is assumed ≈ half the input;
                        // the Vec grows if the guess is low.
                        result.reserve(chunk.len() / 2);
                        process_single_field_chunk(
                            chunk, delim, target_idx, line_delim, suppress, result,
                        );
                    });
                }
            });
            let slices: Vec<IoSlice> = results
                .iter()
                .filter(|r| !r.is_empty())
                .map(|r| IoSlice::new(r))
                .collect();
            write_ioslices(out, &slices)?;
        } else {
            // Cap the initial reservation at 4MB; output is usually much
            // smaller than the input when extracting a single field.
            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
            if !buf.is_empty() {
                out.write_all(&buf)?;
            }
        }
        return Ok(());
    }

    // Fallback for delim == line_delim: same chunked processing, smaller
    // reservation heuristic (quarter of input).
    if data.len() >= FIELD_PARALLEL_MIN {
        let chunks = split_for_scope(data, line_delim);
        let n = chunks.len();
        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
        rayon::scope(|s| {
            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                s.spawn(move |_| {
                    result.reserve(chunk.len() / 4);
                    process_single_field_chunk(
                        chunk, delim, target_idx, line_delim, suppress, result,
                    );
                });
            }
        });
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len() / 4);
        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
806
/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
fn process_complement_range(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    skip_start: usize,
    skip_end: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        // Parallel path: line-aligned chunks, each into a private buffer,
        // then a single writev pass in chunk (= line) order.
        let chunks = split_for_scope(data, line_delim);
        let n = chunks.len();
        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
        rayon::scope(|s| {
            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                s.spawn(move |_| {
                    // Full chunk-size reservation: output can approach input
                    // size since only a middle span of fields is removed.
                    result.reserve(chunk.len());
                    complement_range_chunk(
                        chunk, delim, skip_start, skip_end, line_delim, suppress, result,
                    );
                });
            }
        });
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        // Sequential path: one buffer, one write_all.
        let mut buf = Vec::with_capacity(data.len());
        complement_range_chunk(
            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
        );
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
849
850/// Process a chunk for complement range extraction.
851fn complement_range_chunk(
852    data: &[u8],
853    delim: u8,
854    skip_start: usize,
855    skip_end: usize,
856    line_delim: u8,
857    suppress: bool,
858    buf: &mut Vec<u8>,
859) {
860    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
861    buf.reserve(data.len());
862    let mut start = 0;
863    for end_pos in memchr_iter(line_delim, data) {
864        let line = &data[start..end_pos];
865        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
866        start = end_pos + 1;
867    }
868    if start < data.len() {
869        complement_range_line(
870            &data[start..],
871            delim,
872            skip_start,
873            skip_end,
874            line_delim,
875            suppress,
876            buf,
877        );
878    }
879}
880
/// Extract all fields except skip_start..=skip_end from one line.
/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
///
/// `skip_start`/`skip_end` are 1-based field numbers (inclusive range to drop);
/// `suppress` mirrors cut's `-s` (drop lines containing no delimiter).
///
/// Optimized: only scans for enough delimiters to find the skip region boundaries.
/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
/// does a single copy of prefix + suffix, avoiding scanning past field 5.
#[inline(always)]
fn complement_range_line(
    line: &[u8],
    delim: u8,
    skip_start: usize,
    skip_end: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: still printed (as a bare terminator) unless -s drops it.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
    let base = line.as_ptr();

    // 1-based field numbers. To skip fields skip_start..=skip_end:
    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
    //
    // Find the first (skip_start - 1) delimiters to locate prefix_end,
    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.

    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
    let total_need = need_prefix_delims + need_skip_delims;

    // Find delimiter positions up to total_need; the scan stops early once the
    // end of the skip region is located (fields past it need not be examined).
    let mut delim_count: usize = 0;
    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim

    for pos in memchr_iter(delim, line) {
        delim_count += 1;
        if delim_count == need_prefix_delims {
            prefix_end_pos = pos;
        }
        if delim_count == total_need {
            suffix_start_pos = pos + 1;
            break;
        }
    }

    if delim_count == 0 {
        // No delimiter at all: the whole line is field 1; print unchanged
        // unless -s suppresses delimiter-less lines.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Case analysis:
    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
    // 3. Enough to reach skip_end: prefix + delim + suffix

    if delim_count < need_prefix_delims {
        // Not enough fields to reach skip region — output entire line
        unsafe {
            buf_extend(buf, line);
            buf_push(buf, line_delim);
        }
        return;
    }

    // Past this point delim_count >= need_prefix_delims, so whenever
    // need_prefix_delims > 0 the loop above has assigned prefix_end_pos.
    let has_prefix = need_prefix_delims > 0;
    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;

    if has_prefix && has_suffix {
        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
        // SAFETY: both positions were derived from delimiter offsets inside
        // `line`, so the raw slices stay within its bounds.
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
            buf_push(buf, delim);
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
            );
            buf_push(buf, line_delim);
        }
    } else if has_prefix {
        // Only prefix, no suffix (skip region extends to end of line)
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
            buf_push(buf, line_delim);
        }
    } else if has_suffix {
        // No prefix (skip_start == 1), only suffix
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // All fields skipped — emit an empty line (just the terminator).
        unsafe { buf_push(buf, line_delim) };
    }
}
994
995/// Complement single-field extraction: skip one field, output rest unchanged.
996fn process_complement_single_field(
997    data: &[u8],
998    delim: u8,
999    line_delim: u8,
1000    skip_field: usize,
1001    suppress: bool,
1002    out: &mut impl Write,
1003) -> io::Result<()> {
1004    let skip_idx = skip_field - 1;
1005
1006    if data.len() >= PARALLEL_THRESHOLD {
1007        let chunks = split_for_scope(data, line_delim);
1008        let n = chunks.len();
1009        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1010        rayon::scope(|s| {
1011            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1012                s.spawn(move |_| {
1013                    result.reserve(chunk.len());
1014                    complement_single_field_chunk(
1015                        chunk, delim, skip_idx, line_delim, suppress, result,
1016                    );
1017                });
1018            }
1019        });
1020        let slices: Vec<IoSlice> = results
1021            .iter()
1022            .filter(|r| !r.is_empty())
1023            .map(|r| IoSlice::new(r))
1024            .collect();
1025        write_ioslices(out, &slices)?;
1026    } else {
1027        let mut buf = Vec::with_capacity(data.len());
1028        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1029        if !buf.is_empty() {
1030            out.write_all(&buf)?;
1031        }
1032    }
1033    Ok(())
1034}
1035
1036/// Process a chunk for complement single-field extraction using two-level scanning.
1037/// Outer memchr(newline) for line boundaries, inner memchr_iter(delim) with early exit
1038/// after finding the skip field's bounding delimiters. Faster than memchr2 single-pass
1039/// because memchr is faster per byte and inner scan exits early.
1040fn complement_single_field_chunk(
1041    data: &[u8],
1042    delim: u8,
1043    skip_idx: usize,
1044    line_delim: u8,
1045    suppress: bool,
1046    buf: &mut Vec<u8>,
1047) {
1048    buf.reserve(data.len());
1049    let mut start = 0;
1050    for end_pos in memchr_iter(line_delim, data) {
1051        let line = &data[start..end_pos];
1052        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1053        start = end_pos + 1;
1054    }
1055    if start < data.len() {
1056        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
1057    }
1058}
1059
/// Complement single-field extraction for one line: emit every field except
/// the one at `skip_idx` (0-based), rejoined with the original delimiter.
/// Per-line worker for `complement_single_field_chunk`.
/// NOTE(review): the previous header claimed this was a fallback "for
/// delim == line_delim", but it is called for the general case — confirm.
#[inline(always)]
fn complement_single_field_line(
    line: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: printed as a bare terminator unless -s drops it.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let base = line.as_ptr();
    // Dropping field skip_idx requires the delimiter before it (number
    // `need_before`) and the delimiter after it (number `need_total`).
    let need_before = skip_idx;
    let need_total = skip_idx + 1;

    let mut delim_count: usize = 0;
    let mut skip_start_pos: usize = 0; // byte index where the skipped field starts
    let mut skip_end_pos: usize = len; // byte index of the delimiter ending it
    let mut found_end = false;

    for pos in memchr_iter(delim, line) {
        delim_count += 1;
        if delim_count == need_before {
            skip_start_pos = pos + 1;
        }
        if delim_count == need_total {
            skip_end_pos = pos;
            found_end = true;
            break; // early exit: delimiters past the skip field are irrelevant
        }
    }

    if delim_count == 0 {
        // No delimiter at all: whole line is field 1; honor -s.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    if delim_count < need_before {
        // Fewer fields than skip_idx: nothing to drop, pass line through.
        unsafe {
            buf_extend(buf, line);
            buf_push(buf, line_delim);
        }
        return;
    }

    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
    let has_suffix = found_end && skip_end_pos < len;

    if has_prefix && has_suffix {
        // Field dropped from the middle: prefix + delim + suffix.
        // SAFETY: skip_start_pos/skip_end_pos come from delimiter offsets
        // within `line`, so both raw slices are in bounds.
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
            buf_push(buf, delim);
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
            );
            buf_push(buf, line_delim);
        }
    } else if has_prefix {
        // Skipped field is last: keep everything before its leading delimiter.
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
            buf_push(buf, line_delim);
        }
    } else if has_suffix {
        // Skipped field is first: keep everything after its trailing delimiter.
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // Line consists solely of the skipped field — emit an empty line.
        unsafe { buf_push(buf, line_delim) };
    }
}
1147
1148/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1149/// Zero-copy for the non-parallel path: identifies the truncation point per line
1150/// and writes contiguous runs directly from the source data.
1151fn process_fields_prefix(
1152    data: &[u8],
1153    delim: u8,
1154    line_delim: u8,
1155    last_field: usize,
1156    suppress: bool,
1157    out: &mut impl Write,
1158) -> io::Result<()> {
1159    if data.len() >= PARALLEL_THRESHOLD {
1160        let chunks = split_for_scope(data, line_delim);
1161        let n = chunks.len();
1162        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1163        rayon::scope(|s| {
1164            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1165                s.spawn(move |_| {
1166                    result.reserve(chunk.len());
1167                    fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
1168                });
1169            }
1170        });
1171        let slices: Vec<IoSlice> = results
1172            .iter()
1173            .filter(|r| !r.is_empty())
1174            .map(|r| IoSlice::new(r))
1175            .collect();
1176        write_ioslices(out, &slices)?;
1177    } else if !suppress {
1178        // Zero-copy fast path: scan for truncation points, write runs from source.
1179        // When suppress is false, every line is output (with or without delimiter).
1180        // Most lines have enough fields, so the output is often identical to input.
1181        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1182    } else {
1183        let mut buf = Vec::with_capacity(data.len());
1184        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1185        if !buf.is_empty() {
1186            out.write_all(&buf)?;
1187        }
1188    }
1189    Ok(())
1190}
1191
1192/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1193/// directly into the source data, flushing in MAX_IOV-sized batches.
1194/// For lines where the Nth delimiter exists, we truncate at that point.
1195/// For lines with fewer fields, we output them unchanged (contiguous run).
1196/// Lines without any delimiter are output unchanged (suppress=false assumed).
1197#[inline]
1198fn fields_prefix_zerocopy(
1199    data: &[u8],
1200    delim: u8,
1201    line_delim: u8,
1202    last_field: usize,
1203    out: &mut impl Write,
1204) -> io::Result<()> {
1205    let newline_buf: [u8; 1] = [line_delim];
1206    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1207    let mut start = 0;
1208    let mut run_start: usize = 0;
1209
1210    for end_pos in memchr_iter(line_delim, data) {
1211        let line = &data[start..end_pos];
1212        let mut field_count = 1;
1213        let mut truncate_at: Option<usize> = None;
1214        for dpos in memchr_iter(delim, line) {
1215            if field_count >= last_field {
1216                truncate_at = Some(start + dpos);
1217                break;
1218            }
1219            field_count += 1;
1220        }
1221
1222        if let Some(trunc_pos) = truncate_at {
1223            if run_start < start {
1224                iov.push(IoSlice::new(&data[run_start..start]));
1225            }
1226            iov.push(IoSlice::new(&data[start..trunc_pos]));
1227            iov.push(IoSlice::new(&newline_buf));
1228            run_start = end_pos + 1;
1229
1230            if iov.len() >= MAX_IOV - 2 {
1231                write_ioslices(out, &iov)?;
1232                iov.clear();
1233            }
1234        }
1235        start = end_pos + 1;
1236    }
1237    // Handle last line without terminator
1238    if start < data.len() {
1239        let line = &data[start..];
1240        let mut field_count = 1;
1241        let mut truncate_at: Option<usize> = None;
1242        for dpos in memchr_iter(delim, line) {
1243            if field_count >= last_field {
1244                truncate_at = Some(start + dpos);
1245                break;
1246            }
1247            field_count += 1;
1248        }
1249        if let Some(trunc_pos) = truncate_at {
1250            if run_start < start {
1251                iov.push(IoSlice::new(&data[run_start..start]));
1252            }
1253            iov.push(IoSlice::new(&data[start..trunc_pos]));
1254            iov.push(IoSlice::new(&newline_buf));
1255            if !iov.is_empty() {
1256                write_ioslices(out, &iov)?;
1257            }
1258            return Ok(());
1259        }
1260    }
1261    // Flush remaining contiguous run
1262    if run_start < data.len() {
1263        iov.push(IoSlice::new(&data[run_start..]));
1264        if !data.is_empty() && *data.last().unwrap() != line_delim {
1265            iov.push(IoSlice::new(&newline_buf));
1266        }
1267    }
1268    if !iov.is_empty() {
1269        write_ioslices(out, &iov)?;
1270    }
1271    Ok(())
1272}
1273
1274/// Process a chunk for contiguous from-start field range extraction.
1275fn fields_prefix_chunk(
1276    data: &[u8],
1277    delim: u8,
1278    line_delim: u8,
1279    last_field: usize,
1280    suppress: bool,
1281    buf: &mut Vec<u8>,
1282) {
1283    buf.reserve(data.len());
1284    let mut start = 0;
1285    for end_pos in memchr_iter(line_delim, data) {
1286        let line = &data[start..end_pos];
1287        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1288        start = end_pos + 1;
1289    }
1290    if start < data.len() {
1291        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1292    }
1293}
1294
1295/// Extract first N fields from one line (contiguous from-start range).
1296/// Uses memchr SIMD for delimiter scanning on all line sizes.
1297#[inline(always)]
1298fn fields_prefix_line(
1299    line: &[u8],
1300    delim: u8,
1301    line_delim: u8,
1302    last_field: usize,
1303    suppress: bool,
1304    buf: &mut Vec<u8>,
1305) {
1306    let len = line.len();
1307    if len == 0 {
1308        if !suppress {
1309            unsafe { buf_push(buf, line_delim) };
1310        }
1311        return;
1312    }
1313
1314    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1315    let base = line.as_ptr();
1316
1317    let mut field_count = 1usize;
1318    let mut has_delim = false;
1319
1320    for pos in memchr_iter(delim, line) {
1321        has_delim = true;
1322        if field_count >= last_field {
1323            unsafe {
1324                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1325                buf_push(buf, line_delim);
1326            }
1327            return;
1328        }
1329        field_count += 1;
1330    }
1331
1332    if !has_delim {
1333        if !suppress {
1334            unsafe {
1335                buf_extend(buf, line);
1336                buf_push(buf, line_delim);
1337            }
1338        }
1339        return;
1340    }
1341
1342    unsafe {
1343        buf_extend(buf, line);
1344        buf_push(buf, line_delim);
1345    }
1346}
1347
1348/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1349fn process_fields_suffix(
1350    data: &[u8],
1351    delim: u8,
1352    line_delim: u8,
1353    start_field: usize,
1354    suppress: bool,
1355    out: &mut impl Write,
1356) -> io::Result<()> {
1357    if data.len() >= PARALLEL_THRESHOLD {
1358        let chunks = split_for_scope(data, line_delim);
1359        let n = chunks.len();
1360        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1361        rayon::scope(|s| {
1362            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1363                s.spawn(move |_| {
1364                    result.reserve(chunk.len());
1365                    fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
1366                });
1367            }
1368        });
1369        let slices: Vec<IoSlice> = results
1370            .iter()
1371            .filter(|r| !r.is_empty())
1372            .map(|r| IoSlice::new(r))
1373            .collect();
1374        write_ioslices(out, &slices)?;
1375    } else {
1376        let mut buf = Vec::with_capacity(data.len());
1377        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1378        if !buf.is_empty() {
1379            out.write_all(&buf)?;
1380        }
1381    }
1382    Ok(())
1383}
1384
1385/// Process a chunk for open-ended field suffix extraction.
1386fn fields_suffix_chunk(
1387    data: &[u8],
1388    delim: u8,
1389    line_delim: u8,
1390    start_field: usize,
1391    suppress: bool,
1392    buf: &mut Vec<u8>,
1393) {
1394    buf.reserve(data.len());
1395    let mut start = 0;
1396    for end_pos in memchr_iter(line_delim, data) {
1397        let line = &data[start..end_pos];
1398        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1399        start = end_pos + 1;
1400    }
1401    if start < data.len() {
1402        fields_suffix_line(
1403            &data[start..],
1404            delim,
1405            line_delim,
1406            start_field,
1407            suppress,
1408            buf,
1409        );
1410    }
1411}
1412
1413/// Extract fields from start_field to end from one line.
1414/// Uses memchr SIMD for delimiter scanning on all line sizes.
1415#[inline(always)]
1416fn fields_suffix_line(
1417    line: &[u8],
1418    delim: u8,
1419    line_delim: u8,
1420    start_field: usize,
1421    suppress: bool,
1422    buf: &mut Vec<u8>,
1423) {
1424    let len = line.len();
1425    if len == 0 {
1426        if !suppress {
1427            unsafe { buf_push(buf, line_delim) };
1428        }
1429        return;
1430    }
1431
1432    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
1433    let base = line.as_ptr();
1434
1435    let skip_delims = start_field - 1;
1436    let mut delim_count = 0usize;
1437    let mut has_delim = false;
1438
1439    for pos in memchr_iter(delim, line) {
1440        has_delim = true;
1441        delim_count += 1;
1442        if delim_count >= skip_delims {
1443            unsafe {
1444                buf_extend(
1445                    buf,
1446                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1447                );
1448                buf_push(buf, line_delim);
1449            }
1450            return;
1451        }
1452    }
1453
1454    if !has_delim {
1455        if !suppress {
1456            unsafe {
1457                buf_extend(buf, line);
1458                buf_push(buf, line_delim);
1459            }
1460        }
1461        return;
1462    }
1463
1464    // Fewer delimiters than needed
1465    unsafe { buf_push(buf, line_delim) };
1466}
1467
1468/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1469/// Optimized: skip to start_field using memchr, then output until end_field.
1470fn process_fields_mid_range(
1471    data: &[u8],
1472    delim: u8,
1473    line_delim: u8,
1474    start_field: usize,
1475    end_field: usize,
1476    suppress: bool,
1477    out: &mut impl Write,
1478) -> io::Result<()> {
1479    if data.len() >= PARALLEL_THRESHOLD {
1480        let chunks = split_for_scope(data, line_delim);
1481        let n = chunks.len();
1482        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1483        rayon::scope(|s| {
1484            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1485                s.spawn(move |_| {
1486                    result.reserve(chunk.len());
1487                    fields_mid_range_chunk(
1488                        chunk,
1489                        delim,
1490                        line_delim,
1491                        start_field,
1492                        end_field,
1493                        suppress,
1494                        result,
1495                    );
1496                });
1497            }
1498        });
1499        let slices: Vec<IoSlice> = results
1500            .iter()
1501            .filter(|r| !r.is_empty())
1502            .map(|r| IoSlice::new(r))
1503            .collect();
1504        write_ioslices(out, &slices)?;
1505    } else {
1506        let mut buf = Vec::with_capacity(data.len());
1507        fields_mid_range_chunk(
1508            data,
1509            delim,
1510            line_delim,
1511            start_field,
1512            end_field,
1513            suppress,
1514            &mut buf,
1515        );
1516        if !buf.is_empty() {
1517            out.write_all(&buf)?;
1518        }
1519    }
1520    Ok(())
1521}
1522
1523/// Process a chunk for contiguous mid-range field extraction.
1524/// Two-level scan: outer memchr(newline) for line boundaries, inner memchr_iter(delim)
1525/// with early exit at target_end_delim. Faster than memchr2 single-pass because
1526/// memchr is faster per byte and inner scan exits early.
1527fn fields_mid_range_chunk(
1528    data: &[u8],
1529    delim: u8,
1530    line_delim: u8,
1531    start_field: usize,
1532    end_field: usize,
1533    suppress: bool,
1534    buf: &mut Vec<u8>,
1535) {
1536    buf.reserve(data.len());
1537    let mut start = 0;
1538    for end_pos in memchr_iter(line_delim, data) {
1539        let line = &data[start..end_pos];
1540        fields_mid_range_line(
1541            line,
1542            delim,
1543            line_delim,
1544            start_field,
1545            end_field,
1546            suppress,
1547            buf,
1548        );
1549        start = end_pos + 1;
1550    }
1551    if start < data.len() {
1552        fields_mid_range_line(
1553            &data[start..],
1554            delim,
1555            line_delim,
1556            start_field,
1557            end_field,
1558            suppress,
1559            buf,
1560        );
1561    }
1562}
1563
/// Extract fields start_field..=end_field from one line.
/// Delimiter scanning uses memchr_iter (SIMD) with early exit once the end of
/// the range is located; output copies use raw pointers to skip bounds checks.
#[inline(always)]
fn fields_mid_range_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: printed as a bare terminator unless -s drops it.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves data.len()
    let base = line.as_ptr();

    // Count delimiters to find start_field and end_field boundaries
    let skip_before = start_field - 1; // delimiters to skip before start_field
    let field_span = end_field - start_field; // additional delimiters within the range
    let target_end_delim = skip_before + field_span + 1;
    let mut delim_count = 0;
    let mut range_start = 0;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        delim_count += 1;
        if delim_count == skip_before {
            range_start = pos + 1;
        }
        if delim_count == target_end_delim {
            // NOTE: when skip_before == 0, delim_count (>= 1) can never equal
            // it above, so range_start is still its initial 0 — this reset is
            // redundant but kept defensively.
            if skip_before == 0 {
                range_start = 0;
            }
            // SAFETY: range_start and pos are delimiter offsets within `line`,
            // with range_start <= pos, so the raw slice is in bounds.
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
                );
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if !has_delim {
        // No delimiter: whole line is field 1; honor -s.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Line has delimiters but fewer fields than end_field
    if delim_count >= skip_before {
        // We have at least start_field, output from range_start to end
        if skip_before == 0 {
            range_start = 0;
        }
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(range_start), len - range_start),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // Not enough fields even for start_field — output empty line
        unsafe { buf_push(buf, line_delim) };
    }
}
1645
1646/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
1647/// directly into the source data, flushing in MAX_IOV-sized batches.
1648/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
1649///
1650/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
1651/// Parallel field-1 extraction for large data using memchr2 single-pass.
1652/// Splits data into per-thread chunks, each chunk extracts field 1 using
1653/// memchr2(delim, newline) which finds the first special byte in one scan.
1654/// For field 1: first special byte is either the delimiter (field end) or
1655/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
1656fn single_field1_parallel(
1657    data: &[u8],
1658    delim: u8,
1659    line_delim: u8,
1660    out: &mut impl Write,
1661) -> io::Result<()> {
1662    let chunks = split_for_scope(data, line_delim);
1663    let n = chunks.len();
1664    let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1665    rayon::scope(|s| {
1666        for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1667            s.spawn(move |_| {
1668                result.reserve(chunk.len() + 1);
1669                single_field1_to_buf(chunk, delim, line_delim, result);
1670            });
1671        }
1672    });
1673    let slices: Vec<IoSlice> = results
1674        .iter()
1675        .filter(|r| !r.is_empty())
1676        .map(|r| IoSlice::new(r))
1677        .collect();
1678    write_ioslices(out, &slices)
1679}
1680
/// Extract field 1 from a chunk using two-level scanning: outer memchr(newline)
/// for line boundaries, inner memchr(delim) for the first delimiter per line.
///
/// This is faster than memchr2_iter single-pass because:
/// 1. memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles)
/// 2. For field 1, the inner memchr exits after the FIRST delimiter, skipping
///    all subsequent delimiters on the line (huge win for multi-column CSV)
/// 3. Lines without delimiter produce contiguous runs that are bulk-copied
///
/// Uses a single output pointer to avoid per-line buf.len() load/store.
#[inline]
fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
    debug_assert_ne!(delim, line_delim, "delim and line_delim must differ");
    // Reserve data.len() + 1: output <= input for all lines except potentially
    // the last line without trailing newline, where we add a newline (GNU compat).
    buf.reserve(data.len() + 1);

    let base = data.as_ptr();
    let initial_len = buf.len();
    // SAFETY: the reserve above guarantees capacity >= initial_len + data.len() + 1,
    // and every copy below keeps out_ptr within that reservation.
    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut start = 0;
    // Track the start of contiguous runs of no-delimiter lines for bulk copy.
    let mut run_start: usize = 0;
    let mut in_run = true; // we start in a run
    // NOTE(review): in_run is set to true on every path and never cleared, so
    // the `!in_run` branch below appears unreachable — candidate for cleanup.

    for end_pos in memchr_iter(line_delim, data) {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        match memchr::memchr(delim, line) {
            Some(dp) => {
                // Line has delimiter — flush contiguous run, output field1 + newline
                if in_run && run_start < start {
                    // Bulk copy the contiguous run of unchanged lines
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                // Output field (bytes before first delimiter) + newline
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
                    out_ptr = out_ptr.add(dp);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            }
            None => {
                // No delimiter — this line stays in the contiguous run
                if !in_run {
                    run_start = start;
                    in_run = true;
                }
            }
        }
        start = end_pos + 1;
    }

    // Flush any remaining contiguous run
    if in_run && run_start < start {
        let run_len = start - run_start;
        unsafe {
            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
            out_ptr = out_ptr.add(run_len);
        }
    }

    // Handle last line without trailing newline
    if start < data.len() {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        match memchr::memchr(delim, line) {
            Some(dp) => {
                // Field + trailing newline
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
                    out_ptr = out_ptr.add(dp);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
            None => {
                // No delimiter — output remaining data + newline (GNU compat)
                let len = data.len() - start;
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, len);
                    out_ptr = out_ptr.add(len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
        }
    }

    // SAFETY: out_ptr advanced only within the reserved region, so new_len <=
    // capacity and every byte up to it has been written.
    unsafe {
        let new_len = out_ptr as usize - buf.as_ptr() as usize;
        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
        buf.set_len(new_len);
    }
}
1781
/// Zero-copy extraction of field 1 (`cut -f1`) using writev: builds IoSlice
/// entries pointing directly into the source data, so nothing is copied into
/// an intermediate buffer.
///
/// Two-level scan: outer memchr over line terminators, plus an inner memchr
/// per line that stops at the FIRST delimiter. This is faster than a single
/// memchr2 for SMALL data because the inner scan exits after the first
/// delimiter, skipping all subsequent delimiters on the line.
///
/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
#[inline]
#[allow(dead_code)]
fn single_field1_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    // Single 1-byte slice reused for every emitted line terminator.
    let newline_buf: [u8; 1] = [line_delim];

    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    // First byte of the current pass-through run (bytes emitted verbatim).
    let mut run_start: usize = 0;
    // First byte of the current line.
    let mut start = 0;

    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        if let Some(dp) = memchr::memchr(delim, line) {
            // Line has delimiter — truncate at first delimiter.
            // Flush current contiguous run, then add truncated field + newline.
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..start + dp]));
            iov.push(IoSlice::new(&newline_buf));
            run_start = end_pos + 1;

            // Up to 3 entries are pushed per line; flush with headroom so the
            // batch never exceeds MAX_IOV entries.
            if iov.len() >= MAX_IOV - 2 {
                write_ioslices(out, &iov)?;
                iov.clear();
            }
        }
        // else: no delimiter in line, output unchanged (stays in contiguous run)
        start = end_pos + 1;
    }

    // Handle last line (no trailing newline)
    if start < data.len() {
        let line = &data[start..];
        if let Some(dp) = memchr::memchr(delim, line) {
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..start + dp]));
            iov.push(IoSlice::new(&newline_buf));
            if !iov.is_empty() {
                write_ioslices(out, &iov)?;
            }
            return Ok(());
        }
        // No delimiter in the final line: it is covered by the run flush below.
    }

    // Flush remaining contiguous run
    if run_start < data.len() {
        iov.push(IoSlice::new(&data[run_start..]));
        // Terminate the final line when the input did not (GNU compat).
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            iov.push(IoSlice::new(&newline_buf));
        }
    }
    if !iov.is_empty() {
        write_ioslices(out, &iov)?;
    }
    Ok(())
}
1853
/// Process a chunk of data for single-field extraction using write-pointer pattern.
/// Two-level scan: outer memchr(newline), inner memchr_iter(delim) with early exit.
/// Uses contiguous run tracking for lines that pass through unchanged.
///
/// * `target_idx` — 0-based index of the field to extract (field 1 is index 0).
/// * `suppress` — when true (`-s`), lines containing no delimiter are dropped
///   instead of passed through unchanged.
/// * `buf` — output accumulator; the selected field (or pass-through line),
///   each followed by `line_delim`, is appended.
///
/// Safety: every raw-pointer write below is covered by the single up-front
/// `reserve(data.len() + 1)` — per-line output never exceeds that line's
/// input size, and the `+ 1` covers the terminator appended to a final line
/// that has none.
fn process_single_field_chunk(
    data: &[u8],
    delim: u8,
    target_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
    buf.reserve(data.len() + 1);

    let base = data.as_ptr();
    let initial_len = buf.len();
    // Raw write pointer into buf's spare capacity; committed by set_len at the end.
    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut start = 0;
    // Track contiguous runs of lines that output unchanged
    let mut run_start: usize = 0;
    let mut in_run = !suppress; // if suppress, no line passes through without delimiter

    for end_pos in memchr_iter(line_delim, data) {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        let line_len = end_pos - start;

        if line_len == 0 {
            if !suppress {
                // Empty line passes through in the run
                if !in_run {
                    run_start = start;
                    in_run = true;
                }
            }
            start = end_pos + 1;
            continue;
        }

        // Count delimiters up to target_idx to find the target field
        let mut field_start_offset = 0;
        let mut field_idx = 0;
        let mut found = false;
        let mut has_delim = false;

        for pos in memchr_iter(delim, line) {
            has_delim = true;
            if field_idx == target_idx {
                // Found the target field: line[field_start_offset..pos]
                // Flush run, output field + newline
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                let field_len = pos - field_start_offset;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                // The run restarts after this line (this line was rewritten).
                run_start = end_pos + 1;
                in_run = true;
                found = true;
                break;
            }
            field_idx += 1;
            field_start_offset = pos + 1;
        }

        if !found {
            if !has_delim {
                // No delimiter in line
                if !suppress {
                    // Line passes through unchanged — stays in run
                    if !in_run {
                        run_start = start;
                        in_run = true;
                    }
                } else {
                    // Suppress: flush run, skip this line
                    if in_run && run_start < start {
                        let run_len = start - run_start;
                        unsafe {
                            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                            out_ptr = out_ptr.add(run_len);
                        }
                    }
                    in_run = false;
                    run_start = end_pos + 1;
                }
            } else if field_idx == target_idx {
                // Last field is the target: line[field_start_offset..]
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                let field_len = line_len - field_start_offset;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            } else {
                // Not enough fields for target — output empty line
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                unsafe {
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            }
        }

        start = end_pos + 1;
    }

    // Flush remaining contiguous run
    if in_run && run_start < start {
        let run_len = start - run_start;
        unsafe {
            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
            out_ptr = out_ptr.add(run_len);
        }
    }

    // Handle last line without trailing newline
    if start < data.len() {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        let line_len = data.len() - start;

        // Note: line_len > 0 here (start < data.len()); the zero branch is
        // kept for symmetry with the main loop.
        if line_len == 0 {
            if !suppress {
                unsafe {
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
        } else {
            // Same field scan as the main loop, but bounded by the end of the
            // data rather than a line terminator.
            let mut field_start_offset = 0;
            let mut field_idx = 0;
            let mut found = false;
            let mut has_delim = false;

            for pos in memchr_iter(delim, line) {
                has_delim = true;
                if field_idx == target_idx {
                    let field_len = pos - field_start_offset;
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            base.add(start + field_start_offset),
                            out_ptr,
                            field_len,
                        );
                        out_ptr = out_ptr.add(field_len);
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                    found = true;
                    break;
                }
                field_idx += 1;
                field_start_offset = pos + 1;
            }

            if !found {
                if !has_delim {
                    if !suppress {
                        unsafe {
                            std::ptr::copy_nonoverlapping(base.add(start), out_ptr, line_len);
                            out_ptr = out_ptr.add(line_len);
                            *out_ptr = line_delim;
                            out_ptr = out_ptr.add(1);
                        }
                    }
                } else if field_idx == target_idx {
                    let field_len = line_len - field_start_offset;
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            base.add(start + field_start_offset),
                            out_ptr,
                            field_len,
                        );
                        out_ptr = out_ptr.add(field_len);
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                } else {
                    unsafe {
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                }
            }
        }
    }

    // SAFETY: out_ptr only advanced over bytes initialized above, all within
    // the capacity reserved at the top; set_len commits exactly those bytes.
    unsafe {
        let new_len = out_ptr as usize - buf.as_ptr() as usize;
        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
        buf.set_len(new_len);
    }
}
2080
2081/// Extract fields from a single line into the output buffer.
2082/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2083/// Raw pointer arithmetic eliminates per-field bounds checking.
2084#[inline(always)]
2085fn extract_fields_to_buf(
2086    line: &[u8],
2087    delim: u8,
2088    ranges: &[Range],
2089    output_delim: &[u8],
2090    suppress: bool,
2091    max_field: usize,
2092    field_mask: u64,
2093    line_delim: u8,
2094    buf: &mut Vec<u8>,
2095    complement: bool,
2096) {
2097    let len = line.len();
2098
2099    if len == 0 {
2100        if !suppress {
2101            buf.push(line_delim);
2102        }
2103        return;
2104    }
2105
2106    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2107    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2108    let needed = len + output_delim.len() * 16 + 1;
2109    if buf.capacity() - buf.len() < needed {
2110        buf.reserve(needed);
2111    }
2112
2113    let base = line.as_ptr();
2114    let mut field_num: usize = 1;
2115    let mut field_start: usize = 0;
2116    let mut first_output = true;
2117    let mut has_delim = false;
2118
2119    // Use memchr SIMD for all line sizes
2120    for delim_pos in memchr_iter(delim, line) {
2121        has_delim = true;
2122
2123        if is_selected(field_num, field_mask, ranges, complement) {
2124            if !first_output {
2125                unsafe { buf_extend(buf, output_delim) };
2126            }
2127            unsafe {
2128                buf_extend(
2129                    buf,
2130                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2131                )
2132            };
2133            first_output = false;
2134        }
2135
2136        field_num += 1;
2137        field_start = delim_pos + 1;
2138
2139        if field_num > max_field {
2140            break;
2141        }
2142    }
2143
2144    // Last field
2145    if (field_num <= max_field || complement)
2146        && has_delim
2147        && is_selected(field_num, field_mask, ranges, complement)
2148    {
2149        if !first_output {
2150            unsafe { buf_extend(buf, output_delim) };
2151        }
2152        unsafe {
2153            buf_extend(
2154                buf,
2155                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2156            )
2157        };
2158        first_output = false;
2159    }
2160
2161    if !first_output {
2162        unsafe { buf_push(buf, line_delim) };
2163    } else if !has_delim {
2164        if !suppress {
2165            unsafe {
2166                buf_extend(buf, line);
2167                buf_push(buf, line_delim);
2168            }
2169        }
2170    } else {
2171        unsafe { buf_push(buf, line_delim) };
2172    }
2173}
2174
2175// ── Fast path: byte/char extraction with batched output ──────────────────
2176
2177/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2178/// Zero-copy: writes directly from the source data using output runs.
2179/// For lines shorter than max_bytes, the output is identical to the input,
2180/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2181fn process_bytes_from_start(
2182    data: &[u8],
2183    max_bytes: usize,
2184    line_delim: u8,
2185    out: &mut impl Write,
2186) -> io::Result<()> {
2187    // For small data (< PARALLEL_THRESHOLD): check if all lines fit for zero-copy passthrough.
2188    // The sequential scan + write_all is competitive with per-line processing for small data.
2189    //
2190    // For large data (>= PARALLEL_THRESHOLD): skip the all_fit scan entirely.
2191    // The scan is sequential (~1.7ms for 10MB at memchr speed) while parallel per-line
2192    // processing is much faster (~0.5ms for 10MB with 4 threads). Even when all lines fit,
2193    // the parallel copy + write is faster than sequential scan + zero-copy write.
2194    if data.len() < PARALLEL_THRESHOLD && max_bytes > 0 && max_bytes < usize::MAX {
2195        let mut start = 0;
2196        let mut all_fit = true;
2197        for pos in memchr_iter(line_delim, data) {
2198            if pos - start > max_bytes {
2199                all_fit = false;
2200                break;
2201            }
2202            start = pos + 1;
2203        }
2204        // Check last line (no trailing delimiter)
2205        if all_fit && start < data.len() && data.len() - start > max_bytes {
2206            all_fit = false;
2207        }
2208        if all_fit {
2209            // All lines fit: output = input. Handle missing trailing delimiter.
2210            if !data.is_empty() && data[data.len() - 1] == line_delim {
2211                return out.write_all(data);
2212            } else if !data.is_empty() {
2213                out.write_all(data)?;
2214                return out.write_all(&[line_delim]);
2215            }
2216            return Ok(());
2217        }
2218    }
2219
2220    if data.len() >= PARALLEL_THRESHOLD {
2221        let chunks = split_for_scope(data, line_delim);
2222        let n = chunks.len();
2223        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2224        rayon::scope(|s| {
2225            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2226                s.spawn(move |_| {
2227                    // Output can be up to input size (when all lines fit).
2228                    // Reserve full chunk size to avoid reallocation.
2229                    result.reserve(chunk.len());
2230                    bytes_from_start_chunk(chunk, max_bytes, line_delim, result);
2231                });
2232            }
2233        });
2234        // Use write_vectored (writev) to batch N writes into fewer syscalls
2235        let slices: Vec<IoSlice> = results
2236            .iter()
2237            .filter(|r| !r.is_empty())
2238            .map(|r| IoSlice::new(r))
2239            .collect();
2240        write_ioslices(out, &slices)?;
2241    } else {
2242        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2243        // because every line gets truncated, creating 3 IoSlice entries per line.
2244        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2245        // managing millions of IoSlice entries through the kernel.
2246        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2247        if max_bytes <= 512 {
2248            // Estimate output size without scanning: output <= data.len(),
2249            // typically ~data.len()/4 for short max_bytes on longer lines.
2250            let est_out = (data.len() / 4).max(max_bytes + 2);
2251            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2252            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2253            if !buf.is_empty() {
2254                out.write_all(&buf)?;
2255            }
2256        } else {
2257            // Zero-copy path: track contiguous output runs and write directly from source.
2258            // For lines <= max_bytes, we include them as-is (no copy needed).
2259            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2260            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2261        }
2262    }
2263    Ok(())
2264}
2265
2266/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2267/// directly into the source data, flushing in MAX_IOV-sized batches.
2268/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2269/// produce two IoSlices (truncated data + newline).
2270#[inline]
2271fn bytes_from_start_zerocopy(
2272    data: &[u8],
2273    max_bytes: usize,
2274    line_delim: u8,
2275    out: &mut impl Write,
2276) -> io::Result<()> {
2277    let newline_buf: [u8; 1] = [line_delim];
2278    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2279    let mut start = 0;
2280    let mut run_start: usize = 0;
2281
2282    for pos in memchr_iter(line_delim, data) {
2283        let line_len = pos - start;
2284        if line_len > max_bytes {
2285            // This line needs truncation
2286            if run_start < start {
2287                iov.push(IoSlice::new(&data[run_start..start]));
2288            }
2289            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2290            iov.push(IoSlice::new(&newline_buf));
2291            run_start = pos + 1;
2292
2293            if iov.len() >= MAX_IOV - 2 {
2294                write_ioslices(out, &iov)?;
2295                iov.clear();
2296            }
2297        }
2298        start = pos + 1;
2299    }
2300    // Handle last line without terminator
2301    if start < data.len() {
2302        let line_len = data.len() - start;
2303        if line_len > max_bytes {
2304            if run_start < start {
2305                iov.push(IoSlice::new(&data[run_start..start]));
2306            }
2307            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2308            iov.push(IoSlice::new(&newline_buf));
2309            if !iov.is_empty() {
2310                write_ioslices(out, &iov)?;
2311            }
2312            return Ok(());
2313        }
2314    }
2315    // Flush remaining contiguous run
2316    if run_start < data.len() {
2317        iov.push(IoSlice::new(&data[run_start..]));
2318        if !data.is_empty() && *data.last().unwrap() != line_delim {
2319            iov.push(IoSlice::new(&newline_buf));
2320        }
2321    }
2322    if !iov.is_empty() {
2323        write_ioslices(out, &iov)?;
2324    }
2325    Ok(())
2326}
2327
2328/// Process a chunk for from-start byte range extraction (parallel path).
2329/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2330/// Pre-reserves data.len() (output never exceeds input), then uses a single
2331/// write pointer with deferred set_len — no per-line capacity checks.
2332#[inline]
2333fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2334    // Output is always <= input size (we only truncate, never expand).
2335    // Single reserve eliminates ALL per-line capacity checks.
2336    buf.reserve(data.len());
2337
2338    let src = data.as_ptr();
2339    let dst_base = buf.as_mut_ptr();
2340    let mut wp = buf.len();
2341    let mut start = 0;
2342
2343    for pos in memchr_iter(line_delim, data) {
2344        let line_len = pos - start;
2345        let take = line_len.min(max_bytes);
2346        unsafe {
2347            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2348            *dst_base.add(wp + take) = line_delim;
2349        }
2350        wp += take + 1;
2351        start = pos + 1;
2352    }
2353    // Handle last line without terminator
2354    if start < data.len() {
2355        let line_len = data.len() - start;
2356        let take = line_len.min(max_bytes);
2357        unsafe {
2358            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2359            *dst_base.add(wp + take) = line_delim;
2360        }
2361        wp += take + 1;
2362    }
2363    unsafe { buf.set_len(wp) };
2364}
2365
2366/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2367fn process_bytes_from_offset(
2368    data: &[u8],
2369    skip_bytes: usize,
2370    line_delim: u8,
2371    out: &mut impl Write,
2372) -> io::Result<()> {
2373    if data.len() >= PARALLEL_THRESHOLD {
2374        let chunks = split_for_scope(data, line_delim);
2375        let n = chunks.len();
2376        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2377        rayon::scope(|s| {
2378            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2379                s.spawn(move |_| {
2380                    result.reserve(chunk.len());
2381                    bytes_from_offset_chunk(chunk, skip_bytes, line_delim, result);
2382                });
2383            }
2384        });
2385        // Use write_vectored (writev) to batch N writes into fewer syscalls
2386        let slices: Vec<IoSlice> = results
2387            .iter()
2388            .filter(|r| !r.is_empty())
2389            .map(|r| IoSlice::new(r))
2390            .collect();
2391        write_ioslices(out, &slices)?;
2392    } else {
2393        // Zero-copy: write suffix of each line directly from source
2394        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2395    }
2396    Ok(())
2397}
2398
2399/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2400/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2401/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2402#[inline]
2403fn bytes_from_offset_zerocopy(
2404    data: &[u8],
2405    skip_bytes: usize,
2406    line_delim: u8,
2407    out: &mut impl Write,
2408) -> io::Result<()> {
2409    let delim_buf = [line_delim];
2410    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2411
2412    let mut start = 0;
2413    for pos in memchr_iter(line_delim, data) {
2414        let line_len = pos - start;
2415        if line_len > skip_bytes {
2416            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2417        }
2418        iov.push(IoSlice::new(&delim_buf));
2419        // Flush when approaching MAX_IOV to avoid oversized writev
2420        if iov.len() >= MAX_IOV - 1 {
2421            write_ioslices(out, &iov)?;
2422            iov.clear();
2423        }
2424        start = pos + 1;
2425    }
2426    if start < data.len() {
2427        let line_len = data.len() - start;
2428        if line_len > skip_bytes {
2429            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2430        }
2431        iov.push(IoSlice::new(&delim_buf));
2432    }
2433    if !iov.is_empty() {
2434        write_ioslices(out, &iov)?;
2435    }
2436    Ok(())
2437}
2438
2439/// Process a chunk for from-offset byte range extraction.
2440/// Single reserve + deferred set_len for zero per-line overhead.
2441#[inline]
2442fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2443    buf.reserve(data.len());
2444
2445    let src = data.as_ptr();
2446    let dst_base = buf.as_mut_ptr();
2447    let mut wp = buf.len();
2448    let mut start = 0;
2449
2450    for pos in memchr_iter(line_delim, data) {
2451        let line_len = pos - start;
2452        if line_len > skip_bytes {
2453            let take = line_len - skip_bytes;
2454            unsafe {
2455                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2456            }
2457            wp += take;
2458        }
2459        unsafe {
2460            *dst_base.add(wp) = line_delim;
2461        }
2462        wp += 1;
2463        start = pos + 1;
2464    }
2465    if start < data.len() {
2466        let line_len = data.len() - start;
2467        if line_len > skip_bytes {
2468            let take = line_len - skip_bytes;
2469            unsafe {
2470                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2471            }
2472            wp += take;
2473        }
2474        unsafe {
2475            *dst_base.add(wp) = line_delim;
2476        }
2477        wp += 1;
2478    }
2479    unsafe { buf.set_len(wp) };
2480}
2481
2482/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2483fn process_bytes_mid_range(
2484    data: &[u8],
2485    start_byte: usize,
2486    end_byte: usize,
2487    line_delim: u8,
2488    out: &mut impl Write,
2489) -> io::Result<()> {
2490    let skip = start_byte.saturating_sub(1);
2491
2492    if data.len() >= PARALLEL_THRESHOLD {
2493        let chunks = split_for_scope(data, line_delim);
2494        let n = chunks.len();
2495        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2496        rayon::scope(|s| {
2497            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2498                s.spawn(move |_| {
2499                    result.reserve(chunk.len());
2500                    bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, result);
2501                });
2502            }
2503        });
2504        let slices: Vec<IoSlice> = results
2505            .iter()
2506            .filter(|r| !r.is_empty())
2507            .map(|r| IoSlice::new(r))
2508            .collect();
2509        write_ioslices(out, &slices)?;
2510    } else {
2511        let mut buf = Vec::with_capacity(data.len());
2512        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2513        if !buf.is_empty() {
2514            out.write_all(&buf)?;
2515        }
2516    }
2517    Ok(())
2518}
2519
2520/// Process a chunk for mid-range byte extraction.
2521/// For each line, output bytes skip..min(line_len, end_byte).
2522/// Single reserve + deferred set_len.
2523#[inline]
2524fn bytes_mid_range_chunk(
2525    data: &[u8],
2526    skip: usize,
2527    end_byte: usize,
2528    line_delim: u8,
2529    buf: &mut Vec<u8>,
2530) {
2531    buf.reserve(data.len());
2532
2533    let src = data.as_ptr();
2534    let dst_base = buf.as_mut_ptr();
2535    let mut wp = buf.len();
2536    let mut start = 0;
2537
2538    for pos in memchr_iter(line_delim, data) {
2539        let line_len = pos - start;
2540        if line_len > skip {
2541            let take_end = line_len.min(end_byte);
2542            let take = take_end - skip;
2543            unsafe {
2544                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2545            }
2546            wp += take;
2547        }
2548        unsafe {
2549            *dst_base.add(wp) = line_delim;
2550        }
2551        wp += 1;
2552        start = pos + 1;
2553    }
2554    if start < data.len() {
2555        let line_len = data.len() - start;
2556        if line_len > skip {
2557            let take_end = line_len.min(end_byte);
2558            let take = take_end - skip;
2559            unsafe {
2560                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2561            }
2562            wp += take;
2563        }
2564        unsafe {
2565            *dst_base.add(wp) = line_delim;
2566        }
2567        wp += 1;
2568    }
2569    unsafe { buf.set_len(wp) };
2570}
2571
2572/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2573fn process_bytes_complement_mid(
2574    data: &[u8],
2575    skip_start: usize,
2576    skip_end: usize,
2577    line_delim: u8,
2578    out: &mut impl Write,
2579) -> io::Result<()> {
2580    let prefix_bytes = skip_start - 1; // bytes before the skip region
2581    if data.len() >= PARALLEL_THRESHOLD {
2582        let chunks = split_for_scope(data, line_delim);
2583        let n = chunks.len();
2584        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2585        rayon::scope(|s| {
2586            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2587                s.spawn(move |_| {
2588                    result.reserve(chunk.len());
2589                    bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, result);
2590                });
2591            }
2592        });
2593        let slices: Vec<IoSlice> = results
2594            .iter()
2595            .filter(|r| !r.is_empty())
2596            .map(|r| IoSlice::new(r))
2597            .collect();
2598        write_ioslices(out, &slices)?;
2599    } else {
2600        let mut buf = Vec::with_capacity(data.len());
2601        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
2602        if !buf.is_empty() {
2603            out.write_all(&buf)?;
2604        }
2605    }
2606    Ok(())
2607}
2608
2609/// Process a chunk for complement mid-range byte extraction.
2610/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
2611#[inline]
2612fn bytes_complement_mid_chunk(
2613    data: &[u8],
2614    prefix_bytes: usize,
2615    skip_end: usize,
2616    line_delim: u8,
2617    buf: &mut Vec<u8>,
2618) {
2619    buf.reserve(data.len());
2620
2621    let src = data.as_ptr();
2622    let dst_base = buf.as_mut_ptr();
2623    let mut wp = buf.len();
2624    let mut start = 0;
2625
2626    for pos in memchr_iter(line_delim, data) {
2627        let line_len = pos - start;
2628        // Copy prefix (bytes before skip region)
2629        let take_prefix = prefix_bytes.min(line_len);
2630        if take_prefix > 0 {
2631            unsafe {
2632                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2633            }
2634            wp += take_prefix;
2635        }
2636        // Copy suffix (bytes after skip region)
2637        if line_len > skip_end {
2638            let suffix_len = line_len - skip_end;
2639            unsafe {
2640                std::ptr::copy_nonoverlapping(
2641                    src.add(start + skip_end),
2642                    dst_base.add(wp),
2643                    suffix_len,
2644                );
2645            }
2646            wp += suffix_len;
2647        }
2648        unsafe {
2649            *dst_base.add(wp) = line_delim;
2650        }
2651        wp += 1;
2652        start = pos + 1;
2653    }
2654    if start < data.len() {
2655        let line_len = data.len() - start;
2656        let take_prefix = prefix_bytes.min(line_len);
2657        if take_prefix > 0 {
2658            unsafe {
2659                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2660            }
2661            wp += take_prefix;
2662        }
2663        if line_len > skip_end {
2664            let suffix_len = line_len - skip_end;
2665            unsafe {
2666                std::ptr::copy_nonoverlapping(
2667                    src.add(start + skip_end),
2668                    dst_base.add(wp),
2669                    suffix_len,
2670                );
2671            }
2672            wp += suffix_len;
2673        }
2674        unsafe {
2675            *dst_base.add(wp) = line_delim;
2676        }
2677        wp += 1;
2678    }
2679    unsafe { buf.set_len(wp) };
2680}
2681
2682/// Optimized byte/char extraction with batched output and parallel processing.
2683fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2684    let line_delim = cfg.line_delim;
2685    let ranges = cfg.ranges;
2686    let complement = cfg.complement;
2687    let output_delim = cfg.output_delim;
2688
2689    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
2690    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
2691        let max_bytes = ranges[0].end;
2692        if max_bytes < usize::MAX {
2693            return process_bytes_from_start(data, max_bytes, line_delim, out);
2694        }
2695    }
2696
2697    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
2698    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
2699        let skip_bytes = ranges[0].start.saturating_sub(1);
2700        if skip_bytes > 0 {
2701            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
2702        }
2703    }
2704
2705    // Fast path: single mid-range (e.g., cut -b5-100)
2706    if !complement
2707        && ranges.len() == 1
2708        && ranges[0].start > 1
2709        && ranges[0].end < usize::MAX
2710        && output_delim.is_empty()
2711    {
2712        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
2713    }
2714
2715    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
2716    if complement
2717        && ranges.len() == 1
2718        && ranges[0].start == 1
2719        && ranges[0].end < usize::MAX
2720        && output_delim.is_empty()
2721    {
2722        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
2723    }
2724
2725    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
2726    if complement
2727        && ranges.len() == 1
2728        && ranges[0].end == usize::MAX
2729        && ranges[0].start > 1
2730        && output_delim.is_empty()
2731    {
2732        let max_bytes = ranges[0].start - 1;
2733        return process_bytes_from_start(data, max_bytes, line_delim, out);
2734    }
2735
2736    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
2737    if complement
2738        && ranges.len() == 1
2739        && ranges[0].start > 1
2740        && ranges[0].end < usize::MAX
2741        && output_delim.is_empty()
2742    {
2743        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
2744    }
2745
2746    if data.len() >= PARALLEL_THRESHOLD {
2747        let chunks = split_for_scope(data, line_delim);
2748        let n = chunks.len();
2749        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2750        rayon::scope(|s| {
2751            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2752                s.spawn(move |_| {
2753                    result.reserve(chunk.len() + 1);
2754                    process_bytes_chunk(
2755                        chunk,
2756                        ranges,
2757                        complement,
2758                        output_delim,
2759                        line_delim,
2760                        result,
2761                    );
2762                });
2763            }
2764        });
2765        let slices: Vec<IoSlice> = results
2766            .iter()
2767            .filter(|r| !r.is_empty())
2768            .map(|r| IoSlice::new(r))
2769            .collect();
2770        write_ioslices(out, &slices)?;
2771    } else {
2772        // +1 for potential trailing line_delim when input doesn't end with one
2773        let mut buf = Vec::with_capacity(data.len() + 1);
2774        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
2775        if !buf.is_empty() {
2776            out.write_all(&buf)?;
2777        }
2778    }
2779    Ok(())
2780}
2781
/// Process a chunk of data for byte/char extraction.
///
/// Splits `data` into lines on `line_delim` and runs `cut_bytes_to_buf` on
/// each, appending the result plus a line delimiter to `buf`. A final line
/// without a trailing delimiter still gets one appended.
///
/// Uses raw pointer arithmetic for the newline scan.
/// NOTE(review): `buf_push` appears to be an unchecked append relying on
/// previously reserved capacity — the `needed` reserve inside
/// `cut_bytes_to_buf` includes one spare byte, which covers the delimiter
/// pushed here after each line. Verify against `buf_push`'s definition.
fn process_bytes_chunk(
    data: &[u8],
    ranges: &[Range],
    complement: bool,
    output_delim: &[u8],
    line_delim: u8,
    buf: &mut Vec<u8>,
) {
    // Up-front reserve for the common case; cut_bytes_to_buf re-checks
    // capacity per line, so this is an optimization, not a safety requirement.
    buf.reserve(data.len());
    let base = data.as_ptr();
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        // SAFETY: start..end_pos lies inside data (end_pos comes from memchr).
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
        // SAFETY: cut_bytes_to_buf reserved one spare byte for this delimiter.
        unsafe { buf_push(buf, line_delim) };
        start = end_pos + 1;
    }
    if start < data.len() {
        // Trailing line with no terminator — process it, then terminate it.
        // SAFETY: start < data.len(), so the slice stays in bounds.
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
        // SAFETY: as above, one spare byte of capacity was reserved.
        unsafe { buf_push(buf, line_delim) };
    }
}
2809
2810/// Extract byte ranges from a line into the output buffer.
2811/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
2812/// Raw pointer arithmetic eliminates per-range bounds checking.
2813#[inline(always)]
2814fn cut_bytes_to_buf(
2815    line: &[u8],
2816    ranges: &[Range],
2817    complement: bool,
2818    output_delim: &[u8],
2819    buf: &mut Vec<u8>,
2820) {
2821    let len = line.len();
2822    let base = line.as_ptr();
2823    let mut first_range = true;
2824
2825    // Reserve worst case: full line + delimiters between ranges
2826    let needed = len + output_delim.len() * ranges.len() + 1;
2827    if buf.capacity() - buf.len() < needed {
2828        buf.reserve(needed);
2829    }
2830
2831    if complement {
2832        let mut pos: usize = 1;
2833        for r in ranges {
2834            let rs = r.start;
2835            let re = r.end.min(len);
2836            if pos < rs {
2837                if !first_range && !output_delim.is_empty() {
2838                    unsafe { buf_extend(buf, output_delim) };
2839                }
2840                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
2841                first_range = false;
2842            }
2843            pos = re + 1;
2844            if pos > len {
2845                break;
2846            }
2847        }
2848        if pos <= len {
2849            if !first_range && !output_delim.is_empty() {
2850                unsafe { buf_extend(buf, output_delim) };
2851            }
2852            unsafe {
2853                buf_extend(
2854                    buf,
2855                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
2856                )
2857            };
2858        }
2859    } else if output_delim.is_empty() && ranges.len() == 1 {
2860        // Ultra-fast path: single range, no output delimiter
2861        let start = ranges[0].start.saturating_sub(1);
2862        let end = ranges[0].end.min(len);
2863        if start < len {
2864            unsafe {
2865                buf_extend(
2866                    buf,
2867                    std::slice::from_raw_parts(base.add(start), end - start),
2868                )
2869            };
2870        }
2871    } else {
2872        for r in ranges {
2873            let start = r.start.saturating_sub(1);
2874            let end = r.end.min(len);
2875            if start >= len {
2876                break;
2877            }
2878            if !first_range && !output_delim.is_empty() {
2879                unsafe { buf_extend(buf, output_delim) };
2880            }
2881            unsafe {
2882                buf_extend(
2883                    buf,
2884                    std::slice::from_raw_parts(base.add(start), end - start),
2885                )
2886            };
2887            first_range = false;
2888        }
2889    }
2890}
2891
2892// ── Public API ───────────────────────────────────────────────────────────
2893
2894/// Cut fields from a line using a delimiter. Writes to `out`.
2895#[inline]
2896pub fn cut_fields(
2897    line: &[u8],
2898    delim: u8,
2899    ranges: &[Range],
2900    complement: bool,
2901    output_delim: &[u8],
2902    suppress_no_delim: bool,
2903    out: &mut impl Write,
2904) -> io::Result<bool> {
2905    if memchr::memchr(delim, line).is_none() {
2906        if !suppress_no_delim {
2907            out.write_all(line)?;
2908            return Ok(true);
2909        }
2910        return Ok(false);
2911    }
2912
2913    let mut field_num: usize = 1;
2914    let mut field_start: usize = 0;
2915    let mut first_output = true;
2916
2917    for delim_pos in memchr_iter(delim, line) {
2918        let selected = in_ranges(ranges, field_num) != complement;
2919        if selected {
2920            if !first_output {
2921                out.write_all(output_delim)?;
2922            }
2923            out.write_all(&line[field_start..delim_pos])?;
2924            first_output = false;
2925        }
2926        field_start = delim_pos + 1;
2927        field_num += 1;
2928    }
2929
2930    let selected = in_ranges(ranges, field_num) != complement;
2931    if selected {
2932        if !first_output {
2933            out.write_all(output_delim)?;
2934        }
2935        out.write_all(&line[field_start..])?;
2936    }
2937
2938    Ok(true)
2939}
2940
2941/// Cut bytes/chars from a line. Writes selected bytes to `out`.
2942#[inline]
2943pub fn cut_bytes(
2944    line: &[u8],
2945    ranges: &[Range],
2946    complement: bool,
2947    output_delim: &[u8],
2948    out: &mut impl Write,
2949) -> io::Result<bool> {
2950    let mut first_range = true;
2951
2952    if complement {
2953        let len = line.len();
2954        let mut comp_ranges = Vec::new();
2955        let mut pos: usize = 1;
2956        for r in ranges {
2957            let rs = r.start;
2958            let re = r.end.min(len);
2959            if pos < rs {
2960                comp_ranges.push((pos, rs - 1));
2961            }
2962            pos = re + 1;
2963            if pos > len {
2964                break;
2965            }
2966        }
2967        if pos <= len {
2968            comp_ranges.push((pos, len));
2969        }
2970        for &(s, e) in &comp_ranges {
2971            if !first_range && !output_delim.is_empty() {
2972                out.write_all(output_delim)?;
2973            }
2974            out.write_all(&line[s - 1..e])?;
2975            first_range = false;
2976        }
2977    } else {
2978        for r in ranges {
2979            let start = r.start.saturating_sub(1);
2980            let end = r.end.min(line.len());
2981            if start >= line.len() {
2982                break;
2983            }
2984            if !first_range && !output_delim.is_empty() {
2985                out.write_all(output_delim)?;
2986            }
2987            out.write_all(&line[start..end])?;
2988            first_range = false;
2989        }
2990    }
2991    Ok(true)
2992}
2993
/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
/// Output is always <= input (we remove everything after first delimiter per line).
/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
///
/// Lines without delimiter pass through unchanged (unless suppress=true).
/// Lines with delimiter: keep bytes before delimiter + newline.
///
/// Invariant: the write pointer `wp` never overtakes the read pointer `rp`,
/// so every `copy_within` moves bytes backwards (or is a no-op) and the
/// `data[wp] = line_delim` store always lands inside already-consumed input.
pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
    let len = data.len();
    let mut wp: usize = 0; // write position: compacted output length so far
    let mut rp: usize = 0; // read position: next unconsumed input byte

    while rp < len {
        // Find whichever comes first: the field delimiter or the end of line.
        match memchr::memchr2(delim, line_delim, &data[rp..]) {
            None => {
                // Rest is partial line, no delimiter
                if suppress {
                    // suppress: skip lines without delimiter
                    break;
                }
                // Echo the unterminated tail verbatim.
                let remaining = len - rp;
                if wp != rp {
                    data.copy_within(rp..len, wp);
                }
                wp += remaining;
                break;
            }
            Some(offset) => {
                let actual = rp + offset;
                if data[actual] == line_delim {
                    // No delimiter on this line
                    if suppress {
                        // Skip this line entirely
                        rp = actual + 1;
                    } else {
                        // Output entire line including newline
                        let chunk_len = actual + 1 - rp;
                        if wp != rp {
                            data.copy_within(rp..actual + 1, wp);
                        }
                        wp += chunk_len;
                        rp = actual + 1;
                    }
                } else {
                    // Delimiter found: output field 1 (up to delimiter) + newline
                    let field_len = actual - rp;
                    if wp != rp && field_len > 0 {
                        data.copy_within(rp..actual, wp);
                    }
                    wp += field_len;
                    // wp <= actual < len here (see invariant above), so this
                    // store is in bounds even when the input's final line has
                    // no trailing newline.
                    data[wp] = line_delim;
                    wp += 1;
                    // Skip to next newline
                    match memchr::memchr(line_delim, &data[actual + 1..]) {
                        None => {
                            rp = len;
                        }
                        Some(nl_off) => {
                            rp = actual + 1 + nl_off + 1;
                        }
                    }
                }
            }
        }
    }
    wp
}
3061
3062/// Process a full data buffer (from mmap or read) with cut operation.
3063pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3064    match cfg.mode {
3065        CutMode::Fields => process_fields_fast(data, cfg, out),
3066        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3067    }
3068}
3069
3070/// Process input from a reader (for stdin).
3071/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3072/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3073/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3074pub fn process_cut_reader<R: BufRead>(
3075    mut reader: R,
3076    cfg: &CutConfig,
3077    out: &mut impl Write,
3078) -> io::Result<()> {
3079    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3080    let line_delim = cfg.line_delim;
3081
3082    // Read large chunks and process in batch.
3083    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3084    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3085
3086    loop {
3087        // Read up to CHUNK_SIZE bytes
3088        buf.reserve(CHUNK_SIZE);
3089        let read_start = buf.len();
3090        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3091        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3092        buf.truncate(read_start + n);
3093
3094        if buf.is_empty() {
3095            break;
3096        }
3097
3098        if n == 0 {
3099            // EOF with leftover data (last line without terminator)
3100            process_cut_data(&buf, cfg, out)?;
3101            break;
3102        }
3103
3104        // Find the last line delimiter in the buffer so we process complete lines
3105        let process_end = match memchr::memrchr(line_delim, &buf) {
3106            Some(pos) => pos + 1,
3107            None => {
3108                // No line delimiter found — keep accumulating
3109                continue;
3110            }
3111        };
3112
3113        // Process the complete lines using the fast batch path
3114        process_cut_data(&buf[..process_end], cfg, out)?;
3115
3116        // Shift leftover to the front for next iteration
3117        let leftover_len = buf.len() - process_end;
3118        if leftover_len > 0 {
3119            buf.copy_within(process_end.., 0);
3120        }
3121        buf.truncate(leftover_len);
3122    }
3123
3124    Ok(())
3125}
3126
/// Read as many bytes as possible into buf, retrying on partial reads.
///
/// Returns the number of bytes read, which equals `buf.len()` unless EOF was
/// reached first. `ErrorKind::Interrupted` is retried on every iteration,
/// including the very first read (the previous fast path propagated an
/// `Interrupted` from the first read, aborting the whole run on an unlucky
/// signal). Any other error is returned immediately.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // EOF: return what we have so far.
            Ok(0) => break,
            Ok(n) => total += n,
            // Interrupted syscall — retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3146
3147/// In-place cut processing for mutable data buffers.
3148/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3149/// for the given configuration (caller should fall back to regular processing).
3150///
3151/// In-place avoids allocating intermediate output buffers — the result is written
3152/// directly into the input buffer (output is always <= input for non-complement modes
3153/// with default output delimiter).
3154///
3155/// Note: if the input does not end with line_delim, we fall back to the regular
3156/// path because GNU cut always adds a trailing line delimiter, and the in-place
3157/// buffer cannot grow beyond the input size.
3158pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3159    if cfg.complement {
3160        return None;
3161    }
3162    // If input doesn't end with line_delim, the output may need an extra byte
3163    // (GNU cut always terminates the last line). In-place can't grow the buffer,
3164    // so fall back to the regular allocating path.
3165    if data.is_empty() || data[data.len() - 1] != cfg.line_delim {
3166        return None;
3167    }
3168
3169    match cfg.mode {
3170        CutMode::Fields => {
3171            // Only handle when output delimiter matches input (single-byte)
3172            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3173                return None;
3174            }
3175            if cfg.delim == cfg.line_delim {
3176                return None;
3177            }
3178            Some(cut_fields_inplace_general(
3179                data,
3180                cfg.delim,
3181                cfg.line_delim,
3182                cfg.ranges,
3183                cfg.suppress_no_delim,
3184            ))
3185        }
3186        CutMode::Bytes | CutMode::Characters => {
3187            if !cfg.output_delim.is_empty() {
3188                return None;
3189            }
3190            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3191        }
3192    }
3193}
3194
3195/// In-place generalized field extraction.
3196/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
3197fn cut_fields_inplace_general(
3198    data: &mut [u8],
3199    delim: u8,
3200    line_delim: u8,
3201    ranges: &[Range],
3202    suppress: bool,
3203) -> usize {
3204    // Special case: field 1 only (existing optimized path)
3205    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
3206        return cut_field1_inplace(data, delim, line_delim, suppress);
3207    }
3208
3209    let len = data.len();
3210    if len == 0 {
3211        return 0;
3212    }
3213
3214    let max_field = ranges.last().map_or(0, |r| r.end);
3215    let max_delims = max_field.min(64);
3216    let mut wp: usize = 0;
3217    let mut rp: usize = 0;
3218
3219    while rp < len {
3220        let line_end = memchr::memchr(line_delim, &data[rp..])
3221            .map(|p| rp + p)
3222            .unwrap_or(len);
3223        let line_len = line_end - rp;
3224
3225        // Collect delimiter positions (relative to line start)
3226        let mut delim_pos = [0usize; 64];
3227        let mut num_delims: usize = 0;
3228
3229        for pos in memchr_iter(delim, &data[rp..line_end]) {
3230            if num_delims < max_delims {
3231                delim_pos[num_delims] = pos;
3232                num_delims += 1;
3233                if num_delims >= max_delims {
3234                    break;
3235                }
3236            }
3237        }
3238
3239        if num_delims == 0 {
3240            // No delimiter in line
3241            if !suppress {
3242                if wp != rp {
3243                    data.copy_within(rp..line_end, wp);
3244                }
3245                wp += line_len;
3246                if line_end < len {
3247                    data[wp] = line_delim;
3248                    wp += 1;
3249                }
3250            }
3251        } else {
3252            let total_fields = num_delims + 1;
3253            let mut first_output = true;
3254
3255            for r in ranges {
3256                let range_start = r.start;
3257                let range_end = r.end.min(total_fields);
3258                if range_start > total_fields {
3259                    break;
3260                }
3261                for field_num in range_start..=range_end {
3262                    if field_num > total_fields {
3263                        break;
3264                    }
3265
3266                    let field_start = if field_num == 1 {
3267                        0
3268                    } else if field_num - 2 < num_delims {
3269                        delim_pos[field_num - 2] + 1
3270                    } else {
3271                        continue;
3272                    };
3273                    let field_end = if field_num <= num_delims {
3274                        delim_pos[field_num - 1]
3275                    } else {
3276                        line_len
3277                    };
3278
3279                    if !first_output {
3280                        data[wp] = delim;
3281                        wp += 1;
3282                    }
3283                    let flen = field_end - field_start;
3284                    if flen > 0 {
3285                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
3286                        wp += flen;
3287                    }
3288                    first_output = false;
3289                }
3290            }
3291
3292            if !first_output && line_end < len {
3293                data[wp] = line_delim;
3294                wp += 1;
3295            } else if first_output && line_end < len {
3296                // No fields selected but line had delimiters — output empty line
3297                data[wp] = line_delim;
3298                wp += 1;
3299            }
3300        }
3301
3302        rp = if line_end < len { line_end + 1 } else { len };
3303    }
3304
3305    wp
3306}
3307
/// In-place byte/char range extraction.
///
/// Compacts each line down to its selected byte ranges, writing the result
/// over the front of `data` and returning the new logical length. Ranges are
/// 1-based and assumed sorted and non-overlapping, so the write pointer never
/// overtakes the read pointer and each `copy_within` moves bytes backwards
/// (or is a no-op).
fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
    let len = data.len();
    if len == 0 {
        return 0;
    }

    // Quick check: single range from byte 1 to end = no-op
    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
        return len;
    }

    // Single range from byte 1: fast truncation path
    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
    }

    let mut wp: usize = 0; // write position (output length so far)
    let mut rp: usize = 0; // read position (start of current line)

    while rp < len {
        let line_end = memchr::memchr(line_delim, &data[rp..])
            .map(|p| rp + p)
            .unwrap_or(len);
        let line_len = line_end - rp;

        for r in ranges {
            let start = r.start.saturating_sub(1); // 1-based -> 0-based
            let end = r.end.min(line_len);
            if start >= line_len {
                // Sorted ranges: later ranges start even further past the line.
                break;
            }
            let flen = end - start;
            if flen > 0 {
                data.copy_within(rp + start..rp + start + flen, wp);
                wp += flen;
            }
        }

        // Preserve the terminator for every terminated input line.
        if line_end < len {
            data[wp] = line_delim;
            wp += 1;
        }

        rp = if line_end < len { line_end + 1 } else { len };
    }

    wp
}
3357
3358/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3359fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3360    let len = data.len();
3361
3362    // Quick check: see if all lines fit within max_bytes (common case)
3363    let mut all_fit = true;
3364    let mut start = 0;
3365    for pos in memchr_iter(line_delim, data) {
3366        if pos - start > max_bytes {
3367            all_fit = false;
3368            break;
3369        }
3370        start = pos + 1;
3371    }
3372    if all_fit && start < len && len - start > max_bytes {
3373        all_fit = false;
3374    }
3375    if all_fit {
3376        return len;
3377    }
3378
3379    // Some lines need truncation
3380    let mut wp: usize = 0;
3381    let mut rp: usize = 0;
3382
3383    while rp < len {
3384        let line_end = memchr::memchr(line_delim, &data[rp..])
3385            .map(|p| rp + p)
3386            .unwrap_or(len);
3387        let line_len = line_end - rp;
3388
3389        let take = line_len.min(max_bytes);
3390        if take > 0 && wp != rp {
3391            data.copy_within(rp..rp + take, wp);
3392        }
3393        wp += take;
3394
3395        if line_end < len {
3396            data[wp] = line_delim;
3397            wp += 1;
3398        }
3399
3400        rp = if line_end < len { line_end + 1 } else { len };
3401    }
3402
3403    wp
3404}
3405
/// Cut operation mode: byte ranges (-b), character ranges (-c), or
/// delimiter-separated fields (-f).
///
/// Equality is total for this fieldless enum, so `Eq` is derived alongside
/// `PartialEq` (clippy: derive_partial_eq_without_eq).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Select byte positions (-b).
    Bytes,
    /// Select character positions (-c); dispatched to the byte path here.
    Characters,
    /// Select delimited fields (-f).
    Fields,
}