Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Minimum file size for parallel processing (32MB).
/// Files above this threshold use rayon parallel chunked processing.
/// 32MB balances rayon init overhead + buffer allocation against parallel benefits.
/// For 10MB files, sequential is faster due to thread coordination + memory overhead.
const PARALLEL_THRESHOLD: usize = 32 * 1024 * 1024;

/// Max iovec entries per writev call (matches the common Linux IOV_MAX default).
/// `write_ioslices` batches its slices into groups of this size.
const MAX_IOV: usize = 1024;

/// Input chunk size for sequential processing. Keeps output buffer (~256KB)
/// hot in L2 cache and avoids full-size allocation page faults.
const SEQ_CHUNK: usize = 256 * 1024;
16
17/// Process data in newline-aligned chunks, writing each chunk's output immediately.
18/// Avoids allocating a full-size output buffer (e.g. 12MB for 11MB input).
19fn process_chunked(
20    data: &[u8],
21    line_delim: u8,
22    out: &mut impl Write,
23    mut process_fn: impl FnMut(&[u8], &mut Vec<u8>),
24) -> io::Result<()> {
25    let mut buf = Vec::with_capacity(SEQ_CHUNK * 2);
26    let mut start = 0;
27    while start < data.len() {
28        let end = if start + SEQ_CHUNK >= data.len() {
29            data.len()
30        } else {
31            match memchr::memrchr(line_delim, &data[start..start + SEQ_CHUNK]) {
32                Some(pos) => start + pos + 1,
33                None => (start + SEQ_CHUNK).min(data.len()),
34            }
35        };
36        buf.clear();
37        process_fn(&data[start..end], &mut buf);
38        if !buf.is_empty() {
39            out.write_all(&buf)?;
40        }
41        start = end;
42    }
43    Ok(())
44}
45
/// Configuration for cut operations.
pub struct CutConfig<'a> {
    /// Selection mode (`CutMode` is declared elsewhere in this file).
    pub mode: CutMode,
    /// Sorted, merged, 1-based selection ranges (as produced by `parse_ranges`).
    pub ranges: &'a [Range],
    /// Invert the selection: output everything NOT in `ranges`.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes emitted between selected output fields; fast paths below require
    /// this to be exactly the single byte `delim`.
    pub output_delim: &'a [u8],
    /// When true, lines containing no `delim` are suppressed instead of
    /// passed through unchanged.
    pub suppress_no_delim: bool,
    /// Line terminator byte (normally `\n`; presumably NUL for a -z style
    /// option — confirm against the CLI layer).
    pub line_delim: u8,
}
56
/// A range specification like 1, 3-5, -3, 4-
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based, 0 means "from beginning"
    pub end: usize,   // 1-based, usize::MAX means "to end"
}

/// Parse a LIST specification like "1,3-5,7-" into ranges.
/// Each range is 1-based. Returns sorted, merged ranges.
/// When `no_merge_adjacent` is true, overlapping ranges are still merged but
/// adjacent ranges (e.g., 1-2,3-4) are kept separate. This is needed when
/// `--output-delimiter` is specified for byte/char mode so the delimiter is
/// inserted between originally separate but adjacent ranges.
pub fn parse_ranges(spec: &str, no_merge_adjacent: bool) -> Result<Vec<Range>, String> {
    /// Parse one comma-separated token ("5", "3-7", "-3", "4-") into a Range.
    fn parse_one(token: &str) -> Result<Range, String> {
        match token.find('-') {
            Some(dash) => {
                let (lo, hi) = (&token[..dash], &token[dash + 1..]);

                // A bare "-" (both endpoints missing) is rejected.
                if lo.is_empty() && hi.is_empty() {
                    return Err("invalid range with no endpoint: -".to_string());
                }

                // Missing left endpoint means "from the beginning".
                let start = if lo.is_empty() {
                    1
                } else {
                    lo.parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", token))?
                };

                // Missing right endpoint means "to the end of the line".
                let end = if hi.is_empty() {
                    usize::MAX
                } else {
                    hi.parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", token))?
                };

                if start == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                if start > end {
                    return Err(format!("invalid decreasing range: '{}'", token));
                }
                Ok(Range { start, end })
            }
            None => {
                // Single number: a one-element range.
                let n = token
                    .parse::<usize>()
                    .map_err(|_| format!("invalid field: '{}'", token))?;
                if n == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                Ok(Range { start: n, end: n })
            }
        }
    }

    let mut parsed = Vec::new();
    for token in spec.split(',') {
        let token = token.trim();
        if !token.is_empty() {
            parsed.push(parse_one(token)?);
        }
    }

    if parsed.is_empty() {
        return Err("you must specify a list of bytes, characters, or fields".to_string());
    }

    // Sort, then merge. Overlapping ranges always merge; adjacent ranges
    // (r.start == last.end + 1) merge only when no_merge_adjacent is false.
    parsed.sort_by_key(|r| (r.start, r.end));
    let mut merged: Vec<Range> = Vec::with_capacity(parsed.len());
    for r in parsed {
        match merged.last_mut() {
            Some(last)
                if r.start <= last.end
                    || (!no_merge_adjacent && r.start <= last.end.saturating_add(1)) =>
            {
                last.end = last.end.max(r.end);
            }
            _ => merged.push(r),
        }
    }

    Ok(merged)
}
150
151/// Check if a 1-based position is in any range.
152/// Ranges must be sorted. Uses early exit since ranges are sorted.
153#[inline(always)]
154fn in_ranges(ranges: &[Range], pos: usize) -> bool {
155    for r in ranges {
156        if pos < r.start {
157            return false;
158        }
159        if pos <= r.end {
160            return true;
161        }
162    }
163    false
164}
165
166/// Pre-compute a 64-bit mask for field selection.
167/// Bit i-1 is set if field i should be output.
168#[inline]
169fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
170    let mut mask: u64 = 0;
171    for i in 1..=64u32 {
172        let in_range = in_ranges(ranges, i as usize);
173        if in_range != complement {
174            mask |= 1u64 << (i - 1);
175        }
176    }
177    mask
178}
179
180/// Check if a field should be selected, using bitset for first 64 fields.
181#[inline(always)]
182fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
183    if field_num <= 64 {
184        (mask >> (field_num - 1)) & 1 == 1
185    } else {
186        in_ranges(ranges, field_num) != complement
187    }
188}
189
190// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
191
/// Append a slice to buf without capacity checks.
/// Caller MUST ensure buf has enough remaining capacity.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    // SAFETY: caller guarantees `buf.capacity() - buf.len() >= data.len()`,
    // so the destination region is allocated and the new length is valid
    // initialized memory.
    unsafe {
        let old_len = buf.len();
        let dst = buf.as_mut_ptr().add(old_len);
        std::ptr::copy_nonoverlapping(data.as_ptr(), dst, data.len());
        buf.set_len(old_len + data.len());
    }
}
202
/// Append a single byte to buf without capacity checks.
/// Caller MUST ensure buf has enough remaining capacity.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    // SAFETY: caller guarantees at least one byte of spare capacity past len.
    unsafe {
        let old_len = buf.len();
        buf.as_mut_ptr().add(old_len).write(b);
        buf.set_len(old_len + 1);
    }
}
213
214/// Write multiple IoSlice buffers using write_vectored (writev syscall).
215/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
216/// Cold path (partial write) is out-of-line to keep the hot loop tight.
217#[inline]
218fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
219    if slices.is_empty() {
220        return Ok(());
221    }
222    for batch in slices.chunks(MAX_IOV) {
223        let total: usize = batch.iter().map(|s| s.len()).sum();
224        let written = out.write_vectored(batch)?;
225        if written >= total {
226            continue;
227        }
228        if written == 0 {
229            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
230        }
231        write_ioslices_slow(out, batch, written)?;
232    }
233    Ok(())
234}
235
/// Handle partial write_vectored (cold path, never inlined).
/// Skips the `skip` bytes already written, then writes everything remaining.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for s in slices {
        let n = s.len();
        if skip >= n {
            // This slice was fully consumed by the vectored write.
            skip -= n;
        } else {
            // Write the unconsumed tail of this slice, then all later slices.
            out.write_all(&s[skip..])?;
            skip = 0;
        }
    }
    Ok(())
}
255
256// ── Chunk splitting for parallel processing ──────────────────────────────
257
/// Number of available CPUs for parallel chunk splitting.
/// Uses std::thread::available_parallelism() to avoid triggering premature
/// rayon pool initialization (~300-500µs). Rayon pool inits on first scope() call.
/// Falls back to 1 when the count cannot be determined.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(n) => n.get(),
        Err(_) => 1,
    }
}
267
268/// Split data into chunks for rayon::scope parallel processing.
269/// Uses Rayon's thread count to match the number of worker threads.
270fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
271    let num_threads = num_cpus().max(1);
272    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
273        return vec![data];
274    }
275
276    let chunk_size = data.len() / num_threads;
277    let mut chunks = Vec::with_capacity(num_threads);
278    let mut pos = 0;
279
280    for _ in 0..num_threads - 1 {
281        let target = pos + chunk_size;
282        if target >= data.len() {
283            break;
284        }
285        let boundary = memchr::memchr(line_delim, &data[target..])
286            .map(|p| target + p + 1)
287            .unwrap_or(data.len());
288        if boundary > pos {
289            chunks.push(&data[pos..boundary]);
290        }
291        pos = boundary;
292    }
293
294    if pos < data.len() {
295        chunks.push(&data[pos..]);
296    }
297
298    chunks
299}
300
301// ── Fast path: multi-field non-contiguous extraction ─────────────────────
302
303/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
304/// Pre-collects delimiter positions per line into a stack-allocated array,
305/// then directly indexes into them for each selected field.
306/// This is O(max_field) per line instead of O(num_fields * scan_length).
307fn process_fields_multi_select(
308    data: &[u8],
309    delim: u8,
310    line_delim: u8,
311    ranges: &[Range],
312    suppress: bool,
313    out: &mut impl Write,
314) -> io::Result<()> {
315    let max_field = ranges.last().map_or(0, |r| r.end);
316
317    if data.len() >= PARALLEL_THRESHOLD {
318        let chunks = split_for_scope(data, line_delim);
319        let n = chunks.len();
320        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
321        rayon::scope(|s| {
322            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
323                s.spawn(move |_| {
324                    result.reserve(chunk.len() * 3 / 4);
325                    multi_select_chunk(
326                        chunk, delim, line_delim, ranges, max_field, suppress, result,
327                    );
328                });
329            }
330        });
331        let slices: Vec<IoSlice> = results
332            .iter()
333            .filter(|r| !r.is_empty())
334            .map(|r| IoSlice::new(r))
335            .collect();
336        write_ioslices(out, &slices)?;
337    } else {
338        process_chunked(data, line_delim, out, |chunk, buf| {
339            multi_select_chunk(chunk, delim, line_delim, ranges, max_field, suppress, buf);
340        })?;
341    }
342    Ok(())
343}
344
345/// Process a chunk for multi-field extraction.
346/// Uses single-pass memchr2 with bitmask field selection when max_field <= 64.
347/// Falls back to two-level scanning for larger field numbers.
348fn multi_select_chunk(
349    data: &[u8],
350    delim: u8,
351    line_delim: u8,
352    ranges: &[Range],
353    max_field: usize,
354    suppress: bool,
355    buf: &mut Vec<u8>,
356) {
357    // Single-pass bitmask approach for small field numbers (common case).
358    // One memchr2 scan finds both delimiters and newlines simultaneously,
359    // avoiding per-line function call overhead and delimiter position arrays.
360    if max_field <= 64 && delim != line_delim {
361        let mut mask: u64 = 0;
362        for r in ranges {
363            let s = r.start.max(1);
364            let e = r.end.min(64);
365            for f in s..=e {
366                mask |= 1u64 << (f - 1);
367            }
368        }
369        multi_select_chunk_bitmask(data, delim, line_delim, mask, max_field, suppress, buf);
370        return;
371    }
372
373    // Fallback: two-level scanning for large field numbers
374    buf.reserve(data.len());
375    let base = data.as_ptr();
376    let mut start = 0;
377    let max_delims = max_field.min(128);
378
379    for end_pos in memchr_iter(line_delim, data) {
380        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
381        multi_select_line_fast(
382            line, delim, line_delim, ranges, max_delims, suppress, buf, start, base,
383        );
384        start = end_pos + 1;
385    }
386    if start < data.len() {
387        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
388        multi_select_line_fast(
389            line, delim, line_delim, ranges, max_delims, suppress, buf, start, base,
390        );
391    }
392}
393
394/// Per-line multi-field extraction with early termination after max_field.
395/// For `-f1,3,5` on 20-field CSV, this scans only 5 delimiters per line
396/// instead of all 20, reducing per-hit overhead by ~75%.
397fn multi_select_chunk_bitmask(
398    data: &[u8],
399    delim: u8,
400    line_delim: u8,
401    mask: u64,
402    max_field: usize,
403    suppress: bool,
404    buf: &mut Vec<u8>,
405) {
406    // Single-pass memchr2 approach: scan for both delimiters and newlines
407    // simultaneously. This avoids per-line memchr_iter creation overhead,
408    // which dominates for short lines (200K lines × ~35 bytes each).
409    buf.reserve(data.len() + 1);
410    let initial_len = buf.len();
411    let out_base = unsafe { buf.as_mut_ptr().add(initial_len) };
412    let src = data.as_ptr();
413    let mut wp: usize = 0;
414
415    let mut field_num: usize = 1; // current field (1-based)
416    let mut field_start: usize = 0; // start of current field
417    let mut first_output = true; // first field on current line?
418    let mut has_delim = false; // current line has any delimiter?
419
420    for pos in memchr::memchr2_iter(delim, line_delim, data) {
421        if data[pos] == line_delim {
422            // End of line: handle last field + write newline
423            if !has_delim {
424                // Line had no delimiter: pass through or suppress
425                if !suppress {
426                    let len = pos - field_start;
427                    unsafe {
428                        std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
429                    }
430                    wp += len;
431                    unsafe {
432                        *out_base.add(wp) = line_delim;
433                    }
434                    wp += 1;
435                }
436            } else {
437                // Check if last field is selected
438                if field_num <= 64 && (mask & (1u64 << (field_num - 1))) != 0 {
439                    if !first_output {
440                        unsafe {
441                            *out_base.add(wp) = delim;
442                        }
443                        wp += 1;
444                    }
445                    let len = pos - field_start;
446                    unsafe {
447                        std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
448                    }
449                    wp += len;
450                }
451                unsafe {
452                    *out_base.add(wp) = line_delim;
453                }
454                wp += 1;
455            }
456            // Reset for next line
457            field_num = 1;
458            field_start = pos + 1;
459            first_output = true;
460            has_delim = false;
461        } else {
462            // Delimiter found
463            has_delim = true;
464            if field_num <= max_field && (mask & (1u64 << (field_num - 1))) != 0 {
465                if !first_output {
466                    unsafe {
467                        *out_base.add(wp) = delim;
468                    }
469                    wp += 1;
470                }
471                let len = pos - field_start;
472                unsafe {
473                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
474                }
475                wp += len;
476                first_output = false;
477            }
478            field_num += 1;
479            field_start = pos + 1;
480        }
481    }
482
483    // Handle final line without trailing newline
484    if field_start < data.len() {
485        if !has_delim {
486            if !suppress {
487                let len = data.len() - field_start;
488                unsafe {
489                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
490                }
491                wp += len;
492                unsafe {
493                    *out_base.add(wp) = line_delim;
494                }
495                wp += 1;
496            }
497        } else {
498            if field_num <= 64 && (mask & (1u64 << (field_num - 1))) != 0 {
499                if !first_output {
500                    unsafe {
501                        *out_base.add(wp) = delim;
502                    }
503                    wp += 1;
504                }
505                let len = data.len() - field_start;
506                unsafe {
507                    std::ptr::copy_nonoverlapping(src.add(field_start), out_base.add(wp), len);
508                }
509                wp += len;
510            }
511            unsafe {
512                *out_base.add(wp) = line_delim;
513            }
514            wp += 1;
515        }
516    }
517
518    unsafe {
519        buf.set_len(initial_len + wp);
520    }
521}
522
/// Extract selected fields from a single line using delimiter position scanning.
/// Optimized: collects delimiter positions into a stack array with early exit at max_delims,
/// then indexes directly for each selected field. Uses raw pointer arithmetic.
///
/// Contract: `max_delims` MUST be <= 128 (the size of the stack array below);
/// the caller enforces this with `max_field.min(128)`. `ranges` must be the
/// sorted, merged output of `parse_ranges`. The two trailing `_`-prefixed
/// parameters are unused but kept so both call sites compile unchanged.
/// NOTE(review): when a line has more delimiters than `max_delims`, fields past
/// the truncation point are treated as nonexistent — confirm this matches the
/// intended behavior for selections addressing fields beyond the 129th.
#[inline(always)]
fn multi_select_line_fast(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    max_delims: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
    _line_abs_start: usize,
    _data_base: *const u8,
) {
    let len = line.len();
    // Empty line: no delimiter, so pass through (just the terminator) unless
    // suppression is on.
    if len == 0 {
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let base = line.as_ptr();

    // Collect delimiter positions up to max_delims (early exit).
    let mut delim_pos = [0usize; 128];
    let mut num_delims: usize = 0;

    for pos in memchr_iter(delim, line) {
        if num_delims < max_delims {
            delim_pos[num_delims] = pos;
            num_delims += 1;
            if num_delims >= max_delims {
                break;
            }
        }
    }

    // No delimiter on this line: pass the whole line through or suppress it.
    if num_delims == 0 {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    let total_fields = num_delims + 1;
    let mut first_output = true;

    // Ranges are sorted, so once one starts past the last field, none of the
    // remaining ranges can match either.
    for r in ranges {
        let range_start = r.start;
        let range_end = r.end.min(total_fields);
        if range_start > total_fields {
            break;
        }
        for field_num in range_start..=range_end {
            if field_num > total_fields {
                break;
            }

            // Field f spans (delim_pos[f-2] + 1)..delim_pos[f-1]; field 1
            // starts at 0 and the last field ends at the line's end.
            let field_start = if field_num == 1 {
                0
            } else if field_num - 2 < num_delims {
                delim_pos[field_num - 2] + 1
            } else {
                continue;
            };
            let field_end = if field_num <= num_delims {
                delim_pos[field_num - 1]
            } else {
                len
            };

            // Separate output fields with the input delimiter.
            if !first_output {
                unsafe { buf_push(buf, delim) };
            }
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
                );
            }
            first_output = false;
        }
    }

    unsafe { buf_push(buf, line_delim) };
}
614
615// ── Fast path: field extraction with batched output ──────────────────────
616
/// Optimized field extraction with early exit and batched output.
///
/// Dispatcher: tries a series of specialized fast paths in order, falling
/// back to the general (mask-based, chunked, optionally parallel) extractor.
/// Fast paths, in priority order:
///   1. single field (`-f5`)
///   2. complement of a single field
///   3. complement of a bounded contiguous range (prefix + suffix copy)
///   4. contiguous prefix range (`-f1-5`)
///   5. open-ended suffix range (`-f3-`)
///   6. bounded mid range (`-f2-4`)
///   7. multi-field non-contiguous selection (`-f1,3,5`)
/// All range-based fast paths require the output delimiter to be exactly the
/// single input delimiter byte, so the output can be copied verbatim.
fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
    let delim = cfg.delim;
    let line_delim = cfg.line_delim;
    let ranges = cfg.ranges;
    let complement = cfg.complement;
    let output_delim = cfg.output_delim;
    let suppress = cfg.suppress_no_delim;

    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
    // That scan was O(N) over the entire file just to check an edge case
    // (no delimiter in any line). The per-line processing already handles
    // lines without delimiters correctly, so the scan was pure overhead
    // for files that DO contain delimiters (the common case).

    // Ultra-fast path: single field extraction (e.g., cut -f5)
    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
    }

    // Fast path: complement of single field or contiguous range with default output delimiter.
    if complement
        && ranges.len() == 1
        && output_delim.len() == 1
        && output_delim[0] == delim
        && ranges[0].start == ranges[0].end
    {
        return process_complement_single_field(
            data,
            delim,
            line_delim,
            ranges[0].start,
            suppress,
            out,
        );
    }

    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
    if complement
        && ranges.len() == 1
        && ranges[0].start > 1
        && ranges[0].end < usize::MAX
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_complement_range(
            data,
            delim,
            line_delim,
            ranges[0].start,
            ranges[0].end,
            suppress,
            out,
        );
    }

    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
    if !complement
        && ranges.len() == 1
        && ranges[0].start == 1
        && output_delim.len() == 1
        && output_delim[0] == delim
        && ranges[0].end < usize::MAX
    {
        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
    }

    // Fast path: open-ended field range from field N (e.g., cut -f3-)
    if !complement
        && ranges.len() == 1
        && ranges[0].end == usize::MAX
        && ranges[0].start > 1
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
    }

    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
    if !complement
        && ranges.len() == 1
        && ranges[0].start > 1
        && ranges[0].end < usize::MAX
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_fields_mid_range(
            data,
            delim,
            line_delim,
            ranges[0].start,
            ranges[0].end,
            suppress,
            out,
        );
    }

    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
    // Uses delimiter position caching: find all delimiter positions per line,
    // then directly index into them for each selected field.
    // This is faster than the general extract_fields_to_buf which re-checks
    // is_selected() for every field encountered.
    if !complement
        && ranges.len() > 1
        && ranges.last().map_or(false, |r| r.end < usize::MAX)
        && output_delim.len() == 1
        && output_delim[0] == delim
        && delim != line_delim
    {
        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
    }

    // General field extraction.
    // With complement there is no upper bound on selected fields, so every
    // field must be scanned; otherwise scanning can stop at the last range end.
    let max_field = if complement {
        usize::MAX
    } else {
        ranges.last().map(|r| r.end).unwrap_or(0)
    };
    let field_mask = compute_field_mask(ranges, complement);

    if data.len() >= PARALLEL_THRESHOLD {
        // Parallel: newline-aligned chunks, one output buffer each, gathered
        // at the end with writev so ordering is preserved.
        let chunks = split_for_scope(data, line_delim);
        let n = chunks.len();
        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
        rayon::scope(|s| {
            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                s.spawn(move |_| {
                    result.reserve(chunk.len() + 1);
                    process_fields_chunk(
                        chunk,
                        delim,
                        ranges,
                        output_delim,
                        suppress,
                        max_field,
                        field_mask,
                        line_delim,
                        complement,
                        result,
                    );
                });
            }
        });
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        // Sequential: bounded chunks keep the output buffer cache-resident.
        process_chunked(data, line_delim, out, |chunk, buf| {
            process_fields_chunk(
                chunk,
                delim,
                ranges,
                output_delim,
                suppress,
                max_field,
                field_mask,
                line_delim,
                complement,
                buf,
            );
        })?;
    }
    Ok(())
}
785
786/// Process a chunk of data for general field extraction.
787/// Uses two-level scanning: outer memchr(newline) for line boundaries, inner
788/// memchr_iter(delim) for delimiter positions. This is faster than memchr2 single-pass
789/// because memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles).
790fn process_fields_chunk(
791    data: &[u8],
792    delim: u8,
793    ranges: &[Range],
794    output_delim: &[u8],
795    suppress: bool,
796    max_field: usize,
797    field_mask: u64,
798    line_delim: u8,
799    complement: bool,
800    buf: &mut Vec<u8>,
801) {
802    // Always use two-level approach: outer memchr(newline) + inner memchr_iter(delim).
803    // Even for complement/unbounded ranges, two-level is faster because memchr is
804    // ~30-50% faster per byte than memchr2. The per-line function call overhead
805    // is negligible compared to the SIMD scan savings.
806    if delim != line_delim {
807        buf.reserve(data.len());
808        let mut start = 0;
809        for end_pos in memchr_iter(line_delim, data) {
810            let line = &data[start..end_pos];
811            extract_fields_to_buf(
812                line,
813                delim,
814                ranges,
815                output_delim,
816                suppress,
817                max_field,
818                field_mask,
819                line_delim,
820                buf,
821                complement,
822            );
823            start = end_pos + 1;
824        }
825        if start < data.len() {
826            extract_fields_to_buf(
827                &data[start..],
828                delim,
829                ranges,
830                output_delim,
831                suppress,
832                max_field,
833                field_mask,
834                line_delim,
835                buf,
836                complement,
837            );
838        }
839        return;
840    }
841
842    // Fallback: when delim == line_delim, use the two-level scan approach
843    let mut start = 0;
844    for end_pos in memchr_iter(line_delim, data) {
845        let line = &data[start..end_pos];
846        extract_fields_to_buf(
847            line,
848            delim,
849            ranges,
850            output_delim,
851            suppress,
852            max_field,
853            field_mask,
854            line_delim,
855            buf,
856            complement,
857        );
858        start = end_pos + 1;
859    }
860    if start < data.len() {
861        extract_fields_to_buf(
862            &data[start..],
863            delim,
864            ranges,
865            output_delim,
866            suppress,
867            max_field,
868            field_mask,
869            line_delim,
870            buf,
871            complement,
872        );
873    }
874}
875
876// ── Ultra-fast single field extraction ───────────────────────────────────
877
/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
/// Uses two-level scanning: outer memchr(newline) for line boundaries, inner
/// memchr(delim) for the field delimiter with early exit.
///
/// `target` is the 1-based field number (callers must pass >= 1; `target - 1`
/// underflows otherwise). `suppress` mirrors `-s`: drop lines with no delimiter.
fn process_single_field(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    let target_idx = target - 1;

    // Single-field extraction parallelizes at 16MB, below the general 32MB
    // PARALLEL_THRESHOLD. NOTE(review): the previous comment said this value
    // "matches PARALLEL_THRESHOLD", which it does not — presumably the cheap
    // per-line work here amortizes thread overhead sooner; confirm by benchmark.
    const FIELD_PARALLEL_MIN: usize = 16 * 1024 * 1024;

    if delim != line_delim {
        // Field 1 fast path: two-level scan (outer newline + inner first-delim).
        // For field 1, only needs to find the first delimiter per line.
        // Lines without delimiter are tracked as contiguous runs for bulk copy.
        // (Only valid when !suppress: single_field1_to_buf always echoes
        // delimiter-less lines.)
        if target_idx == 0 && !suppress {
            if data.len() >= FIELD_PARALLEL_MIN {
                return single_field1_parallel(data, delim, line_delim, out);
            }
            return process_chunked(data, line_delim, out, |chunk, buf| {
                single_field1_to_buf(chunk, delim, line_delim, buf);
            });
        }

        // Two-level approach for field N: outer newline scan + inner delim scan
        // with early exit at target_idx. Faster than memchr2 single-pass because
        // we only scan delimiters up to target_idx per line (not all of them).
        if data.len() >= FIELD_PARALLEL_MIN {
            // One output buffer per newline-aligned chunk; order is preserved
            // because results[i] corresponds to chunks[i].
            let chunks = split_for_scope(data, line_delim);
            let n = chunks.len();
            let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
            rayon::scope(|s| {
                for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                    s.spawn(move |_| {
                        // Heuristic: a single extracted field is usually well
                        // under half the line.
                        result.reserve(chunk.len() / 2);
                        process_single_field_chunk(
                            chunk, delim, target_idx, line_delim, suppress, result,
                        );
                    });
                }
            });
            let slices: Vec<IoSlice> = results
                .iter()
                .filter(|r| !r.is_empty())
                .map(|r| IoSlice::new(r))
                .collect();
            write_ioslices(out, &slices)?;
        } else {
            // Sequential path: single buffer, capped at 4MB initial capacity.
            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
            if !buf.is_empty() {
                out.write_all(&buf)?;
            }
        }
        return Ok(());
    }

    // Fallback for delim == line_delim: nested loop approach
    if data.len() >= FIELD_PARALLEL_MIN {
        let chunks = split_for_scope(data, line_delim);
        let n = chunks.len();
        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
        rayon::scope(|s| {
            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
                s.spawn(move |_| {
                    result.reserve(chunk.len() / 4);
                    process_single_field_chunk(
                        chunk, delim, target_idx, line_delim, suppress, result,
                    );
                });
            }
        });
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len() / 4);
        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
970
971/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
972/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
973fn process_complement_range(
974    data: &[u8],
975    delim: u8,
976    line_delim: u8,
977    skip_start: usize,
978    skip_end: usize,
979    suppress: bool,
980    out: &mut impl Write,
981) -> io::Result<()> {
982    if data.len() >= PARALLEL_THRESHOLD {
983        let chunks = split_for_scope(data, line_delim);
984        let n = chunks.len();
985        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
986        rayon::scope(|s| {
987            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
988                s.spawn(move |_| {
989                    result.reserve(chunk.len());
990                    complement_range_chunk(
991                        chunk, delim, skip_start, skip_end, line_delim, suppress, result,
992                    );
993                });
994            }
995        });
996        let slices: Vec<IoSlice> = results
997            .iter()
998            .filter(|r| !r.is_empty())
999            .map(|r| IoSlice::new(r))
1000            .collect();
1001        write_ioslices(out, &slices)?;
1002    } else {
1003        process_chunked(data, line_delim, out, |chunk, buf| {
1004            complement_range_chunk(
1005                chunk, delim, skip_start, skip_end, line_delim, suppress, buf,
1006            );
1007        })?;
1008    }
1009    Ok(())
1010}
1011
1012/// Process a chunk for complement range extraction.
1013fn complement_range_chunk(
1014    data: &[u8],
1015    delim: u8,
1016    skip_start: usize,
1017    skip_end: usize,
1018    line_delim: u8,
1019    suppress: bool,
1020    buf: &mut Vec<u8>,
1021) {
1022    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
1023    buf.reserve(data.len());
1024    let mut start = 0;
1025    for end_pos in memchr_iter(line_delim, data) {
1026        let line = &data[start..end_pos];
1027        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1028        start = end_pos + 1;
1029    }
1030    if start < data.len() {
1031        complement_range_line(
1032            &data[start..],
1033            delim,
1034            skip_start,
1035            skip_end,
1036            line_delim,
1037            suppress,
1038            buf,
1039        );
1040    }
1041}
1042
1043/// Extract all fields except skip_start..=skip_end from one line.
1044/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1045///
1046/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1047/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1048/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1049#[inline(always)]
1050fn complement_range_line(
1051    line: &[u8],
1052    delim: u8,
1053    skip_start: usize,
1054    skip_end: usize,
1055    line_delim: u8,
1056    suppress: bool,
1057    buf: &mut Vec<u8>,
1058) {
1059    let len = line.len();
1060    if len == 0 {
1061        if !suppress {
1062            unsafe { buf_push(buf, line_delim) };
1063        }
1064        return;
1065    }
1066
1067    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
1068    let base = line.as_ptr();
1069
1070    // 1-based field numbers. To skip fields skip_start..=skip_end:
1071    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1072    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1073    //
1074    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1075    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1076
1077    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1078    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1079    let total_need = need_prefix_delims + need_skip_delims;
1080
1081    // Find delimiter positions up to total_need
1082    let mut delim_count: usize = 0;
1083    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1084    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1085
1086    for pos in memchr_iter(delim, line) {
1087        delim_count += 1;
1088        if delim_count == need_prefix_delims {
1089            prefix_end_pos = pos;
1090        }
1091        if delim_count == total_need {
1092            suffix_start_pos = pos + 1;
1093            break;
1094        }
1095    }
1096
1097    if delim_count == 0 {
1098        // No delimiter at all
1099        if !suppress {
1100            unsafe {
1101                buf_extend(buf, line);
1102                buf_push(buf, line_delim);
1103            }
1104        }
1105        return;
1106    }
1107
1108    // Case analysis:
1109    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1110    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1111    // 3. Enough to reach skip_end: prefix + delim + suffix
1112
1113    if delim_count < need_prefix_delims {
1114        // Not enough fields to reach skip region — output entire line
1115        unsafe {
1116            buf_extend(buf, line);
1117            buf_push(buf, line_delim);
1118        }
1119        return;
1120    }
1121
1122    let has_prefix = need_prefix_delims > 0;
1123    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1124
1125    if has_prefix && has_suffix {
1126        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1127        unsafe {
1128            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1129            buf_push(buf, delim);
1130            buf_extend(
1131                buf,
1132                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1133            );
1134            buf_push(buf, line_delim);
1135        }
1136    } else if has_prefix {
1137        // Only prefix, no suffix (skip region extends to end of line)
1138        unsafe {
1139            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1140            buf_push(buf, line_delim);
1141        }
1142    } else if has_suffix {
1143        // No prefix (skip_start == 1), only suffix
1144        unsafe {
1145            buf_extend(
1146                buf,
1147                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1148            );
1149            buf_push(buf, line_delim);
1150        }
1151    } else {
1152        // All fields skipped
1153        unsafe { buf_push(buf, line_delim) };
1154    }
1155}
1156
1157/// Complement single-field extraction: skip one field, output rest unchanged.
1158fn process_complement_single_field(
1159    data: &[u8],
1160    delim: u8,
1161    line_delim: u8,
1162    skip_field: usize,
1163    suppress: bool,
1164    out: &mut impl Write,
1165) -> io::Result<()> {
1166    let skip_idx = skip_field - 1;
1167
1168    if data.len() >= PARALLEL_THRESHOLD {
1169        let chunks = split_for_scope(data, line_delim);
1170        let n = chunks.len();
1171        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1172        rayon::scope(|s| {
1173            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1174                s.spawn(move |_| {
1175                    result.reserve(chunk.len());
1176                    complement_single_field_chunk(
1177                        chunk, delim, skip_idx, line_delim, suppress, result,
1178                    );
1179                });
1180            }
1181        });
1182        let slices: Vec<IoSlice> = results
1183            .iter()
1184            .filter(|r| !r.is_empty())
1185            .map(|r| IoSlice::new(r))
1186            .collect();
1187        write_ioslices(out, &slices)?;
1188    } else {
1189        process_chunked(data, line_delim, out, |chunk, buf| {
1190            complement_single_field_chunk(chunk, delim, skip_idx, line_delim, suppress, buf);
1191        })?;
1192    }
1193    Ok(())
1194}
1195
1196/// Process a chunk for complement single-field extraction using two-level scanning.
1197/// Outer memchr(newline) for line boundaries, inner memchr_iter(delim) with early exit
1198/// after finding the skip field's bounding delimiters. Faster than memchr2 single-pass
1199/// because memchr is faster per byte and inner scan exits early.
1200fn complement_single_field_chunk(
1201    data: &[u8],
1202    delim: u8,
1203    skip_idx: usize,
1204    line_delim: u8,
1205    suppress: bool,
1206    buf: &mut Vec<u8>,
1207) {
1208    buf.reserve(data.len());
1209    let mut start = 0;
1210    for end_pos in memchr_iter(line_delim, data) {
1211        let line = &data[start..end_pos];
1212        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1213        start = end_pos + 1;
1214    }
1215    if start < data.len() {
1216        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
1217    }
1218}
1219
/// Per-line complement single-field extraction: emit every field except the
/// one at `skip_idx` (0-based), rejoined with `delim`, plus a terminator.
/// NOTE(review): the previous doc called this a "fallback for
/// delim == line_delim", but `complement_single_field_chunk` calls it on the
/// general path too — verify before relying on that claim.
#[inline(always)]
fn complement_single_field_line(
    line: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: echoed (as a bare terminator) unless -s is set.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    let base = line.as_ptr();
    // Delimiter counts bounding the skipped field: the need_before-th delim
    // opens it, the need_total-th delim closes it.
    let need_before = skip_idx;
    let need_total = skip_idx + 1;

    let mut delim_count: usize = 0;
    let mut skip_start_pos: usize = 0; // byte index where the skipped field starts
    let mut skip_end_pos: usize = len; // byte index of the delim closing the skipped field
    let mut found_end = false;

    // Early-exit scan: stop at the delimiter that closes the skipped field;
    // later delimiters on the line are never visited.
    for pos in memchr_iter(delim, line) {
        delim_count += 1;
        if delim_count == need_before {
            skip_start_pos = pos + 1;
        }
        if delim_count == need_total {
            skip_end_pos = pos;
            found_end = true;
            break;
        }
    }

    if delim_count == 0 {
        // No delimiter: whole line passes through unless -s is set.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    if delim_count < need_before {
        // Fewer fields than skip_idx: nothing gets skipped, echo the line.
        unsafe {
            buf_extend(buf, line);
            buf_push(buf, line_delim);
        }
        return;
    }

    // When the closing delimiter is the line's last byte, skip_end_pos ==
    // len - 1 < len, so has_suffix stays true and the (empty) trailing field
    // is still joined with a delimiter below — matching a trailing empty field.
    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
    let has_suffix = found_end && skip_end_pos < len;

    if has_prefix && has_suffix {
        // Fields before + after the skipped one, rejoined with one delim.
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
            buf_push(buf, delim);
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
            );
            buf_push(buf, line_delim);
        }
    } else if has_prefix {
        // Skipped field is the last field: prefix only.
        unsafe {
            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
            buf_push(buf, line_delim);
        }
    } else if has_suffix {
        // Skipped field is the first field (skip_idx == 0): suffix only.
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // The line consists solely of the skipped field — emit an empty line.
        unsafe { buf_push(buf, line_delim) };
    }
}
1307
1308/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1309/// Zero-copy for the non-parallel path: identifies the truncation point per line
1310/// and writes contiguous runs directly from the source data.
1311fn process_fields_prefix(
1312    data: &[u8],
1313    delim: u8,
1314    line_delim: u8,
1315    last_field: usize,
1316    suppress: bool,
1317    out: &mut impl Write,
1318) -> io::Result<()> {
1319    if data.len() >= PARALLEL_THRESHOLD {
1320        let chunks = split_for_scope(data, line_delim);
1321        let n = chunks.len();
1322        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1323        rayon::scope(|s| {
1324            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1325                s.spawn(move |_| {
1326                    result.reserve(chunk.len());
1327                    fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
1328                });
1329            }
1330        });
1331        let slices: Vec<IoSlice> = results
1332            .iter()
1333            .filter(|r| !r.is_empty())
1334            .map(|r| IoSlice::new(r))
1335            .collect();
1336        write_ioslices(out, &slices)?;
1337    } else if !suppress {
1338        // Zero-copy fast path: scan for truncation points, write runs from source.
1339        // When suppress is false, every line is output (with or without delimiter).
1340        // Most lines have enough fields, so the output is often identical to input.
1341        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1342    } else {
1343        process_chunked(data, line_delim, out, |chunk, buf| {
1344            fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, buf);
1345        })?;
1346    }
1347    Ok(())
1348}
1349
/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
/// directly into the source data, flushing in MAX_IOV-sized batches.
/// For lines where the Nth delimiter exists, we truncate at that point.
/// For lines with fewer fields, we output them unchanged (contiguous run).
/// Lines without any delimiter are output unchanged (suppress=false assumed;
/// the caller gates this path on !suppress).
#[inline]
fn fields_prefix_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    // One stack byte whose address backs every terminator IoSlice.
    let newline_buf: [u8; 1] = [line_delim];
    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
    let mut start = 0;
    // run_start marks the beginning of a contiguous span of input bytes that
    // can be emitted verbatim (lines needing no truncation).
    let mut run_start: usize = 0;

    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        // Locate the last_field-th delimiter, if present: that is the
        // truncation point. field_count is the 1-based index of the open field.
        let mut field_count = 1;
        let mut truncate_at: Option<usize> = None;
        for dpos in memchr_iter(delim, line) {
            if field_count >= last_field {
                truncate_at = Some(start + dpos);
                break;
            }
            field_count += 1;
        }

        if let Some(trunc_pos) = truncate_at {
            // Flush the verbatim run preceding this line, then the truncated
            // line and its terminator.
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..trunc_pos]));
            iov.push(IoSlice::new(&newline_buf));
            run_start = end_pos + 1;

            // Keep headroom: each iteration may push up to 3 entries.
            if iov.len() >= MAX_IOV - 2 {
                write_ioslices(out, &iov)?;
                iov.clear();
            }
        }
        start = end_pos + 1;
    }
    // Handle last line without terminator
    if start < data.len() {
        let line = &data[start..];
        let mut field_count = 1;
        let mut truncate_at: Option<usize> = None;
        for dpos in memchr_iter(delim, line) {
            if field_count >= last_field {
                truncate_at = Some(start + dpos);
                break;
            }
            field_count += 1;
        }
        if let Some(trunc_pos) = truncate_at {
            if run_start < start {
                iov.push(IoSlice::new(&data[run_start..start]));
            }
            iov.push(IoSlice::new(&data[start..trunc_pos]));
            // A truncated final line always gains a terminator.
            iov.push(IoSlice::new(&newline_buf));
            if !iov.is_empty() {
                write_ioslices(out, &iov)?;
            }
            return Ok(());
        }
    }
    // Flush remaining contiguous run
    if run_start < data.len() {
        iov.push(IoSlice::new(&data[run_start..]));
        // Append a terminator only if the input didn't already end with one.
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            iov.push(IoSlice::new(&newline_buf));
        }
    }
    if !iov.is_empty() {
        write_ioslices(out, &iov)?;
    }
    Ok(())
}
1431
1432/// Process a chunk for contiguous from-start field range extraction.
1433fn fields_prefix_chunk(
1434    data: &[u8],
1435    delim: u8,
1436    line_delim: u8,
1437    last_field: usize,
1438    suppress: bool,
1439    buf: &mut Vec<u8>,
1440) {
1441    buf.reserve(data.len());
1442    let mut start = 0;
1443    for end_pos in memchr_iter(line_delim, data) {
1444        let line = &data[start..end_pos];
1445        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1446        start = end_pos + 1;
1447    }
1448    if start < data.len() {
1449        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1450    }
1451}
1452
1453/// Extract first N fields from one line (contiguous from-start range).
1454/// Uses memchr SIMD for delimiter scanning on all line sizes.
1455#[inline(always)]
1456fn fields_prefix_line(
1457    line: &[u8],
1458    delim: u8,
1459    line_delim: u8,
1460    last_field: usize,
1461    suppress: bool,
1462    buf: &mut Vec<u8>,
1463) {
1464    let len = line.len();
1465    if len == 0 {
1466        if !suppress {
1467            unsafe { buf_push(buf, line_delim) };
1468        }
1469        return;
1470    }
1471
1472    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1473    let base = line.as_ptr();
1474
1475    let mut field_count = 1usize;
1476    let mut has_delim = false;
1477
1478    for pos in memchr_iter(delim, line) {
1479        has_delim = true;
1480        if field_count >= last_field {
1481            unsafe {
1482                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1483                buf_push(buf, line_delim);
1484            }
1485            return;
1486        }
1487        field_count += 1;
1488    }
1489
1490    if !has_delim {
1491        if !suppress {
1492            unsafe {
1493                buf_extend(buf, line);
1494                buf_push(buf, line_delim);
1495            }
1496        }
1497        return;
1498    }
1499
1500    unsafe {
1501        buf_extend(buf, line);
1502        buf_push(buf, line_delim);
1503    }
1504}
1505
1506/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1507fn process_fields_suffix(
1508    data: &[u8],
1509    delim: u8,
1510    line_delim: u8,
1511    start_field: usize,
1512    suppress: bool,
1513    out: &mut impl Write,
1514) -> io::Result<()> {
1515    if data.len() >= PARALLEL_THRESHOLD {
1516        let chunks = split_for_scope(data, line_delim);
1517        let n = chunks.len();
1518        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1519        rayon::scope(|s| {
1520            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1521                s.spawn(move |_| {
1522                    result.reserve(chunk.len());
1523                    fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
1524                });
1525            }
1526        });
1527        let slices: Vec<IoSlice> = results
1528            .iter()
1529            .filter(|r| !r.is_empty())
1530            .map(|r| IoSlice::new(r))
1531            .collect();
1532        write_ioslices(out, &slices)?;
1533    } else {
1534        process_chunked(data, line_delim, out, |chunk, buf| {
1535            fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, buf);
1536        })?;
1537    }
1538    Ok(())
1539}
1540
1541/// Process a chunk for open-ended field suffix extraction.
1542fn fields_suffix_chunk(
1543    data: &[u8],
1544    delim: u8,
1545    line_delim: u8,
1546    start_field: usize,
1547    suppress: bool,
1548    buf: &mut Vec<u8>,
1549) {
1550    buf.reserve(data.len());
1551    let mut start = 0;
1552    for end_pos in memchr_iter(line_delim, data) {
1553        let line = &data[start..end_pos];
1554        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1555        start = end_pos + 1;
1556    }
1557    if start < data.len() {
1558        fields_suffix_line(
1559            &data[start..],
1560            delim,
1561            line_delim,
1562            start_field,
1563            suppress,
1564            buf,
1565        );
1566    }
1567}
1568
1569/// Extract fields from start_field to end from one line.
1570/// Uses memchr SIMD for delimiter scanning on all line sizes.
1571#[inline(always)]
1572fn fields_suffix_line(
1573    line: &[u8],
1574    delim: u8,
1575    line_delim: u8,
1576    start_field: usize,
1577    suppress: bool,
1578    buf: &mut Vec<u8>,
1579) {
1580    let len = line.len();
1581    if len == 0 {
1582        if !suppress {
1583            unsafe { buf_push(buf, line_delim) };
1584        }
1585        return;
1586    }
1587
1588    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
1589    let base = line.as_ptr();
1590
1591    let skip_delims = start_field - 1;
1592    let mut delim_count = 0usize;
1593    let mut has_delim = false;
1594
1595    for pos in memchr_iter(delim, line) {
1596        has_delim = true;
1597        delim_count += 1;
1598        if delim_count >= skip_delims {
1599            unsafe {
1600                buf_extend(
1601                    buf,
1602                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1603                );
1604                buf_push(buf, line_delim);
1605            }
1606            return;
1607        }
1608    }
1609
1610    if !has_delim {
1611        if !suppress {
1612            unsafe {
1613                buf_extend(buf, line);
1614                buf_push(buf, line_delim);
1615            }
1616        }
1617        return;
1618    }
1619
1620    // Fewer delimiters than needed
1621    unsafe { buf_push(buf, line_delim) };
1622}
1623
1624/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1625/// Optimized: skip to start_field using memchr, then output until end_field.
1626fn process_fields_mid_range(
1627    data: &[u8],
1628    delim: u8,
1629    line_delim: u8,
1630    start_field: usize,
1631    end_field: usize,
1632    suppress: bool,
1633    out: &mut impl Write,
1634) -> io::Result<()> {
1635    if data.len() >= PARALLEL_THRESHOLD {
1636        let chunks = split_for_scope(data, line_delim);
1637        let n = chunks.len();
1638        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1639        rayon::scope(|s| {
1640            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1641                s.spawn(move |_| {
1642                    result.reserve(chunk.len());
1643                    fields_mid_range_chunk(
1644                        chunk,
1645                        delim,
1646                        line_delim,
1647                        start_field,
1648                        end_field,
1649                        suppress,
1650                        result,
1651                    );
1652                });
1653            }
1654        });
1655        let slices: Vec<IoSlice> = results
1656            .iter()
1657            .filter(|r| !r.is_empty())
1658            .map(|r| IoSlice::new(r))
1659            .collect();
1660        write_ioslices(out, &slices)?;
1661    } else {
1662        process_chunked(data, line_delim, out, |chunk, buf| {
1663            fields_mid_range_chunk(
1664                chunk,
1665                delim,
1666                line_delim,
1667                start_field,
1668                end_field,
1669                suppress,
1670                buf,
1671            );
1672        })?;
1673    }
1674    Ok(())
1675}
1676
1677/// Process a chunk for contiguous mid-range field extraction.
1678/// Two-level scan: outer memchr(newline) for line boundaries, inner memchr_iter(delim)
1679/// with early exit at target_end_delim. Faster than memchr2 single-pass because
1680/// memchr is faster per byte and inner scan exits early.
1681fn fields_mid_range_chunk(
1682    data: &[u8],
1683    delim: u8,
1684    line_delim: u8,
1685    start_field: usize,
1686    end_field: usize,
1687    suppress: bool,
1688    buf: &mut Vec<u8>,
1689) {
1690    buf.reserve(data.len());
1691    let mut start = 0;
1692    for end_pos in memchr_iter(line_delim, data) {
1693        let line = &data[start..end_pos];
1694        fields_mid_range_line(
1695            line,
1696            delim,
1697            line_delim,
1698            start_field,
1699            end_field,
1700            suppress,
1701            buf,
1702        );
1703        start = end_pos + 1;
1704    }
1705    if start < data.len() {
1706        fields_mid_range_line(
1707            &data[start..],
1708            delim,
1709            line_delim,
1710            start_field,
1711            end_field,
1712            suppress,
1713            buf,
1714        );
1715    }
1716}
1717
/// Extract fields start_field..=end_field from one line, appending them plus
/// a line terminator to `buf`.
/// Scans delimiters with memchr_iter and exits early at the delimiter that
/// closes end_field; raw pointer slices avoid per-byte bounds checks.
/// NOTE(review): an earlier doc claimed a scalar path for short lines — the
/// code below uses memchr_iter for every line length.
#[inline(always)]
fn fields_mid_range_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let len = line.len();
    if len == 0 {
        // Empty line: echoed as a bare terminator unless -s is set.
        if !suppress {
            unsafe { buf_push(buf, line_delim) };
        }
        return;
    }

    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves data.len()
    let base = line.as_ptr();

    // Count delimiters to find start_field and end_field boundaries
    let skip_before = start_field - 1; // delimiters to skip before start_field
    let field_span = end_field - start_field; // additional delimiters within the range
    let target_end_delim = skip_before + field_span + 1;
    let mut delim_count = 0;
    let mut range_start = 0;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        delim_count += 1;
        if delim_count == skip_before {
            range_start = pos + 1;
        }
        if delim_count == target_end_delim {
            // skip_before == 0 means the range opens at field 1 / byte 0.
            if skip_before == 0 {
                range_start = 0;
            }
            unsafe {
                buf_extend(
                    buf,
                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
                );
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if !has_delim {
        // No delimiter: whole line passes through unless -s is set.
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Line has delimiters but fewer fields than end_field
    if delim_count >= skip_before {
        // We have at least start_field, output from range_start to end
        if skip_before == 0 {
            range_start = 0;
        }
        unsafe {
            buf_extend(
                buf,
                std::slice::from_raw_parts(base.add(range_start), len - range_start),
            );
            buf_push(buf, line_delim);
        }
    } else {
        // Not enough fields even for start_field — output empty line
        unsafe { buf_push(buf, line_delim) };
    }
}
1799
/// Parallel field-1 extraction for large inputs.
///
/// Splits `data` on line boundaries into per-thread chunks; each chunk is
/// processed by `single_field1_to_buf` into its own output buffer, and the
/// buffers are written out in order as one writev batch.
///
/// NOTE(review): the doc block previously here interleaved two stale
/// comments — one describing a removed zero-copy writev path and one
/// describing an older memchr2 single-pass design. `single_field1_to_buf`
/// documents the actual two-level memchr scan used today.
fn single_field1_parallel(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    let chunks = split_for_scope(data, line_delim);
    let n = chunks.len();
    let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
    rayon::scope(|s| {
        for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
            s.spawn(move |_| {
                // +1: output can exceed input only by the terminator appended
                // to an unterminated final line.
                result.reserve(chunk.len() + 1);
                single_field1_to_buf(chunk, delim, line_delim, result);
            });
        }
    });
    let slices: Vec<IoSlice> = results
        .iter()
        .filter(|r| !r.is_empty())
        .map(|r| IoSlice::new(r))
        .collect();
    write_ioslices(out, &slices)
}
1834
/// Extract field 1 from a chunk using two-level scanning: outer memchr(newline)
/// for line boundaries, inner memchr(delim) for the first delimiter per line.
///
/// This is faster than memchr2_iter single-pass because:
/// 1. memchr (one needle) is ~30-50% faster per byte than memchr2 (two needles)
/// 2. For field 1, the inner memchr exits after the FIRST delimiter, skipping
///    all subsequent delimiters on the line (huge win for multi-column CSV)
/// 3. Lines without delimiter produce contiguous runs that are bulk-copied
///
/// Uses a single output pointer to avoid per-line buf.len() load/store.
#[inline]
fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
    debug_assert_ne!(delim, line_delim, "delim and line_delim must differ");
    // Reserve data.len() + 1: output <= input for all lines except potentially
    // the last line without trailing newline, where we add a newline (GNU compat).
    buf.reserve(data.len() + 1);

    let base = data.as_ptr();
    let initial_len = buf.len();
    // SAFETY: the reserve above guarantees capacity for initial_len +
    // data.len() + 1 bytes; out_ptr only ever advances by bytes actually
    // written, and total output never exceeds data.len() + 1.
    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut start = 0;
    // Track the start of contiguous runs of no-delimiter lines for bulk copy.
    let mut run_start: usize = 0;
    // NOTE(review): in_run is never set to false in this function (both match
    // arms leave it true), so the `if !in_run` branch below is dead; kept for
    // symmetry with process_single_field_chunk.
    let mut in_run = true; // we start in a run

    for end_pos in memchr_iter(line_delim, data) {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        match memchr::memchr(delim, line) {
            Some(dp) => {
                // Line has delimiter — flush contiguous run, output field1 + newline
                if in_run && run_start < start {
                    // Bulk copy the contiguous run of unchanged lines
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                // Output field (bytes before first delimiter) + newline
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
                    out_ptr = out_ptr.add(dp);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            }
            None => {
                // No delimiter — this line stays in the contiguous run
                if !in_run {
                    run_start = start;
                    in_run = true;
                }
            }
        }
        start = end_pos + 1;
    }

    // Flush any remaining contiguous run
    if in_run && run_start < start {
        let run_len = start - run_start;
        unsafe {
            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
            out_ptr = out_ptr.add(run_len);
        }
    }

    // Handle last line without trailing newline
    if start < data.len() {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        match memchr::memchr(delim, line) {
            Some(dp) => {
                // Field + trailing newline
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, dp);
                    out_ptr = out_ptr.add(dp);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
            None => {
                // No delimiter — output remaining data + newline (GNU compat)
                let len = data.len() - start;
                unsafe {
                    std::ptr::copy_nonoverlapping(base.add(start), out_ptr, len);
                    out_ptr = out_ptr.add(len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
        }
    }

    // SAFETY: every byte in initial_len..new_len was initialized by the copies
    // above, and new_len cannot exceed the reserved capacity (debug-asserted).
    unsafe {
        let new_len = out_ptr as usize - buf.as_ptr() as usize;
        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
        buf.set_len(new_len);
    }
}
1935
1936/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
1937/// directly into the source data. Uses two-level scan: outer memchr(newline)
1938/// for the first delimiter. This is faster than memchr2 for SMALL data because
1939/// the inner scan exits after the FIRST delimiter, skipping all
1940/// subsequent delimiters on the line.
1941///
1942/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
1943/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
1944#[inline]
1945#[allow(dead_code)]
1946fn single_field1_zerocopy(
1947    data: &[u8],
1948    delim: u8,
1949    line_delim: u8,
1950    out: &mut impl Write,
1951) -> io::Result<()> {
1952    let newline_buf: [u8; 1] = [line_delim];
1953
1954    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1955    let mut run_start: usize = 0;
1956    let mut start = 0;
1957
1958    for end_pos in memchr_iter(line_delim, data) {
1959        let line = &data[start..end_pos];
1960        if let Some(dp) = memchr::memchr(delim, line) {
1961            // Line has delimiter — truncate at first delimiter.
1962            // Flush current contiguous run, then add truncated field + newline.
1963            if run_start < start {
1964                iov.push(IoSlice::new(&data[run_start..start]));
1965            }
1966            iov.push(IoSlice::new(&data[start..start + dp]));
1967            iov.push(IoSlice::new(&newline_buf));
1968            run_start = end_pos + 1;
1969
1970            if iov.len() >= MAX_IOV - 2 {
1971                write_ioslices(out, &iov)?;
1972                iov.clear();
1973            }
1974        }
1975        // else: no delimiter in line, output unchanged (stays in contiguous run)
1976        start = end_pos + 1;
1977    }
1978
1979    // Handle last line (no trailing newline)
1980    if start < data.len() {
1981        let line = &data[start..];
1982        if let Some(dp) = memchr::memchr(delim, line) {
1983            if run_start < start {
1984                iov.push(IoSlice::new(&data[run_start..start]));
1985            }
1986            iov.push(IoSlice::new(&data[start..start + dp]));
1987            iov.push(IoSlice::new(&newline_buf));
1988            if !iov.is_empty() {
1989                write_ioslices(out, &iov)?;
1990            }
1991            return Ok(());
1992        }
1993    }
1994
1995    // Flush remaining contiguous run
1996    if run_start < data.len() {
1997        iov.push(IoSlice::new(&data[run_start..]));
1998        if !data.is_empty() && *data.last().unwrap() != line_delim {
1999            iov.push(IoSlice::new(&newline_buf));
2000        }
2001    }
2002    if !iov.is_empty() {
2003        write_ioslices(out, &iov)?;
2004    }
2005    Ok(())
2006}
2007
/// Process a chunk of data for single-field extraction using write-pointer pattern.
/// Two-level scan: outer memchr(newline), inner memchr_iter(delim) with early exit.
/// Uses contiguous run tracking for lines that pass through unchanged.
///
/// Per-line behavior (mirrors GNU `cut -fN`):
/// - target field present (0-based `target_idx`)  -> field + line delimiter
/// - no delimiter in the line                     -> line passes through
///   unchanged, unless `suppress` (-s) is set, in which case it is dropped
/// - delimiters present but too few fields        -> empty line
/// An unterminated final line is terminated on output.
fn process_single_field_chunk(
    data: &[u8],
    delim: u8,
    target_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
    // Output never exceeds data.len() + 1 (the +1 covers the terminator
    // appended to an unterminated final line).
    buf.reserve(data.len() + 1);

    let base = data.as_ptr();
    let initial_len = buf.len();
    // SAFETY: the reserve above guarantees room for every byte written through
    // out_ptr; per-line output never exceeds per-line input + 1.
    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
    let mut start = 0;
    // Track contiguous runs of lines that output unchanged
    let mut run_start: usize = 0;
    let mut in_run = !suppress; // if suppress, no line passes through without delimiter

    for end_pos in memchr_iter(line_delim, data) {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
        let line_len = end_pos - start;

        if line_len == 0 {
            if !suppress {
                // Empty line passes through in the run
                if !in_run {
                    run_start = start;
                    in_run = true;
                }
            }
            start = end_pos + 1;
            continue;
        }

        // Count delimiters up to target_idx to find the target field
        let mut field_start_offset = 0;
        let mut field_idx = 0;
        let mut found = false;
        let mut has_delim = false;

        for pos in memchr_iter(delim, line) {
            has_delim = true;
            if field_idx == target_idx {
                // Found the target field: line[field_start_offset..pos]
                // Flush run, output field + newline
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                let field_len = pos - field_start_offset;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
                found = true;
                break;
            }
            field_idx += 1;
            field_start_offset = pos + 1;
        }

        if !found {
            if !has_delim {
                // No delimiter in line
                if !suppress {
                    // Line passes through unchanged — stays in run
                    if !in_run {
                        run_start = start;
                        in_run = true;
                    }
                } else {
                    // Suppress: flush run, skip this line
                    if in_run && run_start < start {
                        let run_len = start - run_start;
                        unsafe {
                            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                            out_ptr = out_ptr.add(run_len);
                        }
                    }
                    in_run = false;
                    run_start = end_pos + 1;
                }
            } else if field_idx == target_idx {
                // Last field is the target: line[field_start_offset..]
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                let field_len = line_len - field_start_offset;
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        base.add(start + field_start_offset),
                        out_ptr,
                        field_len,
                    );
                    out_ptr = out_ptr.add(field_len);
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            } else {
                // Not enough fields for target — output empty line
                if in_run && run_start < start {
                    let run_len = start - run_start;
                    unsafe {
                        std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
                        out_ptr = out_ptr.add(run_len);
                    }
                }
                unsafe {
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
                run_start = end_pos + 1;
                in_run = true;
            }
        }

        start = end_pos + 1;
    }

    // Flush remaining contiguous run
    if in_run && run_start < start {
        let run_len = start - run_start;
        unsafe {
            std::ptr::copy_nonoverlapping(base.add(run_start), out_ptr, run_len);
            out_ptr = out_ptr.add(run_len);
        }
    }

    // Handle last line without trailing newline
    if start < data.len() {
        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
        let line_len = data.len() - start;

        // NOTE(review): line_len > 0 always holds here (start < data.len()),
        // so this branch is dead; kept verbatim.
        if line_len == 0 {
            if !suppress {
                unsafe {
                    *out_ptr = line_delim;
                    out_ptr = out_ptr.add(1);
                }
            }
        } else {
            let mut field_start_offset = 0;
            let mut field_idx = 0;
            let mut found = false;
            let mut has_delim = false;

            for pos in memchr_iter(delim, line) {
                has_delim = true;
                if field_idx == target_idx {
                    let field_len = pos - field_start_offset;
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            base.add(start + field_start_offset),
                            out_ptr,
                            field_len,
                        );
                        out_ptr = out_ptr.add(field_len);
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                    found = true;
                    break;
                }
                field_idx += 1;
                field_start_offset = pos + 1;
            }

            if !found {
                if !has_delim {
                    if !suppress {
                        unsafe {
                            std::ptr::copy_nonoverlapping(base.add(start), out_ptr, line_len);
                            out_ptr = out_ptr.add(line_len);
                            *out_ptr = line_delim;
                            out_ptr = out_ptr.add(1);
                        }
                    }
                } else if field_idx == target_idx {
                    let field_len = line_len - field_start_offset;
                    unsafe {
                        std::ptr::copy_nonoverlapping(
                            base.add(start + field_start_offset),
                            out_ptr,
                            field_len,
                        );
                        out_ptr = out_ptr.add(field_len);
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                } else {
                    unsafe {
                        *out_ptr = line_delim;
                        out_ptr = out_ptr.add(1);
                    }
                }
            }
        }
    }

    // SAFETY: all bytes in initial_len..new_len were initialized above and
    // new_len cannot exceed the reserved capacity (debug-asserted).
    unsafe {
        let new_len = out_ptr as usize - buf.as_ptr() as usize;
        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
        buf.set_len(new_len);
    }
}
2234
2235/// Extract fields from a single line into the output buffer.
2236/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2237/// Raw pointer arithmetic eliminates per-field bounds checking.
2238#[inline(always)]
2239fn extract_fields_to_buf(
2240    line: &[u8],
2241    delim: u8,
2242    ranges: &[Range],
2243    output_delim: &[u8],
2244    suppress: bool,
2245    max_field: usize,
2246    field_mask: u64,
2247    line_delim: u8,
2248    buf: &mut Vec<u8>,
2249    complement: bool,
2250) {
2251    let len = line.len();
2252
2253    if len == 0 {
2254        if !suppress {
2255            buf.push(line_delim);
2256        }
2257        return;
2258    }
2259
2260    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2261    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2262    let needed = len + output_delim.len() * 16 + 1;
2263    if buf.capacity() - buf.len() < needed {
2264        buf.reserve(needed);
2265    }
2266
2267    let base = line.as_ptr();
2268    let mut field_num: usize = 1;
2269    let mut field_start: usize = 0;
2270    let mut first_output = true;
2271    let mut has_delim = false;
2272
2273    // Use memchr SIMD for all line sizes
2274    for delim_pos in memchr_iter(delim, line) {
2275        has_delim = true;
2276
2277        if is_selected(field_num, field_mask, ranges, complement) {
2278            if !first_output {
2279                unsafe { buf_extend(buf, output_delim) };
2280            }
2281            unsafe {
2282                buf_extend(
2283                    buf,
2284                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2285                )
2286            };
2287            first_output = false;
2288        }
2289
2290        field_num += 1;
2291        field_start = delim_pos + 1;
2292
2293        if field_num > max_field {
2294            break;
2295        }
2296    }
2297
2298    // Last field
2299    if (field_num <= max_field || complement)
2300        && has_delim
2301        && is_selected(field_num, field_mask, ranges, complement)
2302    {
2303        if !first_output {
2304            unsafe { buf_extend(buf, output_delim) };
2305        }
2306        unsafe {
2307            buf_extend(
2308                buf,
2309                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2310            )
2311        };
2312        first_output = false;
2313    }
2314
2315    if !first_output {
2316        unsafe { buf_push(buf, line_delim) };
2317    } else if !has_delim {
2318        if !suppress {
2319            unsafe {
2320                buf_extend(buf, line);
2321                buf_push(buf, line_delim);
2322            }
2323        }
2324    } else {
2325        unsafe { buf_push(buf, line_delim) };
2326    }
2327}
2328
2329// ── Fast path: byte/char extraction with batched output ──────────────────
2330
2331/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2332/// Zero-copy: writes directly from the source data using output runs.
2333/// For lines shorter than max_bytes, the output is identical to the input,
2334/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2335fn process_bytes_from_start(
2336    data: &[u8],
2337    max_bytes: usize,
2338    line_delim: u8,
2339    out: &mut impl Write,
2340) -> io::Result<()> {
2341    // For small data (< PARALLEL_THRESHOLD): check if all lines fit for zero-copy passthrough.
2342    // The sequential scan + write_all is competitive with per-line processing for small data.
2343    //
2344    // For large data (>= PARALLEL_THRESHOLD): skip the all_fit scan entirely.
2345    // The scan is sequential (~1.7ms for 10MB at memchr speed) while parallel per-line
2346    // processing is much faster (~0.5ms for 10MB with 4 threads). Even when all lines fit,
2347    // the parallel copy + write is faster than sequential scan + zero-copy write.
2348    if data.len() < PARALLEL_THRESHOLD && max_bytes > 0 && max_bytes < usize::MAX {
2349        let mut start = 0;
2350        let mut all_fit = true;
2351        for pos in memchr_iter(line_delim, data) {
2352            if pos - start > max_bytes {
2353                all_fit = false;
2354                break;
2355            }
2356            start = pos + 1;
2357        }
2358        // Check last line (no trailing delimiter)
2359        if all_fit && start < data.len() && data.len() - start > max_bytes {
2360            all_fit = false;
2361        }
2362        if all_fit {
2363            // All lines fit: output = input. Handle missing trailing delimiter.
2364            if !data.is_empty() && data[data.len() - 1] == line_delim {
2365                return out.write_all(data);
2366            } else if !data.is_empty() {
2367                out.write_all(data)?;
2368                return out.write_all(&[line_delim]);
2369            }
2370            return Ok(());
2371        }
2372    }
2373
2374    if data.len() >= PARALLEL_THRESHOLD {
2375        let chunks = split_for_scope(data, line_delim);
2376        let n = chunks.len();
2377        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2378        rayon::scope(|s| {
2379            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2380                s.spawn(move |_| {
2381                    // Output can be up to input size (when all lines fit).
2382                    // Reserve full chunk size to avoid reallocation.
2383                    result.reserve(chunk.len());
2384                    bytes_from_start_chunk(chunk, max_bytes, line_delim, result);
2385                });
2386            }
2387        });
2388        // Use write_vectored (writev) to batch N writes into fewer syscalls
2389        let slices: Vec<IoSlice> = results
2390            .iter()
2391            .filter(|r| !r.is_empty())
2392            .map(|r| IoSlice::new(r))
2393            .collect();
2394        write_ioslices(out, &slices)?;
2395    } else {
2396        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2397        // because every line gets truncated, creating 3 IoSlice entries per line.
2398        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2399        // managing millions of IoSlice entries through the kernel.
2400        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2401        if max_bytes <= 512 {
2402            // Estimate output size without scanning: output <= data.len(),
2403            // typically ~data.len()/4 for short max_bytes on longer lines.
2404            let est_out = (data.len() / 4).max(max_bytes + 2);
2405            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2406            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2407            if !buf.is_empty() {
2408                out.write_all(&buf)?;
2409            }
2410        } else {
2411            // Zero-copy path: track contiguous output runs and write directly from source.
2412            // For lines <= max_bytes, we include them as-is (no copy needed).
2413            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2414            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2415        }
2416    }
2417    Ok(())
2418}
2419
2420/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2421/// directly into the source data, flushing in MAX_IOV-sized batches.
2422/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2423/// produce two IoSlices (truncated data + newline).
2424#[inline]
2425fn bytes_from_start_zerocopy(
2426    data: &[u8],
2427    max_bytes: usize,
2428    line_delim: u8,
2429    out: &mut impl Write,
2430) -> io::Result<()> {
2431    let newline_buf: [u8; 1] = [line_delim];
2432    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2433    let mut start = 0;
2434    let mut run_start: usize = 0;
2435
2436    for pos in memchr_iter(line_delim, data) {
2437        let line_len = pos - start;
2438        if line_len > max_bytes {
2439            // This line needs truncation
2440            if run_start < start {
2441                iov.push(IoSlice::new(&data[run_start..start]));
2442            }
2443            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2444            iov.push(IoSlice::new(&newline_buf));
2445            run_start = pos + 1;
2446
2447            if iov.len() >= MAX_IOV - 2 {
2448                write_ioslices(out, &iov)?;
2449                iov.clear();
2450            }
2451        }
2452        start = pos + 1;
2453    }
2454    // Handle last line without terminator
2455    if start < data.len() {
2456        let line_len = data.len() - start;
2457        if line_len > max_bytes {
2458            if run_start < start {
2459                iov.push(IoSlice::new(&data[run_start..start]));
2460            }
2461            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2462            iov.push(IoSlice::new(&newline_buf));
2463            if !iov.is_empty() {
2464                write_ioslices(out, &iov)?;
2465            }
2466            return Ok(());
2467        }
2468    }
2469    // Flush remaining contiguous run
2470    if run_start < data.len() {
2471        iov.push(IoSlice::new(&data[run_start..]));
2472        if !data.is_empty() && *data.last().unwrap() != line_delim {
2473            iov.push(IoSlice::new(&newline_buf));
2474        }
2475    }
2476    if !iov.is_empty() {
2477        write_ioslices(out, &iov)?;
2478    }
2479    Ok(())
2480}
2481
2482/// Process a chunk for from-start byte range extraction (parallel path).
2483/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2484/// Pre-reserves data.len() (output never exceeds input), then uses a single
2485/// write pointer with deferred set_len — no per-line capacity checks.
2486#[inline]
2487fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2488    // Output is always <= input size (we only truncate, never expand).
2489    // Single reserve eliminates ALL per-line capacity checks.
2490    buf.reserve(data.len());
2491
2492    let src = data.as_ptr();
2493    let dst_base = buf.as_mut_ptr();
2494    let mut wp = buf.len();
2495    let mut start = 0;
2496
2497    for pos in memchr_iter(line_delim, data) {
2498        let line_len = pos - start;
2499        let take = line_len.min(max_bytes);
2500        unsafe {
2501            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2502            *dst_base.add(wp + take) = line_delim;
2503        }
2504        wp += take + 1;
2505        start = pos + 1;
2506    }
2507    // Handle last line without terminator
2508    if start < data.len() {
2509        let line_len = data.len() - start;
2510        let take = line_len.min(max_bytes);
2511        unsafe {
2512            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2513            *dst_base.add(wp + take) = line_delim;
2514        }
2515        wp += take + 1;
2516    }
2517    unsafe { buf.set_len(wp) };
2518}
2519
2520/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2521fn process_bytes_from_offset(
2522    data: &[u8],
2523    skip_bytes: usize,
2524    line_delim: u8,
2525    out: &mut impl Write,
2526) -> io::Result<()> {
2527    if data.len() >= PARALLEL_THRESHOLD {
2528        let chunks = split_for_scope(data, line_delim);
2529        let n = chunks.len();
2530        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2531        rayon::scope(|s| {
2532            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2533                s.spawn(move |_| {
2534                    result.reserve(chunk.len());
2535                    bytes_from_offset_chunk(chunk, skip_bytes, line_delim, result);
2536                });
2537            }
2538        });
2539        // Use write_vectored (writev) to batch N writes into fewer syscalls
2540        let slices: Vec<IoSlice> = results
2541            .iter()
2542            .filter(|r| !r.is_empty())
2543            .map(|r| IoSlice::new(r))
2544            .collect();
2545        write_ioslices(out, &slices)?;
2546    } else {
2547        // Zero-copy: write suffix of each line directly from source
2548        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2549    }
2550    Ok(())
2551}
2552
2553/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2554/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2555/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2556#[inline]
2557fn bytes_from_offset_zerocopy(
2558    data: &[u8],
2559    skip_bytes: usize,
2560    line_delim: u8,
2561    out: &mut impl Write,
2562) -> io::Result<()> {
2563    let delim_buf = [line_delim];
2564    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2565
2566    let mut start = 0;
2567    for pos in memchr_iter(line_delim, data) {
2568        let line_len = pos - start;
2569        if line_len > skip_bytes {
2570            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2571        }
2572        iov.push(IoSlice::new(&delim_buf));
2573        // Flush when approaching MAX_IOV to avoid oversized writev
2574        if iov.len() >= MAX_IOV - 1 {
2575            write_ioslices(out, &iov)?;
2576            iov.clear();
2577        }
2578        start = pos + 1;
2579    }
2580    if start < data.len() {
2581        let line_len = data.len() - start;
2582        if line_len > skip_bytes {
2583            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2584        }
2585        iov.push(IoSlice::new(&delim_buf));
2586    }
2587    if !iov.is_empty() {
2588        write_ioslices(out, &iov)?;
2589    }
2590    Ok(())
2591}
2592
2593/// Process a chunk for from-offset byte range extraction.
2594/// Single reserve + deferred set_len for zero per-line overhead.
2595#[inline]
2596fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2597    buf.reserve(data.len());
2598
2599    let src = data.as_ptr();
2600    let dst_base = buf.as_mut_ptr();
2601    let mut wp = buf.len();
2602    let mut start = 0;
2603
2604    for pos in memchr_iter(line_delim, data) {
2605        let line_len = pos - start;
2606        if line_len > skip_bytes {
2607            let take = line_len - skip_bytes;
2608            unsafe {
2609                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2610            }
2611            wp += take;
2612        }
2613        unsafe {
2614            *dst_base.add(wp) = line_delim;
2615        }
2616        wp += 1;
2617        start = pos + 1;
2618    }
2619    if start < data.len() {
2620        let line_len = data.len() - start;
2621        if line_len > skip_bytes {
2622            let take = line_len - skip_bytes;
2623            unsafe {
2624                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2625            }
2626            wp += take;
2627        }
2628        unsafe {
2629            *dst_base.add(wp) = line_delim;
2630        }
2631        wp += 1;
2632    }
2633    unsafe { buf.set_len(wp) };
2634}
2635
2636/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2637fn process_bytes_mid_range(
2638    data: &[u8],
2639    start_byte: usize,
2640    end_byte: usize,
2641    line_delim: u8,
2642    out: &mut impl Write,
2643) -> io::Result<()> {
2644    let skip = start_byte.saturating_sub(1);
2645
2646    if data.len() >= PARALLEL_THRESHOLD {
2647        let chunks = split_for_scope(data, line_delim);
2648        let n = chunks.len();
2649        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2650        rayon::scope(|s| {
2651            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2652                s.spawn(move |_| {
2653                    result.reserve(chunk.len());
2654                    bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, result);
2655                });
2656            }
2657        });
2658        let slices: Vec<IoSlice> = results
2659            .iter()
2660            .filter(|r| !r.is_empty())
2661            .map(|r| IoSlice::new(r))
2662            .collect();
2663        write_ioslices(out, &slices)?;
2664    } else {
2665        process_chunked(data, line_delim, out, |chunk, buf| {
2666            bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, buf);
2667        })?;
2668    }
2669    Ok(())
2670}
2671
2672/// Process a chunk for mid-range byte extraction.
2673/// For each line, output bytes skip..min(line_len, end_byte).
2674/// Single reserve + deferred set_len.
2675#[inline]
2676fn bytes_mid_range_chunk(
2677    data: &[u8],
2678    skip: usize,
2679    end_byte: usize,
2680    line_delim: u8,
2681    buf: &mut Vec<u8>,
2682) {
2683    buf.reserve(data.len());
2684
2685    let src = data.as_ptr();
2686    let dst_base = buf.as_mut_ptr();
2687    let mut wp = buf.len();
2688    let mut start = 0;
2689
2690    for pos in memchr_iter(line_delim, data) {
2691        let line_len = pos - start;
2692        if line_len > skip {
2693            let take_end = line_len.min(end_byte);
2694            let take = take_end - skip;
2695            unsafe {
2696                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2697            }
2698            wp += take;
2699        }
2700        unsafe {
2701            *dst_base.add(wp) = line_delim;
2702        }
2703        wp += 1;
2704        start = pos + 1;
2705    }
2706    if start < data.len() {
2707        let line_len = data.len() - start;
2708        if line_len > skip {
2709            let take_end = line_len.min(end_byte);
2710            let take = take_end - skip;
2711            unsafe {
2712                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2713            }
2714            wp += take;
2715        }
2716        unsafe {
2717            *dst_base.add(wp) = line_delim;
2718        }
2719        wp += 1;
2720    }
2721    unsafe { buf.set_len(wp) };
2722}
2723
2724/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2725fn process_bytes_complement_mid(
2726    data: &[u8],
2727    skip_start: usize,
2728    skip_end: usize,
2729    line_delim: u8,
2730    out: &mut impl Write,
2731) -> io::Result<()> {
2732    let prefix_bytes = skip_start - 1; // bytes before the skip region
2733    if data.len() >= PARALLEL_THRESHOLD {
2734        let chunks = split_for_scope(data, line_delim);
2735        let n = chunks.len();
2736        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2737        rayon::scope(|s| {
2738            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2739                s.spawn(move |_| {
2740                    result.reserve(chunk.len());
2741                    bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, result);
2742                });
2743            }
2744        });
2745        let slices: Vec<IoSlice> = results
2746            .iter()
2747            .filter(|r| !r.is_empty())
2748            .map(|r| IoSlice::new(r))
2749            .collect();
2750        write_ioslices(out, &slices)?;
2751    } else {
2752        process_chunked(data, line_delim, out, |chunk, buf| {
2753            bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, buf);
2754        })?;
2755    }
2756    Ok(())
2757}
2758
2759/// Process a chunk for complement mid-range byte extraction.
2760/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
2761#[inline]
2762fn bytes_complement_mid_chunk(
2763    data: &[u8],
2764    prefix_bytes: usize,
2765    skip_end: usize,
2766    line_delim: u8,
2767    buf: &mut Vec<u8>,
2768) {
2769    buf.reserve(data.len());
2770
2771    let src = data.as_ptr();
2772    let dst_base = buf.as_mut_ptr();
2773    let mut wp = buf.len();
2774    let mut start = 0;
2775
2776    for pos in memchr_iter(line_delim, data) {
2777        let line_len = pos - start;
2778        // Copy prefix (bytes before skip region)
2779        let take_prefix = prefix_bytes.min(line_len);
2780        if take_prefix > 0 {
2781            unsafe {
2782                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2783            }
2784            wp += take_prefix;
2785        }
2786        // Copy suffix (bytes after skip region)
2787        if line_len > skip_end {
2788            let suffix_len = line_len - skip_end;
2789            unsafe {
2790                std::ptr::copy_nonoverlapping(
2791                    src.add(start + skip_end),
2792                    dst_base.add(wp),
2793                    suffix_len,
2794                );
2795            }
2796            wp += suffix_len;
2797        }
2798        unsafe {
2799            *dst_base.add(wp) = line_delim;
2800        }
2801        wp += 1;
2802        start = pos + 1;
2803    }
2804    if start < data.len() {
2805        let line_len = data.len() - start;
2806        let take_prefix = prefix_bytes.min(line_len);
2807        if take_prefix > 0 {
2808            unsafe {
2809                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2810            }
2811            wp += take_prefix;
2812        }
2813        if line_len > skip_end {
2814            let suffix_len = line_len - skip_end;
2815            unsafe {
2816                std::ptr::copy_nonoverlapping(
2817                    src.add(start + skip_end),
2818                    dst_base.add(wp),
2819                    suffix_len,
2820                );
2821            }
2822            wp += suffix_len;
2823        }
2824        unsafe {
2825            *dst_base.add(wp) = line_delim;
2826        }
2827        wp += 1;
2828    }
2829    unsafe { buf.set_len(wp) };
2830}
2831
2832/// Optimized byte/char extraction with batched output and parallel processing.
2833fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2834    let line_delim = cfg.line_delim;
2835    let ranges = cfg.ranges;
2836    let complement = cfg.complement;
2837    let output_delim = cfg.output_delim;
2838
2839    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
2840    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
2841        let max_bytes = ranges[0].end;
2842        if max_bytes < usize::MAX {
2843            return process_bytes_from_start(data, max_bytes, line_delim, out);
2844        }
2845    }
2846
2847    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
2848    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
2849        let skip_bytes = ranges[0].start.saturating_sub(1);
2850        if skip_bytes > 0 {
2851            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
2852        }
2853    }
2854
2855    // Fast path: single mid-range (e.g., cut -b5-100)
2856    if !complement
2857        && ranges.len() == 1
2858        && ranges[0].start > 1
2859        && ranges[0].end < usize::MAX
2860        && output_delim.is_empty()
2861    {
2862        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
2863    }
2864
2865    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
2866    if complement
2867        && ranges.len() == 1
2868        && ranges[0].start == 1
2869        && ranges[0].end < usize::MAX
2870        && output_delim.is_empty()
2871    {
2872        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
2873    }
2874
2875    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
2876    if complement
2877        && ranges.len() == 1
2878        && ranges[0].end == usize::MAX
2879        && ranges[0].start > 1
2880        && output_delim.is_empty()
2881    {
2882        let max_bytes = ranges[0].start - 1;
2883        return process_bytes_from_start(data, max_bytes, line_delim, out);
2884    }
2885
2886    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
2887    if complement
2888        && ranges.len() == 1
2889        && ranges[0].start > 1
2890        && ranges[0].end < usize::MAX
2891        && output_delim.is_empty()
2892    {
2893        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
2894    }
2895
2896    if data.len() >= PARALLEL_THRESHOLD {
2897        let chunks = split_for_scope(data, line_delim);
2898        let n = chunks.len();
2899        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2900        rayon::scope(|s| {
2901            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2902                s.spawn(move |_| {
2903                    result.reserve(chunk.len() + 1);
2904                    process_bytes_chunk(
2905                        chunk,
2906                        ranges,
2907                        complement,
2908                        output_delim,
2909                        line_delim,
2910                        result,
2911                    );
2912                });
2913            }
2914        });
2915        let slices: Vec<IoSlice> = results
2916            .iter()
2917            .filter(|r| !r.is_empty())
2918            .map(|r| IoSlice::new(r))
2919            .collect();
2920        write_ioslices(out, &slices)?;
2921    } else {
2922        process_chunked(data, line_delim, out, |chunk, buf| {
2923            process_bytes_chunk(chunk, ranges, complement, output_delim, line_delim, buf);
2924        })?;
2925    }
2926    Ok(())
2927}
2928
2929/// Process a chunk of data for byte/char extraction.
2930/// Uses raw pointer arithmetic for the newline scan.
2931/// Complement single-range fast path: compute complement ranges once, then use
2932/// the non-complement multi-range path which is more cache-friendly.
2933fn process_bytes_chunk(
2934    data: &[u8],
2935    ranges: &[Range],
2936    complement: bool,
2937    output_delim: &[u8],
2938    line_delim: u8,
2939    buf: &mut Vec<u8>,
2940) {
2941    buf.reserve(data.len());
2942    let base = data.as_ptr();
2943    let mut start = 0;
2944    for end_pos in memchr_iter(line_delim, data) {
2945        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
2946        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2947        unsafe { buf_push(buf, line_delim) };
2948        start = end_pos + 1;
2949    }
2950    if start < data.len() {
2951        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
2952        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2953        unsafe { buf_push(buf, line_delim) };
2954    }
2955}
2956
2957/// Extract byte ranges from a line into the output buffer.
2958/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
2959/// Raw pointer arithmetic eliminates per-range bounds checking.
2960#[inline(always)]
2961fn cut_bytes_to_buf(
2962    line: &[u8],
2963    ranges: &[Range],
2964    complement: bool,
2965    output_delim: &[u8],
2966    buf: &mut Vec<u8>,
2967) {
2968    let len = line.len();
2969    let base = line.as_ptr();
2970    let mut first_range = true;
2971
2972    // Reserve worst case: full line + delimiters between ranges
2973    let needed = len + output_delim.len() * ranges.len() + 1;
2974    if buf.capacity() - buf.len() < needed {
2975        buf.reserve(needed);
2976    }
2977
2978    if complement {
2979        let mut pos: usize = 1;
2980        for r in ranges {
2981            let rs = r.start;
2982            let re = r.end.min(len);
2983            if pos < rs {
2984                if !first_range && !output_delim.is_empty() {
2985                    unsafe { buf_extend(buf, output_delim) };
2986                }
2987                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
2988                first_range = false;
2989            }
2990            pos = re + 1;
2991            if pos > len {
2992                break;
2993            }
2994        }
2995        if pos <= len {
2996            if !first_range && !output_delim.is_empty() {
2997                unsafe { buf_extend(buf, output_delim) };
2998            }
2999            unsafe {
3000                buf_extend(
3001                    buf,
3002                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
3003                )
3004            };
3005        }
3006    } else if output_delim.is_empty() && ranges.len() == 1 {
3007        // Ultra-fast path: single range, no output delimiter
3008        let start = ranges[0].start.saturating_sub(1);
3009        let end = ranges[0].end.min(len);
3010        if start < len {
3011            unsafe {
3012                buf_extend(
3013                    buf,
3014                    std::slice::from_raw_parts(base.add(start), end - start),
3015                )
3016            };
3017        }
3018    } else {
3019        for r in ranges {
3020            let start = r.start.saturating_sub(1);
3021            let end = r.end.min(len);
3022            if start >= len {
3023                break;
3024            }
3025            if !first_range && !output_delim.is_empty() {
3026                unsafe { buf_extend(buf, output_delim) };
3027            }
3028            unsafe {
3029                buf_extend(
3030                    buf,
3031                    std::slice::from_raw_parts(base.add(start), end - start),
3032                )
3033            };
3034            first_range = false;
3035        }
3036    }
3037}
3038
3039// ── Public API ───────────────────────────────────────────────────────────
3040
3041/// Cut fields from a line using a delimiter. Writes to `out`.
3042#[inline]
3043pub fn cut_fields(
3044    line: &[u8],
3045    delim: u8,
3046    ranges: &[Range],
3047    complement: bool,
3048    output_delim: &[u8],
3049    suppress_no_delim: bool,
3050    out: &mut impl Write,
3051) -> io::Result<bool> {
3052    if memchr::memchr(delim, line).is_none() {
3053        if !suppress_no_delim {
3054            out.write_all(line)?;
3055            return Ok(true);
3056        }
3057        return Ok(false);
3058    }
3059
3060    let mut field_num: usize = 1;
3061    let mut field_start: usize = 0;
3062    let mut first_output = true;
3063
3064    for delim_pos in memchr_iter(delim, line) {
3065        let selected = in_ranges(ranges, field_num) != complement;
3066        if selected {
3067            if !first_output {
3068                out.write_all(output_delim)?;
3069            }
3070            out.write_all(&line[field_start..delim_pos])?;
3071            first_output = false;
3072        }
3073        field_start = delim_pos + 1;
3074        field_num += 1;
3075    }
3076
3077    let selected = in_ranges(ranges, field_num) != complement;
3078    if selected {
3079        if !first_output {
3080            out.write_all(output_delim)?;
3081        }
3082        out.write_all(&line[field_start..])?;
3083    }
3084
3085    Ok(true)
3086}
3087
3088/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3089#[inline]
3090pub fn cut_bytes(
3091    line: &[u8],
3092    ranges: &[Range],
3093    complement: bool,
3094    output_delim: &[u8],
3095    out: &mut impl Write,
3096) -> io::Result<bool> {
3097    let mut first_range = true;
3098
3099    if complement {
3100        let len = line.len();
3101        let mut comp_ranges = Vec::new();
3102        let mut pos: usize = 1;
3103        for r in ranges {
3104            let rs = r.start;
3105            let re = r.end.min(len);
3106            if pos < rs {
3107                comp_ranges.push((pos, rs - 1));
3108            }
3109            pos = re + 1;
3110            if pos > len {
3111                break;
3112            }
3113        }
3114        if pos <= len {
3115            comp_ranges.push((pos, len));
3116        }
3117        for &(s, e) in &comp_ranges {
3118            if !first_range && !output_delim.is_empty() {
3119                out.write_all(output_delim)?;
3120            }
3121            out.write_all(&line[s - 1..e])?;
3122            first_range = false;
3123        }
3124    } else {
3125        for r in ranges {
3126            let start = r.start.saturating_sub(1);
3127            let end = r.end.min(line.len());
3128            if start >= line.len() {
3129                break;
3130            }
3131            if !first_range && !output_delim.is_empty() {
3132                out.write_all(output_delim)?;
3133            }
3134            out.write_all(&line[start..end])?;
3135            first_range = false;
3136        }
3137    }
3138    Ok(true)
3139}
3140
/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
/// Output is always <= input (we remove everything after first delimiter per line).
/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
///
/// Lines without delimiter pass through unchanged (unless suppress=true).
/// Lines with delimiter: keep bytes before delimiter + newline.
///
/// Invariant: the write cursor `wp` never passes the read cursor `rp`, so all
/// `copy_within` calls move bytes backwards and the delimiter stores never
/// clobber unread input.
pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
    let len = data.len();
    let mut wp: usize = 0; // write cursor: length of compacted output so far
    let mut rp: usize = 0; // read cursor: start of the current line

    while rp < len {
        // Find whichever comes first from rp: the field delimiter or end of line.
        match memchr::memchr2(delim, line_delim, &data[rp..]) {
            None => {
                // Rest is partial line, no delimiter
                if suppress {
                    // suppress: skip lines without delimiter
                    break;
                }
                // Pass the unterminated tail through unchanged.
                let remaining = len - rp;
                if wp != rp {
                    data.copy_within(rp..len, wp);
                }
                wp += remaining;
                break;
            }
            Some(offset) => {
                let actual = rp + offset;
                if data[actual] == line_delim {
                    // No delimiter on this line
                    if suppress {
                        // Skip this line entirely
                        rp = actual + 1;
                    } else {
                        // Output entire line including newline
                        let chunk_len = actual + 1 - rp;
                        if wp != rp {
                            data.copy_within(rp..actual + 1, wp);
                        }
                        wp += chunk_len;
                        rp = actual + 1;
                    }
                } else {
                    // Delimiter found: output field 1 (up to delimiter) + newline
                    let field_len = actual - rp;
                    if wp != rp && field_len > 0 {
                        data.copy_within(rp..actual, wp);
                    }
                    wp += field_len;
                    // In bounds: wp <= rp + field_len = actual < len here.
                    // Also terminates a final delimited line that had no newline.
                    data[wp] = line_delim;
                    wp += 1;
                    // Skip to next newline
                    match memchr::memchr(line_delim, &data[actual + 1..]) {
                        None => {
                            rp = len;
                        }
                        Some(nl_off) => {
                            rp = actual + 1 + nl_off + 1;
                        }
                    }
                }
            }
        }
    }
    wp
}
3208
3209/// Process a full data buffer (from mmap or read) with cut operation.
3210pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3211    match cfg.mode {
3212        CutMode::Fields => process_fields_fast(data, cfg, out),
3213        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3214    }
3215}
3216
/// Process input from a reader (for stdin).
/// Uses batch reading: reads large chunks (16MB), then processes them in batch
/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
pub fn process_cut_reader<R: BufRead>(
    mut reader: R,
    cfg: &CutConfig,
    out: &mut impl Write,
) -> io::Result<()> {
    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
    let line_delim = cfg.line_delim;

    // Read large chunks and process in batch.
    // We keep a buffer; after processing complete lines, we shift leftover to the front.
    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);

    loop {
        // Read up to CHUNK_SIZE bytes
        buf.reserve(CHUNK_SIZE);
        let read_start = buf.len();
        // NOTE(review): this hands a window of uninitialized bytes to
        // `R::read` via `read_fully`. That is only sound if every reader used
        // here never *reads* from the destination buffer — true for std
        // readers, but worth confirming for any custom `R`.
        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
        let n = read_fully(&mut reader, &mut buf[read_start..])?;
        // Drop the unfilled (still uninitialized) tail of the window.
        buf.truncate(read_start + n);

        if buf.is_empty() {
            break;
        }

        if n == 0 {
            // EOF with leftover data (last line without terminator)
            process_cut_data(&buf, cfg, out)?;
            break;
        }

        // Find the last line delimiter in the buffer so we process complete lines
        let process_end = match memchr::memrchr(line_delim, &buf) {
            Some(pos) => pos + 1,
            None => {
                // No line delimiter found — keep accumulating
                continue;
            }
        };

        // Process the complete lines using the fast batch path
        process_cut_data(&buf[..process_end], cfg, out)?;

        // Shift leftover to the front for next iteration
        let leftover_len = buf.len() - process_end;
        if leftover_len > 0 {
            buf.copy_within(process_end.., 0);
        }
        buf.truncate(leftover_len);
    }

    Ok(())
}
3273
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Returns the total number of bytes read; 0 means EOF (or an empty `buf`).
/// `ErrorKind::Interrupted` (EINTR) is retried on *every* read — the previous
/// version propagated an interrupt from the first read while retrying it only
/// on subsequent reads.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break, // EOF
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3293
3294/// In-place cut processing for mutable data buffers.
3295/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3296/// for the given configuration (caller should fall back to regular processing).
3297///
3298/// In-place avoids allocating intermediate output buffers — the result is written
3299/// directly into the input buffer (output is always <= input for non-complement modes
3300/// with default output delimiter).
3301///
3302/// Note: if the input does not end with line_delim, we fall back to the regular
3303/// path because GNU cut always adds a trailing line delimiter, and the in-place
3304/// buffer cannot grow beyond the input size.
3305pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3306    if cfg.complement {
3307        return None;
3308    }
3309    // If input doesn't end with line_delim, the output may need an extra byte
3310    // (GNU cut always terminates the last line). In-place can't grow the buffer,
3311    // so fall back to the regular allocating path.
3312    if data.is_empty() || data[data.len() - 1] != cfg.line_delim {
3313        return None;
3314    }
3315
3316    match cfg.mode {
3317        CutMode::Fields => {
3318            // Only handle when output delimiter matches input (single-byte)
3319            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3320                return None;
3321            }
3322            if cfg.delim == cfg.line_delim {
3323                return None;
3324            }
3325            Some(cut_fields_inplace_general(
3326                data,
3327                cfg.delim,
3328                cfg.line_delim,
3329                cfg.ranges,
3330                cfg.suppress_no_delim,
3331            ))
3332        }
3333        CutMode::Bytes | CutMode::Characters => {
3334            if !cfg.output_delim.is_empty() {
3335                return None;
3336            }
3337            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3338        }
3339    }
3340}
3341
/// In-place generalized field extraction.
/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
///
/// Invariant: the write cursor `wp` never passes the read cursor `rp`, so the
/// backwards `copy_within` moves never clobber unread input.
fn cut_fields_inplace_general(
    data: &mut [u8],
    delim: u8,
    line_delim: u8,
    ranges: &[Range],
    suppress: bool,
) -> usize {
    // Special case: field 1 only (existing optimized path)
    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
        return cut_field1_inplace(data, delim, line_delim, suppress);
    }

    let len = data.len();
    if len == 0 {
        return 0;
    }

    // Highest field number any range can select (usize::MAX for open ranges).
    let max_field = ranges.last().map_or(0, |r| r.end);
    // Cap on recorded delimiter positions per line.
    // NOTE(review): fields beyond this cap collapse into a single
    // "rest of line" pseudo-field. That appears correct for open-ended
    // trailing ranges (the tail keeps its internal delimiters), but looks
    // wrong for *bounded* field numbers above 128 on lines with more than
    // 128 delimiters — confirm callers bound max_field or accept this.
    let max_delims = max_field.min(128);
    let mut wp: usize = 0; // write cursor
    let mut rp: usize = 0; // read cursor (start of current line)

    while rp < len {
        // Line spans rp..line_end; line_end == len for an unterminated tail.
        let line_end = memchr::memchr(line_delim, &data[rp..])
            .map(|p| rp + p)
            .unwrap_or(len);
        let line_len = line_end - rp;

        // Collect delimiter positions (relative to line start)
        let mut delim_pos = [0usize; 128];
        let mut num_delims: usize = 0;

        for pos in memchr_iter(delim, &data[rp..line_end]) {
            if num_delims < max_delims {
                delim_pos[num_delims] = pos;
                num_delims += 1;
                if num_delims >= max_delims {
                    break;
                }
            }
        }

        if num_delims == 0 {
            // No delimiter in line
            if !suppress {
                if wp != rp {
                    data.copy_within(rp..line_end, wp);
                }
                wp += line_len;
                if line_end < len {
                    data[wp] = line_delim;
                    wp += 1;
                }
            }
        } else {
            let total_fields = num_delims + 1;
            let mut first_output = true;

            for r in ranges {
                let range_start = r.start;
                let range_end = r.end.min(total_fields);
                if range_start > total_fields {
                    // Assumes ranges sorted ascending — nothing later can match.
                    break;
                }
                for field_num in range_start..=range_end {
                    if field_num > total_fields {
                        break;
                    }

                    // Field N spans (delim_pos[N-2] + 1)..delim_pos[N-1];
                    // field 1 starts at 0 and the last field ends at line_len.
                    let field_start = if field_num == 1 {
                        0
                    } else if field_num - 2 < num_delims {
                        delim_pos[field_num - 2] + 1
                    } else {
                        continue;
                    };
                    let field_end = if field_num <= num_delims {
                        delim_pos[field_num - 1]
                    } else {
                        line_len
                    };

                    if !first_output {
                        // Separator between selected fields (same byte as input delim).
                        data[wp] = delim;
                        wp += 1;
                    }
                    let flen = field_end - field_start;
                    if flen > 0 {
                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
                        wp += flen;
                    }
                    first_output = false;
                }
            }

            if !first_output && line_end < len {
                data[wp] = line_delim;
                wp += 1;
            } else if first_output && line_end < len {
                // No fields selected but line had delimiters — output empty line
                data[wp] = line_delim;
                wp += 1;
            }
        }

        rp = if line_end < len { line_end + 1 } else { len };
    }

    wp
}
3454
/// In-place byte/char range extraction.
///
/// Compacts `data` by rewriting each line as the concatenation of its
/// selected ranges and returns the new logical length. The write cursor
/// never passes the read cursor, so the backwards `copy_within` moves are
/// safe for each segment.
fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
    let len = data.len();
    if len == 0 {
        return 0;
    }

    // Quick check: single range from byte 1 to end = no-op
    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
        return len;
    }

    // Single range from byte 1: fast truncation path
    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
    }

    let mut wp: usize = 0; // write cursor (compacted length so far)
    let mut rp: usize = 0; // read cursor (start of current line)

    while rp < len {
        // Line spans rp..line_end; line_end == len for an unterminated tail.
        let line_end = memchr::memchr(line_delim, &data[rp..])
            .map(|p| rp + p)
            .unwrap_or(len);
        let line_len = line_end - rp;

        // Copy each selected range of this line down to the write cursor.
        // Assumes ranges are normalized (sorted, non-overlapping) — TODO
        // confirm; otherwise an earlier copy could clobber a later range's
        // source bytes before they are read.
        for r in ranges {
            let start = r.start.saturating_sub(1); // 1-based -> 0-based
            let end = r.end.min(line_len);
            if start >= line_len {
                // Relies on ascending ranges: later ranges start even further out.
                break;
            }
            let flen = end - start;
            if flen > 0 {
                data.copy_within(rp + start..rp + start + flen, wp);
                wp += flen;
            }
        }

        // Re-emit the line delimiter (an unterminated tail gets none,
        // matching the input).
        if line_end < len {
            data[wp] = line_delim;
            wp += 1;
        }

        rp = if line_end < len { line_end + 1 } else { len };
    }

    wp
}
3504
3505/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3506fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3507    let len = data.len();
3508
3509    // Quick check: see if all lines fit within max_bytes (common case)
3510    let mut all_fit = true;
3511    let mut start = 0;
3512    for pos in memchr_iter(line_delim, data) {
3513        if pos - start > max_bytes {
3514            all_fit = false;
3515            break;
3516        }
3517        start = pos + 1;
3518    }
3519    if all_fit && start < len && len - start > max_bytes {
3520        all_fit = false;
3521    }
3522    if all_fit {
3523        return len;
3524    }
3525
3526    // Some lines need truncation
3527    let mut wp: usize = 0;
3528    let mut rp: usize = 0;
3529
3530    while rp < len {
3531        let line_end = memchr::memchr(line_delim, &data[rp..])
3532            .map(|p| rp + p)
3533            .unwrap_or(len);
3534        let line_len = line_end - rp;
3535
3536        let take = line_len.min(max_bytes);
3537        if take > 0 && wp != rp {
3538            data.copy_within(rp..rp + take, wp);
3539        }
3540        wp += take;
3541
3542        if line_end < len {
3543            data[wp] = line_delim;
3544            wp += 1;
3545        }
3546
3547        rp = if line_end < len { line_end + 1 } else { len };
3548    }
3549
3550    wp
3551}
3552
/// Cut operation mode
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CutMode {
    /// Byte selection (`-b`).
    Bytes,
    /// Character selection (`-c`); dispatched to the same byte-based paths
    /// as `Bytes` (see `process_cut_data`).
    Characters,
    /// Field selection (`-f`) using a delimiter.
    Fields,
}