// coreutils_rs/cut/core.rs

1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Minimum input size, in bytes, before work is split across threads (2MB).
///
/// Inputs below this size are processed as a single chunk on the calling thread.
/// Parallel work uses std::thread::scope, which avoids rayon's ~0.3-0.5ms thread
/// pool initialization cost (paid every process invocation via clone+mmap for
/// worker thread stacks). For data >= 2MB with 4 cores, the parallel savings far
/// exceed thread::scope overhead.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
9
/// Max iovec entries per writev call (Linux default).
/// Slice lists longer than this are written in batches of at most this many entries.
const MAX_IOV: usize = 1024;
12
/// Configuration for cut operations.
pub struct CutConfig<'a> {
    /// Selection mode. `CutMode` is declared outside this part of the file.
    pub mode: CutMode,
    /// Selected ranges. The field routines expect them sorted and non-overlapping,
    /// which is what `parse_ranges` returns.
    pub ranges: &'a [Range],
    /// When true the selection is inverted: positions outside `ranges` are output.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between output fields.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no field delimiter are dropped instead of
    /// being passed through unchanged.
    pub suppress_no_delim: bool,
    /// Byte that terminates input lines and is written after each output line.
    pub line_delim: u8,
}
23
/// An inclusive, 1-based range specification like 1, 3-5, -3, 4-
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Range {
    /// First selected position (1-based). `parse_ranges` stores 1 when the start
    /// is omitted (as in `-3`) and rejects 0, so a parsed range never starts at 0.
    pub start: usize,
    /// Last selected position (1-based, inclusive); usize::MAX means "to end".
    pub end: usize,
}
30
31/// Parse a LIST specification like "1,3-5,7-" into ranges.
32/// Each range is 1-based. Returns sorted, merged ranges.
33pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
34    let mut ranges = Vec::new();
35
36    for part in spec.split(',') {
37        let part = part.trim();
38        if part.is_empty() {
39            continue;
40        }
41
42        if let Some(idx) = part.find('-') {
43            let left = &part[..idx];
44            let right = &part[idx + 1..];
45
46            let start = if left.is_empty() {
47                1
48            } else {
49                left.parse::<usize>()
50                    .map_err(|_| format!("invalid range: '{}'", part))?
51            };
52
53            let end = if right.is_empty() {
54                usize::MAX
55            } else {
56                right
57                    .parse::<usize>()
58                    .map_err(|_| format!("invalid range: '{}'", part))?
59            };
60
61            if start == 0 {
62                return Err("fields and positions are numbered from 1".to_string());
63            }
64            if start > end {
65                return Err(format!("invalid decreasing range: '{}'", part));
66            }
67
68            ranges.push(Range { start, end });
69        } else {
70            let n = part
71                .parse::<usize>()
72                .map_err(|_| format!("invalid field: '{}'", part))?;
73            if n == 0 {
74                return Err("fields and positions are numbered from 1".to_string());
75            }
76            ranges.push(Range { start: n, end: n });
77        }
78    }
79
80    if ranges.is_empty() {
81        return Err("you must specify a list of bytes, characters, or fields".to_string());
82    }
83
84    // Sort and merge overlapping ranges
85    ranges.sort_by_key(|r| (r.start, r.end));
86    let mut merged = vec![ranges[0].clone()];
87    for r in &ranges[1..] {
88        let last = merged.last_mut().unwrap();
89        if r.start <= last.end.saturating_add(1) {
90            last.end = last.end.max(r.end);
91        } else {
92            merged.push(r.clone());
93        }
94    }
95
96    Ok(merged)
97}
98
99/// Check if a 1-based position is in any range.
100/// Ranges must be sorted. Uses early exit since ranges are sorted.
101#[inline(always)]
102fn in_ranges(ranges: &[Range], pos: usize) -> bool {
103    for r in ranges {
104        if pos < r.start {
105            return false;
106        }
107        if pos <= r.end {
108            return true;
109        }
110    }
111    false
112}
113
114/// Pre-compute a 64-bit mask for field selection.
115/// Bit i-1 is set if field i should be output.
116#[inline]
117fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
118    let mut mask: u64 = 0;
119    for i in 1..=64u32 {
120        let in_range = in_ranges(ranges, i as usize);
121        if in_range != complement {
122            mask |= 1u64 << (i - 1);
123        }
124    }
125    mask
126}
127
128/// Check if a field should be selected, using bitset for first 64 fields.
129#[inline(always)]
130fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
131    if field_num <= 64 {
132        (mask >> (field_num - 1)) & 1 == 1
133    } else {
134        in_ranges(ranges, field_num) != complement
135    }
136}
137
138// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
139
/// Append `data` to `buf` without checking or growing its capacity.
///
/// # Safety
/// `buf` must already have at least `data.len()` bytes of spare capacity
/// (`buf.capacity() - buf.len() >= data.len()`); otherwise this writes past the
/// end of the allocation.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    let old_len = buf.len();
    let extra = data.len();
    unsafe {
        let dst = buf.as_mut_ptr().add(old_len);
        std::ptr::copy_nonoverlapping(data.as_ptr(), dst, extra);
        buf.set_len(old_len + extra);
    }
}
150
/// Append the single byte `b` to `buf` without checking or growing its capacity.
///
/// # Safety
/// `buf` must already have at least one byte of spare capacity
/// (`buf.len() < buf.capacity()`); otherwise this writes past the end of the
/// allocation.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    let old_len = buf.len();
    unsafe {
        buf.as_mut_ptr().add(old_len).write(b);
        buf.set_len(old_len + 1);
    }
}
161
162/// Write multiple IoSlice buffers using write_vectored (writev syscall).
163/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
164/// Cold path (partial write) is out-of-line to keep the hot loop tight.
165#[inline]
166fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
167    if slices.is_empty() {
168        return Ok(());
169    }
170    for batch in slices.chunks(MAX_IOV) {
171        let total: usize = batch.iter().map(|s| s.len()).sum();
172        let written = out.write_vectored(batch)?;
173        if written >= total {
174            continue;
175        }
176        if written == 0 {
177            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
178        }
179        write_ioslices_slow(out, batch, written)?;
180    }
181    Ok(())
182}
183
/// Finish a batch after a partial write_vectored (cold path, never inlined).
///
/// `skip` is the number of leading bytes of `slices` that were already written.
/// Slices that fall entirely inside that prefix are passed over, the slice that
/// straddles it is written from the first unwritten byte, and every following
/// non-empty slice is written in full.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    skip: usize,
) -> io::Result<()> {
    let mut already_written = skip;
    for slice in slices {
        let bytes: &[u8] = slice;
        match bytes.get(already_written..) {
            Some(tail) if !tail.is_empty() => {
                out.write_all(tail)?;
                already_written = 0;
            }
            // Slice lies entirely within the written prefix (or is empty).
            _ => already_written -= bytes.len(),
        }
    }
    Ok(())
}
203
204// ── Chunk splitting for parallel processing ──────────────────────────────
205
/// Number of available CPUs (cached). Used for thread::scope parallelism.
///
/// The value is queried from `std::thread::available_parallelism` on the first
/// call and reused afterwards. Falls back to 1 when the query fails.
#[inline]
fn num_cpus() -> usize {
    static CPUS: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
    *CPUS.get_or_init(|| {
        std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1)
    })
}
213
/// Apply `f` to every chunk and return the results in chunk order.
///
/// With zero or one chunk the closure runs on the calling thread. Otherwise one
/// scoped thread is spawned per chunk via std::thread::scope, which avoids
/// rayon's ~0.3-0.5ms thread pool initialization overhead per process.
///
/// # Panics
/// Panics if a worker thread panics.
fn par_process<'a, F>(chunks: &[&'a [u8]], f: F) -> Vec<Vec<u8>>
where
    F: Fn(&'a [u8]) -> Vec<u8> + Sync,
{
    match chunks {
        [] => Vec::new(),
        [only] => vec![f(*only)],
        many => std::thread::scope(|scope| {
            let worker = &f;
            let mut pending = Vec::with_capacity(many.len());
            for &chunk in many {
                pending.push(scope.spawn(move || worker(chunk)));
            }
            pending
                .into_iter()
                .map(|handle| handle.join().unwrap())
                .collect()
        }),
    }
}
233
234/// Split data into chunks aligned to line boundaries for parallel processing.
235fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
236    let num_threads = num_cpus();
237    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
238        return vec![data];
239    }
240
241    let chunk_size = data.len() / num_threads;
242    let mut chunks = Vec::with_capacity(num_threads);
243    let mut pos = 0;
244
245    for _ in 0..num_threads - 1 {
246        let target = pos + chunk_size;
247        if target >= data.len() {
248            break;
249        }
250        let boundary = memchr::memchr(line_delim, &data[target..])
251            .map(|p| target + p + 1)
252            .unwrap_or(data.len());
253        if boundary > pos {
254            chunks.push(&data[pos..boundary]);
255        }
256        pos = boundary;
257    }
258
259    if pos < data.len() {
260        chunks.push(&data[pos..]);
261    }
262
263    chunks
264}
265
266// ── Fast path: multi-field non-contiguous extraction ─────────────────────
267
268/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
269/// Pre-collects delimiter positions per line into a stack-allocated array,
270/// then directly indexes into them for each selected field.
271/// This is O(max_field) per line instead of O(num_fields * scan_length).
272fn process_fields_multi_select(
273    data: &[u8],
274    delim: u8,
275    line_delim: u8,
276    ranges: &[Range],
277    suppress: bool,
278    out: &mut impl Write,
279) -> io::Result<()> {
280    let max_field = ranges.last().map_or(0, |r| r.end);
281
282    if data.len() >= PARALLEL_THRESHOLD {
283        let chunks = split_into_chunks(data, line_delim);
284        let results = par_process(&chunks, |chunk| {
285            let mut buf = Vec::with_capacity(chunk.len() * 3 / 4);
286            multi_select_chunk(
287                chunk, delim, line_delim, ranges, max_field, suppress, &mut buf,
288            );
289            buf
290        });
291        let slices: Vec<IoSlice> = results
292            .iter()
293            .filter(|r| !r.is_empty())
294            .map(|r| IoSlice::new(r))
295            .collect();
296        write_ioslices(out, &slices)?;
297    } else {
298        let mut buf = Vec::with_capacity(data.len() * 3 / 4);
299        multi_select_chunk(
300            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
301        );
302        if !buf.is_empty() {
303            out.write_all(&buf)?;
304        }
305    }
306    Ok(())
307}
308
309/// Process a chunk for multi-field extraction using two-level scanning.
310/// Outer: memchr_iter for newlines (one SIMD scan).
311/// Inner: memchr_iter for delimiters with early exit at max_field.
312/// This avoids processing excess delimiters on wide CSV data (e.g., 20 columns
313/// but only selecting -f1,3,5 → only scans 5 of 19 delimiters per line).
314/// Also eliminates delimiter/newline branching in the hot loop — inner loop
315/// is 100% delimiters (no mispredictions).
316fn multi_select_chunk(
317    data: &[u8],
318    delim: u8,
319    line_delim: u8,
320    ranges: &[Range],
321    max_field: usize,
322    suppress: bool,
323    buf: &mut Vec<u8>,
324) {
325    buf.reserve(data.len());
326    let base = data.as_ptr();
327    let mut start = 0;
328    for end_pos in memchr_iter(line_delim, data) {
329        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
330        multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
331        start = end_pos + 1;
332    }
333    if start < data.len() {
334        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
335        multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
336    }
337}
338
339/// Extract selected fields from a single line using delimiter position scanning.
340/// Scans delimiters only up to max_field (early exit), then extracts selected fields
341/// by indexing directly into the collected positions. Since ranges are pre-sorted and
342/// non-overlapping, every field within a range is selected — no is_selected check needed.
343#[inline(always)]
344fn multi_select_line(
345    line: &[u8],
346    delim: u8,
347    line_delim: u8,
348    ranges: &[Range],
349    max_field: usize,
350    suppress: bool,
351    buf: &mut Vec<u8>,
352) {
353    let len = line.len();
354    if len == 0 {
355        if !suppress {
356            unsafe { buf_push(buf, line_delim) };
357        }
358        return;
359    }
360
361    // Note: no per-line buf.reserve — multi_select_chunk already reserves data.len()
362    let base = line.as_ptr();
363
364    // Collect delimiter positions up to max_field (early exit).
365    // Stack array for up to 64 delimiter positions.
366    let mut delim_pos = [0usize; 64];
367    let mut num_delims: usize = 0;
368    let max_delims = max_field.min(64);
369
370    for pos in memchr_iter(delim, line) {
371        if num_delims < max_delims {
372            delim_pos[num_delims] = pos;
373            num_delims += 1;
374            if num_delims >= max_delims {
375                break;
376            }
377        }
378    }
379
380    if num_delims == 0 {
381        if !suppress {
382            unsafe {
383                buf_extend(buf, line);
384                buf_push(buf, line_delim);
385            }
386        }
387        return;
388    }
389
390    // Extract selected fields using delimiter positions.
391    // Ranges are pre-sorted and non-overlapping, so every field_num within a range
392    // is selected — skip the is_selected check entirely (saves 1 function call per field).
393    let total_fields = num_delims + 1;
394    let mut first_output = true;
395
396    for r in ranges {
397        let range_start = r.start;
398        let range_end = r.end.min(total_fields);
399        if range_start > total_fields {
400            break;
401        }
402        for field_num in range_start..=range_end {
403            if field_num > total_fields {
404                break;
405            }
406
407            let field_start = if field_num == 1 {
408                0
409            } else if field_num - 2 < num_delims {
410                delim_pos[field_num - 2] + 1
411            } else {
412                continue;
413            };
414            let field_end = if field_num <= num_delims {
415                delim_pos[field_num - 1]
416            } else {
417                len
418            };
419
420            if !first_output {
421                unsafe { buf_push(buf, delim) };
422            }
423            unsafe {
424                buf_extend(
425                    buf,
426                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
427                );
428            }
429            first_output = false;
430        }
431    }
432
433    unsafe { buf_push(buf, line_delim) };
434}
435
436// ── Fast path: field extraction with batched output ──────────────────────
437
438/// Optimized field extraction with early exit and batched output.
439fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
440    let delim = cfg.delim;
441    let line_delim = cfg.line_delim;
442    let ranges = cfg.ranges;
443    let complement = cfg.complement;
444    let output_delim = cfg.output_delim;
445    let suppress = cfg.suppress_no_delim;
446
447    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
448    // That scan was O(N) over the entire file just to check an edge case
449    // (no delimiter in any line). The per-line processing already handles
450    // lines without delimiters correctly, so the scan was pure overhead
451    // for files that DO contain delimiters (the common case).
452
453    // Ultra-fast path: single field extraction (e.g., cut -f5)
454    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
455        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
456    }
457
458    // Fast path: complement of single field or contiguous range with default output delimiter.
459    if complement
460        && ranges.len() == 1
461        && output_delim.len() == 1
462        && output_delim[0] == delim
463        && ranges[0].start == ranges[0].end
464    {
465        return process_complement_single_field(
466            data,
467            delim,
468            line_delim,
469            ranges[0].start,
470            suppress,
471            out,
472        );
473    }
474
475    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
476    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
477    if complement
478        && ranges.len() == 1
479        && ranges[0].start > 1
480        && ranges[0].end < usize::MAX
481        && output_delim.len() == 1
482        && output_delim[0] == delim
483    {
484        return process_complement_range(
485            data,
486            delim,
487            line_delim,
488            ranges[0].start,
489            ranges[0].end,
490            suppress,
491            out,
492        );
493    }
494
495    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
496    if !complement
497        && ranges.len() == 1
498        && ranges[0].start == 1
499        && output_delim.len() == 1
500        && output_delim[0] == delim
501        && ranges[0].end < usize::MAX
502    {
503        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
504    }
505
506    // Fast path: open-ended field range from field N (e.g., cut -f3-)
507    if !complement
508        && ranges.len() == 1
509        && ranges[0].end == usize::MAX
510        && ranges[0].start > 1
511        && output_delim.len() == 1
512        && output_delim[0] == delim
513    {
514        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
515    }
516
517    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
518    if !complement
519        && ranges.len() == 1
520        && ranges[0].start > 1
521        && ranges[0].end < usize::MAX
522        && output_delim.len() == 1
523        && output_delim[0] == delim
524    {
525        return process_fields_mid_range(
526            data,
527            delim,
528            line_delim,
529            ranges[0].start,
530            ranges[0].end,
531            suppress,
532            out,
533        );
534    }
535
536    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
537    // Uses delimiter position caching: find all delimiter positions per line,
538    // then directly index into them for each selected field.
539    // This is faster than the general extract_fields_to_buf which re-checks
540    // is_selected() for every field encountered.
541    if !complement
542        && ranges.len() > 1
543        && ranges.last().map_or(false, |r| r.end < usize::MAX)
544        && output_delim.len() == 1
545        && output_delim[0] == delim
546        && delim != line_delim
547    {
548        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
549    }
550
551    // General field extraction
552    let max_field = if complement {
553        usize::MAX
554    } else {
555        ranges.last().map(|r| r.end).unwrap_or(0)
556    };
557    let field_mask = compute_field_mask(ranges, complement);
558
559    if data.len() >= PARALLEL_THRESHOLD {
560        let chunks = split_into_chunks(data, line_delim);
561        let results = par_process(&chunks, |chunk| {
562            let mut buf = Vec::with_capacity(chunk.len());
563            process_fields_chunk(
564                chunk,
565                delim,
566                ranges,
567                output_delim,
568                suppress,
569                max_field,
570                field_mask,
571                line_delim,
572                complement,
573                &mut buf,
574            );
575            buf
576        });
577        let slices: Vec<IoSlice> = results
578            .iter()
579            .filter(|r| !r.is_empty())
580            .map(|r| IoSlice::new(r))
581            .collect();
582        write_ioslices(out, &slices)?;
583    } else {
584        let mut buf = Vec::with_capacity(data.len());
585        process_fields_chunk(
586            data,
587            delim,
588            ranges,
589            output_delim,
590            suppress,
591            max_field,
592            field_mask,
593            line_delim,
594            complement,
595            &mut buf,
596        );
597        if !buf.is_empty() {
598            out.write_all(&buf)?;
599        }
600    }
601    Ok(())
602}
603
604/// Process a chunk of data for general field extraction.
605/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
606/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
607/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
608fn process_fields_chunk(
609    data: &[u8],
610    delim: u8,
611    ranges: &[Range],
612    output_delim: &[u8],
613    suppress: bool,
614    max_field: usize,
615    field_mask: u64,
616    line_delim: u8,
617    complement: bool,
618    buf: &mut Vec<u8>,
619) {
620    // When delim != line_delim and max_field is bounded, use two-level approach:
621    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
622    // This avoids scanning past max_field on each line (significant for lines with
623    // many columns but small field selection like -f1,3,5 on 20-column CSV).
624    // For complement or unbounded ranges, use single-pass memchr2_iter which
625    // needs to process all delimiters anyway.
626    if delim != line_delim && max_field < usize::MAX && !complement {
627        buf.reserve(data.len());
628        let mut start = 0;
629        for end_pos in memchr_iter(line_delim, data) {
630            let line = &data[start..end_pos];
631            extract_fields_to_buf(
632                line,
633                delim,
634                ranges,
635                output_delim,
636                suppress,
637                max_field,
638                field_mask,
639                line_delim,
640                buf,
641                complement,
642            );
643            start = end_pos + 1;
644        }
645        if start < data.len() {
646            extract_fields_to_buf(
647                &data[start..],
648                delim,
649                ranges,
650                output_delim,
651                suppress,
652                max_field,
653                field_mask,
654                line_delim,
655                buf,
656                complement,
657            );
658        }
659        return;
660    }
661
662    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
663    // delimiter and line_delim in one SIMD scan.
664    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
665    if delim != line_delim {
666        buf.reserve(data.len());
667
668        let data_len = data.len();
669        let base = data.as_ptr();
670        let mut line_start: usize = 0;
671        let mut field_start: usize = 0;
672        let mut field_num: usize = 1;
673        let mut first_output = true;
674        let mut has_delim = false;
675
676        for pos in memchr::memchr2_iter(delim, line_delim, data) {
677            let byte = unsafe { *base.add(pos) };
678
679            if byte == line_delim {
680                // End of line: flush final field and emit line delimiter
681                if (field_num <= max_field || complement)
682                    && has_delim
683                    && is_selected(field_num, field_mask, ranges, complement)
684                {
685                    if !first_output {
686                        unsafe { buf_extend(buf, output_delim) };
687                    }
688                    unsafe {
689                        buf_extend(
690                            buf,
691                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
692                        )
693                    };
694                    first_output = false;
695                }
696
697                if !first_output {
698                    unsafe { buf_push(buf, line_delim) };
699                } else if !has_delim {
700                    if !suppress {
701                        unsafe {
702                            buf_extend(
703                                buf,
704                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
705                            );
706                            buf_push(buf, line_delim);
707                        }
708                    }
709                } else {
710                    unsafe { buf_push(buf, line_delim) };
711                }
712
713                // Reset state for next line
714                line_start = pos + 1;
715                field_start = pos + 1;
716                field_num = 1;
717                first_output = true;
718                has_delim = false;
719            } else {
720                // Field delimiter hit
721                has_delim = true;
722
723                if is_selected(field_num, field_mask, ranges, complement) {
724                    if !first_output {
725                        unsafe { buf_extend(buf, output_delim) };
726                    }
727                    unsafe {
728                        buf_extend(
729                            buf,
730                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
731                        )
732                    };
733                    first_output = false;
734                }
735
736                field_num += 1;
737                field_start = pos + 1;
738            }
739        }
740
741        // Handle last line without trailing line_delim
742        if line_start < data_len {
743            if line_start < data_len {
744                if (field_num <= max_field || complement)
745                    && has_delim
746                    && is_selected(field_num, field_mask, ranges, complement)
747                {
748                    if !first_output {
749                        unsafe { buf_extend(buf, output_delim) };
750                    }
751                    unsafe {
752                        buf_extend(
753                            buf,
754                            std::slice::from_raw_parts(
755                                base.add(field_start),
756                                data_len - field_start,
757                            ),
758                        )
759                    };
760                    first_output = false;
761                }
762
763                if !first_output {
764                    unsafe { buf_push(buf, line_delim) };
765                } else if !has_delim {
766                    if !suppress {
767                        unsafe {
768                            buf_extend(
769                                buf,
770                                std::slice::from_raw_parts(
771                                    base.add(line_start),
772                                    data_len - line_start,
773                                ),
774                            );
775                            buf_push(buf, line_delim);
776                        }
777                    }
778                } else {
779                    unsafe { buf_push(buf, line_delim) };
780                }
781            }
782        }
783
784        return;
785    }
786
787    // Fallback: when delim == line_delim, use the two-level scan approach
788    let mut start = 0;
789    for end_pos in memchr_iter(line_delim, data) {
790        let line = &data[start..end_pos];
791        extract_fields_to_buf(
792            line,
793            delim,
794            ranges,
795            output_delim,
796            suppress,
797            max_field,
798            field_mask,
799            line_delim,
800            buf,
801            complement,
802        );
803        start = end_pos + 1;
804    }
805    if start < data.len() {
806        extract_fields_to_buf(
807            &data[start..],
808            delim,
809            ranges,
810            output_delim,
811            suppress,
812            max_field,
813            field_mask,
814            line_delim,
815            buf,
816            complement,
817        );
818    }
819}
820
821// ── Ultra-fast single field extraction ───────────────────────────────────
822
823/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
824/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
825/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
826fn process_single_field(
827    data: &[u8],
828    delim: u8,
829    line_delim: u8,
830    target: usize,
831    suppress: bool,
832    out: &mut impl Write,
833) -> io::Result<()> {
834    let target_idx = target - 1;
835
836    // For single-field extraction, parallelize at 2MB+ to match PARALLEL_THRESHOLD.
837    // The 10MB benchmark regressed from ~7x to ~5.3x when this was set to 32MB.
838    const FIELD_PARALLEL_MIN: usize = 2 * 1024 * 1024;
839
840    if delim != line_delim {
841        // Field 1 fast path: memchr2 single-pass scan.
842        // For field 1, the first delimiter IS the field boundary. Lines without
843        // delimiter are passed through unchanged.
844        if target_idx == 0 && !suppress {
845            if data.len() >= FIELD_PARALLEL_MIN {
846                return single_field1_parallel(data, delim, line_delim, out);
847            }
848            // Sequential: scan with memchr2 into buffer, single write_all.
849            // Faster than writev/IoSlice for moderate data because it produces
850            // one contiguous buffer → one write syscall, and avoids IoSlice
851            // allocation overhead for high-delimiter-density data.
852            let mut buf = Vec::with_capacity(data.len());
853            single_field1_to_buf(data, delim, line_delim, &mut buf);
854            if !buf.is_empty() {
855                out.write_all(&buf)?;
856            }
857            return Ok(());
858        }
859
860        // Two-level approach for field N: outer newline scan + inner delim scan
861        // with early exit at target_idx. Faster than memchr2 single-pass because
862        // we only scan delimiters up to target_idx per line (not all of them).
863        if data.len() >= FIELD_PARALLEL_MIN {
864            let chunks = split_into_chunks(data, line_delim);
865            let results = par_process(&chunks, |chunk| {
866                let mut buf = Vec::with_capacity(chunk.len() / 2);
867                process_single_field_chunk(
868                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
869                );
870                buf
871            });
872            let slices: Vec<IoSlice> = results
873                .iter()
874                .filter(|r| !r.is_empty())
875                .map(|r| IoSlice::new(r))
876                .collect();
877            write_ioslices(out, &slices)?;
878        } else {
879            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
880            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
881            if !buf.is_empty() {
882                out.write_all(&buf)?;
883            }
884        }
885        return Ok(());
886    }
887
888    // Fallback for delim == line_delim: nested loop approach
889    if data.len() >= FIELD_PARALLEL_MIN {
890        let chunks = split_into_chunks(data, line_delim);
891        let results = par_process(&chunks, |chunk| {
892            let mut buf = Vec::with_capacity(chunk.len() / 4);
893            process_single_field_chunk(chunk, delim, target_idx, line_delim, suppress, &mut buf);
894            buf
895        });
896        let slices: Vec<IoSlice> = results
897            .iter()
898            .filter(|r| !r.is_empty())
899            .map(|r| IoSlice::new(r))
900            .collect();
901        write_ioslices(out, &slices)?;
902    } else {
903        let mut buf = Vec::with_capacity(data.len() / 4);
904        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
905        if !buf.is_empty() {
906            out.write_all(&buf)?;
907        }
908    }
909    Ok(())
910}
911
912/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
913/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
914fn process_complement_range(
915    data: &[u8],
916    delim: u8,
917    line_delim: u8,
918    skip_start: usize,
919    skip_end: usize,
920    suppress: bool,
921    out: &mut impl Write,
922) -> io::Result<()> {
923    if data.len() >= PARALLEL_THRESHOLD {
924        let chunks = split_into_chunks(data, line_delim);
925        let results = par_process(&chunks, |chunk| {
926            let mut buf = Vec::with_capacity(chunk.len());
927            complement_range_chunk(
928                chunk, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
929            );
930            buf
931        });
932        let slices: Vec<IoSlice> = results
933            .iter()
934            .filter(|r| !r.is_empty())
935            .map(|r| IoSlice::new(r))
936            .collect();
937        write_ioslices(out, &slices)?;
938    } else {
939        let mut buf = Vec::with_capacity(data.len());
940        complement_range_chunk(
941            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
942        );
943        if !buf.is_empty() {
944            out.write_all(&buf)?;
945        }
946    }
947    Ok(())
948}
949
950/// Process a chunk for complement range extraction.
951fn complement_range_chunk(
952    data: &[u8],
953    delim: u8,
954    skip_start: usize,
955    skip_end: usize,
956    line_delim: u8,
957    suppress: bool,
958    buf: &mut Vec<u8>,
959) {
960    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
961    buf.reserve(data.len());
962    let mut start = 0;
963    for end_pos in memchr_iter(line_delim, data) {
964        let line = &data[start..end_pos];
965        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
966        start = end_pos + 1;
967    }
968    if start < data.len() {
969        complement_range_line(
970            &data[start..],
971            delim,
972            skip_start,
973            skip_end,
974            line_delim,
975            suppress,
976            buf,
977        );
978    }
979}
980
981/// Extract all fields except skip_start..=skip_end from one line.
982/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
983///
984/// Optimized: only scans for enough delimiters to find the skip region boundaries.
985/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
986/// does a single copy of prefix + suffix, avoiding scanning past field 5.
987#[inline(always)]
988fn complement_range_line(
989    line: &[u8],
990    delim: u8,
991    skip_start: usize,
992    skip_end: usize,
993    line_delim: u8,
994    suppress: bool,
995    buf: &mut Vec<u8>,
996) {
997    let len = line.len();
998    if len == 0 {
999        if !suppress {
1000            unsafe { buf_push(buf, line_delim) };
1001        }
1002        return;
1003    }
1004
1005    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
1006    let base = line.as_ptr();
1007
1008    // 1-based field numbers. To skip fields skip_start..=skip_end:
1009    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1010    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1011    //
1012    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1013    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1014
1015    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1016    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1017    let total_need = need_prefix_delims + need_skip_delims;
1018
1019    // Find delimiter positions up to total_need
1020    let mut delim_count: usize = 0;
1021    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1022    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1023
1024    for pos in memchr_iter(delim, line) {
1025        delim_count += 1;
1026        if delim_count == need_prefix_delims {
1027            prefix_end_pos = pos;
1028        }
1029        if delim_count == total_need {
1030            suffix_start_pos = pos + 1;
1031            break;
1032        }
1033    }
1034
1035    if delim_count == 0 {
1036        // No delimiter at all
1037        if !suppress {
1038            unsafe {
1039                buf_extend(buf, line);
1040                buf_push(buf, line_delim);
1041            }
1042        }
1043        return;
1044    }
1045
1046    // Case analysis:
1047    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1048    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1049    // 3. Enough to reach skip_end: prefix + delim + suffix
1050
1051    if delim_count < need_prefix_delims {
1052        // Not enough fields to reach skip region — output entire line
1053        unsafe {
1054            buf_extend(buf, line);
1055            buf_push(buf, line_delim);
1056        }
1057        return;
1058    }
1059
1060    let has_prefix = need_prefix_delims > 0;
1061    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1062
1063    if has_prefix && has_suffix {
1064        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1065        unsafe {
1066            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1067            buf_push(buf, delim);
1068            buf_extend(
1069                buf,
1070                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1071            );
1072            buf_push(buf, line_delim);
1073        }
1074    } else if has_prefix {
1075        // Only prefix, no suffix (skip region extends to end of line)
1076        unsafe {
1077            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1078            buf_push(buf, line_delim);
1079        }
1080    } else if has_suffix {
1081        // No prefix (skip_start == 1), only suffix
1082        unsafe {
1083            buf_extend(
1084                buf,
1085                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1086            );
1087            buf_push(buf, line_delim);
1088        }
1089    } else {
1090        // All fields skipped
1091        unsafe { buf_push(buf, line_delim) };
1092    }
1093}
1094
1095/// Complement single-field extraction: skip one field, output rest unchanged.
1096fn process_complement_single_field(
1097    data: &[u8],
1098    delim: u8,
1099    line_delim: u8,
1100    skip_field: usize,
1101    suppress: bool,
1102    out: &mut impl Write,
1103) -> io::Result<()> {
1104    let skip_idx = skip_field - 1;
1105
1106    if data.len() >= PARALLEL_THRESHOLD {
1107        let chunks = split_into_chunks(data, line_delim);
1108        let results = par_process(&chunks, |chunk| {
1109            let mut buf = Vec::with_capacity(chunk.len());
1110            complement_single_field_chunk(chunk, delim, skip_idx, line_delim, suppress, &mut buf);
1111            buf
1112        });
1113        let slices: Vec<IoSlice> = results
1114            .iter()
1115            .filter(|r| !r.is_empty())
1116            .map(|r| IoSlice::new(r))
1117            .collect();
1118        write_ioslices(out, &slices)?;
1119    } else {
1120        let mut buf = Vec::with_capacity(data.len());
1121        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1122        if !buf.is_empty() {
1123            out.write_all(&buf)?;
1124        }
1125    }
1126    Ok(())
1127}
1128
1129/// Process a chunk for complement single-field extraction using memchr2 single-pass.
1130/// Scans for both delimiter and line_delim in one SIMD pass, tracking delimiter count
1131/// per line. When the skip field's bounding delimiters are found, copies prefix + suffix.
1132/// This eliminates the per-line memchr_iter setup overhead and reduces from two SIMD
1133/// passes (outer newline scan + inner delimiter scan) to one.
1134fn complement_single_field_chunk(
1135    data: &[u8],
1136    delim: u8,
1137    skip_idx: usize,
1138    line_delim: u8,
1139    suppress: bool,
1140    buf: &mut Vec<u8>,
1141) {
1142    // When delim == line_delim, fall back to per-line approach
1143    if delim == line_delim {
1144        buf.reserve(data.len());
1145        let mut start = 0;
1146        for end_pos in memchr_iter(line_delim, data) {
1147            let line = &data[start..end_pos];
1148            complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1149            start = end_pos + 1;
1150        }
1151        if start < data.len() {
1152            complement_single_field_line(
1153                &data[start..],
1154                delim,
1155                skip_idx,
1156                line_delim,
1157                suppress,
1158                buf,
1159            );
1160        }
1161        return;
1162    }
1163
1164    buf.reserve(data.len());
1165    let base = data.as_ptr();
1166    let data_len = data.len();
1167    let need_before = skip_idx; // delimiters before skip field
1168    let need_total = skip_idx + 1; // delimiters to find end of skip field
1169
1170    // Per-line state
1171    let mut line_start: usize = 0;
1172    let mut delim_count: usize = 0;
1173    let mut skip_start_pos: usize = 0;
1174    let mut skip_end_pos: usize = 0;
1175    let mut found_start = need_before == 0; // skip_idx==0 means skip starts at line start
1176    let mut found_end = false;
1177
1178    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1179        let byte = unsafe { *base.add(pos) };
1180
1181        if byte == line_delim {
1182            // End of line: emit based on what we found
1183            if delim_count == 0 {
1184                // No delimiter in line
1185                if !suppress {
1186                    unsafe {
1187                        buf_extend(
1188                            buf,
1189                            std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1190                        );
1191                        buf_push(buf, line_delim);
1192                    }
1193                }
1194            } else if !found_start || delim_count < need_before {
1195                // Not enough delimiters to reach skip field — output entire line
1196                unsafe {
1197                    buf_extend(
1198                        buf,
1199                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1200                    );
1201                    buf_push(buf, line_delim);
1202                }
1203            } else {
1204                let has_prefix = skip_idx > 0;
1205                let has_suffix = found_end && skip_end_pos < pos;
1206
1207                if has_prefix && has_suffix {
1208                    unsafe {
1209                        buf_extend(
1210                            buf,
1211                            std::slice::from_raw_parts(
1212                                base.add(line_start),
1213                                skip_start_pos - 1 - line_start,
1214                            ),
1215                        );
1216                        buf_push(buf, delim);
1217                        buf_extend(
1218                            buf,
1219                            std::slice::from_raw_parts(
1220                                base.add(skip_end_pos + 1),
1221                                pos - skip_end_pos - 1,
1222                            ),
1223                        );
1224                        buf_push(buf, line_delim);
1225                    }
1226                } else if has_prefix {
1227                    unsafe {
1228                        buf_extend(
1229                            buf,
1230                            std::slice::from_raw_parts(
1231                                base.add(line_start),
1232                                skip_start_pos - 1 - line_start,
1233                            ),
1234                        );
1235                        buf_push(buf, line_delim);
1236                    }
1237                } else if has_suffix {
1238                    unsafe {
1239                        buf_extend(
1240                            buf,
1241                            std::slice::from_raw_parts(
1242                                base.add(skip_end_pos + 1),
1243                                pos - skip_end_pos - 1,
1244                            ),
1245                        );
1246                        buf_push(buf, line_delim);
1247                    }
1248                } else {
1249                    unsafe { buf_push(buf, line_delim) };
1250                }
1251            }
1252
1253            // Reset for next line
1254            line_start = pos + 1;
1255            delim_count = 0;
1256            skip_start_pos = 0;
1257            skip_end_pos = 0;
1258            found_start = need_before == 0;
1259            found_end = false;
1260        } else {
1261            // Delimiter found
1262            delim_count += 1;
1263            if delim_count == need_before {
1264                skip_start_pos = pos + 1;
1265                found_start = true;
1266            }
1267            if delim_count == need_total {
1268                skip_end_pos = pos;
1269                found_end = true;
1270            }
1271        }
1272    }
1273
1274    // Handle last line without trailing line_delim
1275    if line_start < data_len {
1276        let pos = data_len;
1277        if delim_count == 0 {
1278            if !suppress {
1279                unsafe {
1280                    buf_extend(
1281                        buf,
1282                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1283                    );
1284                    buf_push(buf, line_delim);
1285                }
1286            }
1287        } else if !found_start || delim_count < need_before {
1288            unsafe {
1289                buf_extend(
1290                    buf,
1291                    std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1292                );
1293                buf_push(buf, line_delim);
1294            }
1295        } else {
1296            let has_prefix = skip_idx > 0;
1297            let has_suffix = found_end && skip_end_pos < pos;
1298
1299            if has_prefix && has_suffix {
1300                unsafe {
1301                    buf_extend(
1302                        buf,
1303                        std::slice::from_raw_parts(
1304                            base.add(line_start),
1305                            skip_start_pos - 1 - line_start,
1306                        ),
1307                    );
1308                    buf_push(buf, delim);
1309                    buf_extend(
1310                        buf,
1311                        std::slice::from_raw_parts(
1312                            base.add(skip_end_pos + 1),
1313                            pos - skip_end_pos - 1,
1314                        ),
1315                    );
1316                    buf_push(buf, line_delim);
1317                }
1318            } else if has_prefix {
1319                unsafe {
1320                    buf_extend(
1321                        buf,
1322                        std::slice::from_raw_parts(
1323                            base.add(line_start),
1324                            skip_start_pos - 1 - line_start,
1325                        ),
1326                    );
1327                    buf_push(buf, line_delim);
1328                }
1329            } else if has_suffix {
1330                unsafe {
1331                    buf_extend(
1332                        buf,
1333                        std::slice::from_raw_parts(
1334                            base.add(skip_end_pos + 1),
1335                            pos - skip_end_pos - 1,
1336                        ),
1337                    );
1338                    buf_push(buf, line_delim);
1339                }
1340            } else {
1341                unsafe { buf_push(buf, line_delim) };
1342            }
1343        }
1344    }
1345}
1346
1347/// Fallback per-line complement single-field extraction (for delim == line_delim).
1348#[inline(always)]
1349fn complement_single_field_line(
1350    line: &[u8],
1351    delim: u8,
1352    skip_idx: usize,
1353    line_delim: u8,
1354    suppress: bool,
1355    buf: &mut Vec<u8>,
1356) {
1357    let len = line.len();
1358    if len == 0 {
1359        if !suppress {
1360            unsafe { buf_push(buf, line_delim) };
1361        }
1362        return;
1363    }
1364
1365    let base = line.as_ptr();
1366    let need_before = skip_idx;
1367    let need_total = skip_idx + 1;
1368
1369    let mut delim_count: usize = 0;
1370    let mut skip_start_pos: usize = 0;
1371    let mut skip_end_pos: usize = len;
1372    let mut found_end = false;
1373
1374    for pos in memchr_iter(delim, line) {
1375        delim_count += 1;
1376        if delim_count == need_before {
1377            skip_start_pos = pos + 1;
1378        }
1379        if delim_count == need_total {
1380            skip_end_pos = pos;
1381            found_end = true;
1382            break;
1383        }
1384    }
1385
1386    if delim_count == 0 {
1387        if !suppress {
1388            unsafe {
1389                buf_extend(buf, line);
1390                buf_push(buf, line_delim);
1391            }
1392        }
1393        return;
1394    }
1395
1396    if delim_count < need_before {
1397        unsafe {
1398            buf_extend(buf, line);
1399            buf_push(buf, line_delim);
1400        }
1401        return;
1402    }
1403
1404    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
1405    let has_suffix = found_end && skip_end_pos < len;
1406
1407    if has_prefix && has_suffix {
1408        unsafe {
1409            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1410            buf_push(buf, delim);
1411            buf_extend(
1412                buf,
1413                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1414            );
1415            buf_push(buf, line_delim);
1416        }
1417    } else if has_prefix {
1418        unsafe {
1419            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1420            buf_push(buf, line_delim);
1421        }
1422    } else if has_suffix {
1423        unsafe {
1424            buf_extend(
1425                buf,
1426                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1427            );
1428            buf_push(buf, line_delim);
1429        }
1430    } else {
1431        unsafe { buf_push(buf, line_delim) };
1432    }
1433}
1434
1435/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1436/// Zero-copy for the non-parallel path: identifies the truncation point per line
1437/// and writes contiguous runs directly from the source data.
1438fn process_fields_prefix(
1439    data: &[u8],
1440    delim: u8,
1441    line_delim: u8,
1442    last_field: usize,
1443    suppress: bool,
1444    out: &mut impl Write,
1445) -> io::Result<()> {
1446    if data.len() >= PARALLEL_THRESHOLD {
1447        let chunks = split_into_chunks(data, line_delim);
1448        let results = par_process(&chunks, |chunk| {
1449            let mut buf = Vec::with_capacity(chunk.len());
1450            fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
1451            buf
1452        });
1453        let slices: Vec<IoSlice> = results
1454            .iter()
1455            .filter(|r| !r.is_empty())
1456            .map(|r| IoSlice::new(r))
1457            .collect();
1458        write_ioslices(out, &slices)?;
1459    } else if !suppress {
1460        // Zero-copy fast path: scan for truncation points, write runs from source.
1461        // When suppress is false, every line is output (with or without delimiter).
1462        // Most lines have enough fields, so the output is often identical to input.
1463        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1464    } else {
1465        let mut buf = Vec::with_capacity(data.len());
1466        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1467        if !buf.is_empty() {
1468            out.write_all(&buf)?;
1469        }
1470    }
1471    Ok(())
1472}
1473
1474/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1475/// directly into the source data, flushing in MAX_IOV-sized batches.
1476/// For lines where the Nth delimiter exists, we truncate at that point.
1477/// For lines with fewer fields, we output them unchanged (contiguous run).
1478/// Lines without any delimiter are output unchanged (suppress=false assumed).
1479#[inline]
1480fn fields_prefix_zerocopy(
1481    data: &[u8],
1482    delim: u8,
1483    line_delim: u8,
1484    last_field: usize,
1485    out: &mut impl Write,
1486) -> io::Result<()> {
1487    let newline_buf: [u8; 1] = [line_delim];
1488    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1489    let mut start = 0;
1490    let mut run_start: usize = 0;
1491
1492    for end_pos in memchr_iter(line_delim, data) {
1493        let line = &data[start..end_pos];
1494        let mut field_count = 1;
1495        let mut truncate_at: Option<usize> = None;
1496        for dpos in memchr_iter(delim, line) {
1497            if field_count >= last_field {
1498                truncate_at = Some(start + dpos);
1499                break;
1500            }
1501            field_count += 1;
1502        }
1503
1504        if let Some(trunc_pos) = truncate_at {
1505            if run_start < start {
1506                iov.push(IoSlice::new(&data[run_start..start]));
1507            }
1508            iov.push(IoSlice::new(&data[start..trunc_pos]));
1509            iov.push(IoSlice::new(&newline_buf));
1510            run_start = end_pos + 1;
1511
1512            if iov.len() >= MAX_IOV - 2 {
1513                write_ioslices(out, &iov)?;
1514                iov.clear();
1515            }
1516        }
1517        start = end_pos + 1;
1518    }
1519    // Handle last line without terminator
1520    if start < data.len() {
1521        let line = &data[start..];
1522        let mut field_count = 1;
1523        let mut truncate_at: Option<usize> = None;
1524        for dpos in memchr_iter(delim, line) {
1525            if field_count >= last_field {
1526                truncate_at = Some(start + dpos);
1527                break;
1528            }
1529            field_count += 1;
1530        }
1531        if let Some(trunc_pos) = truncate_at {
1532            if run_start < start {
1533                iov.push(IoSlice::new(&data[run_start..start]));
1534            }
1535            iov.push(IoSlice::new(&data[start..trunc_pos]));
1536            iov.push(IoSlice::new(&newline_buf));
1537            if !iov.is_empty() {
1538                write_ioslices(out, &iov)?;
1539            }
1540            return Ok(());
1541        }
1542    }
1543    // Flush remaining contiguous run
1544    if run_start < data.len() {
1545        iov.push(IoSlice::new(&data[run_start..]));
1546        if !data.is_empty() && *data.last().unwrap() != line_delim {
1547            iov.push(IoSlice::new(&newline_buf));
1548        }
1549    }
1550    if !iov.is_empty() {
1551        write_ioslices(out, &iov)?;
1552    }
1553    Ok(())
1554}
1555
1556/// Process a chunk for contiguous from-start field range extraction.
1557fn fields_prefix_chunk(
1558    data: &[u8],
1559    delim: u8,
1560    line_delim: u8,
1561    last_field: usize,
1562    suppress: bool,
1563    buf: &mut Vec<u8>,
1564) {
1565    buf.reserve(data.len());
1566    let mut start = 0;
1567    for end_pos in memchr_iter(line_delim, data) {
1568        let line = &data[start..end_pos];
1569        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1570        start = end_pos + 1;
1571    }
1572    if start < data.len() {
1573        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1574    }
1575}
1576
1577/// Extract first N fields from one line (contiguous from-start range).
1578/// Uses memchr SIMD for delimiter scanning on all line sizes.
1579#[inline(always)]
1580fn fields_prefix_line(
1581    line: &[u8],
1582    delim: u8,
1583    line_delim: u8,
1584    last_field: usize,
1585    suppress: bool,
1586    buf: &mut Vec<u8>,
1587) {
1588    let len = line.len();
1589    if len == 0 {
1590        if !suppress {
1591            unsafe { buf_push(buf, line_delim) };
1592        }
1593        return;
1594    }
1595
1596    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1597    let base = line.as_ptr();
1598
1599    let mut field_count = 1usize;
1600    let mut has_delim = false;
1601
1602    for pos in memchr_iter(delim, line) {
1603        has_delim = true;
1604        if field_count >= last_field {
1605            unsafe {
1606                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1607                buf_push(buf, line_delim);
1608            }
1609            return;
1610        }
1611        field_count += 1;
1612    }
1613
1614    if !has_delim {
1615        if !suppress {
1616            unsafe {
1617                buf_extend(buf, line);
1618                buf_push(buf, line_delim);
1619            }
1620        }
1621        return;
1622    }
1623
1624    unsafe {
1625        buf_extend(buf, line);
1626        buf_push(buf, line_delim);
1627    }
1628}
1629
1630/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1631fn process_fields_suffix(
1632    data: &[u8],
1633    delim: u8,
1634    line_delim: u8,
1635    start_field: usize,
1636    suppress: bool,
1637    out: &mut impl Write,
1638) -> io::Result<()> {
1639    if data.len() >= PARALLEL_THRESHOLD {
1640        let chunks = split_into_chunks(data, line_delim);
1641        let results = par_process(&chunks, |chunk| {
1642            let mut buf = Vec::with_capacity(chunk.len());
1643            fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
1644            buf
1645        });
1646        let slices: Vec<IoSlice> = results
1647            .iter()
1648            .filter(|r| !r.is_empty())
1649            .map(|r| IoSlice::new(r))
1650            .collect();
1651        write_ioslices(out, &slices)?;
1652    } else {
1653        let mut buf = Vec::with_capacity(data.len());
1654        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1655        if !buf.is_empty() {
1656            out.write_all(&buf)?;
1657        }
1658    }
1659    Ok(())
1660}
1661
1662/// Process a chunk for open-ended field suffix extraction.
1663fn fields_suffix_chunk(
1664    data: &[u8],
1665    delim: u8,
1666    line_delim: u8,
1667    start_field: usize,
1668    suppress: bool,
1669    buf: &mut Vec<u8>,
1670) {
1671    buf.reserve(data.len());
1672    let mut start = 0;
1673    for end_pos in memchr_iter(line_delim, data) {
1674        let line = &data[start..end_pos];
1675        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1676        start = end_pos + 1;
1677    }
1678    if start < data.len() {
1679        fields_suffix_line(
1680            &data[start..],
1681            delim,
1682            line_delim,
1683            start_field,
1684            suppress,
1685            buf,
1686        );
1687    }
1688}
1689
1690/// Extract fields from start_field to end from one line.
1691/// Uses memchr SIMD for delimiter scanning on all line sizes.
1692#[inline(always)]
1693fn fields_suffix_line(
1694    line: &[u8],
1695    delim: u8,
1696    line_delim: u8,
1697    start_field: usize,
1698    suppress: bool,
1699    buf: &mut Vec<u8>,
1700) {
1701    let len = line.len();
1702    if len == 0 {
1703        if !suppress {
1704            unsafe { buf_push(buf, line_delim) };
1705        }
1706        return;
1707    }
1708
1709    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
1710    let base = line.as_ptr();
1711
1712    let skip_delims = start_field - 1;
1713    let mut delim_count = 0usize;
1714    let mut has_delim = false;
1715
1716    for pos in memchr_iter(delim, line) {
1717        has_delim = true;
1718        delim_count += 1;
1719        if delim_count >= skip_delims {
1720            unsafe {
1721                buf_extend(
1722                    buf,
1723                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1724                );
1725                buf_push(buf, line_delim);
1726            }
1727            return;
1728        }
1729    }
1730
1731    if !has_delim {
1732        if !suppress {
1733            unsafe {
1734                buf_extend(buf, line);
1735                buf_push(buf, line_delim);
1736            }
1737        }
1738        return;
1739    }
1740
1741    // Fewer delimiters than needed
1742    unsafe { buf_push(buf, line_delim) };
1743}
1744
1745/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1746/// Optimized: skip to start_field using memchr, then output until end_field.
1747fn process_fields_mid_range(
1748    data: &[u8],
1749    delim: u8,
1750    line_delim: u8,
1751    start_field: usize,
1752    end_field: usize,
1753    suppress: bool,
1754    out: &mut impl Write,
1755) -> io::Result<()> {
1756    if data.len() >= PARALLEL_THRESHOLD {
1757        let chunks = split_into_chunks(data, line_delim);
1758        let results = par_process(&chunks, |chunk| {
1759            let mut buf = Vec::with_capacity(chunk.len());
1760            fields_mid_range_chunk(
1761                chunk,
1762                delim,
1763                line_delim,
1764                start_field,
1765                end_field,
1766                suppress,
1767                &mut buf,
1768            );
1769            buf
1770        });
1771        let slices: Vec<IoSlice> = results
1772            .iter()
1773            .filter(|r| !r.is_empty())
1774            .map(|r| IoSlice::new(r))
1775            .collect();
1776        write_ioslices(out, &slices)?;
1777    } else {
1778        let mut buf = Vec::with_capacity(data.len());
1779        fields_mid_range_chunk(
1780            data,
1781            delim,
1782            line_delim,
1783            start_field,
1784            end_field,
1785            suppress,
1786            &mut buf,
1787        );
1788        if !buf.is_empty() {
1789            out.write_all(&buf)?;
1790        }
1791    }
1792    Ok(())
1793}
1794
1795/// Process a chunk for contiguous mid-range field extraction.
1796fn fields_mid_range_chunk(
1797    data: &[u8],
1798    delim: u8,
1799    line_delim: u8,
1800    start_field: usize,
1801    end_field: usize,
1802    suppress: bool,
1803    buf: &mut Vec<u8>,
1804) {
1805    buf.reserve(data.len());
1806    let mut start = 0;
1807    for end_pos in memchr_iter(line_delim, data) {
1808        let line = &data[start..end_pos];
1809        fields_mid_range_line(
1810            line,
1811            delim,
1812            line_delim,
1813            start_field,
1814            end_field,
1815            suppress,
1816            buf,
1817        );
1818        start = end_pos + 1;
1819    }
1820    if start < data.len() {
1821        fields_mid_range_line(
1822            &data[start..],
1823            delim,
1824            line_delim,
1825            start_field,
1826            end_field,
1827            suppress,
1828            buf,
1829        );
1830    }
1831}
1832
1833/// Extract fields start_field..=end_field from one line.
1834/// Uses scalar byte scanning for short lines, memchr_iter for longer.
1835/// Raw pointer arithmetic to eliminate bounds checking.
1836#[inline(always)]
1837fn fields_mid_range_line(
1838    line: &[u8],
1839    delim: u8,
1840    line_delim: u8,
1841    start_field: usize,
1842    end_field: usize,
1843    suppress: bool,
1844    buf: &mut Vec<u8>,
1845) {
1846    let len = line.len();
1847    if len == 0 {
1848        if !suppress {
1849            unsafe { buf_push(buf, line_delim) };
1850        }
1851        return;
1852    }
1853
1854    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves data.len()
1855    let base = line.as_ptr();
1856
1857    // Count delimiters to find start_field and end_field boundaries
1858    let skip_before = start_field - 1; // delimiters to skip before start_field
1859    let field_span = end_field - start_field; // additional delimiters within the range
1860    let target_end_delim = skip_before + field_span + 1;
1861    let mut delim_count = 0;
1862    let mut range_start = 0;
1863    let mut has_delim = false;
1864
1865    for pos in memchr_iter(delim, line) {
1866        has_delim = true;
1867        delim_count += 1;
1868        if delim_count == skip_before {
1869            range_start = pos + 1;
1870        }
1871        if delim_count == target_end_delim {
1872            if skip_before == 0 {
1873                range_start = 0;
1874            }
1875            unsafe {
1876                buf_extend(
1877                    buf,
1878                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
1879                );
1880                buf_push(buf, line_delim);
1881            }
1882            return;
1883        }
1884    }
1885
1886    if !has_delim {
1887        if !suppress {
1888            unsafe {
1889                buf_extend(buf, line);
1890                buf_push(buf, line_delim);
1891            }
1892        }
1893        return;
1894    }
1895
1896    // Line has delimiters but fewer fields than end_field
1897    if delim_count >= skip_before {
1898        // We have at least start_field, output from range_start to end
1899        if skip_before == 0 {
1900            range_start = 0;
1901        }
1902        unsafe {
1903            buf_extend(
1904                buf,
1905                std::slice::from_raw_parts(base.add(range_start), len - range_start),
1906            );
1907            buf_push(buf, line_delim);
1908        }
1909    } else {
1910        // Not enough fields even for start_field — output empty line
1911        unsafe { buf_push(buf, line_delim) };
1912    }
1913}
1914
1915/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
1916/// directly into the source data, flushing in MAX_IOV-sized batches.
1917/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
1918///
1919/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
1920/// Parallel field-1 extraction for large data using memchr2 single-pass.
1921/// Splits data into per-thread chunks, each chunk extracts field 1 using
1922/// memchr2(delim, newline) which finds the first special byte in one scan.
1923/// For field 1: first special byte is either the delimiter (field end) or
1924/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
1925fn single_field1_parallel(
1926    data: &[u8],
1927    delim: u8,
1928    line_delim: u8,
1929    out: &mut impl Write,
1930) -> io::Result<()> {
1931    let chunks = split_into_chunks(data, line_delim);
1932    let results = par_process(&chunks, |chunk| {
1933        let mut buf = Vec::with_capacity(chunk.len());
1934        single_field1_to_buf(chunk, delim, line_delim, &mut buf);
1935        buf
1936    });
1937    let slices: Vec<IoSlice> = results
1938        .iter()
1939        .filter(|r| !r.is_empty())
1940        .map(|r| IoSlice::new(r))
1941        .collect();
1942    write_ioslices(out, &slices)
1943}
1944
1945/// Extract field 1 from a chunk using memchr2 single-pass scanning.
1946/// Uses memchr2(delim, line_delim) to find the first special byte per line:
1947/// - If delimiter: field 1 = data[line_start..delim_pos], skip to next newline
1948/// - If newline: no delimiter on this line, output unchanged
1949/// This scans ~N total bytes vs ~1.5N for two-level (outer newline + inner delimiter).
1950#[inline]
1951fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
1952    use memchr::memchr2;
1953    buf.reserve(data.len());
1954    let mut pos = 0;
1955    while pos < data.len() {
1956        match memchr2(delim, line_delim, &data[pos..]) {
1957            None => {
1958                // Rest is a partial line, no delimiter — output as-is
1959                unsafe {
1960                    buf_extend(buf, &data[pos..]);
1961                }
1962                break;
1963            }
1964            Some(offset) => {
1965                let actual = pos + offset;
1966                if data[actual] == line_delim {
1967                    // No delimiter on this line — output entire line including newline
1968                    unsafe {
1969                        buf_extend(buf, &data[pos..actual + 1]);
1970                    }
1971                    pos = actual + 1;
1972                } else {
1973                    // Delimiter found — output field 1 (up to delimiter) + newline
1974                    unsafe {
1975                        buf_extend(buf, &data[pos..actual]);
1976                        buf_push(buf, line_delim);
1977                    }
1978                    // Skip to next newline
1979                    match memchr::memchr(line_delim, &data[actual + 1..]) {
1980                        None => {
1981                            pos = data.len();
1982                        }
1983                        Some(nl_off) => {
1984                            pos = actual + 1 + nl_off + 1;
1985                        }
1986                    }
1987                }
1988            }
1989        }
1990    }
1991}
1992
1993/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
1994/// directly into the source data. Uses two-level scan: outer memchr(newline)
1995/// for the first delimiter. This is faster than memchr2 for SMALL data because
1996/// the inner scan exits after the FIRST delimiter, skipping all
1997/// subsequent delimiters on the line.
1998///
1999/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
2000/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
2001#[inline]
2002#[allow(dead_code)]
2003fn single_field1_zerocopy(
2004    data: &[u8],
2005    delim: u8,
2006    line_delim: u8,
2007    out: &mut impl Write,
2008) -> io::Result<()> {
2009    let newline_buf: [u8; 1] = [line_delim];
2010
2011    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2012    let mut run_start: usize = 0;
2013    let mut start = 0;
2014
2015    for end_pos in memchr_iter(line_delim, data) {
2016        let line = &data[start..end_pos];
2017        if let Some(dp) = memchr::memchr(delim, line) {
2018            // Line has delimiter — truncate at first delimiter.
2019            // Flush current contiguous run, then add truncated field + newline.
2020            if run_start < start {
2021                iov.push(IoSlice::new(&data[run_start..start]));
2022            }
2023            iov.push(IoSlice::new(&data[start..start + dp]));
2024            iov.push(IoSlice::new(&newline_buf));
2025            run_start = end_pos + 1;
2026
2027            if iov.len() >= MAX_IOV - 2 {
2028                write_ioslices(out, &iov)?;
2029                iov.clear();
2030            }
2031        }
2032        // else: no delimiter in line, output unchanged (stays in contiguous run)
2033        start = end_pos + 1;
2034    }
2035
2036    // Handle last line (no trailing newline)
2037    if start < data.len() {
2038        let line = &data[start..];
2039        if let Some(dp) = memchr::memchr(delim, line) {
2040            if run_start < start {
2041                iov.push(IoSlice::new(&data[run_start..start]));
2042            }
2043            iov.push(IoSlice::new(&data[start..start + dp]));
2044            iov.push(IoSlice::new(&newline_buf));
2045            if !iov.is_empty() {
2046                write_ioslices(out, &iov)?;
2047            }
2048            return Ok(());
2049        }
2050    }
2051
2052    // Flush remaining contiguous run
2053    if run_start < data.len() {
2054        iov.push(IoSlice::new(&data[run_start..]));
2055        if !data.is_empty() && *data.last().unwrap() != line_delim {
2056            iov.push(IoSlice::new(&newline_buf));
2057        }
2058    }
2059    if !iov.is_empty() {
2060        write_ioslices(out, &iov)?;
2061    }
2062    Ok(())
2063}
2064
2065/// Process a chunk of data for single-field extraction.
2066fn process_single_field_chunk(
2067    data: &[u8],
2068    delim: u8,
2069    target_idx: usize,
2070    line_delim: u8,
2071    suppress: bool,
2072    buf: &mut Vec<u8>,
2073) {
2074    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
2075    buf.reserve(data.len());
2076    let mut start = 0;
2077    for end_pos in memchr_iter(line_delim, data) {
2078        let line = &data[start..end_pos];
2079        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
2080        start = end_pos + 1;
2081    }
2082    if start < data.len() {
2083        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
2084    }
2085}
2086
2087/// Extract a single field from one line.
2088/// For short lines (< 256 bytes), uses direct scalar scanning to avoid memchr overhead.
2089/// For longer lines, uses memchr for SIMD-accelerated scanning.
2090/// Raw pointer arithmetic eliminates per-field bounds checking.
2091#[inline(always)]
2092fn extract_single_field_line(
2093    line: &[u8],
2094    delim: u8,
2095    target_idx: usize,
2096    line_delim: u8,
2097    suppress: bool,
2098    buf: &mut Vec<u8>,
2099) {
2100    let len = line.len();
2101    if len == 0 {
2102        if !suppress {
2103            unsafe { buf_push(buf, line_delim) };
2104        }
2105        return;
2106    }
2107
2108    // Note: no per-line buf.reserve — process_single_field_chunk already reserves data.len()
2109    let base = line.as_ptr();
2110
2111    // Ultra-fast path for first field: single memchr
2112    if target_idx == 0 {
2113        match memchr::memchr(delim, line) {
2114            Some(pos) => unsafe {
2115                buf_extend(buf, std::slice::from_raw_parts(base, pos));
2116                buf_push(buf, line_delim);
2117            },
2118            None => {
2119                if !suppress {
2120                    unsafe {
2121                        buf_extend(buf, line);
2122                        buf_push(buf, line_delim);
2123                    }
2124                }
2125            }
2126        }
2127        return;
2128    }
2129
2130    // Use memchr SIMD for all line sizes (faster than scalar even for short lines)
2131    let mut field_start = 0;
2132    let mut field_idx = 0;
2133    let mut has_delim = false;
2134
2135    for pos in memchr_iter(delim, line) {
2136        has_delim = true;
2137        if field_idx == target_idx {
2138            unsafe {
2139                buf_extend(
2140                    buf,
2141                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
2142                );
2143                buf_push(buf, line_delim);
2144            }
2145            return;
2146        }
2147        field_idx += 1;
2148        field_start = pos + 1;
2149    }
2150
2151    if !has_delim {
2152        if !suppress {
2153            unsafe {
2154                buf_extend(buf, line);
2155                buf_push(buf, line_delim);
2156            }
2157        }
2158        return;
2159    }
2160
2161    if field_idx == target_idx {
2162        unsafe {
2163            buf_extend(
2164                buf,
2165                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2166            );
2167            buf_push(buf, line_delim);
2168        }
2169    } else {
2170        unsafe { buf_push(buf, line_delim) };
2171    }
2172}
2173
2174/// Extract fields from a single line into the output buffer.
2175/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2176/// Raw pointer arithmetic eliminates per-field bounds checking.
2177#[inline(always)]
2178fn extract_fields_to_buf(
2179    line: &[u8],
2180    delim: u8,
2181    ranges: &[Range],
2182    output_delim: &[u8],
2183    suppress: bool,
2184    max_field: usize,
2185    field_mask: u64,
2186    line_delim: u8,
2187    buf: &mut Vec<u8>,
2188    complement: bool,
2189) {
2190    let len = line.len();
2191
2192    if len == 0 {
2193        if !suppress {
2194            buf.push(line_delim);
2195        }
2196        return;
2197    }
2198
2199    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2200    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2201    let needed = len + output_delim.len() * 16 + 1;
2202    if buf.capacity() - buf.len() < needed {
2203        buf.reserve(needed);
2204    }
2205
2206    let base = line.as_ptr();
2207    let mut field_num: usize = 1;
2208    let mut field_start: usize = 0;
2209    let mut first_output = true;
2210    let mut has_delim = false;
2211
2212    // Use memchr SIMD for all line sizes
2213    for delim_pos in memchr_iter(delim, line) {
2214        has_delim = true;
2215
2216        if is_selected(field_num, field_mask, ranges, complement) {
2217            if !first_output {
2218                unsafe { buf_extend(buf, output_delim) };
2219            }
2220            unsafe {
2221                buf_extend(
2222                    buf,
2223                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2224                )
2225            };
2226            first_output = false;
2227        }
2228
2229        field_num += 1;
2230        field_start = delim_pos + 1;
2231
2232        if field_num > max_field {
2233            break;
2234        }
2235    }
2236
2237    // Last field
2238    if (field_num <= max_field || complement)
2239        && has_delim
2240        && is_selected(field_num, field_mask, ranges, complement)
2241    {
2242        if !first_output {
2243            unsafe { buf_extend(buf, output_delim) };
2244        }
2245        unsafe {
2246            buf_extend(
2247                buf,
2248                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2249            )
2250        };
2251        first_output = false;
2252    }
2253
2254    if !first_output {
2255        unsafe { buf_push(buf, line_delim) };
2256    } else if !has_delim {
2257        if !suppress {
2258            unsafe {
2259                buf_extend(buf, line);
2260                buf_push(buf, line_delim);
2261            }
2262        }
2263    } else {
2264        unsafe { buf_push(buf, line_delim) };
2265    }
2266}
2267
2268// ── Fast path: byte/char extraction with batched output ──────────────────
2269
2270/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2271/// Zero-copy: writes directly from the source data using output runs.
2272/// For lines shorter than max_bytes, the output is identical to the input,
2273/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2274fn process_bytes_from_start(
2275    data: &[u8],
2276    max_bytes: usize,
2277    line_delim: u8,
2278    out: &mut impl Write,
2279) -> io::Result<()> {
2280    // Fast path: if all lines fit within max_bytes, output = input.
2281    // Single memchr scan with early exit on first oversized line.
2282    // For `-b1-100` on CSV where average line is < 100 bytes, this
2283    // skips all per-line processing and outputs the data directly.
2284    if max_bytes > 0 && max_bytes < usize::MAX {
2285        let mut start = 0;
2286        let mut all_fit = true;
2287        for pos in memchr_iter(line_delim, data) {
2288            if pos - start > max_bytes {
2289                all_fit = false;
2290                break;
2291            }
2292            start = pos + 1;
2293        }
2294        // Check last line (no trailing delimiter)
2295        if all_fit && start < data.len() && data.len() - start > max_bytes {
2296            all_fit = false;
2297        }
2298        if all_fit {
2299            // All lines fit: output = input. Handle missing trailing delimiter.
2300            if !data.is_empty() && data[data.len() - 1] == line_delim {
2301                return out.write_all(data);
2302            } else if !data.is_empty() {
2303                out.write_all(data)?;
2304                return out.write_all(&[line_delim]);
2305            }
2306            return Ok(());
2307        }
2308    }
2309
2310    if data.len() >= PARALLEL_THRESHOLD {
2311        let chunks = split_into_chunks(data, line_delim);
2312        let results = par_process(&chunks, |chunk| {
2313            let est_out = (chunk.len() / 4).max(max_bytes + 2);
2314            let mut buf = Vec::with_capacity(est_out.min(chunk.len()));
2315            bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
2316            buf
2317        });
2318        let slices: Vec<IoSlice> = results
2319            .iter()
2320            .filter(|r| !r.is_empty())
2321            .map(|r| IoSlice::new(r))
2322            .collect();
2323        write_ioslices(out, &slices)?;
2324    } else {
2325        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2326        // because every line gets truncated, creating 3 IoSlice entries per line.
2327        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2328        // managing millions of IoSlice entries through the kernel.
2329        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2330        if max_bytes <= 512 {
2331            // Estimate output size without scanning: output <= data.len(),
2332            // typically ~data.len()/4 for short max_bytes on longer lines.
2333            let est_out = (data.len() / 4).max(max_bytes + 2);
2334            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2335            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2336            if !buf.is_empty() {
2337                out.write_all(&buf)?;
2338            }
2339        } else {
2340            // Zero-copy path: track contiguous output runs and write directly from source.
2341            // For lines <= max_bytes, we include them as-is (no copy needed).
2342            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2343            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2344        }
2345    }
2346    Ok(())
2347}
2348
2349/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2350/// directly into the source data, flushing in MAX_IOV-sized batches.
2351/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2352/// produce two IoSlices (truncated data + newline).
2353#[inline]
2354fn bytes_from_start_zerocopy(
2355    data: &[u8],
2356    max_bytes: usize,
2357    line_delim: u8,
2358    out: &mut impl Write,
2359) -> io::Result<()> {
2360    let newline_buf: [u8; 1] = [line_delim];
2361    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2362    let mut start = 0;
2363    let mut run_start: usize = 0;
2364
2365    for pos in memchr_iter(line_delim, data) {
2366        let line_len = pos - start;
2367        if line_len > max_bytes {
2368            // This line needs truncation
2369            if run_start < start {
2370                iov.push(IoSlice::new(&data[run_start..start]));
2371            }
2372            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2373            iov.push(IoSlice::new(&newline_buf));
2374            run_start = pos + 1;
2375
2376            if iov.len() >= MAX_IOV - 2 {
2377                write_ioslices(out, &iov)?;
2378                iov.clear();
2379            }
2380        }
2381        start = pos + 1;
2382    }
2383    // Handle last line without terminator
2384    if start < data.len() {
2385        let line_len = data.len() - start;
2386        if line_len > max_bytes {
2387            if run_start < start {
2388                iov.push(IoSlice::new(&data[run_start..start]));
2389            }
2390            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2391            iov.push(IoSlice::new(&newline_buf));
2392            if !iov.is_empty() {
2393                write_ioslices(out, &iov)?;
2394            }
2395            return Ok(());
2396        }
2397    }
2398    // Flush remaining contiguous run
2399    if run_start < data.len() {
2400        iov.push(IoSlice::new(&data[run_start..]));
2401        if !data.is_empty() && *data.last().unwrap() != line_delim {
2402            iov.push(IoSlice::new(&newline_buf));
2403        }
2404    }
2405    if !iov.is_empty() {
2406        write_ioslices(out, &iov)?;
2407    }
2408    Ok(())
2409}
2410
2411/// Process a chunk for from-start byte range extraction (parallel path).
2412/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2413/// Pre-reserves data.len() (output never exceeds input), then uses a single
2414/// write pointer with deferred set_len — no per-line capacity checks.
2415#[inline]
2416fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2417    // Output is always <= input size (we only truncate, never expand).
2418    // Single reserve eliminates ALL per-line capacity checks.
2419    buf.reserve(data.len());
2420
2421    let src = data.as_ptr();
2422    let dst_base = buf.as_mut_ptr();
2423    let mut wp = buf.len();
2424    let mut start = 0;
2425
2426    for pos in memchr_iter(line_delim, data) {
2427        let line_len = pos - start;
2428        let take = line_len.min(max_bytes);
2429        unsafe {
2430            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2431            *dst_base.add(wp + take) = line_delim;
2432        }
2433        wp += take + 1;
2434        start = pos + 1;
2435    }
2436    // Handle last line without terminator
2437    if start < data.len() {
2438        let line_len = data.len() - start;
2439        let take = line_len.min(max_bytes);
2440        unsafe {
2441            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2442            *dst_base.add(wp + take) = line_delim;
2443        }
2444        wp += take + 1;
2445    }
2446    unsafe { buf.set_len(wp) };
2447}
2448
2449/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2450fn process_bytes_from_offset(
2451    data: &[u8],
2452    skip_bytes: usize,
2453    line_delim: u8,
2454    out: &mut impl Write,
2455) -> io::Result<()> {
2456    if data.len() >= PARALLEL_THRESHOLD {
2457        let chunks = split_into_chunks(data, line_delim);
2458        let results = par_process(&chunks, |chunk| {
2459            let mut buf = Vec::with_capacity(chunk.len());
2460            bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
2461            buf
2462        });
2463        let slices: Vec<IoSlice> = results
2464            .iter()
2465            .filter(|r| !r.is_empty())
2466            .map(|r| IoSlice::new(r))
2467            .collect();
2468        write_ioslices(out, &slices)?;
2469    } else {
2470        // Zero-copy: write suffix of each line directly from source
2471        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2472    }
2473    Ok(())
2474}
2475
2476/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2477/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2478/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2479#[inline]
2480fn bytes_from_offset_zerocopy(
2481    data: &[u8],
2482    skip_bytes: usize,
2483    line_delim: u8,
2484    out: &mut impl Write,
2485) -> io::Result<()> {
2486    let delim_buf = [line_delim];
2487    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2488
2489    let mut start = 0;
2490    for pos in memchr_iter(line_delim, data) {
2491        let line_len = pos - start;
2492        if line_len > skip_bytes {
2493            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2494        }
2495        iov.push(IoSlice::new(&delim_buf));
2496        // Flush when approaching MAX_IOV to avoid oversized writev
2497        if iov.len() >= MAX_IOV - 1 {
2498            write_ioslices(out, &iov)?;
2499            iov.clear();
2500        }
2501        start = pos + 1;
2502    }
2503    if start < data.len() {
2504        let line_len = data.len() - start;
2505        if line_len > skip_bytes {
2506            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2507        }
2508        iov.push(IoSlice::new(&delim_buf));
2509    }
2510    if !iov.is_empty() {
2511        write_ioslices(out, &iov)?;
2512    }
2513    Ok(())
2514}
2515
2516/// Process a chunk for from-offset byte range extraction.
2517/// Single reserve + deferred set_len for zero per-line overhead.
2518#[inline]
2519fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2520    buf.reserve(data.len());
2521
2522    let src = data.as_ptr();
2523    let dst_base = buf.as_mut_ptr();
2524    let mut wp = buf.len();
2525    let mut start = 0;
2526
2527    for pos in memchr_iter(line_delim, data) {
2528        let line_len = pos - start;
2529        if line_len > skip_bytes {
2530            let take = line_len - skip_bytes;
2531            unsafe {
2532                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2533            }
2534            wp += take;
2535        }
2536        unsafe {
2537            *dst_base.add(wp) = line_delim;
2538        }
2539        wp += 1;
2540        start = pos + 1;
2541    }
2542    if start < data.len() {
2543        let line_len = data.len() - start;
2544        if line_len > skip_bytes {
2545            let take = line_len - skip_bytes;
2546            unsafe {
2547                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2548            }
2549            wp += take;
2550        }
2551        unsafe {
2552            *dst_base.add(wp) = line_delim;
2553        }
2554        wp += 1;
2555    }
2556    unsafe { buf.set_len(wp) };
2557}
2558
2559/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2560fn process_bytes_mid_range(
2561    data: &[u8],
2562    start_byte: usize,
2563    end_byte: usize,
2564    line_delim: u8,
2565    out: &mut impl Write,
2566) -> io::Result<()> {
2567    let skip = start_byte.saturating_sub(1);
2568
2569    if data.len() >= PARALLEL_THRESHOLD {
2570        let chunks = split_into_chunks(data, line_delim);
2571        let results = par_process(&chunks, |chunk| {
2572            let mut buf = Vec::with_capacity(chunk.len());
2573            bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, &mut buf);
2574            buf
2575        });
2576        let slices: Vec<IoSlice> = results
2577            .iter()
2578            .filter(|r| !r.is_empty())
2579            .map(|r| IoSlice::new(r))
2580            .collect();
2581        write_ioslices(out, &slices)?;
2582    } else {
2583        let mut buf = Vec::with_capacity(data.len());
2584        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2585        if !buf.is_empty() {
2586            out.write_all(&buf)?;
2587        }
2588    }
2589    Ok(())
2590}
2591
2592/// Process a chunk for mid-range byte extraction.
2593/// For each line, output bytes skip..min(line_len, end_byte).
2594/// Single reserve + deferred set_len.
2595#[inline]
2596fn bytes_mid_range_chunk(
2597    data: &[u8],
2598    skip: usize,
2599    end_byte: usize,
2600    line_delim: u8,
2601    buf: &mut Vec<u8>,
2602) {
2603    buf.reserve(data.len());
2604
2605    let src = data.as_ptr();
2606    let dst_base = buf.as_mut_ptr();
2607    let mut wp = buf.len();
2608    let mut start = 0;
2609
2610    for pos in memchr_iter(line_delim, data) {
2611        let line_len = pos - start;
2612        if line_len > skip {
2613            let take_end = line_len.min(end_byte);
2614            let take = take_end - skip;
2615            unsafe {
2616                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2617            }
2618            wp += take;
2619        }
2620        unsafe {
2621            *dst_base.add(wp) = line_delim;
2622        }
2623        wp += 1;
2624        start = pos + 1;
2625    }
2626    if start < data.len() {
2627        let line_len = data.len() - start;
2628        if line_len > skip {
2629            let take_end = line_len.min(end_byte);
2630            let take = take_end - skip;
2631            unsafe {
2632                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2633            }
2634            wp += take;
2635        }
2636        unsafe {
2637            *dst_base.add(wp) = line_delim;
2638        }
2639        wp += 1;
2640    }
2641    unsafe { buf.set_len(wp) };
2642}
2643
2644/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2645fn process_bytes_complement_mid(
2646    data: &[u8],
2647    skip_start: usize,
2648    skip_end: usize,
2649    line_delim: u8,
2650    out: &mut impl Write,
2651) -> io::Result<()> {
2652    let prefix_bytes = skip_start - 1; // bytes before the skip region
2653    if data.len() >= PARALLEL_THRESHOLD {
2654        let chunks = split_into_chunks(data, line_delim);
2655        let results = par_process(&chunks, |chunk| {
2656            let mut buf = Vec::with_capacity(chunk.len());
2657            bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, &mut buf);
2658            buf
2659        });
2660        let slices: Vec<IoSlice> = results
2661            .iter()
2662            .filter(|r| !r.is_empty())
2663            .map(|r| IoSlice::new(r))
2664            .collect();
2665        write_ioslices(out, &slices)?;
2666    } else {
2667        let mut buf = Vec::with_capacity(data.len());
2668        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
2669        if !buf.is_empty() {
2670            out.write_all(&buf)?;
2671        }
2672    }
2673    Ok(())
2674}
2675
2676/// Process a chunk for complement mid-range byte extraction.
2677/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
2678#[inline]
2679fn bytes_complement_mid_chunk(
2680    data: &[u8],
2681    prefix_bytes: usize,
2682    skip_end: usize,
2683    line_delim: u8,
2684    buf: &mut Vec<u8>,
2685) {
2686    buf.reserve(data.len());
2687
2688    let src = data.as_ptr();
2689    let dst_base = buf.as_mut_ptr();
2690    let mut wp = buf.len();
2691    let mut start = 0;
2692
2693    for pos in memchr_iter(line_delim, data) {
2694        let line_len = pos - start;
2695        // Copy prefix (bytes before skip region)
2696        let take_prefix = prefix_bytes.min(line_len);
2697        if take_prefix > 0 {
2698            unsafe {
2699                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2700            }
2701            wp += take_prefix;
2702        }
2703        // Copy suffix (bytes after skip region)
2704        if line_len > skip_end {
2705            let suffix_len = line_len - skip_end;
2706            unsafe {
2707                std::ptr::copy_nonoverlapping(
2708                    src.add(start + skip_end),
2709                    dst_base.add(wp),
2710                    suffix_len,
2711                );
2712            }
2713            wp += suffix_len;
2714        }
2715        unsafe {
2716            *dst_base.add(wp) = line_delim;
2717        }
2718        wp += 1;
2719        start = pos + 1;
2720    }
2721    if start < data.len() {
2722        let line_len = data.len() - start;
2723        let take_prefix = prefix_bytes.min(line_len);
2724        if take_prefix > 0 {
2725            unsafe {
2726                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2727            }
2728            wp += take_prefix;
2729        }
2730        if line_len > skip_end {
2731            let suffix_len = line_len - skip_end;
2732            unsafe {
2733                std::ptr::copy_nonoverlapping(
2734                    src.add(start + skip_end),
2735                    dst_base.add(wp),
2736                    suffix_len,
2737                );
2738            }
2739            wp += suffix_len;
2740        }
2741        unsafe {
2742            *dst_base.add(wp) = line_delim;
2743        }
2744        wp += 1;
2745    }
2746    unsafe { buf.set_len(wp) };
2747}
2748
2749/// Optimized byte/char extraction with batched output and parallel processing.
2750fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2751    let line_delim = cfg.line_delim;
2752    let ranges = cfg.ranges;
2753    let complement = cfg.complement;
2754    let output_delim = cfg.output_delim;
2755
2756    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
2757    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
2758        let max_bytes = ranges[0].end;
2759        if max_bytes < usize::MAX {
2760            return process_bytes_from_start(data, max_bytes, line_delim, out);
2761        }
2762    }
2763
2764    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
2765    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
2766        let skip_bytes = ranges[0].start.saturating_sub(1);
2767        if skip_bytes > 0 {
2768            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
2769        }
2770    }
2771
2772    // Fast path: single mid-range (e.g., cut -b5-100)
2773    if !complement
2774        && ranges.len() == 1
2775        && ranges[0].start > 1
2776        && ranges[0].end < usize::MAX
2777        && output_delim.is_empty()
2778    {
2779        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
2780    }
2781
2782    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
2783    if complement
2784        && ranges.len() == 1
2785        && ranges[0].start == 1
2786        && ranges[0].end < usize::MAX
2787        && output_delim.is_empty()
2788    {
2789        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
2790    }
2791
2792    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
2793    if complement
2794        && ranges.len() == 1
2795        && ranges[0].end == usize::MAX
2796        && ranges[0].start > 1
2797        && output_delim.is_empty()
2798    {
2799        let max_bytes = ranges[0].start - 1;
2800        return process_bytes_from_start(data, max_bytes, line_delim, out);
2801    }
2802
2803    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
2804    if complement
2805        && ranges.len() == 1
2806        && ranges[0].start > 1
2807        && ranges[0].end < usize::MAX
2808        && output_delim.is_empty()
2809    {
2810        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
2811    }
2812
2813    if data.len() >= PARALLEL_THRESHOLD {
2814        let chunks = split_into_chunks(data, line_delim);
2815        let results = par_process(&chunks, |chunk| {
2816            let mut buf = Vec::with_capacity(chunk.len());
2817            process_bytes_chunk(
2818                chunk,
2819                ranges,
2820                complement,
2821                output_delim,
2822                line_delim,
2823                &mut buf,
2824            );
2825            buf
2826        });
2827        let slices: Vec<IoSlice> = results
2828            .iter()
2829            .filter(|r| !r.is_empty())
2830            .map(|r| IoSlice::new(r))
2831            .collect();
2832        write_ioslices(out, &slices)?;
2833    } else {
2834        let mut buf = Vec::with_capacity(data.len());
2835        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
2836        if !buf.is_empty() {
2837            out.write_all(&buf)?;
2838        }
2839    }
2840    Ok(())
2841}
2842
2843/// Process a chunk of data for byte/char extraction.
2844/// Uses raw pointer arithmetic for the newline scan.
2845/// Complement single-range fast path: compute complement ranges once, then use
2846/// the non-complement multi-range path which is more cache-friendly.
2847fn process_bytes_chunk(
2848    data: &[u8],
2849    ranges: &[Range],
2850    complement: bool,
2851    output_delim: &[u8],
2852    line_delim: u8,
2853    buf: &mut Vec<u8>,
2854) {
2855    buf.reserve(data.len());
2856    let base = data.as_ptr();
2857    let mut start = 0;
2858    for end_pos in memchr_iter(line_delim, data) {
2859        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
2860        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2861        unsafe { buf_push(buf, line_delim) };
2862        start = end_pos + 1;
2863    }
2864    if start < data.len() {
2865        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
2866        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2867        unsafe { buf_push(buf, line_delim) };
2868    }
2869}
2870
2871/// Extract byte ranges from a line into the output buffer.
2872/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
2873/// Raw pointer arithmetic eliminates per-range bounds checking.
2874#[inline(always)]
2875fn cut_bytes_to_buf(
2876    line: &[u8],
2877    ranges: &[Range],
2878    complement: bool,
2879    output_delim: &[u8],
2880    buf: &mut Vec<u8>,
2881) {
2882    let len = line.len();
2883    let base = line.as_ptr();
2884    let mut first_range = true;
2885
2886    // Reserve worst case: full line + delimiters between ranges
2887    let needed = len + output_delim.len() * ranges.len() + 1;
2888    if buf.capacity() - buf.len() < needed {
2889        buf.reserve(needed);
2890    }
2891
2892    if complement {
2893        let mut pos: usize = 1;
2894        for r in ranges {
2895            let rs = r.start;
2896            let re = r.end.min(len);
2897            if pos < rs {
2898                if !first_range && !output_delim.is_empty() {
2899                    unsafe { buf_extend(buf, output_delim) };
2900                }
2901                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
2902                first_range = false;
2903            }
2904            pos = re + 1;
2905            if pos > len {
2906                break;
2907            }
2908        }
2909        if pos <= len {
2910            if !first_range && !output_delim.is_empty() {
2911                unsafe { buf_extend(buf, output_delim) };
2912            }
2913            unsafe {
2914                buf_extend(
2915                    buf,
2916                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
2917                )
2918            };
2919        }
2920    } else if output_delim.is_empty() && ranges.len() == 1 {
2921        // Ultra-fast path: single range, no output delimiter
2922        let start = ranges[0].start.saturating_sub(1);
2923        let end = ranges[0].end.min(len);
2924        if start < len {
2925            unsafe {
2926                buf_extend(
2927                    buf,
2928                    std::slice::from_raw_parts(base.add(start), end - start),
2929                )
2930            };
2931        }
2932    } else {
2933        for r in ranges {
2934            let start = r.start.saturating_sub(1);
2935            let end = r.end.min(len);
2936            if start >= len {
2937                break;
2938            }
2939            if !first_range && !output_delim.is_empty() {
2940                unsafe { buf_extend(buf, output_delim) };
2941            }
2942            unsafe {
2943                buf_extend(
2944                    buf,
2945                    std::slice::from_raw_parts(base.add(start), end - start),
2946                )
2947            };
2948            first_range = false;
2949        }
2950    }
2951}
2952
2953// ── Public API ───────────────────────────────────────────────────────────
2954
2955/// Cut fields from a line using a delimiter. Writes to `out`.
2956#[inline]
2957pub fn cut_fields(
2958    line: &[u8],
2959    delim: u8,
2960    ranges: &[Range],
2961    complement: bool,
2962    output_delim: &[u8],
2963    suppress_no_delim: bool,
2964    out: &mut impl Write,
2965) -> io::Result<bool> {
2966    if memchr::memchr(delim, line).is_none() {
2967        if !suppress_no_delim {
2968            out.write_all(line)?;
2969            return Ok(true);
2970        }
2971        return Ok(false);
2972    }
2973
2974    let mut field_num: usize = 1;
2975    let mut field_start: usize = 0;
2976    let mut first_output = true;
2977
2978    for delim_pos in memchr_iter(delim, line) {
2979        let selected = in_ranges(ranges, field_num) != complement;
2980        if selected {
2981            if !first_output {
2982                out.write_all(output_delim)?;
2983            }
2984            out.write_all(&line[field_start..delim_pos])?;
2985            first_output = false;
2986        }
2987        field_start = delim_pos + 1;
2988        field_num += 1;
2989    }
2990
2991    let selected = in_ranges(ranges, field_num) != complement;
2992    if selected {
2993        if !first_output {
2994            out.write_all(output_delim)?;
2995        }
2996        out.write_all(&line[field_start..])?;
2997    }
2998
2999    Ok(true)
3000}
3001
3002/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3003#[inline]
3004pub fn cut_bytes(
3005    line: &[u8],
3006    ranges: &[Range],
3007    complement: bool,
3008    output_delim: &[u8],
3009    out: &mut impl Write,
3010) -> io::Result<bool> {
3011    let mut first_range = true;
3012
3013    if complement {
3014        let len = line.len();
3015        let mut comp_ranges = Vec::new();
3016        let mut pos: usize = 1;
3017        for r in ranges {
3018            let rs = r.start;
3019            let re = r.end.min(len);
3020            if pos < rs {
3021                comp_ranges.push((pos, rs - 1));
3022            }
3023            pos = re + 1;
3024            if pos > len {
3025                break;
3026            }
3027        }
3028        if pos <= len {
3029            comp_ranges.push((pos, len));
3030        }
3031        for &(s, e) in &comp_ranges {
3032            if !first_range && !output_delim.is_empty() {
3033                out.write_all(output_delim)?;
3034            }
3035            out.write_all(&line[s - 1..e])?;
3036            first_range = false;
3037        }
3038    } else {
3039        for r in ranges {
3040            let start = r.start.saturating_sub(1);
3041            let end = r.end.min(line.len());
3042            if start >= line.len() {
3043                break;
3044            }
3045            if !first_range && !output_delim.is_empty() {
3046                out.write_all(output_delim)?;
3047            }
3048            out.write_all(&line[start..end])?;
3049            first_range = false;
3050        }
3051    }
3052    Ok(true)
3053}
3054
3055/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
3056/// Output is always <= input (we remove everything after first delimiter per line).
3057/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
3058/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
3059///
3060/// Lines without delimiter pass through unchanged (unless suppress=true).
3061/// Lines with delimiter: keep bytes before delimiter + newline.
3062pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
3063    let len = data.len();
3064    let mut wp: usize = 0;
3065    let mut rp: usize = 0;
3066
3067    while rp < len {
3068        match memchr::memchr2(delim, line_delim, &data[rp..]) {
3069            None => {
3070                // Rest is partial line, no delimiter
3071                if suppress {
3072                    // suppress: skip lines without delimiter
3073                    break;
3074                }
3075                let remaining = len - rp;
3076                if wp != rp {
3077                    data.copy_within(rp..len, wp);
3078                }
3079                wp += remaining;
3080                break;
3081            }
3082            Some(offset) => {
3083                let actual = rp + offset;
3084                if data[actual] == line_delim {
3085                    // No delimiter on this line
3086                    if suppress {
3087                        // Skip this line entirely
3088                        rp = actual + 1;
3089                    } else {
3090                        // Output entire line including newline
3091                        let chunk_len = actual + 1 - rp;
3092                        if wp != rp {
3093                            data.copy_within(rp..actual + 1, wp);
3094                        }
3095                        wp += chunk_len;
3096                        rp = actual + 1;
3097                    }
3098                } else {
3099                    // Delimiter found: output field 1 (up to delimiter) + newline
3100                    let field_len = actual - rp;
3101                    if wp != rp && field_len > 0 {
3102                        data.copy_within(rp..actual, wp);
3103                    }
3104                    wp += field_len;
3105                    data[wp] = line_delim;
3106                    wp += 1;
3107                    // Skip to next newline
3108                    match memchr::memchr(line_delim, &data[actual + 1..]) {
3109                        None => {
3110                            rp = len;
3111                        }
3112                        Some(nl_off) => {
3113                            rp = actual + 1 + nl_off + 1;
3114                        }
3115                    }
3116                }
3117            }
3118        }
3119    }
3120    wp
3121}
3122
3123/// Process a full data buffer (from mmap or read) with cut operation.
3124pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3125    match cfg.mode {
3126        CutMode::Fields => process_fields_fast(data, cfg, out),
3127        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3128    }
3129}
3130
3131/// Process input from a reader (for stdin).
3132/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3133/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3134/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3135pub fn process_cut_reader<R: BufRead>(
3136    mut reader: R,
3137    cfg: &CutConfig,
3138    out: &mut impl Write,
3139) -> io::Result<()> {
3140    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3141    let line_delim = cfg.line_delim;
3142
3143    // Read large chunks and process in batch.
3144    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3145    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3146
3147    loop {
3148        // Read up to CHUNK_SIZE bytes
3149        buf.reserve(CHUNK_SIZE);
3150        let read_start = buf.len();
3151        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3152        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3153        buf.truncate(read_start + n);
3154
3155        if buf.is_empty() {
3156            break;
3157        }
3158
3159        if n == 0 {
3160            // EOF with leftover data (last line without terminator)
3161            process_cut_data(&buf, cfg, out)?;
3162            break;
3163        }
3164
3165        // Find the last line delimiter in the buffer so we process complete lines
3166        let process_end = match memchr::memrchr(line_delim, &buf) {
3167            Some(pos) => pos + 1,
3168            None => {
3169                // No line delimiter found — keep accumulating
3170                continue;
3171            }
3172        };
3173
3174        // Process the complete lines using the fast batch path
3175        process_cut_data(&buf[..process_end], cfg, out)?;
3176
3177        // Shift leftover to the front for next iteration
3178        let leftover_len = buf.len() - process_end;
3179        if leftover_len > 0 {
3180            buf.copy_within(process_end.., 0);
3181        }
3182        buf.truncate(leftover_len);
3183    }
3184
3185    Ok(())
3186}
3187
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `read` until `buf` is full or the reader reports end of
/// input. Returns the number of bytes stored, so a result smaller than
/// `buf.len()` means end of input was reached.
///
/// # Errors
///
/// `ErrorKind::Interrupted` is retried on every read, including the first.
/// Any other error is returned immediately.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3207
/// Cut operation mode: what the positions in a LIST refer to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Positions count bytes.
    Bytes,
    /// Positions count characters. `process_cut_data` routes this mode
    /// through the same routine as [`CutMode::Bytes`].
    Characters,
    /// Positions count delimiter-separated fields.
    Fields,
}