// coreutils_rs/cut/core.rs

use memchr::memchr_iter;
use rayon::prelude::*;
use std::io::{self, BufRead, IoSlice, Write};

/// Minimum file size for parallel processing (2MB).
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Max iovec entries per writev call (Linux default).
const MAX_IOV: usize = 1024;

/// Configuration for cut operations.
pub struct CutConfig<'a> {
    pub mode: CutMode,
    pub ranges: &'a [Range],
    pub complement: bool,
    pub delim: u8,
    pub output_delim: &'a [u8],
    pub suppress_no_delim: bool,
    pub line_delim: u8,
}

/// A range specification like 1, 3-5, -3, 4-
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based; open-left ranges like "-3" are stored with start == 1
    pub end: usize,   // 1-based; usize::MAX means "to end"
}

/// Parse a LIST specification like "1,3-5,7-" into ranges.
/// Each range is 1-based. Returns sorted, merged ranges.
pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
    let mut ranges = Vec::new();

    for part in spec.split(',') {
        let part = part.trim();
        if part.is_empty() {
            continue;
        }

        if let Some(idx) = part.find('-') {
            let left = &part[..idx];
            let right = &part[idx + 1..];

            let start = if left.is_empty() {
                1
            } else {
                left.parse::<usize>()
                    .map_err(|_| format!("invalid range: '{}'", part))?
            };

            let end = if right.is_empty() {
                usize::MAX
            } else {
                right
                    .parse::<usize>()
                    .map_err(|_| format!("invalid range: '{}'", part))?
            };

            if start == 0 {
                return Err("fields and positions are numbered from 1".to_string());
            }
            if start > end {
                return Err(format!("invalid decreasing range: '{}'", part));
            }

            ranges.push(Range { start, end });
        } else {
            let n = part
                .parse::<usize>()
                .map_err(|_| format!("invalid field: '{}'", part))?;
            if n == 0 {
                return Err("fields and positions are numbered from 1".to_string());
            }
            ranges.push(Range { start: n, end: n });
        }
    }

    if ranges.is_empty() {
        return Err("you must specify a list of bytes, characters, or fields".to_string());
    }

    // Sort and merge overlapping ranges
    ranges.sort_by_key(|r| (r.start, r.end));
    let mut merged = vec![ranges[0].clone()];
    for r in &ranges[1..] {
        let last = merged.last_mut().unwrap();
        if r.start <= last.end.saturating_add(1) {
            last.end = last.end.max(r.end);
        } else {
            merged.push(r.clone());
        }
    }

    Ok(merged)
}
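
// Worked example (illustrative): adjacent and out-of-order pieces are merged, so
// "4-6,1,2-3,9-" collapses to two ranges. A minimal sanity check, assuming a test
// harness alongside this module:
//
//     let merged = parse_ranges("4-6,1,2-3,9-").unwrap();
//     assert_eq!(merged.len(), 2);
//     assert_eq!((merged[0].start, merged[0].end), (1, 6));          // 1, 2-3, 4-6 merge
//     assert_eq!((merged[1].start, merged[1].end), (9, usize::MAX)); // "9-"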

/// Check if a 1-based position is in any range.
/// Ranges must be sorted and merged, which allows an early exit.
#[inline(always)]
fn in_ranges(ranges: &[Range], pos: usize) -> bool {
    for r in ranges {
        if pos < r.start {
            return false;
        }
        if pos <= r.end {
            return true;
        }
    }
    false
}

/// Pre-compute a 64-bit mask for field selection.
/// Bit i-1 is set if field i should be output.
#[inline]
fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
    let mut mask: u64 = 0;
    for i in 1..=64u32 {
        let in_range = in_ranges(ranges, i as usize);
        if in_range != complement {
            mask |= 1u64 << (i - 1);
        }
    }
    mask
}

/// Check if a field should be selected, using bitset for first 64 fields.
#[inline(always)]
fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
    if field_num <= 64 {
        (mask >> (field_num - 1)) & 1 == 1
    } else {
        in_ranges(ranges, field_num) != complement
    }
}
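
// Worked example (illustrative): for `-f2,4-6` the mask has bits 1, 3, 4, 5 set
// (bit i-1 corresponds to field i), so lookups for the first 64 fields become a
// single shift-and-test:
//
//     let ranges = parse_ranges("2,4-6").unwrap();
//     let mask = compute_field_mask(&ranges, false);
//     assert_eq!(mask, 0b0011_1010);
//     assert!(is_selected(4, mask, &ranges, false));
//     assert!(!is_selected(3, mask, &ranges, false));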

// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────

/// Append a slice to buf without capacity checks.
/// Caller MUST ensure buf has enough remaining capacity.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    unsafe {
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}

/// Append a single byte to buf without capacity checks.
/// Caller MUST ensure buf has enough remaining capacity.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    unsafe {
        let len = buf.len();
        *buf.as_mut_ptr().add(len) = b;
        buf.set_len(len + 1);
    }
}

/// Write multiple IoSlice buffers using write_vectored (writev syscall).
/// Batches into MAX_IOV-sized groups. Falls back to write_all per slice for partial writes.
#[inline]
fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match out.write_vectored(batch) {
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                // Partial write: fall back to write_all per remaining slice
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        out.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        out.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
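
// Usage sketch (illustrative): the parallel paths below collect one Vec<u8> per
// chunk and hand them to write_ioslices, so N result buffers cost at most
// ceil(N / MAX_IOV) writev syscalls instead of N separate write calls:
//
//     let slices: Vec<IoSlice> = results.iter().map(|r| IoSlice::new(r)).collect();
//     write_ioslices(out, &slices)?;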

// ── Chunk splitting for parallel processing ──────────────────────────────

/// Split data into chunks aligned to line boundaries for parallel processing.
fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
    let num_threads = rayon::current_num_threads().max(1);
    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
        return vec![data];
    }

    let chunk_size = data.len() / num_threads;
    let mut chunks = Vec::with_capacity(num_threads);
    let mut pos = 0;

    for _ in 0..num_threads - 1 {
        let target = pos + chunk_size;
        if target >= data.len() {
            break;
        }
        let boundary = memchr::memchr(line_delim, &data[target..])
            .map(|p| target + p + 1)
            .unwrap_or(data.len());
        if boundary > pos {
            chunks.push(&data[pos..boundary]);
        }
        pos = boundary;
    }

    if pos < data.len() {
        chunks.push(&data[pos..]);
    }

    chunks
}
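
// Worked example (illustrative): with 4 rayon threads, the naive split points at
// len/4 multiples are each pushed forward to just past the next line terminator,
// so no line straddles two chunks and the chunks tile the input exactly:
//
//     let chunks = split_into_chunks(data, b'\n');
//     debug_assert_eq!(chunks.iter().map(|c| c.len()).sum::<usize>(), data.len());
//     debug_assert!(chunks[..chunks.len() - 1].iter().all(|c| c.ends_with(b"\n")));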

// ── Fast path: field extraction with batched output ──────────────────────

/// Optimized field extraction with early exit and batched output.
fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
    let delim = cfg.delim;
    let line_delim = cfg.line_delim;
    let ranges = cfg.ranges;
    let complement = cfg.complement;
    let output_delim = cfg.output_delim;
    let suppress = cfg.suppress_no_delim;

    // Zero-copy fast path: if delimiter never appears, output = input unchanged.
    if !complement && memchr::memchr(delim, data).is_none() {
        if suppress {
            return Ok(());
        }
        out.write_all(data)?;
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            out.write_all(&[line_delim])?;
        }
        return Ok(());
    }

    // Ultra-fast path: single field extraction (e.g., cut -f5)
    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
    }

    // Fast path: complement of single field with default output delimiter.
    if complement
        && ranges.len() == 1
        && ranges[0].start == ranges[0].end
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_complement_single_field(
            data,
            delim,
            line_delim,
            ranges[0].start,
            suppress,
            out,
        );
    }

    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
    if !complement
        && ranges.len() == 1
        && ranges[0].start == 1
        && output_delim.len() == 1
        && output_delim[0] == delim
        && ranges[0].end < usize::MAX
    {
        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
    }

    // Fast path: open-ended field range from field N (e.g., cut -f3-)
    if !complement
        && ranges.len() == 1
        && ranges[0].end == usize::MAX
        && ranges[0].start > 1
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
    }

    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
    if !complement
        && ranges.len() == 1
        && ranges[0].start > 1
        && ranges[0].end < usize::MAX
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_fields_mid_range(
            data,
            delim,
            line_delim,
            ranges[0].start,
            ranges[0].end,
            suppress,
            out,
        );
    }

    // General field extraction
    let max_field = if complement {
        usize::MAX
    } else {
        ranges.last().map(|r| r.end).unwrap_or(0)
    };
    let field_mask = compute_field_mask(ranges, complement);

    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_into_chunks(data, line_delim);
        let results: Vec<Vec<u8>> = chunks
            .par_iter()
            .map(|chunk| {
                let mut buf = Vec::with_capacity(chunk.len());
                process_fields_chunk(
                    chunk,
                    delim,
                    ranges,
                    output_delim,
                    suppress,
                    max_field,
                    field_mask,
                    line_delim,
                    complement,
                    &mut buf,
                );
                buf
            })
            .collect();
        // Use write_vectored (writev) to batch N writes into fewer syscalls
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len());
        process_fields_chunk(
            data,
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            complement,
            &mut buf,
        );
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
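
// Dispatch summary (illustrative), assuming the default output delimiter unless
// noted: `cut -d, -f5` takes process_single_field, `--complement -f5` takes
// process_complement_single_field, `-f1-5` takes process_fields_prefix, `-f3-`
// takes process_fields_suffix, and `-f2-4` takes process_fields_mid_range.
// Multi-range lists such as `-f1,3,5`, complements of more than one field, and
// (except for a single selected field) a custom --output-delimiter all fall
// through to the general process_fields_chunk path.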

/// Process a chunk of data for general field extraction.
/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
fn process_fields_chunk(
    data: &[u8],
    delim: u8,
    ranges: &[Range],
    output_delim: &[u8],
    suppress: bool,
    max_field: usize,
    field_mask: u64,
    line_delim: u8,
    complement: bool,
    buf: &mut Vec<u8>,
) {
    // When delim != line_delim and max_field is bounded, use two-level approach:
    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
    // This avoids scanning past max_field on each line (significant for lines with
    // many columns but small field selection like -f1,3,5 on 20-column CSV).
    // For complement or unbounded ranges, use single-pass memchr2_iter which
    // needs to process all delimiters anyway.
    if delim != line_delim && max_field < usize::MAX && !complement {
        buf.reserve(data.len());
        let mut start = 0;
        for end_pos in memchr_iter(line_delim, data) {
            let line = &data[start..end_pos];
            extract_fields_to_buf(
                line,
                delim,
                ranges,
                output_delim,
                suppress,
                max_field,
                field_mask,
                line_delim,
                buf,
                complement,
            );
            start = end_pos + 1;
        }
        if start < data.len() {
            extract_fields_to_buf(
                &data[start..],
                delim,
                ranges,
                output_delim,
                suppress,
                max_field,
                field_mask,
                line_delim,
                buf,
                complement,
            );
        }
        return;
    }

    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
    // delimiter and line_delim in one SIMD scan.
    if delim != line_delim {
        // Worst case: every input delimiter is replaced by output_delim (which may
        // be longer than one byte) and a terminator is appended to an unterminated
        // final line, so reserve accordingly to keep the unchecked appends in bounds.
        buf.reserve(data.len().saturating_mul(output_delim.len().max(1)) + 1);

        let mut line_start: usize = 0;
        let mut field_start: usize = 0;
        let mut field_num: usize = 1;
        let mut first_output = true;
        let mut has_delim = false;

        for pos in memchr::memchr2_iter(delim, line_delim, data) {
            let byte = unsafe { *data.get_unchecked(pos) };

            if byte == line_delim {
                // End of line: flush final field and emit line delimiter
                if (field_num <= max_field || complement)
                    && has_delim
                    && is_selected(field_num, field_mask, ranges, complement)
                {
                    if !first_output {
                        unsafe { buf_extend(buf, output_delim) };
                    }
                    unsafe { buf_extend(buf, &data[field_start..pos]) };
                    first_output = false;
                }

                if !first_output {
                    unsafe { buf_push(buf, line_delim) };
                } else if !has_delim {
                    if !suppress {
                        unsafe {
                            buf_extend(buf, &data[line_start..pos]);
                            buf_push(buf, line_delim);
                        }
                    }
                } else {
                    unsafe { buf_push(buf, line_delim) };
                }

                // Reset state for next line
                line_start = pos + 1;
                field_start = pos + 1;
                field_num = 1;
                first_output = true;
                has_delim = false;
            } else {
                // Field delimiter hit
                has_delim = true;

                if is_selected(field_num, field_mask, ranges, complement) {
                    if !first_output {
                        unsafe { buf_extend(buf, output_delim) };
                    }
                    unsafe { buf_extend(buf, &data[field_start..pos]) };
                    first_output = false;
                }

                field_num += 1;
                field_start = pos + 1;
            }
        }

        // Handle last line without trailing line_delim
        if line_start < data.len() {
            let line = &data[line_start..];
            if !line.is_empty() {
                if (field_num <= max_field || complement)
                    && has_delim
                    && is_selected(field_num, field_mask, ranges, complement)
                {
                    if !first_output {
                        unsafe { buf_extend(buf, output_delim) };
                    }
                    unsafe { buf_extend(buf, &data[field_start..data.len()]) };
                    first_output = false;
                }

                if !first_output {
                    unsafe { buf_push(buf, line_delim) };
                } else if !has_delim {
                    if !suppress {
                        unsafe {
                            buf_extend(buf, &data[line_start..data.len()]);
                            buf_push(buf, line_delim);
                        }
                    }
                } else {
                    unsafe { buf_push(buf, line_delim) };
                }
            }
        }

        return;
    }

    // Fallback: when delim == line_delim, use the two-level scan approach
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        extract_fields_to_buf(
            line,
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            buf,
            complement,
        );
        start = end_pos + 1;
    }
    if start < data.len() {
        extract_fields_to_buf(
            &data[start..],
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            buf,
            complement,
        );
    }
}
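
// Worked example (illustrative): with `-d, -f2-` on the line "a,b,c\n" the
// memchr2_iter pass visits positions 1 (','), 3 (','), and 5 ('\n'): field 1 ("a")
// is skipped, field 2 ("b") is appended, field 3 ("c") is appended after one
// output delimiter, and the final '\n' flushes "b,c\n" before the per-line state
// (field_num, field_start, has_delim, first_output) is reset.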

// ── Ultra-fast single field extraction ───────────────────────────────────

/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
fn process_single_field(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    let target_idx = target - 1;

    // Combined SIMD scan: single pass using memchr2 for any target field.
    if delim != line_delim {
        if data.len() >= PARALLEL_THRESHOLD {
            let chunks = split_into_chunks(data, line_delim);
            let results: Vec<Vec<u8>> = chunks
                .par_iter()
                .map(|chunk| {
                    let mut buf = Vec::with_capacity(chunk.len());
                    process_nth_field_combined(
                        chunk, delim, line_delim, target_idx, suppress, &mut buf,
                    );
                    buf
                })
                .collect();
            for result in &results {
                if !result.is_empty() {
                    out.write_all(result)?;
                }
            }
        } else if target_idx == 0 && !suppress {
            // Zero-copy fast path for field 1 (most common case):
            // For each line, either truncate at the first delimiter, or pass through.
            // Since most lines have a delimiter, and field 1 is a prefix of each line,
            // we can write contiguous runs directly from the source data.
            single_field1_zerocopy(data, delim, line_delim, out)?;
        } else if target_idx <= 3 && !suppress {
            // Optimized path for small field indices (fields 2-4):
            // Uses successive memchr calls per line instead of the full combined scan.
            // For field 2: two memchr calls (find first delim, find second).
            // This avoids the memchr2_iter overhead for every byte in the line.
            let mut buf = Vec::with_capacity(data.len());
            process_small_field_combined(data, delim, line_delim, target_idx, &mut buf);
            if !buf.is_empty() {
                out.write_all(&buf)?;
            }
        } else {
            let mut buf = Vec::with_capacity(data.len());
            process_nth_field_combined(data, delim, line_delim, target_idx, suppress, &mut buf);
            if !buf.is_empty() {
                out.write_all(&buf)?;
            }
        }
        return Ok(());
    }

    // Fallback for delim == line_delim: nested loop approach
    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_into_chunks(data, line_delim);
        let results: Vec<Vec<u8>> = chunks
            .par_iter()
            .map(|chunk| {
                let mut buf = Vec::with_capacity(chunk.len() / 4);
                process_single_field_chunk(
                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
                );
                buf
            })
            .collect();
        // Use write_vectored (writev) to batch N writes into fewer syscalls
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len() / 4);
        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}

/// Complement single-field extraction: skip one field, output rest unchanged.
fn process_complement_single_field(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    skip_field: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    let skip_idx = skip_field - 1;

    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_into_chunks(data, line_delim);
        let results: Vec<Vec<u8>> = chunks
            .par_iter()
            .map(|chunk| {
                let mut buf = Vec::with_capacity(chunk.len());
                complement_single_field_chunk(
                    chunk, delim, skip_idx, line_delim, suppress, &mut buf,
                );
                buf
            })
            .collect();
        // Use write_vectored (writev) to batch N writes into fewer syscalls
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len());
        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}

/// Process a chunk for complement single-field extraction.
fn complement_single_field_chunk(
    data: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
        start = end_pos + 1;
    }
    if start < data.len() {
        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
    }
}

/// Extract all fields except skip_idx from one line.
#[inline(always)]
fn complement_single_field_line(
    line: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    buf.reserve(line.len() + 1);

    let mut field_idx = 0;
    let mut field_start = 0;
    let mut first_output = true;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        if field_idx != skip_idx {
            if !first_output {
                unsafe { buf_push(buf, delim) };
            }
            unsafe { buf_extend(buf, &line[field_start..pos]) };
            first_output = false;
        }
        field_idx += 1;
        field_start = pos + 1;
    }

    if !has_delim {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Last field
    if field_idx != skip_idx {
        if !first_output {
            unsafe { buf_push(buf, delim) };
        }
        unsafe { buf_extend(buf, &line[field_start..]) };
    }

    unsafe { buf_push(buf, line_delim) };
}
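
// Worked example (illustrative): `--complement -d, -f2` on "a,b,c" emits "a" and
// "c" joined by the input delimiter ("a,c\n"); a line with no comma passes
// through unchanged unless -s/--only-delimited was given.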

/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
/// Zero-copy for the non-parallel path: identifies the truncation point per line
/// and writes contiguous runs directly from the source data.
fn process_fields_prefix(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_into_chunks(data, line_delim);
        let results: Vec<Vec<u8>> = chunks
            .par_iter()
            .map(|chunk| {
                let mut buf = Vec::with_capacity(chunk.len());
                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
                buf
            })
            .collect();
        // Use write_vectored (writev) to batch N writes into fewer syscalls
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else if !suppress {
        // Zero-copy fast path: scan for truncation points, write runs from source.
        // When suppress is false, every line is output (with or without delimiter).
        // Most lines have enough fields, so the output is often identical to input.
        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
    } else {
        let mut buf = Vec::with_capacity(data.len());
        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}

/// Zero-copy field-prefix extraction: writes contiguous runs directly from source data.
/// For lines where the Nth delimiter exists, we truncate at that point.
/// For lines with fewer fields, we output them unchanged.
/// Lines without any delimiter are output unchanged (suppress=false assumed).
#[inline]
fn fields_prefix_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    out: &mut impl Write,
) -> io::Result<()> {
    let mut start = 0;
    let mut run_start: usize = 0;

    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        // Find the position of the Nth delimiter to truncate at
        let mut field_count = 1;
        let mut truncate_at: Option<usize> = None;
        for dpos in memchr_iter(delim, line) {
            if field_count >= last_field {
                truncate_at = Some(start + dpos);
                break;
            }
            field_count += 1;
        }

        if let Some(trunc_pos) = truncate_at {
            // This line has more fields than needed. Flush run, write truncated.
            if run_start < start {
                out.write_all(&data[run_start..start])?;
            }
            out.write_all(&data[start..trunc_pos])?;
            out.write_all(&[line_delim])?;
            run_start = end_pos + 1;
        }
        // else: line has <= last_field fields, keep it in the run
        start = end_pos + 1;
    }
    // Handle last line without terminator
    if start < data.len() {
        let line = &data[start..];
        let mut field_count = 1;
        let mut truncate_at: Option<usize> = None;
        for dpos in memchr_iter(delim, line) {
            if field_count >= last_field {
                truncate_at = Some(start + dpos);
                break;
            }
            field_count += 1;
        }
        if let Some(trunc_pos) = truncate_at {
            if run_start < start {
                out.write_all(&data[run_start..start])?;
            }
            out.write_all(&data[start..trunc_pos])?;
            out.write_all(&[line_delim])?;
            return Ok(());
        }
    }
    // Flush remaining run
    if run_start < data.len() {
        out.write_all(&data[run_start..])?;
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            out.write_all(&[line_delim])?;
        }
    }
    Ok(())
}
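
// Worked example (illustrative): `-d, -f1-2` over "a,b\nc,d,e\nf,g\n" keeps
// "a,b\n" in the pending run (it has exactly two fields), flushes that run and
// writes the truncated "c,d\n" when the three-field line is hit, and finally
// flushes "f,g\n" as the trailing run. Only truncated lines cost extra write
// calls; untouched lines are written straight from the input buffer.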

/// Process a chunk for contiguous from-start field range extraction.
fn fields_prefix_chunk(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
        start = end_pos + 1;
    }
    if start < data.len() {
        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
    }
}

/// Extract first N fields from one line (contiguous from-start range).
#[inline(always)]
fn fields_prefix_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    last_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    buf.reserve(line.len() + 1);

    let mut field_count = 1;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        if field_count >= last_field {
            unsafe {
                buf_extend(buf, &line[..pos]);
                buf_push(buf, line_delim);
            }
            return;
        }
        field_count += 1;
    }

    if !has_delim {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    unsafe {
        buf_extend(buf, line);
        buf_push(buf, line_delim);
    }
}

/// Open-ended field suffix extraction (e.g., `cut -f3-`).
fn process_fields_suffix(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_into_chunks(data, line_delim);
        let results: Vec<Vec<u8>> = chunks
            .par_iter()
            .map(|chunk| {
                let mut buf = Vec::with_capacity(chunk.len());
                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
                buf
            })
            .collect();
        // Use write_vectored (writev) to batch N writes into fewer syscalls
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len());
        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}

/// Process a chunk for open-ended field suffix extraction.
fn fields_suffix_chunk(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
        start = end_pos + 1;
    }
    if start < data.len() {
        fields_suffix_line(
            &data[start..],
            delim,
            line_delim,
            start_field,
            suppress,
            buf,
        );
    }
}

/// Extract fields from start_field to end from one line.
#[inline(always)]
fn fields_suffix_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    buf.reserve(line.len() + 1);

    let skip_delims = start_field - 1;
    let mut delim_count = 0;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        delim_count += 1;
        if delim_count >= skip_delims {
            unsafe {
                buf_extend(buf, &line[pos + 1..]);
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if !has_delim {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Fewer delimiters than needed
    unsafe { buf_push(buf, line_delim) };
}

/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
/// Optimized: skip to start_field using memchr, then output until end_field.
fn process_fields_mid_range(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_into_chunks(data, line_delim);
        let results: Vec<Vec<u8>> = chunks
            .par_iter()
            .map(|chunk| {
                let mut buf = Vec::with_capacity(chunk.len());
                fields_mid_range_chunk(
                    chunk,
                    delim,
                    line_delim,
                    start_field,
                    end_field,
                    suppress,
                    &mut buf,
                );
                buf
            })
            .collect();
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len());
        fields_mid_range_chunk(
            data,
            delim,
            line_delim,
            start_field,
            end_field,
            suppress,
            &mut buf,
        );
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}

/// Process a chunk for contiguous mid-range field extraction.
fn fields_mid_range_chunk(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        fields_mid_range_line(
            line,
            delim,
            line_delim,
            start_field,
            end_field,
            suppress,
            buf,
        );
        start = end_pos + 1;
    }
    if start < data.len() {
        fields_mid_range_line(
            &data[start..],
            delim,
            line_delim,
            start_field,
            end_field,
            suppress,
            buf,
        );
    }
}

/// Extract fields start_field..=end_field from one line.
/// Uses memchr_iter to skip to start_field, then counts delimiters to end_field.
#[inline(always)]
fn fields_mid_range_line(
    line: &[u8],
    delim: u8,
    line_delim: u8,
    start_field: usize,
    end_field: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    buf.reserve(line.len() + 1);

    // Count delimiters to find start_field and end_field boundaries
    let skip_before = start_field - 1; // delimiters to skip before start_field
    let field_span = end_field - start_field; // additional delimiters within the range
    let mut delim_count = 0;
    let mut range_start = 0;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        delim_count += 1;
        if delim_count == skip_before {
            range_start = pos + 1;
        }
        if delim_count == skip_before + field_span + 1 {
            // Found the delimiter after end_field: output the range
            if skip_before == 0 {
                range_start = 0;
            }
            unsafe {
                buf_extend(buf, &line[range_start..pos]);
                buf_push(buf, line_delim);
            }
            return;
        }
    }

    if !has_delim {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Line has delimiters but fewer fields than end_field
    if delim_count >= skip_before {
        // We have at least start_field, output from range_start to end
        if skip_before == 0 {
            range_start = 0;
        }
        unsafe {
            buf_extend(buf, &line[range_start..]);
            buf_push(buf, line_delim);
        }
    } else {
        // Not enough fields even for start_field: output empty line
        unsafe { buf_push(buf, line_delim) };
    }
}
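
// Worked example (illustrative): `-d, -f2-4` sets skip_before = 1 and
// field_span = 2. On "a,b,c,d,e" the 1st comma marks range_start (start of "b")
// and the 4th comma (skip_before + field_span + 1) ends the range, so "b,c,d"
// plus the terminator is emitted. On "a,b,c" the loop runs out of delimiters at
// 2, which is >= skip_before, so the tail "b,c" is emitted instead.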

/// Combined SIMD scan for arbitrary single field extraction.
/// Uses memchr2_iter(delim, line_delim) to scan for both bytes in a single SIMD pass.
/// This is faster than the nested approach (outer: find newlines, inner: find delimiters)
/// because it eliminates one full SIMD scan and improves cache locality.
fn process_nth_field_combined(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target_idx: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // +1: an unterminated final line may gain a terminator, so the output can be
    // one byte longer than the input; the unchecked appends rely on this reserve.
    buf.reserve(data.len() + 1);

    let mut line_start: usize = 0;
    let mut field_start: usize = 0;
    let mut field_idx: usize = 0;
    let mut has_delim = false;
    let mut emitted = false;

    for pos in memchr::memchr2_iter(delim, line_delim, data) {
        let byte = unsafe { *data.get_unchecked(pos) };

        if byte == line_delim {
            // End of line
            if !emitted {
                if has_delim && field_idx == target_idx {
                    // Last field matches target
                    unsafe {
                        buf_extend(buf, &data[field_start..pos]);
                        buf_push(buf, line_delim);
                    }
                } else if has_delim {
                    // Target field doesn't exist (fewer fields)
                    unsafe {
                        buf_push(buf, line_delim);
                    }
                } else if !suppress {
                    // No delimiter in line: output unchanged
                    unsafe {
                        buf_extend(buf, &data[line_start..pos]);
                        buf_push(buf, line_delim);
                    }
                }
            }
            // Reset for next line
            line_start = pos + 1;
            field_start = pos + 1;
            field_idx = 0;
            has_delim = false;
            emitted = false;
        } else {
            // Delimiter found
            has_delim = true;
            if field_idx == target_idx {
                unsafe {
                    buf_extend(buf, &data[field_start..pos]);
                    buf_push(buf, line_delim);
                }
                emitted = true;
            }
            field_idx += 1;
            field_start = pos + 1;
        }
    }

    // Handle last line without trailing newline
    if line_start < data.len() && !emitted {
        if has_delim && field_idx == target_idx {
            unsafe {
                buf_extend(buf, &data[field_start..data.len()]);
                buf_push(buf, line_delim);
            }
        } else if has_delim {
            unsafe {
                buf_push(buf, line_delim);
            }
        } else if !suppress {
            unsafe {
                buf_extend(buf, &data[line_start..data.len()]);
                buf_push(buf, line_delim);
            }
        }
    }
}
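
// Worked example (illustrative): `-d, -f3` (target_idx == 2) on "a,b,c,d\n" emits
// "c\n" as soon as the third comma is seen and sets `emitted`, so the rest of the
// line is skipped until the terminator resets the per-line state. On "a,b\n" only
// one delimiter is seen (field_idx == 1), has_delim is set, and nothing was
// emitted, so a bare terminator is written because field 3 does not exist.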

/// Zero-copy field-1 extraction: writes contiguous runs directly from source data.
/// For each line: if delimiter exists, truncate at first delimiter; otherwise pass through.
/// Uses memchr2 to scan for both delimiter and line terminator in a single SIMD pass.
#[inline]
fn single_field1_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    let mut line_start: usize = 0;
    let mut run_start: usize = 0;
    let mut first_delim: Option<usize> = None;

    for pos in memchr::memchr2_iter(delim, line_delim, data) {
        let byte = unsafe { *data.get_unchecked(pos) };

        if byte == line_delim {
            // End of line
            if let Some(dp) = first_delim {
                // Line has delimiter: truncate at first delimiter.
                // Flush current run up to line_start, write truncated line.
                if run_start < line_start {
                    out.write_all(&data[run_start..line_start])?;
                }
                out.write_all(&data[line_start..dp])?;
                out.write_all(&[line_delim])?;
                run_start = pos + 1;
            }
            // else: no delimiter in line, output unchanged (stays in run)
            line_start = pos + 1;
            first_delim = None;
        } else {
            // Delimiter found
            if first_delim.is_none() {
                first_delim = Some(pos);
            }
        }
    }

    // Handle last line (no trailing line_delim)
    if line_start < data.len() {
        if let Some(dp) = first_delim {
            if run_start < line_start {
                out.write_all(&data[run_start..line_start])?;
            }
            out.write_all(&data[line_start..dp])?;
            out.write_all(&[line_delim])?;
            return Ok(());
        }
    }

    // Flush remaining run
    if run_start < data.len() {
        out.write_all(&data[run_start..])?;
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            out.write_all(&[line_delim])?;
        }
    }
    Ok(())
}
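
// Worked example (illustrative): `-d: -f1` on "root:x:0\nnodelim\nuser:y\n"
// writes "root\n" as a truncated line when the first ':' and '\n' are seen,
// keeps "nodelim\n" in the pending run because it has no delimiter, and writes
// that run together with the next truncated line ("user\n"), so lines without
// delimiters are never copied into an intermediate buffer.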

/// Optimized path for extracting small field indices (2-4) without suppress.
/// Uses per-line memchr calls to find the target field boundaries.
/// For field 2: finds the 1st delimiter (start of field 2), then the 2nd (end).
/// More efficient than memchr2_iter for small field indices since we stop early.
fn process_small_field_combined(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target_idx: usize,
    buf: &mut Vec<u8>,
) {
    // +1: an unterminated final line may gain a terminator in the output.
    buf.reserve(data.len() + 1);
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        // Find the start of the target field (skip target_idx delimiters)
        let mut field_start = 0;
        let mut found_start = target_idx == 0;
        let mut delim_count = 0;
        if !found_start {
            let mut search_start = 0;
            while let Some(pos) = memchr::memchr(delim, &line[search_start..]) {
                delim_count += 1;
                if delim_count == target_idx {
                    field_start = search_start + pos + 1;
                    found_start = true;
                    break;
                }
                search_start = search_start + pos + 1;
            }
        }
        if !found_start {
            if delim_count == 0 {
                // No delimiter at all: output the line unchanged (suppress is false here)
                unsafe {
                    buf_extend(buf, line);
                    buf_push(buf, line_delim);
                }
            } else {
                // Delimited line with fewer fields than requested: emit an empty line,
                // matching the other single-field paths.
                unsafe { buf_push(buf, line_delim) };
            }
        } else if field_start >= line.len() {
            // Empty field at end
            unsafe { buf_push(buf, line_delim) };
        } else {
            // Find the end of the target field
            match memchr::memchr(delim, &line[field_start..]) {
                Some(pos) => unsafe {
                    buf_extend(buf, &line[field_start..field_start + pos]);
                    buf_push(buf, line_delim);
                },
                None => unsafe {
                    buf_extend(buf, &line[field_start..]);
                    buf_push(buf, line_delim);
                },
            }
        }
        start = end_pos + 1;
    }
    // Handle last line without terminator
    if start < data.len() {
        let line = &data[start..];
        let mut field_start = 0;
        let mut found_start = target_idx == 0;
        let mut delim_count = 0;
        if !found_start {
            let mut search_start = 0;
            while let Some(pos) = memchr::memchr(delim, &line[search_start..]) {
                delim_count += 1;
                if delim_count == target_idx {
                    field_start = search_start + pos + 1;
                    found_start = true;
                    break;
                }
                search_start = search_start + pos + 1;
            }
        }
        if !found_start {
            if delim_count == 0 {
                unsafe {
                    buf_extend(buf, line);
                    buf_push(buf, line_delim);
                }
            } else {
                unsafe { buf_push(buf, line_delim) };
            }
        } else if field_start >= line.len() {
            unsafe { buf_push(buf, line_delim) };
        } else {
            match memchr::memchr(delim, &line[field_start..]) {
                Some(pos) => unsafe {
                    buf_extend(buf, &line[field_start..field_start + pos]);
                    buf_push(buf, line_delim);
                },
                None => unsafe {
                    buf_extend(buf, &line[field_start..]);
                    buf_push(buf, line_delim);
                },
            }
        }
    }
}

/// Process a chunk of data for single-field extraction.
fn process_single_field_chunk(
    data: &[u8],
    delim: u8,
    target_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
        start = end_pos + 1;
    }
    if start < data.len() {
        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
    }
}

/// Extract a single field from one line.
/// Reserves its own per-line capacity before using the unsafe buf helpers.
#[inline(always)]
fn extract_single_field_line(
    line: &[u8],
    delim: u8,
    target_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    // Ensure capacity for worst case (full line + newline)
    buf.reserve(line.len() + 1);

    // Ultra-fast path for first field: single memchr
    if target_idx == 0 {
        match memchr::memchr(delim, line) {
            Some(pos) => unsafe {
                buf_extend(buf, &line[..pos]);
                buf_push(buf, line_delim);
            },
            None => {
                if !suppress {
                    unsafe {
                        buf_extend(buf, line);
                        buf_push(buf, line_delim);
                    }
                }
            }
        }
        return;
    }

    let mut field_start = 0;
    let mut field_idx = 0;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        if field_idx == target_idx {
            unsafe {
                buf_extend(buf, &line[field_start..pos]);
                buf_push(buf, line_delim);
            }
            return;
        }
        field_idx += 1;
        field_start = pos + 1;
    }

    if !has_delim {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    if field_idx == target_idx {
        unsafe {
            buf_extend(buf, &line[field_start..]);
            buf_push(buf, line_delim);
        }
    } else {
        unsafe { buf_push(buf, line_delim) };
    }
}

/// Extract fields from a single line into the output buffer.
/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
#[inline(always)]
fn extract_fields_to_buf(
    line: &[u8],
    delim: u8,
    ranges: &[Range],
    output_delim: &[u8],
    suppress: bool,
    max_field: usize,
    field_mask: u64,
    line_delim: u8,
    buf: &mut Vec<u8>,
    complement: bool,
) {
    let len = line.len();

    if len == 0 {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
    // buffer to data.len(), so this check rarely triggers with the default one-byte
    // output delimiter. The bound covers the worst case where every input delimiter
    // is replaced by a longer output_delim, plus the trailing line terminator.
    let needed = len * output_delim.len().max(1) + 1;
    if buf.capacity() - buf.len() < needed {
        buf.reserve(needed);
    }

    let mut field_num: usize = 1;
    let mut field_start: usize = 0;
    let mut first_output = true;
    let mut has_delim = false;

    for delim_pos in memchr_iter(delim, line) {
        has_delim = true;

        if is_selected(field_num, field_mask, ranges, complement) {
            if !first_output {
                unsafe { buf_extend(buf, output_delim) };
            }
            unsafe { buf_extend(buf, &line[field_start..delim_pos]) };
            first_output = false;
        }

        field_num += 1;
        field_start = delim_pos + 1;

        if field_num > max_field {
            break;
        }
    }

    // Last field
    if (field_num <= max_field || complement)
        && has_delim
        && is_selected(field_num, field_mask, ranges, complement)
    {
        if !first_output {
            unsafe { buf_extend(buf, output_delim) };
        }
        unsafe { buf_extend(buf, &line[field_start..len]) };
        first_output = false;
    }

    if !first_output {
        unsafe { buf_push(buf, line_delim) };
    } else if !has_delim {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
    } else {
        unsafe { buf_push(buf, line_delim) };
    }
}
1645
1646// ── Fast path: byte/char extraction with batched output ──────────────────
1647
1648/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
1649/// Zero-copy: writes directly from the source data using output runs.
1650/// For lines shorter than max_bytes, the output is identical to the input,
1651/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
1652fn process_bytes_from_start(
1653    data: &[u8],
1654    max_bytes: usize,
1655    line_delim: u8,
1656    out: &mut impl Write,
1657) -> io::Result<()> {
1658    if data.len() >= PARALLEL_THRESHOLD {
1659        let chunks = split_into_chunks(data, line_delim);
1660        let results: Vec<Vec<u8>> = chunks
1661            .par_iter()
1662            .map(|chunk| {
1663                let mut buf = Vec::with_capacity(chunk.len());
1664                bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
1665                buf
1666            })
1667            .collect();
1668        // Use write_vectored (writev) to batch N writes into fewer syscalls
1669        let slices: Vec<IoSlice> = results
1670            .iter()
1671            .filter(|r| !r.is_empty())
1672            .map(|r| IoSlice::new(r))
1673            .collect();
1674        write_ioslices(out, &slices)?;
1675    } else {
1676        // Zero-copy path: track contiguous output runs and write directly from source.
1677        // For lines <= max_bytes, we include them as-is (no copy needed).
1678        // For lines > max_bytes, we flush the run, write the truncated line, start new run.
1679        bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
1680    }
1681    Ok(())
1682}
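
// Illustrative test sketch for the `cut -b1-N` fast path: lines at or under the
// prefix length pass through untouched, longer lines are truncated, and every
// output line is terminated with the line delimiter.
#[cfg(test)]
mod bytes_prefix_sketch {
    use super::*;

    #[test]
    fn truncates_only_lines_longer_than_the_prefix() {
        let mut out = Vec::new();
        process_bytes_from_start(b"hello\nhi\nlongline\n", 4, b'\n', &mut out).unwrap();
        assert_eq!(out, b"hell\nhi\nlong\n".to_vec());
    }
}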
1683
1684/// Zero-copy byte-prefix extraction: writes contiguous runs directly from the source data.
1685/// Lines longer than max_bytes break the run and are written out truncated.
1686#[inline]
1687fn bytes_from_start_zerocopy(
1688    data: &[u8],
1689    max_bytes: usize,
1690    line_delim: u8,
1691    out: &mut impl Write,
1692) -> io::Result<()> {
1693    let mut start = 0;
1694    let mut run_start: usize = 0;
1695
1696    for pos in memchr_iter(line_delim, data) {
1697        let line_len = pos - start;
1698        if line_len > max_bytes {
1699            // This line needs truncation. Flush current run, write truncated line.
1700            if run_start < start {
1701                out.write_all(&data[run_start..start])?;
1702            }
1703            out.write_all(&data[start..start + max_bytes])?;
1704            out.write_all(&[line_delim])?;
1705            run_start = pos + 1;
1706        }
1707        // else: line fits, keep it in the current contiguous run
1708        start = pos + 1;
1709    }
1710    // Handle last line without terminator
1711    if start < data.len() {
1712        let line_len = data.len() - start;
1713        if line_len > max_bytes {
1714            if run_start < start {
1715                out.write_all(&data[run_start..start])?;
1716            }
1717            out.write_all(&data[start..start + max_bytes])?;
1718            out.write_all(&[line_delim])?;
1719            return Ok(());
1720        }
1721    }
1722    // Flush remaining run (includes all short lines + the last line)
1723    if run_start < data.len() {
1724        out.write_all(&data[run_start..])?;
1725        // Add terminator if last byte isn't one
1726        if !data.is_empty() && *data.last().unwrap() != line_delim {
1727            out.write_all(&[line_delim])?;
1728        }
1729    }
1730    Ok(())
1731}
1732
1733/// Process a chunk for from-start byte range extraction (parallel path).
1734/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1735#[inline]
1736fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1737    // Reserve enough capacity: output can exceed input by one trailing delimiter
1738    buf.reserve(data.len() + 1);
1739
1740    let mut start = 0;
1741    for pos in memchr_iter(line_delim, data) {
1742        let line_len = pos - start;
1743        let take = line_len.min(max_bytes);
1744        unsafe {
1745            buf_extend(buf, &data[start..start + take]);
1746            buf_push(buf, line_delim);
1747        }
1748        start = pos + 1;
1749    }
1750    // Handle last line without terminator
1751    if start < data.len() {
1752        let line_len = data.len() - start;
1753        let take = line_len.min(max_bytes);
1754        unsafe {
1755            buf_extend(buf, &data[start..start + take]);
1756            buf_push(buf, line_delim);
1757        }
1758    }
1759}
1760
1761/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
1762fn process_bytes_from_offset(
1763    data: &[u8],
1764    skip_bytes: usize,
1765    line_delim: u8,
1766    out: &mut impl Write,
1767) -> io::Result<()> {
1768    if data.len() >= PARALLEL_THRESHOLD {
1769        let chunks = split_into_chunks(data, line_delim);
1770        let results: Vec<Vec<u8>> = chunks
1771            .par_iter()
1772            .map(|chunk| {
1773                let mut buf = Vec::with_capacity(chunk.len());
1774                bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
1775                buf
1776            })
1777            .collect();
1778        // Use write_vectored (writev) to batch N writes into fewer syscalls
1779        let slices: Vec<IoSlice> = results
1780            .iter()
1781            .filter(|r| !r.is_empty())
1782            .map(|r| IoSlice::new(r))
1783            .collect();
1784        write_ioslices(out, &slices)?;
1785    } else {
1786        // Zero-copy: write suffix of each line directly from source
1787        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
1788    }
1789    Ok(())
1790}
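
// Illustrative test sketch for the `cut -bN-` fast path: lines shorter than the
// skip count collapse to an empty output line (only the terminator is written),
// matching the chunked and zero-copy paths above. Assumes `write_ioslices`
// (defined earlier in this file) writes every slice it is given.
#[cfg(test)]
mod bytes_offset_sketch {
    use super::*;

    #[test]
    fn short_lines_become_empty_output_lines() {
        let mut out = Vec::new();
        // Equivalent to `cut -b5-`: drop the first four bytes of every line.
        process_bytes_from_offset(b"abcdef\nab\n", 4, b'\n', &mut out).unwrap();
        assert_eq!(out, b"ef\n\n".to_vec());
    }
}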
1791
1792/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
1793/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
1794/// reducing syscall overhead from 2 write_all calls per line to batched writev.
1795#[inline]
1796fn bytes_from_offset_zerocopy(
1797    data: &[u8],
1798    skip_bytes: usize,
1799    line_delim: u8,
1800    out: &mut impl Write,
1801) -> io::Result<()> {
1802    let delim_buf = [line_delim];
1803    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
1804
1805    let mut start = 0;
1806    for pos in memchr_iter(line_delim, data) {
1807        let line_len = pos - start;
1808        if line_len > skip_bytes {
1809            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
1810        }
1811        iov.push(IoSlice::new(&delim_buf));
1812        // Flush when approaching MAX_IOV to avoid oversized writev
1813        if iov.len() >= MAX_IOV - 1 {
1814            write_ioslices(out, &iov)?;
1815            iov.clear();
1816        }
1817        start = pos + 1;
1818    }
1819    if start < data.len() {
1820        let line_len = data.len() - start;
1821        if line_len > skip_bytes {
1822            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
1823        }
1824        iov.push(IoSlice::new(&delim_buf));
1825    }
1826    if !iov.is_empty() {
1827        write_ioslices(out, &iov)?;
1828    }
1829    Ok(())
1830}
1831
1832/// Process a chunk for from-offset byte range extraction.
1833/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1834#[inline]
1835fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1836    buf.reserve(data.len() + 1); // +1: a trailing delimiter may be added for an unterminated last line
1837
1838    let mut start = 0;
1839    for pos in memchr_iter(line_delim, data) {
1840        let line_len = pos - start;
1841        if line_len > skip_bytes {
1842            unsafe {
1843                buf_extend(buf, &data[start + skip_bytes..pos]);
1844            }
1845        }
1846        unsafe {
1847            buf_push(buf, line_delim);
1848        }
1849        start = pos + 1;
1850    }
1851    if start < data.len() {
1852        let line_len = data.len() - start;
1853        if line_len > skip_bytes {
1854            unsafe {
1855                buf_extend(buf, &data[start + skip_bytes..data.len()]);
1856            }
1857        }
1858        unsafe {
1859            buf_push(buf, line_delim);
1860        }
1861    }
1862}
1863
1864/// Optimized byte/char extraction with batched output and parallel processing.
1865fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
1866    let line_delim = cfg.line_delim;
1867    let ranges = cfg.ranges;
1868    let complement = cfg.complement;
1869    let output_delim = cfg.output_delim;
1870
1871    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
1872    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
1873        let max_bytes = ranges[0].end;
1874        if max_bytes < usize::MAX {
1875            return process_bytes_from_start(data, max_bytes, line_delim, out);
1876        }
1877    }
1878
1879    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
1880    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
1881        let skip_bytes = ranges[0].start.saturating_sub(1);
1882        if skip_bytes > 0 {
1883            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
1884        }
1885    }
1886
1887    if data.len() >= PARALLEL_THRESHOLD {
1888        let chunks = split_into_chunks(data, line_delim);
1889        let results: Vec<Vec<u8>> = chunks
1890            .par_iter()
1891            .map(|chunk| {
1892                let mut buf = Vec::with_capacity(chunk.len());
1893                process_bytes_chunk(
1894                    chunk,
1895                    ranges,
1896                    complement,
1897                    output_delim,
1898                    line_delim,
1899                    &mut buf,
1900                );
1901                buf
1902            })
1903            .collect();
1904        // Use write_vectored (writev) to batch N writes into fewer syscalls
1905        let slices: Vec<IoSlice> = results
1906            .iter()
1907            .filter(|r| !r.is_empty())
1908            .map(|r| IoSlice::new(r))
1909            .collect();
1910        write_ioslices(out, &slices)?;
1911    } else {
1912        let mut buf = Vec::with_capacity(data.len());
1913        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
1914        if !buf.is_empty() {
1915            out.write_all(&buf)?;
1916        }
1917    }
1918    Ok(())
1919}
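
// Illustrative test sketch for the general (non-fast-path) byte extraction:
// with more than one range, process_bytes_fast falls through to
// process_bytes_chunk / cut_bytes_to_buf.
#[cfg(test)]
mod bytes_multi_range_sketch {
    use super::*;

    #[test]
    fn multiple_ranges_take_the_general_path() {
        let ranges = [Range { start: 2, end: 3 }, Range { start: 5, end: 5 }];
        let cfg = CutConfig {
            mode: CutMode::Bytes,
            ranges: &ranges,
            complement: false,
            delim: b'\t',
            output_delim: b"",
            suppress_no_delim: false,
            line_delim: b'\n',
        };
        let mut out = Vec::new();
        // Equivalent to `cut -b2-3,5`.
        process_bytes_fast(b"abcdef\n", &cfg, &mut out).unwrap();
        assert_eq!(out, b"bce\n".to_vec());
    }
}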
1920
1921/// Process a chunk of data for byte/char extraction.
1922fn process_bytes_chunk(
1923    data: &[u8],
1924    ranges: &[Range],
1925    complement: bool,
1926    output_delim: &[u8],
1927    line_delim: u8,
1928    buf: &mut Vec<u8>,
1929) {
1930    let mut start = 0;
1931    for end_pos in memchr_iter(line_delim, data) {
1932        let line = &data[start..end_pos];
1933        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
1934        buf.push(line_delim);
1935        start = end_pos + 1;
1936    }
1937    if start < data.len() {
1938        cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
1939        buf.push(line_delim);
1940    }
1941}
1942
1943/// Extract byte ranges from a line into the output buffer.
1944/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
1945#[inline(always)]
1946fn cut_bytes_to_buf(
1947    line: &[u8],
1948    ranges: &[Range],
1949    complement: bool,
1950    output_delim: &[u8],
1951    buf: &mut Vec<u8>,
1952) {
1953    let len = line.len();
1954    let mut first_range = true;
1955
1956    // Reserve worst case: full line + delimiters between ranges
1957    buf.reserve(len + output_delim.len() * ranges.len() + 1);
1958
1959    if complement {
1960        let mut pos: usize = 1;
1961        for r in ranges {
1962            let rs = r.start;
1963            let re = r.end.min(len);
1964            if pos < rs {
1965                if !first_range && !output_delim.is_empty() {
1966                    unsafe { buf_extend(buf, output_delim) };
1967                }
1968                unsafe { buf_extend(buf, &line[pos - 1..(rs - 1).min(len)]) }; // clamp: the range may start past the end of the line
1969                first_range = false;
1970            }
1971            pos = re + 1;
1972            if pos > len {
1973                break;
1974            }
1975        }
1976        if pos <= len {
1977            if !first_range && !output_delim.is_empty() {
1978                unsafe { buf_extend(buf, output_delim) };
1979            }
1980            unsafe { buf_extend(buf, &line[pos - 1..len]) };
1981        }
1982    } else if output_delim.is_empty() && ranges.len() == 1 {
1983        // Ultra-fast path: single range, no output delimiter
1984        let start = ranges[0].start.saturating_sub(1);
1985        let end = ranges[0].end.min(len);
1986        if start < len {
1987            unsafe { buf_extend(buf, &line[start..end]) };
1988        }
1989    } else {
1990        for r in ranges {
1991            let start = r.start.saturating_sub(1);
1992            let end = r.end.min(len);
1993            if start >= len {
1994                break;
1995            }
1996            if !first_range && !output_delim.is_empty() {
1997                unsafe { buf_extend(buf, output_delim) };
1998            }
1999            unsafe { buf_extend(buf, &line[start..end]) };
2000            first_range = false;
2001        }
2002    }
2003}
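
// Illustrative regression sketch for the complement clamping above: a complement
// range that starts past the end of the line must keep the whole line rather
// than slicing out of bounds.
#[cfg(test)]
mod complement_clamp_sketch {
    use super::*;

    #[test]
    fn complement_ranges_past_the_end_of_the_line_are_clamped() {
        let mut buf = Vec::new();
        // `cut -b10-20 --complement` on a 5-byte line keeps the whole line.
        cut_bytes_to_buf(b"abcde", &[Range { start: 10, end: 20 }], true, b"", &mut buf);
        assert_eq!(buf, b"abcde".to_vec());

        buf.clear();
        // `cut -b2-3 --complement` keeps everything outside bytes 2-3.
        cut_bytes_to_buf(b"abcde", &[Range { start: 2, end: 3 }], true, b"", &mut buf);
        assert_eq!(buf, b"ade".to_vec());
    }
}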
2004
2005// ── Public API ───────────────────────────────────────────────────────────
2006
2007/// Cut fields from a line using a delimiter. Writes to `out` without a line terminator;
2008/// returns `false` only when the line has no delimiter and `suppress_no_delim` is set.
2008#[inline]
2009pub fn cut_fields(
2010    line: &[u8],
2011    delim: u8,
2012    ranges: &[Range],
2013    complement: bool,
2014    output_delim: &[u8],
2015    suppress_no_delim: bool,
2016    out: &mut impl Write,
2017) -> io::Result<bool> {
2018    if memchr::memchr(delim, line).is_none() {
2019        if !suppress_no_delim {
2020            out.write_all(line)?;
2021            return Ok(true);
2022        }
2023        return Ok(false);
2024    }
2025
2026    let mut field_num: usize = 1;
2027    let mut field_start: usize = 0;
2028    let mut first_output = true;
2029
2030    for delim_pos in memchr_iter(delim, line) {
2031        let selected = in_ranges(ranges, field_num) != complement;
2032        if selected {
2033            if !first_output {
2034                out.write_all(output_delim)?;
2035            }
2036            out.write_all(&line[field_start..delim_pos])?;
2037            first_output = false;
2038        }
2039        field_start = delim_pos + 1;
2040        field_num += 1;
2041    }
2042
2043    let selected = in_ranges(ranges, field_num) != complement;
2044    if selected {
2045        if !first_output {
2046            out.write_all(output_delim)?;
2047        }
2048        out.write_all(&line[field_start..])?;
2049    }
2050
2051    Ok(true)
2052}
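
// Illustrative usage sketch for `cut_fields`: the selected fields are written
// joined by the output delimiter, no line terminator is emitted, and the
// returned bool tells the caller whether the line produced output to terminate.
#[cfg(test)]
mod cut_fields_sketch {
    use super::*;

    #[test]
    fn selects_fields_without_writing_a_line_terminator() {
        let mut out = Vec::new();
        let ranges = [Range { start: 2, end: 2 }];
        let wrote = cut_fields(b"a:b:c", b':', &ranges, false, b":", false, &mut out).unwrap();
        assert!(wrote);
        // The caller is responsible for appending the line delimiter.
        assert_eq!(out, b"b".to_vec());
    }
}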
2053
2054/// Cut bytes/chars from a line. Writes selected bytes to `out`.
2055#[inline]
2056pub fn cut_bytes(
2057    line: &[u8],
2058    ranges: &[Range],
2059    complement: bool,
2060    output_delim: &[u8],
2061    out: &mut impl Write,
2062) -> io::Result<bool> {
2063    let mut first_range = true;
2064
2065    if complement {
2066        let len = line.len();
2067        let mut comp_ranges = Vec::new();
2068        let mut pos: usize = 1;
2069        for r in ranges {
2070            let rs = r.start;
2071            let re = r.end.min(len);
2072            if pos < rs {
2073                comp_ranges.push((pos, (rs - 1).min(len))); // clamp: the range may start past the end of the line
2074            }
2075            pos = re + 1;
2076            if pos > len {
2077                break;
2078            }
2079        }
2080        if pos <= len {
2081            comp_ranges.push((pos, len));
2082        }
2083        for &(s, e) in &comp_ranges {
2084            if !first_range && !output_delim.is_empty() {
2085                out.write_all(output_delim)?;
2086            }
2087            out.write_all(&line[s - 1..e])?;
2088            first_range = false;
2089        }
2090    } else {
2091        for r in ranges {
2092            let start = r.start.saturating_sub(1);
2093            let end = r.end.min(line.len());
2094            if start >= line.len() {
2095                break;
2096            }
2097            if !first_range && !output_delim.is_empty() {
2098                out.write_all(output_delim)?;
2099            }
2100            out.write_all(&line[start..end])?;
2101            first_range = false;
2102        }
2103    }
2104    Ok(true)
2105}
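
// Illustrative usage sketch for `cut_bytes` in complement mode: the kept
// segments on either side of the excluded range are joined with the output
// delimiter when one is configured.
#[cfg(test)]
mod cut_bytes_sketch {
    use super::*;

    #[test]
    fn complement_joins_kept_segments_with_the_output_delimiter() {
        let mut out = Vec::new();
        cut_bytes(b"abcdef", &[Range { start: 2, end: 3 }], true, b"-", &mut out).unwrap();
        assert_eq!(out, b"a-def".to_vec());
    }
}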
2106
2107/// Process a full data buffer (from mmap or a full read) by applying the configured cut operation.
2108pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2109    match cfg.mode {
2110        CutMode::Fields => process_fields_fast(data, cfg, out),
2111        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
2112    }
2113}
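
// Illustrative usage sketch for the top-level entry point: Characters mode is
// routed to the byte path (see the match above), so multi-byte UTF-8 characters
// are counted, and may be split, as individual bytes.
#[cfg(test)]
mod process_cut_data_sketch {
    use super::*;

    #[test]
    fn character_mode_uses_the_byte_path() {
        let ranges = [Range { start: 1, end: 3 }];
        let cfg = CutConfig {
            mode: CutMode::Characters,
            ranges: &ranges,
            complement: false,
            delim: b'\t',
            output_delim: b"",
            suppress_no_delim: false,
            line_delim: b'\n',
        };
        let mut out = Vec::new();
        process_cut_data("héllo\n".as_bytes(), &cfg, &mut out).unwrap();
        // The first three *bytes* are kept: 'h' plus both bytes of the two-byte 'é'.
        assert_eq!(out, b"h\xc3\xa9\n".to_vec());
    }
}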
2114
2115/// Process input from a reader (for stdin).
2116/// Uses batch reading: reads large chunks (4MB), then processes them in batch
2117/// using the fast mmap-based paths, avoiding per-line read_until overhead.
2118pub fn process_cut_reader<R: BufRead>(
2119    mut reader: R,
2120    cfg: &CutConfig,
2121    out: &mut impl Write,
2122) -> io::Result<()> {
2123    const CHUNK_SIZE: usize = 4 * 1024 * 1024; // 4MB read chunks
2124    let line_delim = cfg.line_delim;
2125
2126    // Read large chunks and process in batch.
2127    // We keep a buffer; after processing complete lines, we shift leftover to the front.
2128    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
2129
2130    loop {
2131        // Read up to CHUNK_SIZE bytes directly into the Vec's spare capacity
2132        buf.reserve(CHUNK_SIZE);
2133        let read_start = buf.len();
2134        unsafe { buf.set_len(read_start + CHUNK_SIZE) }; // SAFETY: capacity reserved above; truncate() below keeps only the bytes the reader wrote
2135        let n = read_fully(&mut reader, &mut buf[read_start..])?;
2136        buf.truncate(read_start + n);
2137
2138        if buf.is_empty() {
2139            break;
2140        }
2141
2142        if n == 0 {
2143            // EOF with leftover data (last line without terminator)
2144            process_cut_data(&buf, cfg, out)?;
2145            break;
2146        }
2147
2148        // Find the last line delimiter in the buffer so we process complete lines
2149        let process_end = match memchr::memrchr(line_delim, &buf) {
2150            Some(pos) => pos + 1,
2151            None => {
2152                // No line delimiter found — keep accumulating
2153                continue;
2154            }
2155        };
2156
2157        // Process the complete lines using the fast batch path
2158        process_cut_data(&buf[..process_end], cfg, out)?;
2159
2160        // Shift leftover to the front for next iteration
2161        let leftover_len = buf.len() - process_end;
2162        if leftover_len > 0 {
2163            buf.copy_within(process_end.., 0);
2164        }
2165        buf.truncate(leftover_len);
2166    }
2167
2168    Ok(())
2169}
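
// Illustrative usage sketch for the streaming entry point: feeding the same
// input through a BufRead produces the same output as the mmap/batch path.
#[cfg(test)]
mod reader_sketch {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn reader_path_matches_the_batch_path() {
        let ranges = [Range { start: 1, end: 5 }];
        let cfg = CutConfig {
            mode: CutMode::Bytes,
            ranges: &ranges,
            complement: false,
            delim: b'\t',
            output_delim: b"",
            suppress_no_delim: false,
            line_delim: b'\n',
        };
        let mut from_reader = Vec::new();
        process_cut_reader(Cursor::new(&b"hello world\nhi\n"[..]), &cfg, &mut from_reader).unwrap();

        let mut from_data = Vec::new();
        process_cut_data(b"hello world\nhi\n", &cfg, &mut from_data).unwrap();

        assert_eq!(from_reader, b"hello\nhi\n".to_vec());
        assert_eq!(from_reader, from_data);
    }
}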
2170
2171/// Read as many bytes as possible into buf, retrying on partial and interrupted reads.
2172#[inline]
2173fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
2174    // A full read or EOF on the first pass is the common case; short reads (from
2175    // pipes or ttys) and interrupted reads (EINTR) are retried until the buffer
2176    // is full or the reader reports end of input.
2177    let mut total = 0;
2178    while total < buf.len() {
2179        match reader.read(&mut buf[total..]) {
2180            // EOF: stop and return whatever has been read so far.
2181            Ok(0) => break,
2182            Ok(n) => total += n,
2183            // EINTR: retry instead of surfacing a spurious error.
2184            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2185            Err(e) => return Err(e),
2186        }
2187    }
2188    Ok(total)
2189}
2190
2191/// Cut operation mode
2192#[derive(Debug, Clone, Copy, PartialEq)]
2193pub enum CutMode {
2194    Bytes,
2195    Characters,
2196    Fields,
2197}