Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, IoSlice, Write};
4
/// Minimum input size, in bytes, for parallel processing (2 MiB).
///
/// Inputs shorter than this are processed on the calling thread as a single chunk.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Maximum number of `IoSlice` entries passed to a single `write_vectored` call.
///
/// 1024 is the Linux default limit on iovec entries per `writev`.
const MAX_IOV: usize = 1024;
10
/// Configuration for cut operations.
///
/// The range list and the output delimiter are borrowed from the caller for `'a`.
pub struct CutConfig<'a> {
    /// Selection mode. NOTE(review): `CutMode` is declared outside this view —
    /// presumably bytes / characters / fields; confirm against its definition.
    pub mode: CutMode,
    /// Positions or fields to select (1-based, inclusive ranges).
    pub ranges: &'a [Range],
    /// When true, select everything that is NOT covered by `ranges`.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between selected fields in the output.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no field delimiter are dropped instead of
    /// being passed through unchanged.
    pub suppress_no_delim: bool,
    /// Line terminator byte.
    pub line_delim: u8,
}
21
/// An inclusive, 1-based range of positions or fields, e.g. `1`, `3-5`, `-3`, `4-`.
#[derive(Debug, Clone)]
pub struct Range {
    /// First selected position (1-based). [`parse_ranges`] never produces 0 here:
    /// an omitted start (as in `-3`) is stored as 1.
    pub start: usize,
    /// Last selected position (1-based, inclusive); `usize::MAX` means "to end of line".
    pub end: usize,
}

/// Parse a LIST specification like `"1,3-5,7-"` into ranges.
///
/// Each comma-separated element is `N`, `N-M`, `N-` or `-M`; surrounding whitespace
/// is ignored and empty elements are skipped. Positions are 1-based.
///
/// Returns the ranges sorted by start, with overlapping and adjacent ranges merged.
///
/// # Errors
///
/// Returns a message when:
/// - an element is not a valid number or range,
/// - an element is a bare `-` (a range needs at least one endpoint),
/// - a position is 0,
/// - a range is decreasing (`5-3`),
/// - the list contains no elements at all.
pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
    const NUMBERED_FROM_ONE: &str = "fields and positions are numbered from 1";

    let mut ranges: Vec<Range> = Vec::new();

    for part in spec.split(',').map(str::trim).filter(|p| !p.is_empty()) {
        let range = match part.split_once('-') {
            // A dash with neither endpoint selects nothing meaningful; GNU cut
            // rejects it rather than treating it as "everything".
            Some(("", "")) => {
                return Err(format!("invalid range with no endpoint: {}", part));
            }
            Some((left, right)) => {
                let bad_range = || format!("invalid range: '{}'", part);

                let start = if left.is_empty() {
                    1
                } else {
                    left.parse::<usize>().map_err(|_| bad_range())?
                };
                let end = if right.is_empty() {
                    usize::MAX
                } else {
                    right.parse::<usize>().map_err(|_| bad_range())?
                };

                if start == 0 {
                    return Err(NUMBERED_FROM_ONE.to_string());
                }
                if start > end {
                    return Err(format!("invalid decreasing range: '{}'", part));
                }
                Range { start, end }
            }
            None => {
                let n = part
                    .parse::<usize>()
                    .map_err(|_| format!("invalid field: '{}'", part))?;
                if n == 0 {
                    return Err(NUMBERED_FROM_ONE.to_string());
                }
                Range { start: n, end: n }
            }
        };
        ranges.push(range);
    }

    if ranges.is_empty() {
        return Err("you must specify a list of bytes, characters, or fields".to_string());
    }

    // Sort, then fold each range into its predecessor when they overlap or touch.
    // Equal keys are identical ranges, so an unstable sort is sufficient.
    ranges.sort_unstable_by_key(|r| (r.start, r.end));
    let mut merged: Vec<Range> = Vec::with_capacity(ranges.len());
    for r in ranges {
        match merged.last_mut() {
            // saturating_add: `last.end` may be usize::MAX for open-ended ranges.
            Some(last) if r.start <= last.end.saturating_add(1) => {
                last.end = last.end.max(r.end);
            }
            _ => merged.push(r),
        }
    }

    Ok(merged)
}
96
97/// Check if a 1-based position is in any range.
98/// Ranges must be sorted. Uses early exit since ranges are sorted.
99#[inline(always)]
100fn in_ranges(ranges: &[Range], pos: usize) -> bool {
101    for r in ranges {
102        if pos < r.start {
103            return false;
104        }
105        if pos <= r.end {
106            return true;
107        }
108    }
109    false
110}
111
112/// Pre-compute a 64-bit mask for field selection.
113/// Bit i-1 is set if field i should be output.
114#[inline]
115fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
116    let mut mask: u64 = 0;
117    for i in 1..=64u32 {
118        let in_range = in_ranges(ranges, i as usize);
119        if in_range != complement {
120            mask |= 1u64 << (i - 1);
121        }
122    }
123    mask
124}
125
126/// Check if a field should be selected, using bitset for first 64 fields.
127#[inline(always)]
128fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
129    if field_num <= 64 {
130        (mask >> (field_num - 1)) & 1 == 1
131    } else {
132        in_ranges(ranges, field_num) != complement
133    }
134}
135
136// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
137
/// Append `data` to `buf` without growing it and without a capacity check.
///
/// # Safety
///
/// The caller must guarantee that `buf` has at least `data.len()` bytes of spare
/// capacity, i.e. `buf.capacity() - buf.len() >= data.len()`. Violating this
/// writes past the end of the allocation. Debug builds assert the precondition.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    debug_assert!(
        buf.capacity() - buf.len() >= data.len(),
        "buf_extend: insufficient spare capacity"
    );
    // SAFETY: the caller guarantees enough spare capacity, so the destination
    // range lies inside the allocation; `data` cannot overlap it because `buf`
    // is exclusively borrowed. The copied bytes are initialized before set_len.
    unsafe {
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
148
/// Append one byte to `buf` without growing it and without a capacity check.
///
/// # Safety
///
/// The caller must guarantee that `buf` has at least one byte of spare capacity,
/// i.e. `buf.len() < buf.capacity()`. Violating this writes past the end of the
/// allocation. Debug builds assert the precondition.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    debug_assert!(
        buf.len() < buf.capacity(),
        "buf_push: insufficient spare capacity"
    );
    // SAFETY: the caller guarantees one spare byte, so `len` is inside the
    // allocation; the byte is written before the length is extended over it.
    unsafe {
        let len = buf.len();
        buf.as_mut_ptr().add(len).write(b);
        buf.set_len(len + 1);
    }
}
159
160/// Write multiple IoSlice buffers using write_vectored (writev syscall).
161/// Batches into MAX_IOV-sized groups. Falls back to write_all per slice for partial writes.
162#[inline]
163fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
164    if slices.is_empty() {
165        return Ok(());
166    }
167    for batch in slices.chunks(MAX_IOV) {
168        let total: usize = batch.iter().map(|s| s.len()).sum();
169        match out.write_vectored(batch) {
170            Ok(n) if n >= total => continue,
171            Ok(mut written) => {
172                // Partial write: fall back to write_all per remaining slice
173                for slice in batch {
174                    let slen = slice.len();
175                    if written >= slen {
176                        written -= slen;
177                        continue;
178                    }
179                    if written > 0 {
180                        out.write_all(&slice[written..])?;
181                        written = 0;
182                    } else {
183                        out.write_all(slice)?;
184                    }
185                }
186            }
187            Err(e) => return Err(e),
188        }
189    }
190    Ok(())
191}
192
193// ── Chunk splitting for parallel processing ──────────────────────────────
194
195/// Split data into chunks aligned to line boundaries for parallel processing.
196fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
197    let num_threads = rayon::current_num_threads().max(1);
198    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
199        return vec![data];
200    }
201
202    let chunk_size = data.len() / num_threads;
203    let mut chunks = Vec::with_capacity(num_threads);
204    let mut pos = 0;
205
206    for _ in 0..num_threads - 1 {
207        let target = pos + chunk_size;
208        if target >= data.len() {
209            break;
210        }
211        let boundary = memchr::memchr(line_delim, &data[target..])
212            .map(|p| target + p + 1)
213            .unwrap_or(data.len());
214        if boundary > pos {
215            chunks.push(&data[pos..boundary]);
216        }
217        pos = boundary;
218    }
219
220    if pos < data.len() {
221        chunks.push(&data[pos..]);
222    }
223
224    chunks
225}
226
227// ── Fast path: field extraction with batched output ──────────────────────
228
229/// Optimized field extraction with early exit and batched output.
230fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
231    let delim = cfg.delim;
232    let line_delim = cfg.line_delim;
233    let ranges = cfg.ranges;
234    let complement = cfg.complement;
235    let output_delim = cfg.output_delim;
236    let suppress = cfg.suppress_no_delim;
237
238    // Zero-copy fast path: if delimiter never appears, output = input unchanged.
239    if !complement && memchr::memchr(delim, data).is_none() {
240        if suppress {
241            return Ok(());
242        }
243        out.write_all(data)?;
244        if !data.is_empty() && *data.last().unwrap() != line_delim {
245            out.write_all(&[line_delim])?;
246        }
247        return Ok(());
248    }
249
250    // Ultra-fast path: single field extraction (e.g., cut -f5)
251    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
252        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
253    }
254
255    // Fast path: complement of single field with default output delimiter.
256    if complement
257        && ranges.len() == 1
258        && ranges[0].start == ranges[0].end
259        && output_delim.len() == 1
260        && output_delim[0] == delim
261    {
262        return process_complement_single_field(
263            data,
264            delim,
265            line_delim,
266            ranges[0].start,
267            suppress,
268            out,
269        );
270    }
271
272    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
273    if !complement
274        && ranges.len() == 1
275        && ranges[0].start == 1
276        && output_delim.len() == 1
277        && output_delim[0] == delim
278        && ranges[0].end < usize::MAX
279    {
280        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
281    }
282
283    // Fast path: open-ended field range from field N (e.g., cut -f3-)
284    if !complement
285        && ranges.len() == 1
286        && ranges[0].end == usize::MAX
287        && ranges[0].start > 1
288        && output_delim.len() == 1
289        && output_delim[0] == delim
290    {
291        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
292    }
293
294    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
295    if !complement
296        && ranges.len() == 1
297        && ranges[0].start > 1
298        && ranges[0].end < usize::MAX
299        && output_delim.len() == 1
300        && output_delim[0] == delim
301    {
302        return process_fields_mid_range(
303            data,
304            delim,
305            line_delim,
306            ranges[0].start,
307            ranges[0].end,
308            suppress,
309            out,
310        );
311    }
312
313    // General field extraction
314    let max_field = if complement {
315        usize::MAX
316    } else {
317        ranges.last().map(|r| r.end).unwrap_or(0)
318    };
319    let field_mask = compute_field_mask(ranges, complement);
320
321    if data.len() >= PARALLEL_THRESHOLD {
322        let chunks = split_into_chunks(data, line_delim);
323        let results: Vec<Vec<u8>> = chunks
324            .par_iter()
325            .map(|chunk| {
326                let mut buf = Vec::with_capacity(chunk.len());
327                process_fields_chunk(
328                    chunk,
329                    delim,
330                    ranges,
331                    output_delim,
332                    suppress,
333                    max_field,
334                    field_mask,
335                    line_delim,
336                    complement,
337                    &mut buf,
338                );
339                buf
340            })
341            .collect();
342        // Use write_vectored (writev) to batch N writes into fewer syscalls
343        let slices: Vec<IoSlice> = results
344            .iter()
345            .filter(|r| !r.is_empty())
346            .map(|r| IoSlice::new(r))
347            .collect();
348        write_ioslices(out, &slices)?;
349    } else {
350        let mut buf = Vec::with_capacity(data.len());
351        process_fields_chunk(
352            data,
353            delim,
354            ranges,
355            output_delim,
356            suppress,
357            max_field,
358            field_mask,
359            line_delim,
360            complement,
361            &mut buf,
362        );
363        if !buf.is_empty() {
364            out.write_all(&buf)?;
365        }
366    }
367    Ok(())
368}
369
370/// Process a chunk of data for general field extraction.
371/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
372/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
373/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
374fn process_fields_chunk(
375    data: &[u8],
376    delim: u8,
377    ranges: &[Range],
378    output_delim: &[u8],
379    suppress: bool,
380    max_field: usize,
381    field_mask: u64,
382    line_delim: u8,
383    complement: bool,
384    buf: &mut Vec<u8>,
385) {
386    // When delim != line_delim and max_field is bounded, use two-level approach:
387    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
388    // This avoids scanning past max_field on each line (significant for lines with
389    // many columns but small field selection like -f1,3,5 on 20-column CSV).
390    // For complement or unbounded ranges, use single-pass memchr2_iter which
391    // needs to process all delimiters anyway.
392    if delim != line_delim && max_field < usize::MAX && !complement {
393        buf.reserve(data.len());
394        let mut start = 0;
395        for end_pos in memchr_iter(line_delim, data) {
396            let line = &data[start..end_pos];
397            extract_fields_to_buf(
398                line,
399                delim,
400                ranges,
401                output_delim,
402                suppress,
403                max_field,
404                field_mask,
405                line_delim,
406                buf,
407                complement,
408            );
409            start = end_pos + 1;
410        }
411        if start < data.len() {
412            extract_fields_to_buf(
413                &data[start..],
414                delim,
415                ranges,
416                output_delim,
417                suppress,
418                max_field,
419                field_mask,
420                line_delim,
421                buf,
422                complement,
423            );
424        }
425        return;
426    }
427
428    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
429    // delimiter and line_delim in one SIMD scan.
430    if delim != line_delim {
431        buf.reserve(data.len());
432
433        let mut line_start: usize = 0;
434        let mut field_start: usize = 0;
435        let mut field_num: usize = 1;
436        let mut first_output = true;
437        let mut has_delim = false;
438
439        for pos in memchr::memchr2_iter(delim, line_delim, data) {
440            let byte = unsafe { *data.get_unchecked(pos) };
441
442            if byte == line_delim {
443                // End of line: flush final field and emit line delimiter
444                if (field_num <= max_field || complement)
445                    && has_delim
446                    && is_selected(field_num, field_mask, ranges, complement)
447                {
448                    if !first_output {
449                        unsafe { buf_extend(buf, output_delim) };
450                    }
451                    unsafe { buf_extend(buf, &data[field_start..pos]) };
452                    first_output = false;
453                }
454
455                if !first_output {
456                    unsafe { buf_push(buf, line_delim) };
457                } else if !has_delim {
458                    if !suppress {
459                        unsafe {
460                            buf_extend(buf, &data[line_start..pos]);
461                            buf_push(buf, line_delim);
462                        }
463                    }
464                } else {
465                    unsafe { buf_push(buf, line_delim) };
466                }
467
468                // Reset state for next line
469                line_start = pos + 1;
470                field_start = pos + 1;
471                field_num = 1;
472                first_output = true;
473                has_delim = false;
474            } else {
475                // Field delimiter hit
476                has_delim = true;
477
478                if is_selected(field_num, field_mask, ranges, complement) {
479                    if !first_output {
480                        unsafe { buf_extend(buf, output_delim) };
481                    }
482                    unsafe { buf_extend(buf, &data[field_start..pos]) };
483                    first_output = false;
484                }
485
486                field_num += 1;
487                field_start = pos + 1;
488            }
489        }
490
491        // Handle last line without trailing line_delim
492        if line_start < data.len() {
493            let line = &data[line_start..];
494            if !line.is_empty() {
495                if (field_num <= max_field || complement)
496                    && has_delim
497                    && is_selected(field_num, field_mask, ranges, complement)
498                {
499                    if !first_output {
500                        unsafe { buf_extend(buf, output_delim) };
501                    }
502                    unsafe { buf_extend(buf, &data[field_start..data.len()]) };
503                    first_output = false;
504                }
505
506                if !first_output {
507                    unsafe { buf_push(buf, line_delim) };
508                } else if !has_delim {
509                    if !suppress {
510                        unsafe {
511                            buf_extend(buf, &data[line_start..data.len()]);
512                            buf_push(buf, line_delim);
513                        }
514                    }
515                } else {
516                    unsafe { buf_push(buf, line_delim) };
517                }
518            }
519        }
520
521        return;
522    }
523
524    // Fallback: when delim == line_delim, use the two-level scan approach
525    let mut start = 0;
526    for end_pos in memchr_iter(line_delim, data) {
527        let line = &data[start..end_pos];
528        extract_fields_to_buf(
529            line,
530            delim,
531            ranges,
532            output_delim,
533            suppress,
534            max_field,
535            field_mask,
536            line_delim,
537            buf,
538            complement,
539        );
540        start = end_pos + 1;
541    }
542    if start < data.len() {
543        extract_fields_to_buf(
544            &data[start..],
545            delim,
546            ranges,
547            output_delim,
548            suppress,
549            max_field,
550            field_mask,
551            line_delim,
552            buf,
553            complement,
554        );
555    }
556}
557
558// ── Ultra-fast single field extraction ───────────────────────────────────
559
560/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
561/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
562/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
563fn process_single_field(
564    data: &[u8],
565    delim: u8,
566    line_delim: u8,
567    target: usize,
568    suppress: bool,
569    out: &mut impl Write,
570) -> io::Result<()> {
571    let target_idx = target - 1;
572
573    // Combined SIMD scan: single pass using memchr2 for any target field.
574    if delim != line_delim {
575        if data.len() >= PARALLEL_THRESHOLD {
576            let chunks = split_into_chunks(data, line_delim);
577            let results: Vec<Vec<u8>> = chunks
578                .par_iter()
579                .map(|chunk| {
580                    let mut buf = Vec::with_capacity(chunk.len());
581                    process_nth_field_combined(
582                        chunk, delim, line_delim, target_idx, suppress, &mut buf,
583                    );
584                    buf
585                })
586                .collect();
587            // Use write_vectored (writev) to batch N writes into fewer syscalls
588            let slices: Vec<IoSlice> = results
589                .iter()
590                .filter(|r| !r.is_empty())
591                .map(|r| IoSlice::new(r))
592                .collect();
593            write_ioslices(out, &slices)?;
594        } else if target_idx == 0 && !suppress {
595            // Zero-copy fast path for field 1 (most common case):
596            // For each line, either truncate at the first delimiter, or pass through.
597            // Since most lines have a delimiter, and field 1 is a prefix of each line,
598            // we can write contiguous runs directly from the source data.
599            single_field1_zerocopy(data, delim, line_delim, out)?;
600        } else if target_idx <= 3 && !suppress {
601            // Optimized path for small field indices (fields 2-4):
602            // Uses successive memchr calls per line instead of the full combined scan.
603            // For field 2: two memchr calls (find first delim, find second).
604            // This avoids the memchr2_iter overhead for every byte in the line.
605            let mut buf = Vec::with_capacity(data.len());
606            process_small_field_combined(data, delim, line_delim, target_idx, &mut buf);
607            if !buf.is_empty() {
608                out.write_all(&buf)?;
609            }
610        } else {
611            let mut buf = Vec::with_capacity(data.len());
612            process_nth_field_combined(data, delim, line_delim, target_idx, suppress, &mut buf);
613            if !buf.is_empty() {
614                out.write_all(&buf)?;
615            }
616        }
617        return Ok(());
618    }
619
620    // Fallback for delim == line_delim: nested loop approach
621    if data.len() >= PARALLEL_THRESHOLD {
622        let chunks = split_into_chunks(data, line_delim);
623        let results: Vec<Vec<u8>> = chunks
624            .par_iter()
625            .map(|chunk| {
626                let mut buf = Vec::with_capacity(chunk.len() / 4);
627                process_single_field_chunk(
628                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
629                );
630                buf
631            })
632            .collect();
633        // Use write_vectored (writev) to batch N writes into fewer syscalls
634        let slices: Vec<IoSlice> = results
635            .iter()
636            .filter(|r| !r.is_empty())
637            .map(|r| IoSlice::new(r))
638            .collect();
639        write_ioslices(out, &slices)?;
640    } else {
641        let mut buf = Vec::with_capacity(data.len() / 4);
642        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
643        if !buf.is_empty() {
644            out.write_all(&buf)?;
645        }
646    }
647    Ok(())
648}
649
650/// Complement single-field extraction: skip one field, output rest unchanged.
651fn process_complement_single_field(
652    data: &[u8],
653    delim: u8,
654    line_delim: u8,
655    skip_field: usize,
656    suppress: bool,
657    out: &mut impl Write,
658) -> io::Result<()> {
659    let skip_idx = skip_field - 1;
660
661    if data.len() >= PARALLEL_THRESHOLD {
662        let chunks = split_into_chunks(data, line_delim);
663        let results: Vec<Vec<u8>> = chunks
664            .par_iter()
665            .map(|chunk| {
666                let mut buf = Vec::with_capacity(chunk.len());
667                complement_single_field_chunk(
668                    chunk, delim, skip_idx, line_delim, suppress, &mut buf,
669                );
670                buf
671            })
672            .collect();
673        // Use write_vectored (writev) to batch N writes into fewer syscalls
674        let slices: Vec<IoSlice> = results
675            .iter()
676            .filter(|r| !r.is_empty())
677            .map(|r| IoSlice::new(r))
678            .collect();
679        write_ioslices(out, &slices)?;
680    } else {
681        let mut buf = Vec::with_capacity(data.len());
682        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
683        if !buf.is_empty() {
684            out.write_all(&buf)?;
685        }
686    }
687    Ok(())
688}
689
690/// Process a chunk for complement single-field extraction.
691fn complement_single_field_chunk(
692    data: &[u8],
693    delim: u8,
694    skip_idx: usize,
695    line_delim: u8,
696    suppress: bool,
697    buf: &mut Vec<u8>,
698) {
699    let mut start = 0;
700    for end_pos in memchr_iter(line_delim, data) {
701        let line = &data[start..end_pos];
702        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
703        start = end_pos + 1;
704    }
705    if start < data.len() {
706        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
707    }
708}
709
710/// Extract all fields except skip_idx from one line.
711#[inline(always)]
712fn complement_single_field_line(
713    line: &[u8],
714    delim: u8,
715    skip_idx: usize,
716    line_delim: u8,
717    suppress: bool,
718    buf: &mut Vec<u8>,
719) {
720    if line.is_empty() {
721        if !suppress {
722            buf.push(line_delim);
723        }
724        return;
725    }
726
727    buf.reserve(line.len() + 1);
728
729    let mut field_idx = 0;
730    let mut field_start = 0;
731    let mut first_output = true;
732    let mut has_delim = false;
733
734    for pos in memchr_iter(delim, line) {
735        has_delim = true;
736        if field_idx != skip_idx {
737            if !first_output {
738                unsafe { buf_push(buf, delim) };
739            }
740            unsafe { buf_extend(buf, &line[field_start..pos]) };
741            first_output = false;
742        }
743        field_idx += 1;
744        field_start = pos + 1;
745    }
746
747    if !has_delim {
748        if !suppress {
749            unsafe {
750                buf_extend(buf, line);
751                buf_push(buf, line_delim);
752            }
753        }
754        return;
755    }
756
757    // Last field
758    if field_idx != skip_idx {
759        if !first_output {
760            unsafe { buf_push(buf, delim) };
761        }
762        unsafe { buf_extend(buf, &line[field_start..]) };
763    }
764
765    unsafe { buf_push(buf, line_delim) };
766}
767
768/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
769/// Zero-copy for the non-parallel path: identifies the truncation point per line
770/// and writes contiguous runs directly from the source data.
771fn process_fields_prefix(
772    data: &[u8],
773    delim: u8,
774    line_delim: u8,
775    last_field: usize,
776    suppress: bool,
777    out: &mut impl Write,
778) -> io::Result<()> {
779    if data.len() >= PARALLEL_THRESHOLD {
780        let chunks = split_into_chunks(data, line_delim);
781        let results: Vec<Vec<u8>> = chunks
782            .par_iter()
783            .map(|chunk| {
784                let mut buf = Vec::with_capacity(chunk.len());
785                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
786                buf
787            })
788            .collect();
789        // Use write_vectored (writev) to batch N writes into fewer syscalls
790        let slices: Vec<IoSlice> = results
791            .iter()
792            .filter(|r| !r.is_empty())
793            .map(|r| IoSlice::new(r))
794            .collect();
795        write_ioslices(out, &slices)?;
796    } else if !suppress {
797        // Zero-copy fast path: scan for truncation points, write runs from source.
798        // When suppress is false, every line is output (with or without delimiter).
799        // Most lines have enough fields, so the output is often identical to input.
800        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
801    } else {
802        let mut buf = Vec::with_capacity(data.len());
803        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
804        if !buf.is_empty() {
805            out.write_all(&buf)?;
806        }
807    }
808    Ok(())
809}
810
811/// Zero-copy field-prefix extraction: writes contiguous runs directly from source data.
812/// For lines where the Nth delimiter exists, we truncate at that point.
813/// For lines with fewer fields, we output them unchanged.
814/// Lines without any delimiter are output unchanged (suppress=false assumed).
815#[inline]
816fn fields_prefix_zerocopy(
817    data: &[u8],
818    delim: u8,
819    line_delim: u8,
820    last_field: usize,
821    out: &mut impl Write,
822) -> io::Result<()> {
823    let mut start = 0;
824    let mut run_start: usize = 0;
825
826    for end_pos in memchr_iter(line_delim, data) {
827        let line = &data[start..end_pos];
828        // Find the position of the Nth delimiter to truncate at
829        let mut field_count = 1;
830        let mut truncate_at: Option<usize> = None;
831        for dpos in memchr_iter(delim, line) {
832            if field_count >= last_field {
833                truncate_at = Some(start + dpos);
834                break;
835            }
836            field_count += 1;
837        }
838
839        if let Some(trunc_pos) = truncate_at {
840            // This line has more fields than needed. Flush run, write truncated.
841            if run_start < start {
842                out.write_all(&data[run_start..start])?;
843            }
844            out.write_all(&data[start..trunc_pos])?;
845            out.write_all(&[line_delim])?;
846            run_start = end_pos + 1;
847        }
848        // else: line has <= last_field fields, keep it in the run
849        start = end_pos + 1;
850    }
851    // Handle last line without terminator
852    if start < data.len() {
853        let line = &data[start..];
854        let mut field_count = 1;
855        let mut truncate_at: Option<usize> = None;
856        for dpos in memchr_iter(delim, line) {
857            if field_count >= last_field {
858                truncate_at = Some(start + dpos);
859                break;
860            }
861            field_count += 1;
862        }
863        if let Some(trunc_pos) = truncate_at {
864            if run_start < start {
865                out.write_all(&data[run_start..start])?;
866            }
867            out.write_all(&data[start..trunc_pos])?;
868            out.write_all(&[line_delim])?;
869            return Ok(());
870        }
871    }
872    // Flush remaining run
873    if run_start < data.len() {
874        out.write_all(&data[run_start..])?;
875        if !data.is_empty() && *data.last().unwrap() != line_delim {
876            out.write_all(&[line_delim])?;
877        }
878    }
879    Ok(())
880}
881
882/// Process a chunk for contiguous from-start field range extraction.
883fn fields_prefix_chunk(
884    data: &[u8],
885    delim: u8,
886    line_delim: u8,
887    last_field: usize,
888    suppress: bool,
889    buf: &mut Vec<u8>,
890) {
891    let mut start = 0;
892    for end_pos in memchr_iter(line_delim, data) {
893        let line = &data[start..end_pos];
894        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
895        start = end_pos + 1;
896    }
897    if start < data.len() {
898        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
899    }
900}
901
902/// Extract first N fields from one line (contiguous from-start range).
903#[inline(always)]
904fn fields_prefix_line(
905    line: &[u8],
906    delim: u8,
907    line_delim: u8,
908    last_field: usize,
909    suppress: bool,
910    buf: &mut Vec<u8>,
911) {
912    if line.is_empty() {
913        if !suppress {
914            buf.push(line_delim);
915        }
916        return;
917    }
918
919    buf.reserve(line.len() + 1);
920
921    let mut field_count = 1;
922    let mut has_delim = false;
923
924    for pos in memchr_iter(delim, line) {
925        has_delim = true;
926        if field_count >= last_field {
927            unsafe {
928                buf_extend(buf, &line[..pos]);
929                buf_push(buf, line_delim);
930            }
931            return;
932        }
933        field_count += 1;
934    }
935
936    if !has_delim {
937        if !suppress {
938            unsafe {
939                buf_extend(buf, line);
940                buf_push(buf, line_delim);
941            }
942        }
943        return;
944    }
945
946    unsafe {
947        buf_extend(buf, line);
948        buf_push(buf, line_delim);
949    }
950}
951
952/// Open-ended field suffix extraction (e.g., `cut -f3-`).
953fn process_fields_suffix(
954    data: &[u8],
955    delim: u8,
956    line_delim: u8,
957    start_field: usize,
958    suppress: bool,
959    out: &mut impl Write,
960) -> io::Result<()> {
961    if data.len() >= PARALLEL_THRESHOLD {
962        let chunks = split_into_chunks(data, line_delim);
963        let results: Vec<Vec<u8>> = chunks
964            .par_iter()
965            .map(|chunk| {
966                let mut buf = Vec::with_capacity(chunk.len());
967                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
968                buf
969            })
970            .collect();
971        // Use write_vectored (writev) to batch N writes into fewer syscalls
972        let slices: Vec<IoSlice> = results
973            .iter()
974            .filter(|r| !r.is_empty())
975            .map(|r| IoSlice::new(r))
976            .collect();
977        write_ioslices(out, &slices)?;
978    } else {
979        let mut buf = Vec::with_capacity(data.len());
980        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
981        if !buf.is_empty() {
982            out.write_all(&buf)?;
983        }
984    }
985    Ok(())
986}
987
988/// Process a chunk for open-ended field suffix extraction.
989fn fields_suffix_chunk(
990    data: &[u8],
991    delim: u8,
992    line_delim: u8,
993    start_field: usize,
994    suppress: bool,
995    buf: &mut Vec<u8>,
996) {
997    let mut start = 0;
998    for end_pos in memchr_iter(line_delim, data) {
999        let line = &data[start..end_pos];
1000        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1001        start = end_pos + 1;
1002    }
1003    if start < data.len() {
1004        fields_suffix_line(
1005            &data[start..],
1006            delim,
1007            line_delim,
1008            start_field,
1009            suppress,
1010            buf,
1011        );
1012    }
1013}
1014
1015/// Extract fields from start_field to end from one line.
1016#[inline(always)]
1017fn fields_suffix_line(
1018    line: &[u8],
1019    delim: u8,
1020    line_delim: u8,
1021    start_field: usize,
1022    suppress: bool,
1023    buf: &mut Vec<u8>,
1024) {
1025    if line.is_empty() {
1026        if !suppress {
1027            buf.push(line_delim);
1028        }
1029        return;
1030    }
1031
1032    buf.reserve(line.len() + 1);
1033
1034    let skip_delims = start_field - 1;
1035    let mut delim_count = 0;
1036    let mut has_delim = false;
1037
1038    for pos in memchr_iter(delim, line) {
1039        has_delim = true;
1040        delim_count += 1;
1041        if delim_count >= skip_delims {
1042            unsafe {
1043                buf_extend(buf, &line[pos + 1..]);
1044                buf_push(buf, line_delim);
1045            }
1046            return;
1047        }
1048    }
1049
1050    if !has_delim {
1051        if !suppress {
1052            unsafe {
1053                buf_extend(buf, line);
1054                buf_push(buf, line_delim);
1055            }
1056        }
1057        return;
1058    }
1059
1060    // Fewer delimiters than needed
1061    unsafe { buf_push(buf, line_delim) };
1062}
1063
1064/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1065/// Optimized: skip to start_field using memchr, then output until end_field.
1066fn process_fields_mid_range(
1067    data: &[u8],
1068    delim: u8,
1069    line_delim: u8,
1070    start_field: usize,
1071    end_field: usize,
1072    suppress: bool,
1073    out: &mut impl Write,
1074) -> io::Result<()> {
1075    if data.len() >= PARALLEL_THRESHOLD {
1076        let chunks = split_into_chunks(data, line_delim);
1077        let results: Vec<Vec<u8>> = chunks
1078            .par_iter()
1079            .map(|chunk| {
1080                let mut buf = Vec::with_capacity(chunk.len());
1081                fields_mid_range_chunk(
1082                    chunk,
1083                    delim,
1084                    line_delim,
1085                    start_field,
1086                    end_field,
1087                    suppress,
1088                    &mut buf,
1089                );
1090                buf
1091            })
1092            .collect();
1093        let slices: Vec<IoSlice> = results
1094            .iter()
1095            .filter(|r| !r.is_empty())
1096            .map(|r| IoSlice::new(r))
1097            .collect();
1098        write_ioslices(out, &slices)?;
1099    } else {
1100        let mut buf = Vec::with_capacity(data.len());
1101        fields_mid_range_chunk(
1102            data,
1103            delim,
1104            line_delim,
1105            start_field,
1106            end_field,
1107            suppress,
1108            &mut buf,
1109        );
1110        if !buf.is_empty() {
1111            out.write_all(&buf)?;
1112        }
1113    }
1114    Ok(())
1115}
1116
1117/// Process a chunk for contiguous mid-range field extraction.
1118fn fields_mid_range_chunk(
1119    data: &[u8],
1120    delim: u8,
1121    line_delim: u8,
1122    start_field: usize,
1123    end_field: usize,
1124    suppress: bool,
1125    buf: &mut Vec<u8>,
1126) {
1127    let mut start = 0;
1128    for end_pos in memchr_iter(line_delim, data) {
1129        let line = &data[start..end_pos];
1130        fields_mid_range_line(
1131            line,
1132            delim,
1133            line_delim,
1134            start_field,
1135            end_field,
1136            suppress,
1137            buf,
1138        );
1139        start = end_pos + 1;
1140    }
1141    if start < data.len() {
1142        fields_mid_range_line(
1143            &data[start..],
1144            delim,
1145            line_delim,
1146            start_field,
1147            end_field,
1148            suppress,
1149            buf,
1150        );
1151    }
1152}
1153
1154/// Extract fields start_field..=end_field from one line.
1155/// Uses memchr_iter to skip to start_field, then counts delimiters to end_field.
1156#[inline(always)]
1157fn fields_mid_range_line(
1158    line: &[u8],
1159    delim: u8,
1160    line_delim: u8,
1161    start_field: usize,
1162    end_field: usize,
1163    suppress: bool,
1164    buf: &mut Vec<u8>,
1165) {
1166    if line.is_empty() {
1167        if !suppress {
1168            buf.push(line_delim);
1169        }
1170        return;
1171    }
1172
1173    buf.reserve(line.len() + 1);
1174
1175    // Count delimiters to find start_field and end_field boundaries
1176    let skip_before = start_field - 1; // delimiters to skip before start_field
1177    let field_span = end_field - start_field; // additional delimiters within the range
1178    let mut delim_count = 0;
1179    let mut range_start = 0;
1180    let mut has_delim = false;
1181
1182    for pos in memchr_iter(delim, line) {
1183        has_delim = true;
1184        delim_count += 1;
1185        if delim_count == skip_before {
1186            range_start = pos + 1;
1187        }
1188        if delim_count == skip_before + field_span + 1 {
1189            // Found the delimiter after end_field — output the range
1190            if skip_before == 0 {
1191                range_start = 0;
1192            }
1193            unsafe {
1194                buf_extend(buf, &line[range_start..pos]);
1195                buf_push(buf, line_delim);
1196            }
1197            return;
1198        }
1199    }
1200
1201    if !has_delim {
1202        if !suppress {
1203            unsafe {
1204                buf_extend(buf, line);
1205                buf_push(buf, line_delim);
1206            }
1207        }
1208        return;
1209    }
1210
1211    // Line has delimiters but fewer fields than end_field
1212    if delim_count >= skip_before {
1213        // We have at least start_field, output from range_start to end
1214        if skip_before == 0 {
1215            range_start = 0;
1216        }
1217        unsafe {
1218            buf_extend(buf, &line[range_start..]);
1219            buf_push(buf, line_delim);
1220        }
1221    } else {
1222        // Not enough fields even for start_field — output empty line
1223        unsafe { buf_push(buf, line_delim) };
1224    }
1225}
1226
1227/// Combined SIMD scan for arbitrary single field extraction.
1228/// Uses memchr2_iter(delim, line_delim) to scan for both bytes in a single SIMD pass.
1229/// This is faster than the nested approach (outer: find newlines, inner: find delimiters)
1230/// because it eliminates one full SIMD scan and improves cache locality.
1231fn process_nth_field_combined(
1232    data: &[u8],
1233    delim: u8,
1234    line_delim: u8,
1235    target_idx: usize,
1236    suppress: bool,
1237    buf: &mut Vec<u8>,
1238) {
1239    buf.reserve(data.len());
1240
1241    let mut line_start: usize = 0;
1242    let mut field_start: usize = 0;
1243    let mut field_idx: usize = 0;
1244    let mut has_delim = false;
1245    let mut emitted = false;
1246
1247    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1248        let byte = unsafe { *data.get_unchecked(pos) };
1249
1250        if byte == line_delim {
1251            // End of line
1252            if !emitted {
1253                if has_delim && field_idx == target_idx {
1254                    // Last field matches target
1255                    unsafe {
1256                        buf_extend(buf, &data[field_start..pos]);
1257                        buf_push(buf, line_delim);
1258                    }
1259                } else if has_delim {
1260                    // Target field doesn't exist (fewer fields)
1261                    unsafe {
1262                        buf_push(buf, line_delim);
1263                    }
1264                } else if !suppress {
1265                    // No delimiter in line — output unchanged
1266                    unsafe {
1267                        buf_extend(buf, &data[line_start..pos]);
1268                        buf_push(buf, line_delim);
1269                    }
1270                }
1271            }
1272            // Reset for next line
1273            line_start = pos + 1;
1274            field_start = pos + 1;
1275            field_idx = 0;
1276            has_delim = false;
1277            emitted = false;
1278        } else {
1279            // Delimiter found
1280            has_delim = true;
1281            if field_idx == target_idx {
1282                unsafe {
1283                    buf_extend(buf, &data[field_start..pos]);
1284                    buf_push(buf, line_delim);
1285                }
1286                emitted = true;
1287            }
1288            field_idx += 1;
1289            field_start = pos + 1;
1290        }
1291    }
1292
1293    // Handle last line without trailing newline
1294    if line_start < data.len() && !emitted {
1295        if has_delim && field_idx == target_idx {
1296            unsafe {
1297                buf_extend(buf, &data[field_start..data.len()]);
1298                buf_push(buf, line_delim);
1299            }
1300        } else if has_delim {
1301            unsafe {
1302                buf_push(buf, line_delim);
1303            }
1304        } else if !suppress {
1305            unsafe {
1306                buf_extend(buf, &data[line_start..data.len()]);
1307                buf_push(buf, line_delim);
1308            }
1309        }
1310    }
1311}
1312
1313/// Zero-copy field-1 extraction: writes contiguous runs directly from source data.
1314/// For each line: if delimiter exists, truncate at first delimiter; otherwise pass through.
1315/// Uses memchr2 to scan for both delimiter and line terminator in a single SIMD pass.
1316#[inline]
1317fn single_field1_zerocopy(
1318    data: &[u8],
1319    delim: u8,
1320    line_delim: u8,
1321    out: &mut impl Write,
1322) -> io::Result<()> {
1323    let mut line_start: usize = 0;
1324    let mut run_start: usize = 0;
1325    let mut first_delim: Option<usize> = None;
1326
1327    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1328        let byte = unsafe { *data.get_unchecked(pos) };
1329
1330        if byte == line_delim {
1331            // End of line
1332            if let Some(dp) = first_delim {
1333                // Line has delimiter — truncate at first delimiter.
1334                // Flush current run up to line_start, write truncated line.
1335                if run_start < line_start {
1336                    out.write_all(&data[run_start..line_start])?;
1337                }
1338                out.write_all(&data[line_start..dp])?;
1339                out.write_all(&[line_delim])?;
1340                run_start = pos + 1;
1341            }
1342            // else: no delimiter in line, output unchanged (stays in run)
1343            line_start = pos + 1;
1344            first_delim = None;
1345        } else {
1346            // Delimiter found
1347            if first_delim.is_none() {
1348                first_delim = Some(pos);
1349            }
1350        }
1351    }
1352
1353    // Handle last line (no trailing line_delim)
1354    if line_start < data.len() {
1355        if let Some(dp) = first_delim {
1356            if run_start < line_start {
1357                out.write_all(&data[run_start..line_start])?;
1358            }
1359            out.write_all(&data[line_start..dp])?;
1360            out.write_all(&[line_delim])?;
1361            return Ok(());
1362        }
1363    }
1364
1365    // Flush remaining run
1366    if run_start < data.len() {
1367        out.write_all(&data[run_start..])?;
1368        if !data.is_empty() && *data.last().unwrap() != line_delim {
1369            out.write_all(&[line_delim])?;
1370        }
1371    }
1372    Ok(())
1373}
1374
1375/// Optimized path for extracting small field indices (2-4) without suppress.
1376/// Uses per-line memchr calls to find the target field boundaries.
1377/// For field 2: finds the 1st delimiter (start of field 2), then the 2nd (end).
1378/// More efficient than memchr2_iter for small field indices since we stop early.
1379fn process_small_field_combined(
1380    data: &[u8],
1381    delim: u8,
1382    line_delim: u8,
1383    target_idx: usize,
1384    buf: &mut Vec<u8>,
1385) {
1386    buf.reserve(data.len());
1387    let mut start = 0;
1388    for end_pos in memchr_iter(line_delim, data) {
1389        let line = &data[start..end_pos];
1390        // Find the start of the target field (skip target_idx delimiters)
1391        let mut field_start = 0;
1392        let mut found_start = target_idx == 0;
1393        let mut delim_count = 0;
1394        if !found_start {
1395            let mut search_start = 0;
1396            while let Some(pos) = memchr::memchr(delim, &line[search_start..]) {
1397                delim_count += 1;
1398                if delim_count == target_idx {
1399                    field_start = search_start + pos + 1;
1400                    found_start = true;
1401                    break;
1402                }
1403                search_start = search_start + pos + 1;
1404            }
1405        }
1406        if !found_start {
1407            // Line has fewer fields than needed - output as-is (no suppress)
1408            unsafe {
1409                buf_extend(buf, line);
1410                buf_push(buf, line_delim);
1411            }
1412        } else if field_start >= line.len() {
1413            // Empty field at end
1414            unsafe { buf_push(buf, line_delim) };
1415        } else {
1416            // Find the end of the target field
1417            match memchr::memchr(delim, &line[field_start..]) {
1418                Some(pos) => unsafe {
1419                    buf_extend(buf, &line[field_start..field_start + pos]);
1420                    buf_push(buf, line_delim);
1421                },
1422                None => unsafe {
1423                    buf_extend(buf, &line[field_start..]);
1424                    buf_push(buf, line_delim);
1425                },
1426            }
1427        }
1428        start = end_pos + 1;
1429    }
1430    // Handle last line without terminator
1431    if start < data.len() {
1432        let line = &data[start..];
1433        let mut field_start = 0;
1434        let mut found_start = target_idx == 0;
1435        let mut delim_count = 0;
1436        if !found_start {
1437            let mut search_start = 0;
1438            while let Some(pos) = memchr::memchr(delim, &line[search_start..]) {
1439                delim_count += 1;
1440                if delim_count == target_idx {
1441                    field_start = search_start + pos + 1;
1442                    found_start = true;
1443                    break;
1444                }
1445                search_start = search_start + pos + 1;
1446            }
1447        }
1448        if !found_start {
1449            unsafe {
1450                buf_extend(buf, line);
1451                buf_push(buf, line_delim);
1452            }
1453        } else if field_start >= line.len() {
1454            unsafe { buf_push(buf, line_delim) };
1455        } else {
1456            match memchr::memchr(delim, &line[field_start..]) {
1457                Some(pos) => unsafe {
1458                    buf_extend(buf, &line[field_start..field_start + pos]);
1459                    buf_push(buf, line_delim);
1460                },
1461                None => unsafe {
1462                    buf_extend(buf, &line[field_start..]);
1463                    buf_push(buf, line_delim);
1464                },
1465            }
1466        }
1467    }
1468}
1469
1470/// Process a chunk of data for single-field extraction.
1471fn process_single_field_chunk(
1472    data: &[u8],
1473    delim: u8,
1474    target_idx: usize,
1475    line_delim: u8,
1476    suppress: bool,
1477    buf: &mut Vec<u8>,
1478) {
1479    let mut start = 0;
1480    for end_pos in memchr_iter(line_delim, data) {
1481        let line = &data[start..end_pos];
1482        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
1483        start = end_pos + 1;
1484    }
1485    if start < data.len() {
1486        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
1487    }
1488}
1489
1490/// Extract a single field from one line.
1491/// Uses unsafe buf helpers — caller must ensure buf has capacity reserved.
1492#[inline(always)]
1493fn extract_single_field_line(
1494    line: &[u8],
1495    delim: u8,
1496    target_idx: usize,
1497    line_delim: u8,
1498    suppress: bool,
1499    buf: &mut Vec<u8>,
1500) {
1501    if line.is_empty() {
1502        if !suppress {
1503            buf.push(line_delim);
1504        }
1505        return;
1506    }
1507
1508    // Ensure capacity for worst case (full line + newline)
1509    buf.reserve(line.len() + 1);
1510
1511    // Ultra-fast path for first field: single memchr
1512    if target_idx == 0 {
1513        match memchr::memchr(delim, line) {
1514            Some(pos) => unsafe {
1515                buf_extend(buf, &line[..pos]);
1516                buf_push(buf, line_delim);
1517            },
1518            None => {
1519                if !suppress {
1520                    unsafe {
1521                        buf_extend(buf, line);
1522                        buf_push(buf, line_delim);
1523                    }
1524                }
1525            }
1526        }
1527        return;
1528    }
1529
1530    let mut field_start = 0;
1531    let mut field_idx = 0;
1532    let mut has_delim = false;
1533
1534    for pos in memchr_iter(delim, line) {
1535        has_delim = true;
1536        if field_idx == target_idx {
1537            unsafe {
1538                buf_extend(buf, &line[field_start..pos]);
1539                buf_push(buf, line_delim);
1540            }
1541            return;
1542        }
1543        field_idx += 1;
1544        field_start = pos + 1;
1545    }
1546
1547    if !has_delim {
1548        if !suppress {
1549            unsafe {
1550                buf_extend(buf, line);
1551                buf_push(buf, line_delim);
1552            }
1553        }
1554        return;
1555    }
1556
1557    if field_idx == target_idx {
1558        unsafe {
1559            buf_extend(buf, &line[field_start..]);
1560            buf_push(buf, line_delim);
1561        }
1562    } else {
1563        unsafe { buf_push(buf, line_delim) };
1564    }
1565}
1566
1567/// Extract fields from a single line into the output buffer.
1568/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
1569#[inline(always)]
1570fn extract_fields_to_buf(
1571    line: &[u8],
1572    delim: u8,
1573    ranges: &[Range],
1574    output_delim: &[u8],
1575    suppress: bool,
1576    max_field: usize,
1577    field_mask: u64,
1578    line_delim: u8,
1579    buf: &mut Vec<u8>,
1580    complement: bool,
1581) {
1582    let len = line.len();
1583
1584    if len == 0 {
1585        if !suppress {
1586            buf.push(line_delim);
1587        }
1588        return;
1589    }
1590
1591    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
1592    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
1593    let needed = len + output_delim.len() * 16 + 1;
1594    if buf.capacity() - buf.len() < needed {
1595        buf.reserve(needed);
1596    }
1597
1598    let mut field_num: usize = 1;
1599    let mut field_start: usize = 0;
1600    let mut first_output = true;
1601    let mut has_delim = false;
1602
1603    for delim_pos in memchr_iter(delim, line) {
1604        has_delim = true;
1605
1606        if is_selected(field_num, field_mask, ranges, complement) {
1607            if !first_output {
1608                unsafe { buf_extend(buf, output_delim) };
1609            }
1610            unsafe { buf_extend(buf, &line[field_start..delim_pos]) };
1611            first_output = false;
1612        }
1613
1614        field_num += 1;
1615        field_start = delim_pos + 1;
1616
1617        if field_num > max_field {
1618            break;
1619        }
1620    }
1621
1622    // Last field
1623    if (field_num <= max_field || complement)
1624        && has_delim
1625        && is_selected(field_num, field_mask, ranges, complement)
1626    {
1627        if !first_output {
1628            unsafe { buf_extend(buf, output_delim) };
1629        }
1630        unsafe { buf_extend(buf, &line[field_start..len]) };
1631        first_output = false;
1632    }
1633
1634    if !first_output {
1635        unsafe { buf_push(buf, line_delim) };
1636    } else if !has_delim {
1637        if !suppress {
1638            unsafe {
1639                buf_extend(buf, line);
1640                buf_push(buf, line_delim);
1641            }
1642        }
1643    } else {
1644        unsafe { buf_push(buf, line_delim) };
1645    }
1646}
1647
1648// ── Fast path: byte/char extraction with batched output ──────────────────
1649
1650/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
1651/// Zero-copy: writes directly from the source data using output runs.
1652/// For lines shorter than max_bytes, the output is identical to the input,
1653/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
1654fn process_bytes_from_start(
1655    data: &[u8],
1656    max_bytes: usize,
1657    line_delim: u8,
1658    out: &mut impl Write,
1659) -> io::Result<()> {
1660    if data.len() >= PARALLEL_THRESHOLD {
1661        let chunks = split_into_chunks(data, line_delim);
1662        let results: Vec<Vec<u8>> = chunks
1663            .par_iter()
1664            .map(|chunk| {
1665                let mut buf = Vec::with_capacity(chunk.len());
1666                bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
1667                buf
1668            })
1669            .collect();
1670        // Use write_vectored (writev) to batch N writes into fewer syscalls
1671        let slices: Vec<IoSlice> = results
1672            .iter()
1673            .filter(|r| !r.is_empty())
1674            .map(|r| IoSlice::new(r))
1675            .collect();
1676        write_ioslices(out, &slices)?;
1677    } else {
1678        // Zero-copy path: track contiguous output runs and write directly from source.
1679        // For lines <= max_bytes, we include them as-is (no copy needed).
1680        // For lines > max_bytes, we flush the run, write the truncated line, start new run.
1681        bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
1682    }
1683    Ok(())
1684}
1685
1686/// Zero-copy byte-prefix extraction: writes contiguous runs directly from the source data.
1687/// Only copies when a line needs truncation (line > max_bytes).
1688#[inline]
1689fn bytes_from_start_zerocopy(
1690    data: &[u8],
1691    max_bytes: usize,
1692    line_delim: u8,
1693    out: &mut impl Write,
1694) -> io::Result<()> {
1695    let mut start = 0;
1696    let mut run_start: usize = 0;
1697
1698    for pos in memchr_iter(line_delim, data) {
1699        let line_len = pos - start;
1700        if line_len > max_bytes {
1701            // This line needs truncation. Flush current run, write truncated line.
1702            if run_start < start {
1703                out.write_all(&data[run_start..start])?;
1704            }
1705            out.write_all(&data[start..start + max_bytes])?;
1706            out.write_all(&[line_delim])?;
1707            run_start = pos + 1;
1708        }
1709        // else: line fits, keep it in the current contiguous run
1710        start = pos + 1;
1711    }
1712    // Handle last line without terminator
1713    if start < data.len() {
1714        let line_len = data.len() - start;
1715        if line_len > max_bytes {
1716            if run_start < start {
1717                out.write_all(&data[run_start..start])?;
1718            }
1719            out.write_all(&data[start..start + max_bytes])?;
1720            out.write_all(&[line_delim])?;
1721            return Ok(());
1722        }
1723    }
1724    // Flush remaining run (includes all short lines + the last line)
1725    if run_start < data.len() {
1726        out.write_all(&data[run_start..])?;
1727        // Add terminator if last byte isn't one
1728        if !data.is_empty() && *data.last().unwrap() != line_delim {
1729            out.write_all(&[line_delim])?;
1730        }
1731    }
1732    Ok(())
1733}
1734
1735/// Process a chunk for from-start byte range extraction (parallel path).
1736/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1737#[inline]
1738fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1739    // Reserve enough capacity: output <= input size
1740    buf.reserve(data.len());
1741
1742    let mut start = 0;
1743    for pos in memchr_iter(line_delim, data) {
1744        let line_len = pos - start;
1745        let take = line_len.min(max_bytes);
1746        unsafe {
1747            buf_extend(buf, &data[start..start + take]);
1748            buf_push(buf, line_delim);
1749        }
1750        start = pos + 1;
1751    }
1752    // Handle last line without terminator
1753    if start < data.len() {
1754        let line_len = data.len() - start;
1755        let take = line_len.min(max_bytes);
1756        unsafe {
1757            buf_extend(buf, &data[start..start + take]);
1758            buf_push(buf, line_delim);
1759        }
1760    }
1761}
1762
1763/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
1764fn process_bytes_from_offset(
1765    data: &[u8],
1766    skip_bytes: usize,
1767    line_delim: u8,
1768    out: &mut impl Write,
1769) -> io::Result<()> {
1770    if data.len() >= PARALLEL_THRESHOLD {
1771        let chunks = split_into_chunks(data, line_delim);
1772        let results: Vec<Vec<u8>> = chunks
1773            .par_iter()
1774            .map(|chunk| {
1775                let mut buf = Vec::with_capacity(chunk.len());
1776                bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
1777                buf
1778            })
1779            .collect();
1780        // Use write_vectored (writev) to batch N writes into fewer syscalls
1781        let slices: Vec<IoSlice> = results
1782            .iter()
1783            .filter(|r| !r.is_empty())
1784            .map(|r| IoSlice::new(r))
1785            .collect();
1786        write_ioslices(out, &slices)?;
1787    } else {
1788        // Zero-copy: write suffix of each line directly from source
1789        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
1790    }
1791    Ok(())
1792}
1793
1794/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
1795/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
1796/// reducing syscall overhead from 2 write_all calls per line to batched writev.
1797#[inline]
1798fn bytes_from_offset_zerocopy(
1799    data: &[u8],
1800    skip_bytes: usize,
1801    line_delim: u8,
1802    out: &mut impl Write,
1803) -> io::Result<()> {
1804    let delim_buf = [line_delim];
1805    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
1806
1807    let mut start = 0;
1808    for pos in memchr_iter(line_delim, data) {
1809        let line_len = pos - start;
1810        if line_len > skip_bytes {
1811            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
1812        }
1813        iov.push(IoSlice::new(&delim_buf));
1814        // Flush when approaching MAX_IOV to avoid oversized writev
1815        if iov.len() >= MAX_IOV - 1 {
1816            write_ioslices(out, &iov)?;
1817            iov.clear();
1818        }
1819        start = pos + 1;
1820    }
1821    if start < data.len() {
1822        let line_len = data.len() - start;
1823        if line_len > skip_bytes {
1824            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
1825        }
1826        iov.push(IoSlice::new(&delim_buf));
1827    }
1828    if !iov.is_empty() {
1829        write_ioslices(out, &iov)?;
1830    }
1831    Ok(())
1832}
1833
1834/// Process a chunk for from-offset byte range extraction.
1835/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1836#[inline]
1837fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1838    buf.reserve(data.len());
1839
1840    let mut start = 0;
1841    for pos in memchr_iter(line_delim, data) {
1842        let line_len = pos - start;
1843        if line_len > skip_bytes {
1844            unsafe {
1845                buf_extend(buf, &data[start + skip_bytes..pos]);
1846            }
1847        }
1848        unsafe {
1849            buf_push(buf, line_delim);
1850        }
1851        start = pos + 1;
1852    }
1853    if start < data.len() {
1854        let line_len = data.len() - start;
1855        if line_len > skip_bytes {
1856            unsafe {
1857                buf_extend(buf, &data[start + skip_bytes..data.len()]);
1858            }
1859        }
1860        unsafe {
1861            buf_push(buf, line_delim);
1862        }
1863    }
1864}
1865
1866/// Optimized byte/char extraction with batched output and parallel processing.
1867fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
1868    let line_delim = cfg.line_delim;
1869    let ranges = cfg.ranges;
1870    let complement = cfg.complement;
1871    let output_delim = cfg.output_delim;
1872
1873    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
1874    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
1875        let max_bytes = ranges[0].end;
1876        if max_bytes < usize::MAX {
1877            return process_bytes_from_start(data, max_bytes, line_delim, out);
1878        }
1879    }
1880
1881    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
1882    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
1883        let skip_bytes = ranges[0].start.saturating_sub(1);
1884        if skip_bytes > 0 {
1885            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
1886        }
1887    }
1888
1889    if data.len() >= PARALLEL_THRESHOLD {
1890        let chunks = split_into_chunks(data, line_delim);
1891        let results: Vec<Vec<u8>> = chunks
1892            .par_iter()
1893            .map(|chunk| {
1894                let mut buf = Vec::with_capacity(chunk.len());
1895                process_bytes_chunk(
1896                    chunk,
1897                    ranges,
1898                    complement,
1899                    output_delim,
1900                    line_delim,
1901                    &mut buf,
1902                );
1903                buf
1904            })
1905            .collect();
1906        // Use write_vectored (writev) to batch N writes into fewer syscalls
1907        let slices: Vec<IoSlice> = results
1908            .iter()
1909            .filter(|r| !r.is_empty())
1910            .map(|r| IoSlice::new(r))
1911            .collect();
1912        write_ioslices(out, &slices)?;
1913    } else {
1914        let mut buf = Vec::with_capacity(data.len());
1915        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
1916        if !buf.is_empty() {
1917            out.write_all(&buf)?;
1918        }
1919    }
1920    Ok(())
1921}
1922
1923/// Process a chunk of data for byte/char extraction.
1924fn process_bytes_chunk(
1925    data: &[u8],
1926    ranges: &[Range],
1927    complement: bool,
1928    output_delim: &[u8],
1929    line_delim: u8,
1930    buf: &mut Vec<u8>,
1931) {
1932    let mut start = 0;
1933    for end_pos in memchr_iter(line_delim, data) {
1934        let line = &data[start..end_pos];
1935        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
1936        buf.push(line_delim);
1937        start = end_pos + 1;
1938    }
1939    if start < data.len() {
1940        cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
1941        buf.push(line_delim);
1942    }
1943}
1944
1945/// Extract byte ranges from a line into the output buffer.
1946/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
1947#[inline(always)]
1948fn cut_bytes_to_buf(
1949    line: &[u8],
1950    ranges: &[Range],
1951    complement: bool,
1952    output_delim: &[u8],
1953    buf: &mut Vec<u8>,
1954) {
1955    let len = line.len();
1956    let mut first_range = true;
1957
1958    // Reserve worst case: full line + delimiters between ranges
1959    buf.reserve(len + output_delim.len() * ranges.len() + 1);
1960
1961    if complement {
1962        let mut pos: usize = 1;
1963        for r in ranges {
1964            let rs = r.start;
1965            let re = r.end.min(len);
1966            if pos < rs {
1967                if !first_range && !output_delim.is_empty() {
1968                    unsafe { buf_extend(buf, output_delim) };
1969                }
1970                unsafe { buf_extend(buf, &line[pos - 1..rs - 1]) };
1971                first_range = false;
1972            }
1973            pos = re + 1;
1974            if pos > len {
1975                break;
1976            }
1977        }
1978        if pos <= len {
1979            if !first_range && !output_delim.is_empty() {
1980                unsafe { buf_extend(buf, output_delim) };
1981            }
1982            unsafe { buf_extend(buf, &line[pos - 1..len]) };
1983        }
1984    } else if output_delim.is_empty() && ranges.len() == 1 {
1985        // Ultra-fast path: single range, no output delimiter
1986        let start = ranges[0].start.saturating_sub(1);
1987        let end = ranges[0].end.min(len);
1988        if start < len {
1989            unsafe { buf_extend(buf, &line[start..end]) };
1990        }
1991    } else {
1992        for r in ranges {
1993            let start = r.start.saturating_sub(1);
1994            let end = r.end.min(len);
1995            if start >= len {
1996                break;
1997            }
1998            if !first_range && !output_delim.is_empty() {
1999                unsafe { buf_extend(buf, output_delim) };
2000            }
2001            unsafe { buf_extend(buf, &line[start..end]) };
2002            first_range = false;
2003        }
2004    }
2005}
2006
2007// ── Public API ───────────────────────────────────────────────────────────
2008
2009/// Cut fields from a line using a delimiter. Writes to `out`.
2010#[inline]
2011pub fn cut_fields(
2012    line: &[u8],
2013    delim: u8,
2014    ranges: &[Range],
2015    complement: bool,
2016    output_delim: &[u8],
2017    suppress_no_delim: bool,
2018    out: &mut impl Write,
2019) -> io::Result<bool> {
2020    if memchr::memchr(delim, line).is_none() {
2021        if !suppress_no_delim {
2022            out.write_all(line)?;
2023            return Ok(true);
2024        }
2025        return Ok(false);
2026    }
2027
2028    let mut field_num: usize = 1;
2029    let mut field_start: usize = 0;
2030    let mut first_output = true;
2031
2032    for delim_pos in memchr_iter(delim, line) {
2033        let selected = in_ranges(ranges, field_num) != complement;
2034        if selected {
2035            if !first_output {
2036                out.write_all(output_delim)?;
2037            }
2038            out.write_all(&line[field_start..delim_pos])?;
2039            first_output = false;
2040        }
2041        field_start = delim_pos + 1;
2042        field_num += 1;
2043    }
2044
2045    let selected = in_ranges(ranges, field_num) != complement;
2046    if selected {
2047        if !first_output {
2048            out.write_all(output_delim)?;
2049        }
2050        out.write_all(&line[field_start..])?;
2051    }
2052
2053    Ok(true)
2054}
2055
2056/// Cut bytes/chars from a line. Writes selected bytes to `out`.
2057#[inline]
2058pub fn cut_bytes(
2059    line: &[u8],
2060    ranges: &[Range],
2061    complement: bool,
2062    output_delim: &[u8],
2063    out: &mut impl Write,
2064) -> io::Result<bool> {
2065    let mut first_range = true;
2066
2067    if complement {
2068        let len = line.len();
2069        let mut comp_ranges = Vec::new();
2070        let mut pos: usize = 1;
2071        for r in ranges {
2072            let rs = r.start;
2073            let re = r.end.min(len);
2074            if pos < rs {
2075                comp_ranges.push((pos, rs - 1));
2076            }
2077            pos = re + 1;
2078            if pos > len {
2079                break;
2080            }
2081        }
2082        if pos <= len {
2083            comp_ranges.push((pos, len));
2084        }
2085        for &(s, e) in &comp_ranges {
2086            if !first_range && !output_delim.is_empty() {
2087                out.write_all(output_delim)?;
2088            }
2089            out.write_all(&line[s - 1..e])?;
2090            first_range = false;
2091        }
2092    } else {
2093        for r in ranges {
2094            let start = r.start.saturating_sub(1);
2095            let end = r.end.min(line.len());
2096            if start >= line.len() {
2097                break;
2098            }
2099            if !first_range && !output_delim.is_empty() {
2100                out.write_all(output_delim)?;
2101            }
2102            out.write_all(&line[start..end])?;
2103            first_range = false;
2104        }
2105    }
2106    Ok(true)
2107}
2108
2109/// Process a full data buffer (from mmap or read) with cut operation.
2110pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2111    match cfg.mode {
2112        CutMode::Fields => process_fields_fast(data, cfg, out),
2113        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
2114    }
2115}
2116
2117/// Process input from a reader (for stdin).
2118/// Uses batch reading: reads large chunks (16MB), then processes them in batch
2119/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
2120/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
2121pub fn process_cut_reader<R: BufRead>(
2122    mut reader: R,
2123    cfg: &CutConfig,
2124    out: &mut impl Write,
2125) -> io::Result<()> {
2126    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
2127    let line_delim = cfg.line_delim;
2128
2129    // Read large chunks and process in batch.
2130    // We keep a buffer; after processing complete lines, we shift leftover to the front.
2131    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
2132
2133    loop {
2134        // Read up to CHUNK_SIZE bytes
2135        buf.reserve(CHUNK_SIZE);
2136        let read_start = buf.len();
2137        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
2138        let n = read_fully(&mut reader, &mut buf[read_start..])?;
2139        buf.truncate(read_start + n);
2140
2141        if buf.is_empty() {
2142            break;
2143        }
2144
2145        if n == 0 {
2146            // EOF with leftover data (last line without terminator)
2147            process_cut_data(&buf, cfg, out)?;
2148            break;
2149        }
2150
2151        // Find the last line delimiter in the buffer so we process complete lines
2152        let process_end = match memchr::memrchr(line_delim, &buf) {
2153            Some(pos) => pos + 1,
2154            None => {
2155                // No line delimiter found — keep accumulating
2156                continue;
2157            }
2158        };
2159
2160        // Process the complete lines using the fast batch path
2161        process_cut_data(&buf[..process_end], cfg, out)?;
2162
2163        // Shift leftover to the front for next iteration
2164        let leftover_len = buf.len() - process_end;
2165        if leftover_len > 0 {
2166            buf.copy_within(process_end.., 0);
2167        }
2168        buf.truncate(leftover_len);
2169    }
2170
2171    Ok(())
2172}
2173
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `read` until `buf` is full or the reader reports end of
/// input. Returns the number of bytes placed at the start of `buf`; a result
/// smaller than `buf.len()` means end of input was reached.
///
/// # Errors
/// Returns the first read error other than `ErrorKind::Interrupted`;
/// interrupted reads are retried, including the very first one.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
2193
/// Cut operation mode
///
/// Selects what the positions in a range list refer to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Positions are byte offsets within each line.
    Bytes,
    /// Positions are character offsets; `process_cut_data` handles this mode
    /// with the same routine as [`CutMode::Bytes`].
    Characters,
    /// Positions are delimiter-separated field numbers.
    Fields,
}