Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, IoSlice, Write};
4
/// Minimum input size, in bytes, for parallel processing (2 MiB).
///
/// Inputs shorter than this are processed on the calling thread; inputs at or
/// above it are split into line-aligned chunks and processed with rayon.
///
/// Rayon's thread pool initialization costs ~0.5ms on first use, but for data >= 2MB
/// with 4 cores, the parallel savings (~3ms) far exceed the overhead. The 10MB
/// benchmark regressed from ~7x to ~5.3x when this was set to 32MB because it
/// no longer got parallelized.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;
11
/// Max iovec entries per writev call (Linux default).
///
/// `write_ioslices` hands at most this many `IoSlice` buffers to a single
/// `write_vectored` call and batches anything beyond it.
const MAX_IOV: usize = 1024;
14
/// Configuration for cut operations.
///
/// All slices are borrowed, so a config is cheap to build and can be shared
/// by reference with the processing routines.
pub struct CutConfig<'a> {
    /// Selection mode. `CutMode` is declared outside this section of the file.
    pub mode: CutMode,
    /// Selected positions. The field routines assume these are sorted and
    /// non-overlapping, as returned by `parse_ranges`.
    pub ranges: &'a [Range],
    /// When true, output everything that is NOT covered by `ranges`.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between selected fields on output.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no `delim` are dropped instead of being
    /// copied through unchanged.
    pub suppress_no_delim: bool,
    /// Byte that terminates input lines; also written after every output line.
    pub line_delim: u8,
}
25
/// A range specification like 1, 3-5, -3, 4-
///
/// Both bounds are 1-based and inclusive.
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based; `parse_ranges` stores 1 for an open start ("-N") and rejects 0
    pub end: usize,   // 1-based, usize::MAX means "to end"
}
32
33/// Parse a LIST specification like "1,3-5,7-" into ranges.
34/// Each range is 1-based. Returns sorted, merged ranges.
35pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
36    let mut ranges = Vec::new();
37
38    for part in spec.split(',') {
39        let part = part.trim();
40        if part.is_empty() {
41            continue;
42        }
43
44        if let Some(idx) = part.find('-') {
45            let left = &part[..idx];
46            let right = &part[idx + 1..];
47
48            let start = if left.is_empty() {
49                1
50            } else {
51                left.parse::<usize>()
52                    .map_err(|_| format!("invalid range: '{}'", part))?
53            };
54
55            let end = if right.is_empty() {
56                usize::MAX
57            } else {
58                right
59                    .parse::<usize>()
60                    .map_err(|_| format!("invalid range: '{}'", part))?
61            };
62
63            if start == 0 {
64                return Err("fields and positions are numbered from 1".to_string());
65            }
66            if start > end {
67                return Err(format!("invalid decreasing range: '{}'", part));
68            }
69
70            ranges.push(Range { start, end });
71        } else {
72            let n = part
73                .parse::<usize>()
74                .map_err(|_| format!("invalid field: '{}'", part))?;
75            if n == 0 {
76                return Err("fields and positions are numbered from 1".to_string());
77            }
78            ranges.push(Range { start: n, end: n });
79        }
80    }
81
82    if ranges.is_empty() {
83        return Err("you must specify a list of bytes, characters, or fields".to_string());
84    }
85
86    // Sort and merge overlapping ranges
87    ranges.sort_by_key(|r| (r.start, r.end));
88    let mut merged = vec![ranges[0].clone()];
89    for r in &ranges[1..] {
90        let last = merged.last_mut().unwrap();
91        if r.start <= last.end.saturating_add(1) {
92            last.end = last.end.max(r.end);
93        } else {
94            merged.push(r.clone());
95        }
96    }
97
98    Ok(merged)
99}
100
101/// Check if a 1-based position is in any range.
102/// Ranges must be sorted. Uses early exit since ranges are sorted.
103#[inline(always)]
104fn in_ranges(ranges: &[Range], pos: usize) -> bool {
105    for r in ranges {
106        if pos < r.start {
107            return false;
108        }
109        if pos <= r.end {
110            return true;
111        }
112    }
113    false
114}
115
116/// Pre-compute a 64-bit mask for field selection.
117/// Bit i-1 is set if field i should be output.
118#[inline]
119fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
120    let mut mask: u64 = 0;
121    for i in 1..=64u32 {
122        let in_range = in_ranges(ranges, i as usize);
123        if in_range != complement {
124            mask |= 1u64 << (i - 1);
125        }
126    }
127    mask
128}
129
130/// Check if a field should be selected, using bitset for first 64 fields.
131#[inline(always)]
132fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
133    if field_num <= 64 {
134        (mask >> (field_num - 1)) & 1 == 1
135    } else {
136        in_ranges(ranges, field_num) != complement
137    }
138}
139
140// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
141
/// Append `data` to `buf` without checking or growing its capacity.
///
/// # Safety
///
/// The caller must guarantee `buf.capacity() - buf.len() >= data.len()`;
/// otherwise the copy writes past the end of the allocation.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    let old_len = buf.len();
    let count = data.len();
    unsafe {
        // Copy into the spare capacity directly behind the current contents,
        // then publish the new length.
        let dst = buf.as_mut_ptr().add(old_len);
        dst.copy_from_nonoverlapping(data.as_ptr(), count);
        buf.set_len(old_len + count);
    }
}
152
/// Append the byte `b` to `buf` without checking or growing its capacity.
///
/// # Safety
///
/// The caller must guarantee `buf.len() < buf.capacity()`; otherwise the
/// store writes past the end of the allocation.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    let at = buf.len();
    unsafe {
        buf.as_mut_ptr().add(at).write(b);
        buf.set_len(at + 1);
    }
}
163
164/// Write multiple IoSlice buffers using write_vectored (writev syscall).
165/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
166/// Cold path (partial write) is out-of-line to keep the hot loop tight.
167#[inline]
168fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
169    if slices.is_empty() {
170        return Ok(());
171    }
172    for batch in slices.chunks(MAX_IOV) {
173        let total: usize = batch.iter().map(|s| s.len()).sum();
174        let written = out.write_vectored(batch)?;
175        if written >= total {
176            continue;
177        }
178        if written == 0 {
179            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
180        }
181        write_ioslices_slow(out, batch, written)?;
182    }
183    Ok(())
184}
185
/// Finish a batch after a short `write_vectored` (cold path, never inlined).
///
/// `skip` is the number of bytes of `slices` that were already written.
/// Slices lying entirely inside that prefix are passed over, the slice that
/// straddles it is written from the first unwritten byte, and every later
/// non-empty slice is written in full with `write_all`.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for slice in slices {
        let bytes: &[u8] = slice;
        match bytes.get(skip..) {
            // Some of this slice is still unwritten.
            Some(rest) if !rest.is_empty() => {
                out.write_all(rest)?;
                skip = 0;
            }
            // Fully covered by the already-written prefix (or empty).
            _ => skip -= bytes.len(),
        }
    }
    Ok(())
}
205
206// ── Chunk splitting for parallel processing ──────────────────────────────
207
208/// Split data into chunks aligned to line boundaries for parallel processing.
209fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
210    let num_threads = rayon::current_num_threads().max(1);
211    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
212        return vec![data];
213    }
214
215    let chunk_size = data.len() / num_threads;
216    let mut chunks = Vec::with_capacity(num_threads);
217    let mut pos = 0;
218
219    for _ in 0..num_threads - 1 {
220        let target = pos + chunk_size;
221        if target >= data.len() {
222            break;
223        }
224        let boundary = memchr::memchr(line_delim, &data[target..])
225            .map(|p| target + p + 1)
226            .unwrap_or(data.len());
227        if boundary > pos {
228            chunks.push(&data[pos..boundary]);
229        }
230        pos = boundary;
231    }
232
233    if pos < data.len() {
234        chunks.push(&data[pos..]);
235    }
236
237    chunks
238}
239
240// ── Fast path: multi-field non-contiguous extraction ─────────────────────
241
242/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
243/// Pre-collects delimiter positions per line into a stack-allocated array,
244/// then directly indexes into them for each selected field.
245/// This is O(max_field) per line instead of O(num_fields * scan_length).
246fn process_fields_multi_select(
247    data: &[u8],
248    delim: u8,
249    line_delim: u8,
250    ranges: &[Range],
251    suppress: bool,
252    out: &mut impl Write,
253) -> io::Result<()> {
254    let max_field = ranges.last().map_or(0, |r| r.end);
255
256    if data.len() >= PARALLEL_THRESHOLD {
257        let chunks = split_into_chunks(data, line_delim);
258        let results: Vec<Vec<u8>> = chunks
259            .par_iter()
260            .map(|chunk| {
261                // Output is always <= input for field selection; use 3/4 as safe estimate
262                let mut buf = Vec::with_capacity(chunk.len() * 3 / 4);
263                multi_select_chunk(
264                    chunk, delim, line_delim, ranges, max_field, suppress, &mut buf,
265                );
266                buf
267            })
268            .collect();
269        let slices: Vec<IoSlice> = results
270            .iter()
271            .filter(|r| !r.is_empty())
272            .map(|r| IoSlice::new(r))
273            .collect();
274        write_ioslices(out, &slices)?;
275    } else {
276        let mut buf = Vec::with_capacity(data.len() * 3 / 4);
277        multi_select_chunk(
278            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
279        );
280        if !buf.is_empty() {
281            out.write_all(&buf)?;
282        }
283    }
284    Ok(())
285}
286
287/// Process a chunk for multi-field extraction using a single-pass memchr2 scan.
288/// Scans for both delimiter and line_delim in one SIMD pass over the entire chunk,
289/// eliminating per-line memchr_iter setup overhead (significant for short lines).
290/// Delimiter positions are collected in a stack array per line.
291/// When max_field is reached on a line, remaining delimiters are ignored.
292fn multi_select_chunk(
293    data: &[u8],
294    delim: u8,
295    line_delim: u8,
296    ranges: &[Range],
297    max_field: usize,
298    suppress: bool,
299    buf: &mut Vec<u8>,
300) {
301    // When delim == line_delim, fall back to two-level approach
302    if delim == line_delim {
303        buf.reserve(data.len());
304        let base = data.as_ptr();
305        let mut start = 0;
306        for end_pos in memchr_iter(line_delim, data) {
307            let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
308            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
309            start = end_pos + 1;
310        }
311        if start < data.len() {
312            let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
313            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
314        }
315        return;
316    }
317
318    buf.reserve(data.len());
319    let base = data.as_ptr();
320    let data_len = data.len();
321
322    // Per-line state
323    let mut line_start: usize = 0;
324    let mut delim_pos = [0usize; 64];
325    let mut num_delims: usize = 0;
326    let max_delims = max_field.min(64);
327    let mut at_max = false;
328
329    // Single-pass scan using memchr2 for both delimiter and newline
330    for pos in memchr::memchr2_iter(delim, line_delim, data) {
331        let byte = unsafe { *base.add(pos) };
332
333        if byte == line_delim {
334            // End of line: extract fields from collected positions
335            let line_len = pos - line_start;
336            if num_delims == 0 {
337                // No delimiter in line
338                if !suppress {
339                    unsafe {
340                        buf_extend(
341                            buf,
342                            std::slice::from_raw_parts(base.add(line_start), line_len),
343                        );
344                        buf_push(buf, line_delim);
345                    }
346                }
347            } else {
348                // Extract fields using collected delimiter positions
349                let total_fields = num_delims + 1;
350                let mut first_output = true;
351
352                for r in ranges {
353                    let range_start = r.start;
354                    let range_end = r.end.min(total_fields);
355                    if range_start > total_fields {
356                        break;
357                    }
358                    for field_num in range_start..=range_end {
359                        if field_num > total_fields {
360                            break;
361                        }
362
363                        let field_start = if field_num == 1 {
364                            line_start
365                        } else if field_num - 2 < num_delims {
366                            delim_pos[field_num - 2] + 1
367                        } else {
368                            continue;
369                        };
370                        let field_end = if field_num <= num_delims {
371                            delim_pos[field_num - 1]
372                        } else {
373                            pos
374                        };
375
376                        if !first_output {
377                            unsafe { buf_push(buf, delim) };
378                        }
379                        unsafe {
380                            buf_extend(
381                                buf,
382                                std::slice::from_raw_parts(
383                                    base.add(field_start),
384                                    field_end - field_start,
385                                ),
386                            );
387                        }
388                        first_output = false;
389                    }
390                }
391
392                unsafe { buf_push(buf, line_delim) };
393            }
394
395            // Reset for next line
396            line_start = pos + 1;
397            num_delims = 0;
398            at_max = false;
399        } else {
400            // Delimiter found: collect position (up to max_field)
401            if !at_max && num_delims < max_delims {
402                delim_pos[num_delims] = pos;
403                num_delims += 1;
404                if num_delims >= max_delims {
405                    at_max = true;
406                }
407            }
408        }
409    }
410
411    // Handle last line without trailing line_delim
412    if line_start < data_len {
413        if num_delims == 0 {
414            if !suppress {
415                unsafe {
416                    buf_extend(
417                        buf,
418                        std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
419                    );
420                    buf_push(buf, line_delim);
421                }
422            }
423        } else {
424            let total_fields = num_delims + 1;
425            let mut first_output = true;
426
427            for r in ranges {
428                let range_start = r.start;
429                let range_end = r.end.min(total_fields);
430                if range_start > total_fields {
431                    break;
432                }
433                for field_num in range_start..=range_end {
434                    if field_num > total_fields {
435                        break;
436                    }
437
438                    let field_start = if field_num == 1 {
439                        line_start
440                    } else if field_num - 2 < num_delims {
441                        delim_pos[field_num - 2] + 1
442                    } else {
443                        continue;
444                    };
445                    let field_end = if field_num <= num_delims {
446                        delim_pos[field_num - 1]
447                    } else {
448                        data_len
449                    };
450
451                    if !first_output {
452                        unsafe { buf_push(buf, delim) };
453                    }
454                    unsafe {
455                        buf_extend(
456                            buf,
457                            std::slice::from_raw_parts(
458                                base.add(field_start),
459                                field_end - field_start,
460                            ),
461                        );
462                    }
463                    first_output = false;
464                }
465            }
466
467            unsafe { buf_push(buf, line_delim) };
468        }
469    }
470}
471
472/// Extract selected fields from a single line using delimiter position scanning.
473/// Scans delimiters only up to max_field (early exit), then extracts selected fields
474/// by indexing directly into the collected positions. Since ranges are pre-sorted and
475/// non-overlapping, every field within a range is selected — no is_selected check needed.
476#[inline(always)]
477fn multi_select_line(
478    line: &[u8],
479    delim: u8,
480    line_delim: u8,
481    ranges: &[Range],
482    max_field: usize,
483    suppress: bool,
484    buf: &mut Vec<u8>,
485) {
486    let len = line.len();
487    if len == 0 {
488        if !suppress {
489            unsafe { buf_push(buf, line_delim) };
490        }
491        return;
492    }
493
494    // Note: no per-line buf.reserve — multi_select_chunk already reserves data.len()
495    let base = line.as_ptr();
496
497    // Collect delimiter positions up to max_field (early exit).
498    // Stack array for up to 64 delimiter positions.
499    let mut delim_pos = [0usize; 64];
500    let mut num_delims: usize = 0;
501    let max_delims = max_field.min(64);
502
503    for pos in memchr_iter(delim, line) {
504        if num_delims < max_delims {
505            delim_pos[num_delims] = pos;
506            num_delims += 1;
507            if num_delims >= max_delims {
508                break;
509            }
510        }
511    }
512
513    if num_delims == 0 {
514        if !suppress {
515            unsafe {
516                buf_extend(buf, line);
517                buf_push(buf, line_delim);
518            }
519        }
520        return;
521    }
522
523    // Extract selected fields using delimiter positions.
524    // Ranges are pre-sorted and non-overlapping, so every field_num within a range
525    // is selected — skip the is_selected check entirely (saves 1 function call per field).
526    let total_fields = num_delims + 1;
527    let mut first_output = true;
528
529    for r in ranges {
530        let range_start = r.start;
531        let range_end = r.end.min(total_fields);
532        if range_start > total_fields {
533            break;
534        }
535        for field_num in range_start..=range_end {
536            if field_num > total_fields {
537                break;
538            }
539
540            let field_start = if field_num == 1 {
541                0
542            } else if field_num - 2 < num_delims {
543                delim_pos[field_num - 2] + 1
544            } else {
545                continue;
546            };
547            let field_end = if field_num <= num_delims {
548                delim_pos[field_num - 1]
549            } else {
550                len
551            };
552
553            if !first_output {
554                unsafe { buf_push(buf, delim) };
555            }
556            unsafe {
557                buf_extend(
558                    buf,
559                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
560                );
561            }
562            first_output = false;
563        }
564    }
565
566    unsafe { buf_push(buf, line_delim) };
567}
568
569// ── Fast path: field extraction with batched output ──────────────────────
570
571/// Optimized field extraction with early exit and batched output.
572fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
573    let delim = cfg.delim;
574    let line_delim = cfg.line_delim;
575    let ranges = cfg.ranges;
576    let complement = cfg.complement;
577    let output_delim = cfg.output_delim;
578    let suppress = cfg.suppress_no_delim;
579
580    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
581    // That scan was O(N) over the entire file just to check an edge case
582    // (no delimiter in any line). The per-line processing already handles
583    // lines without delimiters correctly, so the scan was pure overhead
584    // for files that DO contain delimiters (the common case).
585
586    // Ultra-fast path: single field extraction (e.g., cut -f5)
587    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
588        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
589    }
590
591    // Fast path: complement of single field or contiguous range with default output delimiter.
592    if complement
593        && ranges.len() == 1
594        && output_delim.len() == 1
595        && output_delim[0] == delim
596        && ranges[0].start == ranges[0].end
597    {
598        return process_complement_single_field(
599            data,
600            delim,
601            line_delim,
602            ranges[0].start,
603            suppress,
604            out,
605        );
606    }
607
608    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
609    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
610    if complement
611        && ranges.len() == 1
612        && ranges[0].start > 1
613        && ranges[0].end < usize::MAX
614        && output_delim.len() == 1
615        && output_delim[0] == delim
616    {
617        return process_complement_range(
618            data,
619            delim,
620            line_delim,
621            ranges[0].start,
622            ranges[0].end,
623            suppress,
624            out,
625        );
626    }
627
628    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
629    if !complement
630        && ranges.len() == 1
631        && ranges[0].start == 1
632        && output_delim.len() == 1
633        && output_delim[0] == delim
634        && ranges[0].end < usize::MAX
635    {
636        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
637    }
638
639    // Fast path: open-ended field range from field N (e.g., cut -f3-)
640    if !complement
641        && ranges.len() == 1
642        && ranges[0].end == usize::MAX
643        && ranges[0].start > 1
644        && output_delim.len() == 1
645        && output_delim[0] == delim
646    {
647        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
648    }
649
650    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
651    if !complement
652        && ranges.len() == 1
653        && ranges[0].start > 1
654        && ranges[0].end < usize::MAX
655        && output_delim.len() == 1
656        && output_delim[0] == delim
657    {
658        return process_fields_mid_range(
659            data,
660            delim,
661            line_delim,
662            ranges[0].start,
663            ranges[0].end,
664            suppress,
665            out,
666        );
667    }
668
669    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
670    // Uses delimiter position caching: find all delimiter positions per line,
671    // then directly index into them for each selected field.
672    // This is faster than the general extract_fields_to_buf which re-checks
673    // is_selected() for every field encountered.
674    if !complement
675        && ranges.len() > 1
676        && ranges.last().map_or(false, |r| r.end < usize::MAX)
677        && output_delim.len() == 1
678        && output_delim[0] == delim
679        && delim != line_delim
680    {
681        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
682    }
683
684    // General field extraction
685    let max_field = if complement {
686        usize::MAX
687    } else {
688        ranges.last().map(|r| r.end).unwrap_or(0)
689    };
690    let field_mask = compute_field_mask(ranges, complement);
691
692    if data.len() >= PARALLEL_THRESHOLD {
693        let chunks = split_into_chunks(data, line_delim);
694        let results: Vec<Vec<u8>> = chunks
695            .par_iter()
696            .map(|chunk| {
697                let mut buf = Vec::with_capacity(chunk.len());
698                process_fields_chunk(
699                    chunk,
700                    delim,
701                    ranges,
702                    output_delim,
703                    suppress,
704                    max_field,
705                    field_mask,
706                    line_delim,
707                    complement,
708                    &mut buf,
709                );
710                buf
711            })
712            .collect();
713        // Use write_vectored (writev) to batch N writes into fewer syscalls
714        let slices: Vec<IoSlice> = results
715            .iter()
716            .filter(|r| !r.is_empty())
717            .map(|r| IoSlice::new(r))
718            .collect();
719        write_ioslices(out, &slices)?;
720    } else {
721        let mut buf = Vec::with_capacity(data.len());
722        process_fields_chunk(
723            data,
724            delim,
725            ranges,
726            output_delim,
727            suppress,
728            max_field,
729            field_mask,
730            line_delim,
731            complement,
732            &mut buf,
733        );
734        if !buf.is_empty() {
735            out.write_all(&buf)?;
736        }
737    }
738    Ok(())
739}
740
741/// Process a chunk of data for general field extraction.
742/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
743/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
744/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
745fn process_fields_chunk(
746    data: &[u8],
747    delim: u8,
748    ranges: &[Range],
749    output_delim: &[u8],
750    suppress: bool,
751    max_field: usize,
752    field_mask: u64,
753    line_delim: u8,
754    complement: bool,
755    buf: &mut Vec<u8>,
756) {
757    // When delim != line_delim and max_field is bounded, use two-level approach:
758    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
759    // This avoids scanning past max_field on each line (significant for lines with
760    // many columns but small field selection like -f1,3,5 on 20-column CSV).
761    // For complement or unbounded ranges, use single-pass memchr2_iter which
762    // needs to process all delimiters anyway.
763    if delim != line_delim && max_field < usize::MAX && !complement {
764        buf.reserve(data.len());
765        let mut start = 0;
766        for end_pos in memchr_iter(line_delim, data) {
767            let line = &data[start..end_pos];
768            extract_fields_to_buf(
769                line,
770                delim,
771                ranges,
772                output_delim,
773                suppress,
774                max_field,
775                field_mask,
776                line_delim,
777                buf,
778                complement,
779            );
780            start = end_pos + 1;
781        }
782        if start < data.len() {
783            extract_fields_to_buf(
784                &data[start..],
785                delim,
786                ranges,
787                output_delim,
788                suppress,
789                max_field,
790                field_mask,
791                line_delim,
792                buf,
793                complement,
794            );
795        }
796        return;
797    }
798
799    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
800    // delimiter and line_delim in one SIMD scan.
801    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
802    if delim != line_delim {
803        buf.reserve(data.len());
804
805        let data_len = data.len();
806        let base = data.as_ptr();
807        let mut line_start: usize = 0;
808        let mut field_start: usize = 0;
809        let mut field_num: usize = 1;
810        let mut first_output = true;
811        let mut has_delim = false;
812
813        for pos in memchr::memchr2_iter(delim, line_delim, data) {
814            let byte = unsafe { *base.add(pos) };
815
816            if byte == line_delim {
817                // End of line: flush final field and emit line delimiter
818                if (field_num <= max_field || complement)
819                    && has_delim
820                    && is_selected(field_num, field_mask, ranges, complement)
821                {
822                    if !first_output {
823                        unsafe { buf_extend(buf, output_delim) };
824                    }
825                    unsafe {
826                        buf_extend(
827                            buf,
828                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
829                        )
830                    };
831                    first_output = false;
832                }
833
834                if !first_output {
835                    unsafe { buf_push(buf, line_delim) };
836                } else if !has_delim {
837                    if !suppress {
838                        unsafe {
839                            buf_extend(
840                                buf,
841                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
842                            );
843                            buf_push(buf, line_delim);
844                        }
845                    }
846                } else {
847                    unsafe { buf_push(buf, line_delim) };
848                }
849
850                // Reset state for next line
851                line_start = pos + 1;
852                field_start = pos + 1;
853                field_num = 1;
854                first_output = true;
855                has_delim = false;
856            } else {
857                // Field delimiter hit
858                has_delim = true;
859
860                if is_selected(field_num, field_mask, ranges, complement) {
861                    if !first_output {
862                        unsafe { buf_extend(buf, output_delim) };
863                    }
864                    unsafe {
865                        buf_extend(
866                            buf,
867                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
868                        )
869                    };
870                    first_output = false;
871                }
872
873                field_num += 1;
874                field_start = pos + 1;
875            }
876        }
877
878        // Handle last line without trailing line_delim
879        if line_start < data_len {
880            if line_start < data_len {
881                if (field_num <= max_field || complement)
882                    && has_delim
883                    && is_selected(field_num, field_mask, ranges, complement)
884                {
885                    if !first_output {
886                        unsafe { buf_extend(buf, output_delim) };
887                    }
888                    unsafe {
889                        buf_extend(
890                            buf,
891                            std::slice::from_raw_parts(
892                                base.add(field_start),
893                                data_len - field_start,
894                            ),
895                        )
896                    };
897                    first_output = false;
898                }
899
900                if !first_output {
901                    unsafe { buf_push(buf, line_delim) };
902                } else if !has_delim {
903                    if !suppress {
904                        unsafe {
905                            buf_extend(
906                                buf,
907                                std::slice::from_raw_parts(
908                                    base.add(line_start),
909                                    data_len - line_start,
910                                ),
911                            );
912                            buf_push(buf, line_delim);
913                        }
914                    }
915                } else {
916                    unsafe { buf_push(buf, line_delim) };
917                }
918            }
919        }
920
921        return;
922    }
923
924    // Fallback: when delim == line_delim, use the two-level scan approach
925    let mut start = 0;
926    for end_pos in memchr_iter(line_delim, data) {
927        let line = &data[start..end_pos];
928        extract_fields_to_buf(
929            line,
930            delim,
931            ranges,
932            output_delim,
933            suppress,
934            max_field,
935            field_mask,
936            line_delim,
937            buf,
938            complement,
939        );
940        start = end_pos + 1;
941    }
942    if start < data.len() {
943        extract_fields_to_buf(
944            &data[start..],
945            delim,
946            ranges,
947            output_delim,
948            suppress,
949            max_field,
950            field_mask,
951            line_delim,
952            buf,
953            complement,
954        );
955    }
956}
957
958// ── Ultra-fast single field extraction ───────────────────────────────────
959
960/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
961/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
962/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
963fn process_single_field(
964    data: &[u8],
965    delim: u8,
966    line_delim: u8,
967    target: usize,
968    suppress: bool,
969    out: &mut impl Write,
970) -> io::Result<()> {
971    let target_idx = target - 1;
972
973    // For single-field extraction, parallelize at 2MB+ to match PARALLEL_THRESHOLD.
974    // The 10MB benchmark regressed from ~7x to ~5.3x when this was set to 32MB.
975    const FIELD_PARALLEL_MIN: usize = 2 * 1024 * 1024;
976
977    if delim != line_delim {
978        // Field 1 fast path: memchr2 single-pass scan.
979        // For field 1, the first delimiter IS the field boundary. Lines without
980        // delimiter are passed through unchanged.
981        if target_idx == 0 && !suppress {
982            if data.len() >= FIELD_PARALLEL_MIN {
983                return single_field1_parallel(data, delim, line_delim, out);
984            }
985            // Sequential: scan with memchr2 into buffer, single write_all.
986            // Faster than writev/IoSlice for moderate data because it produces
987            // one contiguous buffer → one write syscall, and avoids IoSlice
988            // allocation overhead for high-delimiter-density data.
989            let mut buf = Vec::with_capacity(data.len());
990            single_field1_to_buf(data, delim, line_delim, &mut buf);
991            if !buf.is_empty() {
992                out.write_all(&buf)?;
993            }
994            return Ok(());
995        }
996
997        // Two-level approach for field N: outer newline scan + inner delim scan
998        // with early exit at target_idx. Faster than memchr2 single-pass because
999        // we only scan delimiters up to target_idx per line (not all of them).
1000        if data.len() >= FIELD_PARALLEL_MIN {
1001            let chunks = split_into_chunks(data, line_delim);
1002            let results: Vec<Vec<u8>> = chunks
1003                .par_iter()
1004                .map(|chunk| {
1005                    let mut buf = Vec::with_capacity(chunk.len() / 2);
1006                    process_single_field_chunk(
1007                        chunk, delim, target_idx, line_delim, suppress, &mut buf,
1008                    );
1009                    buf
1010                })
1011                .collect();
1012            let slices: Vec<IoSlice> = results
1013                .iter()
1014                .filter(|r| !r.is_empty())
1015                .map(|r| IoSlice::new(r))
1016                .collect();
1017            write_ioslices(out, &slices)?;
1018        } else {
1019            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
1020            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1021            if !buf.is_empty() {
1022                out.write_all(&buf)?;
1023            }
1024        }
1025        return Ok(());
1026    }
1027
1028    // Fallback for delim == line_delim: nested loop approach
1029    if data.len() >= FIELD_PARALLEL_MIN {
1030        let chunks = split_into_chunks(data, line_delim);
1031        let results: Vec<Vec<u8>> = chunks
1032            .par_iter()
1033            .map(|chunk| {
1034                let mut buf = Vec::with_capacity(chunk.len() / 4);
1035                process_single_field_chunk(
1036                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
1037                );
1038                buf
1039            })
1040            .collect();
1041        let slices: Vec<IoSlice> = results
1042            .iter()
1043            .filter(|r| !r.is_empty())
1044            .map(|r| IoSlice::new(r))
1045            .collect();
1046        write_ioslices(out, &slices)?;
1047    } else {
1048        let mut buf = Vec::with_capacity(data.len() / 4);
1049        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1050        if !buf.is_empty() {
1051            out.write_all(&buf)?;
1052        }
1053    }
1054    Ok(())
1055}
1056
1057/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1058/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1059fn process_complement_range(
1060    data: &[u8],
1061    delim: u8,
1062    line_delim: u8,
1063    skip_start: usize,
1064    skip_end: usize,
1065    suppress: bool,
1066    out: &mut impl Write,
1067) -> io::Result<()> {
1068    if data.len() >= PARALLEL_THRESHOLD {
1069        let chunks = split_into_chunks(data, line_delim);
1070        let results: Vec<Vec<u8>> = chunks
1071            .par_iter()
1072            .map(|chunk| {
1073                let mut buf = Vec::with_capacity(chunk.len());
1074                complement_range_chunk(
1075                    chunk, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1076                );
1077                buf
1078            })
1079            .collect();
1080        let slices: Vec<IoSlice> = results
1081            .iter()
1082            .filter(|r| !r.is_empty())
1083            .map(|r| IoSlice::new(r))
1084            .collect();
1085        write_ioslices(out, &slices)?;
1086    } else {
1087        let mut buf = Vec::with_capacity(data.len());
1088        complement_range_chunk(
1089            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1090        );
1091        if !buf.is_empty() {
1092            out.write_all(&buf)?;
1093        }
1094    }
1095    Ok(())
1096}
1097
1098/// Process a chunk for complement range extraction.
1099fn complement_range_chunk(
1100    data: &[u8],
1101    delim: u8,
1102    skip_start: usize,
1103    skip_end: usize,
1104    line_delim: u8,
1105    suppress: bool,
1106    buf: &mut Vec<u8>,
1107) {
1108    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
1109    buf.reserve(data.len());
1110    let mut start = 0;
1111    for end_pos in memchr_iter(line_delim, data) {
1112        let line = &data[start..end_pos];
1113        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1114        start = end_pos + 1;
1115    }
1116    if start < data.len() {
1117        complement_range_line(
1118            &data[start..],
1119            delim,
1120            skip_start,
1121            skip_end,
1122            line_delim,
1123            suppress,
1124            buf,
1125        );
1126    }
1127}
1128
1129/// Extract all fields except skip_start..=skip_end from one line.
1130/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1131///
1132/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1133/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1134/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1135#[inline(always)]
1136fn complement_range_line(
1137    line: &[u8],
1138    delim: u8,
1139    skip_start: usize,
1140    skip_end: usize,
1141    line_delim: u8,
1142    suppress: bool,
1143    buf: &mut Vec<u8>,
1144) {
1145    let len = line.len();
1146    if len == 0 {
1147        if !suppress {
1148            unsafe { buf_push(buf, line_delim) };
1149        }
1150        return;
1151    }
1152
1153    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
1154    let base = line.as_ptr();
1155
1156    // 1-based field numbers. To skip fields skip_start..=skip_end:
1157    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1158    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1159    //
1160    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1161    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1162
1163    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1164    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1165    let total_need = need_prefix_delims + need_skip_delims;
1166
1167    // Find delimiter positions up to total_need
1168    let mut delim_count: usize = 0;
1169    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1170    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1171
1172    for pos in memchr_iter(delim, line) {
1173        delim_count += 1;
1174        if delim_count == need_prefix_delims {
1175            prefix_end_pos = pos;
1176        }
1177        if delim_count == total_need {
1178            suffix_start_pos = pos + 1;
1179            break;
1180        }
1181    }
1182
1183    if delim_count == 0 {
1184        // No delimiter at all
1185        if !suppress {
1186            unsafe {
1187                buf_extend(buf, line);
1188                buf_push(buf, line_delim);
1189            }
1190        }
1191        return;
1192    }
1193
1194    // Case analysis:
1195    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1196    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1197    // 3. Enough to reach skip_end: prefix + delim + suffix
1198
1199    if delim_count < need_prefix_delims {
1200        // Not enough fields to reach skip region — output entire line
1201        unsafe {
1202            buf_extend(buf, line);
1203            buf_push(buf, line_delim);
1204        }
1205        return;
1206    }
1207
1208    let has_prefix = need_prefix_delims > 0;
1209    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1210
1211    if has_prefix && has_suffix {
1212        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1213        unsafe {
1214            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1215            buf_push(buf, delim);
1216            buf_extend(
1217                buf,
1218                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1219            );
1220            buf_push(buf, line_delim);
1221        }
1222    } else if has_prefix {
1223        // Only prefix, no suffix (skip region extends to end of line)
1224        unsafe {
1225            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1226            buf_push(buf, line_delim);
1227        }
1228    } else if has_suffix {
1229        // No prefix (skip_start == 1), only suffix
1230        unsafe {
1231            buf_extend(
1232                buf,
1233                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1234            );
1235            buf_push(buf, line_delim);
1236        }
1237    } else {
1238        // All fields skipped
1239        unsafe { buf_push(buf, line_delim) };
1240    }
1241}
1242
1243/// Complement single-field extraction: skip one field, output rest unchanged.
1244fn process_complement_single_field(
1245    data: &[u8],
1246    delim: u8,
1247    line_delim: u8,
1248    skip_field: usize,
1249    suppress: bool,
1250    out: &mut impl Write,
1251) -> io::Result<()> {
1252    let skip_idx = skip_field - 1;
1253
1254    if data.len() >= PARALLEL_THRESHOLD {
1255        let chunks = split_into_chunks(data, line_delim);
1256        let results: Vec<Vec<u8>> = chunks
1257            .par_iter()
1258            .map(|chunk| {
1259                let mut buf = Vec::with_capacity(chunk.len());
1260                complement_single_field_chunk(
1261                    chunk, delim, skip_idx, line_delim, suppress, &mut buf,
1262                );
1263                buf
1264            })
1265            .collect();
1266        // Use write_vectored (writev) to batch N writes into fewer syscalls
1267        let slices: Vec<IoSlice> = results
1268            .iter()
1269            .filter(|r| !r.is_empty())
1270            .map(|r| IoSlice::new(r))
1271            .collect();
1272        write_ioslices(out, &slices)?;
1273    } else {
1274        let mut buf = Vec::with_capacity(data.len());
1275        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1276        if !buf.is_empty() {
1277            out.write_all(&buf)?;
1278        }
1279    }
1280    Ok(())
1281}
1282
1283/// Process a chunk for complement single-field extraction using memchr2 single-pass.
1284/// Scans for both delimiter and line_delim in one SIMD pass, tracking delimiter count
1285/// per line. When the skip field's bounding delimiters are found, copies prefix + suffix.
1286/// This eliminates the per-line memchr_iter setup overhead and reduces from two SIMD
1287/// passes (outer newline scan + inner delimiter scan) to one.
1288fn complement_single_field_chunk(
1289    data: &[u8],
1290    delim: u8,
1291    skip_idx: usize,
1292    line_delim: u8,
1293    suppress: bool,
1294    buf: &mut Vec<u8>,
1295) {
1296    // When delim == line_delim, fall back to per-line approach
1297    if delim == line_delim {
1298        buf.reserve(data.len());
1299        let mut start = 0;
1300        for end_pos in memchr_iter(line_delim, data) {
1301            let line = &data[start..end_pos];
1302            complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1303            start = end_pos + 1;
1304        }
1305        if start < data.len() {
1306            complement_single_field_line(
1307                &data[start..],
1308                delim,
1309                skip_idx,
1310                line_delim,
1311                suppress,
1312                buf,
1313            );
1314        }
1315        return;
1316    }
1317
1318    buf.reserve(data.len());
1319    let base = data.as_ptr();
1320    let data_len = data.len();
1321    let need_before = skip_idx; // delimiters before skip field
1322    let need_total = skip_idx + 1; // delimiters to find end of skip field
1323
1324    // Per-line state
1325    let mut line_start: usize = 0;
1326    let mut delim_count: usize = 0;
1327    let mut skip_start_pos: usize = 0;
1328    let mut skip_end_pos: usize = 0;
1329    let mut found_start = need_before == 0; // skip_idx==0 means skip starts at line start
1330    let mut found_end = false;
1331
1332    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1333        let byte = unsafe { *base.add(pos) };
1334
1335        if byte == line_delim {
1336            // End of line: emit based on what we found
1337            if delim_count == 0 {
1338                // No delimiter in line
1339                if !suppress {
1340                    unsafe {
1341                        buf_extend(
1342                            buf,
1343                            std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1344                        );
1345                        buf_push(buf, line_delim);
1346                    }
1347                }
1348            } else if !found_start || delim_count < need_before {
1349                // Not enough delimiters to reach skip field — output entire line
1350                unsafe {
1351                    buf_extend(
1352                        buf,
1353                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1354                    );
1355                    buf_push(buf, line_delim);
1356                }
1357            } else {
1358                let has_prefix = skip_idx > 0;
1359                let has_suffix = found_end && skip_end_pos < pos;
1360
1361                if has_prefix && has_suffix {
1362                    unsafe {
1363                        buf_extend(
1364                            buf,
1365                            std::slice::from_raw_parts(
1366                                base.add(line_start),
1367                                skip_start_pos - 1 - line_start,
1368                            ),
1369                        );
1370                        buf_push(buf, delim);
1371                        buf_extend(
1372                            buf,
1373                            std::slice::from_raw_parts(
1374                                base.add(skip_end_pos + 1),
1375                                pos - skip_end_pos - 1,
1376                            ),
1377                        );
1378                        buf_push(buf, line_delim);
1379                    }
1380                } else if has_prefix {
1381                    unsafe {
1382                        buf_extend(
1383                            buf,
1384                            std::slice::from_raw_parts(
1385                                base.add(line_start),
1386                                skip_start_pos - 1 - line_start,
1387                            ),
1388                        );
1389                        buf_push(buf, line_delim);
1390                    }
1391                } else if has_suffix {
1392                    unsafe {
1393                        buf_extend(
1394                            buf,
1395                            std::slice::from_raw_parts(
1396                                base.add(skip_end_pos + 1),
1397                                pos - skip_end_pos - 1,
1398                            ),
1399                        );
1400                        buf_push(buf, line_delim);
1401                    }
1402                } else {
1403                    unsafe { buf_push(buf, line_delim) };
1404                }
1405            }
1406
1407            // Reset for next line
1408            line_start = pos + 1;
1409            delim_count = 0;
1410            skip_start_pos = 0;
1411            skip_end_pos = 0;
1412            found_start = need_before == 0;
1413            found_end = false;
1414        } else {
1415            // Delimiter found
1416            delim_count += 1;
1417            if delim_count == need_before {
1418                skip_start_pos = pos + 1;
1419                found_start = true;
1420            }
1421            if delim_count == need_total {
1422                skip_end_pos = pos;
1423                found_end = true;
1424            }
1425        }
1426    }
1427
1428    // Handle last line without trailing line_delim
1429    if line_start < data_len {
1430        let pos = data_len;
1431        if delim_count == 0 {
1432            if !suppress {
1433                unsafe {
1434                    buf_extend(
1435                        buf,
1436                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1437                    );
1438                    buf_push(buf, line_delim);
1439                }
1440            }
1441        } else if !found_start || delim_count < need_before {
1442            unsafe {
1443                buf_extend(
1444                    buf,
1445                    std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1446                );
1447                buf_push(buf, line_delim);
1448            }
1449        } else {
1450            let has_prefix = skip_idx > 0;
1451            let has_suffix = found_end && skip_end_pos < pos;
1452
1453            if has_prefix && has_suffix {
1454                unsafe {
1455                    buf_extend(
1456                        buf,
1457                        std::slice::from_raw_parts(
1458                            base.add(line_start),
1459                            skip_start_pos - 1 - line_start,
1460                        ),
1461                    );
1462                    buf_push(buf, delim);
1463                    buf_extend(
1464                        buf,
1465                        std::slice::from_raw_parts(
1466                            base.add(skip_end_pos + 1),
1467                            pos - skip_end_pos - 1,
1468                        ),
1469                    );
1470                    buf_push(buf, line_delim);
1471                }
1472            } else if has_prefix {
1473                unsafe {
1474                    buf_extend(
1475                        buf,
1476                        std::slice::from_raw_parts(
1477                            base.add(line_start),
1478                            skip_start_pos - 1 - line_start,
1479                        ),
1480                    );
1481                    buf_push(buf, line_delim);
1482                }
1483            } else if has_suffix {
1484                unsafe {
1485                    buf_extend(
1486                        buf,
1487                        std::slice::from_raw_parts(
1488                            base.add(skip_end_pos + 1),
1489                            pos - skip_end_pos - 1,
1490                        ),
1491                    );
1492                    buf_push(buf, line_delim);
1493                }
1494            } else {
1495                unsafe { buf_push(buf, line_delim) };
1496            }
1497        }
1498    }
1499}
1500
1501/// Fallback per-line complement single-field extraction (for delim == line_delim).
1502#[inline(always)]
1503fn complement_single_field_line(
1504    line: &[u8],
1505    delim: u8,
1506    skip_idx: usize,
1507    line_delim: u8,
1508    suppress: bool,
1509    buf: &mut Vec<u8>,
1510) {
1511    let len = line.len();
1512    if len == 0 {
1513        if !suppress {
1514            unsafe { buf_push(buf, line_delim) };
1515        }
1516        return;
1517    }
1518
1519    let base = line.as_ptr();
1520    let need_before = skip_idx;
1521    let need_total = skip_idx + 1;
1522
1523    let mut delim_count: usize = 0;
1524    let mut skip_start_pos: usize = 0;
1525    let mut skip_end_pos: usize = len;
1526    let mut found_end = false;
1527
1528    for pos in memchr_iter(delim, line) {
1529        delim_count += 1;
1530        if delim_count == need_before {
1531            skip_start_pos = pos + 1;
1532        }
1533        if delim_count == need_total {
1534            skip_end_pos = pos;
1535            found_end = true;
1536            break;
1537        }
1538    }
1539
1540    if delim_count == 0 {
1541        if !suppress {
1542            unsafe {
1543                buf_extend(buf, line);
1544                buf_push(buf, line_delim);
1545            }
1546        }
1547        return;
1548    }
1549
1550    if delim_count < need_before {
1551        unsafe {
1552            buf_extend(buf, line);
1553            buf_push(buf, line_delim);
1554        }
1555        return;
1556    }
1557
1558    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
1559    let has_suffix = found_end && skip_end_pos < len;
1560
1561    if has_prefix && has_suffix {
1562        unsafe {
1563            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1564            buf_push(buf, delim);
1565            buf_extend(
1566                buf,
1567                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1568            );
1569            buf_push(buf, line_delim);
1570        }
1571    } else if has_prefix {
1572        unsafe {
1573            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1574            buf_push(buf, line_delim);
1575        }
1576    } else if has_suffix {
1577        unsafe {
1578            buf_extend(
1579                buf,
1580                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1581            );
1582            buf_push(buf, line_delim);
1583        }
1584    } else {
1585        unsafe { buf_push(buf, line_delim) };
1586    }
1587}
1588
1589/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1590/// Zero-copy for the non-parallel path: identifies the truncation point per line
1591/// and writes contiguous runs directly from the source data.
1592fn process_fields_prefix(
1593    data: &[u8],
1594    delim: u8,
1595    line_delim: u8,
1596    last_field: usize,
1597    suppress: bool,
1598    out: &mut impl Write,
1599) -> io::Result<()> {
1600    if data.len() >= PARALLEL_THRESHOLD {
1601        let chunks = split_into_chunks(data, line_delim);
1602        let results: Vec<Vec<u8>> = chunks
1603            .par_iter()
1604            .map(|chunk| {
1605                let mut buf = Vec::with_capacity(chunk.len());
1606                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
1607                buf
1608            })
1609            .collect();
1610        // Use write_vectored (writev) to batch N writes into fewer syscalls
1611        let slices: Vec<IoSlice> = results
1612            .iter()
1613            .filter(|r| !r.is_empty())
1614            .map(|r| IoSlice::new(r))
1615            .collect();
1616        write_ioslices(out, &slices)?;
1617    } else if !suppress {
1618        // Zero-copy fast path: scan for truncation points, write runs from source.
1619        // When suppress is false, every line is output (with or without delimiter).
1620        // Most lines have enough fields, so the output is often identical to input.
1621        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1622    } else {
1623        let mut buf = Vec::with_capacity(data.len());
1624        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1625        if !buf.is_empty() {
1626            out.write_all(&buf)?;
1627        }
1628    }
1629    Ok(())
1630}
1631
1632/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1633/// directly into the source data, flushing in MAX_IOV-sized batches.
1634/// For lines where the Nth delimiter exists, we truncate at that point.
1635/// For lines with fewer fields, we output them unchanged (contiguous run).
1636/// Lines without any delimiter are output unchanged (suppress=false assumed).
1637#[inline]
1638fn fields_prefix_zerocopy(
1639    data: &[u8],
1640    delim: u8,
1641    line_delim: u8,
1642    last_field: usize,
1643    out: &mut impl Write,
1644) -> io::Result<()> {
1645    let newline_buf: [u8; 1] = [line_delim];
1646    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1647    let mut start = 0;
1648    let mut run_start: usize = 0;
1649
1650    for end_pos in memchr_iter(line_delim, data) {
1651        let line = &data[start..end_pos];
1652        let mut field_count = 1;
1653        let mut truncate_at: Option<usize> = None;
1654        for dpos in memchr_iter(delim, line) {
1655            if field_count >= last_field {
1656                truncate_at = Some(start + dpos);
1657                break;
1658            }
1659            field_count += 1;
1660        }
1661
1662        if let Some(trunc_pos) = truncate_at {
1663            if run_start < start {
1664                iov.push(IoSlice::new(&data[run_start..start]));
1665            }
1666            iov.push(IoSlice::new(&data[start..trunc_pos]));
1667            iov.push(IoSlice::new(&newline_buf));
1668            run_start = end_pos + 1;
1669
1670            if iov.len() >= MAX_IOV - 2 {
1671                write_ioslices(out, &iov)?;
1672                iov.clear();
1673            }
1674        }
1675        start = end_pos + 1;
1676    }
1677    // Handle last line without terminator
1678    if start < data.len() {
1679        let line = &data[start..];
1680        let mut field_count = 1;
1681        let mut truncate_at: Option<usize> = None;
1682        for dpos in memchr_iter(delim, line) {
1683            if field_count >= last_field {
1684                truncate_at = Some(start + dpos);
1685                break;
1686            }
1687            field_count += 1;
1688        }
1689        if let Some(trunc_pos) = truncate_at {
1690            if run_start < start {
1691                iov.push(IoSlice::new(&data[run_start..start]));
1692            }
1693            iov.push(IoSlice::new(&data[start..trunc_pos]));
1694            iov.push(IoSlice::new(&newline_buf));
1695            if !iov.is_empty() {
1696                write_ioslices(out, &iov)?;
1697            }
1698            return Ok(());
1699        }
1700    }
1701    // Flush remaining contiguous run
1702    if run_start < data.len() {
1703        iov.push(IoSlice::new(&data[run_start..]));
1704        if !data.is_empty() && *data.last().unwrap() != line_delim {
1705            iov.push(IoSlice::new(&newline_buf));
1706        }
1707    }
1708    if !iov.is_empty() {
1709        write_ioslices(out, &iov)?;
1710    }
1711    Ok(())
1712}
1713
1714/// Process a chunk for contiguous from-start field range extraction.
1715fn fields_prefix_chunk(
1716    data: &[u8],
1717    delim: u8,
1718    line_delim: u8,
1719    last_field: usize,
1720    suppress: bool,
1721    buf: &mut Vec<u8>,
1722) {
1723    buf.reserve(data.len());
1724    let mut start = 0;
1725    for end_pos in memchr_iter(line_delim, data) {
1726        let line = &data[start..end_pos];
1727        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1728        start = end_pos + 1;
1729    }
1730    if start < data.len() {
1731        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1732    }
1733}
1734
1735/// Extract first N fields from one line (contiguous from-start range).
1736/// Uses memchr SIMD for delimiter scanning on all line sizes.
1737#[inline(always)]
1738fn fields_prefix_line(
1739    line: &[u8],
1740    delim: u8,
1741    line_delim: u8,
1742    last_field: usize,
1743    suppress: bool,
1744    buf: &mut Vec<u8>,
1745) {
1746    let len = line.len();
1747    if len == 0 {
1748        if !suppress {
1749            unsafe { buf_push(buf, line_delim) };
1750        }
1751        return;
1752    }
1753
1754    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1755    let base = line.as_ptr();
1756
1757    let mut field_count = 1usize;
1758    let mut has_delim = false;
1759
1760    for pos in memchr_iter(delim, line) {
1761        has_delim = true;
1762        if field_count >= last_field {
1763            unsafe {
1764                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1765                buf_push(buf, line_delim);
1766            }
1767            return;
1768        }
1769        field_count += 1;
1770    }
1771
1772    if !has_delim {
1773        if !suppress {
1774            unsafe {
1775                buf_extend(buf, line);
1776                buf_push(buf, line_delim);
1777            }
1778        }
1779        return;
1780    }
1781
1782    unsafe {
1783        buf_extend(buf, line);
1784        buf_push(buf, line_delim);
1785    }
1786}
1787
1788/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1789fn process_fields_suffix(
1790    data: &[u8],
1791    delim: u8,
1792    line_delim: u8,
1793    start_field: usize,
1794    suppress: bool,
1795    out: &mut impl Write,
1796) -> io::Result<()> {
1797    if data.len() >= PARALLEL_THRESHOLD {
1798        let chunks = split_into_chunks(data, line_delim);
1799        let results: Vec<Vec<u8>> = chunks
1800            .par_iter()
1801            .map(|chunk| {
1802                let mut buf = Vec::with_capacity(chunk.len());
1803                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
1804                buf
1805            })
1806            .collect();
1807        // Use write_vectored (writev) to batch N writes into fewer syscalls
1808        let slices: Vec<IoSlice> = results
1809            .iter()
1810            .filter(|r| !r.is_empty())
1811            .map(|r| IoSlice::new(r))
1812            .collect();
1813        write_ioslices(out, &slices)?;
1814    } else {
1815        let mut buf = Vec::with_capacity(data.len());
1816        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1817        if !buf.is_empty() {
1818            out.write_all(&buf)?;
1819        }
1820    }
1821    Ok(())
1822}
1823
1824/// Process a chunk for open-ended field suffix extraction.
1825fn fields_suffix_chunk(
1826    data: &[u8],
1827    delim: u8,
1828    line_delim: u8,
1829    start_field: usize,
1830    suppress: bool,
1831    buf: &mut Vec<u8>,
1832) {
1833    buf.reserve(data.len());
1834    let mut start = 0;
1835    for end_pos in memchr_iter(line_delim, data) {
1836        let line = &data[start..end_pos];
1837        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1838        start = end_pos + 1;
1839    }
1840    if start < data.len() {
1841        fields_suffix_line(
1842            &data[start..],
1843            delim,
1844            line_delim,
1845            start_field,
1846            suppress,
1847            buf,
1848        );
1849    }
1850}
1851
1852/// Extract fields from start_field to end from one line.
1853/// Uses memchr SIMD for delimiter scanning on all line sizes.
1854#[inline(always)]
1855fn fields_suffix_line(
1856    line: &[u8],
1857    delim: u8,
1858    line_delim: u8,
1859    start_field: usize,
1860    suppress: bool,
1861    buf: &mut Vec<u8>,
1862) {
1863    let len = line.len();
1864    if len == 0 {
1865        if !suppress {
1866            unsafe { buf_push(buf, line_delim) };
1867        }
1868        return;
1869    }
1870
1871    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
1872    let base = line.as_ptr();
1873
1874    let skip_delims = start_field - 1;
1875    let mut delim_count = 0usize;
1876    let mut has_delim = false;
1877
1878    for pos in memchr_iter(delim, line) {
1879        has_delim = true;
1880        delim_count += 1;
1881        if delim_count >= skip_delims {
1882            unsafe {
1883                buf_extend(
1884                    buf,
1885                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1886                );
1887                buf_push(buf, line_delim);
1888            }
1889            return;
1890        }
1891    }
1892
1893    if !has_delim {
1894        if !suppress {
1895            unsafe {
1896                buf_extend(buf, line);
1897                buf_push(buf, line_delim);
1898            }
1899        }
1900        return;
1901    }
1902
1903    // Fewer delimiters than needed
1904    unsafe { buf_push(buf, line_delim) };
1905}
1906
1907/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1908/// Optimized: skip to start_field using memchr, then output until end_field.
1909fn process_fields_mid_range(
1910    data: &[u8],
1911    delim: u8,
1912    line_delim: u8,
1913    start_field: usize,
1914    end_field: usize,
1915    suppress: bool,
1916    out: &mut impl Write,
1917) -> io::Result<()> {
1918    if data.len() >= PARALLEL_THRESHOLD {
1919        let chunks = split_into_chunks(data, line_delim);
1920        let results: Vec<Vec<u8>> = chunks
1921            .par_iter()
1922            .map(|chunk| {
1923                let mut buf = Vec::with_capacity(chunk.len());
1924                fields_mid_range_chunk(
1925                    chunk,
1926                    delim,
1927                    line_delim,
1928                    start_field,
1929                    end_field,
1930                    suppress,
1931                    &mut buf,
1932                );
1933                buf
1934            })
1935            .collect();
1936        let slices: Vec<IoSlice> = results
1937            .iter()
1938            .filter(|r| !r.is_empty())
1939            .map(|r| IoSlice::new(r))
1940            .collect();
1941        write_ioslices(out, &slices)?;
1942    } else {
1943        let mut buf = Vec::with_capacity(data.len());
1944        fields_mid_range_chunk(
1945            data,
1946            delim,
1947            line_delim,
1948            start_field,
1949            end_field,
1950            suppress,
1951            &mut buf,
1952        );
1953        if !buf.is_empty() {
1954            out.write_all(&buf)?;
1955        }
1956    }
1957    Ok(())
1958}
1959
1960/// Process a chunk for contiguous mid-range field extraction.
1961fn fields_mid_range_chunk(
1962    data: &[u8],
1963    delim: u8,
1964    line_delim: u8,
1965    start_field: usize,
1966    end_field: usize,
1967    suppress: bool,
1968    buf: &mut Vec<u8>,
1969) {
1970    buf.reserve(data.len());
1971    let mut start = 0;
1972    for end_pos in memchr_iter(line_delim, data) {
1973        let line = &data[start..end_pos];
1974        fields_mid_range_line(
1975            line,
1976            delim,
1977            line_delim,
1978            start_field,
1979            end_field,
1980            suppress,
1981            buf,
1982        );
1983        start = end_pos + 1;
1984    }
1985    if start < data.len() {
1986        fields_mid_range_line(
1987            &data[start..],
1988            delim,
1989            line_delim,
1990            start_field,
1991            end_field,
1992            suppress,
1993            buf,
1994        );
1995    }
1996}
1997
1998/// Extract fields start_field..=end_field from one line.
1999/// Uses scalar byte scanning for short lines, memchr_iter for longer.
2000/// Raw pointer arithmetic to eliminate bounds checking.
2001#[inline(always)]
2002fn fields_mid_range_line(
2003    line: &[u8],
2004    delim: u8,
2005    line_delim: u8,
2006    start_field: usize,
2007    end_field: usize,
2008    suppress: bool,
2009    buf: &mut Vec<u8>,
2010) {
2011    let len = line.len();
2012    if len == 0 {
2013        if !suppress {
2014            unsafe { buf_push(buf, line_delim) };
2015        }
2016        return;
2017    }
2018
2019    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves data.len()
2020    let base = line.as_ptr();
2021
2022    // Count delimiters to find start_field and end_field boundaries
2023    let skip_before = start_field - 1; // delimiters to skip before start_field
2024    let field_span = end_field - start_field; // additional delimiters within the range
2025    let target_end_delim = skip_before + field_span + 1;
2026    let mut delim_count = 0;
2027    let mut range_start = 0;
2028    let mut has_delim = false;
2029
2030    for pos in memchr_iter(delim, line) {
2031        has_delim = true;
2032        delim_count += 1;
2033        if delim_count == skip_before {
2034            range_start = pos + 1;
2035        }
2036        if delim_count == target_end_delim {
2037            if skip_before == 0 {
2038                range_start = 0;
2039            }
2040            unsafe {
2041                buf_extend(
2042                    buf,
2043                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2044                );
2045                buf_push(buf, line_delim);
2046            }
2047            return;
2048        }
2049    }
2050
2051    if !has_delim {
2052        if !suppress {
2053            unsafe {
2054                buf_extend(buf, line);
2055                buf_push(buf, line_delim);
2056            }
2057        }
2058        return;
2059    }
2060
2061    // Line has delimiters but fewer fields than end_field
2062    if delim_count >= skip_before {
2063        // We have at least start_field, output from range_start to end
2064        if skip_before == 0 {
2065            range_start = 0;
2066        }
2067        unsafe {
2068            buf_extend(
2069                buf,
2070                std::slice::from_raw_parts(base.add(range_start), len - range_start),
2071            );
2072            buf_push(buf, line_delim);
2073        }
2074    } else {
2075        // Not enough fields even for start_field — output empty line
2076        unsafe { buf_push(buf, line_delim) };
2077    }
2078}
2079
2080/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
2081/// directly into the source data, flushing in MAX_IOV-sized batches.
2082/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
2083///
2084/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
2085/// Parallel field-1 extraction for large data using memchr2 single-pass.
2086/// Splits data into per-thread chunks, each chunk extracts field 1 using
2087/// memchr2(delim, newline) which finds the first special byte in one scan.
2088/// For field 1: first special byte is either the delimiter (field end) or
2089/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
2090fn single_field1_parallel(
2091    data: &[u8],
2092    delim: u8,
2093    line_delim: u8,
2094    out: &mut impl Write,
2095) -> io::Result<()> {
2096    let chunks = split_into_chunks(data, line_delim);
2097    let results: Vec<Vec<u8>> = chunks
2098        .par_iter()
2099        .map(|chunk| {
2100            let mut buf = Vec::with_capacity(chunk.len());
2101            single_field1_to_buf(chunk, delim, line_delim, &mut buf);
2102            buf
2103        })
2104        .collect();
2105    let slices: Vec<IoSlice> = results
2106        .iter()
2107        .filter(|r| !r.is_empty())
2108        .map(|r| IoSlice::new(r))
2109        .collect();
2110    write_ioslices(out, &slices)
2111}
2112
2113/// Extract field 1 from a chunk using memchr2 single-pass scanning.
2114/// Uses memchr2(delim, line_delim) to find the first special byte per line:
2115/// - If delimiter: field 1 = data[line_start..delim_pos], skip to next newline
2116/// - If newline: no delimiter on this line, output unchanged
2117/// This scans ~N total bytes vs ~1.5N for two-level (outer newline + inner delimiter).
2118#[inline]
2119fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
2120    use memchr::memchr2;
2121    buf.reserve(data.len());
2122    let mut pos = 0;
2123    while pos < data.len() {
2124        match memchr2(delim, line_delim, &data[pos..]) {
2125            None => {
2126                // Rest is a partial line, no delimiter — output as-is
2127                unsafe {
2128                    buf_extend(buf, &data[pos..]);
2129                }
2130                break;
2131            }
2132            Some(offset) => {
2133                let actual = pos + offset;
2134                if data[actual] == line_delim {
2135                    // No delimiter on this line — output entire line including newline
2136                    unsafe {
2137                        buf_extend(buf, &data[pos..actual + 1]);
2138                    }
2139                    pos = actual + 1;
2140                } else {
2141                    // Delimiter found — output field 1 (up to delimiter) + newline
2142                    unsafe {
2143                        buf_extend(buf, &data[pos..actual]);
2144                        buf_push(buf, line_delim);
2145                    }
2146                    // Skip to next newline
2147                    match memchr::memchr(line_delim, &data[actual + 1..]) {
2148                        None => {
2149                            pos = data.len();
2150                        }
2151                        Some(nl_off) => {
2152                            pos = actual + 1 + nl_off + 1;
2153                        }
2154                    }
2155                }
2156            }
2157        }
2158    }
2159}
2160
2161/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
2162/// directly into the source data. Uses two-level scan: outer memchr(newline)
2163/// for the first delimiter. This is faster than memchr2 for SMALL data because
2164/// the inner scan exits after the FIRST delimiter, skipping all
2165/// subsequent delimiters on the line.
2166///
2167/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
2168/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
2169#[inline]
2170#[allow(dead_code)]
2171fn single_field1_zerocopy(
2172    data: &[u8],
2173    delim: u8,
2174    line_delim: u8,
2175    out: &mut impl Write,
2176) -> io::Result<()> {
2177    let newline_buf: [u8; 1] = [line_delim];
2178
2179    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2180    let mut run_start: usize = 0;
2181    let mut start = 0;
2182
2183    for end_pos in memchr_iter(line_delim, data) {
2184        let line = &data[start..end_pos];
2185        if let Some(dp) = memchr::memchr(delim, line) {
2186            // Line has delimiter — truncate at first delimiter.
2187            // Flush current contiguous run, then add truncated field + newline.
2188            if run_start < start {
2189                iov.push(IoSlice::new(&data[run_start..start]));
2190            }
2191            iov.push(IoSlice::new(&data[start..start + dp]));
2192            iov.push(IoSlice::new(&newline_buf));
2193            run_start = end_pos + 1;
2194
2195            if iov.len() >= MAX_IOV - 2 {
2196                write_ioslices(out, &iov)?;
2197                iov.clear();
2198            }
2199        }
2200        // else: no delimiter in line, output unchanged (stays in contiguous run)
2201        start = end_pos + 1;
2202    }
2203
2204    // Handle last line (no trailing newline)
2205    if start < data.len() {
2206        let line = &data[start..];
2207        if let Some(dp) = memchr::memchr(delim, line) {
2208            if run_start < start {
2209                iov.push(IoSlice::new(&data[run_start..start]));
2210            }
2211            iov.push(IoSlice::new(&data[start..start + dp]));
2212            iov.push(IoSlice::new(&newline_buf));
2213            if !iov.is_empty() {
2214                write_ioslices(out, &iov)?;
2215            }
2216            return Ok(());
2217        }
2218    }
2219
2220    // Flush remaining contiguous run
2221    if run_start < data.len() {
2222        iov.push(IoSlice::new(&data[run_start..]));
2223        if !data.is_empty() && *data.last().unwrap() != line_delim {
2224            iov.push(IoSlice::new(&newline_buf));
2225        }
2226    }
2227    if !iov.is_empty() {
2228        write_ioslices(out, &iov)?;
2229    }
2230    Ok(())
2231}
2232
2233/// Process a chunk of data for single-field extraction.
2234fn process_single_field_chunk(
2235    data: &[u8],
2236    delim: u8,
2237    target_idx: usize,
2238    line_delim: u8,
2239    suppress: bool,
2240    buf: &mut Vec<u8>,
2241) {
2242    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
2243    buf.reserve(data.len());
2244    let mut start = 0;
2245    for end_pos in memchr_iter(line_delim, data) {
2246        let line = &data[start..end_pos];
2247        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
2248        start = end_pos + 1;
2249    }
2250    if start < data.len() {
2251        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
2252    }
2253}
2254
2255/// Extract a single field from one line.
2256/// For short lines (< 256 bytes), uses direct scalar scanning to avoid memchr overhead.
2257/// For longer lines, uses memchr for SIMD-accelerated scanning.
2258/// Raw pointer arithmetic eliminates per-field bounds checking.
2259#[inline(always)]
2260fn extract_single_field_line(
2261    line: &[u8],
2262    delim: u8,
2263    target_idx: usize,
2264    line_delim: u8,
2265    suppress: bool,
2266    buf: &mut Vec<u8>,
2267) {
2268    let len = line.len();
2269    if len == 0 {
2270        if !suppress {
2271            unsafe { buf_push(buf, line_delim) };
2272        }
2273        return;
2274    }
2275
2276    // Note: no per-line buf.reserve — process_single_field_chunk already reserves data.len()
2277    let base = line.as_ptr();
2278
2279    // Ultra-fast path for first field: single memchr
2280    if target_idx == 0 {
2281        match memchr::memchr(delim, line) {
2282            Some(pos) => unsafe {
2283                buf_extend(buf, std::slice::from_raw_parts(base, pos));
2284                buf_push(buf, line_delim);
2285            },
2286            None => {
2287                if !suppress {
2288                    unsafe {
2289                        buf_extend(buf, line);
2290                        buf_push(buf, line_delim);
2291                    }
2292                }
2293            }
2294        }
2295        return;
2296    }
2297
2298    // Use memchr SIMD for all line sizes (faster than scalar even for short lines)
2299    let mut field_start = 0;
2300    let mut field_idx = 0;
2301    let mut has_delim = false;
2302
2303    for pos in memchr_iter(delim, line) {
2304        has_delim = true;
2305        if field_idx == target_idx {
2306            unsafe {
2307                buf_extend(
2308                    buf,
2309                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
2310                );
2311                buf_push(buf, line_delim);
2312            }
2313            return;
2314        }
2315        field_idx += 1;
2316        field_start = pos + 1;
2317    }
2318
2319    if !has_delim {
2320        if !suppress {
2321            unsafe {
2322                buf_extend(buf, line);
2323                buf_push(buf, line_delim);
2324            }
2325        }
2326        return;
2327    }
2328
2329    if field_idx == target_idx {
2330        unsafe {
2331            buf_extend(
2332                buf,
2333                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2334            );
2335            buf_push(buf, line_delim);
2336        }
2337    } else {
2338        unsafe { buf_push(buf, line_delim) };
2339    }
2340}
2341
2342/// Extract fields from a single line into the output buffer.
2343/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2344/// Raw pointer arithmetic eliminates per-field bounds checking.
2345#[inline(always)]
2346fn extract_fields_to_buf(
2347    line: &[u8],
2348    delim: u8,
2349    ranges: &[Range],
2350    output_delim: &[u8],
2351    suppress: bool,
2352    max_field: usize,
2353    field_mask: u64,
2354    line_delim: u8,
2355    buf: &mut Vec<u8>,
2356    complement: bool,
2357) {
2358    let len = line.len();
2359
2360    if len == 0 {
2361        if !suppress {
2362            buf.push(line_delim);
2363        }
2364        return;
2365    }
2366
2367    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2368    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2369    let needed = len + output_delim.len() * 16 + 1;
2370    if buf.capacity() - buf.len() < needed {
2371        buf.reserve(needed);
2372    }
2373
2374    let base = line.as_ptr();
2375    let mut field_num: usize = 1;
2376    let mut field_start: usize = 0;
2377    let mut first_output = true;
2378    let mut has_delim = false;
2379
2380    // Use memchr SIMD for all line sizes
2381    for delim_pos in memchr_iter(delim, line) {
2382        has_delim = true;
2383
2384        if is_selected(field_num, field_mask, ranges, complement) {
2385            if !first_output {
2386                unsafe { buf_extend(buf, output_delim) };
2387            }
2388            unsafe {
2389                buf_extend(
2390                    buf,
2391                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2392                )
2393            };
2394            first_output = false;
2395        }
2396
2397        field_num += 1;
2398        field_start = delim_pos + 1;
2399
2400        if field_num > max_field {
2401            break;
2402        }
2403    }
2404
2405    // Last field
2406    if (field_num <= max_field || complement)
2407        && has_delim
2408        && is_selected(field_num, field_mask, ranges, complement)
2409    {
2410        if !first_output {
2411            unsafe { buf_extend(buf, output_delim) };
2412        }
2413        unsafe {
2414            buf_extend(
2415                buf,
2416                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2417            )
2418        };
2419        first_output = false;
2420    }
2421
2422    if !first_output {
2423        unsafe { buf_push(buf, line_delim) };
2424    } else if !has_delim {
2425        if !suppress {
2426            unsafe {
2427                buf_extend(buf, line);
2428                buf_push(buf, line_delim);
2429            }
2430        }
2431    } else {
2432        unsafe { buf_push(buf, line_delim) };
2433    }
2434}
2435
2436// ── Fast path: byte/char extraction with batched output ──────────────────
2437
2438/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2439/// Zero-copy: writes directly from the source data using output runs.
2440/// For lines shorter than max_bytes, the output is identical to the input,
2441/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2442fn process_bytes_from_start(
2443    data: &[u8],
2444    max_bytes: usize,
2445    line_delim: u8,
2446    out: &mut impl Write,
2447) -> io::Result<()> {
2448    // Fast path: if all lines fit within max_bytes, output = input.
2449    // Single memchr scan with early exit on first oversized line.
2450    // For `-b1-100` on CSV where average line is < 100 bytes, this
2451    // skips all per-line processing and outputs the data directly.
2452    if max_bytes > 0 && max_bytes < usize::MAX {
2453        let mut start = 0;
2454        let mut all_fit = true;
2455        for pos in memchr_iter(line_delim, data) {
2456            if pos - start > max_bytes {
2457                all_fit = false;
2458                break;
2459            }
2460            start = pos + 1;
2461        }
2462        // Check last line (no trailing delimiter)
2463        if all_fit && start < data.len() && data.len() - start > max_bytes {
2464            all_fit = false;
2465        }
2466        if all_fit {
2467            // All lines fit: output = input. Handle missing trailing delimiter.
2468            if !data.is_empty() && data[data.len() - 1] == line_delim {
2469                return out.write_all(data);
2470            } else if !data.is_empty() {
2471                out.write_all(data)?;
2472                return out.write_all(&[line_delim]);
2473            }
2474            return Ok(());
2475        }
2476    }
2477
2478    if data.len() >= PARALLEL_THRESHOLD {
2479        let chunks = split_into_chunks(data, line_delim);
2480        let results: Vec<Vec<u8>> = chunks
2481            .par_iter()
2482            .map(|chunk| {
2483                // Estimate output size without scanning: assume average line
2484                // is at least (max_bytes+1) bytes (otherwise no truncation).
2485                // For cut -b1-5 on 50-char lines: output ~ chunk.len() * 6/51 ~ chunk/8.
2486                // Using chunk.len()/4 as initial capacity handles most cases
2487                // without reallocation, while avoiding the extra memchr scan.
2488                let est_out = (chunk.len() / 4).max(max_bytes + 2);
2489                let mut buf = Vec::with_capacity(est_out.min(chunk.len()));
2490                bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
2491                buf
2492            })
2493            .collect();
2494        // Use write_vectored (writev) to batch N writes into fewer syscalls
2495        let slices: Vec<IoSlice> = results
2496            .iter()
2497            .filter(|r| !r.is_empty())
2498            .map(|r| IoSlice::new(r))
2499            .collect();
2500        write_ioslices(out, &slices)?;
2501    } else {
2502        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2503        // because every line gets truncated, creating 3 IoSlice entries per line.
2504        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2505        // managing millions of IoSlice entries through the kernel.
2506        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2507        if max_bytes <= 512 {
2508            // Estimate output size without scanning: output <= data.len(),
2509            // typically ~data.len()/4 for short max_bytes on longer lines.
2510            let est_out = (data.len() / 4).max(max_bytes + 2);
2511            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2512            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2513            if !buf.is_empty() {
2514                out.write_all(&buf)?;
2515            }
2516        } else {
2517            // Zero-copy path: track contiguous output runs and write directly from source.
2518            // For lines <= max_bytes, we include them as-is (no copy needed).
2519            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2520            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2521        }
2522    }
2523    Ok(())
2524}
2525
2526/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2527/// directly into the source data, flushing in MAX_IOV-sized batches.
2528/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2529/// produce two IoSlices (truncated data + newline).
2530#[inline]
2531fn bytes_from_start_zerocopy(
2532    data: &[u8],
2533    max_bytes: usize,
2534    line_delim: u8,
2535    out: &mut impl Write,
2536) -> io::Result<()> {
2537    let newline_buf: [u8; 1] = [line_delim];
2538    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2539    let mut start = 0;
2540    let mut run_start: usize = 0;
2541
2542    for pos in memchr_iter(line_delim, data) {
2543        let line_len = pos - start;
2544        if line_len > max_bytes {
2545            // This line needs truncation
2546            if run_start < start {
2547                iov.push(IoSlice::new(&data[run_start..start]));
2548            }
2549            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2550            iov.push(IoSlice::new(&newline_buf));
2551            run_start = pos + 1;
2552
2553            if iov.len() >= MAX_IOV - 2 {
2554                write_ioslices(out, &iov)?;
2555                iov.clear();
2556            }
2557        }
2558        start = pos + 1;
2559    }
2560    // Handle last line without terminator
2561    if start < data.len() {
2562        let line_len = data.len() - start;
2563        if line_len > max_bytes {
2564            if run_start < start {
2565                iov.push(IoSlice::new(&data[run_start..start]));
2566            }
2567            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2568            iov.push(IoSlice::new(&newline_buf));
2569            if !iov.is_empty() {
2570                write_ioslices(out, &iov)?;
2571            }
2572            return Ok(());
2573        }
2574    }
2575    // Flush remaining contiguous run
2576    if run_start < data.len() {
2577        iov.push(IoSlice::new(&data[run_start..]));
2578        if !data.is_empty() && *data.last().unwrap() != line_delim {
2579            iov.push(IoSlice::new(&newline_buf));
2580        }
2581    }
2582    if !iov.is_empty() {
2583        write_ioslices(out, &iov)?;
2584    }
2585    Ok(())
2586}
2587
2588/// Process a chunk for from-start byte range extraction (parallel path).
2589/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2590/// Pre-reserves data.len() (output never exceeds input), then uses a single
2591/// write pointer with deferred set_len — no per-line capacity checks.
2592#[inline]
2593fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2594    // Output is always <= input size (we only truncate, never expand).
2595    // Single reserve eliminates ALL per-line capacity checks.
2596    buf.reserve(data.len());
2597
2598    let src = data.as_ptr();
2599    let dst_base = buf.as_mut_ptr();
2600    let mut wp = buf.len();
2601    let mut start = 0;
2602
2603    for pos in memchr_iter(line_delim, data) {
2604        let line_len = pos - start;
2605        let take = line_len.min(max_bytes);
2606        unsafe {
2607            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2608            *dst_base.add(wp + take) = line_delim;
2609        }
2610        wp += take + 1;
2611        start = pos + 1;
2612    }
2613    // Handle last line without terminator
2614    if start < data.len() {
2615        let line_len = data.len() - start;
2616        let take = line_len.min(max_bytes);
2617        unsafe {
2618            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2619            *dst_base.add(wp + take) = line_delim;
2620        }
2621        wp += take + 1;
2622    }
2623    unsafe { buf.set_len(wp) };
2624}
2625
2626/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2627fn process_bytes_from_offset(
2628    data: &[u8],
2629    skip_bytes: usize,
2630    line_delim: u8,
2631    out: &mut impl Write,
2632) -> io::Result<()> {
2633    if data.len() >= PARALLEL_THRESHOLD {
2634        let chunks = split_into_chunks(data, line_delim);
2635        let results: Vec<Vec<u8>> = chunks
2636            .par_iter()
2637            .map(|chunk| {
2638                let mut buf = Vec::with_capacity(chunk.len());
2639                bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
2640                buf
2641            })
2642            .collect();
2643        // Use write_vectored (writev) to batch N writes into fewer syscalls
2644        let slices: Vec<IoSlice> = results
2645            .iter()
2646            .filter(|r| !r.is_empty())
2647            .map(|r| IoSlice::new(r))
2648            .collect();
2649        write_ioslices(out, &slices)?;
2650    } else {
2651        // Zero-copy: write suffix of each line directly from source
2652        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2653    }
2654    Ok(())
2655}
2656
2657/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2658/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2659/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2660#[inline]
2661fn bytes_from_offset_zerocopy(
2662    data: &[u8],
2663    skip_bytes: usize,
2664    line_delim: u8,
2665    out: &mut impl Write,
2666) -> io::Result<()> {
2667    let delim_buf = [line_delim];
2668    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2669
2670    let mut start = 0;
2671    for pos in memchr_iter(line_delim, data) {
2672        let line_len = pos - start;
2673        if line_len > skip_bytes {
2674            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2675        }
2676        iov.push(IoSlice::new(&delim_buf));
2677        // Flush when approaching MAX_IOV to avoid oversized writev
2678        if iov.len() >= MAX_IOV - 1 {
2679            write_ioslices(out, &iov)?;
2680            iov.clear();
2681        }
2682        start = pos + 1;
2683    }
2684    if start < data.len() {
2685        let line_len = data.len() - start;
2686        if line_len > skip_bytes {
2687            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2688        }
2689        iov.push(IoSlice::new(&delim_buf));
2690    }
2691    if !iov.is_empty() {
2692        write_ioslices(out, &iov)?;
2693    }
2694    Ok(())
2695}
2696
2697/// Process a chunk for from-offset byte range extraction.
2698/// Single reserve + deferred set_len for zero per-line overhead.
2699#[inline]
2700fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2701    buf.reserve(data.len());
2702
2703    let src = data.as_ptr();
2704    let dst_base = buf.as_mut_ptr();
2705    let mut wp = buf.len();
2706    let mut start = 0;
2707
2708    for pos in memchr_iter(line_delim, data) {
2709        let line_len = pos - start;
2710        if line_len > skip_bytes {
2711            let take = line_len - skip_bytes;
2712            unsafe {
2713                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2714            }
2715            wp += take;
2716        }
2717        unsafe {
2718            *dst_base.add(wp) = line_delim;
2719        }
2720        wp += 1;
2721        start = pos + 1;
2722    }
2723    if start < data.len() {
2724        let line_len = data.len() - start;
2725        if line_len > skip_bytes {
2726            let take = line_len - skip_bytes;
2727            unsafe {
2728                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2729            }
2730            wp += take;
2731        }
2732        unsafe {
2733            *dst_base.add(wp) = line_delim;
2734        }
2735        wp += 1;
2736    }
2737    unsafe { buf.set_len(wp) };
2738}
2739
2740/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2741fn process_bytes_mid_range(
2742    data: &[u8],
2743    start_byte: usize,
2744    end_byte: usize,
2745    line_delim: u8,
2746    out: &mut impl Write,
2747) -> io::Result<()> {
2748    let skip = start_byte.saturating_sub(1);
2749
2750    if data.len() >= PARALLEL_THRESHOLD {
2751        let chunks = split_into_chunks(data, line_delim);
2752        let results: Vec<Vec<u8>> = chunks
2753            .par_iter()
2754            .map(|chunk| {
2755                let mut buf = Vec::with_capacity(chunk.len());
2756                bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, &mut buf);
2757                buf
2758            })
2759            .collect();
2760        let slices: Vec<IoSlice> = results
2761            .iter()
2762            .filter(|r| !r.is_empty())
2763            .map(|r| IoSlice::new(r))
2764            .collect();
2765        write_ioslices(out, &slices)?;
2766    } else {
2767        let mut buf = Vec::with_capacity(data.len());
2768        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2769        if !buf.is_empty() {
2770            out.write_all(&buf)?;
2771        }
2772    }
2773    Ok(())
2774}
2775
2776/// Process a chunk for mid-range byte extraction.
2777/// For each line, output bytes skip..min(line_len, end_byte).
2778/// Single reserve + deferred set_len.
2779#[inline]
2780fn bytes_mid_range_chunk(
2781    data: &[u8],
2782    skip: usize,
2783    end_byte: usize,
2784    line_delim: u8,
2785    buf: &mut Vec<u8>,
2786) {
2787    buf.reserve(data.len());
2788
2789    let src = data.as_ptr();
2790    let dst_base = buf.as_mut_ptr();
2791    let mut wp = buf.len();
2792    let mut start = 0;
2793
2794    for pos in memchr_iter(line_delim, data) {
2795        let line_len = pos - start;
2796        if line_len > skip {
2797            let take_end = line_len.min(end_byte);
2798            let take = take_end - skip;
2799            unsafe {
2800                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2801            }
2802            wp += take;
2803        }
2804        unsafe {
2805            *dst_base.add(wp) = line_delim;
2806        }
2807        wp += 1;
2808        start = pos + 1;
2809    }
2810    if start < data.len() {
2811        let line_len = data.len() - start;
2812        if line_len > skip {
2813            let take_end = line_len.min(end_byte);
2814            let take = take_end - skip;
2815            unsafe {
2816                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2817            }
2818            wp += take;
2819        }
2820        unsafe {
2821            *dst_base.add(wp) = line_delim;
2822        }
2823        wp += 1;
2824    }
2825    unsafe { buf.set_len(wp) };
2826}
2827
2828/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2829fn process_bytes_complement_mid(
2830    data: &[u8],
2831    skip_start: usize,
2832    skip_end: usize,
2833    line_delim: u8,
2834    out: &mut impl Write,
2835) -> io::Result<()> {
2836    let prefix_bytes = skip_start - 1; // bytes before the skip region
2837    if data.len() >= PARALLEL_THRESHOLD {
2838        let chunks = split_into_chunks(data, line_delim);
2839        let results: Vec<Vec<u8>> = chunks
2840            .par_iter()
2841            .map(|chunk| {
2842                let mut buf = Vec::with_capacity(chunk.len());
2843                bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, &mut buf);
2844                buf
2845            })
2846            .collect();
2847        let slices: Vec<IoSlice> = results
2848            .iter()
2849            .filter(|r| !r.is_empty())
2850            .map(|r| IoSlice::new(r))
2851            .collect();
2852        write_ioslices(out, &slices)?;
2853    } else {
2854        let mut buf = Vec::with_capacity(data.len());
2855        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
2856        if !buf.is_empty() {
2857            out.write_all(&buf)?;
2858        }
2859    }
2860    Ok(())
2861}
2862
2863/// Process a chunk for complement mid-range byte extraction.
2864/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
2865#[inline]
2866fn bytes_complement_mid_chunk(
2867    data: &[u8],
2868    prefix_bytes: usize,
2869    skip_end: usize,
2870    line_delim: u8,
2871    buf: &mut Vec<u8>,
2872) {
2873    buf.reserve(data.len());
2874
2875    let src = data.as_ptr();
2876    let dst_base = buf.as_mut_ptr();
2877    let mut wp = buf.len();
2878    let mut start = 0;
2879
2880    for pos in memchr_iter(line_delim, data) {
2881        let line_len = pos - start;
2882        // Copy prefix (bytes before skip region)
2883        let take_prefix = prefix_bytes.min(line_len);
2884        if take_prefix > 0 {
2885            unsafe {
2886                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2887            }
2888            wp += take_prefix;
2889        }
2890        // Copy suffix (bytes after skip region)
2891        if line_len > skip_end {
2892            let suffix_len = line_len - skip_end;
2893            unsafe {
2894                std::ptr::copy_nonoverlapping(
2895                    src.add(start + skip_end),
2896                    dst_base.add(wp),
2897                    suffix_len,
2898                );
2899            }
2900            wp += suffix_len;
2901        }
2902        unsafe {
2903            *dst_base.add(wp) = line_delim;
2904        }
2905        wp += 1;
2906        start = pos + 1;
2907    }
2908    if start < data.len() {
2909        let line_len = data.len() - start;
2910        let take_prefix = prefix_bytes.min(line_len);
2911        if take_prefix > 0 {
2912            unsafe {
2913                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2914            }
2915            wp += take_prefix;
2916        }
2917        if line_len > skip_end {
2918            let suffix_len = line_len - skip_end;
2919            unsafe {
2920                std::ptr::copy_nonoverlapping(
2921                    src.add(start + skip_end),
2922                    dst_base.add(wp),
2923                    suffix_len,
2924                );
2925            }
2926            wp += suffix_len;
2927        }
2928        unsafe {
2929            *dst_base.add(wp) = line_delim;
2930        }
2931        wp += 1;
2932    }
2933    unsafe { buf.set_len(wp) };
2934}
2935
2936/// Optimized byte/char extraction with batched output and parallel processing.
2937fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2938    let line_delim = cfg.line_delim;
2939    let ranges = cfg.ranges;
2940    let complement = cfg.complement;
2941    let output_delim = cfg.output_delim;
2942
2943    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
2944    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
2945        let max_bytes = ranges[0].end;
2946        if max_bytes < usize::MAX {
2947            return process_bytes_from_start(data, max_bytes, line_delim, out);
2948        }
2949    }
2950
2951    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
2952    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
2953        let skip_bytes = ranges[0].start.saturating_sub(1);
2954        if skip_bytes > 0 {
2955            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
2956        }
2957    }
2958
2959    // Fast path: single mid-range (e.g., cut -b5-100)
2960    if !complement
2961        && ranges.len() == 1
2962        && ranges[0].start > 1
2963        && ranges[0].end < usize::MAX
2964        && output_delim.is_empty()
2965    {
2966        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
2967    }
2968
2969    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
2970    if complement
2971        && ranges.len() == 1
2972        && ranges[0].start == 1
2973        && ranges[0].end < usize::MAX
2974        && output_delim.is_empty()
2975    {
2976        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
2977    }
2978
2979    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
2980    if complement
2981        && ranges.len() == 1
2982        && ranges[0].end == usize::MAX
2983        && ranges[0].start > 1
2984        && output_delim.is_empty()
2985    {
2986        let max_bytes = ranges[0].start - 1;
2987        return process_bytes_from_start(data, max_bytes, line_delim, out);
2988    }
2989
2990    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
2991    if complement
2992        && ranges.len() == 1
2993        && ranges[0].start > 1
2994        && ranges[0].end < usize::MAX
2995        && output_delim.is_empty()
2996    {
2997        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
2998    }
2999
3000    if data.len() >= PARALLEL_THRESHOLD {
3001        let chunks = split_into_chunks(data, line_delim);
3002        let results: Vec<Vec<u8>> = chunks
3003            .par_iter()
3004            .map(|chunk| {
3005                let mut buf = Vec::with_capacity(chunk.len());
3006                process_bytes_chunk(
3007                    chunk,
3008                    ranges,
3009                    complement,
3010                    output_delim,
3011                    line_delim,
3012                    &mut buf,
3013                );
3014                buf
3015            })
3016            .collect();
3017        // Use write_vectored (writev) to batch N writes into fewer syscalls
3018        let slices: Vec<IoSlice> = results
3019            .iter()
3020            .filter(|r| !r.is_empty())
3021            .map(|r| IoSlice::new(r))
3022            .collect();
3023        write_ioslices(out, &slices)?;
3024    } else {
3025        let mut buf = Vec::with_capacity(data.len());
3026        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
3027        if !buf.is_empty() {
3028            out.write_all(&buf)?;
3029        }
3030    }
3031    Ok(())
3032}
3033
3034/// Process a chunk of data for byte/char extraction.
3035/// Uses raw pointer arithmetic for the newline scan.
3036/// Complement single-range fast path: compute complement ranges once, then use
3037/// the non-complement multi-range path which is more cache-friendly.
3038fn process_bytes_chunk(
3039    data: &[u8],
3040    ranges: &[Range],
3041    complement: bool,
3042    output_delim: &[u8],
3043    line_delim: u8,
3044    buf: &mut Vec<u8>,
3045) {
3046    buf.reserve(data.len());
3047    let base = data.as_ptr();
3048    let mut start = 0;
3049    for end_pos in memchr_iter(line_delim, data) {
3050        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
3051        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3052        unsafe { buf_push(buf, line_delim) };
3053        start = end_pos + 1;
3054    }
3055    if start < data.len() {
3056        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
3057        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3058        unsafe { buf_push(buf, line_delim) };
3059    }
3060}
3061
3062/// Extract byte ranges from a line into the output buffer.
3063/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
3064/// Raw pointer arithmetic eliminates per-range bounds checking.
3065#[inline(always)]
3066fn cut_bytes_to_buf(
3067    line: &[u8],
3068    ranges: &[Range],
3069    complement: bool,
3070    output_delim: &[u8],
3071    buf: &mut Vec<u8>,
3072) {
3073    let len = line.len();
3074    let base = line.as_ptr();
3075    let mut first_range = true;
3076
3077    // Reserve worst case: full line + delimiters between ranges
3078    let needed = len + output_delim.len() * ranges.len() + 1;
3079    if buf.capacity() - buf.len() < needed {
3080        buf.reserve(needed);
3081    }
3082
3083    if complement {
3084        let mut pos: usize = 1;
3085        for r in ranges {
3086            let rs = r.start;
3087            let re = r.end.min(len);
3088            if pos < rs {
3089                if !first_range && !output_delim.is_empty() {
3090                    unsafe { buf_extend(buf, output_delim) };
3091                }
3092                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
3093                first_range = false;
3094            }
3095            pos = re + 1;
3096            if pos > len {
3097                break;
3098            }
3099        }
3100        if pos <= len {
3101            if !first_range && !output_delim.is_empty() {
3102                unsafe { buf_extend(buf, output_delim) };
3103            }
3104            unsafe {
3105                buf_extend(
3106                    buf,
3107                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
3108                )
3109            };
3110        }
3111    } else if output_delim.is_empty() && ranges.len() == 1 {
3112        // Ultra-fast path: single range, no output delimiter
3113        let start = ranges[0].start.saturating_sub(1);
3114        let end = ranges[0].end.min(len);
3115        if start < len {
3116            unsafe {
3117                buf_extend(
3118                    buf,
3119                    std::slice::from_raw_parts(base.add(start), end - start),
3120                )
3121            };
3122        }
3123    } else {
3124        for r in ranges {
3125            let start = r.start.saturating_sub(1);
3126            let end = r.end.min(len);
3127            if start >= len {
3128                break;
3129            }
3130            if !first_range && !output_delim.is_empty() {
3131                unsafe { buf_extend(buf, output_delim) };
3132            }
3133            unsafe {
3134                buf_extend(
3135                    buf,
3136                    std::slice::from_raw_parts(base.add(start), end - start),
3137                )
3138            };
3139            first_range = false;
3140        }
3141    }
3142}
3143
3144// ── Public API ───────────────────────────────────────────────────────────
3145
3146/// Cut fields from a line using a delimiter. Writes to `out`.
3147#[inline]
3148pub fn cut_fields(
3149    line: &[u8],
3150    delim: u8,
3151    ranges: &[Range],
3152    complement: bool,
3153    output_delim: &[u8],
3154    suppress_no_delim: bool,
3155    out: &mut impl Write,
3156) -> io::Result<bool> {
3157    if memchr::memchr(delim, line).is_none() {
3158        if !suppress_no_delim {
3159            out.write_all(line)?;
3160            return Ok(true);
3161        }
3162        return Ok(false);
3163    }
3164
3165    let mut field_num: usize = 1;
3166    let mut field_start: usize = 0;
3167    let mut first_output = true;
3168
3169    for delim_pos in memchr_iter(delim, line) {
3170        let selected = in_ranges(ranges, field_num) != complement;
3171        if selected {
3172            if !first_output {
3173                out.write_all(output_delim)?;
3174            }
3175            out.write_all(&line[field_start..delim_pos])?;
3176            first_output = false;
3177        }
3178        field_start = delim_pos + 1;
3179        field_num += 1;
3180    }
3181
3182    let selected = in_ranges(ranges, field_num) != complement;
3183    if selected {
3184        if !first_output {
3185            out.write_all(output_delim)?;
3186        }
3187        out.write_all(&line[field_start..])?;
3188    }
3189
3190    Ok(true)
3191}
3192
3193/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3194#[inline]
3195pub fn cut_bytes(
3196    line: &[u8],
3197    ranges: &[Range],
3198    complement: bool,
3199    output_delim: &[u8],
3200    out: &mut impl Write,
3201) -> io::Result<bool> {
3202    let mut first_range = true;
3203
3204    if complement {
3205        let len = line.len();
3206        let mut comp_ranges = Vec::new();
3207        let mut pos: usize = 1;
3208        for r in ranges {
3209            let rs = r.start;
3210            let re = r.end.min(len);
3211            if pos < rs {
3212                comp_ranges.push((pos, rs - 1));
3213            }
3214            pos = re + 1;
3215            if pos > len {
3216                break;
3217            }
3218        }
3219        if pos <= len {
3220            comp_ranges.push((pos, len));
3221        }
3222        for &(s, e) in &comp_ranges {
3223            if !first_range && !output_delim.is_empty() {
3224                out.write_all(output_delim)?;
3225            }
3226            out.write_all(&line[s - 1..e])?;
3227            first_range = false;
3228        }
3229    } else {
3230        for r in ranges {
3231            let start = r.start.saturating_sub(1);
3232            let end = r.end.min(line.len());
3233            if start >= line.len() {
3234                break;
3235            }
3236            if !first_range && !output_delim.is_empty() {
3237                out.write_all(output_delim)?;
3238            }
3239            out.write_all(&line[start..end])?;
3240            first_range = false;
3241        }
3242    }
3243    Ok(true)
3244}
3245
3246/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
3247/// Output is always <= input (we remove everything after first delimiter per line).
3248/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
3249/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
3250///
3251/// Lines without delimiter pass through unchanged (unless suppress=true).
3252/// Lines with delimiter: keep bytes before delimiter + newline.
3253pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
3254    let len = data.len();
3255    let mut wp: usize = 0;
3256    let mut rp: usize = 0;
3257
3258    while rp < len {
3259        match memchr::memchr2(delim, line_delim, &data[rp..]) {
3260            None => {
3261                // Rest is partial line, no delimiter
3262                if suppress {
3263                    // suppress: skip lines without delimiter
3264                    break;
3265                }
3266                let remaining = len - rp;
3267                if wp != rp {
3268                    data.copy_within(rp..len, wp);
3269                }
3270                wp += remaining;
3271                break;
3272            }
3273            Some(offset) => {
3274                let actual = rp + offset;
3275                if data[actual] == line_delim {
3276                    // No delimiter on this line
3277                    if suppress {
3278                        // Skip this line entirely
3279                        rp = actual + 1;
3280                    } else {
3281                        // Output entire line including newline
3282                        let chunk_len = actual + 1 - rp;
3283                        if wp != rp {
3284                            data.copy_within(rp..actual + 1, wp);
3285                        }
3286                        wp += chunk_len;
3287                        rp = actual + 1;
3288                    }
3289                } else {
3290                    // Delimiter found: output field 1 (up to delimiter) + newline
3291                    let field_len = actual - rp;
3292                    if wp != rp && field_len > 0 {
3293                        data.copy_within(rp..actual, wp);
3294                    }
3295                    wp += field_len;
3296                    data[wp] = line_delim;
3297                    wp += 1;
3298                    // Skip to next newline
3299                    match memchr::memchr(line_delim, &data[actual + 1..]) {
3300                        None => {
3301                            rp = len;
3302                        }
3303                        Some(nl_off) => {
3304                            rp = actual + 1 + nl_off + 1;
3305                        }
3306                    }
3307                }
3308            }
3309        }
3310    }
3311    wp
3312}
3313
3314/// Process a full data buffer (from mmap or read) with cut operation.
3315pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3316    match cfg.mode {
3317        CutMode::Fields => process_fields_fast(data, cfg, out),
3318        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3319    }
3320}
3321
3322/// Process input from a reader (for stdin).
3323/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3324/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3325/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3326pub fn process_cut_reader<R: BufRead>(
3327    mut reader: R,
3328    cfg: &CutConfig,
3329    out: &mut impl Write,
3330) -> io::Result<()> {
3331    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3332    let line_delim = cfg.line_delim;
3333
3334    // Read large chunks and process in batch.
3335    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3336    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3337
3338    loop {
3339        // Read up to CHUNK_SIZE bytes
3340        buf.reserve(CHUNK_SIZE);
3341        let read_start = buf.len();
3342        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3343        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3344        buf.truncate(read_start + n);
3345
3346        if buf.is_empty() {
3347            break;
3348        }
3349
3350        if n == 0 {
3351            // EOF with leftover data (last line without terminator)
3352            process_cut_data(&buf, cfg, out)?;
3353            break;
3354        }
3355
3356        // Find the last line delimiter in the buffer so we process complete lines
3357        let process_end = match memchr::memrchr(line_delim, &buf) {
3358            Some(pos) => pos + 1,
3359            None => {
3360                // No line delimiter found — keep accumulating
3361                continue;
3362            }
3363        };
3364
3365        // Process the complete lines using the fast batch path
3366        process_cut_data(&buf[..process_end], cfg, out)?;
3367
3368        // Shift leftover to the front for next iteration
3369        let leftover_len = buf.len() - process_end;
3370        if leftover_len > 0 {
3371            buf.copy_within(process_end.., 0);
3372        }
3373        buf.truncate(leftover_len);
3374    }
3375
3376    Ok(())
3377}
3378
/// Reads into `buf` until it is full or the reader reports end of input.
///
/// Returns the number of bytes read; a value smaller than `buf.len()` means
/// end of input was reached. Reads that fail with `ErrorKind::Interrupted` are
/// retried, including the very first one.
///
/// # Errors
///
/// Returns the first error that is not `ErrorKind::Interrupted`.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // End of input.
            Ok(0) => break,
            Ok(n) => total += n,
            // A signal interrupted the call before any data arrived: retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3398
3399/// In-place cut processing for mutable data buffers.
3400/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3401/// for the given configuration (caller should fall back to regular processing).
3402///
3403/// In-place avoids allocating intermediate output buffers — the result is written
3404/// directly into the input buffer (output is always <= input for non-complement modes
3405/// with default output delimiter).
3406pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3407    if cfg.complement {
3408        return None;
3409    }
3410
3411    match cfg.mode {
3412        CutMode::Fields => {
3413            // Only handle when output delimiter matches input (single-byte)
3414            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3415                return None;
3416            }
3417            if cfg.delim == cfg.line_delim {
3418                return None;
3419            }
3420            Some(cut_fields_inplace_general(
3421                data,
3422                cfg.delim,
3423                cfg.line_delim,
3424                cfg.ranges,
3425                cfg.suppress_no_delim,
3426            ))
3427        }
3428        CutMode::Bytes | CutMode::Characters => {
3429            if !cfg.output_delim.is_empty() {
3430                return None;
3431            }
3432            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3433        }
3434    }
3435}
3436
3437/// In-place generalized field extraction.
3438/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
3439fn cut_fields_inplace_general(
3440    data: &mut [u8],
3441    delim: u8,
3442    line_delim: u8,
3443    ranges: &[Range],
3444    suppress: bool,
3445) -> usize {
3446    // Special case: field 1 only (existing optimized path)
3447    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
3448        return cut_field1_inplace(data, delim, line_delim, suppress);
3449    }
3450
3451    let len = data.len();
3452    if len == 0 {
3453        return 0;
3454    }
3455
3456    let max_field = ranges.last().map_or(0, |r| r.end);
3457    let max_delims = max_field.min(64);
3458    let mut wp: usize = 0;
3459    let mut rp: usize = 0;
3460
3461    while rp < len {
3462        let line_end = memchr::memchr(line_delim, &data[rp..])
3463            .map(|p| rp + p)
3464            .unwrap_or(len);
3465        let line_len = line_end - rp;
3466
3467        // Collect delimiter positions (relative to line start)
3468        let mut delim_pos = [0usize; 64];
3469        let mut num_delims: usize = 0;
3470
3471        for pos in memchr_iter(delim, &data[rp..line_end]) {
3472            if num_delims < max_delims {
3473                delim_pos[num_delims] = pos;
3474                num_delims += 1;
3475                if num_delims >= max_delims {
3476                    break;
3477                }
3478            }
3479        }
3480
3481        if num_delims == 0 {
3482            // No delimiter in line
3483            if !suppress {
3484                if wp != rp {
3485                    data.copy_within(rp..line_end, wp);
3486                }
3487                wp += line_len;
3488                if line_end < len {
3489                    data[wp] = line_delim;
3490                    wp += 1;
3491                }
3492            }
3493        } else {
3494            let total_fields = num_delims + 1;
3495            let mut first_output = true;
3496
3497            for r in ranges {
3498                let range_start = r.start;
3499                let range_end = r.end.min(total_fields);
3500                if range_start > total_fields {
3501                    break;
3502                }
3503                for field_num in range_start..=range_end {
3504                    if field_num > total_fields {
3505                        break;
3506                    }
3507
3508                    let field_start = if field_num == 1 {
3509                        0
3510                    } else if field_num - 2 < num_delims {
3511                        delim_pos[field_num - 2] + 1
3512                    } else {
3513                        continue;
3514                    };
3515                    let field_end = if field_num <= num_delims {
3516                        delim_pos[field_num - 1]
3517                    } else {
3518                        line_len
3519                    };
3520
3521                    if !first_output {
3522                        data[wp] = delim;
3523                        wp += 1;
3524                    }
3525                    let flen = field_end - field_start;
3526                    if flen > 0 {
3527                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
3528                        wp += flen;
3529                    }
3530                    first_output = false;
3531                }
3532            }
3533
3534            if !first_output && line_end < len {
3535                data[wp] = line_delim;
3536                wp += 1;
3537            } else if first_output && line_end < len {
3538                // No fields selected but line had delimiters — output empty line
3539                data[wp] = line_delim;
3540                wp += 1;
3541            }
3542        }
3543
3544        rp = if line_end < len { line_end + 1 } else { len };
3545    }
3546
3547    wp
3548}
3549
3550/// In-place byte/char range extraction.
3551fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
3552    let len = data.len();
3553    if len == 0 {
3554        return 0;
3555    }
3556
3557    // Quick check: single range from byte 1 to end = no-op
3558    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
3559        return len;
3560    }
3561
3562    // Single range from byte 1: fast truncation path
3563    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
3564        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
3565    }
3566
3567    let mut wp: usize = 0;
3568    let mut rp: usize = 0;
3569
3570    while rp < len {
3571        let line_end = memchr::memchr(line_delim, &data[rp..])
3572            .map(|p| rp + p)
3573            .unwrap_or(len);
3574        let line_len = line_end - rp;
3575
3576        for r in ranges {
3577            let start = r.start.saturating_sub(1);
3578            let end = r.end.min(line_len);
3579            if start >= line_len {
3580                break;
3581            }
3582            let flen = end - start;
3583            if flen > 0 {
3584                data.copy_within(rp + start..rp + start + flen, wp);
3585                wp += flen;
3586            }
3587        }
3588
3589        if line_end < len {
3590            data[wp] = line_delim;
3591            wp += 1;
3592        }
3593
3594        rp = if line_end < len { line_end + 1 } else { len };
3595    }
3596
3597    wp
3598}
3599
3600/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3601fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3602    let len = data.len();
3603
3604    // Quick check: see if all lines fit within max_bytes (common case)
3605    let mut all_fit = true;
3606    let mut start = 0;
3607    for pos in memchr_iter(line_delim, data) {
3608        if pos - start > max_bytes {
3609            all_fit = false;
3610            break;
3611        }
3612        start = pos + 1;
3613    }
3614    if all_fit && start < len && len - start > max_bytes {
3615        all_fit = false;
3616    }
3617    if all_fit {
3618        return len;
3619    }
3620
3621    // Some lines need truncation
3622    let mut wp: usize = 0;
3623    let mut rp: usize = 0;
3624
3625    while rp < len {
3626        let line_end = memchr::memchr(line_delim, &data[rp..])
3627            .map(|p| rp + p)
3628            .unwrap_or(len);
3629        let line_len = line_end - rp;
3630
3631        let take = line_len.min(max_bytes);
3632        if take > 0 && wp != rp {
3633            data.copy_within(rp..rp + take, wp);
3634        }
3635        wp += take;
3636
3637        if line_end < len {
3638            data[wp] = line_delim;
3639            wp += 1;
3640        }
3641
3642        rp = if line_end < len { line_end + 1 } else { len };
3643    }
3644
3645    wp
3646}
3647
/// Selects which unit a cut operation's ranges refer to.
///
/// Fieldless and `Copy`, so it can be passed by value and compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Ranges are byte positions within each line.
    Bytes,
    /// Ranges are character positions within each line.
    // NOTE(review): the in-place range extractor in this file is documented as
    // serving both byte and char selection, so this presumably behaves like
    // `Bytes` — confirm against the dispatch code.
    Characters,
    /// Ranges are positions of delimiter-separated fields within each line.
    Fields,
}