Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, IoSlice, Write};
4
/// Minimum input size, in bytes, at which data is split for parallel processing (512 KiB).
///
/// Inputs shorter than this are processed as a single chunk. The value was
/// lowered so that smaller piped inputs also benefit from parallel chunk
/// processing: at 512 KiB with 2+ threads each thread still receives roughly
/// 256 KiB, which the author judged enough to amortize the rayon overhead
/// (quoted as ~100-200us).
const PARALLEL_THRESHOLD: usize = 512 * 1024;
10
/// Maximum number of `IoSlice` entries passed to a single `write_vectored`
/// (writev) call; 1024 is the Linux default limit on iovec entries.
const MAX_IOV: usize = 1024;
13
/// Configuration for cut operations.
///
/// Borrowed view of the options for one run; the field-processing functions
/// in this file read `delim`, `line_delim`, `ranges`, `complement`,
/// `output_delim` and `suppress_no_delim` from it.
pub struct CutConfig<'a> {
    /// Selection mode.
    /// NOTE(review): `CutMode` is declared elsewhere — presumably bytes / characters / fields; confirm.
    pub mode: CutMode,
    /// Selected positions; the helpers in this file expect them sorted (see `in_ranges`).
    pub ranges: &'a [Range],
    /// When true, the selection is inverted: positions NOT covered by `ranges` are output.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between output fields; may differ from `delim` and may be longer than one byte.
    pub output_delim: &'a [u8],
    /// When true, lines containing no `delim` are dropped instead of being passed through whole.
    pub suppress_no_delim: bool,
    /// Line terminator byte, used both to split input lines and to terminate output lines.
    pub line_delim: u8,
}
24
/// One inclusive, 1-based range from a LIST specification such as `1`, `3-5`, `-3` or `4-`.
#[derive(Debug, Clone)]
pub struct Range {
    /// First selected position (1-based). `parse_ranges` stores 1 for an open
    /// start such as `-3` and rejects 0, so parsed ranges never contain 0 here.
    pub start: usize,
    /// Last selected position (1-based, inclusive); `usize::MAX` means "to end of line".
    pub end: usize,
}
31
32/// Parse a LIST specification like "1,3-5,7-" into ranges.
33/// Each range is 1-based. Returns sorted, merged ranges.
34pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
35    let mut ranges = Vec::new();
36
37    for part in spec.split(',') {
38        let part = part.trim();
39        if part.is_empty() {
40            continue;
41        }
42
43        if let Some(idx) = part.find('-') {
44            let left = &part[..idx];
45            let right = &part[idx + 1..];
46
47            let start = if left.is_empty() {
48                1
49            } else {
50                left.parse::<usize>()
51                    .map_err(|_| format!("invalid range: '{}'", part))?
52            };
53
54            let end = if right.is_empty() {
55                usize::MAX
56            } else {
57                right
58                    .parse::<usize>()
59                    .map_err(|_| format!("invalid range: '{}'", part))?
60            };
61
62            if start == 0 {
63                return Err("fields and positions are numbered from 1".to_string());
64            }
65            if start > end {
66                return Err(format!("invalid decreasing range: '{}'", part));
67            }
68
69            ranges.push(Range { start, end });
70        } else {
71            let n = part
72                .parse::<usize>()
73                .map_err(|_| format!("invalid field: '{}'", part))?;
74            if n == 0 {
75                return Err("fields and positions are numbered from 1".to_string());
76            }
77            ranges.push(Range { start: n, end: n });
78        }
79    }
80
81    if ranges.is_empty() {
82        return Err("you must specify a list of bytes, characters, or fields".to_string());
83    }
84
85    // Sort and merge overlapping ranges
86    ranges.sort_by_key(|r| (r.start, r.end));
87    let mut merged = vec![ranges[0].clone()];
88    for r in &ranges[1..] {
89        let last = merged.last_mut().unwrap();
90        if r.start <= last.end.saturating_add(1) {
91            last.end = last.end.max(r.end);
92        } else {
93            merged.push(r.clone());
94        }
95    }
96
97    Ok(merged)
98}
99
100/// Check if a 1-based position is in any range.
101/// Ranges must be sorted. Uses early exit since ranges are sorted.
102#[inline(always)]
103fn in_ranges(ranges: &[Range], pos: usize) -> bool {
104    for r in ranges {
105        if pos < r.start {
106            return false;
107        }
108        if pos <= r.end {
109            return true;
110        }
111    }
112    false
113}
114
115/// Pre-compute a 64-bit mask for field selection.
116/// Bit i-1 is set if field i should be output.
117#[inline]
118fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
119    let mut mask: u64 = 0;
120    for i in 1..=64u32 {
121        let in_range = in_ranges(ranges, i as usize);
122        if in_range != complement {
123            mask |= 1u64 << (i - 1);
124        }
125    }
126    mask
127}
128
129/// Check if a field should be selected, using bitset for first 64 fields.
130#[inline(always)]
131fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
132    if field_num <= 64 {
133        (mask >> (field_num - 1)) & 1 == 1
134    } else {
135        in_ranges(ranges, field_num) != complement
136    }
137}
138
139// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
140
/// Append a slice to `buf` without growing it.
///
/// Copies `data` directly behind the current contents and bumps the length;
/// no capacity check is performed in release builds.
///
/// # Safety
///
/// The caller must guarantee `buf.capacity() - buf.len() >= data.len()`,
/// typically by calling `buf.reserve(..)` with a proven upper bound before
/// the hot loop. Violating this writes past the allocation.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    // Catch contract violations in debug/test builds before memory is corrupted.
    debug_assert!(
        buf.capacity() - buf.len() >= data.len(),
        "buf_extend: insufficient reserved capacity"
    );
    // SAFETY: the caller guarantees the spare capacity holds `data.len()` bytes;
    // `data` is a shared borrow and `buf` an exclusive one, so they cannot overlap.
    unsafe {
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
151
/// Append a single byte to `buf` without growing it.
///
/// No capacity check is performed in release builds.
///
/// # Safety
///
/// The caller must guarantee `buf.len() < buf.capacity()`, typically by
/// reserving a proven upper bound before the hot loop. Violating this writes
/// past the allocation.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    // Catch contract violations in debug/test builds before memory is corrupted.
    debug_assert!(
        buf.len() < buf.capacity(),
        "buf_push: insufficient reserved capacity"
    );
    // SAFETY: the caller guarantees at least one byte of spare capacity.
    unsafe {
        let len = buf.len();
        buf.as_mut_ptr().add(len).write(b);
        buf.set_len(len + 1);
    }
}
162
163/// Write multiple IoSlice buffers using write_vectored (writev syscall).
164/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
165/// Cold path (partial write) is out-of-line to keep the hot loop tight.
166#[inline]
167fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
168    if slices.is_empty() {
169        return Ok(());
170    }
171    for batch in slices.chunks(MAX_IOV) {
172        let total: usize = batch.iter().map(|s| s.len()).sum();
173        let written = out.write_vectored(batch)?;
174        if written >= total {
175            continue;
176        }
177        if written == 0 {
178            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
179        }
180        write_ioslices_slow(out, batch, written)?;
181    }
182    Ok(())
183}
184
/// Finish a group of slices after a short `write_vectored` (cold path, never inlined).
///
/// `skip` is the number of bytes of `slices` that were already written. Slices
/// lying entirely inside that prefix are passed over; the remainder of the
/// first partially written slice and every following slice go through
/// `write_all`.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for slice in slices {
        let bytes: &[u8] = slice;
        match bytes.get(skip..) {
            // Part (or all) of this slice is still pending.
            Some(rest) if !rest.is_empty() => {
                out.write_all(rest)?;
                skip = 0;
            }
            // Slice fully covered by the already-written prefix.
            _ => skip -= bytes.len(),
        }
    }
    Ok(())
}
204
205// ── Chunk splitting for parallel processing ──────────────────────────────
206
207/// Split data into chunks aligned to line boundaries for parallel processing.
208fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
209    let num_threads = rayon::current_num_threads().max(1);
210    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
211        return vec![data];
212    }
213
214    let chunk_size = data.len() / num_threads;
215    let mut chunks = Vec::with_capacity(num_threads);
216    let mut pos = 0;
217
218    for _ in 0..num_threads - 1 {
219        let target = pos + chunk_size;
220        if target >= data.len() {
221            break;
222        }
223        let boundary = memchr::memchr(line_delim, &data[target..])
224            .map(|p| target + p + 1)
225            .unwrap_or(data.len());
226        if boundary > pos {
227            chunks.push(&data[pos..boundary]);
228        }
229        pos = boundary;
230    }
231
232    if pos < data.len() {
233        chunks.push(&data[pos..]);
234    }
235
236    chunks
237}
238
239// ── Fast path: multi-field non-contiguous extraction ─────────────────────
240
241/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
242/// Pre-collects delimiter positions per line into a stack-allocated array,
243/// then directly indexes into them for each selected field.
244/// This is O(max_field) per line instead of O(num_fields * scan_length).
245fn process_fields_multi_select(
246    data: &[u8],
247    delim: u8,
248    line_delim: u8,
249    ranges: &[Range],
250    suppress: bool,
251    out: &mut impl Write,
252) -> io::Result<()> {
253    let max_field = ranges.last().map_or(0, |r| r.end);
254
255    if data.len() >= PARALLEL_THRESHOLD {
256        let chunks = split_into_chunks(data, line_delim);
257        let results: Vec<Vec<u8>> = chunks
258            .par_iter()
259            .map(|chunk| {
260                // Output is always <= input for field selection; use 3/4 as safe estimate
261                let mut buf = Vec::with_capacity(chunk.len() * 3 / 4);
262                multi_select_chunk(
263                    chunk, delim, line_delim, ranges, max_field, suppress, &mut buf,
264                );
265                buf
266            })
267            .collect();
268        let slices: Vec<IoSlice> = results
269            .iter()
270            .filter(|r| !r.is_empty())
271            .map(|r| IoSlice::new(r))
272            .collect();
273        write_ioslices(out, &slices)?;
274    } else {
275        let mut buf = Vec::with_capacity(data.len() * 3 / 4);
276        multi_select_chunk(
277            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
278        );
279        if !buf.is_empty() {
280            out.write_all(&buf)?;
281        }
282    }
283    Ok(())
284}
285
286/// Process a chunk for multi-field extraction using a single-pass memchr2 scan.
287/// Scans for both delimiter and line_delim in one SIMD pass over the entire chunk,
288/// eliminating per-line memchr_iter setup overhead (significant for short lines).
289/// Delimiter positions are collected in a stack array per line.
290/// When max_field is reached on a line, remaining delimiters are ignored.
291fn multi_select_chunk(
292    data: &[u8],
293    delim: u8,
294    line_delim: u8,
295    ranges: &[Range],
296    max_field: usize,
297    suppress: bool,
298    buf: &mut Vec<u8>,
299) {
300    // When delim == line_delim, fall back to two-level approach
301    if delim == line_delim {
302        buf.reserve(data.len());
303        let base = data.as_ptr();
304        let mut start = 0;
305        for end_pos in memchr_iter(line_delim, data) {
306            let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
307            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
308            start = end_pos + 1;
309        }
310        if start < data.len() {
311            let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
312            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
313        }
314        return;
315    }
316
317    buf.reserve(data.len());
318    let base = data.as_ptr();
319    let data_len = data.len();
320
321    // Per-line state
322    let mut line_start: usize = 0;
323    let mut delim_pos = [0usize; 64];
324    let mut num_delims: usize = 0;
325    let max_delims = max_field.min(64);
326    let mut at_max = false;
327
328    // Single-pass scan using memchr2 for both delimiter and newline
329    for pos in memchr::memchr2_iter(delim, line_delim, data) {
330        let byte = unsafe { *base.add(pos) };
331
332        if byte == line_delim {
333            // End of line: extract fields from collected positions
334            let line_len = pos - line_start;
335            if num_delims == 0 {
336                // No delimiter in line
337                if !suppress {
338                    unsafe {
339                        buf_extend(
340                            buf,
341                            std::slice::from_raw_parts(base.add(line_start), line_len),
342                        );
343                        buf_push(buf, line_delim);
344                    }
345                }
346            } else {
347                // Extract fields using collected delimiter positions
348                let total_fields = num_delims + 1;
349                let mut first_output = true;
350
351                for r in ranges {
352                    let range_start = r.start;
353                    let range_end = r.end.min(total_fields);
354                    if range_start > total_fields {
355                        break;
356                    }
357                    for field_num in range_start..=range_end {
358                        if field_num > total_fields {
359                            break;
360                        }
361
362                        let field_start = if field_num == 1 {
363                            line_start
364                        } else if field_num - 2 < num_delims {
365                            delim_pos[field_num - 2] + 1
366                        } else {
367                            continue;
368                        };
369                        let field_end = if field_num <= num_delims {
370                            delim_pos[field_num - 1]
371                        } else {
372                            pos
373                        };
374
375                        if !first_output {
376                            unsafe { buf_push(buf, delim) };
377                        }
378                        unsafe {
379                            buf_extend(
380                                buf,
381                                std::slice::from_raw_parts(
382                                    base.add(field_start),
383                                    field_end - field_start,
384                                ),
385                            );
386                        }
387                        first_output = false;
388                    }
389                }
390
391                unsafe { buf_push(buf, line_delim) };
392            }
393
394            // Reset for next line
395            line_start = pos + 1;
396            num_delims = 0;
397            at_max = false;
398        } else {
399            // Delimiter found: collect position (up to max_field)
400            if !at_max && num_delims < max_delims {
401                delim_pos[num_delims] = pos;
402                num_delims += 1;
403                if num_delims >= max_delims {
404                    at_max = true;
405                }
406            }
407        }
408    }
409
410    // Handle last line without trailing line_delim
411    if line_start < data_len {
412        if num_delims == 0 {
413            if !suppress {
414                unsafe {
415                    buf_extend(
416                        buf,
417                        std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
418                    );
419                    buf_push(buf, line_delim);
420                }
421            }
422        } else {
423            let total_fields = num_delims + 1;
424            let mut first_output = true;
425
426            for r in ranges {
427                let range_start = r.start;
428                let range_end = r.end.min(total_fields);
429                if range_start > total_fields {
430                    break;
431                }
432                for field_num in range_start..=range_end {
433                    if field_num > total_fields {
434                        break;
435                    }
436
437                    let field_start = if field_num == 1 {
438                        line_start
439                    } else if field_num - 2 < num_delims {
440                        delim_pos[field_num - 2] + 1
441                    } else {
442                        continue;
443                    };
444                    let field_end = if field_num <= num_delims {
445                        delim_pos[field_num - 1]
446                    } else {
447                        data_len
448                    };
449
450                    if !first_output {
451                        unsafe { buf_push(buf, delim) };
452                    }
453                    unsafe {
454                        buf_extend(
455                            buf,
456                            std::slice::from_raw_parts(
457                                base.add(field_start),
458                                field_end - field_start,
459                            ),
460                        );
461                    }
462                    first_output = false;
463                }
464            }
465
466            unsafe { buf_push(buf, line_delim) };
467        }
468    }
469}
470
471/// Extract selected fields from a single line using delimiter position scanning.
472/// Scans delimiters only up to max_field (early exit), then extracts selected fields
473/// by indexing directly into the collected positions. Since ranges are pre-sorted and
474/// non-overlapping, every field within a range is selected — no is_selected check needed.
475#[inline(always)]
476fn multi_select_line(
477    line: &[u8],
478    delim: u8,
479    line_delim: u8,
480    ranges: &[Range],
481    max_field: usize,
482    suppress: bool,
483    buf: &mut Vec<u8>,
484) {
485    let len = line.len();
486    if len == 0 {
487        if !suppress {
488            unsafe { buf_push(buf, line_delim) };
489        }
490        return;
491    }
492
493    // Note: no per-line buf.reserve — multi_select_chunk already reserves data.len()
494    let base = line.as_ptr();
495
496    // Collect delimiter positions up to max_field (early exit).
497    // Stack array for up to 64 delimiter positions.
498    let mut delim_pos = [0usize; 64];
499    let mut num_delims: usize = 0;
500    let max_delims = max_field.min(64);
501
502    for pos in memchr_iter(delim, line) {
503        if num_delims < max_delims {
504            delim_pos[num_delims] = pos;
505            num_delims += 1;
506            if num_delims >= max_delims {
507                break;
508            }
509        }
510    }
511
512    if num_delims == 0 {
513        if !suppress {
514            unsafe {
515                buf_extend(buf, line);
516                buf_push(buf, line_delim);
517            }
518        }
519        return;
520    }
521
522    // Extract selected fields using delimiter positions.
523    // Ranges are pre-sorted and non-overlapping, so every field_num within a range
524    // is selected — skip the is_selected check entirely (saves 1 function call per field).
525    let total_fields = num_delims + 1;
526    let mut first_output = true;
527
528    for r in ranges {
529        let range_start = r.start;
530        let range_end = r.end.min(total_fields);
531        if range_start > total_fields {
532            break;
533        }
534        for field_num in range_start..=range_end {
535            if field_num > total_fields {
536                break;
537            }
538
539            let field_start = if field_num == 1 {
540                0
541            } else if field_num - 2 < num_delims {
542                delim_pos[field_num - 2] + 1
543            } else {
544                continue;
545            };
546            let field_end = if field_num <= num_delims {
547                delim_pos[field_num - 1]
548            } else {
549                len
550            };
551
552            if !first_output {
553                unsafe { buf_push(buf, delim) };
554            }
555            unsafe {
556                buf_extend(
557                    buf,
558                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
559                );
560            }
561            first_output = false;
562        }
563    }
564
565    unsafe { buf_push(buf, line_delim) };
566}
567
568// ── Fast path: field extraction with batched output ──────────────────────
569
570/// Optimized field extraction with early exit and batched output.
571fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
572    let delim = cfg.delim;
573    let line_delim = cfg.line_delim;
574    let ranges = cfg.ranges;
575    let complement = cfg.complement;
576    let output_delim = cfg.output_delim;
577    let suppress = cfg.suppress_no_delim;
578
579    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
580    // That scan was O(N) over the entire file just to check an edge case
581    // (no delimiter in any line). The per-line processing already handles
582    // lines without delimiters correctly, so the scan was pure overhead
583    // for files that DO contain delimiters (the common case).
584
585    // Ultra-fast path: single field extraction (e.g., cut -f5)
586    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
587        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
588    }
589
590    // Fast path: complement of single field or contiguous range with default output delimiter.
591    if complement
592        && ranges.len() == 1
593        && output_delim.len() == 1
594        && output_delim[0] == delim
595        && ranges[0].start == ranges[0].end
596    {
597        return process_complement_single_field(
598            data,
599            delim,
600            line_delim,
601            ranges[0].start,
602            suppress,
603            out,
604        );
605    }
606
607    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
608    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
609    if complement
610        && ranges.len() == 1
611        && ranges[0].start > 1
612        && ranges[0].end < usize::MAX
613        && output_delim.len() == 1
614        && output_delim[0] == delim
615    {
616        return process_complement_range(
617            data,
618            delim,
619            line_delim,
620            ranges[0].start,
621            ranges[0].end,
622            suppress,
623            out,
624        );
625    }
626
627    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
628    if !complement
629        && ranges.len() == 1
630        && ranges[0].start == 1
631        && output_delim.len() == 1
632        && output_delim[0] == delim
633        && ranges[0].end < usize::MAX
634    {
635        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
636    }
637
638    // Fast path: open-ended field range from field N (e.g., cut -f3-)
639    if !complement
640        && ranges.len() == 1
641        && ranges[0].end == usize::MAX
642        && ranges[0].start > 1
643        && output_delim.len() == 1
644        && output_delim[0] == delim
645    {
646        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
647    }
648
649    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
650    if !complement
651        && ranges.len() == 1
652        && ranges[0].start > 1
653        && ranges[0].end < usize::MAX
654        && output_delim.len() == 1
655        && output_delim[0] == delim
656    {
657        return process_fields_mid_range(
658            data,
659            delim,
660            line_delim,
661            ranges[0].start,
662            ranges[0].end,
663            suppress,
664            out,
665        );
666    }
667
668    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
669    // Uses delimiter position caching: find all delimiter positions per line,
670    // then directly index into them for each selected field.
671    // This is faster than the general extract_fields_to_buf which re-checks
672    // is_selected() for every field encountered.
673    if !complement
674        && ranges.len() > 1
675        && ranges.last().map_or(false, |r| r.end < usize::MAX)
676        && output_delim.len() == 1
677        && output_delim[0] == delim
678        && delim != line_delim
679    {
680        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
681    }
682
683    // General field extraction
684    let max_field = if complement {
685        usize::MAX
686    } else {
687        ranges.last().map(|r| r.end).unwrap_or(0)
688    };
689    let field_mask = compute_field_mask(ranges, complement);
690
691    if data.len() >= PARALLEL_THRESHOLD {
692        let chunks = split_into_chunks(data, line_delim);
693        let results: Vec<Vec<u8>> = chunks
694            .par_iter()
695            .map(|chunk| {
696                let mut buf = Vec::with_capacity(chunk.len());
697                process_fields_chunk(
698                    chunk,
699                    delim,
700                    ranges,
701                    output_delim,
702                    suppress,
703                    max_field,
704                    field_mask,
705                    line_delim,
706                    complement,
707                    &mut buf,
708                );
709                buf
710            })
711            .collect();
712        // Use write_vectored (writev) to batch N writes into fewer syscalls
713        let slices: Vec<IoSlice> = results
714            .iter()
715            .filter(|r| !r.is_empty())
716            .map(|r| IoSlice::new(r))
717            .collect();
718        write_ioslices(out, &slices)?;
719    } else {
720        let mut buf = Vec::with_capacity(data.len());
721        process_fields_chunk(
722            data,
723            delim,
724            ranges,
725            output_delim,
726            suppress,
727            max_field,
728            field_mask,
729            line_delim,
730            complement,
731            &mut buf,
732        );
733        if !buf.is_empty() {
734            out.write_all(&buf)?;
735        }
736    }
737    Ok(())
738}
739
740/// Process a chunk of data for general field extraction.
741/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
742/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
743/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
744fn process_fields_chunk(
745    data: &[u8],
746    delim: u8,
747    ranges: &[Range],
748    output_delim: &[u8],
749    suppress: bool,
750    max_field: usize,
751    field_mask: u64,
752    line_delim: u8,
753    complement: bool,
754    buf: &mut Vec<u8>,
755) {
756    // When delim != line_delim and max_field is bounded, use two-level approach:
757    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
758    // This avoids scanning past max_field on each line (significant for lines with
759    // many columns but small field selection like -f1,3,5 on 20-column CSV).
760    // For complement or unbounded ranges, use single-pass memchr2_iter which
761    // needs to process all delimiters anyway.
762    if delim != line_delim && max_field < usize::MAX && !complement {
763        buf.reserve(data.len());
764        let mut start = 0;
765        for end_pos in memchr_iter(line_delim, data) {
766            let line = &data[start..end_pos];
767            extract_fields_to_buf(
768                line,
769                delim,
770                ranges,
771                output_delim,
772                suppress,
773                max_field,
774                field_mask,
775                line_delim,
776                buf,
777                complement,
778            );
779            start = end_pos + 1;
780        }
781        if start < data.len() {
782            extract_fields_to_buf(
783                &data[start..],
784                delim,
785                ranges,
786                output_delim,
787                suppress,
788                max_field,
789                field_mask,
790                line_delim,
791                buf,
792                complement,
793            );
794        }
795        return;
796    }
797
798    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
799    // delimiter and line_delim in one SIMD scan.
800    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
801    if delim != line_delim {
802        buf.reserve(data.len());
803
804        let data_len = data.len();
805        let base = data.as_ptr();
806        let mut line_start: usize = 0;
807        let mut field_start: usize = 0;
808        let mut field_num: usize = 1;
809        let mut first_output = true;
810        let mut has_delim = false;
811
812        for pos in memchr::memchr2_iter(delim, line_delim, data) {
813            let byte = unsafe { *base.add(pos) };
814
815            if byte == line_delim {
816                // End of line: flush final field and emit line delimiter
817                if (field_num <= max_field || complement)
818                    && has_delim
819                    && is_selected(field_num, field_mask, ranges, complement)
820                {
821                    if !first_output {
822                        unsafe { buf_extend(buf, output_delim) };
823                    }
824                    unsafe {
825                        buf_extend(
826                            buf,
827                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
828                        )
829                    };
830                    first_output = false;
831                }
832
833                if !first_output {
834                    unsafe { buf_push(buf, line_delim) };
835                } else if !has_delim {
836                    if !suppress {
837                        unsafe {
838                            buf_extend(
839                                buf,
840                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
841                            );
842                            buf_push(buf, line_delim);
843                        }
844                    }
845                } else {
846                    unsafe { buf_push(buf, line_delim) };
847                }
848
849                // Reset state for next line
850                line_start = pos + 1;
851                field_start = pos + 1;
852                field_num = 1;
853                first_output = true;
854                has_delim = false;
855            } else {
856                // Field delimiter hit
857                has_delim = true;
858
859                if is_selected(field_num, field_mask, ranges, complement) {
860                    if !first_output {
861                        unsafe { buf_extend(buf, output_delim) };
862                    }
863                    unsafe {
864                        buf_extend(
865                            buf,
866                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
867                        )
868                    };
869                    first_output = false;
870                }
871
872                field_num += 1;
873                field_start = pos + 1;
874            }
875        }
876
877        // Handle last line without trailing line_delim
878        if line_start < data_len {
879            if line_start < data_len {
880                if (field_num <= max_field || complement)
881                    && has_delim
882                    && is_selected(field_num, field_mask, ranges, complement)
883                {
884                    if !first_output {
885                        unsafe { buf_extend(buf, output_delim) };
886                    }
887                    unsafe {
888                        buf_extend(
889                            buf,
890                            std::slice::from_raw_parts(
891                                base.add(field_start),
892                                data_len - field_start,
893                            ),
894                        )
895                    };
896                    first_output = false;
897                }
898
899                if !first_output {
900                    unsafe { buf_push(buf, line_delim) };
901                } else if !has_delim {
902                    if !suppress {
903                        unsafe {
904                            buf_extend(
905                                buf,
906                                std::slice::from_raw_parts(
907                                    base.add(line_start),
908                                    data_len - line_start,
909                                ),
910                            );
911                            buf_push(buf, line_delim);
912                        }
913                    }
914                } else {
915                    unsafe { buf_push(buf, line_delim) };
916                }
917            }
918        }
919
920        return;
921    }
922
923    // Fallback: when delim == line_delim, use the two-level scan approach
924    let mut start = 0;
925    for end_pos in memchr_iter(line_delim, data) {
926        let line = &data[start..end_pos];
927        extract_fields_to_buf(
928            line,
929            delim,
930            ranges,
931            output_delim,
932            suppress,
933            max_field,
934            field_mask,
935            line_delim,
936            buf,
937            complement,
938        );
939        start = end_pos + 1;
940    }
941    if start < data.len() {
942        extract_fields_to_buf(
943            &data[start..],
944            delim,
945            ranges,
946            output_delim,
947            suppress,
948            max_field,
949            field_mask,
950            line_delim,
951            buf,
952            complement,
953        );
954    }
955}
956
957// ── Ultra-fast single field extraction ───────────────────────────────────
958
959/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
960/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
961/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
962fn process_single_field(
963    data: &[u8],
964    delim: u8,
965    line_delim: u8,
966    target: usize,
967    suppress: bool,
968    out: &mut impl Write,
969) -> io::Result<()> {
970    let target_idx = target - 1;
971
972    if delim != line_delim {
973        // Field 1 fast path: memchr2 single-pass + parallel for large data.
974        // memchr2(delim, newline) finds the first special byte per line in one scan.
975        // For field 1, the first delimiter IS the field boundary. Lines without
976        // delimiter are passed through unchanged. This scans ~N total bytes vs
977        // ~1.5N for the two-level (outer newline + inner delimiter) approach.
978        if target_idx == 0 && !suppress {
979            if data.len() >= PARALLEL_THRESHOLD {
980                return single_field1_parallel(data, delim, line_delim, out);
981            }
982            return single_field1_zerocopy(data, delim, line_delim, out);
983        }
984
985        // Two-level approach for field N: outer newline scan + inner delim scan
986        // with early exit at target_idx. Faster than memchr2 single-pass because
987        // we only scan delimiters up to target_idx per line (not all of them).
988        if data.len() >= PARALLEL_THRESHOLD {
989            let chunks = split_into_chunks(data, line_delim);
990            let results: Vec<Vec<u8>> = chunks
991                .par_iter()
992                .map(|chunk| {
993                    let mut buf = Vec::with_capacity(chunk.len() / 2);
994                    process_single_field_chunk(
995                        chunk, delim, target_idx, line_delim, suppress, &mut buf,
996                    );
997                    buf
998                })
999                .collect();
1000            let slices: Vec<IoSlice> = results
1001                .iter()
1002                .filter(|r| !r.is_empty())
1003                .map(|r| IoSlice::new(r))
1004                .collect();
1005            write_ioslices(out, &slices)?;
1006        } else {
1007            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
1008            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1009            if !buf.is_empty() {
1010                out.write_all(&buf)?;
1011            }
1012        }
1013        return Ok(());
1014    }
1015
1016    // Fallback for delim == line_delim: nested loop approach
1017    if data.len() >= PARALLEL_THRESHOLD {
1018        let chunks = split_into_chunks(data, line_delim);
1019        let results: Vec<Vec<u8>> = chunks
1020            .par_iter()
1021            .map(|chunk| {
1022                let mut buf = Vec::with_capacity(chunk.len() / 4);
1023                process_single_field_chunk(
1024                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
1025                );
1026                buf
1027            })
1028            .collect();
1029        // Use write_vectored (writev) to batch N writes into fewer syscalls
1030        let slices: Vec<IoSlice> = results
1031            .iter()
1032            .filter(|r| !r.is_empty())
1033            .map(|r| IoSlice::new(r))
1034            .collect();
1035        write_ioslices(out, &slices)?;
1036    } else {
1037        let mut buf = Vec::with_capacity(data.len() / 4);
1038        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1039        if !buf.is_empty() {
1040            out.write_all(&buf)?;
1041        }
1042    }
1043    Ok(())
1044}
1045
1046/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1047/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1048fn process_complement_range(
1049    data: &[u8],
1050    delim: u8,
1051    line_delim: u8,
1052    skip_start: usize,
1053    skip_end: usize,
1054    suppress: bool,
1055    out: &mut impl Write,
1056) -> io::Result<()> {
1057    if data.len() >= PARALLEL_THRESHOLD {
1058        let chunks = split_into_chunks(data, line_delim);
1059        let results: Vec<Vec<u8>> = chunks
1060            .par_iter()
1061            .map(|chunk| {
1062                let mut buf = Vec::with_capacity(chunk.len());
1063                complement_range_chunk(
1064                    chunk, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1065                );
1066                buf
1067            })
1068            .collect();
1069        let slices: Vec<IoSlice> = results
1070            .iter()
1071            .filter(|r| !r.is_empty())
1072            .map(|r| IoSlice::new(r))
1073            .collect();
1074        write_ioslices(out, &slices)?;
1075    } else {
1076        let mut buf = Vec::with_capacity(data.len());
1077        complement_range_chunk(
1078            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1079        );
1080        if !buf.is_empty() {
1081            out.write_all(&buf)?;
1082        }
1083    }
1084    Ok(())
1085}
1086
1087/// Process a chunk for complement range extraction.
1088fn complement_range_chunk(
1089    data: &[u8],
1090    delim: u8,
1091    skip_start: usize,
1092    skip_end: usize,
1093    line_delim: u8,
1094    suppress: bool,
1095    buf: &mut Vec<u8>,
1096) {
1097    let mut start = 0;
1098    for end_pos in memchr_iter(line_delim, data) {
1099        let line = &data[start..end_pos];
1100        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1101        start = end_pos + 1;
1102    }
1103    if start < data.len() {
1104        complement_range_line(
1105            &data[start..],
1106            delim,
1107            skip_start,
1108            skip_end,
1109            line_delim,
1110            suppress,
1111            buf,
1112        );
1113    }
1114}
1115
1116/// Extract all fields except skip_start..=skip_end from one line.
1117/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1118///
1119/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1120/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1121/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1122#[inline(always)]
1123fn complement_range_line(
1124    line: &[u8],
1125    delim: u8,
1126    skip_start: usize,
1127    skip_end: usize,
1128    line_delim: u8,
1129    suppress: bool,
1130    buf: &mut Vec<u8>,
1131) {
1132    let len = line.len();
1133    if len == 0 {
1134        if !suppress {
1135            buf.push(line_delim);
1136        }
1137        return;
1138    }
1139
1140    buf.reserve(len + 1);
1141    let base = line.as_ptr();
1142
1143    // 1-based field numbers. To skip fields skip_start..=skip_end:
1144    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1145    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1146    //
1147    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1148    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1149
1150    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1151    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1152    let total_need = need_prefix_delims + need_skip_delims;
1153
1154    // Find delimiter positions up to total_need
1155    let mut delim_count: usize = 0;
1156    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1157    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1158
1159    for pos in memchr_iter(delim, line) {
1160        delim_count += 1;
1161        if delim_count == need_prefix_delims {
1162            prefix_end_pos = pos;
1163        }
1164        if delim_count == total_need {
1165            suffix_start_pos = pos + 1;
1166            break;
1167        }
1168    }
1169
1170    if delim_count == 0 {
1171        // No delimiter at all
1172        if !suppress {
1173            unsafe {
1174                buf_extend(buf, line);
1175                buf_push(buf, line_delim);
1176            }
1177        }
1178        return;
1179    }
1180
1181    // Case analysis:
1182    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1183    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1184    // 3. Enough to reach skip_end: prefix + delim + suffix
1185
1186    if delim_count < need_prefix_delims {
1187        // Not enough fields to reach skip region — output entire line
1188        unsafe {
1189            buf_extend(buf, line);
1190            buf_push(buf, line_delim);
1191        }
1192        return;
1193    }
1194
1195    let has_prefix = need_prefix_delims > 0;
1196    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1197
1198    if has_prefix && has_suffix {
1199        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1200        unsafe {
1201            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1202            buf_push(buf, delim);
1203            buf_extend(
1204                buf,
1205                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1206            );
1207            buf_push(buf, line_delim);
1208        }
1209    } else if has_prefix {
1210        // Only prefix, no suffix (skip region extends to end of line)
1211        unsafe {
1212            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1213            buf_push(buf, line_delim);
1214        }
1215    } else if has_suffix {
1216        // No prefix (skip_start == 1), only suffix
1217        unsafe {
1218            buf_extend(
1219                buf,
1220                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1221            );
1222            buf_push(buf, line_delim);
1223        }
1224    } else {
1225        // All fields skipped
1226        unsafe { buf_push(buf, line_delim) };
1227    }
1228}
1229
1230/// Complement single-field extraction: skip one field, output rest unchanged.
1231fn process_complement_single_field(
1232    data: &[u8],
1233    delim: u8,
1234    line_delim: u8,
1235    skip_field: usize,
1236    suppress: bool,
1237    out: &mut impl Write,
1238) -> io::Result<()> {
1239    let skip_idx = skip_field - 1;
1240
1241    if data.len() >= PARALLEL_THRESHOLD {
1242        let chunks = split_into_chunks(data, line_delim);
1243        let results: Vec<Vec<u8>> = chunks
1244            .par_iter()
1245            .map(|chunk| {
1246                let mut buf = Vec::with_capacity(chunk.len());
1247                complement_single_field_chunk(
1248                    chunk, delim, skip_idx, line_delim, suppress, &mut buf,
1249                );
1250                buf
1251            })
1252            .collect();
1253        // Use write_vectored (writev) to batch N writes into fewer syscalls
1254        let slices: Vec<IoSlice> = results
1255            .iter()
1256            .filter(|r| !r.is_empty())
1257            .map(|r| IoSlice::new(r))
1258            .collect();
1259        write_ioslices(out, &slices)?;
1260    } else {
1261        let mut buf = Vec::with_capacity(data.len());
1262        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1263        if !buf.is_empty() {
1264            out.write_all(&buf)?;
1265        }
1266    }
1267    Ok(())
1268}
1269
1270/// Process a chunk for complement single-field extraction.
1271fn complement_single_field_chunk(
1272    data: &[u8],
1273    delim: u8,
1274    skip_idx: usize,
1275    line_delim: u8,
1276    suppress: bool,
1277    buf: &mut Vec<u8>,
1278) {
1279    let mut start = 0;
1280    for end_pos in memchr_iter(line_delim, data) {
1281        let line = &data[start..end_pos];
1282        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1283        start = end_pos + 1;
1284    }
1285    if start < data.len() {
1286        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
1287    }
1288}
1289
1290/// Extract all fields except skip_idx from one line.
1291/// Optimized: finds only the delimiters bounding the skip field (skip_idx-th
1292/// and (skip_idx+1)-th delimiters), then copies prefix + suffix in 2 bulk
1293/// copies instead of iterating through all fields.
1294#[inline(always)]
1295fn complement_single_field_line(
1296    line: &[u8],
1297    delim: u8,
1298    skip_idx: usize,
1299    line_delim: u8,
1300    suppress: bool,
1301    buf: &mut Vec<u8>,
1302) {
1303    let len = line.len();
1304    if len == 0 {
1305        if !suppress {
1306            buf.push(line_delim);
1307        }
1308        return;
1309    }
1310
1311    buf.reserve(len + 1);
1312    let base = line.as_ptr();
1313
1314    // Find the delimiters bounding the skip field:
1315    // - We need skip_idx delimiters to find where the skip field starts
1316    // - We need one more delimiter to find where it ends
1317    // For skip_idx == 0 (skip field 1): skip field starts at 0, ends at first delimiter
1318    // For skip_idx == 1 (skip field 2): skip field starts after 1st delim, ends at 2nd delim
1319    let need_before = skip_idx; // delimiters before skip field
1320    let need_total = skip_idx + 1; // delimiters to find end of skip field
1321
1322    let mut delim_count: usize = 0;
1323    let mut skip_start_pos: usize = 0; // byte start of skip field
1324    let mut skip_end_pos: usize = len; // byte position of delimiter after skip field (or EOL)
1325    let mut found_end = false;
1326
1327    for pos in memchr_iter(delim, line) {
1328        delim_count += 1;
1329        if delim_count == need_before {
1330            skip_start_pos = pos + 1;
1331        }
1332        if delim_count == need_total {
1333            skip_end_pos = pos;
1334            found_end = true;
1335            break;
1336        }
1337    }
1338
1339    if delim_count == 0 {
1340        // No delimiter in line
1341        if !suppress {
1342            unsafe {
1343                buf_extend(buf, line);
1344                buf_push(buf, line_delim);
1345            }
1346        }
1347        return;
1348    }
1349
1350    // Not enough delimiters to reach the skip field: output entire line
1351    if delim_count < need_before {
1352        unsafe {
1353            buf_extend(buf, line);
1354            buf_push(buf, line_delim);
1355        }
1356        return;
1357    }
1358
1359    // skip field is at positions skip_start_pos..skip_end_pos
1360    // Output prefix (before skip field) + suffix (after skip field)
1361    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
1362    let has_suffix = found_end && skip_end_pos < len;
1363
1364    if has_prefix && has_suffix {
1365        // prefix = line[0..skip_start_pos-1] (before the delimiter that starts skip field)
1366        // suffix = line[skip_end_pos+1..] (after the delimiter that ends skip field)
1367        unsafe {
1368            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1369            buf_push(buf, delim);
1370            buf_extend(
1371                buf,
1372                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1373            );
1374            buf_push(buf, line_delim);
1375        }
1376    } else if has_prefix {
1377        // Only prefix (skip field is the last field)
1378        unsafe {
1379            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1380            buf_push(buf, line_delim);
1381        }
1382    } else if has_suffix {
1383        // No prefix (skip field is the first field)
1384        unsafe {
1385            buf_extend(
1386                buf,
1387                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1388            );
1389            buf_push(buf, line_delim);
1390        }
1391    } else {
1392        // Skip field is the only field (or entire line)
1393        unsafe { buf_push(buf, line_delim) };
1394    }
1395}
1396
1397/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1398/// Zero-copy for the non-parallel path: identifies the truncation point per line
1399/// and writes contiguous runs directly from the source data.
1400fn process_fields_prefix(
1401    data: &[u8],
1402    delim: u8,
1403    line_delim: u8,
1404    last_field: usize,
1405    suppress: bool,
1406    out: &mut impl Write,
1407) -> io::Result<()> {
1408    if data.len() >= PARALLEL_THRESHOLD {
1409        let chunks = split_into_chunks(data, line_delim);
1410        let results: Vec<Vec<u8>> = chunks
1411            .par_iter()
1412            .map(|chunk| {
1413                let mut buf = Vec::with_capacity(chunk.len());
1414                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
1415                buf
1416            })
1417            .collect();
1418        // Use write_vectored (writev) to batch N writes into fewer syscalls
1419        let slices: Vec<IoSlice> = results
1420            .iter()
1421            .filter(|r| !r.is_empty())
1422            .map(|r| IoSlice::new(r))
1423            .collect();
1424        write_ioslices(out, &slices)?;
1425    } else if !suppress {
1426        // Zero-copy fast path: scan for truncation points, write runs from source.
1427        // When suppress is false, every line is output (with or without delimiter).
1428        // Most lines have enough fields, so the output is often identical to input.
1429        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1430    } else {
1431        let mut buf = Vec::with_capacity(data.len());
1432        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1433        if !buf.is_empty() {
1434            out.write_all(&buf)?;
1435        }
1436    }
1437    Ok(())
1438}
1439
1440/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1441/// directly into the source data, flushing in MAX_IOV-sized batches.
1442/// For lines where the Nth delimiter exists, we truncate at that point.
1443/// For lines with fewer fields, we output them unchanged (contiguous run).
1444/// Lines without any delimiter are output unchanged (suppress=false assumed).
1445#[inline]
1446fn fields_prefix_zerocopy(
1447    data: &[u8],
1448    delim: u8,
1449    line_delim: u8,
1450    last_field: usize,
1451    out: &mut impl Write,
1452) -> io::Result<()> {
1453    let newline_buf: [u8; 1] = [line_delim];
1454    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1455    let mut start = 0;
1456    let mut run_start: usize = 0;
1457
1458    for end_pos in memchr_iter(line_delim, data) {
1459        let line = &data[start..end_pos];
1460        let mut field_count = 1;
1461        let mut truncate_at: Option<usize> = None;
1462        for dpos in memchr_iter(delim, line) {
1463            if field_count >= last_field {
1464                truncate_at = Some(start + dpos);
1465                break;
1466            }
1467            field_count += 1;
1468        }
1469
1470        if let Some(trunc_pos) = truncate_at {
1471            if run_start < start {
1472                iov.push(IoSlice::new(&data[run_start..start]));
1473            }
1474            iov.push(IoSlice::new(&data[start..trunc_pos]));
1475            iov.push(IoSlice::new(&newline_buf));
1476            run_start = end_pos + 1;
1477
1478            if iov.len() >= MAX_IOV - 2 {
1479                write_ioslices(out, &iov)?;
1480                iov.clear();
1481            }
1482        }
1483        start = end_pos + 1;
1484    }
1485    // Handle last line without terminator
1486    if start < data.len() {
1487        let line = &data[start..];
1488        let mut field_count = 1;
1489        let mut truncate_at: Option<usize> = None;
1490        for dpos in memchr_iter(delim, line) {
1491            if field_count >= last_field {
1492                truncate_at = Some(start + dpos);
1493                break;
1494            }
1495            field_count += 1;
1496        }
1497        if let Some(trunc_pos) = truncate_at {
1498            if run_start < start {
1499                iov.push(IoSlice::new(&data[run_start..start]));
1500            }
1501            iov.push(IoSlice::new(&data[start..trunc_pos]));
1502            iov.push(IoSlice::new(&newline_buf));
1503            if !iov.is_empty() {
1504                write_ioslices(out, &iov)?;
1505            }
1506            return Ok(());
1507        }
1508    }
1509    // Flush remaining contiguous run
1510    if run_start < data.len() {
1511        iov.push(IoSlice::new(&data[run_start..]));
1512        if !data.is_empty() && *data.last().unwrap() != line_delim {
1513            iov.push(IoSlice::new(&newline_buf));
1514        }
1515    }
1516    if !iov.is_empty() {
1517        write_ioslices(out, &iov)?;
1518    }
1519    Ok(())
1520}
1521
1522/// Process a chunk for contiguous from-start field range extraction.
1523fn fields_prefix_chunk(
1524    data: &[u8],
1525    delim: u8,
1526    line_delim: u8,
1527    last_field: usize,
1528    suppress: bool,
1529    buf: &mut Vec<u8>,
1530) {
1531    let mut start = 0;
1532    for end_pos in memchr_iter(line_delim, data) {
1533        let line = &data[start..end_pos];
1534        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1535        start = end_pos + 1;
1536    }
1537    if start < data.len() {
1538        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1539    }
1540}
1541
1542/// Extract first N fields from one line (contiguous from-start range).
1543/// Uses memchr SIMD for delimiter scanning on all line sizes.
1544#[inline(always)]
1545fn fields_prefix_line(
1546    line: &[u8],
1547    delim: u8,
1548    line_delim: u8,
1549    last_field: usize,
1550    suppress: bool,
1551    buf: &mut Vec<u8>,
1552) {
1553    let len = line.len();
1554    if len == 0 {
1555        if !suppress {
1556            buf.push(line_delim);
1557        }
1558        return;
1559    }
1560
1561    buf.reserve(len + 1);
1562    let base = line.as_ptr();
1563
1564    let mut field_count = 1usize;
1565    let mut has_delim = false;
1566
1567    for pos in memchr_iter(delim, line) {
1568        has_delim = true;
1569        if field_count >= last_field {
1570            unsafe {
1571                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1572                buf_push(buf, line_delim);
1573            }
1574            return;
1575        }
1576        field_count += 1;
1577    }
1578
1579    if !has_delim {
1580        if !suppress {
1581            unsafe {
1582                buf_extend(buf, line);
1583                buf_push(buf, line_delim);
1584            }
1585        }
1586        return;
1587    }
1588
1589    unsafe {
1590        buf_extend(buf, line);
1591        buf_push(buf, line_delim);
1592    }
1593}
1594
1595/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1596fn process_fields_suffix(
1597    data: &[u8],
1598    delim: u8,
1599    line_delim: u8,
1600    start_field: usize,
1601    suppress: bool,
1602    out: &mut impl Write,
1603) -> io::Result<()> {
1604    if data.len() >= PARALLEL_THRESHOLD {
1605        let chunks = split_into_chunks(data, line_delim);
1606        let results: Vec<Vec<u8>> = chunks
1607            .par_iter()
1608            .map(|chunk| {
1609                let mut buf = Vec::with_capacity(chunk.len());
1610                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
1611                buf
1612            })
1613            .collect();
1614        // Use write_vectored (writev) to batch N writes into fewer syscalls
1615        let slices: Vec<IoSlice> = results
1616            .iter()
1617            .filter(|r| !r.is_empty())
1618            .map(|r| IoSlice::new(r))
1619            .collect();
1620        write_ioslices(out, &slices)?;
1621    } else {
1622        let mut buf = Vec::with_capacity(data.len());
1623        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1624        if !buf.is_empty() {
1625            out.write_all(&buf)?;
1626        }
1627    }
1628    Ok(())
1629}
1630
1631/// Process a chunk for open-ended field suffix extraction.
1632fn fields_suffix_chunk(
1633    data: &[u8],
1634    delim: u8,
1635    line_delim: u8,
1636    start_field: usize,
1637    suppress: bool,
1638    buf: &mut Vec<u8>,
1639) {
1640    let mut start = 0;
1641    for end_pos in memchr_iter(line_delim, data) {
1642        let line = &data[start..end_pos];
1643        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1644        start = end_pos + 1;
1645    }
1646    if start < data.len() {
1647        fields_suffix_line(
1648            &data[start..],
1649            delim,
1650            line_delim,
1651            start_field,
1652            suppress,
1653            buf,
1654        );
1655    }
1656}
1657
1658/// Extract fields from start_field to end from one line.
1659/// Uses memchr SIMD for delimiter scanning on all line sizes.
1660#[inline(always)]
1661fn fields_suffix_line(
1662    line: &[u8],
1663    delim: u8,
1664    line_delim: u8,
1665    start_field: usize,
1666    suppress: bool,
1667    buf: &mut Vec<u8>,
1668) {
1669    let len = line.len();
1670    if len == 0 {
1671        if !suppress {
1672            buf.push(line_delim);
1673        }
1674        return;
1675    }
1676
1677    buf.reserve(len + 1);
1678    let base = line.as_ptr();
1679
1680    let skip_delims = start_field - 1;
1681    let mut delim_count = 0usize;
1682    let mut has_delim = false;
1683
1684    for pos in memchr_iter(delim, line) {
1685        has_delim = true;
1686        delim_count += 1;
1687        if delim_count >= skip_delims {
1688            unsafe {
1689                buf_extend(
1690                    buf,
1691                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1692                );
1693                buf_push(buf, line_delim);
1694            }
1695            return;
1696        }
1697    }
1698
1699    if !has_delim {
1700        if !suppress {
1701            unsafe {
1702                buf_extend(buf, line);
1703                buf_push(buf, line_delim);
1704            }
1705        }
1706        return;
1707    }
1708
1709    // Fewer delimiters than needed
1710    unsafe { buf_push(buf, line_delim) };
1711}
1712
1713/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1714/// Optimized: skip to start_field using memchr, then output until end_field.
1715fn process_fields_mid_range(
1716    data: &[u8],
1717    delim: u8,
1718    line_delim: u8,
1719    start_field: usize,
1720    end_field: usize,
1721    suppress: bool,
1722    out: &mut impl Write,
1723) -> io::Result<()> {
1724    if data.len() >= PARALLEL_THRESHOLD {
1725        let chunks = split_into_chunks(data, line_delim);
1726        let results: Vec<Vec<u8>> = chunks
1727            .par_iter()
1728            .map(|chunk| {
1729                let mut buf = Vec::with_capacity(chunk.len());
1730                fields_mid_range_chunk(
1731                    chunk,
1732                    delim,
1733                    line_delim,
1734                    start_field,
1735                    end_field,
1736                    suppress,
1737                    &mut buf,
1738                );
1739                buf
1740            })
1741            .collect();
1742        let slices: Vec<IoSlice> = results
1743            .iter()
1744            .filter(|r| !r.is_empty())
1745            .map(|r| IoSlice::new(r))
1746            .collect();
1747        write_ioslices(out, &slices)?;
1748    } else {
1749        let mut buf = Vec::with_capacity(data.len());
1750        fields_mid_range_chunk(
1751            data,
1752            delim,
1753            line_delim,
1754            start_field,
1755            end_field,
1756            suppress,
1757            &mut buf,
1758        );
1759        if !buf.is_empty() {
1760            out.write_all(&buf)?;
1761        }
1762    }
1763    Ok(())
1764}
1765
1766/// Process a chunk for contiguous mid-range field extraction.
1767fn fields_mid_range_chunk(
1768    data: &[u8],
1769    delim: u8,
1770    line_delim: u8,
1771    start_field: usize,
1772    end_field: usize,
1773    suppress: bool,
1774    buf: &mut Vec<u8>,
1775) {
1776    let mut start = 0;
1777    for end_pos in memchr_iter(line_delim, data) {
1778        let line = &data[start..end_pos];
1779        fields_mid_range_line(
1780            line,
1781            delim,
1782            line_delim,
1783            start_field,
1784            end_field,
1785            suppress,
1786            buf,
1787        );
1788        start = end_pos + 1;
1789    }
1790    if start < data.len() {
1791        fields_mid_range_line(
1792            &data[start..],
1793            delim,
1794            line_delim,
1795            start_field,
1796            end_field,
1797            suppress,
1798            buf,
1799        );
1800    }
1801}
1802
1803/// Extract fields start_field..=end_field from one line.
1804/// Uses scalar byte scanning for short lines, memchr_iter for longer.
1805/// Raw pointer arithmetic to eliminate bounds checking.
1806#[inline(always)]
1807fn fields_mid_range_line(
1808    line: &[u8],
1809    delim: u8,
1810    line_delim: u8,
1811    start_field: usize,
1812    end_field: usize,
1813    suppress: bool,
1814    buf: &mut Vec<u8>,
1815) {
1816    let len = line.len();
1817    if len == 0 {
1818        if !suppress {
1819            buf.push(line_delim);
1820        }
1821        return;
1822    }
1823
1824    buf.reserve(len + 1);
1825    let base = line.as_ptr();
1826
1827    // Count delimiters to find start_field and end_field boundaries
1828    let skip_before = start_field - 1; // delimiters to skip before start_field
1829    let field_span = end_field - start_field; // additional delimiters within the range
1830    let target_end_delim = skip_before + field_span + 1;
1831    let mut delim_count = 0;
1832    let mut range_start = 0;
1833    let mut has_delim = false;
1834
1835    for pos in memchr_iter(delim, line) {
1836        has_delim = true;
1837        delim_count += 1;
1838        if delim_count == skip_before {
1839            range_start = pos + 1;
1840        }
1841        if delim_count == target_end_delim {
1842            if skip_before == 0 {
1843                range_start = 0;
1844            }
1845            unsafe {
1846                buf_extend(
1847                    buf,
1848                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
1849                );
1850                buf_push(buf, line_delim);
1851            }
1852            return;
1853        }
1854    }
1855
1856    if !has_delim {
1857        if !suppress {
1858            unsafe {
1859                buf_extend(buf, line);
1860                buf_push(buf, line_delim);
1861            }
1862        }
1863        return;
1864    }
1865
1866    // Line has delimiters but fewer fields than end_field
1867    if delim_count >= skip_before {
1868        // We have at least start_field, output from range_start to end
1869        if skip_before == 0 {
1870            range_start = 0;
1871        }
1872        unsafe {
1873            buf_extend(
1874                buf,
1875                std::slice::from_raw_parts(base.add(range_start), len - range_start),
1876            );
1877            buf_push(buf, line_delim);
1878        }
1879    } else {
1880        // Not enough fields even for start_field — output empty line
1881        unsafe { buf_push(buf, line_delim) };
1882    }
1883}
1884
1885/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
1886/// directly into the source data, flushing in MAX_IOV-sized batches.
1887/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
1888///
1889/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
1890/// Parallel field-1 extraction for large data using memchr2 single-pass.
1891/// Splits data into per-thread chunks, each chunk extracts field 1 using
1892/// memchr2(delim, newline) which finds the first special byte in one scan.
1893/// For field 1: first special byte is either the delimiter (field end) or
1894/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
1895fn single_field1_parallel(
1896    data: &[u8],
1897    delim: u8,
1898    line_delim: u8,
1899    out: &mut impl Write,
1900) -> io::Result<()> {
1901    let chunks = split_into_chunks(data, line_delim);
1902    let results: Vec<Vec<u8>> = chunks
1903        .par_iter()
1904        .map(|chunk| {
1905            let mut buf = Vec::with_capacity(chunk.len());
1906            single_field1_to_buf(chunk, delim, line_delim, &mut buf);
1907            buf
1908        })
1909        .collect();
1910    let slices: Vec<IoSlice> = results
1911        .iter()
1912        .filter(|r| !r.is_empty())
1913        .map(|r| IoSlice::new(r))
1914        .collect();
1915    write_ioslices(out, &slices)
1916}
1917
1918/// Extract field 1 from a chunk using memchr2 single-pass scanning.
1919/// Uses memchr2(delim, line_delim) to find the first special byte per line:
1920/// - If delimiter: field 1 = data[line_start..delim_pos], skip to next newline
1921/// - If newline: no delimiter on this line, output unchanged
1922/// This scans ~N total bytes vs ~1.5N for two-level (outer newline + inner delimiter).
1923#[inline]
1924fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
1925    use memchr::memchr2;
1926    buf.reserve(data.len());
1927    let mut pos = 0;
1928    while pos < data.len() {
1929        match memchr2(delim, line_delim, &data[pos..]) {
1930            None => {
1931                // Rest is a partial line, no delimiter — output as-is
1932                unsafe {
1933                    buf_extend(buf, &data[pos..]);
1934                }
1935                break;
1936            }
1937            Some(offset) => {
1938                let actual = pos + offset;
1939                if data[actual] == line_delim {
1940                    // No delimiter on this line — output entire line including newline
1941                    unsafe {
1942                        buf_extend(buf, &data[pos..actual + 1]);
1943                    }
1944                    pos = actual + 1;
1945                } else {
1946                    // Delimiter found — output field 1 (up to delimiter) + newline
1947                    unsafe {
1948                        buf_extend(buf, &data[pos..actual]);
1949                        buf_push(buf, line_delim);
1950                    }
1951                    // Skip to next newline
1952                    match memchr::memchr(line_delim, &data[actual + 1..]) {
1953                        None => {
1954                            pos = data.len();
1955                        }
1956                        Some(nl_off) => {
1957                            pos = actual + 1 + nl_off + 1;
1958                        }
1959                    }
1960                }
1961            }
1962        }
1963    }
1964}
1965
1966/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
1967/// directly into the source data. Uses two-level scan: outer memchr(newline)
1968/// for the first delimiter. This is faster than memchr2 for SMALL data because
1969/// the inner scan exits after the FIRST delimiter, skipping all
1970/// subsequent delimiters on the line.
1971///
1972/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
1973/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
1974#[inline]
1975fn single_field1_zerocopy(
1976    data: &[u8],
1977    delim: u8,
1978    line_delim: u8,
1979    out: &mut impl Write,
1980) -> io::Result<()> {
1981    let newline_buf: [u8; 1] = [line_delim];
1982
1983    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1984    let mut run_start: usize = 0;
1985    let mut start = 0;
1986
1987    for end_pos in memchr_iter(line_delim, data) {
1988        let line = &data[start..end_pos];
1989        if let Some(dp) = memchr::memchr(delim, line) {
1990            // Line has delimiter — truncate at first delimiter.
1991            // Flush current contiguous run, then add truncated field + newline.
1992            if run_start < start {
1993                iov.push(IoSlice::new(&data[run_start..start]));
1994            }
1995            iov.push(IoSlice::new(&data[start..start + dp]));
1996            iov.push(IoSlice::new(&newline_buf));
1997            run_start = end_pos + 1;
1998
1999            if iov.len() >= MAX_IOV - 2 {
2000                write_ioslices(out, &iov)?;
2001                iov.clear();
2002            }
2003        }
2004        // else: no delimiter in line, output unchanged (stays in contiguous run)
2005        start = end_pos + 1;
2006    }
2007
2008    // Handle last line (no trailing newline)
2009    if start < data.len() {
2010        let line = &data[start..];
2011        if let Some(dp) = memchr::memchr(delim, line) {
2012            if run_start < start {
2013                iov.push(IoSlice::new(&data[run_start..start]));
2014            }
2015            iov.push(IoSlice::new(&data[start..start + dp]));
2016            iov.push(IoSlice::new(&newline_buf));
2017            if !iov.is_empty() {
2018                write_ioslices(out, &iov)?;
2019            }
2020            return Ok(());
2021        }
2022    }
2023
2024    // Flush remaining contiguous run
2025    if run_start < data.len() {
2026        iov.push(IoSlice::new(&data[run_start..]));
2027        if !data.is_empty() && *data.last().unwrap() != line_delim {
2028            iov.push(IoSlice::new(&newline_buf));
2029        }
2030    }
2031    if !iov.is_empty() {
2032        write_ioslices(out, &iov)?;
2033    }
2034    Ok(())
2035}
2036
2037/// Process a chunk of data for single-field extraction.
2038fn process_single_field_chunk(
2039    data: &[u8],
2040    delim: u8,
2041    target_idx: usize,
2042    line_delim: u8,
2043    suppress: bool,
2044    buf: &mut Vec<u8>,
2045) {
2046    let mut start = 0;
2047    for end_pos in memchr_iter(line_delim, data) {
2048        let line = &data[start..end_pos];
2049        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
2050        start = end_pos + 1;
2051    }
2052    if start < data.len() {
2053        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
2054    }
2055}
2056
2057/// Extract a single field from one line.
2058/// For short lines (< 256 bytes), uses direct scalar scanning to avoid memchr overhead.
2059/// For longer lines, uses memchr for SIMD-accelerated scanning.
2060/// Raw pointer arithmetic eliminates per-field bounds checking.
2061#[inline(always)]
2062fn extract_single_field_line(
2063    line: &[u8],
2064    delim: u8,
2065    target_idx: usize,
2066    line_delim: u8,
2067    suppress: bool,
2068    buf: &mut Vec<u8>,
2069) {
2070    let len = line.len();
2071    if len == 0 {
2072        if !suppress {
2073            buf.push(line_delim);
2074        }
2075        return;
2076    }
2077
2078    // Ensure capacity for worst case (full line + newline)
2079    buf.reserve(len + 1);
2080
2081    let base = line.as_ptr();
2082
2083    // Ultra-fast path for first field: single memchr
2084    if target_idx == 0 {
2085        match memchr::memchr(delim, line) {
2086            Some(pos) => unsafe {
2087                buf_extend(buf, std::slice::from_raw_parts(base, pos));
2088                buf_push(buf, line_delim);
2089            },
2090            None => {
2091                if !suppress {
2092                    unsafe {
2093                        buf_extend(buf, line);
2094                        buf_push(buf, line_delim);
2095                    }
2096                }
2097            }
2098        }
2099        return;
2100    }
2101
2102    // Use memchr SIMD for all line sizes (faster than scalar even for short lines)
2103    let mut field_start = 0;
2104    let mut field_idx = 0;
2105    let mut has_delim = false;
2106
2107    for pos in memchr_iter(delim, line) {
2108        has_delim = true;
2109        if field_idx == target_idx {
2110            unsafe {
2111                buf_extend(
2112                    buf,
2113                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
2114                );
2115                buf_push(buf, line_delim);
2116            }
2117            return;
2118        }
2119        field_idx += 1;
2120        field_start = pos + 1;
2121    }
2122
2123    if !has_delim {
2124        if !suppress {
2125            unsafe {
2126                buf_extend(buf, line);
2127                buf_push(buf, line_delim);
2128            }
2129        }
2130        return;
2131    }
2132
2133    if field_idx == target_idx {
2134        unsafe {
2135            buf_extend(
2136                buf,
2137                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2138            );
2139            buf_push(buf, line_delim);
2140        }
2141    } else {
2142        unsafe { buf_push(buf, line_delim) };
2143    }
2144}
2145
2146/// Extract fields from a single line into the output buffer.
2147/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2148/// Raw pointer arithmetic eliminates per-field bounds checking.
2149#[inline(always)]
2150fn extract_fields_to_buf(
2151    line: &[u8],
2152    delim: u8,
2153    ranges: &[Range],
2154    output_delim: &[u8],
2155    suppress: bool,
2156    max_field: usize,
2157    field_mask: u64,
2158    line_delim: u8,
2159    buf: &mut Vec<u8>,
2160    complement: bool,
2161) {
2162    let len = line.len();
2163
2164    if len == 0 {
2165        if !suppress {
2166            buf.push(line_delim);
2167        }
2168        return;
2169    }
2170
2171    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2172    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2173    let needed = len + output_delim.len() * 16 + 1;
2174    if buf.capacity() - buf.len() < needed {
2175        buf.reserve(needed);
2176    }
2177
2178    let base = line.as_ptr();
2179    let mut field_num: usize = 1;
2180    let mut field_start: usize = 0;
2181    let mut first_output = true;
2182    let mut has_delim = false;
2183
2184    // Use memchr SIMD for all line sizes
2185    for delim_pos in memchr_iter(delim, line) {
2186        has_delim = true;
2187
2188        if is_selected(field_num, field_mask, ranges, complement) {
2189            if !first_output {
2190                unsafe { buf_extend(buf, output_delim) };
2191            }
2192            unsafe {
2193                buf_extend(
2194                    buf,
2195                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2196                )
2197            };
2198            first_output = false;
2199        }
2200
2201        field_num += 1;
2202        field_start = delim_pos + 1;
2203
2204        if field_num > max_field {
2205            break;
2206        }
2207    }
2208
2209    // Last field
2210    if (field_num <= max_field || complement)
2211        && has_delim
2212        && is_selected(field_num, field_mask, ranges, complement)
2213    {
2214        if !first_output {
2215            unsafe { buf_extend(buf, output_delim) };
2216        }
2217        unsafe {
2218            buf_extend(
2219                buf,
2220                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2221            )
2222        };
2223        first_output = false;
2224    }
2225
2226    if !first_output {
2227        unsafe { buf_push(buf, line_delim) };
2228    } else if !has_delim {
2229        if !suppress {
2230            unsafe {
2231                buf_extend(buf, line);
2232                buf_push(buf, line_delim);
2233            }
2234        }
2235    } else {
2236        unsafe { buf_push(buf, line_delim) };
2237    }
2238}
2239
2240// ── Fast path: byte/char extraction with batched output ──────────────────
2241
2242/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2243/// Zero-copy: writes directly from the source data using output runs.
2244/// For lines shorter than max_bytes, the output is identical to the input,
2245/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2246fn process_bytes_from_start(
2247    data: &[u8],
2248    max_bytes: usize,
2249    line_delim: u8,
2250    out: &mut impl Write,
2251) -> io::Result<()> {
2252    // Fast path: if all lines fit within max_bytes, output = input.
2253    // Single memchr scan with early exit on first oversized line.
2254    // For `-b1-100` on CSV where average line is < 100 bytes, this
2255    // skips all per-line processing and outputs the data directly.
2256    if max_bytes > 0 && max_bytes < usize::MAX {
2257        let mut start = 0;
2258        let mut all_fit = true;
2259        for pos in memchr_iter(line_delim, data) {
2260            if pos - start > max_bytes {
2261                all_fit = false;
2262                break;
2263            }
2264            start = pos + 1;
2265        }
2266        // Check last line (no trailing delimiter)
2267        if all_fit && start < data.len() && data.len() - start > max_bytes {
2268            all_fit = false;
2269        }
2270        if all_fit {
2271            // All lines fit: output = input. Handle missing trailing delimiter.
2272            if !data.is_empty() && data[data.len() - 1] == line_delim {
2273                return out.write_all(data);
2274            } else if !data.is_empty() {
2275                out.write_all(data)?;
2276                return out.write_all(&[line_delim]);
2277            }
2278            return Ok(());
2279        }
2280    }
2281
2282    if data.len() >= PARALLEL_THRESHOLD {
2283        let chunks = split_into_chunks(data, line_delim);
2284        let results: Vec<Vec<u8>> = chunks
2285            .par_iter()
2286            .map(|chunk| {
2287                // Estimate output size without scanning: assume average line
2288                // is at least (max_bytes+1) bytes (otherwise no truncation).
2289                // For cut -b1-5 on 50-char lines: output ~ chunk.len() * 6/51 ~ chunk/8.
2290                // Using chunk.len()/4 as initial capacity handles most cases
2291                // without reallocation, while avoiding the extra memchr scan.
2292                let est_out = (chunk.len() / 4).max(max_bytes + 2);
2293                let mut buf = Vec::with_capacity(est_out.min(chunk.len()));
2294                bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
2295                buf
2296            })
2297            .collect();
2298        // Use write_vectored (writev) to batch N writes into fewer syscalls
2299        let slices: Vec<IoSlice> = results
2300            .iter()
2301            .filter(|r| !r.is_empty())
2302            .map(|r| IoSlice::new(r))
2303            .collect();
2304        write_ioslices(out, &slices)?;
2305    } else {
2306        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2307        // because every line gets truncated, creating 3 IoSlice entries per line.
2308        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2309        // managing millions of IoSlice entries through the kernel.
2310        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2311        if max_bytes <= 512 {
2312            // Estimate output size without scanning: output <= data.len(),
2313            // typically ~data.len()/4 for short max_bytes on longer lines.
2314            let est_out = (data.len() / 4).max(max_bytes + 2);
2315            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2316            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2317            if !buf.is_empty() {
2318                out.write_all(&buf)?;
2319            }
2320        } else {
2321            // Zero-copy path: track contiguous output runs and write directly from source.
2322            // For lines <= max_bytes, we include them as-is (no copy needed).
2323            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2324            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2325        }
2326    }
2327    Ok(())
2328}
2329
2330/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2331/// directly into the source data, flushing in MAX_IOV-sized batches.
2332/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2333/// produce two IoSlices (truncated data + newline).
2334#[inline]
2335fn bytes_from_start_zerocopy(
2336    data: &[u8],
2337    max_bytes: usize,
2338    line_delim: u8,
2339    out: &mut impl Write,
2340) -> io::Result<()> {
2341    let newline_buf: [u8; 1] = [line_delim];
2342    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2343    let mut start = 0;
2344    let mut run_start: usize = 0;
2345
2346    for pos in memchr_iter(line_delim, data) {
2347        let line_len = pos - start;
2348        if line_len > max_bytes {
2349            // This line needs truncation
2350            if run_start < start {
2351                iov.push(IoSlice::new(&data[run_start..start]));
2352            }
2353            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2354            iov.push(IoSlice::new(&newline_buf));
2355            run_start = pos + 1;
2356
2357            if iov.len() >= MAX_IOV - 2 {
2358                write_ioslices(out, &iov)?;
2359                iov.clear();
2360            }
2361        }
2362        start = pos + 1;
2363    }
2364    // Handle last line without terminator
2365    if start < data.len() {
2366        let line_len = data.len() - start;
2367        if line_len > max_bytes {
2368            if run_start < start {
2369                iov.push(IoSlice::new(&data[run_start..start]));
2370            }
2371            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2372            iov.push(IoSlice::new(&newline_buf));
2373            if !iov.is_empty() {
2374                write_ioslices(out, &iov)?;
2375            }
2376            return Ok(());
2377        }
2378    }
2379    // Flush remaining contiguous run
2380    if run_start < data.len() {
2381        iov.push(IoSlice::new(&data[run_start..]));
2382        if !data.is_empty() && *data.last().unwrap() != line_delim {
2383            iov.push(IoSlice::new(&newline_buf));
2384        }
2385    }
2386    if !iov.is_empty() {
2387        write_ioslices(out, &iov)?;
2388    }
2389    Ok(())
2390}
2391
2392/// Process a chunk for from-start byte range extraction (parallel path).
2393/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2394/// Pre-reserves data.len() (output never exceeds input), then uses a single
2395/// write pointer with deferred set_len — no per-line capacity checks.
2396#[inline]
2397fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2398    // Output is always <= input size (we only truncate, never expand).
2399    // Single reserve eliminates ALL per-line capacity checks.
2400    buf.reserve(data.len());
2401
2402    let src = data.as_ptr();
2403    let dst_base = buf.as_mut_ptr();
2404    let mut wp = buf.len();
2405    let mut start = 0;
2406
2407    for pos in memchr_iter(line_delim, data) {
2408        let line_len = pos - start;
2409        let take = line_len.min(max_bytes);
2410        unsafe {
2411            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2412            *dst_base.add(wp + take) = line_delim;
2413        }
2414        wp += take + 1;
2415        start = pos + 1;
2416    }
2417    // Handle last line without terminator
2418    if start < data.len() {
2419        let line_len = data.len() - start;
2420        let take = line_len.min(max_bytes);
2421        unsafe {
2422            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2423            *dst_base.add(wp + take) = line_delim;
2424        }
2425        wp += take + 1;
2426    }
2427    unsafe { buf.set_len(wp) };
2428}
2429
2430/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2431fn process_bytes_from_offset(
2432    data: &[u8],
2433    skip_bytes: usize,
2434    line_delim: u8,
2435    out: &mut impl Write,
2436) -> io::Result<()> {
2437    if data.len() >= PARALLEL_THRESHOLD {
2438        let chunks = split_into_chunks(data, line_delim);
2439        let results: Vec<Vec<u8>> = chunks
2440            .par_iter()
2441            .map(|chunk| {
2442                let mut buf = Vec::with_capacity(chunk.len());
2443                bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
2444                buf
2445            })
2446            .collect();
2447        // Use write_vectored (writev) to batch N writes into fewer syscalls
2448        let slices: Vec<IoSlice> = results
2449            .iter()
2450            .filter(|r| !r.is_empty())
2451            .map(|r| IoSlice::new(r))
2452            .collect();
2453        write_ioslices(out, &slices)?;
2454    } else {
2455        // Zero-copy: write suffix of each line directly from source
2456        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2457    }
2458    Ok(())
2459}
2460
2461/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2462/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2463/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2464#[inline]
2465fn bytes_from_offset_zerocopy(
2466    data: &[u8],
2467    skip_bytes: usize,
2468    line_delim: u8,
2469    out: &mut impl Write,
2470) -> io::Result<()> {
2471    let delim_buf = [line_delim];
2472    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2473
2474    let mut start = 0;
2475    for pos in memchr_iter(line_delim, data) {
2476        let line_len = pos - start;
2477        if line_len > skip_bytes {
2478            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2479        }
2480        iov.push(IoSlice::new(&delim_buf));
2481        // Flush when approaching MAX_IOV to avoid oversized writev
2482        if iov.len() >= MAX_IOV - 1 {
2483            write_ioslices(out, &iov)?;
2484            iov.clear();
2485        }
2486        start = pos + 1;
2487    }
2488    if start < data.len() {
2489        let line_len = data.len() - start;
2490        if line_len > skip_bytes {
2491            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2492        }
2493        iov.push(IoSlice::new(&delim_buf));
2494    }
2495    if !iov.is_empty() {
2496        write_ioslices(out, &iov)?;
2497    }
2498    Ok(())
2499}
2500
2501/// Process a chunk for from-offset byte range extraction.
2502/// Single reserve + deferred set_len for zero per-line overhead.
2503#[inline]
2504fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2505    buf.reserve(data.len());
2506
2507    let src = data.as_ptr();
2508    let dst_base = buf.as_mut_ptr();
2509    let mut wp = buf.len();
2510    let mut start = 0;
2511
2512    for pos in memchr_iter(line_delim, data) {
2513        let line_len = pos - start;
2514        if line_len > skip_bytes {
2515            let take = line_len - skip_bytes;
2516            unsafe {
2517                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2518            }
2519            wp += take;
2520        }
2521        unsafe {
2522            *dst_base.add(wp) = line_delim;
2523        }
2524        wp += 1;
2525        start = pos + 1;
2526    }
2527    if start < data.len() {
2528        let line_len = data.len() - start;
2529        if line_len > skip_bytes {
2530            let take = line_len - skip_bytes;
2531            unsafe {
2532                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2533            }
2534            wp += take;
2535        }
2536        unsafe {
2537            *dst_base.add(wp) = line_delim;
2538        }
2539        wp += 1;
2540    }
2541    unsafe { buf.set_len(wp) };
2542}
2543
2544/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2545fn process_bytes_mid_range(
2546    data: &[u8],
2547    start_byte: usize,
2548    end_byte: usize,
2549    line_delim: u8,
2550    out: &mut impl Write,
2551) -> io::Result<()> {
2552    let skip = start_byte.saturating_sub(1);
2553
2554    if data.len() >= PARALLEL_THRESHOLD {
2555        let chunks = split_into_chunks(data, line_delim);
2556        let results: Vec<Vec<u8>> = chunks
2557            .par_iter()
2558            .map(|chunk| {
2559                let mut buf = Vec::with_capacity(chunk.len());
2560                bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, &mut buf);
2561                buf
2562            })
2563            .collect();
2564        let slices: Vec<IoSlice> = results
2565            .iter()
2566            .filter(|r| !r.is_empty())
2567            .map(|r| IoSlice::new(r))
2568            .collect();
2569        write_ioslices(out, &slices)?;
2570    } else {
2571        let mut buf = Vec::with_capacity(data.len());
2572        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2573        if !buf.is_empty() {
2574            out.write_all(&buf)?;
2575        }
2576    }
2577    Ok(())
2578}
2579
2580/// Process a chunk for mid-range byte extraction.
2581/// For each line, output bytes skip..min(line_len, end_byte).
2582/// Single reserve + deferred set_len.
2583#[inline]
2584fn bytes_mid_range_chunk(
2585    data: &[u8],
2586    skip: usize,
2587    end_byte: usize,
2588    line_delim: u8,
2589    buf: &mut Vec<u8>,
2590) {
2591    buf.reserve(data.len());
2592
2593    let src = data.as_ptr();
2594    let dst_base = buf.as_mut_ptr();
2595    let mut wp = buf.len();
2596    let mut start = 0;
2597
2598    for pos in memchr_iter(line_delim, data) {
2599        let line_len = pos - start;
2600        if line_len > skip {
2601            let take_end = line_len.min(end_byte);
2602            let take = take_end - skip;
2603            unsafe {
2604                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2605            }
2606            wp += take;
2607        }
2608        unsafe {
2609            *dst_base.add(wp) = line_delim;
2610        }
2611        wp += 1;
2612        start = pos + 1;
2613    }
2614    if start < data.len() {
2615        let line_len = data.len() - start;
2616        if line_len > skip {
2617            let take_end = line_len.min(end_byte);
2618            let take = take_end - skip;
2619            unsafe {
2620                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2621            }
2622            wp += take;
2623        }
2624        unsafe {
2625            *dst_base.add(wp) = line_delim;
2626        }
2627        wp += 1;
2628    }
2629    unsafe { buf.set_len(wp) };
2630}
2631
2632/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2633fn process_bytes_complement_mid(
2634    data: &[u8],
2635    skip_start: usize,
2636    skip_end: usize,
2637    line_delim: u8,
2638    out: &mut impl Write,
2639) -> io::Result<()> {
2640    let prefix_bytes = skip_start - 1; // bytes before the skip region
2641    if data.len() >= PARALLEL_THRESHOLD {
2642        let chunks = split_into_chunks(data, line_delim);
2643        let results: Vec<Vec<u8>> = chunks
2644            .par_iter()
2645            .map(|chunk| {
2646                let mut buf = Vec::with_capacity(chunk.len());
2647                bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, &mut buf);
2648                buf
2649            })
2650            .collect();
2651        let slices: Vec<IoSlice> = results
2652            .iter()
2653            .filter(|r| !r.is_empty())
2654            .map(|r| IoSlice::new(r))
2655            .collect();
2656        write_ioslices(out, &slices)?;
2657    } else {
2658        let mut buf = Vec::with_capacity(data.len());
2659        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
2660        if !buf.is_empty() {
2661            out.write_all(&buf)?;
2662        }
2663    }
2664    Ok(())
2665}
2666
2667/// Process a chunk for complement mid-range byte extraction.
2668/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
2669#[inline]
2670fn bytes_complement_mid_chunk(
2671    data: &[u8],
2672    prefix_bytes: usize,
2673    skip_end: usize,
2674    line_delim: u8,
2675    buf: &mut Vec<u8>,
2676) {
2677    buf.reserve(data.len());
2678
2679    let src = data.as_ptr();
2680    let dst_base = buf.as_mut_ptr();
2681    let mut wp = buf.len();
2682    let mut start = 0;
2683
2684    for pos in memchr_iter(line_delim, data) {
2685        let line_len = pos - start;
2686        // Copy prefix (bytes before skip region)
2687        let take_prefix = prefix_bytes.min(line_len);
2688        if take_prefix > 0 {
2689            unsafe {
2690                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2691            }
2692            wp += take_prefix;
2693        }
2694        // Copy suffix (bytes after skip region)
2695        if line_len > skip_end {
2696            let suffix_len = line_len - skip_end;
2697            unsafe {
2698                std::ptr::copy_nonoverlapping(
2699                    src.add(start + skip_end),
2700                    dst_base.add(wp),
2701                    suffix_len,
2702                );
2703            }
2704            wp += suffix_len;
2705        }
2706        unsafe {
2707            *dst_base.add(wp) = line_delim;
2708        }
2709        wp += 1;
2710        start = pos + 1;
2711    }
2712    if start < data.len() {
2713        let line_len = data.len() - start;
2714        let take_prefix = prefix_bytes.min(line_len);
2715        if take_prefix > 0 {
2716            unsafe {
2717                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2718            }
2719            wp += take_prefix;
2720        }
2721        if line_len > skip_end {
2722            let suffix_len = line_len - skip_end;
2723            unsafe {
2724                std::ptr::copy_nonoverlapping(
2725                    src.add(start + skip_end),
2726                    dst_base.add(wp),
2727                    suffix_len,
2728                );
2729            }
2730            wp += suffix_len;
2731        }
2732        unsafe {
2733            *dst_base.add(wp) = line_delim;
2734        }
2735        wp += 1;
2736    }
2737    unsafe { buf.set_len(wp) };
2738}
2739
2740/// Optimized byte/char extraction with batched output and parallel processing.
2741fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2742    let line_delim = cfg.line_delim;
2743    let ranges = cfg.ranges;
2744    let complement = cfg.complement;
2745    let output_delim = cfg.output_delim;
2746
2747    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
2748    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
2749        let max_bytes = ranges[0].end;
2750        if max_bytes < usize::MAX {
2751            return process_bytes_from_start(data, max_bytes, line_delim, out);
2752        }
2753    }
2754
2755    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
2756    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
2757        let skip_bytes = ranges[0].start.saturating_sub(1);
2758        if skip_bytes > 0 {
2759            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
2760        }
2761    }
2762
2763    // Fast path: single mid-range (e.g., cut -b5-100)
2764    if !complement
2765        && ranges.len() == 1
2766        && ranges[0].start > 1
2767        && ranges[0].end < usize::MAX
2768        && output_delim.is_empty()
2769    {
2770        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
2771    }
2772
2773    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
2774    if complement
2775        && ranges.len() == 1
2776        && ranges[0].start == 1
2777        && ranges[0].end < usize::MAX
2778        && output_delim.is_empty()
2779    {
2780        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
2781    }
2782
2783    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
2784    if complement
2785        && ranges.len() == 1
2786        && ranges[0].end == usize::MAX
2787        && ranges[0].start > 1
2788        && output_delim.is_empty()
2789    {
2790        let max_bytes = ranges[0].start - 1;
2791        return process_bytes_from_start(data, max_bytes, line_delim, out);
2792    }
2793
2794    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
2795    if complement
2796        && ranges.len() == 1
2797        && ranges[0].start > 1
2798        && ranges[0].end < usize::MAX
2799        && output_delim.is_empty()
2800    {
2801        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
2802    }
2803
2804    if data.len() >= PARALLEL_THRESHOLD {
2805        let chunks = split_into_chunks(data, line_delim);
2806        let results: Vec<Vec<u8>> = chunks
2807            .par_iter()
2808            .map(|chunk| {
2809                let mut buf = Vec::with_capacity(chunk.len());
2810                process_bytes_chunk(
2811                    chunk,
2812                    ranges,
2813                    complement,
2814                    output_delim,
2815                    line_delim,
2816                    &mut buf,
2817                );
2818                buf
2819            })
2820            .collect();
2821        // Use write_vectored (writev) to batch N writes into fewer syscalls
2822        let slices: Vec<IoSlice> = results
2823            .iter()
2824            .filter(|r| !r.is_empty())
2825            .map(|r| IoSlice::new(r))
2826            .collect();
2827        write_ioslices(out, &slices)?;
2828    } else {
2829        let mut buf = Vec::with_capacity(data.len());
2830        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
2831        if !buf.is_empty() {
2832            out.write_all(&buf)?;
2833        }
2834    }
2835    Ok(())
2836}
2837
2838/// Process a chunk of data for byte/char extraction.
2839/// Uses raw pointer arithmetic for the newline scan.
2840/// Complement single-range fast path: compute complement ranges once, then use
2841/// the non-complement multi-range path which is more cache-friendly.
2842fn process_bytes_chunk(
2843    data: &[u8],
2844    ranges: &[Range],
2845    complement: bool,
2846    output_delim: &[u8],
2847    line_delim: u8,
2848    buf: &mut Vec<u8>,
2849) {
2850    buf.reserve(data.len());
2851    let base = data.as_ptr();
2852    let mut start = 0;
2853    for end_pos in memchr_iter(line_delim, data) {
2854        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
2855        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2856        unsafe { buf_push(buf, line_delim) };
2857        start = end_pos + 1;
2858    }
2859    if start < data.len() {
2860        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
2861        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2862        unsafe { buf_push(buf, line_delim) };
2863    }
2864}
2865
2866/// Extract byte ranges from a line into the output buffer.
2867/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
2868/// Raw pointer arithmetic eliminates per-range bounds checking.
2869#[inline(always)]
2870fn cut_bytes_to_buf(
2871    line: &[u8],
2872    ranges: &[Range],
2873    complement: bool,
2874    output_delim: &[u8],
2875    buf: &mut Vec<u8>,
2876) {
2877    let len = line.len();
2878    let base = line.as_ptr();
2879    let mut first_range = true;
2880
2881    // Reserve worst case: full line + delimiters between ranges
2882    let needed = len + output_delim.len() * ranges.len() + 1;
2883    if buf.capacity() - buf.len() < needed {
2884        buf.reserve(needed);
2885    }
2886
2887    if complement {
2888        let mut pos: usize = 1;
2889        for r in ranges {
2890            let rs = r.start;
2891            let re = r.end.min(len);
2892            if pos < rs {
2893                if !first_range && !output_delim.is_empty() {
2894                    unsafe { buf_extend(buf, output_delim) };
2895                }
2896                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
2897                first_range = false;
2898            }
2899            pos = re + 1;
2900            if pos > len {
2901                break;
2902            }
2903        }
2904        if pos <= len {
2905            if !first_range && !output_delim.is_empty() {
2906                unsafe { buf_extend(buf, output_delim) };
2907            }
2908            unsafe {
2909                buf_extend(
2910                    buf,
2911                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
2912                )
2913            };
2914        }
2915    } else if output_delim.is_empty() && ranges.len() == 1 {
2916        // Ultra-fast path: single range, no output delimiter
2917        let start = ranges[0].start.saturating_sub(1);
2918        let end = ranges[0].end.min(len);
2919        if start < len {
2920            unsafe {
2921                buf_extend(
2922                    buf,
2923                    std::slice::from_raw_parts(base.add(start), end - start),
2924                )
2925            };
2926        }
2927    } else {
2928        for r in ranges {
2929            let start = r.start.saturating_sub(1);
2930            let end = r.end.min(len);
2931            if start >= len {
2932                break;
2933            }
2934            if !first_range && !output_delim.is_empty() {
2935                unsafe { buf_extend(buf, output_delim) };
2936            }
2937            unsafe {
2938                buf_extend(
2939                    buf,
2940                    std::slice::from_raw_parts(base.add(start), end - start),
2941                )
2942            };
2943            first_range = false;
2944        }
2945    }
2946}
2947
2948// ── Public API ───────────────────────────────────────────────────────────
2949
2950/// Cut fields from a line using a delimiter. Writes to `out`.
2951#[inline]
2952pub fn cut_fields(
2953    line: &[u8],
2954    delim: u8,
2955    ranges: &[Range],
2956    complement: bool,
2957    output_delim: &[u8],
2958    suppress_no_delim: bool,
2959    out: &mut impl Write,
2960) -> io::Result<bool> {
2961    if memchr::memchr(delim, line).is_none() {
2962        if !suppress_no_delim {
2963            out.write_all(line)?;
2964            return Ok(true);
2965        }
2966        return Ok(false);
2967    }
2968
2969    let mut field_num: usize = 1;
2970    let mut field_start: usize = 0;
2971    let mut first_output = true;
2972
2973    for delim_pos in memchr_iter(delim, line) {
2974        let selected = in_ranges(ranges, field_num) != complement;
2975        if selected {
2976            if !first_output {
2977                out.write_all(output_delim)?;
2978            }
2979            out.write_all(&line[field_start..delim_pos])?;
2980            first_output = false;
2981        }
2982        field_start = delim_pos + 1;
2983        field_num += 1;
2984    }
2985
2986    let selected = in_ranges(ranges, field_num) != complement;
2987    if selected {
2988        if !first_output {
2989            out.write_all(output_delim)?;
2990        }
2991        out.write_all(&line[field_start..])?;
2992    }
2993
2994    Ok(true)
2995}
2996
2997/// Cut bytes/chars from a line. Writes selected bytes to `out`.
2998#[inline]
2999pub fn cut_bytes(
3000    line: &[u8],
3001    ranges: &[Range],
3002    complement: bool,
3003    output_delim: &[u8],
3004    out: &mut impl Write,
3005) -> io::Result<bool> {
3006    let mut first_range = true;
3007
3008    if complement {
3009        let len = line.len();
3010        let mut comp_ranges = Vec::new();
3011        let mut pos: usize = 1;
3012        for r in ranges {
3013            let rs = r.start;
3014            let re = r.end.min(len);
3015            if pos < rs {
3016                comp_ranges.push((pos, rs - 1));
3017            }
3018            pos = re + 1;
3019            if pos > len {
3020                break;
3021            }
3022        }
3023        if pos <= len {
3024            comp_ranges.push((pos, len));
3025        }
3026        for &(s, e) in &comp_ranges {
3027            if !first_range && !output_delim.is_empty() {
3028                out.write_all(output_delim)?;
3029            }
3030            out.write_all(&line[s - 1..e])?;
3031            first_range = false;
3032        }
3033    } else {
3034        for r in ranges {
3035            let start = r.start.saturating_sub(1);
3036            let end = r.end.min(line.len());
3037            if start >= line.len() {
3038                break;
3039            }
3040            if !first_range && !output_delim.is_empty() {
3041                out.write_all(output_delim)?;
3042            }
3043            out.write_all(&line[start..end])?;
3044            first_range = false;
3045        }
3046    }
3047    Ok(true)
3048}
3049
3050/// Process a full data buffer (from mmap or read) with cut operation.
3051pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3052    match cfg.mode {
3053        CutMode::Fields => process_fields_fast(data, cfg, out),
3054        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3055    }
3056}
3057
3058/// Process input from a reader (for stdin).
3059/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3060/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3061/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3062pub fn process_cut_reader<R: BufRead>(
3063    mut reader: R,
3064    cfg: &CutConfig,
3065    out: &mut impl Write,
3066) -> io::Result<()> {
3067    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3068    let line_delim = cfg.line_delim;
3069
3070    // Read large chunks and process in batch.
3071    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3072    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3073
3074    loop {
3075        // Read up to CHUNK_SIZE bytes
3076        buf.reserve(CHUNK_SIZE);
3077        let read_start = buf.len();
3078        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3079        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3080        buf.truncate(read_start + n);
3081
3082        if buf.is_empty() {
3083            break;
3084        }
3085
3086        if n == 0 {
3087            // EOF with leftover data (last line without terminator)
3088            process_cut_data(&buf, cfg, out)?;
3089            break;
3090        }
3091
3092        // Find the last line delimiter in the buffer so we process complete lines
3093        let process_end = match memchr::memrchr(line_delim, &buf) {
3094            Some(pos) => pos + 1,
3095            None => {
3096                // No line delimiter found — keep accumulating
3097                continue;
3098            }
3099        };
3100
3101        // Process the complete lines using the fast batch path
3102        process_cut_data(&buf[..process_end], cfg, out)?;
3103
3104        // Shift leftover to the front for next iteration
3105        let leftover_len = buf.len() - process_end;
3106        if leftover_len > 0 {
3107            buf.copy_within(process_end.., 0);
3108        }
3109        buf.truncate(leftover_len);
3110    }
3111
3112    Ok(())
3113}
3114
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `read` until `buf` is full or the reader reports EOF
/// (`Ok(0)`). `ErrorKind::Interrupted` is retried on every read, including the
/// first one. Returns the number of bytes stored at the start of `buf`; a
/// value smaller than `buf.len()` means EOF was reached.
///
/// # Errors
/// Returns the first non-`Interrupted` error from the reader.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3134
/// Cut operation mode: selects how the LIST ranges are interpreted.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CutMode {
    /// Ranges are byte positions within each line.
    Bytes,
    /// Ranges are character positions. `process_cut_data` routes this mode
    /// through the same routine as `Bytes`.
    Characters,
    /// Ranges are delimiter-separated field numbers.
    Fields,
}