Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, IoSlice, Write};
4
/// Inputs of at least this many bytes (512 KiB) are split up and processed in parallel.
/// The limit is deliberately low so that smaller piped inputs also benefit:
/// with 2+ threads each chunk is still ~256 KiB, which amortizes the rayon
/// overhead (~100-200us) well.
const PARALLEL_THRESHOLD: usize = 1 << 19;

/// Largest number of iovec entries passed to one writev call (Linux default).
const MAX_IOV: usize = 1 << 10;
13
/// Configuration for cut operations.
///
/// The range list and the output delimiter are borrowed for `'a`; every other
/// field is a plain value.
pub struct CutConfig<'a> {
    /// Selection mode.
    /// NOTE(review): `CutMode` is declared outside this part of the file;
    /// presumably it distinguishes bytes / characters / fields — confirm.
    pub mode: CutMode,
    /// Selected 1-based positions. The field-extraction paths read the last
    /// entry's `end` as the highest selected field, so the list is expected to
    /// be sorted (as `parse_ranges` returns it).
    pub ranges: &'a [Range],
    /// When true, the fields NOT covered by `ranges` are output instead.
    pub complement: bool,
    /// Byte that separates fields in the input.
    pub delim: u8,
    /// Bytes written between consecutive output fields.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no `delim` are dropped instead of being
    /// passed through unchanged.
    pub suppress_no_delim: bool,
    /// Byte that terminates each input line and is appended to each output line.
    pub line_delim: u8,
}
24
/// An inclusive, 1-based selection range such as `1`, `3-5`, `-3` or `4-`.
///
/// `parse_ranges` never produces a `start` of 0: an omitted lower bound (`-3`)
/// is stored as `start == 1`, and an omitted upper bound (`4-`) is stored as
/// `end == usize::MAX`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Range {
    /// First selected position (1-based, inclusive).
    pub start: usize,
    /// Last selected position (1-based, inclusive); `usize::MAX` means "to end".
    pub end: usize,
}
31
32/// Parse a LIST specification like "1,3-5,7-" into ranges.
33/// Each range is 1-based. Returns sorted, merged ranges.
34pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
35    let mut ranges = Vec::new();
36
37    for part in spec.split(',') {
38        let part = part.trim();
39        if part.is_empty() {
40            continue;
41        }
42
43        if let Some(idx) = part.find('-') {
44            let left = &part[..idx];
45            let right = &part[idx + 1..];
46
47            let start = if left.is_empty() {
48                1
49            } else {
50                left.parse::<usize>()
51                    .map_err(|_| format!("invalid range: '{}'", part))?
52            };
53
54            let end = if right.is_empty() {
55                usize::MAX
56            } else {
57                right
58                    .parse::<usize>()
59                    .map_err(|_| format!("invalid range: '{}'", part))?
60            };
61
62            if start == 0 {
63                return Err("fields and positions are numbered from 1".to_string());
64            }
65            if start > end {
66                return Err(format!("invalid decreasing range: '{}'", part));
67            }
68
69            ranges.push(Range { start, end });
70        } else {
71            let n = part
72                .parse::<usize>()
73                .map_err(|_| format!("invalid field: '{}'", part))?;
74            if n == 0 {
75                return Err("fields and positions are numbered from 1".to_string());
76            }
77            ranges.push(Range { start: n, end: n });
78        }
79    }
80
81    if ranges.is_empty() {
82        return Err("you must specify a list of bytes, characters, or fields".to_string());
83    }
84
85    // Sort and merge overlapping ranges
86    ranges.sort_by_key(|r| (r.start, r.end));
87    let mut merged = vec![ranges[0].clone()];
88    for r in &ranges[1..] {
89        let last = merged.last_mut().unwrap();
90        if r.start <= last.end.saturating_add(1) {
91            last.end = last.end.max(r.end);
92        } else {
93            merged.push(r.clone());
94        }
95    }
96
97    Ok(merged)
98}
99
100/// Check if a 1-based position is in any range.
101/// Ranges must be sorted. Uses early exit since ranges are sorted.
102#[inline(always)]
103fn in_ranges(ranges: &[Range], pos: usize) -> bool {
104    for r in ranges {
105        if pos < r.start {
106            return false;
107        }
108        if pos <= r.end {
109            return true;
110        }
111    }
112    false
113}
114
115/// Pre-compute a 64-bit mask for field selection.
116/// Bit i-1 is set if field i should be output.
117#[inline]
118fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
119    let mut mask: u64 = 0;
120    for i in 1..=64u32 {
121        let in_range = in_ranges(ranges, i as usize);
122        if in_range != complement {
123            mask |= 1u64 << (i - 1);
124        }
125    }
126    mask
127}
128
129/// Check if a field should be selected, using bitset for first 64 fields.
130#[inline(always)]
131fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
132    if field_num <= 64 {
133        (mask >> (field_num - 1)) & 1 == 1
134    } else {
135        in_ranges(ranges, field_num) != complement
136    }
137}
138
139// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
140
/// Appends `data` to `buf` without growing it; release builds perform no capacity check.
///
/// # Safety
///
/// `buf` must already have at least `data.len()` bytes of spare capacity
/// (`buf.capacity() - buf.len() >= data.len()`). If it does not, the copy
/// writes past the end of the allocation.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    // Catches a violated precondition in debug/test builds; compiled out in release.
    debug_assert!(
        buf.capacity() - buf.len() >= data.len(),
        "buf_extend: insufficient spare capacity"
    );
    unsafe {
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
151
/// Appends the byte `b` to `buf` without growing it; release builds perform no capacity check.
///
/// # Safety
///
/// `buf` must already have at least one byte of spare capacity
/// (`buf.len() < buf.capacity()`). If it does not, the write lands past the
/// end of the allocation.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    // Catches a violated precondition in debug/test builds; compiled out in release.
    debug_assert!(
        buf.len() < buf.capacity(),
        "buf_push: insufficient spare capacity"
    );
    unsafe {
        let len = buf.len();
        *buf.as_mut_ptr().add(len) = b;
        buf.set_len(len + 1);
    }
}
162
163/// Write multiple IoSlice buffers using write_vectored (writev syscall).
164/// Batches into MAX_IOV-sized groups. Falls back to write_all per slice for partial writes.
165#[inline]
166fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
167    if slices.is_empty() {
168        return Ok(());
169    }
170    for batch in slices.chunks(MAX_IOV) {
171        let total: usize = batch.iter().map(|s| s.len()).sum();
172        match out.write_vectored(batch) {
173            Ok(n) if n >= total => continue,
174            Ok(mut written) => {
175                // Partial write: fall back to write_all per remaining slice
176                for slice in batch {
177                    let slen = slice.len();
178                    if written >= slen {
179                        written -= slen;
180                        continue;
181                    }
182                    if written > 0 {
183                        out.write_all(&slice[written..])?;
184                        written = 0;
185                    } else {
186                        out.write_all(slice)?;
187                    }
188                }
189            }
190            Err(e) => return Err(e),
191        }
192    }
193    Ok(())
194}
195
196// ── Chunk splitting for parallel processing ──────────────────────────────
197
198/// Split data into chunks aligned to line boundaries for parallel processing.
199fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
200    let num_threads = rayon::current_num_threads().max(1);
201    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
202        return vec![data];
203    }
204
205    let chunk_size = data.len() / num_threads;
206    let mut chunks = Vec::with_capacity(num_threads);
207    let mut pos = 0;
208
209    for _ in 0..num_threads - 1 {
210        let target = pos + chunk_size;
211        if target >= data.len() {
212            break;
213        }
214        let boundary = memchr::memchr(line_delim, &data[target..])
215            .map(|p| target + p + 1)
216            .unwrap_or(data.len());
217        if boundary > pos {
218            chunks.push(&data[pos..boundary]);
219        }
220        pos = boundary;
221    }
222
223    if pos < data.len() {
224        chunks.push(&data[pos..]);
225    }
226
227    chunks
228}
229
230// ── Fast path: multi-field non-contiguous extraction ─────────────────────
231
232/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
233/// Pre-collects delimiter positions per line into a stack-allocated array,
234/// then directly indexes into them for each selected field.
235/// This is O(max_field) per line instead of O(num_fields * scan_length).
236fn process_fields_multi_select(
237    data: &[u8],
238    delim: u8,
239    line_delim: u8,
240    ranges: &[Range],
241    suppress: bool,
242    out: &mut impl Write,
243) -> io::Result<()> {
244    let max_field = ranges.last().map_or(0, |r| r.end);
245
246    if data.len() >= PARALLEL_THRESHOLD {
247        let chunks = split_into_chunks(data, line_delim);
248        let results: Vec<Vec<u8>> = chunks
249            .par_iter()
250            .map(|chunk| {
251                let mut buf = Vec::with_capacity(chunk.len());
252                multi_select_chunk(
253                    chunk, delim, line_delim, ranges, max_field, suppress, &mut buf,
254                );
255                buf
256            })
257            .collect();
258        let slices: Vec<IoSlice> = results
259            .iter()
260            .filter(|r| !r.is_empty())
261            .map(|r| IoSlice::new(r))
262            .collect();
263        write_ioslices(out, &slices)?;
264    } else {
265        let mut buf = Vec::with_capacity(data.len());
266        multi_select_chunk(
267            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
268        );
269        if !buf.is_empty() {
270            out.write_all(&buf)?;
271        }
272    }
273    Ok(())
274}
275
276/// Process a chunk for multi-field extraction using a single-pass memchr2 scan.
277/// Scans for both delimiter and line_delim in one SIMD pass over the entire chunk,
278/// eliminating per-line memchr_iter setup overhead (significant for short lines).
279/// Delimiter positions are collected in a stack array per line.
280/// When max_field is reached on a line, remaining delimiters are ignored.
281fn multi_select_chunk(
282    data: &[u8],
283    delim: u8,
284    line_delim: u8,
285    ranges: &[Range],
286    max_field: usize,
287    suppress: bool,
288    buf: &mut Vec<u8>,
289) {
290    // When delim == line_delim, fall back to two-level approach
291    if delim == line_delim {
292        buf.reserve(data.len());
293        let base = data.as_ptr();
294        let mut start = 0;
295        for end_pos in memchr_iter(line_delim, data) {
296            let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
297            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
298            start = end_pos + 1;
299        }
300        if start < data.len() {
301            let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
302            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
303        }
304        return;
305    }
306
307    buf.reserve(data.len());
308    let base = data.as_ptr();
309    let data_len = data.len();
310
311    // Per-line state
312    let mut line_start: usize = 0;
313    let mut delim_pos = [0usize; 64];
314    let mut num_delims: usize = 0;
315    let max_delims = max_field.min(64);
316    let mut at_max = false;
317
318    // Single-pass scan using memchr2 for both delimiter and newline
319    for pos in memchr::memchr2_iter(delim, line_delim, data) {
320        let byte = unsafe { *base.add(pos) };
321
322        if byte == line_delim {
323            // End of line: extract fields from collected positions
324            let line_len = pos - line_start;
325            if num_delims == 0 {
326                // No delimiter in line
327                if !suppress {
328                    unsafe {
329                        buf_extend(
330                            buf,
331                            std::slice::from_raw_parts(base.add(line_start), line_len),
332                        );
333                        buf_push(buf, line_delim);
334                    }
335                }
336            } else {
337                // Extract fields using collected delimiter positions
338                let total_fields = num_delims + 1;
339                let mut first_output = true;
340
341                for r in ranges {
342                    let range_start = r.start;
343                    let range_end = r.end.min(total_fields);
344                    if range_start > total_fields {
345                        break;
346                    }
347                    for field_num in range_start..=range_end {
348                        if field_num > total_fields {
349                            break;
350                        }
351
352                        let field_start = if field_num == 1 {
353                            line_start
354                        } else if field_num - 2 < num_delims {
355                            delim_pos[field_num - 2] + 1
356                        } else {
357                            continue;
358                        };
359                        let field_end = if field_num <= num_delims {
360                            delim_pos[field_num - 1]
361                        } else {
362                            pos
363                        };
364
365                        if !first_output {
366                            unsafe { buf_push(buf, delim) };
367                        }
368                        unsafe {
369                            buf_extend(
370                                buf,
371                                std::slice::from_raw_parts(
372                                    base.add(field_start),
373                                    field_end - field_start,
374                                ),
375                            );
376                        }
377                        first_output = false;
378                    }
379                }
380
381                unsafe { buf_push(buf, line_delim) };
382            }
383
384            // Reset for next line
385            line_start = pos + 1;
386            num_delims = 0;
387            at_max = false;
388        } else {
389            // Delimiter found: collect position (up to max_field)
390            if !at_max && num_delims < max_delims {
391                delim_pos[num_delims] = pos;
392                num_delims += 1;
393                if num_delims >= max_delims {
394                    at_max = true;
395                }
396            }
397        }
398    }
399
400    // Handle last line without trailing line_delim
401    if line_start < data_len {
402        if num_delims == 0 {
403            if !suppress {
404                unsafe {
405                    buf_extend(
406                        buf,
407                        std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
408                    );
409                    buf_push(buf, line_delim);
410                }
411            }
412        } else {
413            let total_fields = num_delims + 1;
414            let mut first_output = true;
415
416            for r in ranges {
417                let range_start = r.start;
418                let range_end = r.end.min(total_fields);
419                if range_start > total_fields {
420                    break;
421                }
422                for field_num in range_start..=range_end {
423                    if field_num > total_fields {
424                        break;
425                    }
426
427                    let field_start = if field_num == 1 {
428                        line_start
429                    } else if field_num - 2 < num_delims {
430                        delim_pos[field_num - 2] + 1
431                    } else {
432                        continue;
433                    };
434                    let field_end = if field_num <= num_delims {
435                        delim_pos[field_num - 1]
436                    } else {
437                        data_len
438                    };
439
440                    if !first_output {
441                        unsafe { buf_push(buf, delim) };
442                    }
443                    unsafe {
444                        buf_extend(
445                            buf,
446                            std::slice::from_raw_parts(
447                                base.add(field_start),
448                                field_end - field_start,
449                            ),
450                        );
451                    }
452                    first_output = false;
453                }
454            }
455
456            unsafe { buf_push(buf, line_delim) };
457        }
458    }
459}
460
461/// Extract selected fields from a single line using delimiter position scanning.
462/// Scans delimiters only up to max_field (early exit), then extracts selected fields
463/// by indexing directly into the collected positions. Since ranges are pre-sorted and
464/// non-overlapping, every field within a range is selected — no is_selected check needed.
465#[inline(always)]
466fn multi_select_line(
467    line: &[u8],
468    delim: u8,
469    line_delim: u8,
470    ranges: &[Range],
471    max_field: usize,
472    suppress: bool,
473    buf: &mut Vec<u8>,
474) {
475    let len = line.len();
476    if len == 0 {
477        if !suppress {
478            unsafe { buf_push(buf, line_delim) };
479        }
480        return;
481    }
482
483    // Note: no per-line buf.reserve — multi_select_chunk already reserves data.len()
484    let base = line.as_ptr();
485
486    // Collect delimiter positions up to max_field (early exit).
487    // Stack array for up to 64 delimiter positions.
488    let mut delim_pos = [0usize; 64];
489    let mut num_delims: usize = 0;
490    let max_delims = max_field.min(64);
491
492    for pos in memchr_iter(delim, line) {
493        if num_delims < max_delims {
494            delim_pos[num_delims] = pos;
495            num_delims += 1;
496            if num_delims >= max_delims {
497                break;
498            }
499        }
500    }
501
502    if num_delims == 0 {
503        if !suppress {
504            unsafe {
505                buf_extend(buf, line);
506                buf_push(buf, line_delim);
507            }
508        }
509        return;
510    }
511
512    // Extract selected fields using delimiter positions.
513    // Ranges are pre-sorted and non-overlapping, so every field_num within a range
514    // is selected — skip the is_selected check entirely (saves 1 function call per field).
515    let total_fields = num_delims + 1;
516    let mut first_output = true;
517
518    for r in ranges {
519        let range_start = r.start;
520        let range_end = r.end.min(total_fields);
521        if range_start > total_fields {
522            break;
523        }
524        for field_num in range_start..=range_end {
525            if field_num > total_fields {
526                break;
527            }
528
529            let field_start = if field_num == 1 {
530                0
531            } else if field_num - 2 < num_delims {
532                delim_pos[field_num - 2] + 1
533            } else {
534                continue;
535            };
536            let field_end = if field_num <= num_delims {
537                delim_pos[field_num - 1]
538            } else {
539                len
540            };
541
542            if !first_output {
543                unsafe { buf_push(buf, delim) };
544            }
545            unsafe {
546                buf_extend(
547                    buf,
548                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
549                );
550            }
551            first_output = false;
552        }
553    }
554
555    unsafe { buf_push(buf, line_delim) };
556}
557
558// ── Fast path: field extraction with batched output ──────────────────────
559
560/// Optimized field extraction with early exit and batched output.
561fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
562    let delim = cfg.delim;
563    let line_delim = cfg.line_delim;
564    let ranges = cfg.ranges;
565    let complement = cfg.complement;
566    let output_delim = cfg.output_delim;
567    let suppress = cfg.suppress_no_delim;
568
569    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
570    // That scan was O(N) over the entire file just to check an edge case
571    // (no delimiter in any line). The per-line processing already handles
572    // lines without delimiters correctly, so the scan was pure overhead
573    // for files that DO contain delimiters (the common case).
574
575    // Ultra-fast path: single field extraction (e.g., cut -f5)
576    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
577        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
578    }
579
580    // Fast path: complement of single field or contiguous range with default output delimiter.
581    if complement
582        && ranges.len() == 1
583        && output_delim.len() == 1
584        && output_delim[0] == delim
585        && ranges[0].start == ranges[0].end
586    {
587        return process_complement_single_field(
588            data,
589            delim,
590            line_delim,
591            ranges[0].start,
592            suppress,
593            out,
594        );
595    }
596
597    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
598    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
599    if complement
600        && ranges.len() == 1
601        && ranges[0].start > 1
602        && ranges[0].end < usize::MAX
603        && output_delim.len() == 1
604        && output_delim[0] == delim
605    {
606        return process_complement_range(
607            data,
608            delim,
609            line_delim,
610            ranges[0].start,
611            ranges[0].end,
612            suppress,
613            out,
614        );
615    }
616
617    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
618    if !complement
619        && ranges.len() == 1
620        && ranges[0].start == 1
621        && output_delim.len() == 1
622        && output_delim[0] == delim
623        && ranges[0].end < usize::MAX
624    {
625        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
626    }
627
628    // Fast path: open-ended field range from field N (e.g., cut -f3-)
629    if !complement
630        && ranges.len() == 1
631        && ranges[0].end == usize::MAX
632        && ranges[0].start > 1
633        && output_delim.len() == 1
634        && output_delim[0] == delim
635    {
636        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
637    }
638
639    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
640    if !complement
641        && ranges.len() == 1
642        && ranges[0].start > 1
643        && ranges[0].end < usize::MAX
644        && output_delim.len() == 1
645        && output_delim[0] == delim
646    {
647        return process_fields_mid_range(
648            data,
649            delim,
650            line_delim,
651            ranges[0].start,
652            ranges[0].end,
653            suppress,
654            out,
655        );
656    }
657
658    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
659    // Uses delimiter position caching: find all delimiter positions per line,
660    // then directly index into them for each selected field.
661    // This is faster than the general extract_fields_to_buf which re-checks
662    // is_selected() for every field encountered.
663    if !complement
664        && ranges.len() > 1
665        && ranges.last().map_or(false, |r| r.end < usize::MAX)
666        && output_delim.len() == 1
667        && output_delim[0] == delim
668        && delim != line_delim
669    {
670        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
671    }
672
673    // General field extraction
674    let max_field = if complement {
675        usize::MAX
676    } else {
677        ranges.last().map(|r| r.end).unwrap_or(0)
678    };
679    let field_mask = compute_field_mask(ranges, complement);
680
681    if data.len() >= PARALLEL_THRESHOLD {
682        let chunks = split_into_chunks(data, line_delim);
683        let results: Vec<Vec<u8>> = chunks
684            .par_iter()
685            .map(|chunk| {
686                let mut buf = Vec::with_capacity(chunk.len());
687                process_fields_chunk(
688                    chunk,
689                    delim,
690                    ranges,
691                    output_delim,
692                    suppress,
693                    max_field,
694                    field_mask,
695                    line_delim,
696                    complement,
697                    &mut buf,
698                );
699                buf
700            })
701            .collect();
702        // Use write_vectored (writev) to batch N writes into fewer syscalls
703        let slices: Vec<IoSlice> = results
704            .iter()
705            .filter(|r| !r.is_empty())
706            .map(|r| IoSlice::new(r))
707            .collect();
708        write_ioslices(out, &slices)?;
709    } else {
710        let mut buf = Vec::with_capacity(data.len());
711        process_fields_chunk(
712            data,
713            delim,
714            ranges,
715            output_delim,
716            suppress,
717            max_field,
718            field_mask,
719            line_delim,
720            complement,
721            &mut buf,
722        );
723        if !buf.is_empty() {
724            out.write_all(&buf)?;
725        }
726    }
727    Ok(())
728}
729
730/// Process a chunk of data for general field extraction.
731/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
732/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
733/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
734fn process_fields_chunk(
735    data: &[u8],
736    delim: u8,
737    ranges: &[Range],
738    output_delim: &[u8],
739    suppress: bool,
740    max_field: usize,
741    field_mask: u64,
742    line_delim: u8,
743    complement: bool,
744    buf: &mut Vec<u8>,
745) {
746    // When delim != line_delim and max_field is bounded, use two-level approach:
747    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
748    // This avoids scanning past max_field on each line (significant for lines with
749    // many columns but small field selection like -f1,3,5 on 20-column CSV).
750    // For complement or unbounded ranges, use single-pass memchr2_iter which
751    // needs to process all delimiters anyway.
752    if delim != line_delim && max_field < usize::MAX && !complement {
753        buf.reserve(data.len());
754        let mut start = 0;
755        for end_pos in memchr_iter(line_delim, data) {
756            let line = &data[start..end_pos];
757            extract_fields_to_buf(
758                line,
759                delim,
760                ranges,
761                output_delim,
762                suppress,
763                max_field,
764                field_mask,
765                line_delim,
766                buf,
767                complement,
768            );
769            start = end_pos + 1;
770        }
771        if start < data.len() {
772            extract_fields_to_buf(
773                &data[start..],
774                delim,
775                ranges,
776                output_delim,
777                suppress,
778                max_field,
779                field_mask,
780                line_delim,
781                buf,
782                complement,
783            );
784        }
785        return;
786    }
787
788    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
789    // delimiter and line_delim in one SIMD scan.
790    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
791    if delim != line_delim {
792        buf.reserve(data.len());
793
794        let data_len = data.len();
795        let base = data.as_ptr();
796        let mut line_start: usize = 0;
797        let mut field_start: usize = 0;
798        let mut field_num: usize = 1;
799        let mut first_output = true;
800        let mut has_delim = false;
801
802        for pos in memchr::memchr2_iter(delim, line_delim, data) {
803            let byte = unsafe { *base.add(pos) };
804
805            if byte == line_delim {
806                // End of line: flush final field and emit line delimiter
807                if (field_num <= max_field || complement)
808                    && has_delim
809                    && is_selected(field_num, field_mask, ranges, complement)
810                {
811                    if !first_output {
812                        unsafe { buf_extend(buf, output_delim) };
813                    }
814                    unsafe {
815                        buf_extend(
816                            buf,
817                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
818                        )
819                    };
820                    first_output = false;
821                }
822
823                if !first_output {
824                    unsafe { buf_push(buf, line_delim) };
825                } else if !has_delim {
826                    if !suppress {
827                        unsafe {
828                            buf_extend(
829                                buf,
830                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
831                            );
832                            buf_push(buf, line_delim);
833                        }
834                    }
835                } else {
836                    unsafe { buf_push(buf, line_delim) };
837                }
838
839                // Reset state for next line
840                line_start = pos + 1;
841                field_start = pos + 1;
842                field_num = 1;
843                first_output = true;
844                has_delim = false;
845            } else {
846                // Field delimiter hit
847                has_delim = true;
848
849                if is_selected(field_num, field_mask, ranges, complement) {
850                    if !first_output {
851                        unsafe { buf_extend(buf, output_delim) };
852                    }
853                    unsafe {
854                        buf_extend(
855                            buf,
856                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
857                        )
858                    };
859                    first_output = false;
860                }
861
862                field_num += 1;
863                field_start = pos + 1;
864            }
865        }
866
867        // Handle last line without trailing line_delim
868        if line_start < data_len {
869            if line_start < data_len {
870                if (field_num <= max_field || complement)
871                    && has_delim
872                    && is_selected(field_num, field_mask, ranges, complement)
873                {
874                    if !first_output {
875                        unsafe { buf_extend(buf, output_delim) };
876                    }
877                    unsafe {
878                        buf_extend(
879                            buf,
880                            std::slice::from_raw_parts(
881                                base.add(field_start),
882                                data_len - field_start,
883                            ),
884                        )
885                    };
886                    first_output = false;
887                }
888
889                if !first_output {
890                    unsafe { buf_push(buf, line_delim) };
891                } else if !has_delim {
892                    if !suppress {
893                        unsafe {
894                            buf_extend(
895                                buf,
896                                std::slice::from_raw_parts(
897                                    base.add(line_start),
898                                    data_len - line_start,
899                                ),
900                            );
901                            buf_push(buf, line_delim);
902                        }
903                    }
904                } else {
905                    unsafe { buf_push(buf, line_delim) };
906                }
907            }
908        }
909
910        return;
911    }
912
913    // Fallback: when delim == line_delim, use the two-level scan approach
914    let mut start = 0;
915    for end_pos in memchr_iter(line_delim, data) {
916        let line = &data[start..end_pos];
917        extract_fields_to_buf(
918            line,
919            delim,
920            ranges,
921            output_delim,
922            suppress,
923            max_field,
924            field_mask,
925            line_delim,
926            buf,
927            complement,
928        );
929        start = end_pos + 1;
930    }
931    if start < data.len() {
932        extract_fields_to_buf(
933            &data[start..],
934            delim,
935            ranges,
936            output_delim,
937            suppress,
938            max_field,
939            field_mask,
940            line_delim,
941            buf,
942            complement,
943        );
944    }
945}
946
947// ── Ultra-fast single field extraction ───────────────────────────────────
948
949/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
950/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
951/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
952fn process_single_field(
953    data: &[u8],
954    delim: u8,
955    line_delim: u8,
956    target: usize,
957    suppress: bool,
958    out: &mut impl Write,
959) -> io::Result<()> {
960    let target_idx = target - 1;
961
962    // Combined SIMD scan: single pass using memchr2 for any target field.
963    if delim != line_delim {
964        if data.len() >= PARALLEL_THRESHOLD {
965            let chunks = split_into_chunks(data, line_delim);
966            let results: Vec<Vec<u8>> = chunks
967                .par_iter()
968                .map(|chunk| {
969                    let mut buf = Vec::with_capacity(chunk.len());
970                    process_nth_field_combined(
971                        chunk, delim, line_delim, target_idx, suppress, &mut buf,
972                    );
973                    buf
974                })
975                .collect();
976            // Use write_vectored (writev) to batch N writes into fewer syscalls
977            let slices: Vec<IoSlice> = results
978                .iter()
979                .filter(|r| !r.is_empty())
980                .map(|r| IoSlice::new(r))
981                .collect();
982            write_ioslices(out, &slices)?;
983        } else if target_idx == 0 && !suppress {
984            // Zero-copy fast path for field 1 (most common case):
985            // For each line, either truncate at the first delimiter, or pass through.
986            // Since most lines have a delimiter, and field 1 is a prefix of each line,
987            // we can write contiguous runs directly from the source data.
988            single_field1_zerocopy(data, delim, line_delim, out)?;
989        } else {
990            // Single-pass SIMD scan using memchr2_iter for both delimiter and
991            // line_delim simultaneously. For large files this is faster than the
992            // two-pass approach (outer newline scan + inner scalar field scan)
993            // because it processes the entire file in one SIMD sweep.
994            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
995            process_nth_field_combined(data, delim, line_delim, target_idx, suppress, &mut buf);
996            if !buf.is_empty() {
997                out.write_all(&buf)?;
998            }
999        }
1000        return Ok(());
1001    }
1002
1003    // Fallback for delim == line_delim: nested loop approach
1004    if data.len() >= PARALLEL_THRESHOLD {
1005        let chunks = split_into_chunks(data, line_delim);
1006        let results: Vec<Vec<u8>> = chunks
1007            .par_iter()
1008            .map(|chunk| {
1009                let mut buf = Vec::with_capacity(chunk.len() / 4);
1010                process_single_field_chunk(
1011                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
1012                );
1013                buf
1014            })
1015            .collect();
1016        // Use write_vectored (writev) to batch N writes into fewer syscalls
1017        let slices: Vec<IoSlice> = results
1018            .iter()
1019            .filter(|r| !r.is_empty())
1020            .map(|r| IoSlice::new(r))
1021            .collect();
1022        write_ioslices(out, &slices)?;
1023    } else {
1024        let mut buf = Vec::with_capacity(data.len() / 4);
1025        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1026        if !buf.is_empty() {
1027            out.write_all(&buf)?;
1028        }
1029    }
1030    Ok(())
1031}
1032
1033/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1034/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1035fn process_complement_range(
1036    data: &[u8],
1037    delim: u8,
1038    line_delim: u8,
1039    skip_start: usize,
1040    skip_end: usize,
1041    suppress: bool,
1042    out: &mut impl Write,
1043) -> io::Result<()> {
1044    if data.len() >= PARALLEL_THRESHOLD {
1045        let chunks = split_into_chunks(data, line_delim);
1046        let results: Vec<Vec<u8>> = chunks
1047            .par_iter()
1048            .map(|chunk| {
1049                let mut buf = Vec::with_capacity(chunk.len());
1050                complement_range_chunk(
1051                    chunk, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1052                );
1053                buf
1054            })
1055            .collect();
1056        let slices: Vec<IoSlice> = results
1057            .iter()
1058            .filter(|r| !r.is_empty())
1059            .map(|r| IoSlice::new(r))
1060            .collect();
1061        write_ioslices(out, &slices)?;
1062    } else {
1063        let mut buf = Vec::with_capacity(data.len());
1064        complement_range_chunk(
1065            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1066        );
1067        if !buf.is_empty() {
1068            out.write_all(&buf)?;
1069        }
1070    }
1071    Ok(())
1072}
1073
1074/// Process a chunk for complement range extraction.
1075fn complement_range_chunk(
1076    data: &[u8],
1077    delim: u8,
1078    skip_start: usize,
1079    skip_end: usize,
1080    line_delim: u8,
1081    suppress: bool,
1082    buf: &mut Vec<u8>,
1083) {
1084    let mut start = 0;
1085    for end_pos in memchr_iter(line_delim, data) {
1086        let line = &data[start..end_pos];
1087        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1088        start = end_pos + 1;
1089    }
1090    if start < data.len() {
1091        complement_range_line(
1092            &data[start..],
1093            delim,
1094            skip_start,
1095            skip_end,
1096            line_delim,
1097            suppress,
1098            buf,
1099        );
1100    }
1101}
1102
1103/// Extract all fields except skip_start..=skip_end from one line.
1104/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1105///
1106/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1107/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1108/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1109#[inline(always)]
1110fn complement_range_line(
1111    line: &[u8],
1112    delim: u8,
1113    skip_start: usize,
1114    skip_end: usize,
1115    line_delim: u8,
1116    suppress: bool,
1117    buf: &mut Vec<u8>,
1118) {
1119    let len = line.len();
1120    if len == 0 {
1121        if !suppress {
1122            buf.push(line_delim);
1123        }
1124        return;
1125    }
1126
1127    buf.reserve(len + 1);
1128    let base = line.as_ptr();
1129
1130    // 1-based field numbers. To skip fields skip_start..=skip_end:
1131    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1132    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1133    //
1134    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1135    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1136
1137    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1138    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1139    let total_need = need_prefix_delims + need_skip_delims;
1140
1141    // Find delimiter positions up to total_need
1142    let mut delim_count: usize = 0;
1143    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1144    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1145
1146    for pos in memchr_iter(delim, line) {
1147        delim_count += 1;
1148        if delim_count == need_prefix_delims {
1149            prefix_end_pos = pos;
1150        }
1151        if delim_count == total_need {
1152            suffix_start_pos = pos + 1;
1153            break;
1154        }
1155    }
1156
1157    if delim_count == 0 {
1158        // No delimiter at all
1159        if !suppress {
1160            unsafe {
1161                buf_extend(buf, line);
1162                buf_push(buf, line_delim);
1163            }
1164        }
1165        return;
1166    }
1167
1168    // Case analysis:
1169    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1170    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1171    // 3. Enough to reach skip_end: prefix + delim + suffix
1172
1173    if delim_count < need_prefix_delims {
1174        // Not enough fields to reach skip region — output entire line
1175        unsafe {
1176            buf_extend(buf, line);
1177            buf_push(buf, line_delim);
1178        }
1179        return;
1180    }
1181
1182    let has_prefix = need_prefix_delims > 0;
1183    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1184
1185    if has_prefix && has_suffix {
1186        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1187        unsafe {
1188            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1189            buf_push(buf, delim);
1190            buf_extend(
1191                buf,
1192                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1193            );
1194            buf_push(buf, line_delim);
1195        }
1196    } else if has_prefix {
1197        // Only prefix, no suffix (skip region extends to end of line)
1198        unsafe {
1199            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1200            buf_push(buf, line_delim);
1201        }
1202    } else if has_suffix {
1203        // No prefix (skip_start == 1), only suffix
1204        unsafe {
1205            buf_extend(
1206                buf,
1207                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1208            );
1209            buf_push(buf, line_delim);
1210        }
1211    } else {
1212        // All fields skipped
1213        unsafe { buf_push(buf, line_delim) };
1214    }
1215}
1216
1217/// Complement single-field extraction: skip one field, output rest unchanged.
1218fn process_complement_single_field(
1219    data: &[u8],
1220    delim: u8,
1221    line_delim: u8,
1222    skip_field: usize,
1223    suppress: bool,
1224    out: &mut impl Write,
1225) -> io::Result<()> {
1226    let skip_idx = skip_field - 1;
1227
1228    if data.len() >= PARALLEL_THRESHOLD {
1229        let chunks = split_into_chunks(data, line_delim);
1230        let results: Vec<Vec<u8>> = chunks
1231            .par_iter()
1232            .map(|chunk| {
1233                let mut buf = Vec::with_capacity(chunk.len());
1234                complement_single_field_chunk(
1235                    chunk, delim, skip_idx, line_delim, suppress, &mut buf,
1236                );
1237                buf
1238            })
1239            .collect();
1240        // Use write_vectored (writev) to batch N writes into fewer syscalls
1241        let slices: Vec<IoSlice> = results
1242            .iter()
1243            .filter(|r| !r.is_empty())
1244            .map(|r| IoSlice::new(r))
1245            .collect();
1246        write_ioslices(out, &slices)?;
1247    } else {
1248        let mut buf = Vec::with_capacity(data.len());
1249        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1250        if !buf.is_empty() {
1251            out.write_all(&buf)?;
1252        }
1253    }
1254    Ok(())
1255}
1256
1257/// Process a chunk for complement single-field extraction.
1258fn complement_single_field_chunk(
1259    data: &[u8],
1260    delim: u8,
1261    skip_idx: usize,
1262    line_delim: u8,
1263    suppress: bool,
1264    buf: &mut Vec<u8>,
1265) {
1266    let mut start = 0;
1267    for end_pos in memchr_iter(line_delim, data) {
1268        let line = &data[start..end_pos];
1269        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1270        start = end_pos + 1;
1271    }
1272    if start < data.len() {
1273        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
1274    }
1275}
1276
1277/// Extract all fields except skip_idx from one line.
1278/// Optimized: finds only the delimiters bounding the skip field (skip_idx-th
1279/// and (skip_idx+1)-th delimiters), then copies prefix + suffix in 2 bulk
1280/// copies instead of iterating through all fields.
1281#[inline(always)]
1282fn complement_single_field_line(
1283    line: &[u8],
1284    delim: u8,
1285    skip_idx: usize,
1286    line_delim: u8,
1287    suppress: bool,
1288    buf: &mut Vec<u8>,
1289) {
1290    let len = line.len();
1291    if len == 0 {
1292        if !suppress {
1293            buf.push(line_delim);
1294        }
1295        return;
1296    }
1297
1298    buf.reserve(len + 1);
1299    let base = line.as_ptr();
1300
1301    // Find the delimiters bounding the skip field:
1302    // - We need skip_idx delimiters to find where the skip field starts
1303    // - We need one more delimiter to find where it ends
1304    // For skip_idx == 0 (skip field 1): skip field starts at 0, ends at first delimiter
1305    // For skip_idx == 1 (skip field 2): skip field starts after 1st delim, ends at 2nd delim
1306    let need_before = skip_idx; // delimiters before skip field
1307    let need_total = skip_idx + 1; // delimiters to find end of skip field
1308
1309    let mut delim_count: usize = 0;
1310    let mut skip_start_pos: usize = 0; // byte start of skip field
1311    let mut skip_end_pos: usize = len; // byte position of delimiter after skip field (or EOL)
1312    let mut found_end = false;
1313
1314    for pos in memchr_iter(delim, line) {
1315        delim_count += 1;
1316        if delim_count == need_before {
1317            skip_start_pos = pos + 1;
1318        }
1319        if delim_count == need_total {
1320            skip_end_pos = pos;
1321            found_end = true;
1322            break;
1323        }
1324    }
1325
1326    if delim_count == 0 {
1327        // No delimiter in line
1328        if !suppress {
1329            unsafe {
1330                buf_extend(buf, line);
1331                buf_push(buf, line_delim);
1332            }
1333        }
1334        return;
1335    }
1336
1337    // Not enough delimiters to reach the skip field: output entire line
1338    if delim_count < need_before {
1339        unsafe {
1340            buf_extend(buf, line);
1341            buf_push(buf, line_delim);
1342        }
1343        return;
1344    }
1345
1346    // skip field is at positions skip_start_pos..skip_end_pos
1347    // Output prefix (before skip field) + suffix (after skip field)
1348    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
1349    let has_suffix = found_end && skip_end_pos < len;
1350
1351    if has_prefix && has_suffix {
1352        // prefix = line[0..skip_start_pos-1] (before the delimiter that starts skip field)
1353        // suffix = line[skip_end_pos+1..] (after the delimiter that ends skip field)
1354        unsafe {
1355            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1356            buf_push(buf, delim);
1357            buf_extend(
1358                buf,
1359                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1360            );
1361            buf_push(buf, line_delim);
1362        }
1363    } else if has_prefix {
1364        // Only prefix (skip field is the last field)
1365        unsafe {
1366            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1367            buf_push(buf, line_delim);
1368        }
1369    } else if has_suffix {
1370        // No prefix (skip field is the first field)
1371        unsafe {
1372            buf_extend(
1373                buf,
1374                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1375            );
1376            buf_push(buf, line_delim);
1377        }
1378    } else {
1379        // Skip field is the only field (or entire line)
1380        unsafe { buf_push(buf, line_delim) };
1381    }
1382}
1383
1384/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1385/// Zero-copy for the non-parallel path: identifies the truncation point per line
1386/// and writes contiguous runs directly from the source data.
1387fn process_fields_prefix(
1388    data: &[u8],
1389    delim: u8,
1390    line_delim: u8,
1391    last_field: usize,
1392    suppress: bool,
1393    out: &mut impl Write,
1394) -> io::Result<()> {
1395    if data.len() >= PARALLEL_THRESHOLD {
1396        let chunks = split_into_chunks(data, line_delim);
1397        let results: Vec<Vec<u8>> = chunks
1398            .par_iter()
1399            .map(|chunk| {
1400                let mut buf = Vec::with_capacity(chunk.len());
1401                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
1402                buf
1403            })
1404            .collect();
1405        // Use write_vectored (writev) to batch N writes into fewer syscalls
1406        let slices: Vec<IoSlice> = results
1407            .iter()
1408            .filter(|r| !r.is_empty())
1409            .map(|r| IoSlice::new(r))
1410            .collect();
1411        write_ioslices(out, &slices)?;
1412    } else if !suppress {
1413        // Zero-copy fast path: scan for truncation points, write runs from source.
1414        // When suppress is false, every line is output (with or without delimiter).
1415        // Most lines have enough fields, so the output is often identical to input.
1416        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1417    } else {
1418        let mut buf = Vec::with_capacity(data.len());
1419        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1420        if !buf.is_empty() {
1421            out.write_all(&buf)?;
1422        }
1423    }
1424    Ok(())
1425}
1426
1427/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1428/// directly into the source data, flushing in MAX_IOV-sized batches.
1429/// For lines where the Nth delimiter exists, we truncate at that point.
1430/// For lines with fewer fields, we output them unchanged (contiguous run).
1431/// Lines without any delimiter are output unchanged (suppress=false assumed).
1432#[inline]
1433fn fields_prefix_zerocopy(
1434    data: &[u8],
1435    delim: u8,
1436    line_delim: u8,
1437    last_field: usize,
1438    out: &mut impl Write,
1439) -> io::Result<()> {
1440    let newline_buf: [u8; 1] = [line_delim];
1441    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1442    let mut start = 0;
1443    let mut run_start: usize = 0;
1444
1445    for end_pos in memchr_iter(line_delim, data) {
1446        let line = &data[start..end_pos];
1447        let mut field_count = 1;
1448        let mut truncate_at: Option<usize> = None;
1449        for dpos in memchr_iter(delim, line) {
1450            if field_count >= last_field {
1451                truncate_at = Some(start + dpos);
1452                break;
1453            }
1454            field_count += 1;
1455        }
1456
1457        if let Some(trunc_pos) = truncate_at {
1458            if run_start < start {
1459                iov.push(IoSlice::new(&data[run_start..start]));
1460            }
1461            iov.push(IoSlice::new(&data[start..trunc_pos]));
1462            iov.push(IoSlice::new(&newline_buf));
1463            run_start = end_pos + 1;
1464
1465            if iov.len() >= MAX_IOV - 2 {
1466                write_ioslices(out, &iov)?;
1467                iov.clear();
1468            }
1469        }
1470        start = end_pos + 1;
1471    }
1472    // Handle last line without terminator
1473    if start < data.len() {
1474        let line = &data[start..];
1475        let mut field_count = 1;
1476        let mut truncate_at: Option<usize> = None;
1477        for dpos in memchr_iter(delim, line) {
1478            if field_count >= last_field {
1479                truncate_at = Some(start + dpos);
1480                break;
1481            }
1482            field_count += 1;
1483        }
1484        if let Some(trunc_pos) = truncate_at {
1485            if run_start < start {
1486                iov.push(IoSlice::new(&data[run_start..start]));
1487            }
1488            iov.push(IoSlice::new(&data[start..trunc_pos]));
1489            iov.push(IoSlice::new(&newline_buf));
1490            if !iov.is_empty() {
1491                write_ioslices(out, &iov)?;
1492            }
1493            return Ok(());
1494        }
1495    }
1496    // Flush remaining contiguous run
1497    if run_start < data.len() {
1498        iov.push(IoSlice::new(&data[run_start..]));
1499        if !data.is_empty() && *data.last().unwrap() != line_delim {
1500            iov.push(IoSlice::new(&newline_buf));
1501        }
1502    }
1503    if !iov.is_empty() {
1504        write_ioslices(out, &iov)?;
1505    }
1506    Ok(())
1507}
1508
1509/// Process a chunk for contiguous from-start field range extraction.
1510fn fields_prefix_chunk(
1511    data: &[u8],
1512    delim: u8,
1513    line_delim: u8,
1514    last_field: usize,
1515    suppress: bool,
1516    buf: &mut Vec<u8>,
1517) {
1518    let mut start = 0;
1519    for end_pos in memchr_iter(line_delim, data) {
1520        let line = &data[start..end_pos];
1521        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1522        start = end_pos + 1;
1523    }
1524    if start < data.len() {
1525        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1526    }
1527}
1528
1529/// Extract first N fields from one line (contiguous from-start range).
1530/// Uses memchr SIMD for delimiter scanning on all line sizes.
1531#[inline(always)]
1532fn fields_prefix_line(
1533    line: &[u8],
1534    delim: u8,
1535    line_delim: u8,
1536    last_field: usize,
1537    suppress: bool,
1538    buf: &mut Vec<u8>,
1539) {
1540    let len = line.len();
1541    if len == 0 {
1542        if !suppress {
1543            buf.push(line_delim);
1544        }
1545        return;
1546    }
1547
1548    buf.reserve(len + 1);
1549    let base = line.as_ptr();
1550
1551    let mut field_count = 1usize;
1552    let mut has_delim = false;
1553
1554    for pos in memchr_iter(delim, line) {
1555        has_delim = true;
1556        if field_count >= last_field {
1557            unsafe {
1558                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1559                buf_push(buf, line_delim);
1560            }
1561            return;
1562        }
1563        field_count += 1;
1564    }
1565
1566    if !has_delim {
1567        if !suppress {
1568            unsafe {
1569                buf_extend(buf, line);
1570                buf_push(buf, line_delim);
1571            }
1572        }
1573        return;
1574    }
1575
1576    unsafe {
1577        buf_extend(buf, line);
1578        buf_push(buf, line_delim);
1579    }
1580}
1581
1582/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1583fn process_fields_suffix(
1584    data: &[u8],
1585    delim: u8,
1586    line_delim: u8,
1587    start_field: usize,
1588    suppress: bool,
1589    out: &mut impl Write,
1590) -> io::Result<()> {
1591    if data.len() >= PARALLEL_THRESHOLD {
1592        let chunks = split_into_chunks(data, line_delim);
1593        let results: Vec<Vec<u8>> = chunks
1594            .par_iter()
1595            .map(|chunk| {
1596                let mut buf = Vec::with_capacity(chunk.len());
1597                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
1598                buf
1599            })
1600            .collect();
1601        // Use write_vectored (writev) to batch N writes into fewer syscalls
1602        let slices: Vec<IoSlice> = results
1603            .iter()
1604            .filter(|r| !r.is_empty())
1605            .map(|r| IoSlice::new(r))
1606            .collect();
1607        write_ioslices(out, &slices)?;
1608    } else {
1609        let mut buf = Vec::with_capacity(data.len());
1610        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1611        if !buf.is_empty() {
1612            out.write_all(&buf)?;
1613        }
1614    }
1615    Ok(())
1616}
1617
1618/// Process a chunk for open-ended field suffix extraction.
1619fn fields_suffix_chunk(
1620    data: &[u8],
1621    delim: u8,
1622    line_delim: u8,
1623    start_field: usize,
1624    suppress: bool,
1625    buf: &mut Vec<u8>,
1626) {
1627    let mut start = 0;
1628    for end_pos in memchr_iter(line_delim, data) {
1629        let line = &data[start..end_pos];
1630        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1631        start = end_pos + 1;
1632    }
1633    if start < data.len() {
1634        fields_suffix_line(
1635            &data[start..],
1636            delim,
1637            line_delim,
1638            start_field,
1639            suppress,
1640            buf,
1641        );
1642    }
1643}
1644
1645/// Extract fields from start_field to end from one line.
1646/// Uses memchr SIMD for delimiter scanning on all line sizes.
1647#[inline(always)]
1648fn fields_suffix_line(
1649    line: &[u8],
1650    delim: u8,
1651    line_delim: u8,
1652    start_field: usize,
1653    suppress: bool,
1654    buf: &mut Vec<u8>,
1655) {
1656    let len = line.len();
1657    if len == 0 {
1658        if !suppress {
1659            buf.push(line_delim);
1660        }
1661        return;
1662    }
1663
1664    buf.reserve(len + 1);
1665    let base = line.as_ptr();
1666
1667    let skip_delims = start_field - 1;
1668    let mut delim_count = 0usize;
1669    let mut has_delim = false;
1670
1671    for pos in memchr_iter(delim, line) {
1672        has_delim = true;
1673        delim_count += 1;
1674        if delim_count >= skip_delims {
1675            unsafe {
1676                buf_extend(
1677                    buf,
1678                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1679                );
1680                buf_push(buf, line_delim);
1681            }
1682            return;
1683        }
1684    }
1685
1686    if !has_delim {
1687        if !suppress {
1688            unsafe {
1689                buf_extend(buf, line);
1690                buf_push(buf, line_delim);
1691            }
1692        }
1693        return;
1694    }
1695
1696    // Fewer delimiters than needed
1697    unsafe { buf_push(buf, line_delim) };
1698}
1699
1700/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1701/// Optimized: skip to start_field using memchr, then output until end_field.
1702fn process_fields_mid_range(
1703    data: &[u8],
1704    delim: u8,
1705    line_delim: u8,
1706    start_field: usize,
1707    end_field: usize,
1708    suppress: bool,
1709    out: &mut impl Write,
1710) -> io::Result<()> {
1711    if data.len() >= PARALLEL_THRESHOLD {
1712        let chunks = split_into_chunks(data, line_delim);
1713        let results: Vec<Vec<u8>> = chunks
1714            .par_iter()
1715            .map(|chunk| {
1716                let mut buf = Vec::with_capacity(chunk.len());
1717                fields_mid_range_chunk(
1718                    chunk,
1719                    delim,
1720                    line_delim,
1721                    start_field,
1722                    end_field,
1723                    suppress,
1724                    &mut buf,
1725                );
1726                buf
1727            })
1728            .collect();
1729        let slices: Vec<IoSlice> = results
1730            .iter()
1731            .filter(|r| !r.is_empty())
1732            .map(|r| IoSlice::new(r))
1733            .collect();
1734        write_ioslices(out, &slices)?;
1735    } else {
1736        let mut buf = Vec::with_capacity(data.len());
1737        fields_mid_range_chunk(
1738            data,
1739            delim,
1740            line_delim,
1741            start_field,
1742            end_field,
1743            suppress,
1744            &mut buf,
1745        );
1746        if !buf.is_empty() {
1747            out.write_all(&buf)?;
1748        }
1749    }
1750    Ok(())
1751}
1752
1753/// Process a chunk for contiguous mid-range field extraction.
1754fn fields_mid_range_chunk(
1755    data: &[u8],
1756    delim: u8,
1757    line_delim: u8,
1758    start_field: usize,
1759    end_field: usize,
1760    suppress: bool,
1761    buf: &mut Vec<u8>,
1762) {
1763    let mut start = 0;
1764    for end_pos in memchr_iter(line_delim, data) {
1765        let line = &data[start..end_pos];
1766        fields_mid_range_line(
1767            line,
1768            delim,
1769            line_delim,
1770            start_field,
1771            end_field,
1772            suppress,
1773            buf,
1774        );
1775        start = end_pos + 1;
1776    }
1777    if start < data.len() {
1778        fields_mid_range_line(
1779            &data[start..],
1780            delim,
1781            line_delim,
1782            start_field,
1783            end_field,
1784            suppress,
1785            buf,
1786        );
1787    }
1788}
1789
1790/// Extract fields start_field..=end_field from one line.
1791/// Uses scalar byte scanning for short lines, memchr_iter for longer.
1792/// Raw pointer arithmetic to eliminate bounds checking.
1793#[inline(always)]
1794fn fields_mid_range_line(
1795    line: &[u8],
1796    delim: u8,
1797    line_delim: u8,
1798    start_field: usize,
1799    end_field: usize,
1800    suppress: bool,
1801    buf: &mut Vec<u8>,
1802) {
1803    let len = line.len();
1804    if len == 0 {
1805        if !suppress {
1806            buf.push(line_delim);
1807        }
1808        return;
1809    }
1810
1811    buf.reserve(len + 1);
1812    let base = line.as_ptr();
1813
1814    // Count delimiters to find start_field and end_field boundaries
1815    let skip_before = start_field - 1; // delimiters to skip before start_field
1816    let field_span = end_field - start_field; // additional delimiters within the range
1817    let target_end_delim = skip_before + field_span + 1;
1818    let mut delim_count = 0;
1819    let mut range_start = 0;
1820    let mut has_delim = false;
1821
1822    for pos in memchr_iter(delim, line) {
1823        has_delim = true;
1824        delim_count += 1;
1825        if delim_count == skip_before {
1826            range_start = pos + 1;
1827        }
1828        if delim_count == target_end_delim {
1829            if skip_before == 0 {
1830                range_start = 0;
1831            }
1832            unsafe {
1833                buf_extend(
1834                    buf,
1835                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
1836                );
1837                buf_push(buf, line_delim);
1838            }
1839            return;
1840        }
1841    }
1842
1843    if !has_delim {
1844        if !suppress {
1845            unsafe {
1846                buf_extend(buf, line);
1847                buf_push(buf, line_delim);
1848            }
1849        }
1850        return;
1851    }
1852
1853    // Line has delimiters but fewer fields than end_field
1854    if delim_count >= skip_before {
1855        // We have at least start_field, output from range_start to end
1856        if skip_before == 0 {
1857            range_start = 0;
1858        }
1859        unsafe {
1860            buf_extend(
1861                buf,
1862                std::slice::from_raw_parts(base.add(range_start), len - range_start),
1863            );
1864            buf_push(buf, line_delim);
1865        }
1866    } else {
1867        // Not enough fields even for start_field — output empty line
1868        unsafe { buf_push(buf, line_delim) };
1869    }
1870}
1871
1872/// Combined SIMD scan for arbitrary single field extraction.
1873/// Uses memchr2_iter(delim, line_delim) to scan for both bytes in a single SIMD pass.
1874/// This is faster than the nested approach (outer: find newlines, inner: find delimiters)
1875/// because it eliminates one full SIMD scan and improves cache locality.
1876///
1877/// For target_idx == 0 (field 1), after finding the target field we skip remaining
1878/// delimiters on the line by scanning directly for line_delim.
1879fn process_nth_field_combined(
1880    data: &[u8],
1881    delim: u8,
1882    line_delim: u8,
1883    target_idx: usize,
1884    suppress: bool,
1885    buf: &mut Vec<u8>,
1886) {
1887    buf.reserve(data.len());
1888
1889    let data_len = data.len();
1890    let base = data.as_ptr();
1891    let mut line_start: usize = 0;
1892    let mut field_start: usize = 0;
1893    let mut field_idx: usize = 0;
1894    let mut has_delim = false;
1895    let mut emitted = false;
1896
1897    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1898        let byte = unsafe { *base.add(pos) };
1899
1900        if byte == line_delim {
1901            // End of line
1902            if !emitted {
1903                if has_delim && field_idx == target_idx {
1904                    // Last field matches target
1905                    unsafe {
1906                        buf_extend(
1907                            buf,
1908                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
1909                        );
1910                        buf_push(buf, line_delim);
1911                    }
1912                } else if has_delim {
1913                    // Target field doesn't exist (fewer fields)
1914                    unsafe {
1915                        buf_push(buf, line_delim);
1916                    }
1917                } else if !suppress {
1918                    // No delimiter in line — output unchanged
1919                    unsafe {
1920                        buf_extend(
1921                            buf,
1922                            std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1923                        );
1924                        buf_push(buf, line_delim);
1925                    }
1926                }
1927            }
1928            // Reset for next line
1929            line_start = pos + 1;
1930            field_start = pos + 1;
1931            field_idx = 0;
1932            has_delim = false;
1933            emitted = false;
1934        } else {
1935            // Delimiter found
1936            has_delim = true;
1937            if field_idx == target_idx {
1938                unsafe {
1939                    buf_extend(
1940                        buf,
1941                        std::slice::from_raw_parts(base.add(field_start), pos - field_start),
1942                    );
1943                    buf_push(buf, line_delim);
1944                }
1945                emitted = true;
1946            }
1947            field_idx += 1;
1948            field_start = pos + 1;
1949        }
1950    }
1951
1952    // Handle last line without trailing newline
1953    if line_start < data_len && !emitted {
1954        if has_delim && field_idx == target_idx {
1955            unsafe {
1956                buf_extend(
1957                    buf,
1958                    std::slice::from_raw_parts(base.add(field_start), data_len - field_start),
1959                );
1960                buf_push(buf, line_delim);
1961            }
1962        } else if has_delim {
1963            unsafe {
1964                buf_push(buf, line_delim);
1965            }
1966        } else if !suppress {
1967            unsafe {
1968                buf_extend(
1969                    buf,
1970                    std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
1971                );
1972                buf_push(buf, line_delim);
1973            }
1974        }
1975    }
1976}
1977
1978/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
1979/// directly into the source data, flushing in MAX_IOV-sized batches.
1980/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
1981/// Uses memchr2 to scan for both delimiter and line terminator in a single SIMD pass.
1982///
1983/// For lines without delimiter: the entire line (including its newline) is a single
1984/// contiguous IoSlice. For lines with delimiter: two IoSlices (field1 data + newline byte).
1985/// Contiguous runs of unmodified lines are coalesced into a single IoSlice.
1986#[inline]
1987fn single_field1_zerocopy(
1988    data: &[u8],
1989    delim: u8,
1990    line_delim: u8,
1991    out: &mut impl Write,
1992) -> io::Result<()> {
1993    // Static newline byte for IoSlice references
1994    let newline_buf: [u8; 1] = [line_delim];
1995
1996    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1997    let mut line_start: usize = 0;
1998    let mut run_start: usize = 0;
1999    let mut first_delim: Option<usize> = None;
2000
2001    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2002        let byte = unsafe { *data.get_unchecked(pos) };
2003
2004        if byte == line_delim {
2005            // End of line
2006            if let Some(dp) = first_delim {
2007                // Line has delimiter — truncate at first delimiter.
2008                // Flush current contiguous run, then add truncated line + newline.
2009                if run_start < line_start {
2010                    iov.push(IoSlice::new(&data[run_start..line_start]));
2011                }
2012                iov.push(IoSlice::new(&data[line_start..dp]));
2013                iov.push(IoSlice::new(&newline_buf));
2014                run_start = pos + 1;
2015
2016                // Flush when we're near the iovec limit
2017                if iov.len() >= MAX_IOV - 2 {
2018                    write_ioslices(out, &iov)?;
2019                    iov.clear();
2020                }
2021            }
2022            // else: no delimiter in line, output unchanged (stays in run)
2023            line_start = pos + 1;
2024            first_delim = None;
2025        } else {
2026            // Delimiter found
2027            if first_delim.is_none() {
2028                first_delim = Some(pos);
2029            }
2030        }
2031    }
2032
2033    // Handle last line (no trailing line_delim)
2034    if line_start < data.len() {
2035        if let Some(dp) = first_delim {
2036            if run_start < line_start {
2037                iov.push(IoSlice::new(&data[run_start..line_start]));
2038            }
2039            iov.push(IoSlice::new(&data[line_start..dp]));
2040            iov.push(IoSlice::new(&newline_buf));
2041            if !iov.is_empty() {
2042                write_ioslices(out, &iov)?;
2043            }
2044            return Ok(());
2045        }
2046    }
2047
2048    // Flush remaining contiguous run
2049    if run_start < data.len() {
2050        iov.push(IoSlice::new(&data[run_start..]));
2051        if !data.is_empty() && *data.last().unwrap() != line_delim {
2052            iov.push(IoSlice::new(&newline_buf));
2053        }
2054    }
2055    if !iov.is_empty() {
2056        write_ioslices(out, &iov)?;
2057    }
2058    Ok(())
2059}
2060
2061/// Process a chunk of data for single-field extraction.
2062fn process_single_field_chunk(
2063    data: &[u8],
2064    delim: u8,
2065    target_idx: usize,
2066    line_delim: u8,
2067    suppress: bool,
2068    buf: &mut Vec<u8>,
2069) {
2070    let mut start = 0;
2071    for end_pos in memchr_iter(line_delim, data) {
2072        let line = &data[start..end_pos];
2073        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
2074        start = end_pos + 1;
2075    }
2076    if start < data.len() {
2077        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
2078    }
2079}
2080
2081/// Extract a single field from one line.
2082/// For short lines (< 256 bytes), uses direct scalar scanning to avoid memchr overhead.
2083/// For longer lines, uses memchr for SIMD-accelerated scanning.
2084/// Raw pointer arithmetic eliminates per-field bounds checking.
2085#[inline(always)]
2086fn extract_single_field_line(
2087    line: &[u8],
2088    delim: u8,
2089    target_idx: usize,
2090    line_delim: u8,
2091    suppress: bool,
2092    buf: &mut Vec<u8>,
2093) {
2094    let len = line.len();
2095    if len == 0 {
2096        if !suppress {
2097            buf.push(line_delim);
2098        }
2099        return;
2100    }
2101
2102    // Ensure capacity for worst case (full line + newline)
2103    buf.reserve(len + 1);
2104
2105    let base = line.as_ptr();
2106
2107    // Ultra-fast path for first field: single memchr
2108    if target_idx == 0 {
2109        match memchr::memchr(delim, line) {
2110            Some(pos) => unsafe {
2111                buf_extend(buf, std::slice::from_raw_parts(base, pos));
2112                buf_push(buf, line_delim);
2113            },
2114            None => {
2115                if !suppress {
2116                    unsafe {
2117                        buf_extend(buf, line);
2118                        buf_push(buf, line_delim);
2119                    }
2120                }
2121            }
2122        }
2123        return;
2124    }
2125
2126    // Use memchr SIMD for all line sizes (faster than scalar even for short lines)
2127    let mut field_start = 0;
2128    let mut field_idx = 0;
2129    let mut has_delim = false;
2130
2131    for pos in memchr_iter(delim, line) {
2132        has_delim = true;
2133        if field_idx == target_idx {
2134            unsafe {
2135                buf_extend(
2136                    buf,
2137                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
2138                );
2139                buf_push(buf, line_delim);
2140            }
2141            return;
2142        }
2143        field_idx += 1;
2144        field_start = pos + 1;
2145    }
2146
2147    if !has_delim {
2148        if !suppress {
2149            unsafe {
2150                buf_extend(buf, line);
2151                buf_push(buf, line_delim);
2152            }
2153        }
2154        return;
2155    }
2156
2157    if field_idx == target_idx {
2158        unsafe {
2159            buf_extend(
2160                buf,
2161                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2162            );
2163            buf_push(buf, line_delim);
2164        }
2165    } else {
2166        unsafe { buf_push(buf, line_delim) };
2167    }
2168}
2169
2170/// Extract fields from a single line into the output buffer.
2171/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2172/// Raw pointer arithmetic eliminates per-field bounds checking.
2173#[inline(always)]
2174fn extract_fields_to_buf(
2175    line: &[u8],
2176    delim: u8,
2177    ranges: &[Range],
2178    output_delim: &[u8],
2179    suppress: bool,
2180    max_field: usize,
2181    field_mask: u64,
2182    line_delim: u8,
2183    buf: &mut Vec<u8>,
2184    complement: bool,
2185) {
2186    let len = line.len();
2187
2188    if len == 0 {
2189        if !suppress {
2190            buf.push(line_delim);
2191        }
2192        return;
2193    }
2194
2195    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2196    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2197    let needed = len + output_delim.len() * 16 + 1;
2198    if buf.capacity() - buf.len() < needed {
2199        buf.reserve(needed);
2200    }
2201
2202    let base = line.as_ptr();
2203    let mut field_num: usize = 1;
2204    let mut field_start: usize = 0;
2205    let mut first_output = true;
2206    let mut has_delim = false;
2207
2208    // Use memchr SIMD for all line sizes
2209    for delim_pos in memchr_iter(delim, line) {
2210        has_delim = true;
2211
2212        if is_selected(field_num, field_mask, ranges, complement) {
2213            if !first_output {
2214                unsafe { buf_extend(buf, output_delim) };
2215            }
2216            unsafe {
2217                buf_extend(
2218                    buf,
2219                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2220                )
2221            };
2222            first_output = false;
2223        }
2224
2225        field_num += 1;
2226        field_start = delim_pos + 1;
2227
2228        if field_num > max_field {
2229            break;
2230        }
2231    }
2232
2233    // Last field
2234    if (field_num <= max_field || complement)
2235        && has_delim
2236        && is_selected(field_num, field_mask, ranges, complement)
2237    {
2238        if !first_output {
2239            unsafe { buf_extend(buf, output_delim) };
2240        }
2241        unsafe {
2242            buf_extend(
2243                buf,
2244                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2245            )
2246        };
2247        first_output = false;
2248    }
2249
2250    if !first_output {
2251        unsafe { buf_push(buf, line_delim) };
2252    } else if !has_delim {
2253        if !suppress {
2254            unsafe {
2255                buf_extend(buf, line);
2256                buf_push(buf, line_delim);
2257            }
2258        }
2259    } else {
2260        unsafe { buf_push(buf, line_delim) };
2261    }
2262}
2263
2264// ── Fast path: byte/char extraction with batched output ──────────────────
2265
2266/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2267/// Zero-copy: writes directly from the source data using output runs.
2268/// For lines shorter than max_bytes, the output is identical to the input,
2269/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2270fn process_bytes_from_start(
2271    data: &[u8],
2272    max_bytes: usize,
2273    line_delim: u8,
2274    out: &mut impl Write,
2275) -> io::Result<()> {
2276    if data.len() >= PARALLEL_THRESHOLD {
2277        let chunks = split_into_chunks(data, line_delim);
2278        let results: Vec<Vec<u8>> = chunks
2279            .par_iter()
2280            .map(|chunk| {
2281                // Estimate output size without scanning: assume average line
2282                // is at least (max_bytes+1) bytes (otherwise no truncation).
2283                // For cut -b1-5 on 50-char lines: output ~ chunk.len() * 6/51 ~ chunk/8.
2284                // Using chunk.len()/4 as initial capacity handles most cases
2285                // without reallocation, while avoiding the extra memchr scan.
2286                let est_out = (chunk.len() / 4).max(max_bytes + 2);
2287                let mut buf = Vec::with_capacity(est_out.min(chunk.len()));
2288                bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
2289                buf
2290            })
2291            .collect();
2292        // Use write_vectored (writev) to batch N writes into fewer syscalls
2293        let slices: Vec<IoSlice> = results
2294            .iter()
2295            .filter(|r| !r.is_empty())
2296            .map(|r| IoSlice::new(r))
2297            .collect();
2298        write_ioslices(out, &slices)?;
2299    } else {
2300        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2301        // because every line gets truncated, creating 3 IoSlice entries per line.
2302        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2303        // managing millions of IoSlice entries through the kernel.
2304        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2305        if max_bytes <= 512 {
2306            // Estimate output size without scanning: output <= data.len(),
2307            // typically ~data.len()/4 for short max_bytes on longer lines.
2308            let est_out = (data.len() / 4).max(max_bytes + 2);
2309            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2310            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2311            if !buf.is_empty() {
2312                out.write_all(&buf)?;
2313            }
2314        } else {
2315            // Zero-copy path: track contiguous output runs and write directly from source.
2316            // For lines <= max_bytes, we include them as-is (no copy needed).
2317            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2318            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2319        }
2320    }
2321    Ok(())
2322}
2323
2324/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2325/// directly into the source data, flushing in MAX_IOV-sized batches.
2326/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2327/// produce two IoSlices (truncated data + newline).
2328#[inline]
2329fn bytes_from_start_zerocopy(
2330    data: &[u8],
2331    max_bytes: usize,
2332    line_delim: u8,
2333    out: &mut impl Write,
2334) -> io::Result<()> {
2335    let newline_buf: [u8; 1] = [line_delim];
2336    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2337    let mut start = 0;
2338    let mut run_start: usize = 0;
2339
2340    for pos in memchr_iter(line_delim, data) {
2341        let line_len = pos - start;
2342        if line_len > max_bytes {
2343            // This line needs truncation
2344            if run_start < start {
2345                iov.push(IoSlice::new(&data[run_start..start]));
2346            }
2347            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2348            iov.push(IoSlice::new(&newline_buf));
2349            run_start = pos + 1;
2350
2351            if iov.len() >= MAX_IOV - 2 {
2352                write_ioslices(out, &iov)?;
2353                iov.clear();
2354            }
2355        }
2356        start = pos + 1;
2357    }
2358    // Handle last line without terminator
2359    if start < data.len() {
2360        let line_len = data.len() - start;
2361        if line_len > max_bytes {
2362            if run_start < start {
2363                iov.push(IoSlice::new(&data[run_start..start]));
2364            }
2365            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2366            iov.push(IoSlice::new(&newline_buf));
2367            if !iov.is_empty() {
2368                write_ioslices(out, &iov)?;
2369            }
2370            return Ok(());
2371        }
2372    }
2373    // Flush remaining contiguous run
2374    if run_start < data.len() {
2375        iov.push(IoSlice::new(&data[run_start..]));
2376        if !data.is_empty() && *data.last().unwrap() != line_delim {
2377            iov.push(IoSlice::new(&newline_buf));
2378        }
2379    }
2380    if !iov.is_empty() {
2381        write_ioslices(out, &iov)?;
2382    }
2383    Ok(())
2384}
2385
2386/// Process a chunk for from-start byte range extraction (parallel path).
2387/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2388/// Pre-reserves data.len() (output never exceeds input), then uses a single
2389/// write pointer with deferred set_len — no per-line capacity checks.
2390#[inline]
2391fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2392    // Output is always <= input size (we only truncate, never expand).
2393    // Single reserve eliminates ALL per-line capacity checks.
2394    buf.reserve(data.len());
2395
2396    let src = data.as_ptr();
2397    let dst_base = buf.as_mut_ptr();
2398    let mut wp = buf.len();
2399    let mut start = 0;
2400
2401    for pos in memchr_iter(line_delim, data) {
2402        let line_len = pos - start;
2403        let take = line_len.min(max_bytes);
2404        unsafe {
2405            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2406            *dst_base.add(wp + take) = line_delim;
2407        }
2408        wp += take + 1;
2409        start = pos + 1;
2410    }
2411    // Handle last line without terminator
2412    if start < data.len() {
2413        let line_len = data.len() - start;
2414        let take = line_len.min(max_bytes);
2415        unsafe {
2416            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2417            *dst_base.add(wp + take) = line_delim;
2418        }
2419        wp += take + 1;
2420    }
2421    unsafe { buf.set_len(wp) };
2422}
2423
2424/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2425fn process_bytes_from_offset(
2426    data: &[u8],
2427    skip_bytes: usize,
2428    line_delim: u8,
2429    out: &mut impl Write,
2430) -> io::Result<()> {
2431    if data.len() >= PARALLEL_THRESHOLD {
2432        let chunks = split_into_chunks(data, line_delim);
2433        let results: Vec<Vec<u8>> = chunks
2434            .par_iter()
2435            .map(|chunk| {
2436                let mut buf = Vec::with_capacity(chunk.len());
2437                bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
2438                buf
2439            })
2440            .collect();
2441        // Use write_vectored (writev) to batch N writes into fewer syscalls
2442        let slices: Vec<IoSlice> = results
2443            .iter()
2444            .filter(|r| !r.is_empty())
2445            .map(|r| IoSlice::new(r))
2446            .collect();
2447        write_ioslices(out, &slices)?;
2448    } else {
2449        // Zero-copy: write suffix of each line directly from source
2450        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2451    }
2452    Ok(())
2453}
2454
2455/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2456/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2457/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2458#[inline]
2459fn bytes_from_offset_zerocopy(
2460    data: &[u8],
2461    skip_bytes: usize,
2462    line_delim: u8,
2463    out: &mut impl Write,
2464) -> io::Result<()> {
2465    let delim_buf = [line_delim];
2466    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2467
2468    let mut start = 0;
2469    for pos in memchr_iter(line_delim, data) {
2470        let line_len = pos - start;
2471        if line_len > skip_bytes {
2472            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2473        }
2474        iov.push(IoSlice::new(&delim_buf));
2475        // Flush when approaching MAX_IOV to avoid oversized writev
2476        if iov.len() >= MAX_IOV - 1 {
2477            write_ioslices(out, &iov)?;
2478            iov.clear();
2479        }
2480        start = pos + 1;
2481    }
2482    if start < data.len() {
2483        let line_len = data.len() - start;
2484        if line_len > skip_bytes {
2485            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2486        }
2487        iov.push(IoSlice::new(&delim_buf));
2488    }
2489    if !iov.is_empty() {
2490        write_ioslices(out, &iov)?;
2491    }
2492    Ok(())
2493}
2494
2495/// Process a chunk for from-offset byte range extraction.
2496/// Single reserve + deferred set_len for zero per-line overhead.
2497#[inline]
2498fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2499    buf.reserve(data.len());
2500
2501    let src = data.as_ptr();
2502    let dst_base = buf.as_mut_ptr();
2503    let mut wp = buf.len();
2504    let mut start = 0;
2505
2506    for pos in memchr_iter(line_delim, data) {
2507        let line_len = pos - start;
2508        if line_len > skip_bytes {
2509            let take = line_len - skip_bytes;
2510            unsafe {
2511                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2512            }
2513            wp += take;
2514        }
2515        unsafe {
2516            *dst_base.add(wp) = line_delim;
2517        }
2518        wp += 1;
2519        start = pos + 1;
2520    }
2521    if start < data.len() {
2522        let line_len = data.len() - start;
2523        if line_len > skip_bytes {
2524            let take = line_len - skip_bytes;
2525            unsafe {
2526                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2527            }
2528            wp += take;
2529        }
2530        unsafe {
2531            *dst_base.add(wp) = line_delim;
2532        }
2533        wp += 1;
2534    }
2535    unsafe { buf.set_len(wp) };
2536}
2537
2538/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2539fn process_bytes_mid_range(
2540    data: &[u8],
2541    start_byte: usize,
2542    end_byte: usize,
2543    line_delim: u8,
2544    out: &mut impl Write,
2545) -> io::Result<()> {
2546    let skip = start_byte.saturating_sub(1);
2547
2548    if data.len() >= PARALLEL_THRESHOLD {
2549        let chunks = split_into_chunks(data, line_delim);
2550        let results: Vec<Vec<u8>> = chunks
2551            .par_iter()
2552            .map(|chunk| {
2553                let mut buf = Vec::with_capacity(chunk.len());
2554                bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, &mut buf);
2555                buf
2556            })
2557            .collect();
2558        let slices: Vec<IoSlice> = results
2559            .iter()
2560            .filter(|r| !r.is_empty())
2561            .map(|r| IoSlice::new(r))
2562            .collect();
2563        write_ioslices(out, &slices)?;
2564    } else {
2565        let mut buf = Vec::with_capacity(data.len());
2566        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2567        if !buf.is_empty() {
2568            out.write_all(&buf)?;
2569        }
2570    }
2571    Ok(())
2572}
2573
2574/// Process a chunk for mid-range byte extraction.
2575/// For each line, output bytes skip..min(line_len, end_byte).
2576/// Single reserve + deferred set_len.
2577#[inline]
2578fn bytes_mid_range_chunk(
2579    data: &[u8],
2580    skip: usize,
2581    end_byte: usize,
2582    line_delim: u8,
2583    buf: &mut Vec<u8>,
2584) {
2585    buf.reserve(data.len());
2586
2587    let src = data.as_ptr();
2588    let dst_base = buf.as_mut_ptr();
2589    let mut wp = buf.len();
2590    let mut start = 0;
2591
2592    for pos in memchr_iter(line_delim, data) {
2593        let line_len = pos - start;
2594        if line_len > skip {
2595            let take_end = line_len.min(end_byte);
2596            let take = take_end - skip;
2597            unsafe {
2598                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2599            }
2600            wp += take;
2601        }
2602        unsafe {
2603            *dst_base.add(wp) = line_delim;
2604        }
2605        wp += 1;
2606        start = pos + 1;
2607    }
2608    if start < data.len() {
2609        let line_len = data.len() - start;
2610        if line_len > skip {
2611            let take_end = line_len.min(end_byte);
2612            let take = take_end - skip;
2613            unsafe {
2614                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2615            }
2616            wp += take;
2617        }
2618        unsafe {
2619            *dst_base.add(wp) = line_delim;
2620        }
2621        wp += 1;
2622    }
2623    unsafe { buf.set_len(wp) };
2624}
2625
2626/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2627fn process_bytes_complement_mid(
2628    data: &[u8],
2629    skip_start: usize,
2630    skip_end: usize,
2631    line_delim: u8,
2632    out: &mut impl Write,
2633) -> io::Result<()> {
2634    let prefix_bytes = skip_start - 1; // bytes before the skip region
2635    if data.len() >= PARALLEL_THRESHOLD {
2636        let chunks = split_into_chunks(data, line_delim);
2637        let results: Vec<Vec<u8>> = chunks
2638            .par_iter()
2639            .map(|chunk| {
2640                let mut buf = Vec::with_capacity(chunk.len());
2641                bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, &mut buf);
2642                buf
2643            })
2644            .collect();
2645        let slices: Vec<IoSlice> = results
2646            .iter()
2647            .filter(|r| !r.is_empty())
2648            .map(|r| IoSlice::new(r))
2649            .collect();
2650        write_ioslices(out, &slices)?;
2651    } else {
2652        let mut buf = Vec::with_capacity(data.len());
2653        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
2654        if !buf.is_empty() {
2655            out.write_all(&buf)?;
2656        }
2657    }
2658    Ok(())
2659}
2660
2661/// Process a chunk for complement mid-range byte extraction.
2662/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
2663#[inline]
2664fn bytes_complement_mid_chunk(
2665    data: &[u8],
2666    prefix_bytes: usize,
2667    skip_end: usize,
2668    line_delim: u8,
2669    buf: &mut Vec<u8>,
2670) {
2671    buf.reserve(data.len());
2672
2673    let src = data.as_ptr();
2674    let dst_base = buf.as_mut_ptr();
2675    let mut wp = buf.len();
2676    let mut start = 0;
2677
2678    for pos in memchr_iter(line_delim, data) {
2679        let line_len = pos - start;
2680        // Copy prefix (bytes before skip region)
2681        let take_prefix = prefix_bytes.min(line_len);
2682        if take_prefix > 0 {
2683            unsafe {
2684                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2685            }
2686            wp += take_prefix;
2687        }
2688        // Copy suffix (bytes after skip region)
2689        if line_len > skip_end {
2690            let suffix_len = line_len - skip_end;
2691            unsafe {
2692                std::ptr::copy_nonoverlapping(
2693                    src.add(start + skip_end),
2694                    dst_base.add(wp),
2695                    suffix_len,
2696                );
2697            }
2698            wp += suffix_len;
2699        }
2700        unsafe {
2701            *dst_base.add(wp) = line_delim;
2702        }
2703        wp += 1;
2704        start = pos + 1;
2705    }
2706    if start < data.len() {
2707        let line_len = data.len() - start;
2708        let take_prefix = prefix_bytes.min(line_len);
2709        if take_prefix > 0 {
2710            unsafe {
2711                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
2712            }
2713            wp += take_prefix;
2714        }
2715        if line_len > skip_end {
2716            let suffix_len = line_len - skip_end;
2717            unsafe {
2718                std::ptr::copy_nonoverlapping(
2719                    src.add(start + skip_end),
2720                    dst_base.add(wp),
2721                    suffix_len,
2722                );
2723            }
2724            wp += suffix_len;
2725        }
2726        unsafe {
2727            *dst_base.add(wp) = line_delim;
2728        }
2729        wp += 1;
2730    }
2731    unsafe { buf.set_len(wp) };
2732}
2733
2734/// Optimized byte/char extraction with batched output and parallel processing.
2735fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2736    let line_delim = cfg.line_delim;
2737    let ranges = cfg.ranges;
2738    let complement = cfg.complement;
2739    let output_delim = cfg.output_delim;
2740
2741    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
2742    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
2743        let max_bytes = ranges[0].end;
2744        if max_bytes < usize::MAX {
2745            return process_bytes_from_start(data, max_bytes, line_delim, out);
2746        }
2747    }
2748
2749    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
2750    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
2751        let skip_bytes = ranges[0].start.saturating_sub(1);
2752        if skip_bytes > 0 {
2753            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
2754        }
2755    }
2756
2757    // Fast path: single mid-range (e.g., cut -b5-100)
2758    if !complement
2759        && ranges.len() == 1
2760        && ranges[0].start > 1
2761        && ranges[0].end < usize::MAX
2762        && output_delim.is_empty()
2763    {
2764        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
2765    }
2766
2767    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
2768    if complement
2769        && ranges.len() == 1
2770        && ranges[0].start == 1
2771        && ranges[0].end < usize::MAX
2772        && output_delim.is_empty()
2773    {
2774        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
2775    }
2776
2777    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
2778    if complement
2779        && ranges.len() == 1
2780        && ranges[0].end == usize::MAX
2781        && ranges[0].start > 1
2782        && output_delim.is_empty()
2783    {
2784        let max_bytes = ranges[0].start - 1;
2785        return process_bytes_from_start(data, max_bytes, line_delim, out);
2786    }
2787
2788    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
2789    if complement
2790        && ranges.len() == 1
2791        && ranges[0].start > 1
2792        && ranges[0].end < usize::MAX
2793        && output_delim.is_empty()
2794    {
2795        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
2796    }
2797
2798    if data.len() >= PARALLEL_THRESHOLD {
2799        let chunks = split_into_chunks(data, line_delim);
2800        let results: Vec<Vec<u8>> = chunks
2801            .par_iter()
2802            .map(|chunk| {
2803                let mut buf = Vec::with_capacity(chunk.len());
2804                process_bytes_chunk(
2805                    chunk,
2806                    ranges,
2807                    complement,
2808                    output_delim,
2809                    line_delim,
2810                    &mut buf,
2811                );
2812                buf
2813            })
2814            .collect();
2815        // Use write_vectored (writev) to batch N writes into fewer syscalls
2816        let slices: Vec<IoSlice> = results
2817            .iter()
2818            .filter(|r| !r.is_empty())
2819            .map(|r| IoSlice::new(r))
2820            .collect();
2821        write_ioslices(out, &slices)?;
2822    } else {
2823        let mut buf = Vec::with_capacity(data.len());
2824        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
2825        if !buf.is_empty() {
2826            out.write_all(&buf)?;
2827        }
2828    }
2829    Ok(())
2830}
2831
2832/// Process a chunk of data for byte/char extraction.
2833/// Uses raw pointer arithmetic for the newline scan.
2834/// Complement single-range fast path: compute complement ranges once, then use
2835/// the non-complement multi-range path which is more cache-friendly.
2836fn process_bytes_chunk(
2837    data: &[u8],
2838    ranges: &[Range],
2839    complement: bool,
2840    output_delim: &[u8],
2841    line_delim: u8,
2842    buf: &mut Vec<u8>,
2843) {
2844    buf.reserve(data.len());
2845    let base = data.as_ptr();
2846    let mut start = 0;
2847    for end_pos in memchr_iter(line_delim, data) {
2848        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
2849        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2850        unsafe { buf_push(buf, line_delim) };
2851        start = end_pos + 1;
2852    }
2853    if start < data.len() {
2854        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
2855        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2856        unsafe { buf_push(buf, line_delim) };
2857    }
2858}
2859
2860/// Extract byte ranges from a line into the output buffer.
2861/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
2862/// Raw pointer arithmetic eliminates per-range bounds checking.
2863#[inline(always)]
2864fn cut_bytes_to_buf(
2865    line: &[u8],
2866    ranges: &[Range],
2867    complement: bool,
2868    output_delim: &[u8],
2869    buf: &mut Vec<u8>,
2870) {
2871    let len = line.len();
2872    let base = line.as_ptr();
2873    let mut first_range = true;
2874
2875    // Reserve worst case: full line + delimiters between ranges
2876    let needed = len + output_delim.len() * ranges.len() + 1;
2877    if buf.capacity() - buf.len() < needed {
2878        buf.reserve(needed);
2879    }
2880
2881    if complement {
2882        let mut pos: usize = 1;
2883        for r in ranges {
2884            let rs = r.start;
2885            let re = r.end.min(len);
2886            if pos < rs {
2887                if !first_range && !output_delim.is_empty() {
2888                    unsafe { buf_extend(buf, output_delim) };
2889                }
2890                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
2891                first_range = false;
2892            }
2893            pos = re + 1;
2894            if pos > len {
2895                break;
2896            }
2897        }
2898        if pos <= len {
2899            if !first_range && !output_delim.is_empty() {
2900                unsafe { buf_extend(buf, output_delim) };
2901            }
2902            unsafe {
2903                buf_extend(
2904                    buf,
2905                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
2906                )
2907            };
2908        }
2909    } else if output_delim.is_empty() && ranges.len() == 1 {
2910        // Ultra-fast path: single range, no output delimiter
2911        let start = ranges[0].start.saturating_sub(1);
2912        let end = ranges[0].end.min(len);
2913        if start < len {
2914            unsafe {
2915                buf_extend(
2916                    buf,
2917                    std::slice::from_raw_parts(base.add(start), end - start),
2918                )
2919            };
2920        }
2921    } else {
2922        for r in ranges {
2923            let start = r.start.saturating_sub(1);
2924            let end = r.end.min(len);
2925            if start >= len {
2926                break;
2927            }
2928            if !first_range && !output_delim.is_empty() {
2929                unsafe { buf_extend(buf, output_delim) };
2930            }
2931            unsafe {
2932                buf_extend(
2933                    buf,
2934                    std::slice::from_raw_parts(base.add(start), end - start),
2935                )
2936            };
2937            first_range = false;
2938        }
2939    }
2940}
2941
2942// ── Public API ───────────────────────────────────────────────────────────
2943
2944/// Cut fields from a line using a delimiter. Writes to `out`.
2945#[inline]
2946pub fn cut_fields(
2947    line: &[u8],
2948    delim: u8,
2949    ranges: &[Range],
2950    complement: bool,
2951    output_delim: &[u8],
2952    suppress_no_delim: bool,
2953    out: &mut impl Write,
2954) -> io::Result<bool> {
2955    if memchr::memchr(delim, line).is_none() {
2956        if !suppress_no_delim {
2957            out.write_all(line)?;
2958            return Ok(true);
2959        }
2960        return Ok(false);
2961    }
2962
2963    let mut field_num: usize = 1;
2964    let mut field_start: usize = 0;
2965    let mut first_output = true;
2966
2967    for delim_pos in memchr_iter(delim, line) {
2968        let selected = in_ranges(ranges, field_num) != complement;
2969        if selected {
2970            if !first_output {
2971                out.write_all(output_delim)?;
2972            }
2973            out.write_all(&line[field_start..delim_pos])?;
2974            first_output = false;
2975        }
2976        field_start = delim_pos + 1;
2977        field_num += 1;
2978    }
2979
2980    let selected = in_ranges(ranges, field_num) != complement;
2981    if selected {
2982        if !first_output {
2983            out.write_all(output_delim)?;
2984        }
2985        out.write_all(&line[field_start..])?;
2986    }
2987
2988    Ok(true)
2989}
2990
2991/// Cut bytes/chars from a line. Writes selected bytes to `out`.
2992#[inline]
2993pub fn cut_bytes(
2994    line: &[u8],
2995    ranges: &[Range],
2996    complement: bool,
2997    output_delim: &[u8],
2998    out: &mut impl Write,
2999) -> io::Result<bool> {
3000    let mut first_range = true;
3001
3002    if complement {
3003        let len = line.len();
3004        let mut comp_ranges = Vec::new();
3005        let mut pos: usize = 1;
3006        for r in ranges {
3007            let rs = r.start;
3008            let re = r.end.min(len);
3009            if pos < rs {
3010                comp_ranges.push((pos, rs - 1));
3011            }
3012            pos = re + 1;
3013            if pos > len {
3014                break;
3015            }
3016        }
3017        if pos <= len {
3018            comp_ranges.push((pos, len));
3019        }
3020        for &(s, e) in &comp_ranges {
3021            if !first_range && !output_delim.is_empty() {
3022                out.write_all(output_delim)?;
3023            }
3024            out.write_all(&line[s - 1..e])?;
3025            first_range = false;
3026        }
3027    } else {
3028        for r in ranges {
3029            let start = r.start.saturating_sub(1);
3030            let end = r.end.min(line.len());
3031            if start >= line.len() {
3032                break;
3033            }
3034            if !first_range && !output_delim.is_empty() {
3035                out.write_all(output_delim)?;
3036            }
3037            out.write_all(&line[start..end])?;
3038            first_range = false;
3039        }
3040    }
3041    Ok(true)
3042}
3043
3044/// Process a full data buffer (from mmap or read) with cut operation.
3045pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3046    match cfg.mode {
3047        CutMode::Fields => process_fields_fast(data, cfg, out),
3048        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3049    }
3050}
3051
3052/// Process input from a reader (for stdin).
3053/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3054/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3055/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3056pub fn process_cut_reader<R: BufRead>(
3057    mut reader: R,
3058    cfg: &CutConfig,
3059    out: &mut impl Write,
3060) -> io::Result<()> {
3061    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3062    let line_delim = cfg.line_delim;
3063
3064    // Read large chunks and process in batch.
3065    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3066    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3067
3068    loop {
3069        // Read up to CHUNK_SIZE bytes
3070        buf.reserve(CHUNK_SIZE);
3071        let read_start = buf.len();
3072        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3073        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3074        buf.truncate(read_start + n);
3075
3076        if buf.is_empty() {
3077            break;
3078        }
3079
3080        if n == 0 {
3081            // EOF with leftover data (last line without terminator)
3082            process_cut_data(&buf, cfg, out)?;
3083            break;
3084        }
3085
3086        // Find the last line delimiter in the buffer so we process complete lines
3087        let process_end = match memchr::memrchr(line_delim, &buf) {
3088            Some(pos) => pos + 1,
3089            None => {
3090                // No line delimiter found — keep accumulating
3091                continue;
3092            }
3093        };
3094
3095        // Process the complete lines using the fast batch path
3096        process_cut_data(&buf[..process_end], cfg, out)?;
3097
3098        // Shift leftover to the front for next iteration
3099        let leftover_len = buf.len() - process_end;
3100        if leftover_len > 0 {
3101            buf.copy_within(process_end.., 0);
3102        }
3103        buf.truncate(leftover_len);
3104    }
3105
3106    Ok(())
3107}
3108
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `reader.read` until `buf` is full or the reader reports EOF
/// (`Ok(0)`). Returns the number of bytes written to the front of `buf`; a
/// result smaller than `buf.len()` therefore means EOF was reached.
///
/// `ErrorKind::Interrupted` is retried on every read. Previously the first
/// read was made outside the retry loop, so an interruption there was
/// returned to the caller as a hard error.
///
/// # Errors
/// Returns the first read error other than `ErrorKind::Interrupted`.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3128
/// Cut operation mode: which unit of a line the ranges refer to.
///
/// Fieldless, so equality is total and `Eq` is derived alongside `PartialEq`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Ranges are byte positions.
    Bytes,
    /// Ranges are character positions. `process_cut_data` routes this mode to
    /// the same byte-extraction path as [`CutMode::Bytes`].
    Characters,
    /// Ranges are indices of delimiter-separated fields.
    Fields,
}