Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, IoSlice, Write};
4
/// Minimum input size in bytes (1 MiB) at which the `process_*` entry points
/// switch to parallel chunk processing; `split_into_chunks` also returns the
/// input as a single chunk below this size.
/// Lowered from 2MB to benefit from parallel chunk processing on smaller piped inputs.
const PARALLEL_THRESHOLD: usize = 1024 * 1024;

/// Max iovec entries per writev call (Linux default).
/// `write_ioslices` hands at most this many slices to one `write_vectored` call.
const MAX_IOV: usize = 1024;
11
/// Configuration for cut operations.
///
/// Borrowed view of the options a single run needs; the field-processing
/// functions read `ranges`, `complement`, `delim`, `output_delim`,
/// `suppress_no_delim` and `line_delim` from it.
pub struct CutConfig<'a> {
    /// Selection mode. `CutMode` is declared outside this part of the file.
    pub mode: CutMode,
    /// Selected positions; the lookup helpers expect them sorted by start,
    /// as `parse_ranges` returns them.
    pub ranges: &'a [Range],
    /// When true, the positions NOT covered by `ranges` are selected.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between selected fields on output.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no `delim` are dropped instead of echoed.
    pub suppress_no_delim: bool,
    /// Byte that terminates each input line and each output line.
    pub line_delim: u8,
}
22
/// A range specification like 1, 3-5, -3, 4-
///
/// Both bounds are inclusive and 1-based.
#[derive(Debug, Clone)]
pub struct Range {
    // 1-based. NOTE(review): the original comment said 0 means "from beginning",
    // but `parse_ranges` never produces 0 (an omitted start is stored as 1) —
    // confirm whether any other constructor relies on 0.
    pub start: usize,
    // 1-based, usize::MAX means "to end"
    pub end: usize,
}
29
30/// Parse a LIST specification like "1,3-5,7-" into ranges.
31/// Each range is 1-based. Returns sorted, merged ranges.
32pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
33    let mut ranges = Vec::new();
34
35    for part in spec.split(',') {
36        let part = part.trim();
37        if part.is_empty() {
38            continue;
39        }
40
41        if let Some(idx) = part.find('-') {
42            let left = &part[..idx];
43            let right = &part[idx + 1..];
44
45            let start = if left.is_empty() {
46                1
47            } else {
48                left.parse::<usize>()
49                    .map_err(|_| format!("invalid range: '{}'", part))?
50            };
51
52            let end = if right.is_empty() {
53                usize::MAX
54            } else {
55                right
56                    .parse::<usize>()
57                    .map_err(|_| format!("invalid range: '{}'", part))?
58            };
59
60            if start == 0 {
61                return Err("fields and positions are numbered from 1".to_string());
62            }
63            if start > end {
64                return Err(format!("invalid decreasing range: '{}'", part));
65            }
66
67            ranges.push(Range { start, end });
68        } else {
69            let n = part
70                .parse::<usize>()
71                .map_err(|_| format!("invalid field: '{}'", part))?;
72            if n == 0 {
73                return Err("fields and positions are numbered from 1".to_string());
74            }
75            ranges.push(Range { start: n, end: n });
76        }
77    }
78
79    if ranges.is_empty() {
80        return Err("you must specify a list of bytes, characters, or fields".to_string());
81    }
82
83    // Sort and merge overlapping ranges
84    ranges.sort_by_key(|r| (r.start, r.end));
85    let mut merged = vec![ranges[0].clone()];
86    for r in &ranges[1..] {
87        let last = merged.last_mut().unwrap();
88        if r.start <= last.end.saturating_add(1) {
89            last.end = last.end.max(r.end);
90        } else {
91            merged.push(r.clone());
92        }
93    }
94
95    Ok(merged)
96}
97
98/// Check if a 1-based position is in any range.
99/// Ranges must be sorted. Uses early exit since ranges are sorted.
100#[inline(always)]
101fn in_ranges(ranges: &[Range], pos: usize) -> bool {
102    for r in ranges {
103        if pos < r.start {
104            return false;
105        }
106        if pos <= r.end {
107            return true;
108        }
109    }
110    false
111}
112
113/// Pre-compute a 64-bit mask for field selection.
114/// Bit i-1 is set if field i should be output.
115#[inline]
116fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
117    let mut mask: u64 = 0;
118    for i in 1..=64u32 {
119        let in_range = in_ranges(ranges, i as usize);
120        if in_range != complement {
121            mask |= 1u64 << (i - 1);
122        }
123    }
124    mask
125}
126
127/// Check if a field should be selected, using bitset for first 64 fields.
128#[inline(always)]
129fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
130    if field_num <= 64 {
131        (mask >> (field_num - 1)) & 1 == 1
132    } else {
133        in_ranges(ranges, field_num) != complement
134    }
135}
136
137// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
138
/// Append `data` to `buf` without growing it and without a release-mode
/// capacity check.
///
/// # Safety
///
/// The caller must guarantee that `buf` has at least `data.len()` bytes of
/// spare capacity (`buf.capacity() - buf.len() >= data.len()`). Otherwise the
/// copy writes past the end of the allocation. Debug builds assert this.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    debug_assert!(
        buf.capacity() - buf.len() >= data.len(),
        "buf_extend: insufficient spare capacity"
    );
    // SAFETY: the caller guarantees the spare capacity, so the destination range
    // lies inside `buf`'s allocation; `data` is a shared borrow and therefore
    // cannot overlap the exclusively borrowed `buf`.
    unsafe {
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
149
/// Append the single byte `b` to `buf` without growing it and without a
/// release-mode capacity check.
///
/// # Safety
///
/// The caller must guarantee that `buf` has at least one byte of spare
/// capacity (`buf.len() < buf.capacity()`). Otherwise the store writes past
/// the end of the allocation. Debug builds assert this.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    debug_assert!(
        buf.len() < buf.capacity(),
        "buf_push: insufficient spare capacity"
    );
    // SAFETY: the caller guarantees one spare byte, so `len` is an in-bounds
    // offset into the allocation and the new length stays within capacity.
    unsafe {
        let len = buf.len();
        buf.as_mut_ptr().add(len).write(b);
        buf.set_len(len + 1);
    }
}
160
161/// Write multiple IoSlice buffers using write_vectored (writev syscall).
162/// Batches into MAX_IOV-sized groups. Falls back to write_all per slice for partial writes.
163#[inline]
164fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
165    if slices.is_empty() {
166        return Ok(());
167    }
168    for batch in slices.chunks(MAX_IOV) {
169        let total: usize = batch.iter().map(|s| s.len()).sum();
170        match out.write_vectored(batch) {
171            Ok(n) if n >= total => continue,
172            Ok(mut written) => {
173                // Partial write: fall back to write_all per remaining slice
174                for slice in batch {
175                    let slen = slice.len();
176                    if written >= slen {
177                        written -= slen;
178                        continue;
179                    }
180                    if written > 0 {
181                        out.write_all(&slice[written..])?;
182                        written = 0;
183                    } else {
184                        out.write_all(slice)?;
185                    }
186                }
187            }
188            Err(e) => return Err(e),
189        }
190    }
191    Ok(())
192}
193
194// ── Chunk splitting for parallel processing ──────────────────────────────
195
196/// Split data into chunks aligned to line boundaries for parallel processing.
197fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
198    let num_threads = rayon::current_num_threads().max(1);
199    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
200        return vec![data];
201    }
202
203    let chunk_size = data.len() / num_threads;
204    let mut chunks = Vec::with_capacity(num_threads);
205    let mut pos = 0;
206
207    for _ in 0..num_threads - 1 {
208        let target = pos + chunk_size;
209        if target >= data.len() {
210            break;
211        }
212        let boundary = memchr::memchr(line_delim, &data[target..])
213            .map(|p| target + p + 1)
214            .unwrap_or(data.len());
215        if boundary > pos {
216            chunks.push(&data[pos..boundary]);
217        }
218        pos = boundary;
219    }
220
221    if pos < data.len() {
222        chunks.push(&data[pos..]);
223    }
224
225    chunks
226}
227
228// ── Fast path: field extraction with batched output ──────────────────────
229
230/// Optimized field extraction with early exit and batched output.
231fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
232    let delim = cfg.delim;
233    let line_delim = cfg.line_delim;
234    let ranges = cfg.ranges;
235    let complement = cfg.complement;
236    let output_delim = cfg.output_delim;
237    let suppress = cfg.suppress_no_delim;
238
239    // Zero-copy fast path: if delimiter never appears, output = input unchanged.
240    if !complement && memchr::memchr(delim, data).is_none() {
241        if suppress {
242            return Ok(());
243        }
244        out.write_all(data)?;
245        if !data.is_empty() && *data.last().unwrap() != line_delim {
246            out.write_all(&[line_delim])?;
247        }
248        return Ok(());
249    }
250
251    // Ultra-fast path: single field extraction (e.g., cut -f5)
252    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
253        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
254    }
255
256    // Fast path: complement of single field with default output delimiter.
257    if complement
258        && ranges.len() == 1
259        && ranges[0].start == ranges[0].end
260        && output_delim.len() == 1
261        && output_delim[0] == delim
262    {
263        return process_complement_single_field(
264            data,
265            delim,
266            line_delim,
267            ranges[0].start,
268            suppress,
269            out,
270        );
271    }
272
273    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
274    if !complement
275        && ranges.len() == 1
276        && ranges[0].start == 1
277        && output_delim.len() == 1
278        && output_delim[0] == delim
279        && ranges[0].end < usize::MAX
280    {
281        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
282    }
283
284    // Fast path: open-ended field range from field N (e.g., cut -f3-)
285    if !complement
286        && ranges.len() == 1
287        && ranges[0].end == usize::MAX
288        && ranges[0].start > 1
289        && output_delim.len() == 1
290        && output_delim[0] == delim
291    {
292        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
293    }
294
295    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
296    if !complement
297        && ranges.len() == 1
298        && ranges[0].start > 1
299        && ranges[0].end < usize::MAX
300        && output_delim.len() == 1
301        && output_delim[0] == delim
302    {
303        return process_fields_mid_range(
304            data,
305            delim,
306            line_delim,
307            ranges[0].start,
308            ranges[0].end,
309            suppress,
310            out,
311        );
312    }
313
314    // General field extraction
315    let max_field = if complement {
316        usize::MAX
317    } else {
318        ranges.last().map(|r| r.end).unwrap_or(0)
319    };
320    let field_mask = compute_field_mask(ranges, complement);
321
322    if data.len() >= PARALLEL_THRESHOLD {
323        let chunks = split_into_chunks(data, line_delim);
324        let results: Vec<Vec<u8>> = chunks
325            .par_iter()
326            .map(|chunk| {
327                let mut buf = Vec::with_capacity(chunk.len());
328                process_fields_chunk(
329                    chunk,
330                    delim,
331                    ranges,
332                    output_delim,
333                    suppress,
334                    max_field,
335                    field_mask,
336                    line_delim,
337                    complement,
338                    &mut buf,
339                );
340                buf
341            })
342            .collect();
343        // Use write_vectored (writev) to batch N writes into fewer syscalls
344        let slices: Vec<IoSlice> = results
345            .iter()
346            .filter(|r| !r.is_empty())
347            .map(|r| IoSlice::new(r))
348            .collect();
349        write_ioslices(out, &slices)?;
350    } else {
351        let mut buf = Vec::with_capacity(data.len());
352        process_fields_chunk(
353            data,
354            delim,
355            ranges,
356            output_delim,
357            suppress,
358            max_field,
359            field_mask,
360            line_delim,
361            complement,
362            &mut buf,
363        );
364        if !buf.is_empty() {
365            out.write_all(&buf)?;
366        }
367    }
368    Ok(())
369}
370
371/// Process a chunk of data for general field extraction.
372/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
373/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
374/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
375fn process_fields_chunk(
376    data: &[u8],
377    delim: u8,
378    ranges: &[Range],
379    output_delim: &[u8],
380    suppress: bool,
381    max_field: usize,
382    field_mask: u64,
383    line_delim: u8,
384    complement: bool,
385    buf: &mut Vec<u8>,
386) {
387    // When delim != line_delim and max_field is bounded, use two-level approach:
388    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
389    // This avoids scanning past max_field on each line (significant for lines with
390    // many columns but small field selection like -f1,3,5 on 20-column CSV).
391    // For complement or unbounded ranges, use single-pass memchr2_iter which
392    // needs to process all delimiters anyway.
393    if delim != line_delim && max_field < usize::MAX && !complement {
394        buf.reserve(data.len());
395        let mut start = 0;
396        for end_pos in memchr_iter(line_delim, data) {
397            let line = &data[start..end_pos];
398            extract_fields_to_buf(
399                line,
400                delim,
401                ranges,
402                output_delim,
403                suppress,
404                max_field,
405                field_mask,
406                line_delim,
407                buf,
408                complement,
409            );
410            start = end_pos + 1;
411        }
412        if start < data.len() {
413            extract_fields_to_buf(
414                &data[start..],
415                delim,
416                ranges,
417                output_delim,
418                suppress,
419                max_field,
420                field_mask,
421                line_delim,
422                buf,
423                complement,
424            );
425        }
426        return;
427    }
428
429    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
430    // delimiter and line_delim in one SIMD scan.
431    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
432    if delim != line_delim {
433        buf.reserve(data.len());
434
435        let data_len = data.len();
436        let base = data.as_ptr();
437        let mut line_start: usize = 0;
438        let mut field_start: usize = 0;
439        let mut field_num: usize = 1;
440        let mut first_output = true;
441        let mut has_delim = false;
442
443        for pos in memchr::memchr2_iter(delim, line_delim, data) {
444            let byte = unsafe { *base.add(pos) };
445
446            if byte == line_delim {
447                // End of line: flush final field and emit line delimiter
448                if (field_num <= max_field || complement)
449                    && has_delim
450                    && is_selected(field_num, field_mask, ranges, complement)
451                {
452                    if !first_output {
453                        unsafe { buf_extend(buf, output_delim) };
454                    }
455                    unsafe {
456                        buf_extend(
457                            buf,
458                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
459                        )
460                    };
461                    first_output = false;
462                }
463
464                if !first_output {
465                    unsafe { buf_push(buf, line_delim) };
466                } else if !has_delim {
467                    if !suppress {
468                        unsafe {
469                            buf_extend(
470                                buf,
471                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
472                            );
473                            buf_push(buf, line_delim);
474                        }
475                    }
476                } else {
477                    unsafe { buf_push(buf, line_delim) };
478                }
479
480                // Reset state for next line
481                line_start = pos + 1;
482                field_start = pos + 1;
483                field_num = 1;
484                first_output = true;
485                has_delim = false;
486            } else {
487                // Field delimiter hit
488                has_delim = true;
489
490                if is_selected(field_num, field_mask, ranges, complement) {
491                    if !first_output {
492                        unsafe { buf_extend(buf, output_delim) };
493                    }
494                    unsafe {
495                        buf_extend(
496                            buf,
497                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
498                        )
499                    };
500                    first_output = false;
501                }
502
503                field_num += 1;
504                field_start = pos + 1;
505            }
506        }
507
508        // Handle last line without trailing line_delim
509        if line_start < data_len {
510            if line_start < data_len {
511                if (field_num <= max_field || complement)
512                    && has_delim
513                    && is_selected(field_num, field_mask, ranges, complement)
514                {
515                    if !first_output {
516                        unsafe { buf_extend(buf, output_delim) };
517                    }
518                    unsafe {
519                        buf_extend(
520                            buf,
521                            std::slice::from_raw_parts(
522                                base.add(field_start),
523                                data_len - field_start,
524                            ),
525                        )
526                    };
527                    first_output = false;
528                }
529
530                if !first_output {
531                    unsafe { buf_push(buf, line_delim) };
532                } else if !has_delim {
533                    if !suppress {
534                        unsafe {
535                            buf_extend(
536                                buf,
537                                std::slice::from_raw_parts(
538                                    base.add(line_start),
539                                    data_len - line_start,
540                                ),
541                            );
542                            buf_push(buf, line_delim);
543                        }
544                    }
545                } else {
546                    unsafe { buf_push(buf, line_delim) };
547                }
548            }
549        }
550
551        return;
552    }
553
554    // Fallback: when delim == line_delim, use the two-level scan approach
555    let mut start = 0;
556    for end_pos in memchr_iter(line_delim, data) {
557        let line = &data[start..end_pos];
558        extract_fields_to_buf(
559            line,
560            delim,
561            ranges,
562            output_delim,
563            suppress,
564            max_field,
565            field_mask,
566            line_delim,
567            buf,
568            complement,
569        );
570        start = end_pos + 1;
571    }
572    if start < data.len() {
573        extract_fields_to_buf(
574            &data[start..],
575            delim,
576            ranges,
577            output_delim,
578            suppress,
579            max_field,
580            field_mask,
581            line_delim,
582            buf,
583            complement,
584        );
585    }
586}
587
588// ── Ultra-fast single field extraction ───────────────────────────────────
589
590/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
591/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
592/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
593fn process_single_field(
594    data: &[u8],
595    delim: u8,
596    line_delim: u8,
597    target: usize,
598    suppress: bool,
599    out: &mut impl Write,
600) -> io::Result<()> {
601    let target_idx = target - 1;
602
603    // Combined SIMD scan: single pass using memchr2 for any target field.
604    if delim != line_delim {
605        if data.len() >= PARALLEL_THRESHOLD {
606            let chunks = split_into_chunks(data, line_delim);
607            let results: Vec<Vec<u8>> = chunks
608                .par_iter()
609                .map(|chunk| {
610                    let mut buf = Vec::with_capacity(chunk.len());
611                    process_nth_field_combined(
612                        chunk, delim, line_delim, target_idx, suppress, &mut buf,
613                    );
614                    buf
615                })
616                .collect();
617            // Use write_vectored (writev) to batch N writes into fewer syscalls
618            let slices: Vec<IoSlice> = results
619                .iter()
620                .filter(|r| !r.is_empty())
621                .map(|r| IoSlice::new(r))
622                .collect();
623            write_ioslices(out, &slices)?;
624        } else if target_idx == 0 && !suppress {
625            // Zero-copy fast path for field 1 (most common case):
626            // For each line, either truncate at the first delimiter, or pass through.
627            // Since most lines have a delimiter, and field 1 is a prefix of each line,
628            // we can write contiguous runs directly from the source data.
629            single_field1_zerocopy(data, delim, line_delim, out)?;
630        } else if target_idx <= 3 && !suppress {
631            // Optimized path for small field indices (fields 2-4):
632            // Uses successive memchr calls per line instead of the full combined scan.
633            // For field 2: two memchr calls (find first delim, find second).
634            // This avoids the memchr2_iter overhead for every byte in the line.
635            // Write directly to the output BufWriter to avoid intermediate Vec allocation.
636            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
637            process_small_field_combined(data, delim, line_delim, target_idx, &mut buf);
638            if !buf.is_empty() {
639                out.write_all(&buf)?;
640            }
641        } else {
642            // Write directly to BufWriter-backed output to avoid intermediate Vec.
643            // For larger inputs, process_nth_field_combined builds a buffer that
644            // we then write in a single call (reducing syscall count).
645            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
646            process_nth_field_combined(data, delim, line_delim, target_idx, suppress, &mut buf);
647            if !buf.is_empty() {
648                out.write_all(&buf)?;
649            }
650        }
651        return Ok(());
652    }
653
654    // Fallback for delim == line_delim: nested loop approach
655    if data.len() >= PARALLEL_THRESHOLD {
656        let chunks = split_into_chunks(data, line_delim);
657        let results: Vec<Vec<u8>> = chunks
658            .par_iter()
659            .map(|chunk| {
660                let mut buf = Vec::with_capacity(chunk.len() / 4);
661                process_single_field_chunk(
662                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
663                );
664                buf
665            })
666            .collect();
667        // Use write_vectored (writev) to batch N writes into fewer syscalls
668        let slices: Vec<IoSlice> = results
669            .iter()
670            .filter(|r| !r.is_empty())
671            .map(|r| IoSlice::new(r))
672            .collect();
673        write_ioslices(out, &slices)?;
674    } else {
675        let mut buf = Vec::with_capacity(data.len() / 4);
676        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
677        if !buf.is_empty() {
678            out.write_all(&buf)?;
679        }
680    }
681    Ok(())
682}
683
684/// Complement single-field extraction: skip one field, output rest unchanged.
685fn process_complement_single_field(
686    data: &[u8],
687    delim: u8,
688    line_delim: u8,
689    skip_field: usize,
690    suppress: bool,
691    out: &mut impl Write,
692) -> io::Result<()> {
693    let skip_idx = skip_field - 1;
694
695    if data.len() >= PARALLEL_THRESHOLD {
696        let chunks = split_into_chunks(data, line_delim);
697        let results: Vec<Vec<u8>> = chunks
698            .par_iter()
699            .map(|chunk| {
700                let mut buf = Vec::with_capacity(chunk.len());
701                complement_single_field_chunk(
702                    chunk, delim, skip_idx, line_delim, suppress, &mut buf,
703                );
704                buf
705            })
706            .collect();
707        // Use write_vectored (writev) to batch N writes into fewer syscalls
708        let slices: Vec<IoSlice> = results
709            .iter()
710            .filter(|r| !r.is_empty())
711            .map(|r| IoSlice::new(r))
712            .collect();
713        write_ioslices(out, &slices)?;
714    } else {
715        let mut buf = Vec::with_capacity(data.len());
716        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
717        if !buf.is_empty() {
718            out.write_all(&buf)?;
719        }
720    }
721    Ok(())
722}
723
724/// Process a chunk for complement single-field extraction.
725fn complement_single_field_chunk(
726    data: &[u8],
727    delim: u8,
728    skip_idx: usize,
729    line_delim: u8,
730    suppress: bool,
731    buf: &mut Vec<u8>,
732) {
733    let mut start = 0;
734    for end_pos in memchr_iter(line_delim, data) {
735        let line = &data[start..end_pos];
736        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
737        start = end_pos + 1;
738    }
739    if start < data.len() {
740        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
741    }
742}
743
744/// Extract all fields except skip_idx from one line.
745/// Uses raw pointer arithmetic to eliminate bounds checking.
746#[inline(always)]
747fn complement_single_field_line(
748    line: &[u8],
749    delim: u8,
750    skip_idx: usize,
751    line_delim: u8,
752    suppress: bool,
753    buf: &mut Vec<u8>,
754) {
755    let len = line.len();
756    if len == 0 {
757        if !suppress {
758            buf.push(line_delim);
759        }
760        return;
761    }
762
763    buf.reserve(len + 1);
764    let base = line.as_ptr();
765
766    let mut field_idx = 0;
767    let mut field_start = 0;
768    let mut first_output = true;
769    let mut has_delim = false;
770
771    for pos in memchr_iter(delim, line) {
772        has_delim = true;
773        if field_idx != skip_idx {
774            if !first_output {
775                unsafe { buf_push(buf, delim) };
776            }
777            unsafe {
778                buf_extend(
779                    buf,
780                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
781                )
782            };
783            first_output = false;
784        }
785        field_idx += 1;
786        field_start = pos + 1;
787    }
788
789    if !has_delim {
790        if !suppress {
791            unsafe {
792                buf_extend(buf, line);
793                buf_push(buf, line_delim);
794            }
795        }
796        return;
797    }
798
799    // Last field
800    if field_idx != skip_idx {
801        if !first_output {
802            unsafe { buf_push(buf, delim) };
803        }
804        unsafe {
805            buf_extend(
806                buf,
807                std::slice::from_raw_parts(base.add(field_start), len - field_start),
808            )
809        };
810    }
811
812    unsafe { buf_push(buf, line_delim) };
813}
814
815/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
816/// Zero-copy for the non-parallel path: identifies the truncation point per line
817/// and writes contiguous runs directly from the source data.
818fn process_fields_prefix(
819    data: &[u8],
820    delim: u8,
821    line_delim: u8,
822    last_field: usize,
823    suppress: bool,
824    out: &mut impl Write,
825) -> io::Result<()> {
826    if data.len() >= PARALLEL_THRESHOLD {
827        let chunks = split_into_chunks(data, line_delim);
828        let results: Vec<Vec<u8>> = chunks
829            .par_iter()
830            .map(|chunk| {
831                let mut buf = Vec::with_capacity(chunk.len());
832                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
833                buf
834            })
835            .collect();
836        // Use write_vectored (writev) to batch N writes into fewer syscalls
837        let slices: Vec<IoSlice> = results
838            .iter()
839            .filter(|r| !r.is_empty())
840            .map(|r| IoSlice::new(r))
841            .collect();
842        write_ioslices(out, &slices)?;
843    } else if !suppress {
844        // Zero-copy fast path: scan for truncation points, write runs from source.
845        // When suppress is false, every line is output (with or without delimiter).
846        // Most lines have enough fields, so the output is often identical to input.
847        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
848    } else {
849        let mut buf = Vec::with_capacity(data.len());
850        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
851        if !buf.is_empty() {
852            out.write_all(&buf)?;
853        }
854    }
855    Ok(())
856}
857
858/// Zero-copy field-prefix extraction: writes contiguous runs directly from source data.
859/// For lines where the Nth delimiter exists, we truncate at that point.
860/// For lines with fewer fields, we output them unchanged.
861/// Lines without any delimiter are output unchanged (suppress=false assumed).
862#[inline]
863fn fields_prefix_zerocopy(
864    data: &[u8],
865    delim: u8,
866    line_delim: u8,
867    last_field: usize,
868    out: &mut impl Write,
869) -> io::Result<()> {
870    let mut start = 0;
871    let mut run_start: usize = 0;
872
873    for end_pos in memchr_iter(line_delim, data) {
874        let line = &data[start..end_pos];
875        // Find the position of the Nth delimiter to truncate at
876        let mut field_count = 1;
877        let mut truncate_at: Option<usize> = None;
878        for dpos in memchr_iter(delim, line) {
879            if field_count >= last_field {
880                truncate_at = Some(start + dpos);
881                break;
882            }
883            field_count += 1;
884        }
885
886        if let Some(trunc_pos) = truncate_at {
887            // This line has more fields than needed. Flush run, write truncated.
888            if run_start < start {
889                out.write_all(&data[run_start..start])?;
890            }
891            out.write_all(&data[start..trunc_pos])?;
892            out.write_all(&[line_delim])?;
893            run_start = end_pos + 1;
894        }
895        // else: line has <= last_field fields, keep it in the run
896        start = end_pos + 1;
897    }
898    // Handle last line without terminator
899    if start < data.len() {
900        let line = &data[start..];
901        let mut field_count = 1;
902        let mut truncate_at: Option<usize> = None;
903        for dpos in memchr_iter(delim, line) {
904            if field_count >= last_field {
905                truncate_at = Some(start + dpos);
906                break;
907            }
908            field_count += 1;
909        }
910        if let Some(trunc_pos) = truncate_at {
911            if run_start < start {
912                out.write_all(&data[run_start..start])?;
913            }
914            out.write_all(&data[start..trunc_pos])?;
915            out.write_all(&[line_delim])?;
916            return Ok(());
917        }
918    }
919    // Flush remaining run
920    if run_start < data.len() {
921        out.write_all(&data[run_start..])?;
922        if !data.is_empty() && *data.last().unwrap() != line_delim {
923            out.write_all(&[line_delim])?;
924        }
925    }
926    Ok(())
927}
928
929/// Process a chunk for contiguous from-start field range extraction.
930fn fields_prefix_chunk(
931    data: &[u8],
932    delim: u8,
933    line_delim: u8,
934    last_field: usize,
935    suppress: bool,
936    buf: &mut Vec<u8>,
937) {
938    let mut start = 0;
939    for end_pos in memchr_iter(line_delim, data) {
940        let line = &data[start..end_pos];
941        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
942        start = end_pos + 1;
943    }
944    if start < data.len() {
945        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
946    }
947}
948
949/// Extract first N fields from one line (contiguous from-start range).
950/// Uses raw pointer arithmetic.
951#[inline(always)]
952fn fields_prefix_line(
953    line: &[u8],
954    delim: u8,
955    line_delim: u8,
956    last_field: usize,
957    suppress: bool,
958    buf: &mut Vec<u8>,
959) {
960    let len = line.len();
961    if len == 0 {
962        if !suppress {
963            buf.push(line_delim);
964        }
965        return;
966    }
967
968    buf.reserve(len + 1);
969    let base = line.as_ptr();
970
971    let mut field_count = 1;
972    let mut has_delim = false;
973
974    for pos in memchr_iter(delim, line) {
975        has_delim = true;
976        if field_count >= last_field {
977            unsafe {
978                buf_extend(buf, std::slice::from_raw_parts(base, pos));
979                buf_push(buf, line_delim);
980            }
981            return;
982        }
983        field_count += 1;
984    }
985
986    if !has_delim {
987        if !suppress {
988            unsafe {
989                buf_extend(buf, line);
990                buf_push(buf, line_delim);
991            }
992        }
993        return;
994    }
995
996    unsafe {
997        buf_extend(buf, line);
998        buf_push(buf, line_delim);
999    }
1000}
1001
1002/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1003fn process_fields_suffix(
1004    data: &[u8],
1005    delim: u8,
1006    line_delim: u8,
1007    start_field: usize,
1008    suppress: bool,
1009    out: &mut impl Write,
1010) -> io::Result<()> {
1011    if data.len() >= PARALLEL_THRESHOLD {
1012        let chunks = split_into_chunks(data, line_delim);
1013        let results: Vec<Vec<u8>> = chunks
1014            .par_iter()
1015            .map(|chunk| {
1016                let mut buf = Vec::with_capacity(chunk.len());
1017                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
1018                buf
1019            })
1020            .collect();
1021        // Use write_vectored (writev) to batch N writes into fewer syscalls
1022        let slices: Vec<IoSlice> = results
1023            .iter()
1024            .filter(|r| !r.is_empty())
1025            .map(|r| IoSlice::new(r))
1026            .collect();
1027        write_ioslices(out, &slices)?;
1028    } else {
1029        let mut buf = Vec::with_capacity(data.len());
1030        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1031        if !buf.is_empty() {
1032            out.write_all(&buf)?;
1033        }
1034    }
1035    Ok(())
1036}
1037
1038/// Process a chunk for open-ended field suffix extraction.
1039fn fields_suffix_chunk(
1040    data: &[u8],
1041    delim: u8,
1042    line_delim: u8,
1043    start_field: usize,
1044    suppress: bool,
1045    buf: &mut Vec<u8>,
1046) {
1047    let mut start = 0;
1048    for end_pos in memchr_iter(line_delim, data) {
1049        let line = &data[start..end_pos];
1050        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1051        start = end_pos + 1;
1052    }
1053    if start < data.len() {
1054        fields_suffix_line(
1055            &data[start..],
1056            delim,
1057            line_delim,
1058            start_field,
1059            suppress,
1060            buf,
1061        );
1062    }
1063}
1064
1065/// Extract fields from start_field to end from one line.
1066/// Uses raw pointer arithmetic.
1067#[inline(always)]
1068fn fields_suffix_line(
1069    line: &[u8],
1070    delim: u8,
1071    line_delim: u8,
1072    start_field: usize,
1073    suppress: bool,
1074    buf: &mut Vec<u8>,
1075) {
1076    let len = line.len();
1077    if len == 0 {
1078        if !suppress {
1079            buf.push(line_delim);
1080        }
1081        return;
1082    }
1083
1084    buf.reserve(len + 1);
1085    let base = line.as_ptr();
1086
1087    let skip_delims = start_field - 1;
1088    let mut delim_count = 0;
1089    let mut has_delim = false;
1090
1091    for pos in memchr_iter(delim, line) {
1092        has_delim = true;
1093        delim_count += 1;
1094        if delim_count >= skip_delims {
1095            unsafe {
1096                buf_extend(
1097                    buf,
1098                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1099                );
1100                buf_push(buf, line_delim);
1101            }
1102            return;
1103        }
1104    }
1105
1106    if !has_delim {
1107        if !suppress {
1108            unsafe {
1109                buf_extend(buf, line);
1110                buf_push(buf, line_delim);
1111            }
1112        }
1113        return;
1114    }
1115
1116    // Fewer delimiters than needed
1117    unsafe { buf_push(buf, line_delim) };
1118}
1119
1120/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1121/// Optimized: skip to start_field using memchr, then output until end_field.
1122fn process_fields_mid_range(
1123    data: &[u8],
1124    delim: u8,
1125    line_delim: u8,
1126    start_field: usize,
1127    end_field: usize,
1128    suppress: bool,
1129    out: &mut impl Write,
1130) -> io::Result<()> {
1131    if data.len() >= PARALLEL_THRESHOLD {
1132        let chunks = split_into_chunks(data, line_delim);
1133        let results: Vec<Vec<u8>> = chunks
1134            .par_iter()
1135            .map(|chunk| {
1136                let mut buf = Vec::with_capacity(chunk.len());
1137                fields_mid_range_chunk(
1138                    chunk,
1139                    delim,
1140                    line_delim,
1141                    start_field,
1142                    end_field,
1143                    suppress,
1144                    &mut buf,
1145                );
1146                buf
1147            })
1148            .collect();
1149        let slices: Vec<IoSlice> = results
1150            .iter()
1151            .filter(|r| !r.is_empty())
1152            .map(|r| IoSlice::new(r))
1153            .collect();
1154        write_ioslices(out, &slices)?;
1155    } else {
1156        let mut buf = Vec::with_capacity(data.len());
1157        fields_mid_range_chunk(
1158            data,
1159            delim,
1160            line_delim,
1161            start_field,
1162            end_field,
1163            suppress,
1164            &mut buf,
1165        );
1166        if !buf.is_empty() {
1167            out.write_all(&buf)?;
1168        }
1169    }
1170    Ok(())
1171}
1172
1173/// Process a chunk for contiguous mid-range field extraction.
1174fn fields_mid_range_chunk(
1175    data: &[u8],
1176    delim: u8,
1177    line_delim: u8,
1178    start_field: usize,
1179    end_field: usize,
1180    suppress: bool,
1181    buf: &mut Vec<u8>,
1182) {
1183    let mut start = 0;
1184    for end_pos in memchr_iter(line_delim, data) {
1185        let line = &data[start..end_pos];
1186        fields_mid_range_line(
1187            line,
1188            delim,
1189            line_delim,
1190            start_field,
1191            end_field,
1192            suppress,
1193            buf,
1194        );
1195        start = end_pos + 1;
1196    }
1197    if start < data.len() {
1198        fields_mid_range_line(
1199            &data[start..],
1200            delim,
1201            line_delim,
1202            start_field,
1203            end_field,
1204            suppress,
1205            buf,
1206        );
1207    }
1208}
1209
1210/// Extract fields start_field..=end_field from one line.
1211/// Uses memchr_iter to skip to start_field, then counts delimiters to end_field.
1212/// Raw pointer arithmetic to eliminate bounds checking.
1213#[inline(always)]
1214fn fields_mid_range_line(
1215    line: &[u8],
1216    delim: u8,
1217    line_delim: u8,
1218    start_field: usize,
1219    end_field: usize,
1220    suppress: bool,
1221    buf: &mut Vec<u8>,
1222) {
1223    let len = line.len();
1224    if len == 0 {
1225        if !suppress {
1226            buf.push(line_delim);
1227        }
1228        return;
1229    }
1230
1231    buf.reserve(len + 1);
1232    let base = line.as_ptr();
1233
1234    // Count delimiters to find start_field and end_field boundaries
1235    let skip_before = start_field - 1; // delimiters to skip before start_field
1236    let field_span = end_field - start_field; // additional delimiters within the range
1237    let mut delim_count = 0;
1238    let mut range_start = 0;
1239    let mut has_delim = false;
1240
1241    for pos in memchr_iter(delim, line) {
1242        has_delim = true;
1243        delim_count += 1;
1244        if delim_count == skip_before {
1245            range_start = pos + 1;
1246        }
1247        if delim_count == skip_before + field_span + 1 {
1248            // Found the delimiter after end_field — output the range
1249            if skip_before == 0 {
1250                range_start = 0;
1251            }
1252            unsafe {
1253                buf_extend(
1254                    buf,
1255                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
1256                );
1257                buf_push(buf, line_delim);
1258            }
1259            return;
1260        }
1261    }
1262
1263    if !has_delim {
1264        if !suppress {
1265            unsafe {
1266                buf_extend(buf, line);
1267                buf_push(buf, line_delim);
1268            }
1269        }
1270        return;
1271    }
1272
1273    // Line has delimiters but fewer fields than end_field
1274    if delim_count >= skip_before {
1275        // We have at least start_field, output from range_start to end
1276        if skip_before == 0 {
1277            range_start = 0;
1278        }
1279        unsafe {
1280            buf_extend(
1281                buf,
1282                std::slice::from_raw_parts(base.add(range_start), len - range_start),
1283            );
1284            buf_push(buf, line_delim);
1285        }
1286    } else {
1287        // Not enough fields even for start_field — output empty line
1288        unsafe { buf_push(buf, line_delim) };
1289    }
1290}
1291
1292/// Combined SIMD scan for arbitrary single field extraction.
1293/// Uses memchr2_iter(delim, line_delim) to scan for both bytes in a single SIMD pass.
1294/// This is faster than the nested approach (outer: find newlines, inner: find delimiters)
1295/// because it eliminates one full SIMD scan and improves cache locality.
1296///
1297/// For target_idx == 0 (field 1), after finding the target field we skip remaining
1298/// delimiters on the line by scanning directly for line_delim.
1299fn process_nth_field_combined(
1300    data: &[u8],
1301    delim: u8,
1302    line_delim: u8,
1303    target_idx: usize,
1304    suppress: bool,
1305    buf: &mut Vec<u8>,
1306) {
1307    buf.reserve(data.len());
1308
1309    let data_len = data.len();
1310    let base = data.as_ptr();
1311    let mut line_start: usize = 0;
1312    let mut field_start: usize = 0;
1313    let mut field_idx: usize = 0;
1314    let mut has_delim = false;
1315    let mut emitted = false;
1316
1317    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1318        let byte = unsafe { *base.add(pos) };
1319
1320        if byte == line_delim {
1321            // End of line
1322            if !emitted {
1323                if has_delim && field_idx == target_idx {
1324                    // Last field matches target
1325                    unsafe {
1326                        buf_extend(
1327                            buf,
1328                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
1329                        );
1330                        buf_push(buf, line_delim);
1331                    }
1332                } else if has_delim {
1333                    // Target field doesn't exist (fewer fields)
1334                    unsafe {
1335                        buf_push(buf, line_delim);
1336                    }
1337                } else if !suppress {
1338                    // No delimiter in line — output unchanged
1339                    unsafe {
1340                        buf_extend(
1341                            buf,
1342                            std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1343                        );
1344                        buf_push(buf, line_delim);
1345                    }
1346                }
1347            }
1348            // Reset for next line
1349            line_start = pos + 1;
1350            field_start = pos + 1;
1351            field_idx = 0;
1352            has_delim = false;
1353            emitted = false;
1354        } else {
1355            // Delimiter found
1356            has_delim = true;
1357            if field_idx == target_idx {
1358                unsafe {
1359                    buf_extend(
1360                        buf,
1361                        std::slice::from_raw_parts(base.add(field_start), pos - field_start),
1362                    );
1363                    buf_push(buf, line_delim);
1364                }
1365                emitted = true;
1366            }
1367            field_idx += 1;
1368            field_start = pos + 1;
1369        }
1370    }
1371
1372    // Handle last line without trailing newline
1373    if line_start < data_len && !emitted {
1374        if has_delim && field_idx == target_idx {
1375            unsafe {
1376                buf_extend(
1377                    buf,
1378                    std::slice::from_raw_parts(base.add(field_start), data_len - field_start),
1379                );
1380                buf_push(buf, line_delim);
1381            }
1382        } else if has_delim {
1383            unsafe {
1384                buf_push(buf, line_delim);
1385            }
1386        } else if !suppress {
1387            unsafe {
1388                buf_extend(
1389                    buf,
1390                    std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
1391                );
1392                buf_push(buf, line_delim);
1393            }
1394        }
1395    }
1396}
1397
1398/// Zero-copy field-1 extraction: writes contiguous runs directly from source data.
1399/// For each line: if delimiter exists, truncate at first delimiter; otherwise pass through.
1400/// Uses memchr2 to scan for both delimiter and line terminator in a single SIMD pass.
1401#[inline]
1402fn single_field1_zerocopy(
1403    data: &[u8],
1404    delim: u8,
1405    line_delim: u8,
1406    out: &mut impl Write,
1407) -> io::Result<()> {
1408    let mut line_start: usize = 0;
1409    let mut run_start: usize = 0;
1410    let mut first_delim: Option<usize> = None;
1411
1412    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1413        let byte = unsafe { *data.get_unchecked(pos) };
1414
1415        if byte == line_delim {
1416            // End of line
1417            if let Some(dp) = first_delim {
1418                // Line has delimiter — truncate at first delimiter.
1419                // Flush current run up to line_start, write truncated line.
1420                if run_start < line_start {
1421                    out.write_all(&data[run_start..line_start])?;
1422                }
1423                out.write_all(&data[line_start..dp])?;
1424                out.write_all(&[line_delim])?;
1425                run_start = pos + 1;
1426            }
1427            // else: no delimiter in line, output unchanged (stays in run)
1428            line_start = pos + 1;
1429            first_delim = None;
1430        } else {
1431            // Delimiter found
1432            if first_delim.is_none() {
1433                first_delim = Some(pos);
1434            }
1435        }
1436    }
1437
1438    // Handle last line (no trailing line_delim)
1439    if line_start < data.len() {
1440        if let Some(dp) = first_delim {
1441            if run_start < line_start {
1442                out.write_all(&data[run_start..line_start])?;
1443            }
1444            out.write_all(&data[line_start..dp])?;
1445            out.write_all(&[line_delim])?;
1446            return Ok(());
1447        }
1448    }
1449
1450    // Flush remaining run
1451    if run_start < data.len() {
1452        out.write_all(&data[run_start..])?;
1453        if !data.is_empty() && *data.last().unwrap() != line_delim {
1454            out.write_all(&[line_delim])?;
1455        }
1456    }
1457    Ok(())
1458}
1459
1460/// Optimized path for extracting small field indices (2-4) without suppress.
1461/// Uses per-line memchr calls to find the target field boundaries.
1462/// For field 2: finds the 1st delimiter (start of field 2), then the 2nd (end).
1463/// More efficient than memchr2_iter for small field indices since we stop early.
1464/// Uses raw pointer arithmetic to eliminate bounds checking.
1465fn process_small_field_combined(
1466    data: &[u8],
1467    delim: u8,
1468    line_delim: u8,
1469    target_idx: usize,
1470    buf: &mut Vec<u8>,
1471) {
1472    buf.reserve(data.len());
1473    let base = data.as_ptr();
1474    let data_len = data.len();
1475    let mut start = 0;
1476    for end_pos in memchr_iter(line_delim, data) {
1477        let line_len = end_pos - start;
1478        let line = unsafe { std::slice::from_raw_parts(base.add(start), line_len) };
1479        let line_base = line.as_ptr();
1480        // Find the start of the target field (skip target_idx delimiters)
1481        let mut field_start = 0;
1482        let mut found_start = target_idx == 0;
1483        let mut delim_count = 0;
1484        if !found_start {
1485            let mut search_start = 0;
1486            while let Some(pos) = memchr::memchr(delim, unsafe {
1487                std::slice::from_raw_parts(line_base.add(search_start), line_len - search_start)
1488            }) {
1489                delim_count += 1;
1490                if delim_count == target_idx {
1491                    field_start = search_start + pos + 1;
1492                    found_start = true;
1493                    break;
1494                }
1495                search_start = search_start + pos + 1;
1496            }
1497        }
1498        if !found_start {
1499            // Line has fewer fields than needed - output as-is (no suppress)
1500            unsafe {
1501                buf_extend(buf, line);
1502                buf_push(buf, line_delim);
1503            }
1504        } else if field_start >= line_len {
1505            // Empty field at end
1506            unsafe { buf_push(buf, line_delim) };
1507        } else {
1508            // Find the end of the target field
1509            match memchr::memchr(delim, unsafe {
1510                std::slice::from_raw_parts(line_base.add(field_start), line_len - field_start)
1511            }) {
1512                Some(pos) => unsafe {
1513                    buf_extend(
1514                        buf,
1515                        std::slice::from_raw_parts(line_base.add(field_start), pos),
1516                    );
1517                    buf_push(buf, line_delim);
1518                },
1519                None => unsafe {
1520                    buf_extend(
1521                        buf,
1522                        std::slice::from_raw_parts(
1523                            line_base.add(field_start),
1524                            line_len - field_start,
1525                        ),
1526                    );
1527                    buf_push(buf, line_delim);
1528                },
1529            }
1530        }
1531        start = end_pos + 1;
1532    }
1533    // Handle last line without terminator
1534    if start < data_len {
1535        let line_len = data_len - start;
1536        let line = unsafe { std::slice::from_raw_parts(base.add(start), line_len) };
1537        let line_base = line.as_ptr();
1538        let mut field_start = 0;
1539        let mut found_start = target_idx == 0;
1540        let mut delim_count = 0;
1541        if !found_start {
1542            let mut search_start = 0;
1543            while let Some(pos) = memchr::memchr(delim, unsafe {
1544                std::slice::from_raw_parts(line_base.add(search_start), line_len - search_start)
1545            }) {
1546                delim_count += 1;
1547                if delim_count == target_idx {
1548                    field_start = search_start + pos + 1;
1549                    found_start = true;
1550                    break;
1551                }
1552                search_start = search_start + pos + 1;
1553            }
1554        }
1555        if !found_start {
1556            unsafe {
1557                buf_extend(buf, line);
1558                buf_push(buf, line_delim);
1559            }
1560        } else if field_start >= line_len {
1561            unsafe { buf_push(buf, line_delim) };
1562        } else {
1563            match memchr::memchr(delim, unsafe {
1564                std::slice::from_raw_parts(line_base.add(field_start), line_len - field_start)
1565            }) {
1566                Some(pos) => unsafe {
1567                    buf_extend(
1568                        buf,
1569                        std::slice::from_raw_parts(line_base.add(field_start), pos),
1570                    );
1571                    buf_push(buf, line_delim);
1572                },
1573                None => unsafe {
1574                    buf_extend(
1575                        buf,
1576                        std::slice::from_raw_parts(
1577                            line_base.add(field_start),
1578                            line_len - field_start,
1579                        ),
1580                    );
1581                    buf_push(buf, line_delim);
1582                },
1583            }
1584        }
1585    }
1586}
1587
1588/// Process a chunk of data for single-field extraction.
1589fn process_single_field_chunk(
1590    data: &[u8],
1591    delim: u8,
1592    target_idx: usize,
1593    line_delim: u8,
1594    suppress: bool,
1595    buf: &mut Vec<u8>,
1596) {
1597    let mut start = 0;
1598    for end_pos in memchr_iter(line_delim, data) {
1599        let line = &data[start..end_pos];
1600        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
1601        start = end_pos + 1;
1602    }
1603    if start < data.len() {
1604        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
1605    }
1606}
1607
1608/// Extract a single field from one line.
1609/// Uses unsafe buf helpers — caller must ensure buf has capacity reserved.
1610/// Raw pointer arithmetic eliminates per-field bounds checking.
1611#[inline(always)]
1612fn extract_single_field_line(
1613    line: &[u8],
1614    delim: u8,
1615    target_idx: usize,
1616    line_delim: u8,
1617    suppress: bool,
1618    buf: &mut Vec<u8>,
1619) {
1620    let len = line.len();
1621    if len == 0 {
1622        if !suppress {
1623            buf.push(line_delim);
1624        }
1625        return;
1626    }
1627
1628    // Ensure capacity for worst case (full line + newline)
1629    buf.reserve(len + 1);
1630
1631    let base = line.as_ptr();
1632
1633    // Ultra-fast path for first field: single memchr
1634    if target_idx == 0 {
1635        match memchr::memchr(delim, line) {
1636            Some(pos) => unsafe {
1637                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1638                buf_push(buf, line_delim);
1639            },
1640            None => {
1641                if !suppress {
1642                    unsafe {
1643                        buf_extend(buf, line);
1644                        buf_push(buf, line_delim);
1645                    }
1646                }
1647            }
1648        }
1649        return;
1650    }
1651
1652    let mut field_start = 0;
1653    let mut field_idx = 0;
1654    let mut has_delim = false;
1655
1656    for pos in memchr_iter(delim, line) {
1657        has_delim = true;
1658        if field_idx == target_idx {
1659            unsafe {
1660                buf_extend(
1661                    buf,
1662                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
1663                );
1664                buf_push(buf, line_delim);
1665            }
1666            return;
1667        }
1668        field_idx += 1;
1669        field_start = pos + 1;
1670    }
1671
1672    if !has_delim {
1673        if !suppress {
1674            unsafe {
1675                buf_extend(buf, line);
1676                buf_push(buf, line_delim);
1677            }
1678        }
1679        return;
1680    }
1681
1682    if field_idx == target_idx {
1683        unsafe {
1684            buf_extend(
1685                buf,
1686                std::slice::from_raw_parts(base.add(field_start), len - field_start),
1687            );
1688            buf_push(buf, line_delim);
1689        }
1690    } else {
1691        unsafe { buf_push(buf, line_delim) };
1692    }
1693}
1694
1695/// Extract fields from a single line into the output buffer.
1696/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
1697/// Raw pointer arithmetic eliminates per-field bounds checking.
1698#[inline(always)]
1699fn extract_fields_to_buf(
1700    line: &[u8],
1701    delim: u8,
1702    ranges: &[Range],
1703    output_delim: &[u8],
1704    suppress: bool,
1705    max_field: usize,
1706    field_mask: u64,
1707    line_delim: u8,
1708    buf: &mut Vec<u8>,
1709    complement: bool,
1710) {
1711    let len = line.len();
1712
1713    if len == 0 {
1714        if !suppress {
1715            buf.push(line_delim);
1716        }
1717        return;
1718    }
1719
1720    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
1721    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
1722    let needed = len + output_delim.len() * 16 + 1;
1723    if buf.capacity() - buf.len() < needed {
1724        buf.reserve(needed);
1725    }
1726
1727    let base = line.as_ptr();
1728    let mut field_num: usize = 1;
1729    let mut field_start: usize = 0;
1730    let mut first_output = true;
1731    let mut has_delim = false;
1732
1733    for delim_pos in memchr_iter(delim, line) {
1734        has_delim = true;
1735
1736        if is_selected(field_num, field_mask, ranges, complement) {
1737            if !first_output {
1738                unsafe { buf_extend(buf, output_delim) };
1739            }
1740            unsafe {
1741                buf_extend(
1742                    buf,
1743                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
1744                )
1745            };
1746            first_output = false;
1747        }
1748
1749        field_num += 1;
1750        field_start = delim_pos + 1;
1751
1752        if field_num > max_field {
1753            break;
1754        }
1755    }
1756
1757    // Last field
1758    if (field_num <= max_field || complement)
1759        && has_delim
1760        && is_selected(field_num, field_mask, ranges, complement)
1761    {
1762        if !first_output {
1763            unsafe { buf_extend(buf, output_delim) };
1764        }
1765        unsafe {
1766            buf_extend(
1767                buf,
1768                std::slice::from_raw_parts(base.add(field_start), len - field_start),
1769            )
1770        };
1771        first_output = false;
1772    }
1773
1774    if !first_output {
1775        unsafe { buf_push(buf, line_delim) };
1776    } else if !has_delim {
1777        if !suppress {
1778            unsafe {
1779                buf_extend(buf, line);
1780                buf_push(buf, line_delim);
1781            }
1782        }
1783    } else {
1784        unsafe { buf_push(buf, line_delim) };
1785    }
1786}
1787
1788// ── Fast path: byte/char extraction with batched output ──────────────────
1789
1790/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
1791/// Zero-copy: writes directly from the source data using output runs.
1792/// For lines shorter than max_bytes, the output is identical to the input,
1793/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
1794fn process_bytes_from_start(
1795    data: &[u8],
1796    max_bytes: usize,
1797    line_delim: u8,
1798    out: &mut impl Write,
1799) -> io::Result<()> {
1800    if data.len() >= PARALLEL_THRESHOLD {
1801        let chunks = split_into_chunks(data, line_delim);
1802        let results: Vec<Vec<u8>> = chunks
1803            .par_iter()
1804            .map(|chunk| {
1805                let mut buf = Vec::with_capacity(chunk.len());
1806                bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
1807                buf
1808            })
1809            .collect();
1810        // Use write_vectored (writev) to batch N writes into fewer syscalls
1811        let slices: Vec<IoSlice> = results
1812            .iter()
1813            .filter(|r| !r.is_empty())
1814            .map(|r| IoSlice::new(r))
1815            .collect();
1816        write_ioslices(out, &slices)?;
1817    } else {
1818        // Zero-copy path: track contiguous output runs and write directly from source.
1819        // For lines <= max_bytes, we include them as-is (no copy needed).
1820        // For lines > max_bytes, we flush the run, write the truncated line, start new run.
1821        bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
1822    }
1823    Ok(())
1824}
1825
1826/// Zero-copy byte-prefix extraction: writes contiguous runs directly from the source data.
1827/// Only copies when a line needs truncation (line > max_bytes).
1828#[inline]
1829fn bytes_from_start_zerocopy(
1830    data: &[u8],
1831    max_bytes: usize,
1832    line_delim: u8,
1833    out: &mut impl Write,
1834) -> io::Result<()> {
1835    let mut start = 0;
1836    let mut run_start: usize = 0;
1837
1838    for pos in memchr_iter(line_delim, data) {
1839        let line_len = pos - start;
1840        if line_len > max_bytes {
1841            // This line needs truncation. Flush current run, write truncated line.
1842            if run_start < start {
1843                out.write_all(&data[run_start..start])?;
1844            }
1845            out.write_all(&data[start..start + max_bytes])?;
1846            out.write_all(&[line_delim])?;
1847            run_start = pos + 1;
1848        }
1849        // else: line fits, keep it in the current contiguous run
1850        start = pos + 1;
1851    }
1852    // Handle last line without terminator
1853    if start < data.len() {
1854        let line_len = data.len() - start;
1855        if line_len > max_bytes {
1856            if run_start < start {
1857                out.write_all(&data[run_start..start])?;
1858            }
1859            out.write_all(&data[start..start + max_bytes])?;
1860            out.write_all(&[line_delim])?;
1861            return Ok(());
1862        }
1863    }
1864    // Flush remaining run (includes all short lines + the last line)
1865    if run_start < data.len() {
1866        out.write_all(&data[run_start..])?;
1867        // Add terminator if last byte isn't one
1868        if !data.is_empty() && *data.last().unwrap() != line_delim {
1869            out.write_all(&[line_delim])?;
1870        }
1871    }
1872    Ok(())
1873}
1874
1875/// Process a chunk for from-start byte range extraction (parallel path).
1876/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1877#[inline]
1878fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1879    // Reserve enough capacity: output <= input size
1880    buf.reserve(data.len());
1881
1882    let mut start = 0;
1883    for pos in memchr_iter(line_delim, data) {
1884        let line_len = pos - start;
1885        let take = line_len.min(max_bytes);
1886        unsafe {
1887            buf_extend(buf, &data[start..start + take]);
1888            buf_push(buf, line_delim);
1889        }
1890        start = pos + 1;
1891    }
1892    // Handle last line without terminator
1893    if start < data.len() {
1894        let line_len = data.len() - start;
1895        let take = line_len.min(max_bytes);
1896        unsafe {
1897            buf_extend(buf, &data[start..start + take]);
1898            buf_push(buf, line_delim);
1899        }
1900    }
1901}
1902
1903/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
1904fn process_bytes_from_offset(
1905    data: &[u8],
1906    skip_bytes: usize,
1907    line_delim: u8,
1908    out: &mut impl Write,
1909) -> io::Result<()> {
1910    if data.len() >= PARALLEL_THRESHOLD {
1911        let chunks = split_into_chunks(data, line_delim);
1912        let results: Vec<Vec<u8>> = chunks
1913            .par_iter()
1914            .map(|chunk| {
1915                let mut buf = Vec::with_capacity(chunk.len());
1916                bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
1917                buf
1918            })
1919            .collect();
1920        // Use write_vectored (writev) to batch N writes into fewer syscalls
1921        let slices: Vec<IoSlice> = results
1922            .iter()
1923            .filter(|r| !r.is_empty())
1924            .map(|r| IoSlice::new(r))
1925            .collect();
1926        write_ioslices(out, &slices)?;
1927    } else {
1928        // Zero-copy: write suffix of each line directly from source
1929        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
1930    }
1931    Ok(())
1932}
1933
1934/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
1935/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
1936/// reducing syscall overhead from 2 write_all calls per line to batched writev.
1937#[inline]
1938fn bytes_from_offset_zerocopy(
1939    data: &[u8],
1940    skip_bytes: usize,
1941    line_delim: u8,
1942    out: &mut impl Write,
1943) -> io::Result<()> {
1944    let delim_buf = [line_delim];
1945    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
1946
1947    let mut start = 0;
1948    for pos in memchr_iter(line_delim, data) {
1949        let line_len = pos - start;
1950        if line_len > skip_bytes {
1951            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
1952        }
1953        iov.push(IoSlice::new(&delim_buf));
1954        // Flush when approaching MAX_IOV to avoid oversized writev
1955        if iov.len() >= MAX_IOV - 1 {
1956            write_ioslices(out, &iov)?;
1957            iov.clear();
1958        }
1959        start = pos + 1;
1960    }
1961    if start < data.len() {
1962        let line_len = data.len() - start;
1963        if line_len > skip_bytes {
1964            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
1965        }
1966        iov.push(IoSlice::new(&delim_buf));
1967    }
1968    if !iov.is_empty() {
1969        write_ioslices(out, &iov)?;
1970    }
1971    Ok(())
1972}
1973
1974/// Process a chunk for from-offset byte range extraction.
1975/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1976#[inline]
1977fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1978    buf.reserve(data.len());
1979
1980    let mut start = 0;
1981    for pos in memchr_iter(line_delim, data) {
1982        let line_len = pos - start;
1983        if line_len > skip_bytes {
1984            unsafe {
1985                buf_extend(buf, &data[start + skip_bytes..pos]);
1986            }
1987        }
1988        unsafe {
1989            buf_push(buf, line_delim);
1990        }
1991        start = pos + 1;
1992    }
1993    if start < data.len() {
1994        let line_len = data.len() - start;
1995        if line_len > skip_bytes {
1996            unsafe {
1997                buf_extend(buf, &data[start + skip_bytes..data.len()]);
1998            }
1999        }
2000        unsafe {
2001            buf_push(buf, line_delim);
2002        }
2003    }
2004}
2005
2006/// Optimized byte/char extraction with batched output and parallel processing.
2007fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2008    let line_delim = cfg.line_delim;
2009    let ranges = cfg.ranges;
2010    let complement = cfg.complement;
2011    let output_delim = cfg.output_delim;
2012
2013    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
2014    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
2015        let max_bytes = ranges[0].end;
2016        if max_bytes < usize::MAX {
2017            return process_bytes_from_start(data, max_bytes, line_delim, out);
2018        }
2019    }
2020
2021    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
2022    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
2023        let skip_bytes = ranges[0].start.saturating_sub(1);
2024        if skip_bytes > 0 {
2025            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
2026        }
2027    }
2028
2029    if data.len() >= PARALLEL_THRESHOLD {
2030        let chunks = split_into_chunks(data, line_delim);
2031        let results: Vec<Vec<u8>> = chunks
2032            .par_iter()
2033            .map(|chunk| {
2034                let mut buf = Vec::with_capacity(chunk.len());
2035                process_bytes_chunk(
2036                    chunk,
2037                    ranges,
2038                    complement,
2039                    output_delim,
2040                    line_delim,
2041                    &mut buf,
2042                );
2043                buf
2044            })
2045            .collect();
2046        // Use write_vectored (writev) to batch N writes into fewer syscalls
2047        let slices: Vec<IoSlice> = results
2048            .iter()
2049            .filter(|r| !r.is_empty())
2050            .map(|r| IoSlice::new(r))
2051            .collect();
2052        write_ioslices(out, &slices)?;
2053    } else {
2054        let mut buf = Vec::with_capacity(data.len());
2055        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
2056        if !buf.is_empty() {
2057            out.write_all(&buf)?;
2058        }
2059    }
2060    Ok(())
2061}
2062
2063/// Process a chunk of data for byte/char extraction.
2064/// Uses raw pointer arithmetic for the newline scan.
2065fn process_bytes_chunk(
2066    data: &[u8],
2067    ranges: &[Range],
2068    complement: bool,
2069    output_delim: &[u8],
2070    line_delim: u8,
2071    buf: &mut Vec<u8>,
2072) {
2073    buf.reserve(data.len());
2074    let base = data.as_ptr();
2075    let mut start = 0;
2076    for end_pos in memchr_iter(line_delim, data) {
2077        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
2078        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2079        unsafe { buf_push(buf, line_delim) };
2080        start = end_pos + 1;
2081    }
2082    if start < data.len() {
2083        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
2084        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
2085        unsafe { buf_push(buf, line_delim) };
2086    }
2087}
2088
2089/// Extract byte ranges from a line into the output buffer.
2090/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
2091/// Raw pointer arithmetic eliminates per-range bounds checking.
2092#[inline(always)]
2093fn cut_bytes_to_buf(
2094    line: &[u8],
2095    ranges: &[Range],
2096    complement: bool,
2097    output_delim: &[u8],
2098    buf: &mut Vec<u8>,
2099) {
2100    let len = line.len();
2101    let base = line.as_ptr();
2102    let mut first_range = true;
2103
2104    // Reserve worst case: full line + delimiters between ranges
2105    let needed = len + output_delim.len() * ranges.len() + 1;
2106    if buf.capacity() - buf.len() < needed {
2107        buf.reserve(needed);
2108    }
2109
2110    if complement {
2111        let mut pos: usize = 1;
2112        for r in ranges {
2113            let rs = r.start;
2114            let re = r.end.min(len);
2115            if pos < rs {
2116                if !first_range && !output_delim.is_empty() {
2117                    unsafe { buf_extend(buf, output_delim) };
2118                }
2119                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
2120                first_range = false;
2121            }
2122            pos = re + 1;
2123            if pos > len {
2124                break;
2125            }
2126        }
2127        if pos <= len {
2128            if !first_range && !output_delim.is_empty() {
2129                unsafe { buf_extend(buf, output_delim) };
2130            }
2131            unsafe {
2132                buf_extend(
2133                    buf,
2134                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
2135                )
2136            };
2137        }
2138    } else if output_delim.is_empty() && ranges.len() == 1 {
2139        // Ultra-fast path: single range, no output delimiter
2140        let start = ranges[0].start.saturating_sub(1);
2141        let end = ranges[0].end.min(len);
2142        if start < len {
2143            unsafe {
2144                buf_extend(
2145                    buf,
2146                    std::slice::from_raw_parts(base.add(start), end - start),
2147                )
2148            };
2149        }
2150    } else {
2151        for r in ranges {
2152            let start = r.start.saturating_sub(1);
2153            let end = r.end.min(len);
2154            if start >= len {
2155                break;
2156            }
2157            if !first_range && !output_delim.is_empty() {
2158                unsafe { buf_extend(buf, output_delim) };
2159            }
2160            unsafe {
2161                buf_extend(
2162                    buf,
2163                    std::slice::from_raw_parts(base.add(start), end - start),
2164                )
2165            };
2166            first_range = false;
2167        }
2168    }
2169}
2170
2171// ── Public API ───────────────────────────────────────────────────────────
2172
2173/// Cut fields from a line using a delimiter. Writes to `out`.
2174#[inline]
2175pub fn cut_fields(
2176    line: &[u8],
2177    delim: u8,
2178    ranges: &[Range],
2179    complement: bool,
2180    output_delim: &[u8],
2181    suppress_no_delim: bool,
2182    out: &mut impl Write,
2183) -> io::Result<bool> {
2184    if memchr::memchr(delim, line).is_none() {
2185        if !suppress_no_delim {
2186            out.write_all(line)?;
2187            return Ok(true);
2188        }
2189        return Ok(false);
2190    }
2191
2192    let mut field_num: usize = 1;
2193    let mut field_start: usize = 0;
2194    let mut first_output = true;
2195
2196    for delim_pos in memchr_iter(delim, line) {
2197        let selected = in_ranges(ranges, field_num) != complement;
2198        if selected {
2199            if !first_output {
2200                out.write_all(output_delim)?;
2201            }
2202            out.write_all(&line[field_start..delim_pos])?;
2203            first_output = false;
2204        }
2205        field_start = delim_pos + 1;
2206        field_num += 1;
2207    }
2208
2209    let selected = in_ranges(ranges, field_num) != complement;
2210    if selected {
2211        if !first_output {
2212            out.write_all(output_delim)?;
2213        }
2214        out.write_all(&line[field_start..])?;
2215    }
2216
2217    Ok(true)
2218}
2219
2220/// Cut bytes/chars from a line. Writes selected bytes to `out`.
2221#[inline]
2222pub fn cut_bytes(
2223    line: &[u8],
2224    ranges: &[Range],
2225    complement: bool,
2226    output_delim: &[u8],
2227    out: &mut impl Write,
2228) -> io::Result<bool> {
2229    let mut first_range = true;
2230
2231    if complement {
2232        let len = line.len();
2233        let mut comp_ranges = Vec::new();
2234        let mut pos: usize = 1;
2235        for r in ranges {
2236            let rs = r.start;
2237            let re = r.end.min(len);
2238            if pos < rs {
2239                comp_ranges.push((pos, rs - 1));
2240            }
2241            pos = re + 1;
2242            if pos > len {
2243                break;
2244            }
2245        }
2246        if pos <= len {
2247            comp_ranges.push((pos, len));
2248        }
2249        for &(s, e) in &comp_ranges {
2250            if !first_range && !output_delim.is_empty() {
2251                out.write_all(output_delim)?;
2252            }
2253            out.write_all(&line[s - 1..e])?;
2254            first_range = false;
2255        }
2256    } else {
2257        for r in ranges {
2258            let start = r.start.saturating_sub(1);
2259            let end = r.end.min(line.len());
2260            if start >= line.len() {
2261                break;
2262            }
2263            if !first_range && !output_delim.is_empty() {
2264                out.write_all(output_delim)?;
2265            }
2266            out.write_all(&line[start..end])?;
2267            first_range = false;
2268        }
2269    }
2270    Ok(true)
2271}
2272
2273/// Process a full data buffer (from mmap or read) with cut operation.
2274pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2275    match cfg.mode {
2276        CutMode::Fields => process_fields_fast(data, cfg, out),
2277        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
2278    }
2279}
2280
2281/// Process input from a reader (for stdin).
2282/// Uses batch reading: reads large chunks (16MB), then processes them in batch
2283/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
2284/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
2285pub fn process_cut_reader<R: BufRead>(
2286    mut reader: R,
2287    cfg: &CutConfig,
2288    out: &mut impl Write,
2289) -> io::Result<()> {
2290    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
2291    let line_delim = cfg.line_delim;
2292
2293    // Read large chunks and process in batch.
2294    // We keep a buffer; after processing complete lines, we shift leftover to the front.
2295    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
2296
2297    loop {
2298        // Read up to CHUNK_SIZE bytes
2299        buf.reserve(CHUNK_SIZE);
2300        let read_start = buf.len();
2301        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
2302        let n = read_fully(&mut reader, &mut buf[read_start..])?;
2303        buf.truncate(read_start + n);
2304
2305        if buf.is_empty() {
2306            break;
2307        }
2308
2309        if n == 0 {
2310            // EOF with leftover data (last line without terminator)
2311            process_cut_data(&buf, cfg, out)?;
2312            break;
2313        }
2314
2315        // Find the last line delimiter in the buffer so we process complete lines
2316        let process_end = match memchr::memrchr(line_delim, &buf) {
2317            Some(pos) => pos + 1,
2318            None => {
2319                // No line delimiter found — keep accumulating
2320                continue;
2321            }
2322        };
2323
2324        // Process the complete lines using the fast batch path
2325        process_cut_data(&buf[..process_end], cfg, out)?;
2326
2327        // Shift leftover to the front for next iteration
2328        let leftover_len = buf.len() - process_end;
2329        if leftover_len > 0 {
2330            buf.copy_within(process_end.., 0);
2331        }
2332        buf.truncate(leftover_len);
2333    }
2334
2335    Ok(())
2336}
2337
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `read` until `buf` is full or the reader reports EOF (`Ok(0)`).
/// Returns the number of bytes placed at the start of `buf`; a value smaller than
/// `buf.len()` means EOF was reached.
///
/// # Errors
/// Returns the first read error that is not `ErrorKind::Interrupted`. Interrupted reads
/// are retried, including the very first one.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    // A single loop so that every read, not just the retries, gets the same
    // EOF / partial-read / EINTR handling.
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
2357
/// Cut operation mode: what unit the ranges in a `CutConfig` select.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Select by byte position within each line.
    Bytes,
    /// Select by character position. `process_cut_data` routes this through the same
    /// byte-oriented path as [`CutMode::Bytes`].
    Characters,
    /// Select delimiter-separated fields.
    Fields,
}