// coreutils_rs/cut/core.rs — core implementation of the `cut` utility.
1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, IoSlice, Write};
4
/// Minimum input size (2 MiB) before the parallel (rayon) processing paths
/// are taken; smaller inputs are handled single-threaded to avoid overhead.
const PARALLEL_THRESHOLD: usize = 2 * 1024 * 1024;

/// Max iovec entries per writev call (Linux default) — used by
/// `write_ioslices` to batch vectored writes.
const MAX_IOV: usize = 1024;
10
/// Configuration for cut operations.
pub struct CutConfig<'a> {
    // Cut mode selector (`CutMode` is declared elsewhere in this crate).
    pub mode: CutMode,
    // Sorted, merged, 1-based selection ranges (produced by `parse_ranges`).
    pub ranges: &'a [Range],
    // When true, select everything NOT covered by `ranges`.
    pub complement: bool,
    // Input field delimiter byte.
    pub delim: u8,
    // Byte sequence emitted between selected fields on output.
    pub output_delim: &'a [u8],
    // When true, lines containing no delimiter are dropped entirely.
    pub suppress_no_delim: bool,
    // Line terminator byte (typically b'\n', or b'\0' for NUL-terminated mode).
    pub line_delim: u8,
}
21
/// A range specification like 1, 3-5, -3, 4-
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based, 0 means "from beginning"
    pub end: usize,   // 1-based, usize::MAX means "to end"
}

/// Parse a LIST specification like "1,3-5,7-" into ranges.
/// Each range is 1-based. Returns sorted, merged ranges.
pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
    let mut parsed: Vec<Range> = Vec::new();

    for raw in spec.split(',') {
        let item = raw.trim();
        if item.is_empty() {
            continue;
        }

        // "a-b" forms split at the first dash; a bare number is a 1-element range.
        let range = match item.find('-') {
            Some(dash) => {
                let (lo, hi) = (&item[..dash], &item[dash + 1..]);
                let start = if lo.is_empty() {
                    1
                } else {
                    lo.parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", item))?
                };
                let end = if hi.is_empty() {
                    usize::MAX
                } else {
                    hi.parse::<usize>()
                        .map_err(|_| format!("invalid range: '{}'", item))?
                };
                if start == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                if start > end {
                    return Err(format!("invalid decreasing range: '{}'", item));
                }
                Range { start, end }
            }
            None => {
                let n = item
                    .parse::<usize>()
                    .map_err(|_| format!("invalid field: '{}'", item))?;
                if n == 0 {
                    return Err("fields and positions are numbered from 1".to_string());
                }
                Range { start: n, end: n }
            }
        };
        parsed.push(range);
    }

    if parsed.is_empty() {
        return Err("you must specify a list of bytes, characters, or fields".to_string());
    }

    parsed.sort_by_key(|r| (r.start, r.end));

    // Collapse overlapping or adjacent ranges into a minimal sorted set.
    let mut merged: Vec<Range> = Vec::with_capacity(parsed.len());
    for r in parsed {
        match merged.last_mut() {
            Some(last) if r.start <= last.end.saturating_add(1) => {
                last.end = last.end.max(r.end);
            }
            _ => merged.push(r),
        }
    }

    Ok(merged)
}
96
97/// Check if a 1-based position is in any range.
98/// Ranges must be sorted. Uses early exit since ranges are sorted.
99#[inline(always)]
100fn in_ranges(ranges: &[Range], pos: usize) -> bool {
101    for r in ranges {
102        if pos < r.start {
103            return false;
104        }
105        if pos <= r.end {
106            return true;
107        }
108    }
109    false
110}
111
112/// Pre-compute a 64-bit mask for field selection.
113/// Bit i-1 is set if field i should be output.
114#[inline]
115fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
116    let mut mask: u64 = 0;
117    for i in 1..=64u32 {
118        let in_range = in_ranges(ranges, i as usize);
119        if in_range != complement {
120            mask |= 1u64 << (i - 1);
121        }
122    }
123    mask
124}
125
126/// Check if a field should be selected, using bitset for first 64 fields.
127#[inline(always)]
128fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
129    if field_num <= 64 {
130        (mask >> (field_num - 1)) & 1 == 1
131    } else {
132        in_ranges(ranges, field_num) != complement
133    }
134}
135
136// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
137
/// Append a slice to buf without capacity checks.
/// Caller MUST ensure buf has enough remaining capacity.
///
/// # Safety
/// `buf.capacity() - buf.len()` must be at least `data.len()`, or this writes
/// past the end of the allocation.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    unsafe {
        // SAFETY: per the caller contract the spare capacity covers
        // `data.len()` bytes, and `data` cannot alias `buf`'s spare capacity
        // because `buf` is held by a unique `&mut` reference.
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
148
/// Append a single byte to buf without capacity checks.
/// Caller MUST ensure buf has enough remaining capacity.
///
/// # Safety
/// `buf.capacity()` must exceed `buf.len()`, or this writes past the end of
/// the allocation.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    unsafe {
        // SAFETY: per the caller contract there is at least one spare byte of
        // capacity at offset `len`.
        let len = buf.len();
        *buf.as_mut_ptr().add(len) = b;
        buf.set_len(len + 1);
    }
}
159
/// Write multiple IoSlice buffers using write_vectored (writev syscall).
/// Batches into MAX_IOV-sized groups. Falls back to write_all per slice for partial writes.
///
/// # Errors
/// Propagates any I/O error from the underlying writer.
#[inline]
fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
    if slices.is_empty() {
        return Ok(());
    }
    for batch in slices.chunks(MAX_IOV) {
        // Total bytes this batch should emit; used to detect short writes.
        let total: usize = batch.iter().map(|s| s.len()).sum();
        match out.write_vectored(batch) {
            // Full batch went out in one syscall (n can never exceed total).
            Ok(n) if n >= total => continue,
            Ok(mut written) => {
                // Partial write: fall back to write_all per remaining slice
                for slice in batch {
                    let slen = slice.len();
                    if written >= slen {
                        // This slice was already fully consumed by write_vectored.
                        written -= slen;
                        continue;
                    }
                    if written > 0 {
                        // Slice was partially consumed; emit only its tail,
                        // then everything after it is entirely unwritten.
                        out.write_all(&slice[written..])?;
                        written = 0;
                    } else {
                        out.write_all(slice)?;
                    }
                }
            }
            Err(e) => return Err(e),
        }
    }
    Ok(())
}
192
193// ── Chunk splitting for parallel processing ──────────────────────────────
194
195/// Split data into chunks aligned to line boundaries for parallel processing.
196fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
197    let num_threads = rayon::current_num_threads().max(1);
198    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
199        return vec![data];
200    }
201
202    let chunk_size = data.len() / num_threads;
203    let mut chunks = Vec::with_capacity(num_threads);
204    let mut pos = 0;
205
206    for _ in 0..num_threads - 1 {
207        let target = pos + chunk_size;
208        if target >= data.len() {
209            break;
210        }
211        let boundary = memchr::memchr(line_delim, &data[target..])
212            .map(|p| target + p + 1)
213            .unwrap_or(data.len());
214        if boundary > pos {
215            chunks.push(&data[pos..boundary]);
216        }
217        pos = boundary;
218    }
219
220    if pos < data.len() {
221        chunks.push(&data[pos..]);
222    }
223
224    chunks
225}
226
227// ── Fast path: field extraction with batched output ──────────────────────
228
/// Optimized field extraction with early exit and batched output.
///
/// Dispatcher over several specialized fast paths; the general per-field scan
/// is the fallback. Guard order matters: each later path assumes the earlier
/// ones did not match (e.g. the single-field paths run before the range paths).
///
/// * `data` — full input buffer (one or more lines).
/// * `cfg` — cut configuration; `cfg.ranges` must be sorted and merged.
/// * `out` — destination writer.
fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
    let delim = cfg.delim;
    let line_delim = cfg.line_delim;
    let ranges = cfg.ranges;
    let complement = cfg.complement;
    let output_delim = cfg.output_delim;
    let suppress = cfg.suppress_no_delim;

    // Zero-copy fast path: if delimiter never appears, output = input unchanged.
    // (With -s the whole input is suppressed instead.)
    if !complement && memchr::memchr(delim, data).is_none() {
        if suppress {
            return Ok(());
        }
        out.write_all(data)?;
        // Ensure the output is line-terminated even if the input was not.
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            out.write_all(&[line_delim])?;
        }
        return Ok(());
    }

    // Ultra-fast path: single field extraction (e.g., cut -f5)
    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
    }

    // Fast path: complement of single field with default output delimiter.
    if complement
        && ranges.len() == 1
        && ranges[0].start == ranges[0].end
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_complement_single_field(
            data,
            delim,
            line_delim,
            ranges[0].start,
            suppress,
            out,
        );
    }

    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
    if !complement
        && ranges.len() == 1
        && ranges[0].start == 1
        && output_delim.len() == 1
        && output_delim[0] == delim
        && ranges[0].end < usize::MAX
    {
        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
    }

    // Fast path: open-ended field range from field N (e.g., cut -f3-)
    if !complement
        && ranges.len() == 1
        && ranges[0].end == usize::MAX
        && ranges[0].start > 1
        && output_delim.len() == 1
        && output_delim[0] == delim
    {
        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
    }

    // General field extraction
    // max_field allows early loop exit per line; in complement mode every
    // field past the ranges is still selected, so no cap applies.
    let max_field = if complement {
        usize::MAX
    } else {
        ranges.last().map(|r| r.end).unwrap_or(0)
    };
    let field_mask = compute_field_mask(ranges, complement);

    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_into_chunks(data, line_delim);
        let results: Vec<Vec<u8>> = chunks
            .par_iter()
            .map(|chunk| {
                let mut buf = Vec::with_capacity(chunk.len());
                process_fields_chunk(
                    chunk,
                    delim,
                    ranges,
                    output_delim,
                    suppress,
                    max_field,
                    field_mask,
                    line_delim,
                    complement,
                    &mut buf,
                );
                buf
            })
            .collect();
        // Use write_vectored (writev) to batch N writes into fewer syscalls
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        let mut buf = Vec::with_capacity(data.len());
        process_fields_chunk(
            data,
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            complement,
            &mut buf,
        );
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
350
/// Process a chunk of data for general field extraction.
/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
///
/// Appends results to `buf`; capacity for all unchecked `buf_extend`/`buf_push`
/// calls is guaranteed by the single `buf.reserve(data.len())` up front, since
/// the output never exceeds the input length (plus a final terminator covered
/// by the input's own delimiter bytes).
fn process_fields_chunk(
    data: &[u8],
    delim: u8,
    ranges: &[Range],
    output_delim: &[u8],
    suppress: bool,
    max_field: usize,
    field_mask: u64,
    line_delim: u8,
    complement: bool,
    buf: &mut Vec<u8>,
) {
    // Single-pass path: when delim != line_delim, use memchr2_iter for both bytes
    // in one SIMD scan. This eliminates 10M inner-loop memchr_iter startups for
    // a file with 10M lines.
    if delim != line_delim {
        buf.reserve(data.len());

        // Per-line scanner state, reset at each line terminator.
        let mut line_start: usize = 0;   // offset of the current line's first byte
        let mut field_start: usize = 0;  // offset of the current field's first byte
        let mut field_num: usize = 1;    // 1-based index of the current field
        let mut first_output = true;     // true until the first field of this line is emitted
        let mut has_delim = false;       // whether this line contained any field delimiter

        for pos in memchr::memchr2_iter(delim, line_delim, data) {
            // SAFETY: `pos` comes from the iterator over `data`, so it is in bounds.
            let byte = unsafe { *data.get_unchecked(pos) };

            if byte == line_delim {
                // End of line: flush final field and emit line delimiter
                if (field_num <= max_field || complement)
                    && has_delim
                    && is_selected(field_num, field_mask, ranges, complement)
                {
                    if !first_output {
                        unsafe { buf_extend(buf, output_delim) };
                    }
                    unsafe { buf_extend(buf, &data[field_start..pos]) };
                    first_output = false;
                }

                if !first_output {
                    // At least one field was emitted: terminate the line.
                    unsafe { buf_push(buf, line_delim) };
                } else if !has_delim {
                    // Delimiter-free line: pass through whole, unless -s suppresses it.
                    if !suppress {
                        unsafe {
                            buf_extend(buf, &data[line_start..pos]);
                            buf_push(buf, line_delim);
                        }
                    }
                } else {
                    // Line had delimiters but no field was emitted: empty output line.
                    unsafe { buf_push(buf, line_delim) };
                }

                // Reset state for next line
                line_start = pos + 1;
                field_start = pos + 1;
                field_num = 1;
                first_output = true;
                has_delim = false;
            } else {
                // Field delimiter hit
                has_delim = true;

                if is_selected(field_num, field_mask, ranges, complement) {
                    if !first_output {
                        unsafe { buf_extend(buf, output_delim) };
                    }
                    unsafe { buf_extend(buf, &data[field_start..pos]) };
                    first_output = false;
                }

                field_num += 1;
                field_start = pos + 1;

                // NOTE(review): this branch is intentionally a no-op — once
                // field_num exceeds max_field the selection checks above stop
                // emitting, and the next line_delim hit resets state. There is
                // no skip-ahead here despite the guard.
                if field_num > max_field && !complement {
                    // Skip remaining delimiters on this line; find next line_delim
                    // We'll let the next memchr2 iteration handle the line_delim
                }
            }
        }

        // Handle last line without trailing line_delim
        if line_start < data.len() {
            let line = &data[line_start..];
            if !line.is_empty() {
                // Same end-of-line logic as above, with data.len() as the line end.
                if (field_num <= max_field || complement)
                    && has_delim
                    && is_selected(field_num, field_mask, ranges, complement)
                {
                    if !first_output {
                        unsafe { buf_extend(buf, output_delim) };
                    }
                    unsafe { buf_extend(buf, &data[field_start..data.len()]) };
                    first_output = false;
                }

                if !first_output {
                    unsafe { buf_push(buf, line_delim) };
                } else if !has_delim {
                    if !suppress {
                        unsafe {
                            buf_extend(buf, &data[line_start..data.len()]);
                            buf_push(buf, line_delim);
                        }
                    }
                } else {
                    unsafe { buf_push(buf, line_delim) };
                }
            }
        }

        return;
    }

    // Fallback: when delim == line_delim, use the two-level scan approach
    // (extract_fields_to_buf is defined elsewhere in this file).
    let mut start = 0;
    for end_pos in memchr_iter(line_delim, data) {
        let line = &data[start..end_pos];
        extract_fields_to_buf(
            line,
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            buf,
            complement,
        );
        start = end_pos + 1;
    }
    // Trailing fragment without a line terminator.
    if start < data.len() {
        extract_fields_to_buf(
            &data[start..],
            delim,
            ranges,
            output_delim,
            suppress,
            max_field,
            field_mask,
            line_delim,
            buf,
            complement,
        );
    }
}
502
503// ── Ultra-fast single field extraction ───────────────────────────────────
504
/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
///
/// * `target` — 1-based field number to extract (callers guarantee >= 1).
/// Delegates to helpers defined elsewhere in this file
/// (`process_nth_field_combined`, `single_field1_zerocopy`,
/// `process_single_field_chunk`).
fn process_single_field(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target: usize,
    suppress: bool,
    out: &mut impl Write,
) -> io::Result<()> {
    // Convert to 0-based index used by the chunk helpers.
    let target_idx = target - 1;

    // Combined SIMD scan: single pass using memchr2 for any target field.
    if delim != line_delim {
        if data.len() >= PARALLEL_THRESHOLD {
            let chunks = split_into_chunks(data, line_delim);
            let results: Vec<Vec<u8>> = chunks
                .par_iter()
                .map(|chunk| {
                    let mut buf = Vec::with_capacity(chunk.len());
                    process_nth_field_combined(
                        chunk, delim, line_delim, target_idx, suppress, &mut buf,
                    );
                    buf
                })
                .collect();
            // NOTE(review): this path writes each chunk individually rather
            // than batching via write_ioslices like the sibling dispatchers.
            for result in &results {
                if !result.is_empty() {
                    out.write_all(result)?;
                }
            }
        } else if target_idx == 0 && !suppress {
            // Zero-copy fast path for field 1 (most common case):
            // For each line, either truncate at the first delimiter, or pass through.
            // Since most lines have a delimiter, and field 1 is a prefix of each line,
            // we can write contiguous runs directly from the source data.
            single_field1_zerocopy(data, delim, line_delim, out)?;
        } else {
            let mut buf = Vec::with_capacity(data.len());
            process_nth_field_combined(data, delim, line_delim, target_idx, suppress, &mut buf);
            if !buf.is_empty() {
                out.write_all(&buf)?;
            }
        }
        return Ok(());
    }

    // Fallback for delim == line_delim: nested loop approach
    if data.len() >= PARALLEL_THRESHOLD {
        let chunks = split_into_chunks(data, line_delim);
        let results: Vec<Vec<u8>> = chunks
            .par_iter()
            .map(|chunk| {
                let mut buf = Vec::with_capacity(chunk.len() / 4);
                process_single_field_chunk(
                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
                );
                buf
            })
            .collect();
        // Use write_vectored (writev) to batch N writes into fewer syscalls
        let slices: Vec<IoSlice> = results
            .iter()
            .filter(|r| !r.is_empty())
            .map(|r| IoSlice::new(r))
            .collect();
        write_ioslices(out, &slices)?;
    } else {
        // Single field output is typically much smaller than the input;
        // a quarter of the input is the initial capacity guess.
        let mut buf = Vec::with_capacity(data.len() / 4);
        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
        if !buf.is_empty() {
            out.write_all(&buf)?;
        }
    }
    Ok(())
}
582
583/// Complement single-field extraction: skip one field, output rest unchanged.
584fn process_complement_single_field(
585    data: &[u8],
586    delim: u8,
587    line_delim: u8,
588    skip_field: usize,
589    suppress: bool,
590    out: &mut impl Write,
591) -> io::Result<()> {
592    let skip_idx = skip_field - 1;
593
594    if data.len() >= PARALLEL_THRESHOLD {
595        let chunks = split_into_chunks(data, line_delim);
596        let results: Vec<Vec<u8>> = chunks
597            .par_iter()
598            .map(|chunk| {
599                let mut buf = Vec::with_capacity(chunk.len());
600                complement_single_field_chunk(
601                    chunk, delim, skip_idx, line_delim, suppress, &mut buf,
602                );
603                buf
604            })
605            .collect();
606        // Use write_vectored (writev) to batch N writes into fewer syscalls
607        let slices: Vec<IoSlice> = results
608            .iter()
609            .filter(|r| !r.is_empty())
610            .map(|r| IoSlice::new(r))
611            .collect();
612        write_ioslices(out, &slices)?;
613    } else {
614        let mut buf = Vec::with_capacity(data.len());
615        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
616        if !buf.is_empty() {
617            out.write_all(&buf)?;
618        }
619    }
620    Ok(())
621}
622
623/// Process a chunk for complement single-field extraction.
624fn complement_single_field_chunk(
625    data: &[u8],
626    delim: u8,
627    skip_idx: usize,
628    line_delim: u8,
629    suppress: bool,
630    buf: &mut Vec<u8>,
631) {
632    let mut start = 0;
633    for end_pos in memchr_iter(line_delim, data) {
634        let line = &data[start..end_pos];
635        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
636        start = end_pos + 1;
637    }
638    if start < data.len() {
639        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
640    }
641}
642
/// Extract all fields except skip_idx from one line.
///
/// * `skip_idx` — 0-based index of the field to drop.
/// Output keeps the input delimiter between remaining fields. Lines without
/// any delimiter pass through whole unless `suppress` is set.
#[inline(always)]
fn complement_single_field_line(
    line: &[u8],
    delim: u8,
    skip_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Empty line: has no delimiter, so it is emitted (as a bare terminator)
    // unless -s suppression is active.
    if line.is_empty() {
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    // One reserve covers every unchecked append below: output is at most the
    // whole line plus one line terminator.
    buf.reserve(line.len() + 1);

    let mut field_idx = 0;       // 0-based index of the current field
    let mut field_start = 0;     // start offset of the current field within `line`
    let mut first_output = true; // true until the first kept field is written
    let mut has_delim = false;   // whether the line contained any delimiter

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        if field_idx != skip_idx {
            if !first_output {
                unsafe { buf_push(buf, delim) };
            }
            unsafe { buf_extend(buf, &line[field_start..pos]) };
            first_output = false;
        }
        field_idx += 1;
        field_start = pos + 1;
    }

    // No delimiter at all: whole line passes through unless suppressed.
    if !has_delim {
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // Last field
    if field_idx != skip_idx {
        if !first_output {
            unsafe { buf_push(buf, delim) };
        }
        unsafe { buf_extend(buf, &line[field_start..]) };
    }

    unsafe { buf_push(buf, line_delim) };
}
700
701/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
702/// Zero-copy for the non-parallel path: identifies the truncation point per line
703/// and writes contiguous runs directly from the source data.
704fn process_fields_prefix(
705    data: &[u8],
706    delim: u8,
707    line_delim: u8,
708    last_field: usize,
709    suppress: bool,
710    out: &mut impl Write,
711) -> io::Result<()> {
712    if data.len() >= PARALLEL_THRESHOLD {
713        let chunks = split_into_chunks(data, line_delim);
714        let results: Vec<Vec<u8>> = chunks
715            .par_iter()
716            .map(|chunk| {
717                let mut buf = Vec::with_capacity(chunk.len());
718                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
719                buf
720            })
721            .collect();
722        // Use write_vectored (writev) to batch N writes into fewer syscalls
723        let slices: Vec<IoSlice> = results
724            .iter()
725            .filter(|r| !r.is_empty())
726            .map(|r| IoSlice::new(r))
727            .collect();
728        write_ioslices(out, &slices)?;
729    } else if !suppress {
730        // Zero-copy fast path: scan for truncation points, write runs from source.
731        // When suppress is false, every line is output (with or without delimiter).
732        // Most lines have enough fields, so the output is often identical to input.
733        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
734    } else {
735        let mut buf = Vec::with_capacity(data.len());
736        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
737        if !buf.is_empty() {
738            out.write_all(&buf)?;
739        }
740    }
741    Ok(())
742}
743
744/// Zero-copy field-prefix extraction: writes contiguous runs directly from source data.
745/// For lines where the Nth delimiter exists, we truncate at that point.
746/// For lines with fewer fields, we output them unchanged.
747/// Lines without any delimiter are output unchanged (suppress=false assumed).
748#[inline]
749fn fields_prefix_zerocopy(
750    data: &[u8],
751    delim: u8,
752    line_delim: u8,
753    last_field: usize,
754    out: &mut impl Write,
755) -> io::Result<()> {
756    let mut start = 0;
757    let mut run_start: usize = 0;
758
759    for end_pos in memchr_iter(line_delim, data) {
760        let line = &data[start..end_pos];
761        // Find the position of the Nth delimiter to truncate at
762        let mut field_count = 1;
763        let mut truncate_at: Option<usize> = None;
764        for dpos in memchr_iter(delim, line) {
765            if field_count >= last_field {
766                truncate_at = Some(start + dpos);
767                break;
768            }
769            field_count += 1;
770        }
771
772        if let Some(trunc_pos) = truncate_at {
773            // This line has more fields than needed. Flush run, write truncated.
774            if run_start < start {
775                out.write_all(&data[run_start..start])?;
776            }
777            out.write_all(&data[start..trunc_pos])?;
778            out.write_all(&[line_delim])?;
779            run_start = end_pos + 1;
780        }
781        // else: line has <= last_field fields, keep it in the run
782        start = end_pos + 1;
783    }
784    // Handle last line without terminator
785    if start < data.len() {
786        let line = &data[start..];
787        let mut field_count = 1;
788        let mut truncate_at: Option<usize> = None;
789        for dpos in memchr_iter(delim, line) {
790            if field_count >= last_field {
791                truncate_at = Some(start + dpos);
792                break;
793            }
794            field_count += 1;
795        }
796        if let Some(trunc_pos) = truncate_at {
797            if run_start < start {
798                out.write_all(&data[run_start..start])?;
799            }
800            out.write_all(&data[start..trunc_pos])?;
801            out.write_all(&[line_delim])?;
802            return Ok(());
803        }
804    }
805    // Flush remaining run
806    if run_start < data.len() {
807        out.write_all(&data[run_start..])?;
808        if !data.is_empty() && *data.last().unwrap() != line_delim {
809            out.write_all(&[line_delim])?;
810        }
811    }
812    Ok(())
813}
814
815/// Process a chunk for contiguous from-start field range extraction.
816fn fields_prefix_chunk(
817    data: &[u8],
818    delim: u8,
819    line_delim: u8,
820    last_field: usize,
821    suppress: bool,
822    buf: &mut Vec<u8>,
823) {
824    let mut start = 0;
825    for end_pos in memchr_iter(line_delim, data) {
826        let line = &data[start..end_pos];
827        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
828        start = end_pos + 1;
829    }
830    if start < data.len() {
831        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
832    }
833}
834
835/// Extract first N fields from one line (contiguous from-start range).
836#[inline(always)]
837fn fields_prefix_line(
838    line: &[u8],
839    delim: u8,
840    line_delim: u8,
841    last_field: usize,
842    suppress: bool,
843    buf: &mut Vec<u8>,
844) {
845    if line.is_empty() {
846        if !suppress {
847            buf.push(line_delim);
848        }
849        return;
850    }
851
852    buf.reserve(line.len() + 1);
853
854    let mut field_count = 1;
855    let mut has_delim = false;
856
857    for pos in memchr_iter(delim, line) {
858        has_delim = true;
859        if field_count >= last_field {
860            unsafe {
861                buf_extend(buf, &line[..pos]);
862                buf_push(buf, line_delim);
863            }
864            return;
865        }
866        field_count += 1;
867    }
868
869    if !has_delim {
870        if !suppress {
871            unsafe {
872                buf_extend(buf, line);
873                buf_push(buf, line_delim);
874            }
875        }
876        return;
877    }
878
879    unsafe {
880        buf_extend(buf, line);
881        buf_push(buf, line_delim);
882    }
883}
884
885/// Open-ended field suffix extraction (e.g., `cut -f3-`).
886fn process_fields_suffix(
887    data: &[u8],
888    delim: u8,
889    line_delim: u8,
890    start_field: usize,
891    suppress: bool,
892    out: &mut impl Write,
893) -> io::Result<()> {
894    if data.len() >= PARALLEL_THRESHOLD {
895        let chunks = split_into_chunks(data, line_delim);
896        let results: Vec<Vec<u8>> = chunks
897            .par_iter()
898            .map(|chunk| {
899                let mut buf = Vec::with_capacity(chunk.len());
900                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
901                buf
902            })
903            .collect();
904        // Use write_vectored (writev) to batch N writes into fewer syscalls
905        let slices: Vec<IoSlice> = results
906            .iter()
907            .filter(|r| !r.is_empty())
908            .map(|r| IoSlice::new(r))
909            .collect();
910        write_ioslices(out, &slices)?;
911    } else {
912        let mut buf = Vec::with_capacity(data.len());
913        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
914        if !buf.is_empty() {
915            out.write_all(&buf)?;
916        }
917    }
918    Ok(())
919}
920
921/// Process a chunk for open-ended field suffix extraction.
922fn fields_suffix_chunk(
923    data: &[u8],
924    delim: u8,
925    line_delim: u8,
926    start_field: usize,
927    suppress: bool,
928    buf: &mut Vec<u8>,
929) {
930    let mut start = 0;
931    for end_pos in memchr_iter(line_delim, data) {
932        let line = &data[start..end_pos];
933        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
934        start = end_pos + 1;
935    }
936    if start < data.len() {
937        fields_suffix_line(
938            &data[start..],
939            delim,
940            line_delim,
941            start_field,
942            suppress,
943            buf,
944        );
945    }
946}
947
948/// Extract fields from start_field to end from one line.
949#[inline(always)]
950fn fields_suffix_line(
951    line: &[u8],
952    delim: u8,
953    line_delim: u8,
954    start_field: usize,
955    suppress: bool,
956    buf: &mut Vec<u8>,
957) {
958    if line.is_empty() {
959        if !suppress {
960            buf.push(line_delim);
961        }
962        return;
963    }
964
965    buf.reserve(line.len() + 1);
966
967    let skip_delims = start_field - 1;
968    let mut delim_count = 0;
969    let mut has_delim = false;
970
971    for pos in memchr_iter(delim, line) {
972        has_delim = true;
973        delim_count += 1;
974        if delim_count >= skip_delims {
975            unsafe {
976                buf_extend(buf, &line[pos + 1..]);
977                buf_push(buf, line_delim);
978            }
979            return;
980        }
981    }
982
983    if !has_delim {
984        if !suppress {
985            unsafe {
986                buf_extend(buf, line);
987                buf_push(buf, line_delim);
988            }
989        }
990        return;
991    }
992
993    // Fewer delimiters than needed
994    unsafe { buf_push(buf, line_delim) };
995}
996
/// Combined SIMD scan for arbitrary single field extraction.
/// Uses memchr2_iter(delim, line_delim) to scan for both bytes in a single SIMD pass.
/// This is faster than the nested approach (outer: find newlines, inner: find delimiters)
/// because it eliminates one full SIMD scan and improves cache locality.
///
/// `target_idx` is 0-based. Per line, the output is one of:
/// - the target field followed by `line_delim`, when the field exists;
/// - a bare `line_delim` (empty line), when the line has delimiters but
///   fewer than `target_idx + 1` fields;
/// - the whole line, when it contains no delimiter and `suppress` is false;
/// - nothing, when it contains no delimiter and `suppress` is true.
fn process_nth_field_combined(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    target_idx: usize,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    // Output never exceeds the input size, so one reserve covers everything.
    buf.reserve(data.len());

    let mut line_start: usize = 0; // offset of the current line's first byte
    let mut field_start: usize = 0; // offset of the current field's first byte
    let mut field_idx: usize = 0; // 0-based index of the field being scanned
    let mut has_delim = false; // current line contained at least one delimiter
    let mut emitted = false; // target field already written for this line

    for pos in memchr::memchr2_iter(delim, line_delim, data) {
        // SAFETY: `pos` is produced by the iterator over `data`, so it is in bounds.
        let byte = unsafe { *data.get_unchecked(pos) };

        if byte == line_delim {
            // End of line
            if !emitted {
                if has_delim && field_idx == target_idx {
                    // Last field matches target
                    unsafe {
                        buf_extend(buf, &data[field_start..pos]);
                        buf_push(buf, line_delim);
                    }
                } else if has_delim {
                    // Target field doesn't exist (fewer fields)
                    unsafe {
                        buf_push(buf, line_delim);
                    }
                } else if !suppress {
                    // No delimiter in line — output unchanged
                    unsafe {
                        buf_extend(buf, &data[line_start..pos]);
                        buf_push(buf, line_delim);
                    }
                }
            }
            // Reset for next line
            line_start = pos + 1;
            field_start = pos + 1;
            field_idx = 0;
            has_delim = false;
            emitted = false;
        } else {
            // Delimiter found
            has_delim = true;
            if field_idx == target_idx {
                unsafe {
                    buf_extend(buf, &data[field_start..pos]);
                    buf_push(buf, line_delim);
                }
                emitted = true;
            }
            field_idx += 1;
            field_start = pos + 1;
        }
    }

    // Handle last line without trailing newline
    if line_start < data.len() && !emitted {
        if has_delim && field_idx == target_idx {
            unsafe {
                buf_extend(buf, &data[field_start..data.len()]);
                buf_push(buf, line_delim);
            }
        } else if has_delim {
            unsafe {
                buf_push(buf, line_delim);
            }
        } else if !suppress {
            unsafe {
                buf_extend(buf, &data[line_start..data.len()]);
                buf_push(buf, line_delim);
            }
        }
    }
}
1082
/// Zero-copy field-1 extraction: writes contiguous runs directly from source data.
/// For each line: if delimiter exists, truncate at first delimiter; otherwise pass through.
/// Uses memchr2 to scan for both delimiter and line terminator in a single SIMD pass.
///
/// A "run" is a maximal span of consecutive source bytes that appears in the
/// output unchanged (lines without a delimiter); only a truncated line forces
/// a flush. NOTE(review): there is no `suppress` parameter — delimiter-free
/// lines are always passed through, so this path is presumably only taken
/// when `-s` is not in effect; confirm at the call site.
#[inline]
fn single_field1_zerocopy(
    data: &[u8],
    delim: u8,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    let mut line_start: usize = 0; // offset of the current line's first byte
    let mut run_start: usize = 0; // start of the pending pass-through run
    let mut first_delim: Option<usize> = None; // first delimiter of current line

    for pos in memchr::memchr2_iter(delim, line_delim, data) {
        // SAFETY: `pos` is produced by the iterator over `data`, so it is in bounds.
        let byte = unsafe { *data.get_unchecked(pos) };

        if byte == line_delim {
            // End of line
            if let Some(dp) = first_delim {
                // Line has delimiter — truncate at first delimiter.
                // Flush current run up to line_start, write truncated line.
                if run_start < line_start {
                    out.write_all(&data[run_start..line_start])?;
                }
                out.write_all(&data[line_start..dp])?;
                out.write_all(&[line_delim])?;
                run_start = pos + 1;
            }
            // else: no delimiter in line, output unchanged (stays in run)
            line_start = pos + 1;
            first_delim = None;
        } else {
            // Delimiter found
            if first_delim.is_none() {
                first_delim = Some(pos);
            }
        }
    }

    // Handle last line (no trailing line_delim)
    if line_start < data.len() {
        if let Some(dp) = first_delim {
            if run_start < line_start {
                out.write_all(&data[run_start..line_start])?;
            }
            out.write_all(&data[line_start..dp])?;
            out.write_all(&[line_delim])?;
            // Everything before this line was flushed above; nothing remains.
            return Ok(());
        }
    }

    // Flush remaining run
    if run_start < data.len() {
        out.write_all(&data[run_start..])?;
        // Terminate the final line if the input didn't end with line_delim.
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            out.write_all(&[line_delim])?;
        }
    }
    Ok(())
}
1144
1145/// Process a chunk of data for single-field extraction.
1146fn process_single_field_chunk(
1147    data: &[u8],
1148    delim: u8,
1149    target_idx: usize,
1150    line_delim: u8,
1151    suppress: bool,
1152    buf: &mut Vec<u8>,
1153) {
1154    let mut start = 0;
1155    for end_pos in memchr_iter(line_delim, data) {
1156        let line = &data[start..end_pos];
1157        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
1158        start = end_pos + 1;
1159    }
1160    if start < data.len() {
1161        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
1162    }
1163}
1164
/// Extract a single field from one line.
/// Uses unsafe buf helpers — caller must ensure buf has capacity reserved.
///
/// `target_idx` is 0-based. Output per line: the target field plus
/// `line_delim` when it exists; a bare `line_delim` when the line has
/// delimiters but fewer than `target_idx + 1` fields; the whole line when it
/// has no delimiter and `suppress` is false; nothing otherwise.
#[inline(always)]
fn extract_single_field_line(
    line: &[u8],
    delim: u8,
    target_idx: usize,
    line_delim: u8,
    suppress: bool,
    buf: &mut Vec<u8>,
) {
    if line.is_empty() {
        // Empty line counts as delimiter-free, so honor `suppress`.
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    // Ensure capacity for worst case (full line + newline)
    buf.reserve(line.len() + 1);

    // Ultra-fast path for first field: single memchr
    if target_idx == 0 {
        match memchr::memchr(delim, line) {
            Some(pos) => unsafe {
                buf_extend(buf, &line[..pos]);
                buf_push(buf, line_delim);
            },
            None => {
                // Delimiter-free line: pass through unless suppressed (-s).
                if !suppress {
                    unsafe {
                        buf_extend(buf, line);
                        buf_push(buf, line_delim);
                    }
                }
            }
        }
        return;
    }

    let mut field_start = 0;
    let mut field_idx = 0;
    let mut has_delim = false;

    for pos in memchr_iter(delim, line) {
        has_delim = true;
        if field_idx == target_idx {
            // Target field ends at this delimiter.
            unsafe {
                buf_extend(buf, &line[field_start..pos]);
                buf_push(buf, line_delim);
            }
            return;
        }
        field_idx += 1;
        field_start = pos + 1;
    }

    if !has_delim {
        // Delimiter-free line: pass through unless suppressed (-s).
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
        return;
    }

    // After the loop, field_idx equals the number of delimiters, which is
    // exactly the index of the final field.
    if field_idx == target_idx {
        unsafe {
            buf_extend(buf, &line[field_start..]);
            buf_push(buf, line_delim);
        }
    } else {
        // Line has fewer fields than requested: emit an empty line.
        unsafe { buf_push(buf, line_delim) };
    }
}
1241
/// Extract fields from a single line into the output buffer.
/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
///
/// Field numbering is 1-based. Selection is delegated to `is_selected`
/// (sibling helper) via `field_mask`/`ranges`/`complement`; `field_mask: u64`
/// is presumably a bitmask fast path for low-numbered fields — confirm
/// against `is_selected`. `max_field` lets the delimiter loop stop early once
/// no higher field can be selected; under `complement` the trailing field may
/// still be selected, hence the `|| complement` below.
#[inline(always)]
fn extract_fields_to_buf(
    line: &[u8],
    delim: u8,
    ranges: &[Range],
    output_delim: &[u8],
    suppress: bool,
    max_field: usize,
    field_mask: u64,
    line_delim: u8,
    buf: &mut Vec<u8>,
    complement: bool,
) {
    let len = line.len();

    if len == 0 {
        // Empty line: no delimiter, so it is echoed only when not suppressing.
        if !suppress {
            buf.push(line_delim);
        }
        return;
    }

    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
    let needed = len + output_delim.len() * 16 + 1;
    if buf.capacity() - buf.len() < needed {
        buf.reserve(needed);
    }

    let mut field_num: usize = 1; // 1-based field counter
    let mut field_start: usize = 0; // offset of the current field's first byte
    let mut first_output = true; // no field emitted yet (gates output_delim)
    let mut has_delim = false;

    for delim_pos in memchr_iter(delim, line) {
        has_delim = true;

        if is_selected(field_num, field_mask, ranges, complement) {
            if !first_output {
                unsafe { buf_extend(buf, output_delim) };
            }
            unsafe { buf_extend(buf, &line[field_start..delim_pos]) };
            first_output = false;
        }

        field_num += 1;
        field_start = delim_pos + 1;

        if field_num > max_field {
            // No higher-numbered field can be selected (non-complement case).
            break;
        }
    }

    // Last field
    if (field_num <= max_field || complement)
        && has_delim
        && is_selected(field_num, field_mask, ranges, complement)
    {
        if !first_output {
            unsafe { buf_extend(buf, output_delim) };
        }
        unsafe { buf_extend(buf, &line[field_start..len]) };
        first_output = false;
    }

    if !first_output {
        // At least one field was written: terminate the output line.
        unsafe { buf_push(buf, line_delim) };
    } else if !has_delim {
        // Delimiter-free line: pass through unless suppressed (-s).
        if !suppress {
            unsafe {
                buf_extend(buf, line);
                buf_push(buf, line_delim);
            }
        }
    } else {
        // Line had delimiters but no field was selected: emit an empty line.
        unsafe { buf_push(buf, line_delim) };
    }
}
1322
1323// ── Fast path: byte/char extraction with batched output ──────────────────
1324
1325/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
1326/// Zero-copy: writes directly from the source data using output runs.
1327/// For lines shorter than max_bytes, the output is identical to the input,
1328/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
1329fn process_bytes_from_start(
1330    data: &[u8],
1331    max_bytes: usize,
1332    line_delim: u8,
1333    out: &mut impl Write,
1334) -> io::Result<()> {
1335    if data.len() >= PARALLEL_THRESHOLD {
1336        let chunks = split_into_chunks(data, line_delim);
1337        let results: Vec<Vec<u8>> = chunks
1338            .par_iter()
1339            .map(|chunk| {
1340                let mut buf = Vec::with_capacity(chunk.len());
1341                bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
1342                buf
1343            })
1344            .collect();
1345        // Use write_vectored (writev) to batch N writes into fewer syscalls
1346        let slices: Vec<IoSlice> = results
1347            .iter()
1348            .filter(|r| !r.is_empty())
1349            .map(|r| IoSlice::new(r))
1350            .collect();
1351        write_ioslices(out, &slices)?;
1352    } else {
1353        // Zero-copy path: track contiguous output runs and write directly from source.
1354        // For lines <= max_bytes, we include them as-is (no copy needed).
1355        // For lines > max_bytes, we flush the run, write the truncated line, start new run.
1356        bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
1357    }
1358    Ok(())
1359}
1360
/// Zero-copy byte-prefix extraction: writes contiguous runs directly from the source data.
/// Only copies when a line needs truncation (line > max_bytes).
///
/// A "run" is a maximal span of source bytes forwarded verbatim (all the
/// consecutive lines that fit within `max_bytes`, terminators included);
/// a flush happens only when a long line must be cut short.
#[inline]
fn bytes_from_start_zerocopy(
    data: &[u8],
    max_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    let mut start = 0; // offset of the current line's first byte
    let mut run_start: usize = 0; // start of the pending pass-through run

    for pos in memchr_iter(line_delim, data) {
        let line_len = pos - start;
        if line_len > max_bytes {
            // This line needs truncation. Flush current run, write truncated line.
            if run_start < start {
                out.write_all(&data[run_start..start])?;
            }
            out.write_all(&data[start..start + max_bytes])?;
            out.write_all(&[line_delim])?;
            run_start = pos + 1;
        }
        // else: line fits, keep it in the current contiguous run
        start = pos + 1;
    }
    // Handle last line without terminator
    if start < data.len() {
        let line_len = data.len() - start;
        if line_len > max_bytes {
            if run_start < start {
                out.write_all(&data[run_start..start])?;
            }
            out.write_all(&data[start..start + max_bytes])?;
            out.write_all(&[line_delim])?;
            // Everything before this line was flushed with the run above.
            return Ok(());
        }
    }
    // Flush remaining run (includes all short lines + the last line)
    if run_start < data.len() {
        out.write_all(&data[run_start..])?;
        // Add terminator if last byte isn't one
        if !data.is_empty() && *data.last().unwrap() != line_delim {
            out.write_all(&[line_delim])?;
        }
    }
    Ok(())
}
1409
1410/// Process a chunk for from-start byte range extraction (parallel path).
1411/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1412#[inline]
1413fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1414    // Reserve enough capacity: output <= input size
1415    buf.reserve(data.len());
1416
1417    let mut start = 0;
1418    for pos in memchr_iter(line_delim, data) {
1419        let line_len = pos - start;
1420        let take = line_len.min(max_bytes);
1421        unsafe {
1422            buf_extend(buf, &data[start..start + take]);
1423            buf_push(buf, line_delim);
1424        }
1425        start = pos + 1;
1426    }
1427    // Handle last line without terminator
1428    if start < data.len() {
1429        let line_len = data.len() - start;
1430        let take = line_len.min(max_bytes);
1431        unsafe {
1432            buf_extend(buf, &data[start..start + take]);
1433            buf_push(buf, line_delim);
1434        }
1435    }
1436}
1437
1438/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
1439fn process_bytes_from_offset(
1440    data: &[u8],
1441    skip_bytes: usize,
1442    line_delim: u8,
1443    out: &mut impl Write,
1444) -> io::Result<()> {
1445    if data.len() >= PARALLEL_THRESHOLD {
1446        let chunks = split_into_chunks(data, line_delim);
1447        let results: Vec<Vec<u8>> = chunks
1448            .par_iter()
1449            .map(|chunk| {
1450                let mut buf = Vec::with_capacity(chunk.len());
1451                bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
1452                buf
1453            })
1454            .collect();
1455        // Use write_vectored (writev) to batch N writes into fewer syscalls
1456        let slices: Vec<IoSlice> = results
1457            .iter()
1458            .filter(|r| !r.is_empty())
1459            .map(|r| IoSlice::new(r))
1460            .collect();
1461        write_ioslices(out, &slices)?;
1462    } else {
1463        // Zero-copy: write suffix of each line directly from source
1464        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
1465    }
1466    Ok(())
1467}
1468
/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
/// reducing syscall overhead from 2 write_all calls per line to batched writev.
///
/// Lines shorter than `skip_bytes` contribute only a terminator (an empty
/// output line), matching the buffered chunk path; the final unterminated
/// line likewise always gets a trailing terminator.
#[inline]
fn bytes_from_offset_zerocopy(
    data: &[u8],
    skip_bytes: usize,
    line_delim: u8,
    out: &mut impl Write,
) -> io::Result<()> {
    // One shared 1-byte slice serves as the terminator entry for every line.
    let delim_buf = [line_delim];
    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);

    let mut start = 0;
    for pos in memchr_iter(line_delim, data) {
        let line_len = pos - start;
        if line_len > skip_bytes {
            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
        }
        iov.push(IoSlice::new(&delim_buf));
        // Flush when approaching MAX_IOV to avoid oversized writev
        // (at most two slices are appended per iteration, so the list can
        // reach MAX_IOV but never exceed it before this check fires).
        if iov.len() >= MAX_IOV - 1 {
            write_ioslices(out, &iov)?;
            iov.clear();
        }
        start = pos + 1;
    }
    if start < data.len() {
        let line_len = data.len() - start;
        if line_len > skip_bytes {
            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
        }
        iov.push(IoSlice::new(&delim_buf));
    }
    if !iov.is_empty() {
        write_ioslices(out, &iov)?;
    }
    Ok(())
}
1508
1509/// Process a chunk for from-offset byte range extraction.
1510/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1511#[inline]
1512fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1513    buf.reserve(data.len());
1514
1515    let mut start = 0;
1516    for pos in memchr_iter(line_delim, data) {
1517        let line_len = pos - start;
1518        if line_len > skip_bytes {
1519            unsafe {
1520                buf_extend(buf, &data[start + skip_bytes..pos]);
1521            }
1522        }
1523        unsafe {
1524            buf_push(buf, line_delim);
1525        }
1526        start = pos + 1;
1527    }
1528    if start < data.len() {
1529        let line_len = data.len() - start;
1530        if line_len > skip_bytes {
1531            unsafe {
1532                buf_extend(buf, &data[start + skip_bytes..data.len()]);
1533            }
1534        }
1535        unsafe {
1536            buf_push(buf, line_delim);
1537        }
1538    }
1539}
1540
1541/// Optimized byte/char extraction with batched output and parallel processing.
1542fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
1543    let line_delim = cfg.line_delim;
1544    let ranges = cfg.ranges;
1545    let complement = cfg.complement;
1546    let output_delim = cfg.output_delim;
1547
1548    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
1549    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
1550        let max_bytes = ranges[0].end;
1551        if max_bytes < usize::MAX {
1552            return process_bytes_from_start(data, max_bytes, line_delim, out);
1553        }
1554    }
1555
1556    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
1557    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
1558        let skip_bytes = ranges[0].start.saturating_sub(1);
1559        if skip_bytes > 0 {
1560            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
1561        }
1562    }
1563
1564    if data.len() >= PARALLEL_THRESHOLD {
1565        let chunks = split_into_chunks(data, line_delim);
1566        let results: Vec<Vec<u8>> = chunks
1567            .par_iter()
1568            .map(|chunk| {
1569                let mut buf = Vec::with_capacity(chunk.len());
1570                process_bytes_chunk(
1571                    chunk,
1572                    ranges,
1573                    complement,
1574                    output_delim,
1575                    line_delim,
1576                    &mut buf,
1577                );
1578                buf
1579            })
1580            .collect();
1581        // Use write_vectored (writev) to batch N writes into fewer syscalls
1582        let slices: Vec<IoSlice> = results
1583            .iter()
1584            .filter(|r| !r.is_empty())
1585            .map(|r| IoSlice::new(r))
1586            .collect();
1587        write_ioslices(out, &slices)?;
1588    } else {
1589        let mut buf = Vec::with_capacity(data.len());
1590        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
1591        if !buf.is_empty() {
1592            out.write_all(&buf)?;
1593        }
1594    }
1595    Ok(())
1596}
1597
1598/// Process a chunk of data for byte/char extraction.
1599fn process_bytes_chunk(
1600    data: &[u8],
1601    ranges: &[Range],
1602    complement: bool,
1603    output_delim: &[u8],
1604    line_delim: u8,
1605    buf: &mut Vec<u8>,
1606) {
1607    let mut start = 0;
1608    for end_pos in memchr_iter(line_delim, data) {
1609        let line = &data[start..end_pos];
1610        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
1611        buf.push(line_delim);
1612        start = end_pos + 1;
1613    }
1614    if start < data.len() {
1615        cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
1616        buf.push(line_delim);
1617    }
1618}
1619
1620/// Extract byte ranges from a line into the output buffer.
1621/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
1622#[inline(always)]
1623fn cut_bytes_to_buf(
1624    line: &[u8],
1625    ranges: &[Range],
1626    complement: bool,
1627    output_delim: &[u8],
1628    buf: &mut Vec<u8>,
1629) {
1630    let len = line.len();
1631    let mut first_range = true;
1632
1633    // Reserve worst case: full line + delimiters between ranges
1634    buf.reserve(len + output_delim.len() * ranges.len() + 1);
1635
1636    if complement {
1637        let mut pos: usize = 1;
1638        for r in ranges {
1639            let rs = r.start;
1640            let re = r.end.min(len);
1641            if pos < rs {
1642                if !first_range && !output_delim.is_empty() {
1643                    unsafe { buf_extend(buf, output_delim) };
1644                }
1645                unsafe { buf_extend(buf, &line[pos - 1..rs - 1]) };
1646                first_range = false;
1647            }
1648            pos = re + 1;
1649            if pos > len {
1650                break;
1651            }
1652        }
1653        if pos <= len {
1654            if !first_range && !output_delim.is_empty() {
1655                unsafe { buf_extend(buf, output_delim) };
1656            }
1657            unsafe { buf_extend(buf, &line[pos - 1..len]) };
1658        }
1659    } else if output_delim.is_empty() && ranges.len() == 1 {
1660        // Ultra-fast path: single range, no output delimiter
1661        let start = ranges[0].start.saturating_sub(1);
1662        let end = ranges[0].end.min(len);
1663        if start < len {
1664            unsafe { buf_extend(buf, &line[start..end]) };
1665        }
1666    } else {
1667        for r in ranges {
1668            let start = r.start.saturating_sub(1);
1669            let end = r.end.min(len);
1670            if start >= len {
1671                break;
1672            }
1673            if !first_range && !output_delim.is_empty() {
1674                unsafe { buf_extend(buf, output_delim) };
1675            }
1676            unsafe { buf_extend(buf, &line[start..end]) };
1677            first_range = false;
1678        }
1679    }
1680}
1681
1682// ── Public API ───────────────────────────────────────────────────────────
1683
1684/// Cut fields from a line using a delimiter. Writes to `out`.
1685#[inline]
1686pub fn cut_fields(
1687    line: &[u8],
1688    delim: u8,
1689    ranges: &[Range],
1690    complement: bool,
1691    output_delim: &[u8],
1692    suppress_no_delim: bool,
1693    out: &mut impl Write,
1694) -> io::Result<bool> {
1695    if memchr::memchr(delim, line).is_none() {
1696        if !suppress_no_delim {
1697            out.write_all(line)?;
1698            return Ok(true);
1699        }
1700        return Ok(false);
1701    }
1702
1703    let mut field_num: usize = 1;
1704    let mut field_start: usize = 0;
1705    let mut first_output = true;
1706
1707    for delim_pos in memchr_iter(delim, line) {
1708        let selected = in_ranges(ranges, field_num) != complement;
1709        if selected {
1710            if !first_output {
1711                out.write_all(output_delim)?;
1712            }
1713            out.write_all(&line[field_start..delim_pos])?;
1714            first_output = false;
1715        }
1716        field_start = delim_pos + 1;
1717        field_num += 1;
1718    }
1719
1720    let selected = in_ranges(ranges, field_num) != complement;
1721    if selected {
1722        if !first_output {
1723            out.write_all(output_delim)?;
1724        }
1725        out.write_all(&line[field_start..])?;
1726    }
1727
1728    Ok(true)
1729}
1730
1731/// Cut bytes/chars from a line. Writes selected bytes to `out`.
1732#[inline]
1733pub fn cut_bytes(
1734    line: &[u8],
1735    ranges: &[Range],
1736    complement: bool,
1737    output_delim: &[u8],
1738    out: &mut impl Write,
1739) -> io::Result<bool> {
1740    let mut first_range = true;
1741
1742    if complement {
1743        let len = line.len();
1744        let mut comp_ranges = Vec::new();
1745        let mut pos: usize = 1;
1746        for r in ranges {
1747            let rs = r.start;
1748            let re = r.end.min(len);
1749            if pos < rs {
1750                comp_ranges.push((pos, rs - 1));
1751            }
1752            pos = re + 1;
1753            if pos > len {
1754                break;
1755            }
1756        }
1757        if pos <= len {
1758            comp_ranges.push((pos, len));
1759        }
1760        for &(s, e) in &comp_ranges {
1761            if !first_range && !output_delim.is_empty() {
1762                out.write_all(output_delim)?;
1763            }
1764            out.write_all(&line[s - 1..e])?;
1765            first_range = false;
1766        }
1767    } else {
1768        for r in ranges {
1769            let start = r.start.saturating_sub(1);
1770            let end = r.end.min(line.len());
1771            if start >= line.len() {
1772                break;
1773            }
1774            if !first_range && !output_delim.is_empty() {
1775                out.write_all(output_delim)?;
1776            }
1777            out.write_all(&line[start..end])?;
1778            first_range = false;
1779        }
1780    }
1781    Ok(true)
1782}
1783
1784/// Process a full data buffer (from mmap or read) with cut operation.
1785pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
1786    match cfg.mode {
1787        CutMode::Fields => process_fields_fast(data, cfg, out),
1788        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
1789    }
1790}
1791
1792/// Process input from a reader (for stdin).
1793/// Uses batch reading: reads large chunks (4MB), then processes them in batch
1794/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
1795pub fn process_cut_reader<R: BufRead>(
1796    mut reader: R,
1797    cfg: &CutConfig,
1798    out: &mut impl Write,
1799) -> io::Result<()> {
1800    const CHUNK_SIZE: usize = 4 * 1024 * 1024; // 4MB read chunks
1801    let line_delim = cfg.line_delim;
1802
1803    // Read large chunks and process in batch.
1804    // We keep a buffer; after processing complete lines, we shift leftover to the front.
1805    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
1806
1807    loop {
1808        // Read up to CHUNK_SIZE bytes
1809        buf.reserve(CHUNK_SIZE);
1810        let read_start = buf.len();
1811        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
1812        let n = read_fully(&mut reader, &mut buf[read_start..])?;
1813        buf.truncate(read_start + n);
1814
1815        if buf.is_empty() {
1816            break;
1817        }
1818
1819        if n == 0 {
1820            // EOF with leftover data (last line without terminator)
1821            process_cut_data(&buf, cfg, out)?;
1822            break;
1823        }
1824
1825        // Find the last line delimiter in the buffer so we process complete lines
1826        let process_end = match memchr::memrchr(line_delim, &buf) {
1827            Some(pos) => pos + 1,
1828            None => {
1829                // No line delimiter found — keep accumulating
1830                continue;
1831            }
1832        };
1833
1834        // Process the complete lines using the fast batch path
1835        process_cut_data(&buf[..process_end], cfg, out)?;
1836
1837        // Shift leftover to the front for next iteration
1838        let leftover_len = buf.len() - process_end;
1839        if leftover_len > 0 {
1840            buf.copy_within(process_end.., 0);
1841        }
1842        buf.truncate(leftover_len);
1843    }
1844
1845    Ok(())
1846}
1847
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Returns the total number of bytes read; a result shorter than `buf.len()`
/// means EOF was reached. `ErrorKind::Interrupted` is always retried — the
/// previous fast-path first read let an interrupt escape as an error while
/// the retry loop swallowed it, an inconsistency this unified loop removes.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break, // EOF
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
1867
/// Cut operation mode selected on the command line (`-b`, `-c`, or `-f`).
///
/// `Eq` is derived alongside `PartialEq` since equality over these fieldless
/// variants is total (clippy: `derive_partial_eq_without_eq`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// `-b`: select byte positions.
    Bytes,
    /// `-c`: select character positions (dispatched to the byte path by
    /// `process_cut_data`).
    Characters,
    /// `-f`: select delimiter-separated fields.
    Fields,
}