// coreutils_rs/cut/core.rs

1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Minimum input size, in bytes, for parallel processing (8 MiB).
/// Inputs of at least this many bytes (`data.len() >= PARALLEL_THRESHOLD`) are
/// split into per-thread chunks and processed with rayon; smaller inputs are
/// handled on the calling thread.
/// 8MB balances the split_for_scope scan overhead against parallel benefits.
const PARALLEL_THRESHOLD: usize = 8 * 1024 * 1024;
8
/// Max iovec entries per writev call (Linux default).
/// `write_ioslices` passes at most this many `IoSlice`s to a single
/// `write_vectored` call.
const MAX_IOV: usize = 1024;
11
/// Configuration for cut operations.
///
/// The range list and output delimiter are borrowed for `'a`; all other
/// settings are plain values.
pub struct CutConfig<'a> {
    /// Selection mode. NOTE(review): `CutMode` is declared outside this
    /// section; presumably bytes/characters/fields — confirm.
    pub mode: CutMode,
    /// Selected ranges. The field code reads `ranges.last()` as the highest
    /// range, i.e. it expects the sorted order `parse_ranges` returns.
    pub ranges: &'a [Range],
    /// When true, select everything NOT covered by `ranges`.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between selected fields on output.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no `delim` are dropped instead of being
    /// echoed unchanged.
    pub suppress_no_delim: bool,
    /// Byte that terminates each input line and is appended to each output line.
    pub line_delim: u8,
}
22
/// A range specification like 1, 3-5, -3, 4-
///
/// Both bounds are inclusive and 1-based.
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based; `parse_ranges` stores 1 for an omitted start ("-3") and rejects 0
    pub end: usize,   // 1-based, usize::MAX means "to end"
}
29
30/// Parse a LIST specification like "1,3-5,7-" into ranges.
31/// Each range is 1-based. Returns sorted, merged ranges.
32/// When `no_merge_adjacent` is true, overlapping ranges are still merged but
33/// adjacent ranges (e.g., 1-2,3-4) are kept separate. This is needed when
34/// `--output-delimiter` is specified for byte/char mode so the delimiter is
35/// inserted between originally separate but adjacent ranges.
36pub fn parse_ranges(spec: &str, no_merge_adjacent: bool) -> Result<Vec<Range>, String> {
37    let mut ranges = Vec::new();
38
39    for part in spec.split(',') {
40        let part = part.trim();
41        if part.is_empty() {
42            continue;
43        }
44
45        if let Some(idx) = part.find('-') {
46            let left = &part[..idx];
47            let right = &part[idx + 1..];
48
49            // Reject bare "-" (both sides empty)
50            if left.is_empty() && right.is_empty() {
51                return Err("invalid range with no endpoint: -".to_string());
52            }
53
54            let start = if left.is_empty() {
55                1
56            } else {
57                left.parse::<usize>()
58                    .map_err(|_| format!("invalid range: '{}'", part))?
59            };
60
61            let end = if right.is_empty() {
62                usize::MAX
63            } else {
64                right
65                    .parse::<usize>()
66                    .map_err(|_| format!("invalid range: '{}'", part))?
67            };
68
69            if start == 0 {
70                return Err("fields and positions are numbered from 1".to_string());
71            }
72            if start > end {
73                return Err(format!("invalid decreasing range: '{}'", part));
74            }
75
76            ranges.push(Range { start, end });
77        } else {
78            let n = part
79                .parse::<usize>()
80                .map_err(|_| format!("invalid field: '{}'", part))?;
81            if n == 0 {
82                return Err("fields and positions are numbered from 1".to_string());
83            }
84            ranges.push(Range { start: n, end: n });
85        }
86    }
87
88    if ranges.is_empty() {
89        return Err("you must specify a list of bytes, characters, or fields".to_string());
90    }
91
92    // Sort and merge overlapping/adjacent ranges
93    ranges.sort_by_key(|r| (r.start, r.end));
94    let mut merged = vec![ranges[0].clone()];
95    for r in &ranges[1..] {
96        let last = merged.last_mut().unwrap();
97        if no_merge_adjacent {
98            // Only merge truly overlapping ranges, not adjacent ones
99            if r.start <= last.end {
100                last.end = last.end.max(r.end);
101            } else {
102                merged.push(r.clone());
103            }
104        } else {
105            // Merge both overlapping and adjacent ranges
106            if r.start <= last.end.saturating_add(1) {
107                last.end = last.end.max(r.end);
108            } else {
109                merged.push(r.clone());
110            }
111        }
112    }
113
114    Ok(merged)
115}
116
117/// Check if a 1-based position is in any range.
118/// Ranges must be sorted. Uses early exit since ranges are sorted.
119#[inline(always)]
120fn in_ranges(ranges: &[Range], pos: usize) -> bool {
121    for r in ranges {
122        if pos < r.start {
123            return false;
124        }
125        if pos <= r.end {
126            return true;
127        }
128    }
129    false
130}
131
132/// Pre-compute a 64-bit mask for field selection.
133/// Bit i-1 is set if field i should be output.
134#[inline]
135fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
136    let mut mask: u64 = 0;
137    for i in 1..=64u32 {
138        let in_range = in_ranges(ranges, i as usize);
139        if in_range != complement {
140            mask |= 1u64 << (i - 1);
141        }
142    }
143    mask
144}
145
146/// Check if a field should be selected, using bitset for first 64 fields.
147#[inline(always)]
148fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
149    if field_num <= 64 {
150        (mask >> (field_num - 1)) & 1 == 1
151    } else {
152        in_ranges(ranges, field_num) != complement
153    }
154}
155
156// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
157
/// Append `data` to `buf` without checking or growing its capacity.
///
/// # Safety
///
/// `buf` must already have at least `data.len()` bytes of spare capacity
/// (`buf.capacity() - buf.len() >= data.len()`).
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    let old_len = buf.len();
    let added = data.len();
    // SAFETY: the caller guarantees `added` spare bytes, so the destination
    // range lies inside the allocation; `data` is a separate shared borrow
    // and therefore cannot overlap the uniquely borrowed `buf`.
    unsafe {
        let dst = buf.as_mut_ptr().add(old_len);
        dst.copy_from_nonoverlapping(data.as_ptr(), added);
        buf.set_len(old_len + added);
    }
}
168
/// Append the byte `b` to `buf` without checking or growing its capacity.
///
/// # Safety
///
/// `buf` must already have at least one byte of spare capacity.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    let old_len = buf.len();
    // SAFETY: the caller guarantees one spare byte, so index `old_len` is
    // inside the allocation and may be initialized before the length grows.
    unsafe {
        buf.as_mut_ptr().add(old_len).write(b);
        buf.set_len(old_len + 1);
    }
}
179
/// Append `data` followed by the single byte `b` to `buf` without checking
/// or growing its capacity.
///
/// Fusing the two appends updates the length once instead of twice, which is
/// useful when emitting a field immediately followed by its terminator.
///
/// # Safety
///
/// `buf` must already have at least `data.len() + 1` bytes of spare capacity.
#[inline(always)]
unsafe fn buf_extend_byte(buf: &mut Vec<u8>, data: &[u8], b: u8) {
    let old_len = buf.len();
    let added = data.len();
    // SAFETY: the caller guarantees `added + 1` spare bytes, so both the
    // copied range and the trailing byte lie inside the allocation; `data`
    // is a separate shared borrow and cannot overlap `buf`.
    unsafe {
        let dst = buf.as_mut_ptr().add(old_len);
        dst.copy_from_nonoverlapping(data.as_ptr(), added);
        dst.add(added).write(b);
        buf.set_len(old_len + added + 1);
    }
}
194
195/// Write multiple IoSlice buffers using write_vectored (writev syscall).
196/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
197/// Cold path (partial write) is out-of-line to keep the hot loop tight.
198#[inline]
199fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
200    if slices.is_empty() {
201        return Ok(());
202    }
203    for batch in slices.chunks(MAX_IOV) {
204        let total: usize = batch.iter().map(|s| s.len()).sum();
205        let written = out.write_vectored(batch)?;
206        if written >= total {
207            continue;
208        }
209        if written == 0 {
210            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
211        }
212        write_ioslices_slow(out, batch, written)?;
213    }
214    Ok(())
215}
216
/// Finish a batch after a short `write_vectored` (cold path, never inlined).
///
/// `skip` is the number of bytes of `slices` that were already written.
/// Slices lying entirely inside that prefix are passed over, the slice
/// straddling the boundary is written from the boundary onward, and every
/// later slice is written in full with `write_all`.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for slice in slices {
        let bytes: &[u8] = slice;
        match bytes.get(skip..) {
            // Some unwritten bytes remain in this slice: emit them.
            Some(tail) if !tail.is_empty() => {
                out.write_all(tail)?;
                skip = 0;
            }
            // Slice fully covered by the already-written prefix.
            _ => skip -= bytes.len(),
        }
    }
    Ok(())
}
236
237// ── Chunk splitting for parallel processing ──────────────────────────────
238
/// Number of threads to assume when splitting work into parallel chunks.
///
/// Asks `std::thread::available_parallelism()` rather than rayon, so the
/// rayon pool is not initialized prematurely just to read a thread count
/// (~300-500µs); the pool inits on the first `scope()` call instead.
/// Falls back to 1 when the value cannot be determined.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}
248
249/// Split data into chunks for rayon::scope parallel processing.
250/// Uses Rayon's thread count to match the number of worker threads.
251fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
252    let num_threads = num_cpus().max(1);
253    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
254        return vec![data];
255    }
256
257    let chunk_size = data.len() / num_threads;
258    let mut chunks = Vec::with_capacity(num_threads);
259    let mut pos = 0;
260
261    for _ in 0..num_threads - 1 {
262        let target = pos + chunk_size;
263        if target >= data.len() {
264            break;
265        }
266        let boundary = memchr::memchr(line_delim, &data[target..])
267            .map(|p| target + p + 1)
268            .unwrap_or(data.len());
269        if boundary > pos {
270            chunks.push(&data[pos..boundary]);
271        }
272        pos = boundary;
273    }
274
275    if pos < data.len() {
276        chunks.push(&data[pos..]);
277    }
278
279    chunks
280}
281
282// ── Fast path: multi-field non-contiguous extraction ─────────────────────
283
284/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
285/// Pre-collects delimiter positions per line into a stack-allocated array,
286/// then directly indexes into them for each selected field.
287/// This is O(max_field) per line instead of O(num_fields * scan_length).
288fn process_fields_multi_select(
289    data: &[u8],
290    delim: u8,
291    line_delim: u8,
292    ranges: &[Range],
293    suppress: bool,
294    out: &mut impl Write,
295) -> io::Result<()> {
296    let max_field = ranges.last().map_or(0, |r| r.end);
297
298    if data.len() >= PARALLEL_THRESHOLD {
299        let chunks = split_for_scope(data, line_delim);
300        let n = chunks.len();
301        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
302        rayon::scope(|s| {
303            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
304                s.spawn(move |_| {
305                    result.reserve(chunk.len() * 3 / 4);
306                    multi_select_chunk(
307                        chunk, delim, line_delim, ranges, max_field, suppress, result,
308                    );
309                });
310            }
311        });
312        let slices: Vec<IoSlice> = results
313            .iter()
314            .filter(|r| !r.is_empty())
315            .map(|r| IoSlice::new(r))
316            .collect();
317        write_ioslices(out, &slices)?;
318    } else {
319        let mut buf = Vec::with_capacity(data.len() * 3 / 4);
320        multi_select_chunk(
321            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
322        );
323        if !buf.is_empty() {
324            out.write_all(&buf)?;
325        }
326    }
327    Ok(())
328}
329
330/// Process a chunk for multi-field extraction using a single-pass memchr2 scan.
331/// Scans for both delimiter and line_delim in one SIMD pass over the entire chunk,
332/// eliminating per-line memchr_iter setup overhead (significant for short lines).
333/// Delimiter positions are collected in a stack array per line.
334/// When max_field is reached on a line, remaining delimiters are ignored.
335fn multi_select_chunk(
336    data: &[u8],
337    delim: u8,
338    line_delim: u8,
339    ranges: &[Range],
340    max_field: usize,
341    suppress: bool,
342    buf: &mut Vec<u8>,
343) {
344    // When delim == line_delim, fall back to two-level approach
345    if delim == line_delim {
346        buf.reserve(data.len());
347        let base = data.as_ptr();
348        let mut start = 0;
349        for end_pos in memchr_iter(line_delim, data) {
350            let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
351            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
352            start = end_pos + 1;
353        }
354        if start < data.len() {
355            let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
356            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
357        }
358        return;
359    }
360
361    buf.reserve(data.len());
362    let base = data.as_ptr();
363    let data_len = data.len();
364
365    // Per-line state
366    let mut line_start: usize = 0;
367    let mut delim_pos = [0usize; 64];
368    let mut num_delims: usize = 0;
369    let max_delims = max_field.min(64);
370    let mut at_max = false;
371
372    // Single-pass scan using memchr2 for both delimiter and newline
373    for pos in memchr::memchr2_iter(delim, line_delim, data) {
374        let byte = unsafe { *base.add(pos) };
375
376        if byte == line_delim {
377            // End of line: extract fields from collected positions
378            let line_len = pos - line_start;
379            if num_delims == 0 {
380                // No delimiter in line
381                if !suppress {
382                    unsafe {
383                        buf_extend(
384                            buf,
385                            std::slice::from_raw_parts(base.add(line_start), line_len),
386                        );
387                        buf_push(buf, line_delim);
388                    }
389                }
390            } else {
391                // Extract fields using collected delimiter positions
392                let total_fields = num_delims + 1;
393                let mut first_output = true;
394
395                for r in ranges {
396                    let range_start = r.start;
397                    let range_end = r.end.min(total_fields);
398                    if range_start > total_fields {
399                        break;
400                    }
401                    for field_num in range_start..=range_end {
402                        if field_num > total_fields {
403                            break;
404                        }
405
406                        let field_start = if field_num == 1 {
407                            line_start
408                        } else if field_num - 2 < num_delims {
409                            delim_pos[field_num - 2] + 1
410                        } else {
411                            continue;
412                        };
413                        let field_end = if field_num <= num_delims {
414                            delim_pos[field_num - 1]
415                        } else {
416                            pos
417                        };
418
419                        if !first_output {
420                            unsafe { buf_push(buf, delim) };
421                        }
422                        unsafe {
423                            buf_extend(
424                                buf,
425                                std::slice::from_raw_parts(
426                                    base.add(field_start),
427                                    field_end - field_start,
428                                ),
429                            );
430                        }
431                        first_output = false;
432                    }
433                }
434
435                unsafe { buf_push(buf, line_delim) };
436            }
437
438            // Reset for next line
439            line_start = pos + 1;
440            num_delims = 0;
441            at_max = false;
442        } else {
443            // Delimiter found: collect position (up to max_field)
444            if !at_max && num_delims < max_delims {
445                delim_pos[num_delims] = pos;
446                num_delims += 1;
447                if num_delims >= max_delims {
448                    at_max = true;
449                }
450            }
451        }
452    }
453
454    // Handle last line without trailing line_delim
455    if line_start < data_len {
456        if num_delims == 0 {
457            if !suppress {
458                unsafe {
459                    buf_extend(
460                        buf,
461                        std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
462                    );
463                    buf_push(buf, line_delim);
464                }
465            }
466        } else {
467            let total_fields = num_delims + 1;
468            let mut first_output = true;
469
470            for r in ranges {
471                let range_start = r.start;
472                let range_end = r.end.min(total_fields);
473                if range_start > total_fields {
474                    break;
475                }
476                for field_num in range_start..=range_end {
477                    if field_num > total_fields {
478                        break;
479                    }
480
481                    let field_start = if field_num == 1 {
482                        line_start
483                    } else if field_num - 2 < num_delims {
484                        delim_pos[field_num - 2] + 1
485                    } else {
486                        continue;
487                    };
488                    let field_end = if field_num <= num_delims {
489                        delim_pos[field_num - 1]
490                    } else {
491                        data_len
492                    };
493
494                    if !first_output {
495                        unsafe { buf_push(buf, delim) };
496                    }
497                    unsafe {
498                        buf_extend(
499                            buf,
500                            std::slice::from_raw_parts(
501                                base.add(field_start),
502                                field_end - field_start,
503                            ),
504                        );
505                    }
506                    first_output = false;
507                }
508            }
509
510            unsafe { buf_push(buf, line_delim) };
511        }
512    }
513}
514
515/// Extract selected fields from a single line using delimiter position scanning.
516/// Scans delimiters only up to max_field (early exit), then extracts selected fields
517/// by indexing directly into the collected positions. Since ranges are pre-sorted and
518/// non-overlapping, every field within a range is selected — no is_selected check needed.
519#[inline(always)]
520fn multi_select_line(
521    line: &[u8],
522    delim: u8,
523    line_delim: u8,
524    ranges: &[Range],
525    max_field: usize,
526    suppress: bool,
527    buf: &mut Vec<u8>,
528) {
529    let len = line.len();
530    if len == 0 {
531        if !suppress {
532            unsafe { buf_push(buf, line_delim) };
533        }
534        return;
535    }
536
537    // Note: no per-line buf.reserve — multi_select_chunk already reserves data.len()
538    let base = line.as_ptr();
539
540    // Collect delimiter positions up to max_field (early exit).
541    // Stack array for up to 64 delimiter positions.
542    let mut delim_pos = [0usize; 64];
543    let mut num_delims: usize = 0;
544    let max_delims = max_field.min(64);
545
546    for pos in memchr_iter(delim, line) {
547        if num_delims < max_delims {
548            delim_pos[num_delims] = pos;
549            num_delims += 1;
550            if num_delims >= max_delims {
551                break;
552            }
553        }
554    }
555
556    if num_delims == 0 {
557        if !suppress {
558            unsafe {
559                buf_extend(buf, line);
560                buf_push(buf, line_delim);
561            }
562        }
563        return;
564    }
565
566    // Extract selected fields using delimiter positions.
567    // Ranges are pre-sorted and non-overlapping, so every field_num within a range
568    // is selected — skip the is_selected check entirely (saves 1 function call per field).
569    let total_fields = num_delims + 1;
570    let mut first_output = true;
571
572    for r in ranges {
573        let range_start = r.start;
574        let range_end = r.end.min(total_fields);
575        if range_start > total_fields {
576            break;
577        }
578        for field_num in range_start..=range_end {
579            if field_num > total_fields {
580                break;
581            }
582
583            let field_start = if field_num == 1 {
584                0
585            } else if field_num - 2 < num_delims {
586                delim_pos[field_num - 2] + 1
587            } else {
588                continue;
589            };
590            let field_end = if field_num <= num_delims {
591                delim_pos[field_num - 1]
592            } else {
593                len
594            };
595
596            if !first_output {
597                unsafe { buf_push(buf, delim) };
598            }
599            unsafe {
600                buf_extend(
601                    buf,
602                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
603                );
604            }
605            first_output = false;
606        }
607    }
608
609    unsafe { buf_push(buf, line_delim) };
610}
611
612// ── Fast path: field extraction with batched output ──────────────────────
613
614/// Optimized field extraction with early exit and batched output.
615fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
616    let delim = cfg.delim;
617    let line_delim = cfg.line_delim;
618    let ranges = cfg.ranges;
619    let complement = cfg.complement;
620    let output_delim = cfg.output_delim;
621    let suppress = cfg.suppress_no_delim;
622
623    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
624    // That scan was O(N) over the entire file just to check an edge case
625    // (no delimiter in any line). The per-line processing already handles
626    // lines without delimiters correctly, so the scan was pure overhead
627    // for files that DO contain delimiters (the common case).
628
629    // Ultra-fast path: single field extraction (e.g., cut -f5)
630    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
631        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
632    }
633
634    // Fast path: complement of single field or contiguous range with default output delimiter.
635    if complement
636        && ranges.len() == 1
637        && output_delim.len() == 1
638        && output_delim[0] == delim
639        && ranges[0].start == ranges[0].end
640    {
641        return process_complement_single_field(
642            data,
643            delim,
644            line_delim,
645            ranges[0].start,
646            suppress,
647            out,
648        );
649    }
650
651    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
652    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
653    if complement
654        && ranges.len() == 1
655        && ranges[0].start > 1
656        && ranges[0].end < usize::MAX
657        && output_delim.len() == 1
658        && output_delim[0] == delim
659    {
660        return process_complement_range(
661            data,
662            delim,
663            line_delim,
664            ranges[0].start,
665            ranges[0].end,
666            suppress,
667            out,
668        );
669    }
670
671    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
672    if !complement
673        && ranges.len() == 1
674        && ranges[0].start == 1
675        && output_delim.len() == 1
676        && output_delim[0] == delim
677        && ranges[0].end < usize::MAX
678    {
679        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
680    }
681
682    // Fast path: open-ended field range from field N (e.g., cut -f3-)
683    if !complement
684        && ranges.len() == 1
685        && ranges[0].end == usize::MAX
686        && ranges[0].start > 1
687        && output_delim.len() == 1
688        && output_delim[0] == delim
689    {
690        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
691    }
692
693    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
694    if !complement
695        && ranges.len() == 1
696        && ranges[0].start > 1
697        && ranges[0].end < usize::MAX
698        && output_delim.len() == 1
699        && output_delim[0] == delim
700    {
701        return process_fields_mid_range(
702            data,
703            delim,
704            line_delim,
705            ranges[0].start,
706            ranges[0].end,
707            suppress,
708            out,
709        );
710    }
711
712    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
713    // Uses delimiter position caching: find all delimiter positions per line,
714    // then directly index into them for each selected field.
715    // This is faster than the general extract_fields_to_buf which re-checks
716    // is_selected() for every field encountered.
717    if !complement
718        && ranges.len() > 1
719        && ranges.last().map_or(false, |r| r.end < usize::MAX)
720        && output_delim.len() == 1
721        && output_delim[0] == delim
722        && delim != line_delim
723    {
724        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
725    }
726
727    // General field extraction
728    let max_field = if complement {
729        usize::MAX
730    } else {
731        ranges.last().map(|r| r.end).unwrap_or(0)
732    };
733    let field_mask = compute_field_mask(ranges, complement);
734
735    if data.len() >= PARALLEL_THRESHOLD {
736        let chunks = split_for_scope(data, line_delim);
737        let n = chunks.len();
738        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
739        rayon::scope(|s| {
740            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
741                s.spawn(move |_| {
742                    result.reserve(chunk.len() + 1);
743                    process_fields_chunk(
744                        chunk,
745                        delim,
746                        ranges,
747                        output_delim,
748                        suppress,
749                        max_field,
750                        field_mask,
751                        line_delim,
752                        complement,
753                        result,
754                    );
755                });
756            }
757        });
758        let slices: Vec<IoSlice> = results
759            .iter()
760            .filter(|r| !r.is_empty())
761            .map(|r| IoSlice::new(r))
762            .collect();
763        write_ioslices(out, &slices)?;
764    } else {
765        // +1 for potential trailing line_delim when input doesn't end with one
766        let mut buf = Vec::with_capacity(data.len() + 1);
767        process_fields_chunk(
768            data,
769            delim,
770            ranges,
771            output_delim,
772            suppress,
773            max_field,
774            field_mask,
775            line_delim,
776            complement,
777            &mut buf,
778        );
779        if !buf.is_empty() {
780            out.write_all(&buf)?;
781        }
782    }
783    Ok(())
784}
785
786/// Process a chunk of data for general field extraction.
787/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
788/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
789/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
790fn process_fields_chunk(
791    data: &[u8],
792    delim: u8,
793    ranges: &[Range],
794    output_delim: &[u8],
795    suppress: bool,
796    max_field: usize,
797    field_mask: u64,
798    line_delim: u8,
799    complement: bool,
800    buf: &mut Vec<u8>,
801) {
802    // When delim != line_delim and max_field is bounded, use two-level approach:
803    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
804    // This avoids scanning past max_field on each line (significant for lines with
805    // many columns but small field selection like -f1,3,5 on 20-column CSV).
806    // For complement or unbounded ranges, use single-pass memchr2_iter which
807    // needs to process all delimiters anyway.
808    if delim != line_delim && max_field < usize::MAX && !complement {
809        buf.reserve(data.len());
810        let mut start = 0;
811        for end_pos in memchr_iter(line_delim, data) {
812            let line = &data[start..end_pos];
813            extract_fields_to_buf(
814                line,
815                delim,
816                ranges,
817                output_delim,
818                suppress,
819                max_field,
820                field_mask,
821                line_delim,
822                buf,
823                complement,
824            );
825            start = end_pos + 1;
826        }
827        if start < data.len() {
828            extract_fields_to_buf(
829                &data[start..],
830                delim,
831                ranges,
832                output_delim,
833                suppress,
834                max_field,
835                field_mask,
836                line_delim,
837                buf,
838                complement,
839            );
840        }
841        return;
842    }
843
844    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
845    // delimiter and line_delim in one SIMD scan.
846    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
847    if delim != line_delim {
848        buf.reserve(data.len());
849
850        let data_len = data.len();
851        let base = data.as_ptr();
852        let mut line_start: usize = 0;
853        let mut field_start: usize = 0;
854        let mut field_num: usize = 1;
855        let mut first_output = true;
856        let mut has_delim = false;
857
858        for pos in memchr::memchr2_iter(delim, line_delim, data) {
859            let byte = unsafe { *base.add(pos) };
860
861            if byte == line_delim {
862                // End of line: flush final field and emit line delimiter
863                if (field_num <= max_field || complement)
864                    && has_delim
865                    && is_selected(field_num, field_mask, ranges, complement)
866                {
867                    if !first_output {
868                        unsafe { buf_extend(buf, output_delim) };
869                    }
870                    unsafe {
871                        buf_extend(
872                            buf,
873                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
874                        )
875                    };
876                    first_output = false;
877                }
878
879                if !first_output {
880                    unsafe { buf_push(buf, line_delim) };
881                } else if !has_delim {
882                    if !suppress {
883                        unsafe {
884                            buf_extend(
885                                buf,
886                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
887                            );
888                            buf_push(buf, line_delim);
889                        }
890                    }
891                } else {
892                    unsafe { buf_push(buf, line_delim) };
893                }
894
895                // Reset state for next line
896                line_start = pos + 1;
897                field_start = pos + 1;
898                field_num = 1;
899                first_output = true;
900                has_delim = false;
901            } else {
902                // Field delimiter hit
903                has_delim = true;
904
905                if is_selected(field_num, field_mask, ranges, complement) {
906                    if !first_output {
907                        unsafe { buf_extend(buf, output_delim) };
908                    }
909                    unsafe {
910                        buf_extend(
911                            buf,
912                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
913                        )
914                    };
915                    first_output = false;
916                }
917
918                field_num += 1;
919                field_start = pos + 1;
920            }
921        }
922
923        // Handle last line without trailing line_delim
924        if line_start < data_len {
925            if line_start < data_len {
926                if (field_num <= max_field || complement)
927                    && has_delim
928                    && is_selected(field_num, field_mask, ranges, complement)
929                {
930                    if !first_output {
931                        unsafe { buf_extend(buf, output_delim) };
932                    }
933                    unsafe {
934                        buf_extend(
935                            buf,
936                            std::slice::from_raw_parts(
937                                base.add(field_start),
938                                data_len - field_start,
939                            ),
940                        )
941                    };
942                    first_output = false;
943                }
944
945                if !first_output {
946                    unsafe { buf_push(buf, line_delim) };
947                } else if !has_delim {
948                    if !suppress {
949                        unsafe {
950                            buf_extend(
951                                buf,
952                                std::slice::from_raw_parts(
953                                    base.add(line_start),
954                                    data_len - line_start,
955                                ),
956                            );
957                            buf_push(buf, line_delim);
958                        }
959                    }
960                } else {
961                    unsafe { buf_push(buf, line_delim) };
962                }
963            }
964        }
965
966        return;
967    }
968
969    // Fallback: when delim == line_delim, use the two-level scan approach
970    let mut start = 0;
971    for end_pos in memchr_iter(line_delim, data) {
972        let line = &data[start..end_pos];
973        extract_fields_to_buf(
974            line,
975            delim,
976            ranges,
977            output_delim,
978            suppress,
979            max_field,
980            field_mask,
981            line_delim,
982            buf,
983            complement,
984        );
985        start = end_pos + 1;
986    }
987    if start < data.len() {
988        extract_fields_to_buf(
989            &data[start..],
990            delim,
991            ranges,
992            output_delim,
993            suppress,
994            max_field,
995            field_mask,
996            line_delim,
997            buf,
998            complement,
999        );
1000    }
1001}
1002
1003// ── Ultra-fast single field extraction ───────────────────────────────────
1004
1005/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
1006/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
1007/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
1008fn process_single_field(
1009    data: &[u8],
1010    delim: u8,
1011    line_delim: u8,
1012    target: usize,
1013    suppress: bool,
1014    out: &mut impl Write,
1015) -> io::Result<()> {
1016    let target_idx = target - 1;
1017
1018    // For single-field extraction, parallelize at 16MB+ to match PARALLEL_THRESHOLD.
1019    const FIELD_PARALLEL_MIN: usize = 16 * 1024 * 1024;
1020
1021    if delim != line_delim {
1022        // Field 1 fast path: memchr2 single-pass scan.
1023        // For field 1, the first delimiter IS the field boundary. Lines without
1024        // delimiter are passed through unchanged.
1025        if target_idx == 0 && !suppress {
1026            if data.len() >= FIELD_PARALLEL_MIN {
1027                return single_field1_parallel(data, delim, line_delim, out);
1028            }
1029            // Sequential: scan with memchr2 into buffer, single write_all.
1030            // Faster than writev/IoSlice for moderate data because it produces
1031            // one contiguous buffer → one write syscall, and avoids IoSlice
1032            // allocation overhead for high-delimiter-density data.
1033            let mut buf = Vec::with_capacity(data.len() + 1);
1034            single_field1_to_buf(data, delim, line_delim, &mut buf);
1035            if !buf.is_empty() {
1036                out.write_all(&buf)?;
1037            }
1038            return Ok(());
1039        }
1040
1041        // Two-level approach for field N: outer newline scan + inner delim scan
1042        // with early exit at target_idx. Faster than memchr2 single-pass because
1043        // we only scan delimiters up to target_idx per line (not all of them).
1044        if data.len() >= FIELD_PARALLEL_MIN {
1045            let chunks = split_for_scope(data, line_delim);
1046            let n = chunks.len();
1047            let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1048            rayon::scope(|s| {
1049                for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1050                    s.spawn(move |_| {
1051                        result.reserve(chunk.len() / 2);
1052                        process_single_field_chunk(
1053                            chunk, delim, target_idx, line_delim, suppress, result,
1054                        );
1055                    });
1056                }
1057            });
1058            let slices: Vec<IoSlice> = results
1059                .iter()
1060                .filter(|r| !r.is_empty())
1061                .map(|r| IoSlice::new(r))
1062                .collect();
1063            write_ioslices(out, &slices)?;
1064        } else {
1065            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
1066            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1067            if !buf.is_empty() {
1068                out.write_all(&buf)?;
1069            }
1070        }
1071        return Ok(());
1072    }
1073
1074    // Fallback for delim == line_delim: nested loop approach
1075    if data.len() >= FIELD_PARALLEL_MIN {
1076        let chunks = split_for_scope(data, line_delim);
1077        let n = chunks.len();
1078        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1079        rayon::scope(|s| {
1080            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1081                s.spawn(move |_| {
1082                    result.reserve(chunk.len() / 4);
1083                    process_single_field_chunk(
1084                        chunk, delim, target_idx, line_delim, suppress, result,
1085                    );
1086                });
1087            }
1088        });
1089        let slices: Vec<IoSlice> = results
1090            .iter()
1091            .filter(|r| !r.is_empty())
1092            .map(|r| IoSlice::new(r))
1093            .collect();
1094        write_ioslices(out, &slices)?;
1095    } else {
1096        let mut buf = Vec::with_capacity(data.len() / 4);
1097        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1098        if !buf.is_empty() {
1099            out.write_all(&buf)?;
1100        }
1101    }
1102    Ok(())
1103}
1104
1105/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1106/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1107fn process_complement_range(
1108    data: &[u8],
1109    delim: u8,
1110    line_delim: u8,
1111    skip_start: usize,
1112    skip_end: usize,
1113    suppress: bool,
1114    out: &mut impl Write,
1115) -> io::Result<()> {
1116    if data.len() >= PARALLEL_THRESHOLD {
1117        let chunks = split_for_scope(data, line_delim);
1118        let n = chunks.len();
1119        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1120        rayon::scope(|s| {
1121            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1122                s.spawn(move |_| {
1123                    result.reserve(chunk.len());
1124                    complement_range_chunk(
1125                        chunk, delim, skip_start, skip_end, line_delim, suppress, result,
1126                    );
1127                });
1128            }
1129        });
1130        let slices: Vec<IoSlice> = results
1131            .iter()
1132            .filter(|r| !r.is_empty())
1133            .map(|r| IoSlice::new(r))
1134            .collect();
1135        write_ioslices(out, &slices)?;
1136    } else {
1137        let mut buf = Vec::with_capacity(data.len());
1138        complement_range_chunk(
1139            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1140        );
1141        if !buf.is_empty() {
1142            out.write_all(&buf)?;
1143        }
1144    }
1145    Ok(())
1146}
1147
1148/// Process a chunk for complement range extraction.
1149fn complement_range_chunk(
1150    data: &[u8],
1151    delim: u8,
1152    skip_start: usize,
1153    skip_end: usize,
1154    line_delim: u8,
1155    suppress: bool,
1156    buf: &mut Vec<u8>,
1157) {
1158    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
1159    buf.reserve(data.len());
1160    let mut start = 0;
1161    for end_pos in memchr_iter(line_delim, data) {
1162        let line = &data[start..end_pos];
1163        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1164        start = end_pos + 1;
1165    }
1166    if start < data.len() {
1167        complement_range_line(
1168            &data[start..],
1169            delim,
1170            skip_start,
1171            skip_end,
1172            line_delim,
1173            suppress,
1174            buf,
1175        );
1176    }
1177}
1178
1179/// Extract all fields except skip_start..=skip_end from one line.
1180/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1181///
1182/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1183/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1184/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1185#[inline(always)]
1186fn complement_range_line(
1187    line: &[u8],
1188    delim: u8,
1189    skip_start: usize,
1190    skip_end: usize,
1191    line_delim: u8,
1192    suppress: bool,
1193    buf: &mut Vec<u8>,
1194) {
1195    let len = line.len();
1196    if len == 0 {
1197        if !suppress {
1198            unsafe { buf_push(buf, line_delim) };
1199        }
1200        return;
1201    }
1202
1203    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
1204    let base = line.as_ptr();
1205
1206    // 1-based field numbers. To skip fields skip_start..=skip_end:
1207    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1208    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1209    //
1210    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1211    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1212
1213    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1214    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1215    let total_need = need_prefix_delims + need_skip_delims;
1216
1217    // Find delimiter positions up to total_need
1218    let mut delim_count: usize = 0;
1219    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1220    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1221
1222    for pos in memchr_iter(delim, line) {
1223        delim_count += 1;
1224        if delim_count == need_prefix_delims {
1225            prefix_end_pos = pos;
1226        }
1227        if delim_count == total_need {
1228            suffix_start_pos = pos + 1;
1229            break;
1230        }
1231    }
1232
1233    if delim_count == 0 {
1234        // No delimiter at all
1235        if !suppress {
1236            unsafe {
1237                buf_extend(buf, line);
1238                buf_push(buf, line_delim);
1239            }
1240        }
1241        return;
1242    }
1243
1244    // Case analysis:
1245    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1246    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1247    // 3. Enough to reach skip_end: prefix + delim + suffix
1248
1249    if delim_count < need_prefix_delims {
1250        // Not enough fields to reach skip region — output entire line
1251        unsafe {
1252            buf_extend(buf, line);
1253            buf_push(buf, line_delim);
1254        }
1255        return;
1256    }
1257
1258    let has_prefix = need_prefix_delims > 0;
1259    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1260
1261    if has_prefix && has_suffix {
1262        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1263        unsafe {
1264            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1265            buf_push(buf, delim);
1266            buf_extend(
1267                buf,
1268                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1269            );
1270            buf_push(buf, line_delim);
1271        }
1272    } else if has_prefix {
1273        // Only prefix, no suffix (skip region extends to end of line)
1274        unsafe {
1275            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1276            buf_push(buf, line_delim);
1277        }
1278    } else if has_suffix {
1279        // No prefix (skip_start == 1), only suffix
1280        unsafe {
1281            buf_extend(
1282                buf,
1283                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1284            );
1285            buf_push(buf, line_delim);
1286        }
1287    } else {
1288        // All fields skipped
1289        unsafe { buf_push(buf, line_delim) };
1290    }
1291}
1292
1293/// Complement single-field extraction: skip one field, output rest unchanged.
1294fn process_complement_single_field(
1295    data: &[u8],
1296    delim: u8,
1297    line_delim: u8,
1298    skip_field: usize,
1299    suppress: bool,
1300    out: &mut impl Write,
1301) -> io::Result<()> {
1302    let skip_idx = skip_field - 1;
1303
1304    if data.len() >= PARALLEL_THRESHOLD {
1305        let chunks = split_for_scope(data, line_delim);
1306        let n = chunks.len();
1307        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1308        rayon::scope(|s| {
1309            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1310                s.spawn(move |_| {
1311                    result.reserve(chunk.len());
1312                    complement_single_field_chunk(
1313                        chunk, delim, skip_idx, line_delim, suppress, result,
1314                    );
1315                });
1316            }
1317        });
1318        let slices: Vec<IoSlice> = results
1319            .iter()
1320            .filter(|r| !r.is_empty())
1321            .map(|r| IoSlice::new(r))
1322            .collect();
1323        write_ioslices(out, &slices)?;
1324    } else {
1325        let mut buf = Vec::with_capacity(data.len());
1326        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1327        if !buf.is_empty() {
1328            out.write_all(&buf)?;
1329        }
1330    }
1331    Ok(())
1332}
1333
1334/// Process a chunk for complement single-field extraction using memchr2 single-pass.
1335/// Scans for both delimiter and line_delim in one SIMD pass, tracking delimiter count
1336/// per line. When the skip field's bounding delimiters are found, copies prefix + suffix.
1337/// This eliminates the per-line memchr_iter setup overhead and reduces from two SIMD
1338/// passes (outer newline scan + inner delimiter scan) to one.
1339fn complement_single_field_chunk(
1340    data: &[u8],
1341    delim: u8,
1342    skip_idx: usize,
1343    line_delim: u8,
1344    suppress: bool,
1345    buf: &mut Vec<u8>,
1346) {
1347    // When delim == line_delim, fall back to per-line approach
1348    if delim == line_delim {
1349        buf.reserve(data.len());
1350        let mut start = 0;
1351        for end_pos in memchr_iter(line_delim, data) {
1352            let line = &data[start..end_pos];
1353            complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1354            start = end_pos + 1;
1355        }
1356        if start < data.len() {
1357            complement_single_field_line(
1358                &data[start..],
1359                delim,
1360                skip_idx,
1361                line_delim,
1362                suppress,
1363                buf,
1364            );
1365        }
1366        return;
1367    }
1368
1369    buf.reserve(data.len());
1370    let base = data.as_ptr();
1371    let data_len = data.len();
1372    let need_before = skip_idx; // delimiters before skip field
1373    let need_total = skip_idx + 1; // delimiters to find end of skip field
1374
1375    // Per-line state
1376    let mut line_start: usize = 0;
1377    let mut delim_count: usize = 0;
1378    let mut skip_start_pos: usize = 0;
1379    let mut skip_end_pos: usize = 0;
1380    let mut found_start = need_before == 0; // skip_idx==0 means skip starts at line start
1381    let mut found_end = false;
1382
1383    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1384        let byte = unsafe { *base.add(pos) };
1385
1386        if byte == line_delim {
1387            // End of line: emit based on what we found
1388            if delim_count == 0 {
1389                // No delimiter in line
1390                if !suppress {
1391                    unsafe {
1392                        buf_extend(
1393                            buf,
1394                            std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1395                        );
1396                        buf_push(buf, line_delim);
1397                    }
1398                }
1399            } else if !found_start || delim_count < need_before {
1400                // Not enough delimiters to reach skip field — output entire line
1401                unsafe {
1402                    buf_extend(
1403                        buf,
1404                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1405                    );
1406                    buf_push(buf, line_delim);
1407                }
1408            } else {
1409                let has_prefix = skip_idx > 0;
1410                let has_suffix = found_end && skip_end_pos < pos;
1411
1412                if has_prefix && has_suffix {
1413                    unsafe {
1414                        buf_extend(
1415                            buf,
1416                            std::slice::from_raw_parts(
1417                                base.add(line_start),
1418                                skip_start_pos - 1 - line_start,
1419                            ),
1420                        );
1421                        buf_push(buf, delim);
1422                        buf_extend(
1423                            buf,
1424                            std::slice::from_raw_parts(
1425                                base.add(skip_end_pos + 1),
1426                                pos - skip_end_pos - 1,
1427                            ),
1428                        );
1429                        buf_push(buf, line_delim);
1430                    }
1431                } else if has_prefix {
1432                    unsafe {
1433                        buf_extend(
1434                            buf,
1435                            std::slice::from_raw_parts(
1436                                base.add(line_start),
1437                                skip_start_pos - 1 - line_start,
1438                            ),
1439                        );
1440                        buf_push(buf, line_delim);
1441                    }
1442                } else if has_suffix {
1443                    unsafe {
1444                        buf_extend(
1445                            buf,
1446                            std::slice::from_raw_parts(
1447                                base.add(skip_end_pos + 1),
1448                                pos - skip_end_pos - 1,
1449                            ),
1450                        );
1451                        buf_push(buf, line_delim);
1452                    }
1453                } else {
1454                    unsafe { buf_push(buf, line_delim) };
1455                }
1456            }
1457
1458            // Reset for next line
1459            line_start = pos + 1;
1460            delim_count = 0;
1461            skip_start_pos = 0;
1462            skip_end_pos = 0;
1463            found_start = need_before == 0;
1464            found_end = false;
1465        } else {
1466            // Delimiter found
1467            delim_count += 1;
1468            if delim_count == need_before {
1469                skip_start_pos = pos + 1;
1470                found_start = true;
1471            }
1472            if delim_count == need_total {
1473                skip_end_pos = pos;
1474                found_end = true;
1475            }
1476        }
1477    }
1478
1479    // Handle last line without trailing line_delim
1480    if line_start < data_len {
1481        let pos = data_len;
1482        if delim_count == 0 {
1483            if !suppress {
1484                unsafe {
1485                    buf_extend(
1486                        buf,
1487                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1488                    );
1489                    buf_push(buf, line_delim);
1490                }
1491            }
1492        } else if !found_start || delim_count < need_before {
1493            unsafe {
1494                buf_extend(
1495                    buf,
1496                    std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1497                );
1498                buf_push(buf, line_delim);
1499            }
1500        } else {
1501            let has_prefix = skip_idx > 0;
1502            let has_suffix = found_end && skip_end_pos < pos;
1503
1504            if has_prefix && has_suffix {
1505                unsafe {
1506                    buf_extend(
1507                        buf,
1508                        std::slice::from_raw_parts(
1509                            base.add(line_start),
1510                            skip_start_pos - 1 - line_start,
1511                        ),
1512                    );
1513                    buf_push(buf, delim);
1514                    buf_extend(
1515                        buf,
1516                        std::slice::from_raw_parts(
1517                            base.add(skip_end_pos + 1),
1518                            pos - skip_end_pos - 1,
1519                        ),
1520                    );
1521                    buf_push(buf, line_delim);
1522                }
1523            } else if has_prefix {
1524                unsafe {
1525                    buf_extend(
1526                        buf,
1527                        std::slice::from_raw_parts(
1528                            base.add(line_start),
1529                            skip_start_pos - 1 - line_start,
1530                        ),
1531                    );
1532                    buf_push(buf, line_delim);
1533                }
1534            } else if has_suffix {
1535                unsafe {
1536                    buf_extend(
1537                        buf,
1538                        std::slice::from_raw_parts(
1539                            base.add(skip_end_pos + 1),
1540                            pos - skip_end_pos - 1,
1541                        ),
1542                    );
1543                    buf_push(buf, line_delim);
1544                }
1545            } else {
1546                unsafe { buf_push(buf, line_delim) };
1547            }
1548        }
1549    }
1550}
1551
1552/// Fallback per-line complement single-field extraction (for delim == line_delim).
1553#[inline(always)]
1554fn complement_single_field_line(
1555    line: &[u8],
1556    delim: u8,
1557    skip_idx: usize,
1558    line_delim: u8,
1559    suppress: bool,
1560    buf: &mut Vec<u8>,
1561) {
1562    let len = line.len();
1563    if len == 0 {
1564        if !suppress {
1565            unsafe { buf_push(buf, line_delim) };
1566        }
1567        return;
1568    }
1569
1570    let base = line.as_ptr();
1571    let need_before = skip_idx;
1572    let need_total = skip_idx + 1;
1573
1574    let mut delim_count: usize = 0;
1575    let mut skip_start_pos: usize = 0;
1576    let mut skip_end_pos: usize = len;
1577    let mut found_end = false;
1578
1579    for pos in memchr_iter(delim, line) {
1580        delim_count += 1;
1581        if delim_count == need_before {
1582            skip_start_pos = pos + 1;
1583        }
1584        if delim_count == need_total {
1585            skip_end_pos = pos;
1586            found_end = true;
1587            break;
1588        }
1589    }
1590
1591    if delim_count == 0 {
1592        if !suppress {
1593            unsafe {
1594                buf_extend(buf, line);
1595                buf_push(buf, line_delim);
1596            }
1597        }
1598        return;
1599    }
1600
1601    if delim_count < need_before {
1602        unsafe {
1603            buf_extend(buf, line);
1604            buf_push(buf, line_delim);
1605        }
1606        return;
1607    }
1608
1609    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
1610    let has_suffix = found_end && skip_end_pos < len;
1611
1612    if has_prefix && has_suffix {
1613        unsafe {
1614            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1615            buf_push(buf, delim);
1616            buf_extend(
1617                buf,
1618                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1619            );
1620            buf_push(buf, line_delim);
1621        }
1622    } else if has_prefix {
1623        unsafe {
1624            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1625            buf_push(buf, line_delim);
1626        }
1627    } else if has_suffix {
1628        unsafe {
1629            buf_extend(
1630                buf,
1631                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1632            );
1633            buf_push(buf, line_delim);
1634        }
1635    } else {
1636        unsafe { buf_push(buf, line_delim) };
1637    }
1638}
1639
1640/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1641/// Zero-copy for the non-parallel path: identifies the truncation point per line
1642/// and writes contiguous runs directly from the source data.
1643fn process_fields_prefix(
1644    data: &[u8],
1645    delim: u8,
1646    line_delim: u8,
1647    last_field: usize,
1648    suppress: bool,
1649    out: &mut impl Write,
1650) -> io::Result<()> {
1651    if data.len() >= PARALLEL_THRESHOLD {
1652        let chunks = split_for_scope(data, line_delim);
1653        let n = chunks.len();
1654        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1655        rayon::scope(|s| {
1656            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1657                s.spawn(move |_| {
1658                    result.reserve(chunk.len());
1659                    fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
1660                });
1661            }
1662        });
1663        let slices: Vec<IoSlice> = results
1664            .iter()
1665            .filter(|r| !r.is_empty())
1666            .map(|r| IoSlice::new(r))
1667            .collect();
1668        write_ioslices(out, &slices)?;
1669    } else if !suppress {
1670        // Zero-copy fast path: scan for truncation points, write runs from source.
1671        // When suppress is false, every line is output (with or without delimiter).
1672        // Most lines have enough fields, so the output is often identical to input.
1673        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1674    } else {
1675        let mut buf = Vec::with_capacity(data.len());
1676        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1677        if !buf.is_empty() {
1678            out.write_all(&buf)?;
1679        }
1680    }
1681    Ok(())
1682}
1683
1684/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1685/// directly into the source data, flushing in MAX_IOV-sized batches.
1686/// For lines where the Nth delimiter exists, we truncate at that point.
1687/// For lines with fewer fields, we output them unchanged (contiguous run).
1688/// Lines without any delimiter are output unchanged (suppress=false assumed).
1689#[inline]
1690fn fields_prefix_zerocopy(
1691    data: &[u8],
1692    delim: u8,
1693    line_delim: u8,
1694    last_field: usize,
1695    out: &mut impl Write,
1696) -> io::Result<()> {
1697    let newline_buf: [u8; 1] = [line_delim];
1698    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1699    let mut start = 0;
1700    let mut run_start: usize = 0;
1701
1702    for end_pos in memchr_iter(line_delim, data) {
1703        let line = &data[start..end_pos];
1704        let mut field_count = 1;
1705        let mut truncate_at: Option<usize> = None;
1706        for dpos in memchr_iter(delim, line) {
1707            if field_count >= last_field {
1708                truncate_at = Some(start + dpos);
1709                break;
1710            }
1711            field_count += 1;
1712        }
1713
1714        if let Some(trunc_pos) = truncate_at {
1715            if run_start < start {
1716                iov.push(IoSlice::new(&data[run_start..start]));
1717            }
1718            iov.push(IoSlice::new(&data[start..trunc_pos]));
1719            iov.push(IoSlice::new(&newline_buf));
1720            run_start = end_pos + 1;
1721
1722            if iov.len() >= MAX_IOV - 2 {
1723                write_ioslices(out, &iov)?;
1724                iov.clear();
1725            }
1726        }
1727        start = end_pos + 1;
1728    }
1729    // Handle last line without terminator
1730    if start < data.len() {
1731        let line = &data[start..];
1732        let mut field_count = 1;
1733        let mut truncate_at: Option<usize> = None;
1734        for dpos in memchr_iter(delim, line) {
1735            if field_count >= last_field {
1736                truncate_at = Some(start + dpos);
1737                break;
1738            }
1739            field_count += 1;
1740        }
1741        if let Some(trunc_pos) = truncate_at {
1742            if run_start < start {
1743                iov.push(IoSlice::new(&data[run_start..start]));
1744            }
1745            iov.push(IoSlice::new(&data[start..trunc_pos]));
1746            iov.push(IoSlice::new(&newline_buf));
1747            if !iov.is_empty() {
1748                write_ioslices(out, &iov)?;
1749            }
1750            return Ok(());
1751        }
1752    }
1753    // Flush remaining contiguous run
1754    if run_start < data.len() {
1755        iov.push(IoSlice::new(&data[run_start..]));
1756        if !data.is_empty() && *data.last().unwrap() != line_delim {
1757            iov.push(IoSlice::new(&newline_buf));
1758        }
1759    }
1760    if !iov.is_empty() {
1761        write_ioslices(out, &iov)?;
1762    }
1763    Ok(())
1764}
1765
1766/// Process a chunk for contiguous from-start field range extraction.
1767fn fields_prefix_chunk(
1768    data: &[u8],
1769    delim: u8,
1770    line_delim: u8,
1771    last_field: usize,
1772    suppress: bool,
1773    buf: &mut Vec<u8>,
1774) {
1775    buf.reserve(data.len());
1776    let mut start = 0;
1777    for end_pos in memchr_iter(line_delim, data) {
1778        let line = &data[start..end_pos];
1779        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1780        start = end_pos + 1;
1781    }
1782    if start < data.len() {
1783        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1784    }
1785}
1786
1787/// Extract first N fields from one line (contiguous from-start range).
1788/// Uses memchr SIMD for delimiter scanning on all line sizes.
1789#[inline(always)]
1790fn fields_prefix_line(
1791    line: &[u8],
1792    delim: u8,
1793    line_delim: u8,
1794    last_field: usize,
1795    suppress: bool,
1796    buf: &mut Vec<u8>,
1797) {
1798    let len = line.len();
1799    if len == 0 {
1800        if !suppress {
1801            unsafe { buf_push(buf, line_delim) };
1802        }
1803        return;
1804    }
1805
1806    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1807    let base = line.as_ptr();
1808
1809    let mut field_count = 1usize;
1810    let mut has_delim = false;
1811
1812    for pos in memchr_iter(delim, line) {
1813        has_delim = true;
1814        if field_count >= last_field {
1815            unsafe {
1816                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1817                buf_push(buf, line_delim);
1818            }
1819            return;
1820        }
1821        field_count += 1;
1822    }
1823
1824    if !has_delim {
1825        if !suppress {
1826            unsafe {
1827                buf_extend(buf, line);
1828                buf_push(buf, line_delim);
1829            }
1830        }
1831        return;
1832    }
1833
1834    unsafe {
1835        buf_extend(buf, line);
1836        buf_push(buf, line_delim);
1837    }
1838}
1839
1840/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1841fn process_fields_suffix(
1842    data: &[u8],
1843    delim: u8,
1844    line_delim: u8,
1845    start_field: usize,
1846    suppress: bool,
1847    out: &mut impl Write,
1848) -> io::Result<()> {
1849    if data.len() >= PARALLEL_THRESHOLD {
1850        let chunks = split_for_scope(data, line_delim);
1851        let n = chunks.len();
1852        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1853        rayon::scope(|s| {
1854            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1855                s.spawn(move |_| {
1856                    result.reserve(chunk.len());
1857                    fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
1858                });
1859            }
1860        });
1861        let slices: Vec<IoSlice> = results
1862            .iter()
1863            .filter(|r| !r.is_empty())
1864            .map(|r| IoSlice::new(r))
1865            .collect();
1866        write_ioslices(out, &slices)?;
1867    } else {
1868        let mut buf = Vec::with_capacity(data.len());
1869        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1870        if !buf.is_empty() {
1871            out.write_all(&buf)?;
1872        }
1873    }
1874    Ok(())
1875}
1876
1877/// Process a chunk for open-ended field suffix extraction.
1878fn fields_suffix_chunk(
1879    data: &[u8],
1880    delim: u8,
1881    line_delim: u8,
1882    start_field: usize,
1883    suppress: bool,
1884    buf: &mut Vec<u8>,
1885) {
1886    buf.reserve(data.len());
1887    let mut start = 0;
1888    for end_pos in memchr_iter(line_delim, data) {
1889        let line = &data[start..end_pos];
1890        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1891        start = end_pos + 1;
1892    }
1893    if start < data.len() {
1894        fields_suffix_line(
1895            &data[start..],
1896            delim,
1897            line_delim,
1898            start_field,
1899            suppress,
1900            buf,
1901        );
1902    }
1903}
1904
1905/// Extract fields from start_field to end from one line.
1906/// Uses memchr SIMD for delimiter scanning on all line sizes.
1907#[inline(always)]
1908fn fields_suffix_line(
1909    line: &[u8],
1910    delim: u8,
1911    line_delim: u8,
1912    start_field: usize,
1913    suppress: bool,
1914    buf: &mut Vec<u8>,
1915) {
1916    let len = line.len();
1917    if len == 0 {
1918        if !suppress {
1919            unsafe { buf_push(buf, line_delim) };
1920        }
1921        return;
1922    }
1923
1924    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
1925    let base = line.as_ptr();
1926
1927    let skip_delims = start_field - 1;
1928    let mut delim_count = 0usize;
1929    let mut has_delim = false;
1930
1931    for pos in memchr_iter(delim, line) {
1932        has_delim = true;
1933        delim_count += 1;
1934        if delim_count >= skip_delims {
1935            unsafe {
1936                buf_extend(
1937                    buf,
1938                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1939                );
1940                buf_push(buf, line_delim);
1941            }
1942            return;
1943        }
1944    }
1945
1946    if !has_delim {
1947        if !suppress {
1948            unsafe {
1949                buf_extend(buf, line);
1950                buf_push(buf, line_delim);
1951            }
1952        }
1953        return;
1954    }
1955
1956    // Fewer delimiters than needed
1957    unsafe { buf_push(buf, line_delim) };
1958}
1959
1960/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1961/// Optimized: skip to start_field using memchr, then output until end_field.
1962fn process_fields_mid_range(
1963    data: &[u8],
1964    delim: u8,
1965    line_delim: u8,
1966    start_field: usize,
1967    end_field: usize,
1968    suppress: bool,
1969    out: &mut impl Write,
1970) -> io::Result<()> {
1971    if data.len() >= PARALLEL_THRESHOLD {
1972        let chunks = split_for_scope(data, line_delim);
1973        let n = chunks.len();
1974        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1975        rayon::scope(|s| {
1976            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1977                s.spawn(move |_| {
1978                    result.reserve(chunk.len());
1979                    fields_mid_range_chunk(
1980                        chunk,
1981                        delim,
1982                        line_delim,
1983                        start_field,
1984                        end_field,
1985                        suppress,
1986                        result,
1987                    );
1988                });
1989            }
1990        });
1991        let slices: Vec<IoSlice> = results
1992            .iter()
1993            .filter(|r| !r.is_empty())
1994            .map(|r| IoSlice::new(r))
1995            .collect();
1996        write_ioslices(out, &slices)?;
1997    } else {
1998        let mut buf = Vec::with_capacity(data.len());
1999        fields_mid_range_chunk(
2000            data,
2001            delim,
2002            line_delim,
2003            start_field,
2004            end_field,
2005            suppress,
2006            &mut buf,
2007        );
2008        if !buf.is_empty() {
2009            out.write_all(&buf)?;
2010        }
2011    }
2012    Ok(())
2013}
2014
2015/// Process a chunk for contiguous mid-range field extraction.
2016/// Single-pass memchr2 scan over the entire chunk, tracking delimiter count
2017/// per line. Avoids the double-scan (outer newline + inner delimiter).
2018fn fields_mid_range_chunk(
2019    data: &[u8],
2020    delim: u8,
2021    line_delim: u8,
2022    start_field: usize,
2023    end_field: usize,
2024    suppress: bool,
2025    buf: &mut Vec<u8>,
2026) {
2027    // When delim == line_delim, fall back to per-line approach
2028    if delim == line_delim {
2029        buf.reserve(data.len());
2030        let mut start = 0;
2031        for end_pos in memchr_iter(line_delim, data) {
2032            let line = &data[start..end_pos];
2033            fields_mid_range_line(
2034                line,
2035                delim,
2036                line_delim,
2037                start_field,
2038                end_field,
2039                suppress,
2040                buf,
2041            );
2042            start = end_pos + 1;
2043        }
2044        if start < data.len() {
2045            fields_mid_range_line(
2046                &data[start..],
2047                delim,
2048                line_delim,
2049                start_field,
2050                end_field,
2051                suppress,
2052                buf,
2053            );
2054        }
2055        return;
2056    }
2057
2058    buf.reserve(data.len());
2059    let base = data.as_ptr();
2060    let skip_before = start_field - 1; // delimiters to skip before range
2061    let target_end_delim = skip_before + (end_field - start_field) + 1;
2062
2063    let mut line_start: usize = 0;
2064    let mut delim_count: usize = 0;
2065    let mut range_start: usize = 0;
2066    let mut has_delim = false;
2067    let mut found_end = false; // true when we found all target fields, skip to newline
2068
2069    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2070        let byte = unsafe { *base.add(pos) };
2071        if byte == line_delim {
2072            // End of line
2073            if found_end {
2074                // Already output this line's range
2075            } else if !has_delim {
2076                // No delimiter on this line
2077                if !suppress {
2078                    unsafe {
2079                        buf_extend(
2080                            buf,
2081                            std::slice::from_raw_parts(base.add(line_start), pos + 1 - line_start),
2082                        );
2083                    }
2084                }
2085            } else if delim_count >= skip_before {
2086                // Have enough fields for start_field; output from range_start to EOL
2087                if skip_before == 0 {
2088                    range_start = line_start;
2089                }
2090                unsafe {
2091                    buf_extend(
2092                        buf,
2093                        std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2094                    );
2095                    buf_push(buf, line_delim);
2096                }
2097            } else {
2098                // Not enough fields for start_field — output empty line
2099                unsafe { buf_push(buf, line_delim) };
2100            }
2101            line_start = pos + 1;
2102            delim_count = 0;
2103            has_delim = false;
2104            found_end = false;
2105        } else if !found_end {
2106            // Delimiter
2107            has_delim = true;
2108            delim_count += 1;
2109            if delim_count == skip_before {
2110                range_start = pos + 1;
2111            }
2112            if delim_count == target_end_delim {
2113                if skip_before == 0 {
2114                    range_start = line_start;
2115                }
2116                unsafe {
2117                    buf_extend(
2118                        buf,
2119                        std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2120                    );
2121                    buf_push(buf, line_delim);
2122                }
2123                found_end = true;
2124            }
2125        }
2126    }
2127    // Handle trailing data without final newline
2128    if line_start < data.len() && !found_end {
2129        if !has_delim {
2130            if !suppress {
2131                unsafe {
2132                    buf_extend(
2133                        buf,
2134                        std::slice::from_raw_parts(base.add(line_start), data.len() - line_start),
2135                    );
2136                }
2137            }
2138        } else if delim_count >= skip_before {
2139            if skip_before == 0 {
2140                range_start = line_start;
2141            }
2142            unsafe {
2143                buf_extend(
2144                    buf,
2145                    std::slice::from_raw_parts(base.add(range_start), data.len() - range_start),
2146                );
2147            }
2148        }
2149    }
2150}
2151
2152/// Extract fields start_field..=end_field from one line.
2153/// Uses scalar byte scanning for short lines, memchr_iter for longer.
2154/// Raw pointer arithmetic to eliminate bounds checking.
2155#[inline(always)]
2156fn fields_mid_range_line(
2157    line: &[u8],
2158    delim: u8,
2159    line_delim: u8,
2160    start_field: usize,
2161    end_field: usize,
2162    suppress: bool,
2163    buf: &mut Vec<u8>,
2164) {
2165    let len = line.len();
2166    if len == 0 {
2167        if !suppress {
2168            unsafe { buf_push(buf, line_delim) };
2169        }
2170        return;
2171    }
2172
2173    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves data.len()
2174    let base = line.as_ptr();
2175
2176    // Count delimiters to find start_field and end_field boundaries
2177    let skip_before = start_field - 1; // delimiters to skip before start_field
2178    let field_span = end_field - start_field; // additional delimiters within the range
2179    let target_end_delim = skip_before + field_span + 1;
2180    let mut delim_count = 0;
2181    let mut range_start = 0;
2182    let mut has_delim = false;
2183
2184    for pos in memchr_iter(delim, line) {
2185        has_delim = true;
2186        delim_count += 1;
2187        if delim_count == skip_before {
2188            range_start = pos + 1;
2189        }
2190        if delim_count == target_end_delim {
2191            if skip_before == 0 {
2192                range_start = 0;
2193            }
2194            unsafe {
2195                buf_extend(
2196                    buf,
2197                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2198                );
2199                buf_push(buf, line_delim);
2200            }
2201            return;
2202        }
2203    }
2204
2205    if !has_delim {
2206        if !suppress {
2207            unsafe {
2208                buf_extend(buf, line);
2209                buf_push(buf, line_delim);
2210            }
2211        }
2212        return;
2213    }
2214
2215    // Line has delimiters but fewer fields than end_field
2216    if delim_count >= skip_before {
2217        // We have at least start_field, output from range_start to end
2218        if skip_before == 0 {
2219            range_start = 0;
2220        }
2221        unsafe {
2222            buf_extend(
2223                buf,
2224                std::slice::from_raw_parts(base.add(range_start), len - range_start),
2225            );
2226            buf_push(buf, line_delim);
2227        }
2228    } else {
2229        // Not enough fields even for start_field — output empty line
2230        unsafe { buf_push(buf, line_delim) };
2231    }
2232}
2233
2234/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
2235/// directly into the source data, flushing in MAX_IOV-sized batches.
2236/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
2237///
2238/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
2239/// Parallel field-1 extraction for large data using memchr2 single-pass.
2240/// Splits data into per-thread chunks, each chunk extracts field 1 using
2241/// memchr2(delim, newline) which finds the first special byte in one scan.
2242/// For field 1: first special byte is either the delimiter (field end) or
2243/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
2244fn single_field1_parallel(
2245    data: &[u8],
2246    delim: u8,
2247    line_delim: u8,
2248    out: &mut impl Write,
2249) -> io::Result<()> {
2250    let chunks = split_for_scope(data, line_delim);
2251    let n = chunks.len();
2252    let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2253    rayon::scope(|s| {
2254        for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2255            s.spawn(move |_| {
2256                result.reserve(chunk.len() + 1);
2257                single_field1_to_buf(chunk, delim, line_delim, result);
2258            });
2259        }
2260    });
2261    let slices: Vec<IoSlice> = results
2262        .iter()
2263        .filter(|r| !r.is_empty())
2264        .map(|r| IoSlice::new(r))
2265        .collect();
2266    write_ioslices(out, &slices)
2267}
2268
2269/// Extract field 1 from a chunk using memchr2_iter single-pass SIMD scanning.
2270/// Uses a single memchr2_iter pass over the entire chunk to find both delimiters
2271/// and newlines. This eliminates the per-line memchr function call overhead
2272/// (~5-10ns per call × 2 calls per line) that dominates for short-field data.
2273///
2274/// Optimizations:
2275/// - Deferred field copy: delays copying from delimiter position to newline,
2276///   enabling fused field+newline output in a single copy sequence.
2277/// - Single output pointer: avoids per-line buf.len() load/store (saves ~488K
2278///   ops for 244K lines). One set_len at the end.
2279#[inline]
2280fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
2281    debug_assert_ne!(delim, line_delim, "delim and line_delim must differ");
2282    // Reserve data.len() + 1: output ≤ input for all lines except potentially
2283    // the last line without trailing newline, where we add a newline (GNU compat).
2284    buf.reserve(data.len() + 1);
2285
2286    // Use a single output pointer — avoids per-line buf.len() load/store.
2287    // Only one set_len at the end instead of 2 per line (saves ~488K ops for 244K lines).
2288    let base = data.as_ptr();
2289    let initial_len = buf.len();
2290    let mut out_ptr = unsafe { buf.as_mut_ptr().add(initial_len) };
2291    let mut line_start: usize = 0;
2292    let mut found_delim = false;
2293    let mut delim_pos: usize = 0; // only valid when found_delim == true
2294
2295    // SAFETY (capacity): Total output <= data.len() + 1 because:
2296    // - Lines without delimiter: output exactly the input bytes (subrange of data)
2297    // - Lines with delimiter: output field bytes (< input line), uses base reservation
2298    // - Unterminated last line: adds 1 newline, which is why we reserve +1
2299    // The +1 is only consumed by the unterminated-last-line case; all other cases
2300    // stay within data.len(). reserve(data.len() + 1) guarantees sufficient capacity.
2301    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2302        let byte = unsafe { *base.add(pos) };
2303        if byte == line_delim {
2304            if !found_delim {
2305                // No delimiter on this line — output entire line including newline
2306                let len = pos + 1 - line_start;
2307                unsafe {
2308                    std::ptr::copy_nonoverlapping(base.add(line_start), out_ptr, len);
2309                    out_ptr = out_ptr.add(len);
2310                }
2311            } else {
2312                // Delimiter was found — output field + newline in one fused copy.
2313                // field_len may be 0 (line starts with delimiter, e.g. "\trest"):
2314                // copy_nonoverlapping with count=0 is a no-op, which is correct.
2315                let field_len = delim_pos - line_start;
2316                unsafe {
2317                    std::ptr::copy_nonoverlapping(base.add(line_start), out_ptr, field_len);
2318                    out_ptr = out_ptr.add(field_len);
2319                    *out_ptr = line_delim;
2320                    out_ptr = out_ptr.add(1);
2321                }
2322            }
2323            line_start = pos + 1;
2324            found_delim = false;
2325        } else if !found_delim {
2326            // First delimiter on this line — record position, defer copy to newline
2327            found_delim = true;
2328            delim_pos = pos;
2329        }
2330        // Subsequent delimiters: ignore
2331    }
2332
2333    // Handle last line without trailing newline — GNU cut always adds newline
2334    if line_start < data.len() {
2335        if !found_delim {
2336            // No delimiter — output remaining data + newline (GNU compat)
2337            let len = data.len() - line_start;
2338            unsafe {
2339                std::ptr::copy_nonoverlapping(base.add(line_start), out_ptr, len);
2340                out_ptr = out_ptr.add(len);
2341                *out_ptr = line_delim;
2342                out_ptr = out_ptr.add(1);
2343            }
2344        } else {
2345            // Field + trailing newline (GNU compat)
2346            let field_len = delim_pos - line_start;
2347            unsafe {
2348                std::ptr::copy_nonoverlapping(base.add(line_start), out_ptr, field_len);
2349                out_ptr = out_ptr.add(field_len);
2350                *out_ptr = line_delim;
2351                out_ptr = out_ptr.add(1);
2352            }
2353        }
2354    }
2355
2356    // SAFETY: out_ptr was derived from buf.as_mut_ptr().add(initial_len) after
2357    // the reserve() call, and no Vec reallocation occurred between capture and
2358    // here (no safe buf.* calls in the loop body). Using pointer subtraction
2359    // instead of offset_from avoids the isize intermediate — both pointers are
2360    // in the same allocation so the subtraction is always non-negative.
2361    unsafe {
2362        let new_len = out_ptr as usize - buf.as_ptr() as usize;
2363        debug_assert!(new_len >= initial_len && new_len <= buf.capacity());
2364        buf.set_len(new_len);
2365    }
2366}
2367
2368/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
2369/// directly into the source data. Uses two-level scan: outer memchr(newline)
2370/// for the first delimiter. This is faster than memchr2 for SMALL data because
2371/// the inner scan exits after the FIRST delimiter, skipping all
2372/// subsequent delimiters on the line.
2373///
2374/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
2375/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
2376#[inline]
2377#[allow(dead_code)]
2378fn single_field1_zerocopy(
2379    data: &[u8],
2380    delim: u8,
2381    line_delim: u8,
2382    out: &mut impl Write,
2383) -> io::Result<()> {
2384    let newline_buf: [u8; 1] = [line_delim];
2385
2386    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2387    let mut run_start: usize = 0;
2388    let mut start = 0;
2389
2390    for end_pos in memchr_iter(line_delim, data) {
2391        let line = &data[start..end_pos];
2392        if let Some(dp) = memchr::memchr(delim, line) {
2393            // Line has delimiter — truncate at first delimiter.
2394            // Flush current contiguous run, then add truncated field + newline.
2395            if run_start < start {
2396                iov.push(IoSlice::new(&data[run_start..start]));
2397            }
2398            iov.push(IoSlice::new(&data[start..start + dp]));
2399            iov.push(IoSlice::new(&newline_buf));
2400            run_start = end_pos + 1;
2401
2402            if iov.len() >= MAX_IOV - 2 {
2403                write_ioslices(out, &iov)?;
2404                iov.clear();
2405            }
2406        }
2407        // else: no delimiter in line, output unchanged (stays in contiguous run)
2408        start = end_pos + 1;
2409    }
2410
2411    // Handle last line (no trailing newline)
2412    if start < data.len() {
2413        let line = &data[start..];
2414        if let Some(dp) = memchr::memchr(delim, line) {
2415            if run_start < start {
2416                iov.push(IoSlice::new(&data[run_start..start]));
2417            }
2418            iov.push(IoSlice::new(&data[start..start + dp]));
2419            iov.push(IoSlice::new(&newline_buf));
2420            if !iov.is_empty() {
2421                write_ioslices(out, &iov)?;
2422            }
2423            return Ok(());
2424        }
2425    }
2426
2427    // Flush remaining contiguous run
2428    if run_start < data.len() {
2429        iov.push(IoSlice::new(&data[run_start..]));
2430        if !data.is_empty() && *data.last().unwrap() != line_delim {
2431            iov.push(IoSlice::new(&newline_buf));
2432        }
2433    }
2434    if !iov.is_empty() {
2435        write_ioslices(out, &iov)?;
2436    }
2437    Ok(())
2438}
2439
2440/// Process a chunk of data for single-field extraction.
2441fn process_single_field_chunk(
2442    data: &[u8],
2443    delim: u8,
2444    target_idx: usize,
2445    line_delim: u8,
2446    suppress: bool,
2447    buf: &mut Vec<u8>,
2448) {
2449    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
2450    buf.reserve(data.len());
2451    let mut start = 0;
2452    for end_pos in memchr_iter(line_delim, data) {
2453        let line = &data[start..end_pos];
2454        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
2455        start = end_pos + 1;
2456    }
2457    if start < data.len() {
2458        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
2459    }
2460}
2461
2462/// Extract a single field from one line.
2463/// For short lines (< 256 bytes), uses direct scalar scanning to avoid memchr overhead.
2464/// For longer lines, uses memchr for SIMD-accelerated scanning.
2465/// Raw pointer arithmetic eliminates per-field bounds checking.
2466#[inline(always)]
2467fn extract_single_field_line(
2468    line: &[u8],
2469    delim: u8,
2470    target_idx: usize,
2471    line_delim: u8,
2472    suppress: bool,
2473    buf: &mut Vec<u8>,
2474) {
2475    let len = line.len();
2476    if len == 0 {
2477        if !suppress {
2478            unsafe { buf_push(buf, line_delim) };
2479        }
2480        return;
2481    }
2482
2483    // Note: no per-line buf.reserve — process_single_field_chunk already reserves data.len()
2484    let base = line.as_ptr();
2485
2486    // Ultra-fast path for first field: single memchr
2487    if target_idx == 0 {
2488        match memchr::memchr(delim, line) {
2489            Some(pos) => unsafe {
2490                buf_extend_byte(buf, std::slice::from_raw_parts(base, pos), line_delim);
2491            },
2492            None => {
2493                if !suppress {
2494                    unsafe {
2495                        buf_extend_byte(buf, line, line_delim);
2496                    }
2497                }
2498            }
2499        }
2500        return;
2501    }
2502
2503    // Use memchr SIMD for all line sizes (faster than scalar even for short lines)
2504    let mut field_start = 0;
2505    let mut field_idx = 0;
2506    let mut has_delim = false;
2507
2508    for pos in memchr_iter(delim, line) {
2509        has_delim = true;
2510        if field_idx == target_idx {
2511            unsafe {
2512                buf_extend_byte(
2513                    buf,
2514                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
2515                    line_delim,
2516                );
2517            }
2518            return;
2519        }
2520        field_idx += 1;
2521        field_start = pos + 1;
2522    }
2523
2524    if !has_delim {
2525        if !suppress {
2526            unsafe {
2527                buf_extend_byte(buf, line, line_delim);
2528            }
2529        }
2530        return;
2531    }
2532
2533    if field_idx == target_idx {
2534        unsafe {
2535            buf_extend_byte(
2536                buf,
2537                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2538                line_delim,
2539            );
2540        }
2541    } else {
2542        unsafe { buf_push(buf, line_delim) };
2543    }
2544}
2545
2546/// Extract fields from a single line into the output buffer.
2547/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2548/// Raw pointer arithmetic eliminates per-field bounds checking.
2549#[inline(always)]
2550fn extract_fields_to_buf(
2551    line: &[u8],
2552    delim: u8,
2553    ranges: &[Range],
2554    output_delim: &[u8],
2555    suppress: bool,
2556    max_field: usize,
2557    field_mask: u64,
2558    line_delim: u8,
2559    buf: &mut Vec<u8>,
2560    complement: bool,
2561) {
2562    let len = line.len();
2563
2564    if len == 0 {
2565        if !suppress {
2566            buf.push(line_delim);
2567        }
2568        return;
2569    }
2570
2571    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2572    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2573    let needed = len + output_delim.len() * 16 + 1;
2574    if buf.capacity() - buf.len() < needed {
2575        buf.reserve(needed);
2576    }
2577
2578    let base = line.as_ptr();
2579    let mut field_num: usize = 1;
2580    let mut field_start: usize = 0;
2581    let mut first_output = true;
2582    let mut has_delim = false;
2583
2584    // Use memchr SIMD for all line sizes
2585    for delim_pos in memchr_iter(delim, line) {
2586        has_delim = true;
2587
2588        if is_selected(field_num, field_mask, ranges, complement) {
2589            if !first_output {
2590                unsafe { buf_extend(buf, output_delim) };
2591            }
2592            unsafe {
2593                buf_extend(
2594                    buf,
2595                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2596                )
2597            };
2598            first_output = false;
2599        }
2600
2601        field_num += 1;
2602        field_start = delim_pos + 1;
2603
2604        if field_num > max_field {
2605            break;
2606        }
2607    }
2608
2609    // Last field
2610    if (field_num <= max_field || complement)
2611        && has_delim
2612        && is_selected(field_num, field_mask, ranges, complement)
2613    {
2614        if !first_output {
2615            unsafe { buf_extend(buf, output_delim) };
2616        }
2617        unsafe {
2618            buf_extend(
2619                buf,
2620                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2621            )
2622        };
2623        first_output = false;
2624    }
2625
2626    if !first_output {
2627        unsafe { buf_push(buf, line_delim) };
2628    } else if !has_delim {
2629        if !suppress {
2630            unsafe {
2631                buf_extend(buf, line);
2632                buf_push(buf, line_delim);
2633            }
2634        }
2635    } else {
2636        unsafe { buf_push(buf, line_delim) };
2637    }
2638}
2639
2640// ── Fast path: byte/char extraction with batched output ──────────────────
2641
2642/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2643/// Zero-copy: writes directly from the source data using output runs.
2644/// For lines shorter than max_bytes, the output is identical to the input,
2645/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2646fn process_bytes_from_start(
2647    data: &[u8],
2648    max_bytes: usize,
2649    line_delim: u8,
2650    out: &mut impl Write,
2651) -> io::Result<()> {
2652    // For small data (< PARALLEL_THRESHOLD): check if all lines fit for zero-copy passthrough.
2653    // The sequential scan + write_all is competitive with per-line processing for small data.
2654    //
2655    // For large data (>= PARALLEL_THRESHOLD): skip the all_fit scan entirely.
2656    // The scan is sequential (~1.7ms for 10MB at memchr speed) while parallel per-line
2657    // processing is much faster (~0.5ms for 10MB with 4 threads). Even when all lines fit,
2658    // the parallel copy + write is faster than sequential scan + zero-copy write.
2659    if data.len() < PARALLEL_THRESHOLD && max_bytes > 0 && max_bytes < usize::MAX {
2660        let mut start = 0;
2661        let mut all_fit = true;
2662        for pos in memchr_iter(line_delim, data) {
2663            if pos - start > max_bytes {
2664                all_fit = false;
2665                break;
2666            }
2667            start = pos + 1;
2668        }
2669        // Check last line (no trailing delimiter)
2670        if all_fit && start < data.len() && data.len() - start > max_bytes {
2671            all_fit = false;
2672        }
2673        if all_fit {
2674            // All lines fit: output = input. Handle missing trailing delimiter.
2675            if !data.is_empty() && data[data.len() - 1] == line_delim {
2676                return out.write_all(data);
2677            } else if !data.is_empty() {
2678                out.write_all(data)?;
2679                return out.write_all(&[line_delim]);
2680            }
2681            return Ok(());
2682        }
2683    }
2684
2685    if data.len() >= PARALLEL_THRESHOLD {
2686        let chunks = split_for_scope(data, line_delim);
2687        let n = chunks.len();
2688        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2689        rayon::scope(|s| {
2690            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2691                s.spawn(move |_| {
2692                    // Output can be up to input size (when all lines fit).
2693                    // Reserve full chunk size to avoid reallocation.
2694                    result.reserve(chunk.len());
2695                    bytes_from_start_chunk(chunk, max_bytes, line_delim, result);
2696                });
2697            }
2698        });
2699        // Use write_vectored (writev) to batch N writes into fewer syscalls
2700        let slices: Vec<IoSlice> = results
2701            .iter()
2702            .filter(|r| !r.is_empty())
2703            .map(|r| IoSlice::new(r))
2704            .collect();
2705        write_ioslices(out, &slices)?;
2706    } else {
2707        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2708        // because every line gets truncated, creating 3 IoSlice entries per line.
2709        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2710        // managing millions of IoSlice entries through the kernel.
2711        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2712        if max_bytes <= 512 {
2713            // Estimate output size without scanning: output <= data.len(),
2714            // typically ~data.len()/4 for short max_bytes on longer lines.
2715            let est_out = (data.len() / 4).max(max_bytes + 2);
2716            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2717            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2718            if !buf.is_empty() {
2719                out.write_all(&buf)?;
2720            }
2721        } else {
2722            // Zero-copy path: track contiguous output runs and write directly from source.
2723            // For lines <= max_bytes, we include them as-is (no copy needed).
2724            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2725            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2726        }
2727    }
2728    Ok(())
2729}
2730
2731/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2732/// directly into the source data, flushing in MAX_IOV-sized batches.
2733/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2734/// produce two IoSlices (truncated data + newline).
2735#[inline]
2736fn bytes_from_start_zerocopy(
2737    data: &[u8],
2738    max_bytes: usize,
2739    line_delim: u8,
2740    out: &mut impl Write,
2741) -> io::Result<()> {
2742    let newline_buf: [u8; 1] = [line_delim];
2743    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2744    let mut start = 0;
2745    let mut run_start: usize = 0;
2746
2747    for pos in memchr_iter(line_delim, data) {
2748        let line_len = pos - start;
2749        if line_len > max_bytes {
2750            // This line needs truncation
2751            if run_start < start {
2752                iov.push(IoSlice::new(&data[run_start..start]));
2753            }
2754            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2755            iov.push(IoSlice::new(&newline_buf));
2756            run_start = pos + 1;
2757
2758            if iov.len() >= MAX_IOV - 2 {
2759                write_ioslices(out, &iov)?;
2760                iov.clear();
2761            }
2762        }
2763        start = pos + 1;
2764    }
2765    // Handle last line without terminator
2766    if start < data.len() {
2767        let line_len = data.len() - start;
2768        if line_len > max_bytes {
2769            if run_start < start {
2770                iov.push(IoSlice::new(&data[run_start..start]));
2771            }
2772            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2773            iov.push(IoSlice::new(&newline_buf));
2774            if !iov.is_empty() {
2775                write_ioslices(out, &iov)?;
2776            }
2777            return Ok(());
2778        }
2779    }
2780    // Flush remaining contiguous run
2781    if run_start < data.len() {
2782        iov.push(IoSlice::new(&data[run_start..]));
2783        if !data.is_empty() && *data.last().unwrap() != line_delim {
2784            iov.push(IoSlice::new(&newline_buf));
2785        }
2786    }
2787    if !iov.is_empty() {
2788        write_ioslices(out, &iov)?;
2789    }
2790    Ok(())
2791}
2792
2793/// Process a chunk for from-start byte range extraction (parallel path).
2794/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2795/// Pre-reserves data.len() (output never exceeds input), then uses a single
2796/// write pointer with deferred set_len — no per-line capacity checks.
2797#[inline]
2798fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2799    // Output is always <= input size (we only truncate, never expand).
2800    // Single reserve eliminates ALL per-line capacity checks.
2801    buf.reserve(data.len());
2802
2803    let src = data.as_ptr();
2804    let dst_base = buf.as_mut_ptr();
2805    let mut wp = buf.len();
2806    let mut start = 0;
2807
2808    for pos in memchr_iter(line_delim, data) {
2809        let line_len = pos - start;
2810        let take = line_len.min(max_bytes);
2811        unsafe {
2812            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2813            *dst_base.add(wp + take) = line_delim;
2814        }
2815        wp += take + 1;
2816        start = pos + 1;
2817    }
2818    // Handle last line without terminator
2819    if start < data.len() {
2820        let line_len = data.len() - start;
2821        let take = line_len.min(max_bytes);
2822        unsafe {
2823            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2824            *dst_base.add(wp + take) = line_delim;
2825        }
2826        wp += take + 1;
2827    }
2828    unsafe { buf.set_len(wp) };
2829}
2830
2831/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2832fn process_bytes_from_offset(
2833    data: &[u8],
2834    skip_bytes: usize,
2835    line_delim: u8,
2836    out: &mut impl Write,
2837) -> io::Result<()> {
2838    if data.len() >= PARALLEL_THRESHOLD {
2839        let chunks = split_for_scope(data, line_delim);
2840        let n = chunks.len();
2841        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2842        rayon::scope(|s| {
2843            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2844                s.spawn(move |_| {
2845                    result.reserve(chunk.len());
2846                    bytes_from_offset_chunk(chunk, skip_bytes, line_delim, result);
2847                });
2848            }
2849        });
2850        // Use write_vectored (writev) to batch N writes into fewer syscalls
2851        let slices: Vec<IoSlice> = results
2852            .iter()
2853            .filter(|r| !r.is_empty())
2854            .map(|r| IoSlice::new(r))
2855            .collect();
2856        write_ioslices(out, &slices)?;
2857    } else {
2858        // Zero-copy: write suffix of each line directly from source
2859        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2860    }
2861    Ok(())
2862}
2863
2864/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2865/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2866/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2867#[inline]
2868fn bytes_from_offset_zerocopy(
2869    data: &[u8],
2870    skip_bytes: usize,
2871    line_delim: u8,
2872    out: &mut impl Write,
2873) -> io::Result<()> {
2874    let delim_buf = [line_delim];
2875    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2876
2877    let mut start = 0;
2878    for pos in memchr_iter(line_delim, data) {
2879        let line_len = pos - start;
2880        if line_len > skip_bytes {
2881            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2882        }
2883        iov.push(IoSlice::new(&delim_buf));
2884        // Flush when approaching MAX_IOV to avoid oversized writev
2885        if iov.len() >= MAX_IOV - 1 {
2886            write_ioslices(out, &iov)?;
2887            iov.clear();
2888        }
2889        start = pos + 1;
2890    }
2891    if start < data.len() {
2892        let line_len = data.len() - start;
2893        if line_len > skip_bytes {
2894            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2895        }
2896        iov.push(IoSlice::new(&delim_buf));
2897    }
2898    if !iov.is_empty() {
2899        write_ioslices(out, &iov)?;
2900    }
2901    Ok(())
2902}
2903
2904/// Process a chunk for from-offset byte range extraction.
2905/// Single reserve + deferred set_len for zero per-line overhead.
2906#[inline]
2907fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2908    buf.reserve(data.len());
2909
2910    let src = data.as_ptr();
2911    let dst_base = buf.as_mut_ptr();
2912    let mut wp = buf.len();
2913    let mut start = 0;
2914
2915    for pos in memchr_iter(line_delim, data) {
2916        let line_len = pos - start;
2917        if line_len > skip_bytes {
2918            let take = line_len - skip_bytes;
2919            unsafe {
2920                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2921            }
2922            wp += take;
2923        }
2924        unsafe {
2925            *dst_base.add(wp) = line_delim;
2926        }
2927        wp += 1;
2928        start = pos + 1;
2929    }
2930    if start < data.len() {
2931        let line_len = data.len() - start;
2932        if line_len > skip_bytes {
2933            let take = line_len - skip_bytes;
2934            unsafe {
2935                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2936            }
2937            wp += take;
2938        }
2939        unsafe {
2940            *dst_base.add(wp) = line_delim;
2941        }
2942        wp += 1;
2943    }
2944    unsafe { buf.set_len(wp) };
2945}
2946
2947/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2948fn process_bytes_mid_range(
2949    data: &[u8],
2950    start_byte: usize,
2951    end_byte: usize,
2952    line_delim: u8,
2953    out: &mut impl Write,
2954) -> io::Result<()> {
2955    let skip = start_byte.saturating_sub(1);
2956
2957    if data.len() >= PARALLEL_THRESHOLD {
2958        let chunks = split_for_scope(data, line_delim);
2959        let n = chunks.len();
2960        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2961        rayon::scope(|s| {
2962            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2963                s.spawn(move |_| {
2964                    result.reserve(chunk.len());
2965                    bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, result);
2966                });
2967            }
2968        });
2969        let slices: Vec<IoSlice> = results
2970            .iter()
2971            .filter(|r| !r.is_empty())
2972            .map(|r| IoSlice::new(r))
2973            .collect();
2974        write_ioslices(out, &slices)?;
2975    } else {
2976        let mut buf = Vec::with_capacity(data.len());
2977        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2978        if !buf.is_empty() {
2979            out.write_all(&buf)?;
2980        }
2981    }
2982    Ok(())
2983}
2984
2985/// Process a chunk for mid-range byte extraction.
2986/// For each line, output bytes skip..min(line_len, end_byte).
2987/// Single reserve + deferred set_len.
2988#[inline]
2989fn bytes_mid_range_chunk(
2990    data: &[u8],
2991    skip: usize,
2992    end_byte: usize,
2993    line_delim: u8,
2994    buf: &mut Vec<u8>,
2995) {
2996    buf.reserve(data.len());
2997
2998    let src = data.as_ptr();
2999    let dst_base = buf.as_mut_ptr();
3000    let mut wp = buf.len();
3001    let mut start = 0;
3002
3003    for pos in memchr_iter(line_delim, data) {
3004        let line_len = pos - start;
3005        if line_len > skip {
3006            let take_end = line_len.min(end_byte);
3007            let take = take_end - skip;
3008            unsafe {
3009                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
3010            }
3011            wp += take;
3012        }
3013        unsafe {
3014            *dst_base.add(wp) = line_delim;
3015        }
3016        wp += 1;
3017        start = pos + 1;
3018    }
3019    if start < data.len() {
3020        let line_len = data.len() - start;
3021        if line_len > skip {
3022            let take_end = line_len.min(end_byte);
3023            let take = take_end - skip;
3024            unsafe {
3025                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
3026            }
3027            wp += take;
3028        }
3029        unsafe {
3030            *dst_base.add(wp) = line_delim;
3031        }
3032        wp += 1;
3033    }
3034    unsafe { buf.set_len(wp) };
3035}
3036
3037/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
3038fn process_bytes_complement_mid(
3039    data: &[u8],
3040    skip_start: usize,
3041    skip_end: usize,
3042    line_delim: u8,
3043    out: &mut impl Write,
3044) -> io::Result<()> {
3045    let prefix_bytes = skip_start - 1; // bytes before the skip region
3046    if data.len() >= PARALLEL_THRESHOLD {
3047        let chunks = split_for_scope(data, line_delim);
3048        let n = chunks.len();
3049        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
3050        rayon::scope(|s| {
3051            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
3052                s.spawn(move |_| {
3053                    result.reserve(chunk.len());
3054                    bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, result);
3055                });
3056            }
3057        });
3058        let slices: Vec<IoSlice> = results
3059            .iter()
3060            .filter(|r| !r.is_empty())
3061            .map(|r| IoSlice::new(r))
3062            .collect();
3063        write_ioslices(out, &slices)?;
3064    } else {
3065        let mut buf = Vec::with_capacity(data.len());
3066        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
3067        if !buf.is_empty() {
3068            out.write_all(&buf)?;
3069        }
3070    }
3071    Ok(())
3072}
3073
3074/// Process a chunk for complement mid-range byte extraction.
3075/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
3076#[inline]
3077fn bytes_complement_mid_chunk(
3078    data: &[u8],
3079    prefix_bytes: usize,
3080    skip_end: usize,
3081    line_delim: u8,
3082    buf: &mut Vec<u8>,
3083) {
3084    buf.reserve(data.len());
3085
3086    let src = data.as_ptr();
3087    let dst_base = buf.as_mut_ptr();
3088    let mut wp = buf.len();
3089    let mut start = 0;
3090
3091    for pos in memchr_iter(line_delim, data) {
3092        let line_len = pos - start;
3093        // Copy prefix (bytes before skip region)
3094        let take_prefix = prefix_bytes.min(line_len);
3095        if take_prefix > 0 {
3096            unsafe {
3097                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3098            }
3099            wp += take_prefix;
3100        }
3101        // Copy suffix (bytes after skip region)
3102        if line_len > skip_end {
3103            let suffix_len = line_len - skip_end;
3104            unsafe {
3105                std::ptr::copy_nonoverlapping(
3106                    src.add(start + skip_end),
3107                    dst_base.add(wp),
3108                    suffix_len,
3109                );
3110            }
3111            wp += suffix_len;
3112        }
3113        unsafe {
3114            *dst_base.add(wp) = line_delim;
3115        }
3116        wp += 1;
3117        start = pos + 1;
3118    }
3119    if start < data.len() {
3120        let line_len = data.len() - start;
3121        let take_prefix = prefix_bytes.min(line_len);
3122        if take_prefix > 0 {
3123            unsafe {
3124                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3125            }
3126            wp += take_prefix;
3127        }
3128        if line_len > skip_end {
3129            let suffix_len = line_len - skip_end;
3130            unsafe {
3131                std::ptr::copy_nonoverlapping(
3132                    src.add(start + skip_end),
3133                    dst_base.add(wp),
3134                    suffix_len,
3135                );
3136            }
3137            wp += suffix_len;
3138        }
3139        unsafe {
3140            *dst_base.add(wp) = line_delim;
3141        }
3142        wp += 1;
3143    }
3144    unsafe { buf.set_len(wp) };
3145}
3146
3147/// Optimized byte/char extraction with batched output and parallel processing.
3148fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3149    let line_delim = cfg.line_delim;
3150    let ranges = cfg.ranges;
3151    let complement = cfg.complement;
3152    let output_delim = cfg.output_delim;
3153
3154    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
3155    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
3156        let max_bytes = ranges[0].end;
3157        if max_bytes < usize::MAX {
3158            return process_bytes_from_start(data, max_bytes, line_delim, out);
3159        }
3160    }
3161
3162    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
3163    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
3164        let skip_bytes = ranges[0].start.saturating_sub(1);
3165        if skip_bytes > 0 {
3166            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
3167        }
3168    }
3169
3170    // Fast path: single mid-range (e.g., cut -b5-100)
3171    if !complement
3172        && ranges.len() == 1
3173        && ranges[0].start > 1
3174        && ranges[0].end < usize::MAX
3175        && output_delim.is_empty()
3176    {
3177        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
3178    }
3179
3180    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
3181    if complement
3182        && ranges.len() == 1
3183        && ranges[0].start == 1
3184        && ranges[0].end < usize::MAX
3185        && output_delim.is_empty()
3186    {
3187        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
3188    }
3189
3190    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
3191    if complement
3192        && ranges.len() == 1
3193        && ranges[0].end == usize::MAX
3194        && ranges[0].start > 1
3195        && output_delim.is_empty()
3196    {
3197        let max_bytes = ranges[0].start - 1;
3198        return process_bytes_from_start(data, max_bytes, line_delim, out);
3199    }
3200
3201    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
3202    if complement
3203        && ranges.len() == 1
3204        && ranges[0].start > 1
3205        && ranges[0].end < usize::MAX
3206        && output_delim.is_empty()
3207    {
3208        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
3209    }
3210
3211    if data.len() >= PARALLEL_THRESHOLD {
3212        let chunks = split_for_scope(data, line_delim);
3213        let n = chunks.len();
3214        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
3215        rayon::scope(|s| {
3216            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
3217                s.spawn(move |_| {
3218                    result.reserve(chunk.len() + 1);
3219                    process_bytes_chunk(
3220                        chunk,
3221                        ranges,
3222                        complement,
3223                        output_delim,
3224                        line_delim,
3225                        result,
3226                    );
3227                });
3228            }
3229        });
3230        let slices: Vec<IoSlice> = results
3231            .iter()
3232            .filter(|r| !r.is_empty())
3233            .map(|r| IoSlice::new(r))
3234            .collect();
3235        write_ioslices(out, &slices)?;
3236    } else {
3237        // +1 for potential trailing line_delim when input doesn't end with one
3238        let mut buf = Vec::with_capacity(data.len() + 1);
3239        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
3240        if !buf.is_empty() {
3241            out.write_all(&buf)?;
3242        }
3243    }
3244    Ok(())
3245}
3246
3247/// Process a chunk of data for byte/char extraction.
3248/// Uses raw pointer arithmetic for the newline scan.
3249/// Complement single-range fast path: compute complement ranges once, then use
3250/// the non-complement multi-range path which is more cache-friendly.
3251fn process_bytes_chunk(
3252    data: &[u8],
3253    ranges: &[Range],
3254    complement: bool,
3255    output_delim: &[u8],
3256    line_delim: u8,
3257    buf: &mut Vec<u8>,
3258) {
3259    buf.reserve(data.len());
3260    let base = data.as_ptr();
3261    let mut start = 0;
3262    for end_pos in memchr_iter(line_delim, data) {
3263        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
3264        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3265        unsafe { buf_push(buf, line_delim) };
3266        start = end_pos + 1;
3267    }
3268    if start < data.len() {
3269        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
3270        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3271        unsafe { buf_push(buf, line_delim) };
3272    }
3273}
3274
3275/// Extract byte ranges from a line into the output buffer.
3276/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
3277/// Raw pointer arithmetic eliminates per-range bounds checking.
3278#[inline(always)]
3279fn cut_bytes_to_buf(
3280    line: &[u8],
3281    ranges: &[Range],
3282    complement: bool,
3283    output_delim: &[u8],
3284    buf: &mut Vec<u8>,
3285) {
3286    let len = line.len();
3287    let base = line.as_ptr();
3288    let mut first_range = true;
3289
3290    // Reserve worst case: full line + delimiters between ranges
3291    let needed = len + output_delim.len() * ranges.len() + 1;
3292    if buf.capacity() - buf.len() < needed {
3293        buf.reserve(needed);
3294    }
3295
3296    if complement {
3297        let mut pos: usize = 1;
3298        for r in ranges {
3299            let rs = r.start;
3300            let re = r.end.min(len);
3301            if pos < rs {
3302                if !first_range && !output_delim.is_empty() {
3303                    unsafe { buf_extend(buf, output_delim) };
3304                }
3305                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
3306                first_range = false;
3307            }
3308            pos = re + 1;
3309            if pos > len {
3310                break;
3311            }
3312        }
3313        if pos <= len {
3314            if !first_range && !output_delim.is_empty() {
3315                unsafe { buf_extend(buf, output_delim) };
3316            }
3317            unsafe {
3318                buf_extend(
3319                    buf,
3320                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
3321                )
3322            };
3323        }
3324    } else if output_delim.is_empty() && ranges.len() == 1 {
3325        // Ultra-fast path: single range, no output delimiter
3326        let start = ranges[0].start.saturating_sub(1);
3327        let end = ranges[0].end.min(len);
3328        if start < len {
3329            unsafe {
3330                buf_extend(
3331                    buf,
3332                    std::slice::from_raw_parts(base.add(start), end - start),
3333                )
3334            };
3335        }
3336    } else {
3337        for r in ranges {
3338            let start = r.start.saturating_sub(1);
3339            let end = r.end.min(len);
3340            if start >= len {
3341                break;
3342            }
3343            if !first_range && !output_delim.is_empty() {
3344                unsafe { buf_extend(buf, output_delim) };
3345            }
3346            unsafe {
3347                buf_extend(
3348                    buf,
3349                    std::slice::from_raw_parts(base.add(start), end - start),
3350                )
3351            };
3352            first_range = false;
3353        }
3354    }
3355}
3356
3357// ── Public API ───────────────────────────────────────────────────────────
3358
3359/// Cut fields from a line using a delimiter. Writes to `out`.
3360#[inline]
3361pub fn cut_fields(
3362    line: &[u8],
3363    delim: u8,
3364    ranges: &[Range],
3365    complement: bool,
3366    output_delim: &[u8],
3367    suppress_no_delim: bool,
3368    out: &mut impl Write,
3369) -> io::Result<bool> {
3370    if memchr::memchr(delim, line).is_none() {
3371        if !suppress_no_delim {
3372            out.write_all(line)?;
3373            return Ok(true);
3374        }
3375        return Ok(false);
3376    }
3377
3378    let mut field_num: usize = 1;
3379    let mut field_start: usize = 0;
3380    let mut first_output = true;
3381
3382    for delim_pos in memchr_iter(delim, line) {
3383        let selected = in_ranges(ranges, field_num) != complement;
3384        if selected {
3385            if !first_output {
3386                out.write_all(output_delim)?;
3387            }
3388            out.write_all(&line[field_start..delim_pos])?;
3389            first_output = false;
3390        }
3391        field_start = delim_pos + 1;
3392        field_num += 1;
3393    }
3394
3395    let selected = in_ranges(ranges, field_num) != complement;
3396    if selected {
3397        if !first_output {
3398            out.write_all(output_delim)?;
3399        }
3400        out.write_all(&line[field_start..])?;
3401    }
3402
3403    Ok(true)
3404}
3405
3406/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3407#[inline]
3408pub fn cut_bytes(
3409    line: &[u8],
3410    ranges: &[Range],
3411    complement: bool,
3412    output_delim: &[u8],
3413    out: &mut impl Write,
3414) -> io::Result<bool> {
3415    let mut first_range = true;
3416
3417    if complement {
3418        let len = line.len();
3419        let mut comp_ranges = Vec::new();
3420        let mut pos: usize = 1;
3421        for r in ranges {
3422            let rs = r.start;
3423            let re = r.end.min(len);
3424            if pos < rs {
3425                comp_ranges.push((pos, rs - 1));
3426            }
3427            pos = re + 1;
3428            if pos > len {
3429                break;
3430            }
3431        }
3432        if pos <= len {
3433            comp_ranges.push((pos, len));
3434        }
3435        for &(s, e) in &comp_ranges {
3436            if !first_range && !output_delim.is_empty() {
3437                out.write_all(output_delim)?;
3438            }
3439            out.write_all(&line[s - 1..e])?;
3440            first_range = false;
3441        }
3442    } else {
3443        for r in ranges {
3444            let start = r.start.saturating_sub(1);
3445            let end = r.end.min(line.len());
3446            if start >= line.len() {
3447                break;
3448            }
3449            if !first_range && !output_delim.is_empty() {
3450                out.write_all(output_delim)?;
3451            }
3452            out.write_all(&line[start..end])?;
3453            first_range = false;
3454        }
3455    }
3456    Ok(true)
3457}
3458
3459/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
3460/// Output is always <= input (we remove everything after first delimiter per line).
3461/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
3462/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
3463///
3464/// Lines without delimiter pass through unchanged (unless suppress=true).
3465/// Lines with delimiter: keep bytes before delimiter + newline.
3466pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
3467    let len = data.len();
3468    let mut wp: usize = 0;
3469    let mut rp: usize = 0;
3470
3471    while rp < len {
3472        match memchr::memchr2(delim, line_delim, &data[rp..]) {
3473            None => {
3474                // Rest is partial line, no delimiter
3475                if suppress {
3476                    // suppress: skip lines without delimiter
3477                    break;
3478                }
3479                let remaining = len - rp;
3480                if wp != rp {
3481                    data.copy_within(rp..len, wp);
3482                }
3483                wp += remaining;
3484                break;
3485            }
3486            Some(offset) => {
3487                let actual = rp + offset;
3488                if data[actual] == line_delim {
3489                    // No delimiter on this line
3490                    if suppress {
3491                        // Skip this line entirely
3492                        rp = actual + 1;
3493                    } else {
3494                        // Output entire line including newline
3495                        let chunk_len = actual + 1 - rp;
3496                        if wp != rp {
3497                            data.copy_within(rp..actual + 1, wp);
3498                        }
3499                        wp += chunk_len;
3500                        rp = actual + 1;
3501                    }
3502                } else {
3503                    // Delimiter found: output field 1 (up to delimiter) + newline
3504                    let field_len = actual - rp;
3505                    if wp != rp && field_len > 0 {
3506                        data.copy_within(rp..actual, wp);
3507                    }
3508                    wp += field_len;
3509                    data[wp] = line_delim;
3510                    wp += 1;
3511                    // Skip to next newline
3512                    match memchr::memchr(line_delim, &data[actual + 1..]) {
3513                        None => {
3514                            rp = len;
3515                        }
3516                        Some(nl_off) => {
3517                            rp = actual + 1 + nl_off + 1;
3518                        }
3519                    }
3520                }
3521            }
3522        }
3523    }
3524    wp
3525}
3526
3527/// Process a full data buffer (from mmap or read) with cut operation.
3528pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3529    match cfg.mode {
3530        CutMode::Fields => process_fields_fast(data, cfg, out),
3531        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3532    }
3533}
3534
3535/// Process input from a reader (for stdin).
3536/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3537/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3538/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3539pub fn process_cut_reader<R: BufRead>(
3540    mut reader: R,
3541    cfg: &CutConfig,
3542    out: &mut impl Write,
3543) -> io::Result<()> {
3544    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3545    let line_delim = cfg.line_delim;
3546
3547    // Read large chunks and process in batch.
3548    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3549    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3550
3551    loop {
3552        // Read up to CHUNK_SIZE bytes
3553        buf.reserve(CHUNK_SIZE);
3554        let read_start = buf.len();
3555        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3556        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3557        buf.truncate(read_start + n);
3558
3559        if buf.is_empty() {
3560            break;
3561        }
3562
3563        if n == 0 {
3564            // EOF with leftover data (last line without terminator)
3565            process_cut_data(&buf, cfg, out)?;
3566            break;
3567        }
3568
3569        // Find the last line delimiter in the buffer so we process complete lines
3570        let process_end = match memchr::memrchr(line_delim, &buf) {
3571            Some(pos) => pos + 1,
3572            None => {
3573                // No line delimiter found — keep accumulating
3574                continue;
3575            }
3576        };
3577
3578        // Process the complete lines using the fast batch path
3579        process_cut_data(&buf[..process_end], cfg, out)?;
3580
3581        // Shift leftover to the front for next iteration
3582        let leftover_len = buf.len() - process_end;
3583        if leftover_len > 0 {
3584            buf.copy_within(process_end.., 0);
3585        }
3586        buf.truncate(leftover_len);
3587    }
3588
3589    Ok(())
3590}
3591
/// Reads into `buf` until it is full or the reader reports end of input.
///
/// Returns the number of bytes read. A value smaller than `buf.len()` means end of
/// input was reached; `0` means nothing more was available (or `buf` is empty).
/// Short reads are retried, and `ErrorKind::Interrupted` is retried on every call,
/// including the first one.
///
/// # Errors
///
/// Returns the first I/O error other than `ErrorKind::Interrupted`.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3611
3612/// In-place cut processing for mutable data buffers.
3613/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3614/// for the given configuration (caller should fall back to regular processing).
3615///
3616/// In-place avoids allocating intermediate output buffers — the result is written
3617/// directly into the input buffer (output is always <= input for non-complement modes
3618/// with default output delimiter).
3619///
3620/// Note: if the input does not end with line_delim, we fall back to the regular
3621/// path because GNU cut always adds a trailing line delimiter, and the in-place
3622/// buffer cannot grow beyond the input size.
3623pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3624    if cfg.complement {
3625        return None;
3626    }
3627    // If input doesn't end with line_delim, the output may need an extra byte
3628    // (GNU cut always terminates the last line). In-place can't grow the buffer,
3629    // so fall back to the regular allocating path.
3630    if data.is_empty() || data[data.len() - 1] != cfg.line_delim {
3631        return None;
3632    }
3633
3634    match cfg.mode {
3635        CutMode::Fields => {
3636            // Only handle when output delimiter matches input (single-byte)
3637            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3638                return None;
3639            }
3640            if cfg.delim == cfg.line_delim {
3641                return None;
3642            }
3643            Some(cut_fields_inplace_general(
3644                data,
3645                cfg.delim,
3646                cfg.line_delim,
3647                cfg.ranges,
3648                cfg.suppress_no_delim,
3649            ))
3650        }
3651        CutMode::Bytes | CutMode::Characters => {
3652            if !cfg.output_delim.is_empty() {
3653                return None;
3654            }
3655            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3656        }
3657    }
3658}
3659
3660/// In-place generalized field extraction.
3661/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
3662fn cut_fields_inplace_general(
3663    data: &mut [u8],
3664    delim: u8,
3665    line_delim: u8,
3666    ranges: &[Range],
3667    suppress: bool,
3668) -> usize {
3669    // Special case: field 1 only (existing optimized path)
3670    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
3671        return cut_field1_inplace(data, delim, line_delim, suppress);
3672    }
3673
3674    let len = data.len();
3675    if len == 0 {
3676        return 0;
3677    }
3678
3679    let max_field = ranges.last().map_or(0, |r| r.end);
3680    let max_delims = max_field.min(64);
3681    let mut wp: usize = 0;
3682    let mut rp: usize = 0;
3683
3684    while rp < len {
3685        let line_end = memchr::memchr(line_delim, &data[rp..])
3686            .map(|p| rp + p)
3687            .unwrap_or(len);
3688        let line_len = line_end - rp;
3689
3690        // Collect delimiter positions (relative to line start)
3691        let mut delim_pos = [0usize; 64];
3692        let mut num_delims: usize = 0;
3693
3694        for pos in memchr_iter(delim, &data[rp..line_end]) {
3695            if num_delims < max_delims {
3696                delim_pos[num_delims] = pos;
3697                num_delims += 1;
3698                if num_delims >= max_delims {
3699                    break;
3700                }
3701            }
3702        }
3703
3704        if num_delims == 0 {
3705            // No delimiter in line
3706            if !suppress {
3707                if wp != rp {
3708                    data.copy_within(rp..line_end, wp);
3709                }
3710                wp += line_len;
3711                if line_end < len {
3712                    data[wp] = line_delim;
3713                    wp += 1;
3714                }
3715            }
3716        } else {
3717            let total_fields = num_delims + 1;
3718            let mut first_output = true;
3719
3720            for r in ranges {
3721                let range_start = r.start;
3722                let range_end = r.end.min(total_fields);
3723                if range_start > total_fields {
3724                    break;
3725                }
3726                for field_num in range_start..=range_end {
3727                    if field_num > total_fields {
3728                        break;
3729                    }
3730
3731                    let field_start = if field_num == 1 {
3732                        0
3733                    } else if field_num - 2 < num_delims {
3734                        delim_pos[field_num - 2] + 1
3735                    } else {
3736                        continue;
3737                    };
3738                    let field_end = if field_num <= num_delims {
3739                        delim_pos[field_num - 1]
3740                    } else {
3741                        line_len
3742                    };
3743
3744                    if !first_output {
3745                        data[wp] = delim;
3746                        wp += 1;
3747                    }
3748                    let flen = field_end - field_start;
3749                    if flen > 0 {
3750                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
3751                        wp += flen;
3752                    }
3753                    first_output = false;
3754                }
3755            }
3756
3757            if !first_output && line_end < len {
3758                data[wp] = line_delim;
3759                wp += 1;
3760            } else if first_output && line_end < len {
3761                // No fields selected but line had delimiters — output empty line
3762                data[wp] = line_delim;
3763                wp += 1;
3764            }
3765        }
3766
3767        rp = if line_end < len { line_end + 1 } else { len };
3768    }
3769
3770    wp
3771}
3772
3773/// In-place byte/char range extraction.
3774fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
3775    let len = data.len();
3776    if len == 0 {
3777        return 0;
3778    }
3779
3780    // Quick check: single range from byte 1 to end = no-op
3781    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
3782        return len;
3783    }
3784
3785    // Single range from byte 1: fast truncation path
3786    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
3787        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
3788    }
3789
3790    let mut wp: usize = 0;
3791    let mut rp: usize = 0;
3792
3793    while rp < len {
3794        let line_end = memchr::memchr(line_delim, &data[rp..])
3795            .map(|p| rp + p)
3796            .unwrap_or(len);
3797        let line_len = line_end - rp;
3798
3799        for r in ranges {
3800            let start = r.start.saturating_sub(1);
3801            let end = r.end.min(line_len);
3802            if start >= line_len {
3803                break;
3804            }
3805            let flen = end - start;
3806            if flen > 0 {
3807                data.copy_within(rp + start..rp + start + flen, wp);
3808                wp += flen;
3809            }
3810        }
3811
3812        if line_end < len {
3813            data[wp] = line_delim;
3814            wp += 1;
3815        }
3816
3817        rp = if line_end < len { line_end + 1 } else { len };
3818    }
3819
3820    wp
3821}
3822
3823/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3824fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3825    let len = data.len();
3826
3827    // Quick check: see if all lines fit within max_bytes (common case)
3828    let mut all_fit = true;
3829    let mut start = 0;
3830    for pos in memchr_iter(line_delim, data) {
3831        if pos - start > max_bytes {
3832            all_fit = false;
3833            break;
3834        }
3835        start = pos + 1;
3836    }
3837    if all_fit && start < len && len - start > max_bytes {
3838        all_fit = false;
3839    }
3840    if all_fit {
3841        return len;
3842    }
3843
3844    // Some lines need truncation
3845    let mut wp: usize = 0;
3846    let mut rp: usize = 0;
3847
3848    while rp < len {
3849        let line_end = memchr::memchr(line_delim, &data[rp..])
3850            .map(|p| rp + p)
3851            .unwrap_or(len);
3852        let line_len = line_end - rp;
3853
3854        let take = line_len.min(max_bytes);
3855        if take > 0 && wp != rp {
3856            data.copy_within(rp..rp + take, wp);
3857        }
3858        wp += take;
3859
3860        if line_end < len {
3861            data[wp] = line_delim;
3862            wp += 1;
3863        }
3864
3865        rp = if line_end < len { line_end + 1 } else { len };
3866    }
3867
3868    wp
3869}
3870
/// Cut operation mode: the unit that the selected ranges refer to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Ranges select byte positions within each line.
    Bytes,
    /// Ranges select character positions; in this file it is dispatched to the same
    /// byte-oriented code paths as [`CutMode::Bytes`].
    Characters,
    /// Ranges select delimiter-separated fields.
    Fields,
}