Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use std::io::{self, BufRead, IoSlice, Write};
3
/// Input size, in bytes, at which processing switches to the parallel path
/// (16 MiB). Inputs of at least this size are split into chunks and handed to
/// rayon; smaller inputs are processed on the calling thread.
const PARALLEL_THRESHOLD: usize = 16 << 20;

/// Cap on the number of `IoSlice` entries passed to one `write_vectored` call
/// (1024, the iovec limit on Linux).
const MAX_IOV: usize = 1 << 10;
11
/// Configuration for cut operations.
///
/// Borrowed view over the parsed command-line options; the field-processing
/// functions in this file read it but never modify it.
pub struct CutConfig<'a> {
    /// Selection mode.
    /// NOTE(review): `CutMode` is declared outside this view — presumably
    /// bytes / characters / fields; confirm against its definition.
    pub mode: CutMode,
    /// Selected ranges. The field fast paths rely on them being sorted and
    /// merged, which is how `parse_ranges` returns them.
    pub ranges: &'a [Range],
    /// When true the selection is inverted: positions NOT covered by `ranges`
    /// are output.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between consecutive output fields.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no `delim` are dropped instead of being
    /// passed through unchanged.
    pub suppress_no_delim: bool,
    /// Byte that terminates input lines and is written after each output line.
    pub line_delim: u8,
}
22
/// A range specification like 1, 3-5, -3, 4-
///
/// Both bounds are 1-based and inclusive.
#[derive(Debug, Clone)]
pub struct Range {
    pub start: usize, // 1-based; `parse_ranges` stores 1 for an open start such as "-3" and rejects 0
    pub end: usize,   // 1-based, usize::MAX means "to end"
}
29
30/// Parse a LIST specification like "1,3-5,7-" into ranges.
31/// Each range is 1-based. Returns sorted, merged ranges.
32pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
33    let mut ranges = Vec::new();
34
35    for part in spec.split(',') {
36        let part = part.trim();
37        if part.is_empty() {
38            continue;
39        }
40
41        if let Some(idx) = part.find('-') {
42            let left = &part[..idx];
43            let right = &part[idx + 1..];
44
45            let start = if left.is_empty() {
46                1
47            } else {
48                left.parse::<usize>()
49                    .map_err(|_| format!("invalid range: '{}'", part))?
50            };
51
52            let end = if right.is_empty() {
53                usize::MAX
54            } else {
55                right
56                    .parse::<usize>()
57                    .map_err(|_| format!("invalid range: '{}'", part))?
58            };
59
60            if start == 0 {
61                return Err("fields and positions are numbered from 1".to_string());
62            }
63            if start > end {
64                return Err(format!("invalid decreasing range: '{}'", part));
65            }
66
67            ranges.push(Range { start, end });
68        } else {
69            let n = part
70                .parse::<usize>()
71                .map_err(|_| format!("invalid field: '{}'", part))?;
72            if n == 0 {
73                return Err("fields and positions are numbered from 1".to_string());
74            }
75            ranges.push(Range { start: n, end: n });
76        }
77    }
78
79    if ranges.is_empty() {
80        return Err("you must specify a list of bytes, characters, or fields".to_string());
81    }
82
83    // Sort and merge overlapping ranges
84    ranges.sort_by_key(|r| (r.start, r.end));
85    let mut merged = vec![ranges[0].clone()];
86    for r in &ranges[1..] {
87        let last = merged.last_mut().unwrap();
88        if r.start <= last.end.saturating_add(1) {
89            last.end = last.end.max(r.end);
90        } else {
91            merged.push(r.clone());
92        }
93    }
94
95    Ok(merged)
96}
97
98/// Check if a 1-based position is in any range.
99/// Ranges must be sorted. Uses early exit since ranges are sorted.
100#[inline(always)]
101fn in_ranges(ranges: &[Range], pos: usize) -> bool {
102    for r in ranges {
103        if pos < r.start {
104            return false;
105        }
106        if pos <= r.end {
107            return true;
108        }
109    }
110    false
111}
112
113/// Pre-compute a 64-bit mask for field selection.
114/// Bit i-1 is set if field i should be output.
115#[inline]
116fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
117    let mut mask: u64 = 0;
118    for i in 1..=64u32 {
119        let in_range = in_ranges(ranges, i as usize);
120        if in_range != complement {
121            mask |= 1u64 << (i - 1);
122        }
123    }
124    mask
125}
126
127/// Check if a field should be selected, using bitset for first 64 fields.
128#[inline(always)]
129fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
130    if field_num <= 64 {
131        (mask >> (field_num - 1)) & 1 == 1
132    } else {
133        in_ranges(ranges, field_num) != complement
134    }
135}
136
137// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
138
/// Copy `data` onto the end of `buf` without checking or growing capacity.
///
/// # Safety
///
/// `buf` must already have at least `data.len()` bytes of spare capacity.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    let old_len = buf.len();
    let count = data.len();
    // SAFETY: the caller guarantees `count` spare bytes past `old_len`, and
    // `data` cannot alias that region because `buf` is borrowed exclusively.
    unsafe {
        let dst = buf.as_mut_ptr().add(old_len);
        dst.copy_from_nonoverlapping(data.as_ptr(), count);
        buf.set_len(old_len + count);
    }
}
149
/// Write one byte at the end of `buf` without checking or growing capacity.
///
/// # Safety
///
/// `buf` must already have at least one byte of spare capacity.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    let old_len = buf.len();
    // SAFETY: the caller guarantees one spare byte at index `old_len`.
    unsafe {
        buf.as_mut_ptr().add(old_len).write(b);
        buf.set_len(old_len + 1);
    }
}
160
/// Copy `data` followed by the single byte `b` onto the end of `buf`, without
/// checking or growing capacity.
///
/// Fusing the two appends means the length is read and stored once instead of
/// twice (field content plus its terminator in one call).
///
/// # Safety
///
/// `buf` must already have at least `data.len() + 1` bytes of spare capacity.
#[inline(always)]
unsafe fn buf_extend_byte(buf: &mut Vec<u8>, data: &[u8], b: u8) {
    let old_len = buf.len();
    let count = data.len();
    // SAFETY: the caller guarantees `count + 1` spare bytes past `old_len`,
    // and `data` cannot alias them because `buf` is borrowed exclusively.
    unsafe {
        let dst = buf.as_mut_ptr().add(old_len);
        dst.copy_from_nonoverlapping(data.as_ptr(), count);
        dst.add(count).write(b);
        buf.set_len(old_len + count + 1);
    }
}
175
176/// Write multiple IoSlice buffers using write_vectored (writev syscall).
177/// Batches into MAX_IOV-sized groups. Hot path: single write_vectored succeeds.
178/// Cold path (partial write) is out-of-line to keep the hot loop tight.
179#[inline]
180fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
181    if slices.is_empty() {
182        return Ok(());
183    }
184    for batch in slices.chunks(MAX_IOV) {
185        let total: usize = batch.iter().map(|s| s.len()).sum();
186        let written = out.write_vectored(batch)?;
187        if written >= total {
188            continue;
189        }
190        if written == 0 {
191            return Err(io::Error::new(io::ErrorKind::WriteZero, "write zero"));
192        }
193        write_ioslices_slow(out, batch, written)?;
194    }
195    Ok(())
196}
197
/// Finish a batch after a short `write_vectored` (cold path, never inlined).
///
/// `skip` is the number of bytes of `slices` that were already written.
/// Slices lying entirely inside that prefix are passed over; the first
/// partially written slice is resumed at the right offset, and every later
/// slice is written in full with `write_all`.
#[cold]
#[inline(never)]
fn write_ioslices_slow(
    out: &mut impl Write,
    slices: &[IoSlice],
    mut skip: usize,
) -> io::Result<()> {
    for slice in slices {
        let bytes: &[u8] = slice;
        match bytes.get(skip..) {
            Some(rest) if !rest.is_empty() => {
                out.write_all(rest)?;
                skip = 0;
            }
            // The whole slice is covered by what was already written.
            _ => skip -= bytes.len(),
        }
    }
    Ok(())
}
217
218// ── Chunk splitting for parallel processing ──────────────────────────────
219
/// Number of CPUs to plan parallel chunks for.
///
/// Asks `std::thread::available_parallelism()` rather than rayon, so that
/// merely computing a chunk count does not initialize the rayon thread pool.
/// Falls back to 1 when the parallelism cannot be determined.
#[inline]
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}
229
230/// Split data into chunks for rayon::scope parallel processing.
231/// Uses Rayon's thread count to match the number of worker threads.
232fn split_for_scope<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
233    let num_threads = num_cpus().max(1);
234    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
235        return vec![data];
236    }
237
238    let chunk_size = data.len() / num_threads;
239    let mut chunks = Vec::with_capacity(num_threads);
240    let mut pos = 0;
241
242    for _ in 0..num_threads - 1 {
243        let target = pos + chunk_size;
244        if target >= data.len() {
245            break;
246        }
247        let boundary = memchr::memchr(line_delim, &data[target..])
248            .map(|p| target + p + 1)
249            .unwrap_or(data.len());
250        if boundary > pos {
251            chunks.push(&data[pos..boundary]);
252        }
253        pos = boundary;
254    }
255
256    if pos < data.len() {
257        chunks.push(&data[pos..]);
258    }
259
260    chunks
261}
262
263// ── Fast path: multi-field non-contiguous extraction ─────────────────────
264
265/// Multi-field non-contiguous extraction (e.g., `cut -d, -f1,3,5`).
266/// Pre-collects delimiter positions per line into a stack-allocated array,
267/// then directly indexes into them for each selected field.
268/// This is O(max_field) per line instead of O(num_fields * scan_length).
269fn process_fields_multi_select(
270    data: &[u8],
271    delim: u8,
272    line_delim: u8,
273    ranges: &[Range],
274    suppress: bool,
275    out: &mut impl Write,
276) -> io::Result<()> {
277    let max_field = ranges.last().map_or(0, |r| r.end);
278
279    if data.len() >= PARALLEL_THRESHOLD {
280        let chunks = split_for_scope(data, line_delim);
281        let n = chunks.len();
282        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
283        rayon::scope(|s| {
284            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
285                s.spawn(move |_| {
286                    result.reserve(chunk.len() * 3 / 4);
287                    multi_select_chunk(
288                        chunk, delim, line_delim, ranges, max_field, suppress, result,
289                    );
290                });
291            }
292        });
293        let slices: Vec<IoSlice> = results
294            .iter()
295            .filter(|r| !r.is_empty())
296            .map(|r| IoSlice::new(r))
297            .collect();
298        write_ioslices(out, &slices)?;
299    } else {
300        let mut buf = Vec::with_capacity(data.len() * 3 / 4);
301        multi_select_chunk(
302            data, delim, line_delim, ranges, max_field, suppress, &mut buf,
303        );
304        if !buf.is_empty() {
305            out.write_all(&buf)?;
306        }
307    }
308    Ok(())
309}
310
311/// Process a chunk for multi-field extraction using a single-pass memchr2 scan.
312/// Scans for both delimiter and line_delim in one SIMD pass over the entire chunk,
313/// eliminating per-line memchr_iter setup overhead (significant for short lines).
314/// Delimiter positions are collected in a stack array per line.
315/// When max_field is reached on a line, remaining delimiters are ignored.
316fn multi_select_chunk(
317    data: &[u8],
318    delim: u8,
319    line_delim: u8,
320    ranges: &[Range],
321    max_field: usize,
322    suppress: bool,
323    buf: &mut Vec<u8>,
324) {
325    // When delim == line_delim, fall back to two-level approach
326    if delim == line_delim {
327        buf.reserve(data.len());
328        let base = data.as_ptr();
329        let mut start = 0;
330        for end_pos in memchr_iter(line_delim, data) {
331            let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
332            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
333            start = end_pos + 1;
334        }
335        if start < data.len() {
336            let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
337            multi_select_line(line, delim, line_delim, ranges, max_field, suppress, buf);
338        }
339        return;
340    }
341
342    buf.reserve(data.len());
343    let base = data.as_ptr();
344    let data_len = data.len();
345
346    // Per-line state
347    let mut line_start: usize = 0;
348    let mut delim_pos = [0usize; 64];
349    let mut num_delims: usize = 0;
350    let max_delims = max_field.min(64);
351    let mut at_max = false;
352
353    // Single-pass scan using memchr2 for both delimiter and newline
354    for pos in memchr::memchr2_iter(delim, line_delim, data) {
355        let byte = unsafe { *base.add(pos) };
356
357        if byte == line_delim {
358            // End of line: extract fields from collected positions
359            let line_len = pos - line_start;
360            if num_delims == 0 {
361                // No delimiter in line
362                if !suppress {
363                    unsafe {
364                        buf_extend(
365                            buf,
366                            std::slice::from_raw_parts(base.add(line_start), line_len),
367                        );
368                        buf_push(buf, line_delim);
369                    }
370                }
371            } else {
372                // Extract fields using collected delimiter positions
373                let total_fields = num_delims + 1;
374                let mut first_output = true;
375
376                for r in ranges {
377                    let range_start = r.start;
378                    let range_end = r.end.min(total_fields);
379                    if range_start > total_fields {
380                        break;
381                    }
382                    for field_num in range_start..=range_end {
383                        if field_num > total_fields {
384                            break;
385                        }
386
387                        let field_start = if field_num == 1 {
388                            line_start
389                        } else if field_num - 2 < num_delims {
390                            delim_pos[field_num - 2] + 1
391                        } else {
392                            continue;
393                        };
394                        let field_end = if field_num <= num_delims {
395                            delim_pos[field_num - 1]
396                        } else {
397                            pos
398                        };
399
400                        if !first_output {
401                            unsafe { buf_push(buf, delim) };
402                        }
403                        unsafe {
404                            buf_extend(
405                                buf,
406                                std::slice::from_raw_parts(
407                                    base.add(field_start),
408                                    field_end - field_start,
409                                ),
410                            );
411                        }
412                        first_output = false;
413                    }
414                }
415
416                unsafe { buf_push(buf, line_delim) };
417            }
418
419            // Reset for next line
420            line_start = pos + 1;
421            num_delims = 0;
422            at_max = false;
423        } else {
424            // Delimiter found: collect position (up to max_field)
425            if !at_max && num_delims < max_delims {
426                delim_pos[num_delims] = pos;
427                num_delims += 1;
428                if num_delims >= max_delims {
429                    at_max = true;
430                }
431            }
432        }
433    }
434
435    // Handle last line without trailing line_delim
436    if line_start < data_len {
437        if num_delims == 0 {
438            if !suppress {
439                unsafe {
440                    buf_extend(
441                        buf,
442                        std::slice::from_raw_parts(base.add(line_start), data_len - line_start),
443                    );
444                    buf_push(buf, line_delim);
445                }
446            }
447        } else {
448            let total_fields = num_delims + 1;
449            let mut first_output = true;
450
451            for r in ranges {
452                let range_start = r.start;
453                let range_end = r.end.min(total_fields);
454                if range_start > total_fields {
455                    break;
456                }
457                for field_num in range_start..=range_end {
458                    if field_num > total_fields {
459                        break;
460                    }
461
462                    let field_start = if field_num == 1 {
463                        line_start
464                    } else if field_num - 2 < num_delims {
465                        delim_pos[field_num - 2] + 1
466                    } else {
467                        continue;
468                    };
469                    let field_end = if field_num <= num_delims {
470                        delim_pos[field_num - 1]
471                    } else {
472                        data_len
473                    };
474
475                    if !first_output {
476                        unsafe { buf_push(buf, delim) };
477                    }
478                    unsafe {
479                        buf_extend(
480                            buf,
481                            std::slice::from_raw_parts(
482                                base.add(field_start),
483                                field_end - field_start,
484                            ),
485                        );
486                    }
487                    first_output = false;
488                }
489            }
490
491            unsafe { buf_push(buf, line_delim) };
492        }
493    }
494}
495
496/// Extract selected fields from a single line using delimiter position scanning.
497/// Scans delimiters only up to max_field (early exit), then extracts selected fields
498/// by indexing directly into the collected positions. Since ranges are pre-sorted and
499/// non-overlapping, every field within a range is selected — no is_selected check needed.
500#[inline(always)]
501fn multi_select_line(
502    line: &[u8],
503    delim: u8,
504    line_delim: u8,
505    ranges: &[Range],
506    max_field: usize,
507    suppress: bool,
508    buf: &mut Vec<u8>,
509) {
510    let len = line.len();
511    if len == 0 {
512        if !suppress {
513            unsafe { buf_push(buf, line_delim) };
514        }
515        return;
516    }
517
518    // Note: no per-line buf.reserve — multi_select_chunk already reserves data.len()
519    let base = line.as_ptr();
520
521    // Collect delimiter positions up to max_field (early exit).
522    // Stack array for up to 64 delimiter positions.
523    let mut delim_pos = [0usize; 64];
524    let mut num_delims: usize = 0;
525    let max_delims = max_field.min(64);
526
527    for pos in memchr_iter(delim, line) {
528        if num_delims < max_delims {
529            delim_pos[num_delims] = pos;
530            num_delims += 1;
531            if num_delims >= max_delims {
532                break;
533            }
534        }
535    }
536
537    if num_delims == 0 {
538        if !suppress {
539            unsafe {
540                buf_extend(buf, line);
541                buf_push(buf, line_delim);
542            }
543        }
544        return;
545    }
546
547    // Extract selected fields using delimiter positions.
548    // Ranges are pre-sorted and non-overlapping, so every field_num within a range
549    // is selected — skip the is_selected check entirely (saves 1 function call per field).
550    let total_fields = num_delims + 1;
551    let mut first_output = true;
552
553    for r in ranges {
554        let range_start = r.start;
555        let range_end = r.end.min(total_fields);
556        if range_start > total_fields {
557            break;
558        }
559        for field_num in range_start..=range_end {
560            if field_num > total_fields {
561                break;
562            }
563
564            let field_start = if field_num == 1 {
565                0
566            } else if field_num - 2 < num_delims {
567                delim_pos[field_num - 2] + 1
568            } else {
569                continue;
570            };
571            let field_end = if field_num <= num_delims {
572                delim_pos[field_num - 1]
573            } else {
574                len
575            };
576
577            if !first_output {
578                unsafe { buf_push(buf, delim) };
579            }
580            unsafe {
581                buf_extend(
582                    buf,
583                    std::slice::from_raw_parts(base.add(field_start), field_end - field_start),
584                );
585            }
586            first_output = false;
587        }
588    }
589
590    unsafe { buf_push(buf, line_delim) };
591}
592
593// ── Fast path: field extraction with batched output ──────────────────────
594
595/// Optimized field extraction with early exit and batched output.
596fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
597    let delim = cfg.delim;
598    let line_delim = cfg.line_delim;
599    let ranges = cfg.ranges;
600    let complement = cfg.complement;
601    let output_delim = cfg.output_delim;
602    let suppress = cfg.suppress_no_delim;
603
604    // NOTE: Removed the full-file `memchr(delim, data).is_none()` scan.
605    // That scan was O(N) over the entire file just to check an edge case
606    // (no delimiter in any line). The per-line processing already handles
607    // lines without delimiters correctly, so the scan was pure overhead
608    // for files that DO contain delimiters (the common case).
609
610    // Ultra-fast path: single field extraction (e.g., cut -f5)
611    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
612        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
613    }
614
615    // Fast path: complement of single field or contiguous range with default output delimiter.
616    if complement
617        && ranges.len() == 1
618        && output_delim.len() == 1
619        && output_delim[0] == delim
620        && ranges[0].start == ranges[0].end
621    {
622        return process_complement_single_field(
623            data,
624            delim,
625            line_delim,
626            ranges[0].start,
627            suppress,
628            out,
629        );
630    }
631
632    // Fast path: complement of contiguous range (e.g., --complement -f3-5 = output fields 1,2,6+).
633    // This is equivalent to outputting a prefix and a suffix, skipping the middle range.
634    if complement
635        && ranges.len() == 1
636        && ranges[0].start > 1
637        && ranges[0].end < usize::MAX
638        && output_delim.len() == 1
639        && output_delim[0] == delim
640    {
641        return process_complement_range(
642            data,
643            delim,
644            line_delim,
645            ranges[0].start,
646            ranges[0].end,
647            suppress,
648            out,
649        );
650    }
651
652    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
653    if !complement
654        && ranges.len() == 1
655        && ranges[0].start == 1
656        && output_delim.len() == 1
657        && output_delim[0] == delim
658        && ranges[0].end < usize::MAX
659    {
660        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
661    }
662
663    // Fast path: open-ended field range from field N (e.g., cut -f3-)
664    if !complement
665        && ranges.len() == 1
666        && ranges[0].end == usize::MAX
667        && ranges[0].start > 1
668        && output_delim.len() == 1
669        && output_delim[0] == delim
670    {
671        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
672    }
673
674    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
675    if !complement
676        && ranges.len() == 1
677        && ranges[0].start > 1
678        && ranges[0].end < usize::MAX
679        && output_delim.len() == 1
680        && output_delim[0] == delim
681    {
682        return process_fields_mid_range(
683            data,
684            delim,
685            line_delim,
686            ranges[0].start,
687            ranges[0].end,
688            suppress,
689            out,
690        );
691    }
692
693    // Fast path: multi-field non-contiguous extraction (e.g., cut -f1,3,5)
694    // Uses delimiter position caching: find all delimiter positions per line,
695    // then directly index into them for each selected field.
696    // This is faster than the general extract_fields_to_buf which re-checks
697    // is_selected() for every field encountered.
698    if !complement
699        && ranges.len() > 1
700        && ranges.last().map_or(false, |r| r.end < usize::MAX)
701        && output_delim.len() == 1
702        && output_delim[0] == delim
703        && delim != line_delim
704    {
705        return process_fields_multi_select(data, delim, line_delim, ranges, suppress, out);
706    }
707
708    // General field extraction
709    let max_field = if complement {
710        usize::MAX
711    } else {
712        ranges.last().map(|r| r.end).unwrap_or(0)
713    };
714    let field_mask = compute_field_mask(ranges, complement);
715
716    if data.len() >= PARALLEL_THRESHOLD {
717        let chunks = split_for_scope(data, line_delim);
718        let n = chunks.len();
719        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
720        rayon::scope(|s| {
721            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
722                s.spawn(move |_| {
723                    result.reserve(chunk.len());
724                    process_fields_chunk(
725                        chunk,
726                        delim,
727                        ranges,
728                        output_delim,
729                        suppress,
730                        max_field,
731                        field_mask,
732                        line_delim,
733                        complement,
734                        result,
735                    );
736                });
737            }
738        });
739        let slices: Vec<IoSlice> = results
740            .iter()
741            .filter(|r| !r.is_empty())
742            .map(|r| IoSlice::new(r))
743            .collect();
744        write_ioslices(out, &slices)?;
745    } else {
746        let mut buf = Vec::with_capacity(data.len());
747        process_fields_chunk(
748            data,
749            delim,
750            ranges,
751            output_delim,
752            suppress,
753            max_field,
754            field_mask,
755            line_delim,
756            complement,
757            &mut buf,
758        );
759        if !buf.is_empty() {
760            out.write_all(&buf)?;
761        }
762    }
763    Ok(())
764}
765
766/// Process a chunk of data for general field extraction.
767/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
768/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
769/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
770fn process_fields_chunk(
771    data: &[u8],
772    delim: u8,
773    ranges: &[Range],
774    output_delim: &[u8],
775    suppress: bool,
776    max_field: usize,
777    field_mask: u64,
778    line_delim: u8,
779    complement: bool,
780    buf: &mut Vec<u8>,
781) {
782    // When delim != line_delim and max_field is bounded, use two-level approach:
783    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
784    // This avoids scanning past max_field on each line (significant for lines with
785    // many columns but small field selection like -f1,3,5 on 20-column CSV).
786    // For complement or unbounded ranges, use single-pass memchr2_iter which
787    // needs to process all delimiters anyway.
788    if delim != line_delim && max_field < usize::MAX && !complement {
789        buf.reserve(data.len());
790        let mut start = 0;
791        for end_pos in memchr_iter(line_delim, data) {
792            let line = &data[start..end_pos];
793            extract_fields_to_buf(
794                line,
795                delim,
796                ranges,
797                output_delim,
798                suppress,
799                max_field,
800                field_mask,
801                line_delim,
802                buf,
803                complement,
804            );
805            start = end_pos + 1;
806        }
807        if start < data.len() {
808            extract_fields_to_buf(
809                &data[start..],
810                delim,
811                ranges,
812                output_delim,
813                suppress,
814                max_field,
815                field_mask,
816                line_delim,
817                buf,
818                complement,
819            );
820        }
821        return;
822    }
823
824    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
825    // delimiter and line_delim in one SIMD scan.
826    // Uses raw pointer arithmetic to eliminate bounds checking in the hot loop.
827    if delim != line_delim {
828        buf.reserve(data.len());
829
830        let data_len = data.len();
831        let base = data.as_ptr();
832        let mut line_start: usize = 0;
833        let mut field_start: usize = 0;
834        let mut field_num: usize = 1;
835        let mut first_output = true;
836        let mut has_delim = false;
837
838        for pos in memchr::memchr2_iter(delim, line_delim, data) {
839            let byte = unsafe { *base.add(pos) };
840
841            if byte == line_delim {
842                // End of line: flush final field and emit line delimiter
843                if (field_num <= max_field || complement)
844                    && has_delim
845                    && is_selected(field_num, field_mask, ranges, complement)
846                {
847                    if !first_output {
848                        unsafe { buf_extend(buf, output_delim) };
849                    }
850                    unsafe {
851                        buf_extend(
852                            buf,
853                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
854                        )
855                    };
856                    first_output = false;
857                }
858
859                if !first_output {
860                    unsafe { buf_push(buf, line_delim) };
861                } else if !has_delim {
862                    if !suppress {
863                        unsafe {
864                            buf_extend(
865                                buf,
866                                std::slice::from_raw_parts(base.add(line_start), pos - line_start),
867                            );
868                            buf_push(buf, line_delim);
869                        }
870                    }
871                } else {
872                    unsafe { buf_push(buf, line_delim) };
873                }
874
875                // Reset state for next line
876                line_start = pos + 1;
877                field_start = pos + 1;
878                field_num = 1;
879                first_output = true;
880                has_delim = false;
881            } else {
882                // Field delimiter hit
883                has_delim = true;
884
885                if is_selected(field_num, field_mask, ranges, complement) {
886                    if !first_output {
887                        unsafe { buf_extend(buf, output_delim) };
888                    }
889                    unsafe {
890                        buf_extend(
891                            buf,
892                            std::slice::from_raw_parts(base.add(field_start), pos - field_start),
893                        )
894                    };
895                    first_output = false;
896                }
897
898                field_num += 1;
899                field_start = pos + 1;
900            }
901        }
902
903        // Handle last line without trailing line_delim
904        if line_start < data_len {
905            if line_start < data_len {
906                if (field_num <= max_field || complement)
907                    && has_delim
908                    && is_selected(field_num, field_mask, ranges, complement)
909                {
910                    if !first_output {
911                        unsafe { buf_extend(buf, output_delim) };
912                    }
913                    unsafe {
914                        buf_extend(
915                            buf,
916                            std::slice::from_raw_parts(
917                                base.add(field_start),
918                                data_len - field_start,
919                            ),
920                        )
921                    };
922                    first_output = false;
923                }
924
925                if !first_output {
926                    unsafe { buf_push(buf, line_delim) };
927                } else if !has_delim {
928                    if !suppress {
929                        unsafe {
930                            buf_extend(
931                                buf,
932                                std::slice::from_raw_parts(
933                                    base.add(line_start),
934                                    data_len - line_start,
935                                ),
936                            );
937                            buf_push(buf, line_delim);
938                        }
939                    }
940                } else {
941                    unsafe { buf_push(buf, line_delim) };
942                }
943            }
944        }
945
946        return;
947    }
948
949    // Fallback: when delim == line_delim, use the two-level scan approach
950    let mut start = 0;
951    for end_pos in memchr_iter(line_delim, data) {
952        let line = &data[start..end_pos];
953        extract_fields_to_buf(
954            line,
955            delim,
956            ranges,
957            output_delim,
958            suppress,
959            max_field,
960            field_mask,
961            line_delim,
962            buf,
963            complement,
964        );
965        start = end_pos + 1;
966    }
967    if start < data.len() {
968        extract_fields_to_buf(
969            &data[start..],
970            delim,
971            ranges,
972            output_delim,
973            suppress,
974            max_field,
975            field_mask,
976            line_delim,
977            buf,
978            complement,
979        );
980    }
981}
982
983// ── Ultra-fast single field extraction ───────────────────────────────────
984
985/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
986/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
987/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
988fn process_single_field(
989    data: &[u8],
990    delim: u8,
991    line_delim: u8,
992    target: usize,
993    suppress: bool,
994    out: &mut impl Write,
995) -> io::Result<()> {
996    let target_idx = target - 1;
997
998    // For single-field extraction, parallelize at 16MB+ to match PARALLEL_THRESHOLD.
999    const FIELD_PARALLEL_MIN: usize = 16 * 1024 * 1024;
1000
1001    if delim != line_delim {
1002        // Field 1 fast path: memchr2 single-pass scan.
1003        // For field 1, the first delimiter IS the field boundary. Lines without
1004        // delimiter are passed through unchanged.
1005        if target_idx == 0 && !suppress {
1006            if data.len() >= FIELD_PARALLEL_MIN {
1007                return single_field1_parallel(data, delim, line_delim, out);
1008            }
1009            // Sequential: scan with memchr2 into buffer, single write_all.
1010            // Faster than writev/IoSlice for moderate data because it produces
1011            // one contiguous buffer → one write syscall, and avoids IoSlice
1012            // allocation overhead for high-delimiter-density data.
1013            let mut buf = Vec::with_capacity(data.len() + 1);
1014            single_field1_to_buf(data, delim, line_delim, &mut buf);
1015            if !buf.is_empty() {
1016                out.write_all(&buf)?;
1017            }
1018            return Ok(());
1019        }
1020
1021        // Two-level approach for field N: outer newline scan + inner delim scan
1022        // with early exit at target_idx. Faster than memchr2 single-pass because
1023        // we only scan delimiters up to target_idx per line (not all of them).
1024        if data.len() >= FIELD_PARALLEL_MIN {
1025            let chunks = split_for_scope(data, line_delim);
1026            let n = chunks.len();
1027            let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1028            rayon::scope(|s| {
1029                for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1030                    s.spawn(move |_| {
1031                        result.reserve(chunk.len() / 2);
1032                        process_single_field_chunk(
1033                            chunk, delim, target_idx, line_delim, suppress, result,
1034                        );
1035                    });
1036                }
1037            });
1038            let slices: Vec<IoSlice> = results
1039                .iter()
1040                .filter(|r| !r.is_empty())
1041                .map(|r| IoSlice::new(r))
1042                .collect();
1043            write_ioslices(out, &slices)?;
1044        } else {
1045            let mut buf = Vec::with_capacity(data.len().min(4 * 1024 * 1024));
1046            process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1047            if !buf.is_empty() {
1048                out.write_all(&buf)?;
1049            }
1050        }
1051        return Ok(());
1052    }
1053
1054    // Fallback for delim == line_delim: nested loop approach
1055    if data.len() >= FIELD_PARALLEL_MIN {
1056        let chunks = split_for_scope(data, line_delim);
1057        let n = chunks.len();
1058        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1059        rayon::scope(|s| {
1060            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1061                s.spawn(move |_| {
1062                    result.reserve(chunk.len() / 4);
1063                    process_single_field_chunk(
1064                        chunk, delim, target_idx, line_delim, suppress, result,
1065                    );
1066                });
1067            }
1068        });
1069        let slices: Vec<IoSlice> = results
1070            .iter()
1071            .filter(|r| !r.is_empty())
1072            .map(|r| IoSlice::new(r))
1073            .collect();
1074        write_ioslices(out, &slices)?;
1075    } else {
1076        let mut buf = Vec::with_capacity(data.len() / 4);
1077        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
1078        if !buf.is_empty() {
1079            out.write_all(&buf)?;
1080        }
1081    }
1082    Ok(())
1083}
1084
1085/// Complement range extraction: skip fields start..=end, output rest (e.g., --complement -f3-5).
1086/// For each line: output fields 1..start-1, then fields end+1..EOF, skipping fields start..end.
1087fn process_complement_range(
1088    data: &[u8],
1089    delim: u8,
1090    line_delim: u8,
1091    skip_start: usize,
1092    skip_end: usize,
1093    suppress: bool,
1094    out: &mut impl Write,
1095) -> io::Result<()> {
1096    if data.len() >= PARALLEL_THRESHOLD {
1097        let chunks = split_for_scope(data, line_delim);
1098        let n = chunks.len();
1099        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1100        rayon::scope(|s| {
1101            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1102                s.spawn(move |_| {
1103                    result.reserve(chunk.len());
1104                    complement_range_chunk(
1105                        chunk, delim, skip_start, skip_end, line_delim, suppress, result,
1106                    );
1107                });
1108            }
1109        });
1110        let slices: Vec<IoSlice> = results
1111            .iter()
1112            .filter(|r| !r.is_empty())
1113            .map(|r| IoSlice::new(r))
1114            .collect();
1115        write_ioslices(out, &slices)?;
1116    } else {
1117        let mut buf = Vec::with_capacity(data.len());
1118        complement_range_chunk(
1119            data, delim, skip_start, skip_end, line_delim, suppress, &mut buf,
1120        );
1121        if !buf.is_empty() {
1122            out.write_all(&buf)?;
1123        }
1124    }
1125    Ok(())
1126}
1127
1128/// Process a chunk for complement range extraction.
1129fn complement_range_chunk(
1130    data: &[u8],
1131    delim: u8,
1132    skip_start: usize,
1133    skip_end: usize,
1134    line_delim: u8,
1135    suppress: bool,
1136    buf: &mut Vec<u8>,
1137) {
1138    // Pre-reserve entire chunk capacity to eliminate per-line reserve overhead.
1139    buf.reserve(data.len());
1140    let mut start = 0;
1141    for end_pos in memchr_iter(line_delim, data) {
1142        let line = &data[start..end_pos];
1143        complement_range_line(line, delim, skip_start, skip_end, line_delim, suppress, buf);
1144        start = end_pos + 1;
1145    }
1146    if start < data.len() {
1147        complement_range_line(
1148            &data[start..],
1149            delim,
1150            skip_start,
1151            skip_end,
1152            line_delim,
1153            suppress,
1154            buf,
1155        );
1156    }
1157}
1158
1159/// Extract all fields except skip_start..=skip_end from one line.
1160/// Outputs fields 1..skip_start-1, then fields skip_end+1..EOF.
1161///
1162/// Optimized: only scans for enough delimiters to find the skip region boundaries.
1163/// For `--complement -f3-5` with 20 fields, this finds delimiter 2 and 5, then
1164/// does a single copy of prefix + suffix, avoiding scanning past field 5.
1165#[inline(always)]
1166fn complement_range_line(
1167    line: &[u8],
1168    delim: u8,
1169    skip_start: usize,
1170    skip_end: usize,
1171    line_delim: u8,
1172    suppress: bool,
1173    buf: &mut Vec<u8>,
1174) {
1175    let len = line.len();
1176    if len == 0 {
1177        if !suppress {
1178            unsafe { buf_push(buf, line_delim) };
1179        }
1180        return;
1181    }
1182
1183    // Note: no per-line buf.reserve — complement_range_chunk already reserves data.len()
1184    let base = line.as_ptr();
1185
1186    // 1-based field numbers. To skip fields skip_start..=skip_end:
1187    // - prefix_end = position of (skip_start-1)th delimiter (exclusive; end of prefix fields)
1188    // - suffix_start = position after skip_end-th delimiter (inclusive; start of suffix fields)
1189    //
1190    // Find the first (skip_start - 1) delimiters to locate prefix_end,
1191    // then the next (skip_end - skip_start + 1) delimiters to locate suffix_start.
1192
1193    let need_prefix_delims = skip_start - 1; // number of delimiters before the skip region
1194    let need_skip_delims = skip_end - skip_start + 1; // delimiters within the skip region
1195    let total_need = need_prefix_delims + need_skip_delims;
1196
1197    // Find delimiter positions up to total_need
1198    let mut delim_count: usize = 0;
1199    let mut prefix_end_pos: usize = usize::MAX; // byte position of (skip_start-1)th delim
1200    let mut suffix_start_pos: usize = usize::MAX; // byte position after skip_end-th delim
1201
1202    for pos in memchr_iter(delim, line) {
1203        delim_count += 1;
1204        if delim_count == need_prefix_delims {
1205            prefix_end_pos = pos;
1206        }
1207        if delim_count == total_need {
1208            suffix_start_pos = pos + 1;
1209            break;
1210        }
1211    }
1212
1213    if delim_count == 0 {
1214        // No delimiter at all
1215        if !suppress {
1216            unsafe {
1217                buf_extend(buf, line);
1218                buf_push(buf, line_delim);
1219            }
1220        }
1221        return;
1222    }
1223
1224    // Case analysis:
1225    // 1. Not enough delims to reach skip_start: all fields are before skip region, output all
1226    // 2. Enough to reach skip_start but not skip_end: prefix + no suffix
1227    // 3. Enough to reach skip_end: prefix + delim + suffix
1228
1229    if delim_count < need_prefix_delims {
1230        // Not enough fields to reach skip region — output entire line
1231        unsafe {
1232            buf_extend(buf, line);
1233            buf_push(buf, line_delim);
1234        }
1235        return;
1236    }
1237
1238    let has_prefix = need_prefix_delims > 0;
1239    let has_suffix = suffix_start_pos != usize::MAX && suffix_start_pos < len;
1240
1241    if has_prefix && has_suffix {
1242        // Output: prefix (up to prefix_end_pos) + delim + suffix (from suffix_start_pos)
1243        unsafe {
1244            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1245            buf_push(buf, delim);
1246            buf_extend(
1247                buf,
1248                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1249            );
1250            buf_push(buf, line_delim);
1251        }
1252    } else if has_prefix {
1253        // Only prefix, no suffix (skip region extends to end of line)
1254        unsafe {
1255            buf_extend(buf, std::slice::from_raw_parts(base, prefix_end_pos));
1256            buf_push(buf, line_delim);
1257        }
1258    } else if has_suffix {
1259        // No prefix (skip_start == 1), only suffix
1260        unsafe {
1261            buf_extend(
1262                buf,
1263                std::slice::from_raw_parts(base.add(suffix_start_pos), len - suffix_start_pos),
1264            );
1265            buf_push(buf, line_delim);
1266        }
1267    } else {
1268        // All fields skipped
1269        unsafe { buf_push(buf, line_delim) };
1270    }
1271}
1272
1273/// Complement single-field extraction: skip one field, output rest unchanged.
1274fn process_complement_single_field(
1275    data: &[u8],
1276    delim: u8,
1277    line_delim: u8,
1278    skip_field: usize,
1279    suppress: bool,
1280    out: &mut impl Write,
1281) -> io::Result<()> {
1282    let skip_idx = skip_field - 1;
1283
1284    if data.len() >= PARALLEL_THRESHOLD {
1285        let chunks = split_for_scope(data, line_delim);
1286        let n = chunks.len();
1287        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1288        rayon::scope(|s| {
1289            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1290                s.spawn(move |_| {
1291                    result.reserve(chunk.len());
1292                    complement_single_field_chunk(
1293                        chunk, delim, skip_idx, line_delim, suppress, result,
1294                    );
1295                });
1296            }
1297        });
1298        let slices: Vec<IoSlice> = results
1299            .iter()
1300            .filter(|r| !r.is_empty())
1301            .map(|r| IoSlice::new(r))
1302            .collect();
1303        write_ioslices(out, &slices)?;
1304    } else {
1305        let mut buf = Vec::with_capacity(data.len());
1306        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
1307        if !buf.is_empty() {
1308            out.write_all(&buf)?;
1309        }
1310    }
1311    Ok(())
1312}
1313
1314/// Process a chunk for complement single-field extraction using memchr2 single-pass.
1315/// Scans for both delimiter and line_delim in one SIMD pass, tracking delimiter count
1316/// per line. When the skip field's bounding delimiters are found, copies prefix + suffix.
1317/// This eliminates the per-line memchr_iter setup overhead and reduces from two SIMD
1318/// passes (outer newline scan + inner delimiter scan) to one.
1319fn complement_single_field_chunk(
1320    data: &[u8],
1321    delim: u8,
1322    skip_idx: usize,
1323    line_delim: u8,
1324    suppress: bool,
1325    buf: &mut Vec<u8>,
1326) {
1327    // When delim == line_delim, fall back to per-line approach
1328    if delim == line_delim {
1329        buf.reserve(data.len());
1330        let mut start = 0;
1331        for end_pos in memchr_iter(line_delim, data) {
1332            let line = &data[start..end_pos];
1333            complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
1334            start = end_pos + 1;
1335        }
1336        if start < data.len() {
1337            complement_single_field_line(
1338                &data[start..],
1339                delim,
1340                skip_idx,
1341                line_delim,
1342                suppress,
1343                buf,
1344            );
1345        }
1346        return;
1347    }
1348
1349    buf.reserve(data.len());
1350    let base = data.as_ptr();
1351    let data_len = data.len();
1352    let need_before = skip_idx; // delimiters before skip field
1353    let need_total = skip_idx + 1; // delimiters to find end of skip field
1354
1355    // Per-line state
1356    let mut line_start: usize = 0;
1357    let mut delim_count: usize = 0;
1358    let mut skip_start_pos: usize = 0;
1359    let mut skip_end_pos: usize = 0;
1360    let mut found_start = need_before == 0; // skip_idx==0 means skip starts at line start
1361    let mut found_end = false;
1362
1363    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1364        let byte = unsafe { *base.add(pos) };
1365
1366        if byte == line_delim {
1367            // End of line: emit based on what we found
1368            if delim_count == 0 {
1369                // No delimiter in line
1370                if !suppress {
1371                    unsafe {
1372                        buf_extend(
1373                            buf,
1374                            std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1375                        );
1376                        buf_push(buf, line_delim);
1377                    }
1378                }
1379            } else if !found_start || delim_count < need_before {
1380                // Not enough delimiters to reach skip field — output entire line
1381                unsafe {
1382                    buf_extend(
1383                        buf,
1384                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1385                    );
1386                    buf_push(buf, line_delim);
1387                }
1388            } else {
1389                let has_prefix = skip_idx > 0;
1390                let has_suffix = found_end && skip_end_pos < pos;
1391
1392                if has_prefix && has_suffix {
1393                    unsafe {
1394                        buf_extend(
1395                            buf,
1396                            std::slice::from_raw_parts(
1397                                base.add(line_start),
1398                                skip_start_pos - 1 - line_start,
1399                            ),
1400                        );
1401                        buf_push(buf, delim);
1402                        buf_extend(
1403                            buf,
1404                            std::slice::from_raw_parts(
1405                                base.add(skip_end_pos + 1),
1406                                pos - skip_end_pos - 1,
1407                            ),
1408                        );
1409                        buf_push(buf, line_delim);
1410                    }
1411                } else if has_prefix {
1412                    unsafe {
1413                        buf_extend(
1414                            buf,
1415                            std::slice::from_raw_parts(
1416                                base.add(line_start),
1417                                skip_start_pos - 1 - line_start,
1418                            ),
1419                        );
1420                        buf_push(buf, line_delim);
1421                    }
1422                } else if has_suffix {
1423                    unsafe {
1424                        buf_extend(
1425                            buf,
1426                            std::slice::from_raw_parts(
1427                                base.add(skip_end_pos + 1),
1428                                pos - skip_end_pos - 1,
1429                            ),
1430                        );
1431                        buf_push(buf, line_delim);
1432                    }
1433                } else {
1434                    unsafe { buf_push(buf, line_delim) };
1435                }
1436            }
1437
1438            // Reset for next line
1439            line_start = pos + 1;
1440            delim_count = 0;
1441            skip_start_pos = 0;
1442            skip_end_pos = 0;
1443            found_start = need_before == 0;
1444            found_end = false;
1445        } else {
1446            // Delimiter found
1447            delim_count += 1;
1448            if delim_count == need_before {
1449                skip_start_pos = pos + 1;
1450                found_start = true;
1451            }
1452            if delim_count == need_total {
1453                skip_end_pos = pos;
1454                found_end = true;
1455            }
1456        }
1457    }
1458
1459    // Handle last line without trailing line_delim
1460    if line_start < data_len {
1461        let pos = data_len;
1462        if delim_count == 0 {
1463            if !suppress {
1464                unsafe {
1465                    buf_extend(
1466                        buf,
1467                        std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1468                    );
1469                    buf_push(buf, line_delim);
1470                }
1471            }
1472        } else if !found_start || delim_count < need_before {
1473            unsafe {
1474                buf_extend(
1475                    buf,
1476                    std::slice::from_raw_parts(base.add(line_start), pos - line_start),
1477                );
1478                buf_push(buf, line_delim);
1479            }
1480        } else {
1481            let has_prefix = skip_idx > 0;
1482            let has_suffix = found_end && skip_end_pos < pos;
1483
1484            if has_prefix && has_suffix {
1485                unsafe {
1486                    buf_extend(
1487                        buf,
1488                        std::slice::from_raw_parts(
1489                            base.add(line_start),
1490                            skip_start_pos - 1 - line_start,
1491                        ),
1492                    );
1493                    buf_push(buf, delim);
1494                    buf_extend(
1495                        buf,
1496                        std::slice::from_raw_parts(
1497                            base.add(skip_end_pos + 1),
1498                            pos - skip_end_pos - 1,
1499                        ),
1500                    );
1501                    buf_push(buf, line_delim);
1502                }
1503            } else if has_prefix {
1504                unsafe {
1505                    buf_extend(
1506                        buf,
1507                        std::slice::from_raw_parts(
1508                            base.add(line_start),
1509                            skip_start_pos - 1 - line_start,
1510                        ),
1511                    );
1512                    buf_push(buf, line_delim);
1513                }
1514            } else if has_suffix {
1515                unsafe {
1516                    buf_extend(
1517                        buf,
1518                        std::slice::from_raw_parts(
1519                            base.add(skip_end_pos + 1),
1520                            pos - skip_end_pos - 1,
1521                        ),
1522                    );
1523                    buf_push(buf, line_delim);
1524                }
1525            } else {
1526                unsafe { buf_push(buf, line_delim) };
1527            }
1528        }
1529    }
1530}
1531
1532/// Fallback per-line complement single-field extraction (for delim == line_delim).
1533#[inline(always)]
1534fn complement_single_field_line(
1535    line: &[u8],
1536    delim: u8,
1537    skip_idx: usize,
1538    line_delim: u8,
1539    suppress: bool,
1540    buf: &mut Vec<u8>,
1541) {
1542    let len = line.len();
1543    if len == 0 {
1544        if !suppress {
1545            unsafe { buf_push(buf, line_delim) };
1546        }
1547        return;
1548    }
1549
1550    let base = line.as_ptr();
1551    let need_before = skip_idx;
1552    let need_total = skip_idx + 1;
1553
1554    let mut delim_count: usize = 0;
1555    let mut skip_start_pos: usize = 0;
1556    let mut skip_end_pos: usize = len;
1557    let mut found_end = false;
1558
1559    for pos in memchr_iter(delim, line) {
1560        delim_count += 1;
1561        if delim_count == need_before {
1562            skip_start_pos = pos + 1;
1563        }
1564        if delim_count == need_total {
1565            skip_end_pos = pos;
1566            found_end = true;
1567            break;
1568        }
1569    }
1570
1571    if delim_count == 0 {
1572        if !suppress {
1573            unsafe {
1574                buf_extend(buf, line);
1575                buf_push(buf, line_delim);
1576            }
1577        }
1578        return;
1579    }
1580
1581    if delim_count < need_before {
1582        unsafe {
1583            buf_extend(buf, line);
1584            buf_push(buf, line_delim);
1585        }
1586        return;
1587    }
1588
1589    let has_prefix = skip_idx > 0 && skip_start_pos > 0;
1590    let has_suffix = found_end && skip_end_pos < len;
1591
1592    if has_prefix && has_suffix {
1593        unsafe {
1594            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1595            buf_push(buf, delim);
1596            buf_extend(
1597                buf,
1598                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1599            );
1600            buf_push(buf, line_delim);
1601        }
1602    } else if has_prefix {
1603        unsafe {
1604            buf_extend(buf, std::slice::from_raw_parts(base, skip_start_pos - 1));
1605            buf_push(buf, line_delim);
1606        }
1607    } else if has_suffix {
1608        unsafe {
1609            buf_extend(
1610                buf,
1611                std::slice::from_raw_parts(base.add(skip_end_pos + 1), len - skip_end_pos - 1),
1612            );
1613            buf_push(buf, line_delim);
1614        }
1615    } else {
1616        unsafe { buf_push(buf, line_delim) };
1617    }
1618}
1619
1620/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
1621/// Zero-copy for the non-parallel path: identifies the truncation point per line
1622/// and writes contiguous runs directly from the source data.
1623fn process_fields_prefix(
1624    data: &[u8],
1625    delim: u8,
1626    line_delim: u8,
1627    last_field: usize,
1628    suppress: bool,
1629    out: &mut impl Write,
1630) -> io::Result<()> {
1631    if data.len() >= PARALLEL_THRESHOLD {
1632        let chunks = split_for_scope(data, line_delim);
1633        let n = chunks.len();
1634        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1635        rayon::scope(|s| {
1636            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1637                s.spawn(move |_| {
1638                    result.reserve(chunk.len());
1639                    fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, result);
1640                });
1641            }
1642        });
1643        let slices: Vec<IoSlice> = results
1644            .iter()
1645            .filter(|r| !r.is_empty())
1646            .map(|r| IoSlice::new(r))
1647            .collect();
1648        write_ioslices(out, &slices)?;
1649    } else if !suppress {
1650        // Zero-copy fast path: scan for truncation points, write runs from source.
1651        // When suppress is false, every line is output (with or without delimiter).
1652        // Most lines have enough fields, so the output is often identical to input.
1653        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
1654    } else {
1655        let mut buf = Vec::with_capacity(data.len());
1656        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
1657        if !buf.is_empty() {
1658            out.write_all(&buf)?;
1659        }
1660    }
1661    Ok(())
1662}
1663
1664/// Zero-copy field-prefix extraction using writev: builds IoSlice entries pointing
1665/// directly into the source data, flushing in MAX_IOV-sized batches.
1666/// For lines where the Nth delimiter exists, we truncate at that point.
1667/// For lines with fewer fields, we output them unchanged (contiguous run).
1668/// Lines without any delimiter are output unchanged (suppress=false assumed).
1669#[inline]
1670fn fields_prefix_zerocopy(
1671    data: &[u8],
1672    delim: u8,
1673    line_delim: u8,
1674    last_field: usize,
1675    out: &mut impl Write,
1676) -> io::Result<()> {
1677    let newline_buf: [u8; 1] = [line_delim];
1678    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
1679    let mut start = 0;
1680    let mut run_start: usize = 0;
1681
1682    for end_pos in memchr_iter(line_delim, data) {
1683        let line = &data[start..end_pos];
1684        let mut field_count = 1;
1685        let mut truncate_at: Option<usize> = None;
1686        for dpos in memchr_iter(delim, line) {
1687            if field_count >= last_field {
1688                truncate_at = Some(start + dpos);
1689                break;
1690            }
1691            field_count += 1;
1692        }
1693
1694        if let Some(trunc_pos) = truncate_at {
1695            if run_start < start {
1696                iov.push(IoSlice::new(&data[run_start..start]));
1697            }
1698            iov.push(IoSlice::new(&data[start..trunc_pos]));
1699            iov.push(IoSlice::new(&newline_buf));
1700            run_start = end_pos + 1;
1701
1702            if iov.len() >= MAX_IOV - 2 {
1703                write_ioslices(out, &iov)?;
1704                iov.clear();
1705            }
1706        }
1707        start = end_pos + 1;
1708    }
1709    // Handle last line without terminator
1710    if start < data.len() {
1711        let line = &data[start..];
1712        let mut field_count = 1;
1713        let mut truncate_at: Option<usize> = None;
1714        for dpos in memchr_iter(delim, line) {
1715            if field_count >= last_field {
1716                truncate_at = Some(start + dpos);
1717                break;
1718            }
1719            field_count += 1;
1720        }
1721        if let Some(trunc_pos) = truncate_at {
1722            if run_start < start {
1723                iov.push(IoSlice::new(&data[run_start..start]));
1724            }
1725            iov.push(IoSlice::new(&data[start..trunc_pos]));
1726            iov.push(IoSlice::new(&newline_buf));
1727            if !iov.is_empty() {
1728                write_ioslices(out, &iov)?;
1729            }
1730            return Ok(());
1731        }
1732    }
1733    // Flush remaining contiguous run
1734    if run_start < data.len() {
1735        iov.push(IoSlice::new(&data[run_start..]));
1736        if !data.is_empty() && *data.last().unwrap() != line_delim {
1737            iov.push(IoSlice::new(&newline_buf));
1738        }
1739    }
1740    if !iov.is_empty() {
1741        write_ioslices(out, &iov)?;
1742    }
1743    Ok(())
1744}
1745
1746/// Process a chunk for contiguous from-start field range extraction.
1747fn fields_prefix_chunk(
1748    data: &[u8],
1749    delim: u8,
1750    line_delim: u8,
1751    last_field: usize,
1752    suppress: bool,
1753    buf: &mut Vec<u8>,
1754) {
1755    buf.reserve(data.len());
1756    let mut start = 0;
1757    for end_pos in memchr_iter(line_delim, data) {
1758        let line = &data[start..end_pos];
1759        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
1760        start = end_pos + 1;
1761    }
1762    if start < data.len() {
1763        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
1764    }
1765}
1766
1767/// Extract first N fields from one line (contiguous from-start range).
1768/// Uses memchr SIMD for delimiter scanning on all line sizes.
1769#[inline(always)]
1770fn fields_prefix_line(
1771    line: &[u8],
1772    delim: u8,
1773    line_delim: u8,
1774    last_field: usize,
1775    suppress: bool,
1776    buf: &mut Vec<u8>,
1777) {
1778    let len = line.len();
1779    if len == 0 {
1780        if !suppress {
1781            unsafe { buf_push(buf, line_delim) };
1782        }
1783        return;
1784    }
1785
1786    // Note: no per-line buf.reserve — fields_prefix_chunk already reserves data.len()
1787    let base = line.as_ptr();
1788
1789    let mut field_count = 1usize;
1790    let mut has_delim = false;
1791
1792    for pos in memchr_iter(delim, line) {
1793        has_delim = true;
1794        if field_count >= last_field {
1795            unsafe {
1796                buf_extend(buf, std::slice::from_raw_parts(base, pos));
1797                buf_push(buf, line_delim);
1798            }
1799            return;
1800        }
1801        field_count += 1;
1802    }
1803
1804    if !has_delim {
1805        if !suppress {
1806            unsafe {
1807                buf_extend(buf, line);
1808                buf_push(buf, line_delim);
1809            }
1810        }
1811        return;
1812    }
1813
1814    unsafe {
1815        buf_extend(buf, line);
1816        buf_push(buf, line_delim);
1817    }
1818}
1819
1820/// Open-ended field suffix extraction (e.g., `cut -f3-`).
1821fn process_fields_suffix(
1822    data: &[u8],
1823    delim: u8,
1824    line_delim: u8,
1825    start_field: usize,
1826    suppress: bool,
1827    out: &mut impl Write,
1828) -> io::Result<()> {
1829    if data.len() >= PARALLEL_THRESHOLD {
1830        let chunks = split_for_scope(data, line_delim);
1831        let n = chunks.len();
1832        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1833        rayon::scope(|s| {
1834            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1835                s.spawn(move |_| {
1836                    result.reserve(chunk.len());
1837                    fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, result);
1838                });
1839            }
1840        });
1841        let slices: Vec<IoSlice> = results
1842            .iter()
1843            .filter(|r| !r.is_empty())
1844            .map(|r| IoSlice::new(r))
1845            .collect();
1846        write_ioslices(out, &slices)?;
1847    } else {
1848        let mut buf = Vec::with_capacity(data.len());
1849        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
1850        if !buf.is_empty() {
1851            out.write_all(&buf)?;
1852        }
1853    }
1854    Ok(())
1855}
1856
1857/// Process a chunk for open-ended field suffix extraction.
1858fn fields_suffix_chunk(
1859    data: &[u8],
1860    delim: u8,
1861    line_delim: u8,
1862    start_field: usize,
1863    suppress: bool,
1864    buf: &mut Vec<u8>,
1865) {
1866    buf.reserve(data.len());
1867    let mut start = 0;
1868    for end_pos in memchr_iter(line_delim, data) {
1869        let line = &data[start..end_pos];
1870        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
1871        start = end_pos + 1;
1872    }
1873    if start < data.len() {
1874        fields_suffix_line(
1875            &data[start..],
1876            delim,
1877            line_delim,
1878            start_field,
1879            suppress,
1880            buf,
1881        );
1882    }
1883}
1884
1885/// Extract fields from start_field to end from one line.
1886/// Uses memchr SIMD for delimiter scanning on all line sizes.
1887#[inline(always)]
1888fn fields_suffix_line(
1889    line: &[u8],
1890    delim: u8,
1891    line_delim: u8,
1892    start_field: usize,
1893    suppress: bool,
1894    buf: &mut Vec<u8>,
1895) {
1896    let len = line.len();
1897    if len == 0 {
1898        if !suppress {
1899            unsafe { buf_push(buf, line_delim) };
1900        }
1901        return;
1902    }
1903
1904    // Note: no per-line buf.reserve — fields_suffix_chunk already reserves data.len()
1905    let base = line.as_ptr();
1906
1907    let skip_delims = start_field - 1;
1908    let mut delim_count = 0usize;
1909    let mut has_delim = false;
1910
1911    for pos in memchr_iter(delim, line) {
1912        has_delim = true;
1913        delim_count += 1;
1914        if delim_count >= skip_delims {
1915            unsafe {
1916                buf_extend(
1917                    buf,
1918                    std::slice::from_raw_parts(base.add(pos + 1), len - pos - 1),
1919                );
1920                buf_push(buf, line_delim);
1921            }
1922            return;
1923        }
1924    }
1925
1926    if !has_delim {
1927        if !suppress {
1928            unsafe {
1929                buf_extend(buf, line);
1930                buf_push(buf, line_delim);
1931            }
1932        }
1933        return;
1934    }
1935
1936    // Fewer delimiters than needed
1937    unsafe { buf_push(buf, line_delim) };
1938}
1939
1940/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1941/// Optimized: skip to start_field using memchr, then output until end_field.
1942fn process_fields_mid_range(
1943    data: &[u8],
1944    delim: u8,
1945    line_delim: u8,
1946    start_field: usize,
1947    end_field: usize,
1948    suppress: bool,
1949    out: &mut impl Write,
1950) -> io::Result<()> {
1951    if data.len() >= PARALLEL_THRESHOLD {
1952        let chunks = split_for_scope(data, line_delim);
1953        let n = chunks.len();
1954        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
1955        rayon::scope(|s| {
1956            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
1957                s.spawn(move |_| {
1958                    result.reserve(chunk.len());
1959                    fields_mid_range_chunk(
1960                        chunk,
1961                        delim,
1962                        line_delim,
1963                        start_field,
1964                        end_field,
1965                        suppress,
1966                        result,
1967                    );
1968                });
1969            }
1970        });
1971        let slices: Vec<IoSlice> = results
1972            .iter()
1973            .filter(|r| !r.is_empty())
1974            .map(|r| IoSlice::new(r))
1975            .collect();
1976        write_ioslices(out, &slices)?;
1977    } else {
1978        let mut buf = Vec::with_capacity(data.len());
1979        fields_mid_range_chunk(
1980            data,
1981            delim,
1982            line_delim,
1983            start_field,
1984            end_field,
1985            suppress,
1986            &mut buf,
1987        );
1988        if !buf.is_empty() {
1989            out.write_all(&buf)?;
1990        }
1991    }
1992    Ok(())
1993}
1994
1995/// Process a chunk for contiguous mid-range field extraction.
1996/// Single-pass memchr2 scan over the entire chunk, tracking delimiter count
1997/// per line. Avoids the double-scan (outer newline + inner delimiter).
1998fn fields_mid_range_chunk(
1999    data: &[u8],
2000    delim: u8,
2001    line_delim: u8,
2002    start_field: usize,
2003    end_field: usize,
2004    suppress: bool,
2005    buf: &mut Vec<u8>,
2006) {
2007    // When delim == line_delim, fall back to per-line approach
2008    if delim == line_delim {
2009        buf.reserve(data.len());
2010        let mut start = 0;
2011        for end_pos in memchr_iter(line_delim, data) {
2012            let line = &data[start..end_pos];
2013            fields_mid_range_line(
2014                line,
2015                delim,
2016                line_delim,
2017                start_field,
2018                end_field,
2019                suppress,
2020                buf,
2021            );
2022            start = end_pos + 1;
2023        }
2024        if start < data.len() {
2025            fields_mid_range_line(
2026                &data[start..],
2027                delim,
2028                line_delim,
2029                start_field,
2030                end_field,
2031                suppress,
2032                buf,
2033            );
2034        }
2035        return;
2036    }
2037
2038    buf.reserve(data.len());
2039    let base = data.as_ptr();
2040    let skip_before = start_field - 1; // delimiters to skip before range
2041    let target_end_delim = skip_before + (end_field - start_field) + 1;
2042
2043    let mut line_start: usize = 0;
2044    let mut delim_count: usize = 0;
2045    let mut range_start: usize = 0;
2046    let mut has_delim = false;
2047    let mut found_end = false; // true when we found all target fields, skip to newline
2048
2049    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2050        let byte = unsafe { *base.add(pos) };
2051        if byte == line_delim {
2052            // End of line
2053            if found_end {
2054                // Already output this line's range
2055            } else if !has_delim {
2056                // No delimiter on this line
2057                if !suppress {
2058                    unsafe {
2059                        buf_extend(
2060                            buf,
2061                            std::slice::from_raw_parts(base.add(line_start), pos + 1 - line_start),
2062                        );
2063                    }
2064                }
2065            } else if delim_count >= skip_before {
2066                // Have enough fields for start_field; output from range_start to EOL
2067                if skip_before == 0 {
2068                    range_start = line_start;
2069                }
2070                unsafe {
2071                    buf_extend(
2072                        buf,
2073                        std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2074                    );
2075                    buf_push(buf, line_delim);
2076                }
2077            } else {
2078                // Not enough fields for start_field — output empty line
2079                unsafe { buf_push(buf, line_delim) };
2080            }
2081            line_start = pos + 1;
2082            delim_count = 0;
2083            has_delim = false;
2084            found_end = false;
2085        } else if !found_end {
2086            // Delimiter
2087            has_delim = true;
2088            delim_count += 1;
2089            if delim_count == skip_before {
2090                range_start = pos + 1;
2091            }
2092            if delim_count == target_end_delim {
2093                if skip_before == 0 {
2094                    range_start = line_start;
2095                }
2096                unsafe {
2097                    buf_extend(
2098                        buf,
2099                        std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2100                    );
2101                    buf_push(buf, line_delim);
2102                }
2103                found_end = true;
2104            }
2105        }
2106    }
2107    // Handle trailing data without final newline
2108    if line_start < data.len() && !found_end {
2109        if !has_delim {
2110            if !suppress {
2111                unsafe {
2112                    buf_extend(
2113                        buf,
2114                        std::slice::from_raw_parts(base.add(line_start), data.len() - line_start),
2115                    );
2116                }
2117            }
2118        } else if delim_count >= skip_before {
2119            if skip_before == 0 {
2120                range_start = line_start;
2121            }
2122            unsafe {
2123                buf_extend(
2124                    buf,
2125                    std::slice::from_raw_parts(base.add(range_start), data.len() - range_start),
2126                );
2127            }
2128        }
2129    }
2130}
2131
2132/// Extract fields start_field..=end_field from one line.
2133/// Uses scalar byte scanning for short lines, memchr_iter for longer.
2134/// Raw pointer arithmetic to eliminate bounds checking.
2135#[inline(always)]
2136fn fields_mid_range_line(
2137    line: &[u8],
2138    delim: u8,
2139    line_delim: u8,
2140    start_field: usize,
2141    end_field: usize,
2142    suppress: bool,
2143    buf: &mut Vec<u8>,
2144) {
2145    let len = line.len();
2146    if len == 0 {
2147        if !suppress {
2148            unsafe { buf_push(buf, line_delim) };
2149        }
2150        return;
2151    }
2152
2153    // Note: no per-line buf.reserve — fields_mid_range_chunk already reserves data.len()
2154    let base = line.as_ptr();
2155
2156    // Count delimiters to find start_field and end_field boundaries
2157    let skip_before = start_field - 1; // delimiters to skip before start_field
2158    let field_span = end_field - start_field; // additional delimiters within the range
2159    let target_end_delim = skip_before + field_span + 1;
2160    let mut delim_count = 0;
2161    let mut range_start = 0;
2162    let mut has_delim = false;
2163
2164    for pos in memchr_iter(delim, line) {
2165        has_delim = true;
2166        delim_count += 1;
2167        if delim_count == skip_before {
2168            range_start = pos + 1;
2169        }
2170        if delim_count == target_end_delim {
2171            if skip_before == 0 {
2172                range_start = 0;
2173            }
2174            unsafe {
2175                buf_extend(
2176                    buf,
2177                    std::slice::from_raw_parts(base.add(range_start), pos - range_start),
2178                );
2179                buf_push(buf, line_delim);
2180            }
2181            return;
2182        }
2183    }
2184
2185    if !has_delim {
2186        if !suppress {
2187            unsafe {
2188                buf_extend(buf, line);
2189                buf_push(buf, line_delim);
2190            }
2191        }
2192        return;
2193    }
2194
2195    // Line has delimiters but fewer fields than end_field
2196    if delim_count >= skip_before {
2197        // We have at least start_field, output from range_start to end
2198        if skip_before == 0 {
2199            range_start = 0;
2200        }
2201        unsafe {
2202            buf_extend(
2203                buf,
2204                std::slice::from_raw_parts(base.add(range_start), len - range_start),
2205            );
2206            buf_push(buf, line_delim);
2207        }
2208    } else {
2209        // Not enough fields even for start_field — output empty line
2210        unsafe { buf_push(buf, line_delim) };
2211    }
2212}
2213
2214/// Zero-copy field-1 extraction using writev: builds IoSlice entries pointing
2215/// directly into the source data, flushing in MAX_IOV-sized batches.
2216/// For each line: if delimiter exists, output field1 + newline; otherwise pass through.
2217///
2218/// Uses a two-level scan: outer memchr(newline) for line boundaries, inner memchr(delim)
2219/// Parallel field-1 extraction for large data using memchr2 single-pass.
2220/// Splits data into per-thread chunks, each chunk extracts field 1 using
2221/// memchr2(delim, newline) which finds the first special byte in one scan.
2222/// For field 1: first special byte is either the delimiter (field end) or
2223/// newline (no delimiter, output line unchanged). 4 threads cut scan time ~4x.
2224fn single_field1_parallel(
2225    data: &[u8],
2226    delim: u8,
2227    line_delim: u8,
2228    out: &mut impl Write,
2229) -> io::Result<()> {
2230    let chunks = split_for_scope(data, line_delim);
2231    let n = chunks.len();
2232    let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2233    rayon::scope(|s| {
2234        for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2235            s.spawn(move |_| {
2236                result.reserve(chunk.len() + 1);
2237                single_field1_to_buf(chunk, delim, line_delim, result);
2238            });
2239        }
2240    });
2241    let slices: Vec<IoSlice> = results
2242        .iter()
2243        .filter(|r| !r.is_empty())
2244        .map(|r| IoSlice::new(r))
2245        .collect();
2246    write_ioslices(out, &slices)
2247}
2248
2249/// Extract field 1 from a chunk using memchr2_iter single-pass SIMD scanning.
2250/// Uses a single memchr2_iter pass over the entire chunk to find both delimiters
2251/// and newlines. This eliminates the per-line memchr function call overhead
2252/// (~5-10ns per call × 2 calls per line) that dominates for short-field data.
2253///
2254/// Optimizations:
2255/// - Contiguous run tracking: consecutive no-delimiter lines are batched into
2256///   a single buf_extend (one memcpy instead of one per line).
2257#[inline]
2258fn single_field1_to_buf(data: &[u8], delim: u8, line_delim: u8, buf: &mut Vec<u8>) {
2259    // Reserve data.len() + 1: output ≤ input for all lines except potentially
2260    // the last line without trailing newline, where we add a newline (GNU compat).
2261    buf.reserve(data.len() + 1);
2262    let base = data.as_ptr();
2263    let mut line_start: usize = 0;
2264    let mut found_delim = false;
2265
2266    for pos in memchr::memchr2_iter(delim, line_delim, data) {
2267        let byte = unsafe { *base.add(pos) };
2268        if byte == line_delim {
2269            if !found_delim {
2270                // No delimiter on this line — output entire line including newline
2271                unsafe {
2272                    buf_extend(
2273                        buf,
2274                        std::slice::from_raw_parts(base.add(line_start), pos + 1 - line_start),
2275                    );
2276                }
2277            } else {
2278                // Delimiter was found earlier — just add the line terminator
2279                unsafe { buf_push(buf, line_delim) };
2280            }
2281            line_start = pos + 1;
2282            found_delim = false;
2283        } else if !found_delim {
2284            // First delimiter on this line — output from line_start to here
2285            found_delim = true;
2286            unsafe {
2287                buf_extend(
2288                    buf,
2289                    std::slice::from_raw_parts(base.add(line_start), pos - line_start),
2290                );
2291            }
2292        }
2293        // Subsequent delimiters: ignore
2294    }
2295
2296    // Handle last line without trailing newline — GNU cut always adds newline
2297    if line_start < data.len() {
2298        if !found_delim {
2299            // No delimiter — output remaining data + newline (GNU compat)
2300            unsafe {
2301                buf_extend_byte(
2302                    buf,
2303                    std::slice::from_raw_parts(base.add(line_start), data.len() - line_start),
2304                    line_delim,
2305                );
2306            }
2307        } else {
2308            // Field already output — add trailing newline (GNU compat)
2309            unsafe { buf_push(buf, line_delim) };
2310        }
2311    }
2312}
2313
2314/// Zero-copy field 1 extraction using writev: builds IoSlice entries pointing
2315/// directly into the source data. Uses two-level scan: outer memchr(newline)
2316/// for the first delimiter. This is faster than memchr2 for SMALL data because
2317/// the inner scan exits after the FIRST delimiter, skipping all
2318/// subsequent delimiters on the line.
2319///
2320/// Lines without delimiter stay in contiguous runs (zero-copy pass-through).
2321/// Lines with delimiter produce two IoSlices (truncated field + newline byte).
2322#[inline]
2323#[allow(dead_code)]
2324fn single_field1_zerocopy(
2325    data: &[u8],
2326    delim: u8,
2327    line_delim: u8,
2328    out: &mut impl Write,
2329) -> io::Result<()> {
2330    let newline_buf: [u8; 1] = [line_delim];
2331
2332    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2333    let mut run_start: usize = 0;
2334    let mut start = 0;
2335
2336    for end_pos in memchr_iter(line_delim, data) {
2337        let line = &data[start..end_pos];
2338        if let Some(dp) = memchr::memchr(delim, line) {
2339            // Line has delimiter — truncate at first delimiter.
2340            // Flush current contiguous run, then add truncated field + newline.
2341            if run_start < start {
2342                iov.push(IoSlice::new(&data[run_start..start]));
2343            }
2344            iov.push(IoSlice::new(&data[start..start + dp]));
2345            iov.push(IoSlice::new(&newline_buf));
2346            run_start = end_pos + 1;
2347
2348            if iov.len() >= MAX_IOV - 2 {
2349                write_ioslices(out, &iov)?;
2350                iov.clear();
2351            }
2352        }
2353        // else: no delimiter in line, output unchanged (stays in contiguous run)
2354        start = end_pos + 1;
2355    }
2356
2357    // Handle last line (no trailing newline)
2358    if start < data.len() {
2359        let line = &data[start..];
2360        if let Some(dp) = memchr::memchr(delim, line) {
2361            if run_start < start {
2362                iov.push(IoSlice::new(&data[run_start..start]));
2363            }
2364            iov.push(IoSlice::new(&data[start..start + dp]));
2365            iov.push(IoSlice::new(&newline_buf));
2366            if !iov.is_empty() {
2367                write_ioslices(out, &iov)?;
2368            }
2369            return Ok(());
2370        }
2371    }
2372
2373    // Flush remaining contiguous run
2374    if run_start < data.len() {
2375        iov.push(IoSlice::new(&data[run_start..]));
2376        if !data.is_empty() && *data.last().unwrap() != line_delim {
2377            iov.push(IoSlice::new(&newline_buf));
2378        }
2379    }
2380    if !iov.is_empty() {
2381        write_ioslices(out, &iov)?;
2382    }
2383    Ok(())
2384}
2385
2386/// Process a chunk of data for single-field extraction.
2387fn process_single_field_chunk(
2388    data: &[u8],
2389    delim: u8,
2390    target_idx: usize,
2391    line_delim: u8,
2392    suppress: bool,
2393    buf: &mut Vec<u8>,
2394) {
2395    // Pre-reserve chunk capacity to eliminate per-line reserve overhead.
2396    buf.reserve(data.len());
2397    let mut start = 0;
2398    for end_pos in memchr_iter(line_delim, data) {
2399        let line = &data[start..end_pos];
2400        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
2401        start = end_pos + 1;
2402    }
2403    if start < data.len() {
2404        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
2405    }
2406}
2407
2408/// Extract a single field from one line.
2409/// For short lines (< 256 bytes), uses direct scalar scanning to avoid memchr overhead.
2410/// For longer lines, uses memchr for SIMD-accelerated scanning.
2411/// Raw pointer arithmetic eliminates per-field bounds checking.
2412#[inline(always)]
2413fn extract_single_field_line(
2414    line: &[u8],
2415    delim: u8,
2416    target_idx: usize,
2417    line_delim: u8,
2418    suppress: bool,
2419    buf: &mut Vec<u8>,
2420) {
2421    let len = line.len();
2422    if len == 0 {
2423        if !suppress {
2424            unsafe { buf_push(buf, line_delim) };
2425        }
2426        return;
2427    }
2428
2429    // Note: no per-line buf.reserve — process_single_field_chunk already reserves data.len()
2430    let base = line.as_ptr();
2431
2432    // Ultra-fast path for first field: single memchr
2433    if target_idx == 0 {
2434        match memchr::memchr(delim, line) {
2435            Some(pos) => unsafe {
2436                buf_extend_byte(buf, std::slice::from_raw_parts(base, pos), line_delim);
2437            },
2438            None => {
2439                if !suppress {
2440                    unsafe {
2441                        buf_extend_byte(buf, line, line_delim);
2442                    }
2443                }
2444            }
2445        }
2446        return;
2447    }
2448
2449    // Use memchr SIMD for all line sizes (faster than scalar even for short lines)
2450    let mut field_start = 0;
2451    let mut field_idx = 0;
2452    let mut has_delim = false;
2453
2454    for pos in memchr_iter(delim, line) {
2455        has_delim = true;
2456        if field_idx == target_idx {
2457            unsafe {
2458                buf_extend_byte(
2459                    buf,
2460                    std::slice::from_raw_parts(base.add(field_start), pos - field_start),
2461                    line_delim,
2462                );
2463            }
2464            return;
2465        }
2466        field_idx += 1;
2467        field_start = pos + 1;
2468    }
2469
2470    if !has_delim {
2471        if !suppress {
2472            unsafe {
2473                buf_extend_byte(buf, line, line_delim);
2474            }
2475        }
2476        return;
2477    }
2478
2479    if field_idx == target_idx {
2480        unsafe {
2481            buf_extend_byte(
2482                buf,
2483                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2484                line_delim,
2485            );
2486        }
2487    } else {
2488        unsafe { buf_push(buf, line_delim) };
2489    }
2490}
2491
2492/// Extract fields from a single line into the output buffer.
2493/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
2494/// Raw pointer arithmetic eliminates per-field bounds checking.
2495#[inline(always)]
2496fn extract_fields_to_buf(
2497    line: &[u8],
2498    delim: u8,
2499    ranges: &[Range],
2500    output_delim: &[u8],
2501    suppress: bool,
2502    max_field: usize,
2503    field_mask: u64,
2504    line_delim: u8,
2505    buf: &mut Vec<u8>,
2506    complement: bool,
2507) {
2508    let len = line.len();
2509
2510    if len == 0 {
2511        if !suppress {
2512            buf.push(line_delim);
2513        }
2514        return;
2515    }
2516
2517    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
2518    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
2519    let needed = len + output_delim.len() * 16 + 1;
2520    if buf.capacity() - buf.len() < needed {
2521        buf.reserve(needed);
2522    }
2523
2524    let base = line.as_ptr();
2525    let mut field_num: usize = 1;
2526    let mut field_start: usize = 0;
2527    let mut first_output = true;
2528    let mut has_delim = false;
2529
2530    // Use memchr SIMD for all line sizes
2531    for delim_pos in memchr_iter(delim, line) {
2532        has_delim = true;
2533
2534        if is_selected(field_num, field_mask, ranges, complement) {
2535            if !first_output {
2536                unsafe { buf_extend(buf, output_delim) };
2537            }
2538            unsafe {
2539                buf_extend(
2540                    buf,
2541                    std::slice::from_raw_parts(base.add(field_start), delim_pos - field_start),
2542                )
2543            };
2544            first_output = false;
2545        }
2546
2547        field_num += 1;
2548        field_start = delim_pos + 1;
2549
2550        if field_num > max_field {
2551            break;
2552        }
2553    }
2554
2555    // Last field
2556    if (field_num <= max_field || complement)
2557        && has_delim
2558        && is_selected(field_num, field_mask, ranges, complement)
2559    {
2560        if !first_output {
2561            unsafe { buf_extend(buf, output_delim) };
2562        }
2563        unsafe {
2564            buf_extend(
2565                buf,
2566                std::slice::from_raw_parts(base.add(field_start), len - field_start),
2567            )
2568        };
2569        first_output = false;
2570    }
2571
2572    if !first_output {
2573        unsafe { buf_push(buf, line_delim) };
2574    } else if !has_delim {
2575        if !suppress {
2576            unsafe {
2577                buf_extend(buf, line);
2578                buf_push(buf, line_delim);
2579            }
2580        }
2581    } else {
2582        unsafe { buf_push(buf, line_delim) };
2583    }
2584}
2585
2586// ── Fast path: byte/char extraction with batched output ──────────────────
2587
2588/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
2589/// Zero-copy: writes directly from the source data using output runs.
2590/// For lines shorter than max_bytes, the output is identical to the input,
2591/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
2592fn process_bytes_from_start(
2593    data: &[u8],
2594    max_bytes: usize,
2595    line_delim: u8,
2596    out: &mut impl Write,
2597) -> io::Result<()> {
2598    // For small data (< PARALLEL_THRESHOLD): check if all lines fit for zero-copy passthrough.
2599    // The sequential scan + write_all is competitive with per-line processing for small data.
2600    //
2601    // For large data (>= PARALLEL_THRESHOLD): skip the all_fit scan entirely.
2602    // The scan is sequential (~1.7ms for 10MB at memchr speed) while parallel per-line
2603    // processing is much faster (~0.5ms for 10MB with 4 threads). Even when all lines fit,
2604    // the parallel copy + write is faster than sequential scan + zero-copy write.
2605    if data.len() < PARALLEL_THRESHOLD && max_bytes > 0 && max_bytes < usize::MAX {
2606        let mut start = 0;
2607        let mut all_fit = true;
2608        for pos in memchr_iter(line_delim, data) {
2609            if pos - start > max_bytes {
2610                all_fit = false;
2611                break;
2612            }
2613            start = pos + 1;
2614        }
2615        // Check last line (no trailing delimiter)
2616        if all_fit && start < data.len() && data.len() - start > max_bytes {
2617            all_fit = false;
2618        }
2619        if all_fit {
2620            // All lines fit: output = input. Handle missing trailing delimiter.
2621            if !data.is_empty() && data[data.len() - 1] == line_delim {
2622                return out.write_all(data);
2623            } else if !data.is_empty() {
2624                out.write_all(data)?;
2625                return out.write_all(&[line_delim]);
2626            }
2627            return Ok(());
2628        }
2629    }
2630
2631    if data.len() >= PARALLEL_THRESHOLD {
2632        let chunks = split_for_scope(data, line_delim);
2633        let n = chunks.len();
2634        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2635        rayon::scope(|s| {
2636            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2637                s.spawn(move |_| {
2638                    // Output can be up to input size (when all lines fit).
2639                    // Reserve full chunk size to avoid reallocation.
2640                    result.reserve(chunk.len());
2641                    bytes_from_start_chunk(chunk, max_bytes, line_delim, result);
2642                });
2643            }
2644        });
2645        // Use write_vectored (writev) to batch N writes into fewer syscalls
2646        let slices: Vec<IoSlice> = results
2647            .iter()
2648            .filter(|r| !r.is_empty())
2649            .map(|r| IoSlice::new(r))
2650            .collect();
2651        write_ioslices(out, &slices)?;
2652    } else {
2653        // For moderate max_bytes, the buffer path is faster than writev zero-copy
2654        // because every line gets truncated, creating 3 IoSlice entries per line.
2655        // Copying max_bytes+1 bytes into a contiguous buffer is cheaper than
2656        // managing millions of IoSlice entries through the kernel.
2657        // Threshold at 512 covers common byte-range benchmarks like -b1-100.
2658        if max_bytes <= 512 {
2659            // Estimate output size without scanning: output <= data.len(),
2660            // typically ~data.len()/4 for short max_bytes on longer lines.
2661            let est_out = (data.len() / 4).max(max_bytes + 2);
2662            let mut buf = Vec::with_capacity(est_out.min(data.len()));
2663            bytes_from_start_chunk(data, max_bytes, line_delim, &mut buf);
2664            if !buf.is_empty() {
2665                out.write_all(&buf)?;
2666            }
2667        } else {
2668            // Zero-copy path: track contiguous output runs and write directly from source.
2669            // For lines <= max_bytes, we include them as-is (no copy needed).
2670            // For lines > max_bytes, we flush the run, write the truncated line, start new run.
2671            bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
2672        }
2673    }
2674    Ok(())
2675}
2676
2677/// Zero-copy byte-prefix extraction using writev: builds IoSlice entries pointing
2678/// directly into the source data, flushing in MAX_IOV-sized batches.
2679/// Lines shorter than max_bytes stay in contiguous runs. Lines needing truncation
2680/// produce two IoSlices (truncated data + newline).
2681#[inline]
2682fn bytes_from_start_zerocopy(
2683    data: &[u8],
2684    max_bytes: usize,
2685    line_delim: u8,
2686    out: &mut impl Write,
2687) -> io::Result<()> {
2688    let newline_buf: [u8; 1] = [line_delim];
2689    let mut iov: Vec<IoSlice> = Vec::with_capacity(MAX_IOV);
2690    let mut start = 0;
2691    let mut run_start: usize = 0;
2692
2693    for pos in memchr_iter(line_delim, data) {
2694        let line_len = pos - start;
2695        if line_len > max_bytes {
2696            // This line needs truncation
2697            if run_start < start {
2698                iov.push(IoSlice::new(&data[run_start..start]));
2699            }
2700            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2701            iov.push(IoSlice::new(&newline_buf));
2702            run_start = pos + 1;
2703
2704            if iov.len() >= MAX_IOV - 2 {
2705                write_ioslices(out, &iov)?;
2706                iov.clear();
2707            }
2708        }
2709        start = pos + 1;
2710    }
2711    // Handle last line without terminator
2712    if start < data.len() {
2713        let line_len = data.len() - start;
2714        if line_len > max_bytes {
2715            if run_start < start {
2716                iov.push(IoSlice::new(&data[run_start..start]));
2717            }
2718            iov.push(IoSlice::new(&data[start..start + max_bytes]));
2719            iov.push(IoSlice::new(&newline_buf));
2720            if !iov.is_empty() {
2721                write_ioslices(out, &iov)?;
2722            }
2723            return Ok(());
2724        }
2725    }
2726    // Flush remaining contiguous run
2727    if run_start < data.len() {
2728        iov.push(IoSlice::new(&data[run_start..]));
2729        if !data.is_empty() && *data.last().unwrap() != line_delim {
2730            iov.push(IoSlice::new(&newline_buf));
2731        }
2732    }
2733    if !iov.is_empty() {
2734        write_ioslices(out, &iov)?;
2735    }
2736    Ok(())
2737}
2738
2739/// Process a chunk for from-start byte range extraction (parallel path).
2740/// Uses unsafe appends to eliminate bounds checking in the hot loop.
2741/// Pre-reserves data.len() (output never exceeds input), then uses a single
2742/// write pointer with deferred set_len — no per-line capacity checks.
2743#[inline]
2744fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2745    // Output is always <= input size (we only truncate, never expand).
2746    // Single reserve eliminates ALL per-line capacity checks.
2747    buf.reserve(data.len());
2748
2749    let src = data.as_ptr();
2750    let dst_base = buf.as_mut_ptr();
2751    let mut wp = buf.len();
2752    let mut start = 0;
2753
2754    for pos in memchr_iter(line_delim, data) {
2755        let line_len = pos - start;
2756        let take = line_len.min(max_bytes);
2757        unsafe {
2758            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2759            *dst_base.add(wp + take) = line_delim;
2760        }
2761        wp += take + 1;
2762        start = pos + 1;
2763    }
2764    // Handle last line without terminator
2765    if start < data.len() {
2766        let line_len = data.len() - start;
2767        let take = line_len.min(max_bytes);
2768        unsafe {
2769            std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take);
2770            *dst_base.add(wp + take) = line_delim;
2771        }
2772        wp += take + 1;
2773    }
2774    unsafe { buf.set_len(wp) };
2775}
2776
2777/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
2778fn process_bytes_from_offset(
2779    data: &[u8],
2780    skip_bytes: usize,
2781    line_delim: u8,
2782    out: &mut impl Write,
2783) -> io::Result<()> {
2784    if data.len() >= PARALLEL_THRESHOLD {
2785        let chunks = split_for_scope(data, line_delim);
2786        let n = chunks.len();
2787        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2788        rayon::scope(|s| {
2789            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2790                s.spawn(move |_| {
2791                    result.reserve(chunk.len());
2792                    bytes_from_offset_chunk(chunk, skip_bytes, line_delim, result);
2793                });
2794            }
2795        });
2796        // Use write_vectored (writev) to batch N writes into fewer syscalls
2797        let slices: Vec<IoSlice> = results
2798            .iter()
2799            .filter(|r| !r.is_empty())
2800            .map(|r| IoSlice::new(r))
2801            .collect();
2802        write_ioslices(out, &slices)?;
2803    } else {
2804        // Zero-copy: write suffix of each line directly from source
2805        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
2806    }
2807    Ok(())
2808}
2809
2810/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
2811/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
2812/// reducing syscall overhead from 2 write_all calls per line to batched writev.
2813#[inline]
2814fn bytes_from_offset_zerocopy(
2815    data: &[u8],
2816    skip_bytes: usize,
2817    line_delim: u8,
2818    out: &mut impl Write,
2819) -> io::Result<()> {
2820    let delim_buf = [line_delim];
2821    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
2822
2823    let mut start = 0;
2824    for pos in memchr_iter(line_delim, data) {
2825        let line_len = pos - start;
2826        if line_len > skip_bytes {
2827            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
2828        }
2829        iov.push(IoSlice::new(&delim_buf));
2830        // Flush when approaching MAX_IOV to avoid oversized writev
2831        if iov.len() >= MAX_IOV - 1 {
2832            write_ioslices(out, &iov)?;
2833            iov.clear();
2834        }
2835        start = pos + 1;
2836    }
2837    if start < data.len() {
2838        let line_len = data.len() - start;
2839        if line_len > skip_bytes {
2840            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
2841        }
2842        iov.push(IoSlice::new(&delim_buf));
2843    }
2844    if !iov.is_empty() {
2845        write_ioslices(out, &iov)?;
2846    }
2847    Ok(())
2848}
2849
2850/// Process a chunk for from-offset byte range extraction.
2851/// Single reserve + deferred set_len for zero per-line overhead.
2852#[inline]
2853fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
2854    buf.reserve(data.len());
2855
2856    let src = data.as_ptr();
2857    let dst_base = buf.as_mut_ptr();
2858    let mut wp = buf.len();
2859    let mut start = 0;
2860
2861    for pos in memchr_iter(line_delim, data) {
2862        let line_len = pos - start;
2863        if line_len > skip_bytes {
2864            let take = line_len - skip_bytes;
2865            unsafe {
2866                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2867            }
2868            wp += take;
2869        }
2870        unsafe {
2871            *dst_base.add(wp) = line_delim;
2872        }
2873        wp += 1;
2874        start = pos + 1;
2875    }
2876    if start < data.len() {
2877        let line_len = data.len() - start;
2878        if line_len > skip_bytes {
2879            let take = line_len - skip_bytes;
2880            unsafe {
2881                std::ptr::copy_nonoverlapping(src.add(start + skip_bytes), dst_base.add(wp), take);
2882            }
2883            wp += take;
2884        }
2885        unsafe {
2886            *dst_base.add(wp) = line_delim;
2887        }
2888        wp += 1;
2889    }
2890    unsafe { buf.set_len(wp) };
2891}
2892
2893/// Fast path for `cut -bN-M` where N > 1 and M < MAX: extract bytes N through M per line.
2894fn process_bytes_mid_range(
2895    data: &[u8],
2896    start_byte: usize,
2897    end_byte: usize,
2898    line_delim: u8,
2899    out: &mut impl Write,
2900) -> io::Result<()> {
2901    let skip = start_byte.saturating_sub(1);
2902
2903    if data.len() >= PARALLEL_THRESHOLD {
2904        let chunks = split_for_scope(data, line_delim);
2905        let n = chunks.len();
2906        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2907        rayon::scope(|s| {
2908            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2909                s.spawn(move |_| {
2910                    result.reserve(chunk.len());
2911                    bytes_mid_range_chunk(chunk, skip, end_byte, line_delim, result);
2912                });
2913            }
2914        });
2915        let slices: Vec<IoSlice> = results
2916            .iter()
2917            .filter(|r| !r.is_empty())
2918            .map(|r| IoSlice::new(r))
2919            .collect();
2920        write_ioslices(out, &slices)?;
2921    } else {
2922        let mut buf = Vec::with_capacity(data.len());
2923        bytes_mid_range_chunk(data, skip, end_byte, line_delim, &mut buf);
2924        if !buf.is_empty() {
2925            out.write_all(&buf)?;
2926        }
2927    }
2928    Ok(())
2929}
2930
2931/// Process a chunk for mid-range byte extraction.
2932/// For each line, output bytes skip..min(line_len, end_byte).
2933/// Single reserve + deferred set_len.
2934#[inline]
2935fn bytes_mid_range_chunk(
2936    data: &[u8],
2937    skip: usize,
2938    end_byte: usize,
2939    line_delim: u8,
2940    buf: &mut Vec<u8>,
2941) {
2942    buf.reserve(data.len());
2943
2944    let src = data.as_ptr();
2945    let dst_base = buf.as_mut_ptr();
2946    let mut wp = buf.len();
2947    let mut start = 0;
2948
2949    for pos in memchr_iter(line_delim, data) {
2950        let line_len = pos - start;
2951        if line_len > skip {
2952            let take_end = line_len.min(end_byte);
2953            let take = take_end - skip;
2954            unsafe {
2955                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2956            }
2957            wp += take;
2958        }
2959        unsafe {
2960            *dst_base.add(wp) = line_delim;
2961        }
2962        wp += 1;
2963        start = pos + 1;
2964    }
2965    if start < data.len() {
2966        let line_len = data.len() - start;
2967        if line_len > skip {
2968            let take_end = line_len.min(end_byte);
2969            let take = take_end - skip;
2970            unsafe {
2971                std::ptr::copy_nonoverlapping(src.add(start + skip), dst_base.add(wp), take);
2972            }
2973            wp += take;
2974        }
2975        unsafe {
2976            *dst_base.add(wp) = line_delim;
2977        }
2978        wp += 1;
2979    }
2980    unsafe { buf.set_len(wp) };
2981}
2982
2983/// Fast path for `--complement -bN-M`: output bytes 1..N-1 and M+1..end per line.
2984fn process_bytes_complement_mid(
2985    data: &[u8],
2986    skip_start: usize,
2987    skip_end: usize,
2988    line_delim: u8,
2989    out: &mut impl Write,
2990) -> io::Result<()> {
2991    let prefix_bytes = skip_start - 1; // bytes before the skip region
2992    if data.len() >= PARALLEL_THRESHOLD {
2993        let chunks = split_for_scope(data, line_delim);
2994        let n = chunks.len();
2995        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
2996        rayon::scope(|s| {
2997            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
2998                s.spawn(move |_| {
2999                    result.reserve(chunk.len());
3000                    bytes_complement_mid_chunk(chunk, prefix_bytes, skip_end, line_delim, result);
3001                });
3002            }
3003        });
3004        let slices: Vec<IoSlice> = results
3005            .iter()
3006            .filter(|r| !r.is_empty())
3007            .map(|r| IoSlice::new(r))
3008            .collect();
3009        write_ioslices(out, &slices)?;
3010    } else {
3011        let mut buf = Vec::with_capacity(data.len());
3012        bytes_complement_mid_chunk(data, prefix_bytes, skip_end, line_delim, &mut buf);
3013        if !buf.is_empty() {
3014            out.write_all(&buf)?;
3015        }
3016    }
3017    Ok(())
3018}
3019
3020/// Process a chunk for complement mid-range byte extraction.
3021/// For each line: output bytes 0..prefix_bytes, then bytes skip_end..line_len.
3022#[inline]
3023fn bytes_complement_mid_chunk(
3024    data: &[u8],
3025    prefix_bytes: usize,
3026    skip_end: usize,
3027    line_delim: u8,
3028    buf: &mut Vec<u8>,
3029) {
3030    buf.reserve(data.len());
3031
3032    let src = data.as_ptr();
3033    let dst_base = buf.as_mut_ptr();
3034    let mut wp = buf.len();
3035    let mut start = 0;
3036
3037    for pos in memchr_iter(line_delim, data) {
3038        let line_len = pos - start;
3039        // Copy prefix (bytes before skip region)
3040        let take_prefix = prefix_bytes.min(line_len);
3041        if take_prefix > 0 {
3042            unsafe {
3043                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3044            }
3045            wp += take_prefix;
3046        }
3047        // Copy suffix (bytes after skip region)
3048        if line_len > skip_end {
3049            let suffix_len = line_len - skip_end;
3050            unsafe {
3051                std::ptr::copy_nonoverlapping(
3052                    src.add(start + skip_end),
3053                    dst_base.add(wp),
3054                    suffix_len,
3055                );
3056            }
3057            wp += suffix_len;
3058        }
3059        unsafe {
3060            *dst_base.add(wp) = line_delim;
3061        }
3062        wp += 1;
3063        start = pos + 1;
3064    }
3065    if start < data.len() {
3066        let line_len = data.len() - start;
3067        let take_prefix = prefix_bytes.min(line_len);
3068        if take_prefix > 0 {
3069            unsafe {
3070                std::ptr::copy_nonoverlapping(src.add(start), dst_base.add(wp), take_prefix);
3071            }
3072            wp += take_prefix;
3073        }
3074        if line_len > skip_end {
3075            let suffix_len = line_len - skip_end;
3076            unsafe {
3077                std::ptr::copy_nonoverlapping(
3078                    src.add(start + skip_end),
3079                    dst_base.add(wp),
3080                    suffix_len,
3081                );
3082            }
3083            wp += suffix_len;
3084        }
3085        unsafe {
3086            *dst_base.add(wp) = line_delim;
3087        }
3088        wp += 1;
3089    }
3090    unsafe { buf.set_len(wp) };
3091}
3092
3093/// Optimized byte/char extraction with batched output and parallel processing.
3094fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3095    let line_delim = cfg.line_delim;
3096    let ranges = cfg.ranges;
3097    let complement = cfg.complement;
3098    let output_delim = cfg.output_delim;
3099
3100    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
3101    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
3102        let max_bytes = ranges[0].end;
3103        if max_bytes < usize::MAX {
3104            return process_bytes_from_start(data, max_bytes, line_delim, out);
3105        }
3106    }
3107
3108    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
3109    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
3110        let skip_bytes = ranges[0].start.saturating_sub(1);
3111        if skip_bytes > 0 {
3112            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
3113        }
3114    }
3115
3116    // Fast path: single mid-range (e.g., cut -b5-100)
3117    if !complement
3118        && ranges.len() == 1
3119        && ranges[0].start > 1
3120        && ranges[0].end < usize::MAX
3121        && output_delim.is_empty()
3122    {
3123        return process_bytes_mid_range(data, ranges[0].start, ranges[0].end, line_delim, out);
3124    }
3125
3126    // Fast path: complement of single from-start range (e.g., --complement -b1-100 = output bytes 101+)
3127    if complement
3128        && ranges.len() == 1
3129        && ranges[0].start == 1
3130        && ranges[0].end < usize::MAX
3131        && output_delim.is_empty()
3132    {
3133        return process_bytes_from_offset(data, ranges[0].end, line_delim, out);
3134    }
3135
3136    // Fast path: complement of single from-offset range (e.g., --complement -b5- = output bytes 1-4)
3137    if complement
3138        && ranges.len() == 1
3139        && ranges[0].end == usize::MAX
3140        && ranges[0].start > 1
3141        && output_delim.is_empty()
3142    {
3143        let max_bytes = ranges[0].start - 1;
3144        return process_bytes_from_start(data, max_bytes, line_delim, out);
3145    }
3146
3147    // Fast path: complement of single mid-range (e.g., --complement -b5-100 = bytes 1-4,101+)
3148    if complement
3149        && ranges.len() == 1
3150        && ranges[0].start > 1
3151        && ranges[0].end < usize::MAX
3152        && output_delim.is_empty()
3153    {
3154        return process_bytes_complement_mid(data, ranges[0].start, ranges[0].end, line_delim, out);
3155    }
3156
3157    if data.len() >= PARALLEL_THRESHOLD {
3158        let chunks = split_for_scope(data, line_delim);
3159        let n = chunks.len();
3160        let mut results: Vec<Vec<u8>> = (0..n).map(|_| Vec::new()).collect();
3161        rayon::scope(|s| {
3162            for (chunk, result) in chunks.iter().zip(results.iter_mut()) {
3163                s.spawn(move |_| {
3164                    result.reserve(chunk.len());
3165                    process_bytes_chunk(
3166                        chunk,
3167                        ranges,
3168                        complement,
3169                        output_delim,
3170                        line_delim,
3171                        result,
3172                    );
3173                });
3174            }
3175        });
3176        let slices: Vec<IoSlice> = results
3177            .iter()
3178            .filter(|r| !r.is_empty())
3179            .map(|r| IoSlice::new(r))
3180            .collect();
3181        write_ioslices(out, &slices)?;
3182    } else {
3183        let mut buf = Vec::with_capacity(data.len());
3184        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
3185        if !buf.is_empty() {
3186            out.write_all(&buf)?;
3187        }
3188    }
3189    Ok(())
3190}
3191
3192/// Process a chunk of data for byte/char extraction.
3193/// Uses raw pointer arithmetic for the newline scan.
3194/// Complement single-range fast path: compute complement ranges once, then use
3195/// the non-complement multi-range path which is more cache-friendly.
3196fn process_bytes_chunk(
3197    data: &[u8],
3198    ranges: &[Range],
3199    complement: bool,
3200    output_delim: &[u8],
3201    line_delim: u8,
3202    buf: &mut Vec<u8>,
3203) {
3204    buf.reserve(data.len());
3205    let base = data.as_ptr();
3206    let mut start = 0;
3207    for end_pos in memchr_iter(line_delim, data) {
3208        let line = unsafe { std::slice::from_raw_parts(base.add(start), end_pos - start) };
3209        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3210        unsafe { buf_push(buf, line_delim) };
3211        start = end_pos + 1;
3212    }
3213    if start < data.len() {
3214        let line = unsafe { std::slice::from_raw_parts(base.add(start), data.len() - start) };
3215        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
3216        unsafe { buf_push(buf, line_delim) };
3217    }
3218}
3219
3220/// Extract byte ranges from a line into the output buffer.
3221/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
3222/// Raw pointer arithmetic eliminates per-range bounds checking.
3223#[inline(always)]
3224fn cut_bytes_to_buf(
3225    line: &[u8],
3226    ranges: &[Range],
3227    complement: bool,
3228    output_delim: &[u8],
3229    buf: &mut Vec<u8>,
3230) {
3231    let len = line.len();
3232    let base = line.as_ptr();
3233    let mut first_range = true;
3234
3235    // Reserve worst case: full line + delimiters between ranges
3236    let needed = len + output_delim.len() * ranges.len() + 1;
3237    if buf.capacity() - buf.len() < needed {
3238        buf.reserve(needed);
3239    }
3240
3241    if complement {
3242        let mut pos: usize = 1;
3243        for r in ranges {
3244            let rs = r.start;
3245            let re = r.end.min(len);
3246            if pos < rs {
3247                if !first_range && !output_delim.is_empty() {
3248                    unsafe { buf_extend(buf, output_delim) };
3249                }
3250                unsafe { buf_extend(buf, std::slice::from_raw_parts(base.add(pos - 1), rs - pos)) };
3251                first_range = false;
3252            }
3253            pos = re + 1;
3254            if pos > len {
3255                break;
3256            }
3257        }
3258        if pos <= len {
3259            if !first_range && !output_delim.is_empty() {
3260                unsafe { buf_extend(buf, output_delim) };
3261            }
3262            unsafe {
3263                buf_extend(
3264                    buf,
3265                    std::slice::from_raw_parts(base.add(pos - 1), len - pos + 1),
3266                )
3267            };
3268        }
3269    } else if output_delim.is_empty() && ranges.len() == 1 {
3270        // Ultra-fast path: single range, no output delimiter
3271        let start = ranges[0].start.saturating_sub(1);
3272        let end = ranges[0].end.min(len);
3273        if start < len {
3274            unsafe {
3275                buf_extend(
3276                    buf,
3277                    std::slice::from_raw_parts(base.add(start), end - start),
3278                )
3279            };
3280        }
3281    } else {
3282        for r in ranges {
3283            let start = r.start.saturating_sub(1);
3284            let end = r.end.min(len);
3285            if start >= len {
3286                break;
3287            }
3288            if !first_range && !output_delim.is_empty() {
3289                unsafe { buf_extend(buf, output_delim) };
3290            }
3291            unsafe {
3292                buf_extend(
3293                    buf,
3294                    std::slice::from_raw_parts(base.add(start), end - start),
3295                )
3296            };
3297            first_range = false;
3298        }
3299    }
3300}
3301
3302// ── Public API ───────────────────────────────────────────────────────────
3303
3304/// Cut fields from a line using a delimiter. Writes to `out`.
3305#[inline]
3306pub fn cut_fields(
3307    line: &[u8],
3308    delim: u8,
3309    ranges: &[Range],
3310    complement: bool,
3311    output_delim: &[u8],
3312    suppress_no_delim: bool,
3313    out: &mut impl Write,
3314) -> io::Result<bool> {
3315    if memchr::memchr(delim, line).is_none() {
3316        if !suppress_no_delim {
3317            out.write_all(line)?;
3318            return Ok(true);
3319        }
3320        return Ok(false);
3321    }
3322
3323    let mut field_num: usize = 1;
3324    let mut field_start: usize = 0;
3325    let mut first_output = true;
3326
3327    for delim_pos in memchr_iter(delim, line) {
3328        let selected = in_ranges(ranges, field_num) != complement;
3329        if selected {
3330            if !first_output {
3331                out.write_all(output_delim)?;
3332            }
3333            out.write_all(&line[field_start..delim_pos])?;
3334            first_output = false;
3335        }
3336        field_start = delim_pos + 1;
3337        field_num += 1;
3338    }
3339
3340    let selected = in_ranges(ranges, field_num) != complement;
3341    if selected {
3342        if !first_output {
3343            out.write_all(output_delim)?;
3344        }
3345        out.write_all(&line[field_start..])?;
3346    }
3347
3348    Ok(true)
3349}
3350
3351/// Cut bytes/chars from a line. Writes selected bytes to `out`.
3352#[inline]
3353pub fn cut_bytes(
3354    line: &[u8],
3355    ranges: &[Range],
3356    complement: bool,
3357    output_delim: &[u8],
3358    out: &mut impl Write,
3359) -> io::Result<bool> {
3360    let mut first_range = true;
3361
3362    if complement {
3363        let len = line.len();
3364        let mut comp_ranges = Vec::new();
3365        let mut pos: usize = 1;
3366        for r in ranges {
3367            let rs = r.start;
3368            let re = r.end.min(len);
3369            if pos < rs {
3370                comp_ranges.push((pos, rs - 1));
3371            }
3372            pos = re + 1;
3373            if pos > len {
3374                break;
3375            }
3376        }
3377        if pos <= len {
3378            comp_ranges.push((pos, len));
3379        }
3380        for &(s, e) in &comp_ranges {
3381            if !first_range && !output_delim.is_empty() {
3382                out.write_all(output_delim)?;
3383            }
3384            out.write_all(&line[s - 1..e])?;
3385            first_range = false;
3386        }
3387    } else {
3388        for r in ranges {
3389            let start = r.start.saturating_sub(1);
3390            let end = r.end.min(line.len());
3391            if start >= line.len() {
3392                break;
3393            }
3394            if !first_range && !output_delim.is_empty() {
3395                out.write_all(output_delim)?;
3396            }
3397            out.write_all(&line[start..end])?;
3398            first_range = false;
3399        }
3400    }
3401    Ok(true)
3402}
3403
3404/// In-place field 1 extraction: modifies `data` buffer directly, returns new length.
3405/// Output is always <= input (we remove everything after first delimiter per line).
3406/// Avoids intermediate Vec allocation + BufWriter copy, saving ~10MB of memory
3407/// bandwidth for 10MB input. Requires owned mutable data (not mmap).
3408///
3409/// Lines without delimiter pass through unchanged (unless suppress=true).
3410/// Lines with delimiter: keep bytes before delimiter + newline.
3411pub fn cut_field1_inplace(data: &mut [u8], delim: u8, line_delim: u8, suppress: bool) -> usize {
3412    let len = data.len();
3413    let mut wp: usize = 0;
3414    let mut rp: usize = 0;
3415
3416    while rp < len {
3417        match memchr::memchr2(delim, line_delim, &data[rp..]) {
3418            None => {
3419                // Rest is partial line, no delimiter
3420                if suppress {
3421                    // suppress: skip lines without delimiter
3422                    break;
3423                }
3424                let remaining = len - rp;
3425                if wp != rp {
3426                    data.copy_within(rp..len, wp);
3427                }
3428                wp += remaining;
3429                break;
3430            }
3431            Some(offset) => {
3432                let actual = rp + offset;
3433                if data[actual] == line_delim {
3434                    // No delimiter on this line
3435                    if suppress {
3436                        // Skip this line entirely
3437                        rp = actual + 1;
3438                    } else {
3439                        // Output entire line including newline
3440                        let chunk_len = actual + 1 - rp;
3441                        if wp != rp {
3442                            data.copy_within(rp..actual + 1, wp);
3443                        }
3444                        wp += chunk_len;
3445                        rp = actual + 1;
3446                    }
3447                } else {
3448                    // Delimiter found: output field 1 (up to delimiter) + newline
3449                    let field_len = actual - rp;
3450                    if wp != rp && field_len > 0 {
3451                        data.copy_within(rp..actual, wp);
3452                    }
3453                    wp += field_len;
3454                    data[wp] = line_delim;
3455                    wp += 1;
3456                    // Skip to next newline
3457                    match memchr::memchr(line_delim, &data[actual + 1..]) {
3458                        None => {
3459                            rp = len;
3460                        }
3461                        Some(nl_off) => {
3462                            rp = actual + 1 + nl_off + 1;
3463                        }
3464                    }
3465                }
3466            }
3467        }
3468    }
3469    wp
3470}
3471
3472/// Process a full data buffer (from mmap or read) with cut operation.
3473pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
3474    match cfg.mode {
3475        CutMode::Fields => process_fields_fast(data, cfg, out),
3476        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
3477    }
3478}
3479
3480/// Process input from a reader (for stdin).
3481/// Uses batch reading: reads large chunks (16MB), then processes them in batch
3482/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
3483/// 16MB chunks mean a 10MB piped input is consumed in a single batch.
3484pub fn process_cut_reader<R: BufRead>(
3485    mut reader: R,
3486    cfg: &CutConfig,
3487    out: &mut impl Write,
3488) -> io::Result<()> {
3489    const CHUNK_SIZE: usize = 16 * 1024 * 1024; // 16MB read chunks
3490    let line_delim = cfg.line_delim;
3491
3492    // Read large chunks and process in batch.
3493    // We keep a buffer; after processing complete lines, we shift leftover to the front.
3494    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
3495
3496    loop {
3497        // Read up to CHUNK_SIZE bytes
3498        buf.reserve(CHUNK_SIZE);
3499        let read_start = buf.len();
3500        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
3501        let n = read_fully(&mut reader, &mut buf[read_start..])?;
3502        buf.truncate(read_start + n);
3503
3504        if buf.is_empty() {
3505            break;
3506        }
3507
3508        if n == 0 {
3509            // EOF with leftover data (last line without terminator)
3510            process_cut_data(&buf, cfg, out)?;
3511            break;
3512        }
3513
3514        // Find the last line delimiter in the buffer so we process complete lines
3515        let process_end = match memchr::memrchr(line_delim, &buf) {
3516            Some(pos) => pos + 1,
3517            None => {
3518                // No line delimiter found — keep accumulating
3519                continue;
3520            }
3521        };
3522
3523        // Process the complete lines using the fast batch path
3524        process_cut_data(&buf[..process_end], cfg, out)?;
3525
3526        // Shift leftover to the front for next iteration
3527        let leftover_len = buf.len() - process_end;
3528        if leftover_len > 0 {
3529            buf.copy_within(process_end.., 0);
3530        }
3531        buf.truncate(leftover_len);
3532    }
3533
3534    Ok(())
3535}
3536
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `read` until `buf` is full or the reader reports end of input
/// (`Ok(0)`). Reads that fail with `ErrorKind::Interrupted` are retried,
/// including the very first one; any other error is returned immediately.
///
/// Returns the number of bytes stored at the front of `buf`. A value smaller
/// than `buf.len()` means end of input was reached.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            // A signal interrupted the call before any data was transferred.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
3556
3557/// In-place cut processing for mutable data buffers.
3558/// Returns Some(new_length) if in-place processing succeeded, None if not supported
3559/// for the given configuration (caller should fall back to regular processing).
3560///
3561/// In-place avoids allocating intermediate output buffers — the result is written
3562/// directly into the input buffer (output is always <= input for non-complement modes
3563/// with default output delimiter).
3564pub fn process_cut_data_mut(data: &mut [u8], cfg: &CutConfig) -> Option<usize> {
3565    if cfg.complement {
3566        return None;
3567    }
3568
3569    match cfg.mode {
3570        CutMode::Fields => {
3571            // Only handle when output delimiter matches input (single-byte)
3572            if cfg.output_delim.len() != 1 || cfg.output_delim[0] != cfg.delim {
3573                return None;
3574            }
3575            if cfg.delim == cfg.line_delim {
3576                return None;
3577            }
3578            Some(cut_fields_inplace_general(
3579                data,
3580                cfg.delim,
3581                cfg.line_delim,
3582                cfg.ranges,
3583                cfg.suppress_no_delim,
3584            ))
3585        }
3586        CutMode::Bytes | CutMode::Characters => {
3587            if !cfg.output_delim.is_empty() {
3588                return None;
3589            }
3590            Some(cut_bytes_inplace_general(data, cfg.line_delim, cfg.ranges))
3591        }
3592    }
3593}
3594
3595/// In-place generalized field extraction.
3596/// Handles single fields, contiguous ranges, and non-contiguous multi-field patterns.
3597fn cut_fields_inplace_general(
3598    data: &mut [u8],
3599    delim: u8,
3600    line_delim: u8,
3601    ranges: &[Range],
3602    suppress: bool,
3603) -> usize {
3604    // Special case: field 1 only (existing optimized path)
3605    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == 1 {
3606        return cut_field1_inplace(data, delim, line_delim, suppress);
3607    }
3608
3609    let len = data.len();
3610    if len == 0 {
3611        return 0;
3612    }
3613
3614    let max_field = ranges.last().map_or(0, |r| r.end);
3615    let max_delims = max_field.min(64);
3616    let mut wp: usize = 0;
3617    let mut rp: usize = 0;
3618
3619    while rp < len {
3620        let line_end = memchr::memchr(line_delim, &data[rp..])
3621            .map(|p| rp + p)
3622            .unwrap_or(len);
3623        let line_len = line_end - rp;
3624
3625        // Collect delimiter positions (relative to line start)
3626        let mut delim_pos = [0usize; 64];
3627        let mut num_delims: usize = 0;
3628
3629        for pos in memchr_iter(delim, &data[rp..line_end]) {
3630            if num_delims < max_delims {
3631                delim_pos[num_delims] = pos;
3632                num_delims += 1;
3633                if num_delims >= max_delims {
3634                    break;
3635                }
3636            }
3637        }
3638
3639        if num_delims == 0 {
3640            // No delimiter in line
3641            if !suppress {
3642                if wp != rp {
3643                    data.copy_within(rp..line_end, wp);
3644                }
3645                wp += line_len;
3646                if line_end < len {
3647                    data[wp] = line_delim;
3648                    wp += 1;
3649                }
3650            }
3651        } else {
3652            let total_fields = num_delims + 1;
3653            let mut first_output = true;
3654
3655            for r in ranges {
3656                let range_start = r.start;
3657                let range_end = r.end.min(total_fields);
3658                if range_start > total_fields {
3659                    break;
3660                }
3661                for field_num in range_start..=range_end {
3662                    if field_num > total_fields {
3663                        break;
3664                    }
3665
3666                    let field_start = if field_num == 1 {
3667                        0
3668                    } else if field_num - 2 < num_delims {
3669                        delim_pos[field_num - 2] + 1
3670                    } else {
3671                        continue;
3672                    };
3673                    let field_end = if field_num <= num_delims {
3674                        delim_pos[field_num - 1]
3675                    } else {
3676                        line_len
3677                    };
3678
3679                    if !first_output {
3680                        data[wp] = delim;
3681                        wp += 1;
3682                    }
3683                    let flen = field_end - field_start;
3684                    if flen > 0 {
3685                        data.copy_within(rp + field_start..rp + field_start + flen, wp);
3686                        wp += flen;
3687                    }
3688                    first_output = false;
3689                }
3690            }
3691
3692            if !first_output && line_end < len {
3693                data[wp] = line_delim;
3694                wp += 1;
3695            } else if first_output && line_end < len {
3696                // No fields selected but line had delimiters — output empty line
3697                data[wp] = line_delim;
3698                wp += 1;
3699            }
3700        }
3701
3702        rp = if line_end < len { line_end + 1 } else { len };
3703    }
3704
3705    wp
3706}
3707
3708/// In-place byte/char range extraction.
3709fn cut_bytes_inplace_general(data: &mut [u8], line_delim: u8, ranges: &[Range]) -> usize {
3710    let len = data.len();
3711    if len == 0 {
3712        return 0;
3713    }
3714
3715    // Quick check: single range from byte 1 to end = no-op
3716    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end == usize::MAX {
3717        return len;
3718    }
3719
3720    // Single range from byte 1: fast truncation path
3721    if ranges.len() == 1 && ranges[0].start == 1 && ranges[0].end < usize::MAX {
3722        return cut_bytes_from_start_inplace(data, line_delim, ranges[0].end);
3723    }
3724
3725    let mut wp: usize = 0;
3726    let mut rp: usize = 0;
3727
3728    while rp < len {
3729        let line_end = memchr::memchr(line_delim, &data[rp..])
3730            .map(|p| rp + p)
3731            .unwrap_or(len);
3732        let line_len = line_end - rp;
3733
3734        for r in ranges {
3735            let start = r.start.saturating_sub(1);
3736            let end = r.end.min(line_len);
3737            if start >= line_len {
3738                break;
3739            }
3740            let flen = end - start;
3741            if flen > 0 {
3742                data.copy_within(rp + start..rp + start + flen, wp);
3743                wp += flen;
3744            }
3745        }
3746
3747        if line_end < len {
3748            data[wp] = line_delim;
3749            wp += 1;
3750        }
3751
3752        rp = if line_end < len { line_end + 1 } else { len };
3753    }
3754
3755    wp
3756}
3757
3758/// In-place truncation for -b1-N: truncate each line to at most max_bytes.
3759fn cut_bytes_from_start_inplace(data: &mut [u8], line_delim: u8, max_bytes: usize) -> usize {
3760    let len = data.len();
3761
3762    // Quick check: see if all lines fit within max_bytes (common case)
3763    let mut all_fit = true;
3764    let mut start = 0;
3765    for pos in memchr_iter(line_delim, data) {
3766        if pos - start > max_bytes {
3767            all_fit = false;
3768            break;
3769        }
3770        start = pos + 1;
3771    }
3772    if all_fit && start < len && len - start > max_bytes {
3773        all_fit = false;
3774    }
3775    if all_fit {
3776        return len;
3777    }
3778
3779    // Some lines need truncation
3780    let mut wp: usize = 0;
3781    let mut rp: usize = 0;
3782
3783    while rp < len {
3784        let line_end = memchr::memchr(line_delim, &data[rp..])
3785            .map(|p| rp + p)
3786            .unwrap_or(len);
3787        let line_len = line_end - rp;
3788
3789        let take = line_len.min(max_bytes);
3790        if take > 0 && wp != rp {
3791            data.copy_within(rp..rp + take, wp);
3792        }
3793        wp += take;
3794
3795        if line_end < len {
3796            data[wp] = line_delim;
3797            wp += 1;
3798        }
3799
3800        rp = if line_end < len { line_end + 1 } else { len };
3801    }
3802
3803    wp
3804}
3805
/// Cut operation mode: which kind of unit the range list selects.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum CutMode {
    /// Select by byte position within each line.
    Bytes,
    /// Select by character position; dispatched to the same routines as
    /// [`CutMode::Bytes`] in this module.
    Characters,
    /// Select delimiter-separated fields.
    Fields,
}