Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, IoSlice, Write};
4
/// Inputs of at least this many bytes (2 MiB) are split into chunks and
/// processed in parallel; smaller inputs are handled on the calling thread.
const PARALLEL_THRESHOLD: usize = 2 << 20;
7
/// Upper bound on the number of `IoSlice` entries handed to a single
/// vectored write (the Linux default limit for `writev`).
const MAX_IOV: usize = 1 << 10;
10
/// Configuration for cut operations.
///
/// Borrowed, read-only view of the settings for one `cut` invocation; the
/// processing functions in this file read these fields but never modify them.
pub struct CutConfig<'a> {
    /// Selection mode. `CutMode` is declared elsewhere in the crate
    /// (presumably bytes / characters / fields — confirm against its definition).
    pub mode: CutMode,
    /// Selected ranges, 1-based and inclusive. `in_ranges` relies on them being
    /// sorted by start, which is what `parse_ranges` returns.
    pub ranges: &'a [Range],
    /// When true, the selection is inverted: positions NOT covered by `ranges` are output.
    pub complement: bool,
    /// Input field delimiter byte.
    pub delim: u8,
    /// Bytes written between consecutive selected fields in the output.
    pub output_delim: &'a [u8],
    /// When true, lines that contain no `delim` are dropped instead of being
    /// passed through unchanged.
    pub suppress_no_delim: bool,
    /// Line terminator byte, used both to split the input and to end output lines.
    pub line_delim: u8,
}
21
/// A range specification like 1, 3-5, -3, 4-
///
/// Both bounds are inclusive and 1-based.
#[derive(Debug, Clone)]
pub struct Range {
    // 1-based. `parse_ranges` stores 1 when the lower bound is omitted ("-3")
    // and rejects an explicit 0, so values it produces are always >= 1.
    // NOTE(review): an earlier comment said "0 means from beginning"; nothing in
    // the visible code produces or special-cases 0 — confirm no other caller does.
    pub start: usize,
    // 1-based; usize::MAX means "to end of line" (open-ended range such as "4-").
    pub end: usize,
}
28
29/// Parse a LIST specification like "1,3-5,7-" into ranges.
30/// Each range is 1-based. Returns sorted, merged ranges.
31pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
32    let mut ranges = Vec::new();
33
34    for part in spec.split(',') {
35        let part = part.trim();
36        if part.is_empty() {
37            continue;
38        }
39
40        if let Some(idx) = part.find('-') {
41            let left = &part[..idx];
42            let right = &part[idx + 1..];
43
44            let start = if left.is_empty() {
45                1
46            } else {
47                left.parse::<usize>()
48                    .map_err(|_| format!("invalid range: '{}'", part))?
49            };
50
51            let end = if right.is_empty() {
52                usize::MAX
53            } else {
54                right
55                    .parse::<usize>()
56                    .map_err(|_| format!("invalid range: '{}'", part))?
57            };
58
59            if start == 0 {
60                return Err("fields and positions are numbered from 1".to_string());
61            }
62            if start > end {
63                return Err(format!("invalid decreasing range: '{}'", part));
64            }
65
66            ranges.push(Range { start, end });
67        } else {
68            let n = part
69                .parse::<usize>()
70                .map_err(|_| format!("invalid field: '{}'", part))?;
71            if n == 0 {
72                return Err("fields and positions are numbered from 1".to_string());
73            }
74            ranges.push(Range { start: n, end: n });
75        }
76    }
77
78    if ranges.is_empty() {
79        return Err("you must specify a list of bytes, characters, or fields".to_string());
80    }
81
82    // Sort and merge overlapping ranges
83    ranges.sort_by_key(|r| (r.start, r.end));
84    let mut merged = vec![ranges[0].clone()];
85    for r in &ranges[1..] {
86        let last = merged.last_mut().unwrap();
87        if r.start <= last.end.saturating_add(1) {
88            last.end = last.end.max(r.end);
89        } else {
90            merged.push(r.clone());
91        }
92    }
93
94    Ok(merged)
95}
96
97/// Check if a 1-based position is in any range.
98/// Ranges must be sorted. Uses early exit since ranges are sorted.
99#[inline(always)]
100fn in_ranges(ranges: &[Range], pos: usize) -> bool {
101    for r in ranges {
102        if pos < r.start {
103            return false;
104        }
105        if pos <= r.end {
106            return true;
107        }
108    }
109    false
110}
111
112/// Pre-compute a 64-bit mask for field selection.
113/// Bit i-1 is set if field i should be output.
114#[inline]
115fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
116    let mut mask: u64 = 0;
117    for i in 1..=64u32 {
118        let in_range = in_ranges(ranges, i as usize);
119        if in_range != complement {
120            mask |= 1u64 << (i - 1);
121        }
122    }
123    mask
124}
125
126/// Check if a field should be selected, using bitset for first 64 fields.
127#[inline(always)]
128fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
129    if field_num <= 64 {
130        (mask >> (field_num - 1)) & 1 == 1
131    } else {
132        in_ranges(ranges, field_num) != complement
133    }
134}
135
136// ── Unsafe buffer helpers (skip bounds checks in hot loops) ──────────────
137
/// Append a slice to buf without capacity checks.
///
/// Used in hot loops where the caller has already reserved enough room, so
/// the per-call growth check of `Vec::extend_from_slice` can be skipped.
///
/// # Safety
/// The caller MUST ensure `buf.capacity() - buf.len() >= data.len()`.
/// Violating this writes past the end of the allocation. The precondition is
/// checked with a `debug_assert!` in debug builds only.
#[inline(always)]
unsafe fn buf_extend(buf: &mut Vec<u8>, data: &[u8]) {
    debug_assert!(
        buf.capacity() - buf.len() >= data.len(),
        "buf_extend: insufficient reserved capacity"
    );
    // SAFETY: the caller guarantees the spare capacity holds `data.len()` bytes,
    // so the destination range is inside the allocation; `data` is a shared
    // borrow and therefore cannot overlap the uniquely borrowed `buf`.
    unsafe {
        let len = buf.len();
        std::ptr::copy_nonoverlapping(data.as_ptr(), buf.as_mut_ptr().add(len), data.len());
        buf.set_len(len + data.len());
    }
}
148
/// Append a single byte to buf without capacity checks.
///
/// Counterpart of `Vec::push` for hot loops where the caller has already
/// reserved the space.
///
/// # Safety
/// The caller MUST ensure `buf.len() < buf.capacity()`. Violating this writes
/// past the end of the allocation. The precondition is checked with a
/// `debug_assert!` in debug builds only.
#[inline(always)]
unsafe fn buf_push(buf: &mut Vec<u8>, b: u8) {
    debug_assert!(
        buf.len() < buf.capacity(),
        "buf_push: insufficient reserved capacity"
    );
    // SAFETY: the caller guarantees at least one byte of spare capacity, so
    // the slot at index `len` is inside the allocation.
    unsafe {
        let len = buf.len();
        *buf.as_mut_ptr().add(len) = b;
        buf.set_len(len + 1);
    }
}
159
160/// Write multiple IoSlice buffers using write_vectored (writev syscall).
161/// Batches into MAX_IOV-sized groups. Falls back to write_all per slice for partial writes.
162#[inline]
163fn write_ioslices(out: &mut impl Write, slices: &[IoSlice]) -> io::Result<()> {
164    if slices.is_empty() {
165        return Ok(());
166    }
167    for batch in slices.chunks(MAX_IOV) {
168        let total: usize = batch.iter().map(|s| s.len()).sum();
169        match out.write_vectored(batch) {
170            Ok(n) if n >= total => continue,
171            Ok(mut written) => {
172                // Partial write: fall back to write_all per remaining slice
173                for slice in batch {
174                    let slen = slice.len();
175                    if written >= slen {
176                        written -= slen;
177                        continue;
178                    }
179                    if written > 0 {
180                        out.write_all(&slice[written..])?;
181                        written = 0;
182                    } else {
183                        out.write_all(slice)?;
184                    }
185                }
186            }
187            Err(e) => return Err(e),
188        }
189    }
190    Ok(())
191}
192
193// ── Chunk splitting for parallel processing ──────────────────────────────
194
195/// Split data into chunks aligned to line boundaries for parallel processing.
196fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
197    let num_threads = rayon::current_num_threads().max(1);
198    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
199        return vec![data];
200    }
201
202    let chunk_size = data.len() / num_threads;
203    let mut chunks = Vec::with_capacity(num_threads);
204    let mut pos = 0;
205
206    for _ in 0..num_threads - 1 {
207        let target = pos + chunk_size;
208        if target >= data.len() {
209            break;
210        }
211        let boundary = memchr::memchr(line_delim, &data[target..])
212            .map(|p| target + p + 1)
213            .unwrap_or(data.len());
214        if boundary > pos {
215            chunks.push(&data[pos..boundary]);
216        }
217        pos = boundary;
218    }
219
220    if pos < data.len() {
221        chunks.push(&data[pos..]);
222    }
223
224    chunks
225}
226
227// ── Fast path: field extraction with batched output ──────────────────────
228
229/// Optimized field extraction with early exit and batched output.
230fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
231    let delim = cfg.delim;
232    let line_delim = cfg.line_delim;
233    let ranges = cfg.ranges;
234    let complement = cfg.complement;
235    let output_delim = cfg.output_delim;
236    let suppress = cfg.suppress_no_delim;
237
238    // Zero-copy fast path: if delimiter never appears, output = input unchanged.
239    if !complement && memchr::memchr(delim, data).is_none() {
240        if suppress {
241            return Ok(());
242        }
243        out.write_all(data)?;
244        if !data.is_empty() && *data.last().unwrap() != line_delim {
245            out.write_all(&[line_delim])?;
246        }
247        return Ok(());
248    }
249
250    // Ultra-fast path: single field extraction (e.g., cut -f5)
251    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
252        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
253    }
254
255    // Fast path: complement of single field with default output delimiter.
256    if complement
257        && ranges.len() == 1
258        && ranges[0].start == ranges[0].end
259        && output_delim.len() == 1
260        && output_delim[0] == delim
261    {
262        return process_complement_single_field(
263            data,
264            delim,
265            line_delim,
266            ranges[0].start,
267            suppress,
268            out,
269        );
270    }
271
272    // Fast path: contiguous from-start field range (e.g., cut -f1-5)
273    if !complement
274        && ranges.len() == 1
275        && ranges[0].start == 1
276        && output_delim.len() == 1
277        && output_delim[0] == delim
278        && ranges[0].end < usize::MAX
279    {
280        return process_fields_prefix(data, delim, line_delim, ranges[0].end, suppress, out);
281    }
282
283    // Fast path: open-ended field range from field N (e.g., cut -f3-)
284    if !complement
285        && ranges.len() == 1
286        && ranges[0].end == usize::MAX
287        && ranges[0].start > 1
288        && output_delim.len() == 1
289        && output_delim[0] == delim
290    {
291        return process_fields_suffix(data, delim, line_delim, ranges[0].start, suppress, out);
292    }
293
294    // Fast path: contiguous field range with start > 1 (e.g., cut -f2-4)
295    if !complement
296        && ranges.len() == 1
297        && ranges[0].start > 1
298        && ranges[0].end < usize::MAX
299        && output_delim.len() == 1
300        && output_delim[0] == delim
301    {
302        return process_fields_mid_range(
303            data,
304            delim,
305            line_delim,
306            ranges[0].start,
307            ranges[0].end,
308            suppress,
309            out,
310        );
311    }
312
313    // General field extraction
314    let max_field = if complement {
315        usize::MAX
316    } else {
317        ranges.last().map(|r| r.end).unwrap_or(0)
318    };
319    let field_mask = compute_field_mask(ranges, complement);
320
321    if data.len() >= PARALLEL_THRESHOLD {
322        let chunks = split_into_chunks(data, line_delim);
323        let results: Vec<Vec<u8>> = chunks
324            .par_iter()
325            .map(|chunk| {
326                let mut buf = Vec::with_capacity(chunk.len());
327                process_fields_chunk(
328                    chunk,
329                    delim,
330                    ranges,
331                    output_delim,
332                    suppress,
333                    max_field,
334                    field_mask,
335                    line_delim,
336                    complement,
337                    &mut buf,
338                );
339                buf
340            })
341            .collect();
342        // Use write_vectored (writev) to batch N writes into fewer syscalls
343        let slices: Vec<IoSlice> = results
344            .iter()
345            .filter(|r| !r.is_empty())
346            .map(|r| IoSlice::new(r))
347            .collect();
348        write_ioslices(out, &slices)?;
349    } else {
350        let mut buf = Vec::with_capacity(data.len());
351        process_fields_chunk(
352            data,
353            delim,
354            ranges,
355            output_delim,
356            suppress,
357            max_field,
358            field_mask,
359            line_delim,
360            complement,
361            &mut buf,
362        );
363        if !buf.is_empty() {
364            out.write_all(&buf)?;
365        }
366    }
367    Ok(())
368}
369
370/// Process a chunk of data for general field extraction.
371/// When `delim != line_delim`, uses a single-pass memchr2_iter scan to find both
372/// delimiters and line terminators in one SIMD pass, eliminating per-line memchr_iter
373/// setup overhead. When `delim == line_delim`, falls back to the two-level approach.
374fn process_fields_chunk(
375    data: &[u8],
376    delim: u8,
377    ranges: &[Range],
378    output_delim: &[u8],
379    suppress: bool,
380    max_field: usize,
381    field_mask: u64,
382    line_delim: u8,
383    complement: bool,
384    buf: &mut Vec<u8>,
385) {
386    // When delim != line_delim and max_field is bounded, use two-level approach:
387    // outer memchr for newlines, inner memchr_iter for delimiters with early exit.
388    // This avoids scanning past max_field on each line (significant for lines with
389    // many columns but small field selection like -f1,3,5 on 20-column CSV).
390    // For complement or unbounded ranges, use single-pass memchr2_iter which
391    // needs to process all delimiters anyway.
392    if delim != line_delim && max_field < usize::MAX && !complement {
393        buf.reserve(data.len());
394        let mut start = 0;
395        for end_pos in memchr_iter(line_delim, data) {
396            let line = &data[start..end_pos];
397            extract_fields_to_buf(
398                line,
399                delim,
400                ranges,
401                output_delim,
402                suppress,
403                max_field,
404                field_mask,
405                line_delim,
406                buf,
407                complement,
408            );
409            start = end_pos + 1;
410        }
411        if start < data.len() {
412            extract_fields_to_buf(
413                &data[start..],
414                delim,
415                ranges,
416                output_delim,
417                suppress,
418                max_field,
419                field_mask,
420                line_delim,
421                buf,
422                complement,
423            );
424        }
425        return;
426    }
427
428    // Single-pass path for complement or unbounded ranges: memchr2_iter for both
429    // delimiter and line_delim in one SIMD scan.
430    if delim != line_delim {
431        buf.reserve(data.len());
432
433        let mut line_start: usize = 0;
434        let mut field_start: usize = 0;
435        let mut field_num: usize = 1;
436        let mut first_output = true;
437        let mut has_delim = false;
438
439        for pos in memchr::memchr2_iter(delim, line_delim, data) {
440            let byte = unsafe { *data.get_unchecked(pos) };
441
442            if byte == line_delim {
443                // End of line: flush final field and emit line delimiter
444                if (field_num <= max_field || complement)
445                    && has_delim
446                    && is_selected(field_num, field_mask, ranges, complement)
447                {
448                    if !first_output {
449                        unsafe { buf_extend(buf, output_delim) };
450                    }
451                    unsafe { buf_extend(buf, &data[field_start..pos]) };
452                    first_output = false;
453                }
454
455                if !first_output {
456                    unsafe { buf_push(buf, line_delim) };
457                } else if !has_delim {
458                    if !suppress {
459                        unsafe {
460                            buf_extend(buf, &data[line_start..pos]);
461                            buf_push(buf, line_delim);
462                        }
463                    }
464                } else {
465                    unsafe { buf_push(buf, line_delim) };
466                }
467
468                // Reset state for next line
469                line_start = pos + 1;
470                field_start = pos + 1;
471                field_num = 1;
472                first_output = true;
473                has_delim = false;
474            } else {
475                // Field delimiter hit
476                has_delim = true;
477
478                if is_selected(field_num, field_mask, ranges, complement) {
479                    if !first_output {
480                        unsafe { buf_extend(buf, output_delim) };
481                    }
482                    unsafe { buf_extend(buf, &data[field_start..pos]) };
483                    first_output = false;
484                }
485
486                field_num += 1;
487                field_start = pos + 1;
488            }
489        }
490
491        // Handle last line without trailing line_delim
492        if line_start < data.len() {
493            let line = &data[line_start..];
494            if !line.is_empty() {
495                if (field_num <= max_field || complement)
496                    && has_delim
497                    && is_selected(field_num, field_mask, ranges, complement)
498                {
499                    if !first_output {
500                        unsafe { buf_extend(buf, output_delim) };
501                    }
502                    unsafe { buf_extend(buf, &data[field_start..data.len()]) };
503                    first_output = false;
504                }
505
506                if !first_output {
507                    unsafe { buf_push(buf, line_delim) };
508                } else if !has_delim {
509                    if !suppress {
510                        unsafe {
511                            buf_extend(buf, &data[line_start..data.len()]);
512                            buf_push(buf, line_delim);
513                        }
514                    }
515                } else {
516                    unsafe { buf_push(buf, line_delim) };
517                }
518            }
519        }
520
521        return;
522    }
523
524    // Fallback: when delim == line_delim, use the two-level scan approach
525    let mut start = 0;
526    for end_pos in memchr_iter(line_delim, data) {
527        let line = &data[start..end_pos];
528        extract_fields_to_buf(
529            line,
530            delim,
531            ranges,
532            output_delim,
533            suppress,
534            max_field,
535            field_mask,
536            line_delim,
537            buf,
538            complement,
539        );
540        start = end_pos + 1;
541    }
542    if start < data.len() {
543        extract_fields_to_buf(
544            &data[start..],
545            delim,
546            ranges,
547            output_delim,
548            suppress,
549            max_field,
550            field_mask,
551            line_delim,
552            buf,
553            complement,
554        );
555    }
556}
557
558// ── Ultra-fast single field extraction ───────────────────────────────────
559
560/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
561/// Uses combined memchr2_iter SIMD scan when delim != line_delim for a single
562/// pass over the data (vs. nested loops: outer newline scan + inner delim scan).
563fn process_single_field(
564    data: &[u8],
565    delim: u8,
566    line_delim: u8,
567    target: usize,
568    suppress: bool,
569    out: &mut impl Write,
570) -> io::Result<()> {
571    let target_idx = target - 1;
572
573    // Combined SIMD scan: single pass using memchr2 for any target field.
574    if delim != line_delim {
575        if data.len() >= PARALLEL_THRESHOLD {
576            let chunks = split_into_chunks(data, line_delim);
577            let results: Vec<Vec<u8>> = chunks
578                .par_iter()
579                .map(|chunk| {
580                    let mut buf = Vec::with_capacity(chunk.len());
581                    process_nth_field_combined(
582                        chunk, delim, line_delim, target_idx, suppress, &mut buf,
583                    );
584                    buf
585                })
586                .collect();
587            for result in &results {
588                if !result.is_empty() {
589                    out.write_all(result)?;
590                }
591            }
592        } else if target_idx == 0 && !suppress {
593            // Zero-copy fast path for field 1 (most common case):
594            // For each line, either truncate at the first delimiter, or pass through.
595            // Since most lines have a delimiter, and field 1 is a prefix of each line,
596            // we can write contiguous runs directly from the source data.
597            single_field1_zerocopy(data, delim, line_delim, out)?;
598        } else {
599            let mut buf = Vec::with_capacity(data.len());
600            process_nth_field_combined(data, delim, line_delim, target_idx, suppress, &mut buf);
601            if !buf.is_empty() {
602                out.write_all(&buf)?;
603            }
604        }
605        return Ok(());
606    }
607
608    // Fallback for delim == line_delim: nested loop approach
609    if data.len() >= PARALLEL_THRESHOLD {
610        let chunks = split_into_chunks(data, line_delim);
611        let results: Vec<Vec<u8>> = chunks
612            .par_iter()
613            .map(|chunk| {
614                let mut buf = Vec::with_capacity(chunk.len() / 4);
615                process_single_field_chunk(
616                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
617                );
618                buf
619            })
620            .collect();
621        // Use write_vectored (writev) to batch N writes into fewer syscalls
622        let slices: Vec<IoSlice> = results
623            .iter()
624            .filter(|r| !r.is_empty())
625            .map(|r| IoSlice::new(r))
626            .collect();
627        write_ioslices(out, &slices)?;
628    } else {
629        let mut buf = Vec::with_capacity(data.len() / 4);
630        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
631        if !buf.is_empty() {
632            out.write_all(&buf)?;
633        }
634    }
635    Ok(())
636}
637
638/// Complement single-field extraction: skip one field, output rest unchanged.
639fn process_complement_single_field(
640    data: &[u8],
641    delim: u8,
642    line_delim: u8,
643    skip_field: usize,
644    suppress: bool,
645    out: &mut impl Write,
646) -> io::Result<()> {
647    let skip_idx = skip_field - 1;
648
649    if data.len() >= PARALLEL_THRESHOLD {
650        let chunks = split_into_chunks(data, line_delim);
651        let results: Vec<Vec<u8>> = chunks
652            .par_iter()
653            .map(|chunk| {
654                let mut buf = Vec::with_capacity(chunk.len());
655                complement_single_field_chunk(
656                    chunk, delim, skip_idx, line_delim, suppress, &mut buf,
657                );
658                buf
659            })
660            .collect();
661        // Use write_vectored (writev) to batch N writes into fewer syscalls
662        let slices: Vec<IoSlice> = results
663            .iter()
664            .filter(|r| !r.is_empty())
665            .map(|r| IoSlice::new(r))
666            .collect();
667        write_ioslices(out, &slices)?;
668    } else {
669        let mut buf = Vec::with_capacity(data.len());
670        complement_single_field_chunk(data, delim, skip_idx, line_delim, suppress, &mut buf);
671        if !buf.is_empty() {
672            out.write_all(&buf)?;
673        }
674    }
675    Ok(())
676}
677
678/// Process a chunk for complement single-field extraction.
679fn complement_single_field_chunk(
680    data: &[u8],
681    delim: u8,
682    skip_idx: usize,
683    line_delim: u8,
684    suppress: bool,
685    buf: &mut Vec<u8>,
686) {
687    let mut start = 0;
688    for end_pos in memchr_iter(line_delim, data) {
689        let line = &data[start..end_pos];
690        complement_single_field_line(line, delim, skip_idx, line_delim, suppress, buf);
691        start = end_pos + 1;
692    }
693    if start < data.len() {
694        complement_single_field_line(&data[start..], delim, skip_idx, line_delim, suppress, buf);
695    }
696}
697
698/// Extract all fields except skip_idx from one line.
699#[inline(always)]
700fn complement_single_field_line(
701    line: &[u8],
702    delim: u8,
703    skip_idx: usize,
704    line_delim: u8,
705    suppress: bool,
706    buf: &mut Vec<u8>,
707) {
708    if line.is_empty() {
709        if !suppress {
710            buf.push(line_delim);
711        }
712        return;
713    }
714
715    buf.reserve(line.len() + 1);
716
717    let mut field_idx = 0;
718    let mut field_start = 0;
719    let mut first_output = true;
720    let mut has_delim = false;
721
722    for pos in memchr_iter(delim, line) {
723        has_delim = true;
724        if field_idx != skip_idx {
725            if !first_output {
726                unsafe { buf_push(buf, delim) };
727            }
728            unsafe { buf_extend(buf, &line[field_start..pos]) };
729            first_output = false;
730        }
731        field_idx += 1;
732        field_start = pos + 1;
733    }
734
735    if !has_delim {
736        if !suppress {
737            unsafe {
738                buf_extend(buf, line);
739                buf_push(buf, line_delim);
740            }
741        }
742        return;
743    }
744
745    // Last field
746    if field_idx != skip_idx {
747        if !first_output {
748            unsafe { buf_push(buf, delim) };
749        }
750        unsafe { buf_extend(buf, &line[field_start..]) };
751    }
752
753    unsafe { buf_push(buf, line_delim) };
754}
755
756/// Contiguous from-start field range extraction (e.g., `cut -f1-5`).
757/// Zero-copy for the non-parallel path: identifies the truncation point per line
758/// and writes contiguous runs directly from the source data.
759fn process_fields_prefix(
760    data: &[u8],
761    delim: u8,
762    line_delim: u8,
763    last_field: usize,
764    suppress: bool,
765    out: &mut impl Write,
766) -> io::Result<()> {
767    if data.len() >= PARALLEL_THRESHOLD {
768        let chunks = split_into_chunks(data, line_delim);
769        let results: Vec<Vec<u8>> = chunks
770            .par_iter()
771            .map(|chunk| {
772                let mut buf = Vec::with_capacity(chunk.len());
773                fields_prefix_chunk(chunk, delim, line_delim, last_field, suppress, &mut buf);
774                buf
775            })
776            .collect();
777        // Use write_vectored (writev) to batch N writes into fewer syscalls
778        let slices: Vec<IoSlice> = results
779            .iter()
780            .filter(|r| !r.is_empty())
781            .map(|r| IoSlice::new(r))
782            .collect();
783        write_ioslices(out, &slices)?;
784    } else if !suppress {
785        // Zero-copy fast path: scan for truncation points, write runs from source.
786        // When suppress is false, every line is output (with or without delimiter).
787        // Most lines have enough fields, so the output is often identical to input.
788        fields_prefix_zerocopy(data, delim, line_delim, last_field, out)?;
789    } else {
790        let mut buf = Vec::with_capacity(data.len());
791        fields_prefix_chunk(data, delim, line_delim, last_field, suppress, &mut buf);
792        if !buf.is_empty() {
793            out.write_all(&buf)?;
794        }
795    }
796    Ok(())
797}
798
799/// Zero-copy field-prefix extraction: writes contiguous runs directly from source data.
800/// For lines where the Nth delimiter exists, we truncate at that point.
801/// For lines with fewer fields, we output them unchanged.
802/// Lines without any delimiter are output unchanged (suppress=false assumed).
803#[inline]
804fn fields_prefix_zerocopy(
805    data: &[u8],
806    delim: u8,
807    line_delim: u8,
808    last_field: usize,
809    out: &mut impl Write,
810) -> io::Result<()> {
811    let mut start = 0;
812    let mut run_start: usize = 0;
813
814    for end_pos in memchr_iter(line_delim, data) {
815        let line = &data[start..end_pos];
816        // Find the position of the Nth delimiter to truncate at
817        let mut field_count = 1;
818        let mut truncate_at: Option<usize> = None;
819        for dpos in memchr_iter(delim, line) {
820            if field_count >= last_field {
821                truncate_at = Some(start + dpos);
822                break;
823            }
824            field_count += 1;
825        }
826
827        if let Some(trunc_pos) = truncate_at {
828            // This line has more fields than needed. Flush run, write truncated.
829            if run_start < start {
830                out.write_all(&data[run_start..start])?;
831            }
832            out.write_all(&data[start..trunc_pos])?;
833            out.write_all(&[line_delim])?;
834            run_start = end_pos + 1;
835        }
836        // else: line has <= last_field fields, keep it in the run
837        start = end_pos + 1;
838    }
839    // Handle last line without terminator
840    if start < data.len() {
841        let line = &data[start..];
842        let mut field_count = 1;
843        let mut truncate_at: Option<usize> = None;
844        for dpos in memchr_iter(delim, line) {
845            if field_count >= last_field {
846                truncate_at = Some(start + dpos);
847                break;
848            }
849            field_count += 1;
850        }
851        if let Some(trunc_pos) = truncate_at {
852            if run_start < start {
853                out.write_all(&data[run_start..start])?;
854            }
855            out.write_all(&data[start..trunc_pos])?;
856            out.write_all(&[line_delim])?;
857            return Ok(());
858        }
859    }
860    // Flush remaining run
861    if run_start < data.len() {
862        out.write_all(&data[run_start..])?;
863        if !data.is_empty() && *data.last().unwrap() != line_delim {
864            out.write_all(&[line_delim])?;
865        }
866    }
867    Ok(())
868}
869
870/// Process a chunk for contiguous from-start field range extraction.
871fn fields_prefix_chunk(
872    data: &[u8],
873    delim: u8,
874    line_delim: u8,
875    last_field: usize,
876    suppress: bool,
877    buf: &mut Vec<u8>,
878) {
879    let mut start = 0;
880    for end_pos in memchr_iter(line_delim, data) {
881        let line = &data[start..end_pos];
882        fields_prefix_line(line, delim, line_delim, last_field, suppress, buf);
883        start = end_pos + 1;
884    }
885    if start < data.len() {
886        fields_prefix_line(&data[start..], delim, line_delim, last_field, suppress, buf);
887    }
888}
889
890/// Extract first N fields from one line (contiguous from-start range).
891#[inline(always)]
892fn fields_prefix_line(
893    line: &[u8],
894    delim: u8,
895    line_delim: u8,
896    last_field: usize,
897    suppress: bool,
898    buf: &mut Vec<u8>,
899) {
900    if line.is_empty() {
901        if !suppress {
902            buf.push(line_delim);
903        }
904        return;
905    }
906
907    buf.reserve(line.len() + 1);
908
909    let mut field_count = 1;
910    let mut has_delim = false;
911
912    for pos in memchr_iter(delim, line) {
913        has_delim = true;
914        if field_count >= last_field {
915            unsafe {
916                buf_extend(buf, &line[..pos]);
917                buf_push(buf, line_delim);
918            }
919            return;
920        }
921        field_count += 1;
922    }
923
924    if !has_delim {
925        if !suppress {
926            unsafe {
927                buf_extend(buf, line);
928                buf_push(buf, line_delim);
929            }
930        }
931        return;
932    }
933
934    unsafe {
935        buf_extend(buf, line);
936        buf_push(buf, line_delim);
937    }
938}
939
940/// Open-ended field suffix extraction (e.g., `cut -f3-`).
941fn process_fields_suffix(
942    data: &[u8],
943    delim: u8,
944    line_delim: u8,
945    start_field: usize,
946    suppress: bool,
947    out: &mut impl Write,
948) -> io::Result<()> {
949    if data.len() >= PARALLEL_THRESHOLD {
950        let chunks = split_into_chunks(data, line_delim);
951        let results: Vec<Vec<u8>> = chunks
952            .par_iter()
953            .map(|chunk| {
954                let mut buf = Vec::with_capacity(chunk.len());
955                fields_suffix_chunk(chunk, delim, line_delim, start_field, suppress, &mut buf);
956                buf
957            })
958            .collect();
959        // Use write_vectored (writev) to batch N writes into fewer syscalls
960        let slices: Vec<IoSlice> = results
961            .iter()
962            .filter(|r| !r.is_empty())
963            .map(|r| IoSlice::new(r))
964            .collect();
965        write_ioslices(out, &slices)?;
966    } else {
967        let mut buf = Vec::with_capacity(data.len());
968        fields_suffix_chunk(data, delim, line_delim, start_field, suppress, &mut buf);
969        if !buf.is_empty() {
970            out.write_all(&buf)?;
971        }
972    }
973    Ok(())
974}
975
976/// Process a chunk for open-ended field suffix extraction.
977fn fields_suffix_chunk(
978    data: &[u8],
979    delim: u8,
980    line_delim: u8,
981    start_field: usize,
982    suppress: bool,
983    buf: &mut Vec<u8>,
984) {
985    let mut start = 0;
986    for end_pos in memchr_iter(line_delim, data) {
987        let line = &data[start..end_pos];
988        fields_suffix_line(line, delim, line_delim, start_field, suppress, buf);
989        start = end_pos + 1;
990    }
991    if start < data.len() {
992        fields_suffix_line(
993            &data[start..],
994            delim,
995            line_delim,
996            start_field,
997            suppress,
998            buf,
999        );
1000    }
1001}
1002
1003/// Extract fields from start_field to end from one line.
1004#[inline(always)]
1005fn fields_suffix_line(
1006    line: &[u8],
1007    delim: u8,
1008    line_delim: u8,
1009    start_field: usize,
1010    suppress: bool,
1011    buf: &mut Vec<u8>,
1012) {
1013    if line.is_empty() {
1014        if !suppress {
1015            buf.push(line_delim);
1016        }
1017        return;
1018    }
1019
1020    buf.reserve(line.len() + 1);
1021
1022    let skip_delims = start_field - 1;
1023    let mut delim_count = 0;
1024    let mut has_delim = false;
1025
1026    for pos in memchr_iter(delim, line) {
1027        has_delim = true;
1028        delim_count += 1;
1029        if delim_count >= skip_delims {
1030            unsafe {
1031                buf_extend(buf, &line[pos + 1..]);
1032                buf_push(buf, line_delim);
1033            }
1034            return;
1035        }
1036    }
1037
1038    if !has_delim {
1039        if !suppress {
1040            unsafe {
1041                buf_extend(buf, line);
1042                buf_push(buf, line_delim);
1043            }
1044        }
1045        return;
1046    }
1047
1048    // Fewer delimiters than needed
1049    unsafe { buf_push(buf, line_delim) };
1050}
1051
1052/// Contiguous mid-range field extraction (e.g., `cut -f2-4`).
1053/// Optimized: skip to start_field using memchr, then output until end_field.
1054fn process_fields_mid_range(
1055    data: &[u8],
1056    delim: u8,
1057    line_delim: u8,
1058    start_field: usize,
1059    end_field: usize,
1060    suppress: bool,
1061    out: &mut impl Write,
1062) -> io::Result<()> {
1063    if data.len() >= PARALLEL_THRESHOLD {
1064        let chunks = split_into_chunks(data, line_delim);
1065        let results: Vec<Vec<u8>> = chunks
1066            .par_iter()
1067            .map(|chunk| {
1068                let mut buf = Vec::with_capacity(chunk.len());
1069                fields_mid_range_chunk(
1070                    chunk,
1071                    delim,
1072                    line_delim,
1073                    start_field,
1074                    end_field,
1075                    suppress,
1076                    &mut buf,
1077                );
1078                buf
1079            })
1080            .collect();
1081        let slices: Vec<IoSlice> = results
1082            .iter()
1083            .filter(|r| !r.is_empty())
1084            .map(|r| IoSlice::new(r))
1085            .collect();
1086        write_ioslices(out, &slices)?;
1087    } else {
1088        let mut buf = Vec::with_capacity(data.len());
1089        fields_mid_range_chunk(
1090            data,
1091            delim,
1092            line_delim,
1093            start_field,
1094            end_field,
1095            suppress,
1096            &mut buf,
1097        );
1098        if !buf.is_empty() {
1099            out.write_all(&buf)?;
1100        }
1101    }
1102    Ok(())
1103}
1104
1105/// Process a chunk for contiguous mid-range field extraction.
1106fn fields_mid_range_chunk(
1107    data: &[u8],
1108    delim: u8,
1109    line_delim: u8,
1110    start_field: usize,
1111    end_field: usize,
1112    suppress: bool,
1113    buf: &mut Vec<u8>,
1114) {
1115    let mut start = 0;
1116    for end_pos in memchr_iter(line_delim, data) {
1117        let line = &data[start..end_pos];
1118        fields_mid_range_line(
1119            line,
1120            delim,
1121            line_delim,
1122            start_field,
1123            end_field,
1124            suppress,
1125            buf,
1126        );
1127        start = end_pos + 1;
1128    }
1129    if start < data.len() {
1130        fields_mid_range_line(
1131            &data[start..],
1132            delim,
1133            line_delim,
1134            start_field,
1135            end_field,
1136            suppress,
1137            buf,
1138        );
1139    }
1140}
1141
1142/// Extract fields start_field..=end_field from one line.
1143/// Uses memchr_iter to skip to start_field, then counts delimiters to end_field.
1144#[inline(always)]
1145fn fields_mid_range_line(
1146    line: &[u8],
1147    delim: u8,
1148    line_delim: u8,
1149    start_field: usize,
1150    end_field: usize,
1151    suppress: bool,
1152    buf: &mut Vec<u8>,
1153) {
1154    if line.is_empty() {
1155        if !suppress {
1156            buf.push(line_delim);
1157        }
1158        return;
1159    }
1160
1161    buf.reserve(line.len() + 1);
1162
1163    // Count delimiters to find start_field and end_field boundaries
1164    let skip_before = start_field - 1; // delimiters to skip before start_field
1165    let field_span = end_field - start_field; // additional delimiters within the range
1166    let mut delim_count = 0;
1167    let mut range_start = 0;
1168    let mut has_delim = false;
1169
1170    for pos in memchr_iter(delim, line) {
1171        has_delim = true;
1172        delim_count += 1;
1173        if delim_count == skip_before {
1174            range_start = pos + 1;
1175        }
1176        if delim_count == skip_before + field_span + 1 {
1177            // Found the delimiter after end_field — output the range
1178            if skip_before == 0 {
1179                range_start = 0;
1180            }
1181            unsafe {
1182                buf_extend(buf, &line[range_start..pos]);
1183                buf_push(buf, line_delim);
1184            }
1185            return;
1186        }
1187    }
1188
1189    if !has_delim {
1190        if !suppress {
1191            unsafe {
1192                buf_extend(buf, line);
1193                buf_push(buf, line_delim);
1194            }
1195        }
1196        return;
1197    }
1198
1199    // Line has delimiters but fewer fields than end_field
1200    if delim_count >= skip_before {
1201        // We have at least start_field, output from range_start to end
1202        if skip_before == 0 {
1203            range_start = 0;
1204        }
1205        unsafe {
1206            buf_extend(buf, &line[range_start..]);
1207            buf_push(buf, line_delim);
1208        }
1209    } else {
1210        // Not enough fields even for start_field — output empty line
1211        unsafe { buf_push(buf, line_delim) };
1212    }
1213}
1214
1215/// Combined SIMD scan for arbitrary single field extraction.
1216/// Uses memchr2_iter(delim, line_delim) to scan for both bytes in a single SIMD pass.
1217/// This is faster than the nested approach (outer: find newlines, inner: find delimiters)
1218/// because it eliminates one full SIMD scan and improves cache locality.
1219fn process_nth_field_combined(
1220    data: &[u8],
1221    delim: u8,
1222    line_delim: u8,
1223    target_idx: usize,
1224    suppress: bool,
1225    buf: &mut Vec<u8>,
1226) {
1227    buf.reserve(data.len());
1228
1229    let mut line_start: usize = 0;
1230    let mut field_start: usize = 0;
1231    let mut field_idx: usize = 0;
1232    let mut has_delim = false;
1233    let mut emitted = false;
1234
1235    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1236        let byte = unsafe { *data.get_unchecked(pos) };
1237
1238        if byte == line_delim {
1239            // End of line
1240            if !emitted {
1241                if has_delim && field_idx == target_idx {
1242                    // Last field matches target
1243                    unsafe {
1244                        buf_extend(buf, &data[field_start..pos]);
1245                        buf_push(buf, line_delim);
1246                    }
1247                } else if has_delim {
1248                    // Target field doesn't exist (fewer fields)
1249                    unsafe {
1250                        buf_push(buf, line_delim);
1251                    }
1252                } else if !suppress {
1253                    // No delimiter in line — output unchanged
1254                    unsafe {
1255                        buf_extend(buf, &data[line_start..pos]);
1256                        buf_push(buf, line_delim);
1257                    }
1258                }
1259            }
1260            // Reset for next line
1261            line_start = pos + 1;
1262            field_start = pos + 1;
1263            field_idx = 0;
1264            has_delim = false;
1265            emitted = false;
1266        } else {
1267            // Delimiter found
1268            has_delim = true;
1269            if field_idx == target_idx {
1270                unsafe {
1271                    buf_extend(buf, &data[field_start..pos]);
1272                    buf_push(buf, line_delim);
1273                }
1274                emitted = true;
1275            }
1276            field_idx += 1;
1277            field_start = pos + 1;
1278        }
1279    }
1280
1281    // Handle last line without trailing newline
1282    if line_start < data.len() && !emitted {
1283        if has_delim && field_idx == target_idx {
1284            unsafe {
1285                buf_extend(buf, &data[field_start..data.len()]);
1286                buf_push(buf, line_delim);
1287            }
1288        } else if has_delim {
1289            unsafe {
1290                buf_push(buf, line_delim);
1291            }
1292        } else if !suppress {
1293            unsafe {
1294                buf_extend(buf, &data[line_start..data.len()]);
1295                buf_push(buf, line_delim);
1296            }
1297        }
1298    }
1299}
1300
1301/// Zero-copy field-1 extraction: writes contiguous runs directly from source data.
1302/// For each line: if delimiter exists, truncate at first delimiter; otherwise pass through.
1303/// Uses memchr2 to scan for both delimiter and line terminator in a single SIMD pass.
1304#[inline]
1305fn single_field1_zerocopy(
1306    data: &[u8],
1307    delim: u8,
1308    line_delim: u8,
1309    out: &mut impl Write,
1310) -> io::Result<()> {
1311    let mut line_start: usize = 0;
1312    let mut run_start: usize = 0;
1313    let mut first_delim: Option<usize> = None;
1314
1315    for pos in memchr::memchr2_iter(delim, line_delim, data) {
1316        let byte = unsafe { *data.get_unchecked(pos) };
1317
1318        if byte == line_delim {
1319            // End of line
1320            if let Some(dp) = first_delim {
1321                // Line has delimiter — truncate at first delimiter.
1322                // Flush current run up to line_start, write truncated line.
1323                if run_start < line_start {
1324                    out.write_all(&data[run_start..line_start])?;
1325                }
1326                out.write_all(&data[line_start..dp])?;
1327                out.write_all(&[line_delim])?;
1328                run_start = pos + 1;
1329            }
1330            // else: no delimiter in line, output unchanged (stays in run)
1331            line_start = pos + 1;
1332            first_delim = None;
1333        } else {
1334            // Delimiter found
1335            if first_delim.is_none() {
1336                first_delim = Some(pos);
1337            }
1338        }
1339    }
1340
1341    // Handle last line (no trailing line_delim)
1342    if line_start < data.len() {
1343        if let Some(dp) = first_delim {
1344            if run_start < line_start {
1345                out.write_all(&data[run_start..line_start])?;
1346            }
1347            out.write_all(&data[line_start..dp])?;
1348            out.write_all(&[line_delim])?;
1349            return Ok(());
1350        }
1351    }
1352
1353    // Flush remaining run
1354    if run_start < data.len() {
1355        out.write_all(&data[run_start..])?;
1356        if !data.is_empty() && *data.last().unwrap() != line_delim {
1357            out.write_all(&[line_delim])?;
1358        }
1359    }
1360    Ok(())
1361}
1362
1363/// Process a chunk of data for single-field extraction.
1364fn process_single_field_chunk(
1365    data: &[u8],
1366    delim: u8,
1367    target_idx: usize,
1368    line_delim: u8,
1369    suppress: bool,
1370    buf: &mut Vec<u8>,
1371) {
1372    let mut start = 0;
1373    for end_pos in memchr_iter(line_delim, data) {
1374        let line = &data[start..end_pos];
1375        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
1376        start = end_pos + 1;
1377    }
1378    if start < data.len() {
1379        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
1380    }
1381}
1382
1383/// Extract a single field from one line.
1384/// Uses unsafe buf helpers — caller must ensure buf has capacity reserved.
1385#[inline(always)]
1386fn extract_single_field_line(
1387    line: &[u8],
1388    delim: u8,
1389    target_idx: usize,
1390    line_delim: u8,
1391    suppress: bool,
1392    buf: &mut Vec<u8>,
1393) {
1394    if line.is_empty() {
1395        if !suppress {
1396            buf.push(line_delim);
1397        }
1398        return;
1399    }
1400
1401    // Ensure capacity for worst case (full line + newline)
1402    buf.reserve(line.len() + 1);
1403
1404    // Ultra-fast path for first field: single memchr
1405    if target_idx == 0 {
1406        match memchr::memchr(delim, line) {
1407            Some(pos) => unsafe {
1408                buf_extend(buf, &line[..pos]);
1409                buf_push(buf, line_delim);
1410            },
1411            None => {
1412                if !suppress {
1413                    unsafe {
1414                        buf_extend(buf, line);
1415                        buf_push(buf, line_delim);
1416                    }
1417                }
1418            }
1419        }
1420        return;
1421    }
1422
1423    let mut field_start = 0;
1424    let mut field_idx = 0;
1425    let mut has_delim = false;
1426
1427    for pos in memchr_iter(delim, line) {
1428        has_delim = true;
1429        if field_idx == target_idx {
1430            unsafe {
1431                buf_extend(buf, &line[field_start..pos]);
1432                buf_push(buf, line_delim);
1433            }
1434            return;
1435        }
1436        field_idx += 1;
1437        field_start = pos + 1;
1438    }
1439
1440    if !has_delim {
1441        if !suppress {
1442            unsafe {
1443                buf_extend(buf, line);
1444                buf_push(buf, line_delim);
1445            }
1446        }
1447        return;
1448    }
1449
1450    if field_idx == target_idx {
1451        unsafe {
1452            buf_extend(buf, &line[field_start..]);
1453            buf_push(buf, line_delim);
1454        }
1455    } else {
1456        unsafe { buf_push(buf, line_delim) };
1457    }
1458}
1459
1460/// Extract fields from a single line into the output buffer.
1461/// Uses unsafe buf helpers with pre-reserved capacity for zero bounds-check overhead.
1462#[inline(always)]
1463fn extract_fields_to_buf(
1464    line: &[u8],
1465    delim: u8,
1466    ranges: &[Range],
1467    output_delim: &[u8],
1468    suppress: bool,
1469    max_field: usize,
1470    field_mask: u64,
1471    line_delim: u8,
1472    buf: &mut Vec<u8>,
1473    complement: bool,
1474) {
1475    let len = line.len();
1476
1477    if len == 0 {
1478        if !suppress {
1479            buf.push(line_delim);
1480        }
1481        return;
1482    }
1483
1484    // Only reserve if remaining capacity is insufficient. The caller pre-sizes the
1485    // buffer to data.len(), so this check avoids redundant reserve() calls per line.
1486    let needed = len + output_delim.len() * 16 + 1;
1487    if buf.capacity() - buf.len() < needed {
1488        buf.reserve(needed);
1489    }
1490
1491    let mut field_num: usize = 1;
1492    let mut field_start: usize = 0;
1493    let mut first_output = true;
1494    let mut has_delim = false;
1495
1496    for delim_pos in memchr_iter(delim, line) {
1497        has_delim = true;
1498
1499        if is_selected(field_num, field_mask, ranges, complement) {
1500            if !first_output {
1501                unsafe { buf_extend(buf, output_delim) };
1502            }
1503            unsafe { buf_extend(buf, &line[field_start..delim_pos]) };
1504            first_output = false;
1505        }
1506
1507        field_num += 1;
1508        field_start = delim_pos + 1;
1509
1510        if field_num > max_field {
1511            break;
1512        }
1513    }
1514
1515    // Last field
1516    if (field_num <= max_field || complement)
1517        && has_delim
1518        && is_selected(field_num, field_mask, ranges, complement)
1519    {
1520        if !first_output {
1521            unsafe { buf_extend(buf, output_delim) };
1522        }
1523        unsafe { buf_extend(buf, &line[field_start..len]) };
1524        first_output = false;
1525    }
1526
1527    if !first_output {
1528        unsafe { buf_push(buf, line_delim) };
1529    } else if !has_delim {
1530        if !suppress {
1531            unsafe {
1532                buf_extend(buf, line);
1533                buf_push(buf, line_delim);
1534            }
1535        }
1536    } else {
1537        unsafe { buf_push(buf, line_delim) };
1538    }
1539}
1540
1541// ── Fast path: byte/char extraction with batched output ──────────────────
1542
1543/// Ultra-fast path for `cut -b1-N`: single from-start byte range.
1544/// Zero-copy: writes directly from the source data using output runs.
1545/// For lines shorter than max_bytes, the output is identical to the input,
1546/// so we emit contiguous runs directly. Only lines exceeding max_bytes need truncation.
1547fn process_bytes_from_start(
1548    data: &[u8],
1549    max_bytes: usize,
1550    line_delim: u8,
1551    out: &mut impl Write,
1552) -> io::Result<()> {
1553    if data.len() >= PARALLEL_THRESHOLD {
1554        let chunks = split_into_chunks(data, line_delim);
1555        let results: Vec<Vec<u8>> = chunks
1556            .par_iter()
1557            .map(|chunk| {
1558                let mut buf = Vec::with_capacity(chunk.len());
1559                bytes_from_start_chunk(chunk, max_bytes, line_delim, &mut buf);
1560                buf
1561            })
1562            .collect();
1563        // Use write_vectored (writev) to batch N writes into fewer syscalls
1564        let slices: Vec<IoSlice> = results
1565            .iter()
1566            .filter(|r| !r.is_empty())
1567            .map(|r| IoSlice::new(r))
1568            .collect();
1569        write_ioslices(out, &slices)?;
1570    } else {
1571        // Zero-copy path: track contiguous output runs and write directly from source.
1572        // For lines <= max_bytes, we include them as-is (no copy needed).
1573        // For lines > max_bytes, we flush the run, write the truncated line, start new run.
1574        bytes_from_start_zerocopy(data, max_bytes, line_delim, out)?;
1575    }
1576    Ok(())
1577}
1578
1579/// Zero-copy byte-prefix extraction: writes contiguous runs directly from the source data.
1580/// Only copies when a line needs truncation (line > max_bytes).
1581#[inline]
1582fn bytes_from_start_zerocopy(
1583    data: &[u8],
1584    max_bytes: usize,
1585    line_delim: u8,
1586    out: &mut impl Write,
1587) -> io::Result<()> {
1588    let mut start = 0;
1589    let mut run_start: usize = 0;
1590
1591    for pos in memchr_iter(line_delim, data) {
1592        let line_len = pos - start;
1593        if line_len > max_bytes {
1594            // This line needs truncation. Flush current run, write truncated line.
1595            if run_start < start {
1596                out.write_all(&data[run_start..start])?;
1597            }
1598            out.write_all(&data[start..start + max_bytes])?;
1599            out.write_all(&[line_delim])?;
1600            run_start = pos + 1;
1601        }
1602        // else: line fits, keep it in the current contiguous run
1603        start = pos + 1;
1604    }
1605    // Handle last line without terminator
1606    if start < data.len() {
1607        let line_len = data.len() - start;
1608        if line_len > max_bytes {
1609            if run_start < start {
1610                out.write_all(&data[run_start..start])?;
1611            }
1612            out.write_all(&data[start..start + max_bytes])?;
1613            out.write_all(&[line_delim])?;
1614            return Ok(());
1615        }
1616    }
1617    // Flush remaining run (includes all short lines + the last line)
1618    if run_start < data.len() {
1619        out.write_all(&data[run_start..])?;
1620        // Add terminator if last byte isn't one
1621        if !data.is_empty() && *data.last().unwrap() != line_delim {
1622            out.write_all(&[line_delim])?;
1623        }
1624    }
1625    Ok(())
1626}
1627
1628/// Process a chunk for from-start byte range extraction (parallel path).
1629/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1630#[inline]
1631fn bytes_from_start_chunk(data: &[u8], max_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1632    // Reserve enough capacity: output <= input size
1633    buf.reserve(data.len());
1634
1635    let mut start = 0;
1636    for pos in memchr_iter(line_delim, data) {
1637        let line_len = pos - start;
1638        let take = line_len.min(max_bytes);
1639        unsafe {
1640            buf_extend(buf, &data[start..start + take]);
1641            buf_push(buf, line_delim);
1642        }
1643        start = pos + 1;
1644    }
1645    // Handle last line without terminator
1646    if start < data.len() {
1647        let line_len = data.len() - start;
1648        let take = line_len.min(max_bytes);
1649        unsafe {
1650            buf_extend(buf, &data[start..start + take]);
1651            buf_push(buf, line_delim);
1652        }
1653    }
1654}
1655
1656/// Fast path for `cut -bN-`: skip first N-1 bytes per line.
1657fn process_bytes_from_offset(
1658    data: &[u8],
1659    skip_bytes: usize,
1660    line_delim: u8,
1661    out: &mut impl Write,
1662) -> io::Result<()> {
1663    if data.len() >= PARALLEL_THRESHOLD {
1664        let chunks = split_into_chunks(data, line_delim);
1665        let results: Vec<Vec<u8>> = chunks
1666            .par_iter()
1667            .map(|chunk| {
1668                let mut buf = Vec::with_capacity(chunk.len());
1669                bytes_from_offset_chunk(chunk, skip_bytes, line_delim, &mut buf);
1670                buf
1671            })
1672            .collect();
1673        // Use write_vectored (writev) to batch N writes into fewer syscalls
1674        let slices: Vec<IoSlice> = results
1675            .iter()
1676            .filter(|r| !r.is_empty())
1677            .map(|r| IoSlice::new(r))
1678            .collect();
1679        write_ioslices(out, &slices)?;
1680    } else {
1681        // Zero-copy: write suffix of each line directly from source
1682        bytes_from_offset_zerocopy(data, skip_bytes, line_delim, out)?;
1683    }
1684    Ok(())
1685}
1686
1687/// Zero-copy byte-offset extraction: writes suffix of each line directly from source data.
1688/// Collects IoSlice pairs (data + delimiter) and flushes with write_vectored in batches,
1689/// reducing syscall overhead from 2 write_all calls per line to batched writev.
1690#[inline]
1691fn bytes_from_offset_zerocopy(
1692    data: &[u8],
1693    skip_bytes: usize,
1694    line_delim: u8,
1695    out: &mut impl Write,
1696) -> io::Result<()> {
1697    let delim_buf = [line_delim];
1698    let mut iov: Vec<IoSlice> = Vec::with_capacity(256);
1699
1700    let mut start = 0;
1701    for pos in memchr_iter(line_delim, data) {
1702        let line_len = pos - start;
1703        if line_len > skip_bytes {
1704            iov.push(IoSlice::new(&data[start + skip_bytes..pos]));
1705        }
1706        iov.push(IoSlice::new(&delim_buf));
1707        // Flush when approaching MAX_IOV to avoid oversized writev
1708        if iov.len() >= MAX_IOV - 1 {
1709            write_ioslices(out, &iov)?;
1710            iov.clear();
1711        }
1712        start = pos + 1;
1713    }
1714    if start < data.len() {
1715        let line_len = data.len() - start;
1716        if line_len > skip_bytes {
1717            iov.push(IoSlice::new(&data[start + skip_bytes..data.len()]));
1718        }
1719        iov.push(IoSlice::new(&delim_buf));
1720    }
1721    if !iov.is_empty() {
1722        write_ioslices(out, &iov)?;
1723    }
1724    Ok(())
1725}
1726
1727/// Process a chunk for from-offset byte range extraction.
1728/// Uses unsafe appends to eliminate bounds checking in the hot loop.
1729#[inline]
1730fn bytes_from_offset_chunk(data: &[u8], skip_bytes: usize, line_delim: u8, buf: &mut Vec<u8>) {
1731    buf.reserve(data.len());
1732
1733    let mut start = 0;
1734    for pos in memchr_iter(line_delim, data) {
1735        let line_len = pos - start;
1736        if line_len > skip_bytes {
1737            unsafe {
1738                buf_extend(buf, &data[start + skip_bytes..pos]);
1739            }
1740        }
1741        unsafe {
1742            buf_push(buf, line_delim);
1743        }
1744        start = pos + 1;
1745    }
1746    if start < data.len() {
1747        let line_len = data.len() - start;
1748        if line_len > skip_bytes {
1749            unsafe {
1750                buf_extend(buf, &data[start + skip_bytes..data.len()]);
1751            }
1752        }
1753        unsafe {
1754            buf_push(buf, line_delim);
1755        }
1756    }
1757}
1758
1759/// Optimized byte/char extraction with batched output and parallel processing.
1760fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
1761    let line_delim = cfg.line_delim;
1762    let ranges = cfg.ranges;
1763    let complement = cfg.complement;
1764    let output_delim = cfg.output_delim;
1765
1766    // Ultra-fast path: single range from byte 1 (e.g., cut -b1-10, cut -b-20)
1767    if !complement && ranges.len() == 1 && ranges[0].start == 1 && output_delim.is_empty() {
1768        let max_bytes = ranges[0].end;
1769        if max_bytes < usize::MAX {
1770            return process_bytes_from_start(data, max_bytes, line_delim, out);
1771        }
1772    }
1773
1774    // Fast path: single open-ended range from byte N (e.g., cut -b5-)
1775    if !complement && ranges.len() == 1 && ranges[0].end == usize::MAX && output_delim.is_empty() {
1776        let skip_bytes = ranges[0].start.saturating_sub(1);
1777        if skip_bytes > 0 {
1778            return process_bytes_from_offset(data, skip_bytes, line_delim, out);
1779        }
1780    }
1781
1782    if data.len() >= PARALLEL_THRESHOLD {
1783        let chunks = split_into_chunks(data, line_delim);
1784        let results: Vec<Vec<u8>> = chunks
1785            .par_iter()
1786            .map(|chunk| {
1787                let mut buf = Vec::with_capacity(chunk.len());
1788                process_bytes_chunk(
1789                    chunk,
1790                    ranges,
1791                    complement,
1792                    output_delim,
1793                    line_delim,
1794                    &mut buf,
1795                );
1796                buf
1797            })
1798            .collect();
1799        // Use write_vectored (writev) to batch N writes into fewer syscalls
1800        let slices: Vec<IoSlice> = results
1801            .iter()
1802            .filter(|r| !r.is_empty())
1803            .map(|r| IoSlice::new(r))
1804            .collect();
1805        write_ioslices(out, &slices)?;
1806    } else {
1807        let mut buf = Vec::with_capacity(data.len());
1808        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
1809        if !buf.is_empty() {
1810            out.write_all(&buf)?;
1811        }
1812    }
1813    Ok(())
1814}
1815
1816/// Process a chunk of data for byte/char extraction.
1817fn process_bytes_chunk(
1818    data: &[u8],
1819    ranges: &[Range],
1820    complement: bool,
1821    output_delim: &[u8],
1822    line_delim: u8,
1823    buf: &mut Vec<u8>,
1824) {
1825    let mut start = 0;
1826    for end_pos in memchr_iter(line_delim, data) {
1827        let line = &data[start..end_pos];
1828        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
1829        buf.push(line_delim);
1830        start = end_pos + 1;
1831    }
1832    if start < data.len() {
1833        cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
1834        buf.push(line_delim);
1835    }
1836}
1837
1838/// Extract byte ranges from a line into the output buffer.
1839/// Uses unsafe buf helpers for zero bounds-check overhead in hot loops.
1840#[inline(always)]
1841fn cut_bytes_to_buf(
1842    line: &[u8],
1843    ranges: &[Range],
1844    complement: bool,
1845    output_delim: &[u8],
1846    buf: &mut Vec<u8>,
1847) {
1848    let len = line.len();
1849    let mut first_range = true;
1850
1851    // Reserve worst case: full line + delimiters between ranges
1852    buf.reserve(len + output_delim.len() * ranges.len() + 1);
1853
1854    if complement {
1855        let mut pos: usize = 1;
1856        for r in ranges {
1857            let rs = r.start;
1858            let re = r.end.min(len);
1859            if pos < rs {
1860                if !first_range && !output_delim.is_empty() {
1861                    unsafe { buf_extend(buf, output_delim) };
1862                }
1863                unsafe { buf_extend(buf, &line[pos - 1..rs - 1]) };
1864                first_range = false;
1865            }
1866            pos = re + 1;
1867            if pos > len {
1868                break;
1869            }
1870        }
1871        if pos <= len {
1872            if !first_range && !output_delim.is_empty() {
1873                unsafe { buf_extend(buf, output_delim) };
1874            }
1875            unsafe { buf_extend(buf, &line[pos - 1..len]) };
1876        }
1877    } else if output_delim.is_empty() && ranges.len() == 1 {
1878        // Ultra-fast path: single range, no output delimiter
1879        let start = ranges[0].start.saturating_sub(1);
1880        let end = ranges[0].end.min(len);
1881        if start < len {
1882            unsafe { buf_extend(buf, &line[start..end]) };
1883        }
1884    } else {
1885        for r in ranges {
1886            let start = r.start.saturating_sub(1);
1887            let end = r.end.min(len);
1888            if start >= len {
1889                break;
1890            }
1891            if !first_range && !output_delim.is_empty() {
1892                unsafe { buf_extend(buf, output_delim) };
1893            }
1894            unsafe { buf_extend(buf, &line[start..end]) };
1895            first_range = false;
1896        }
1897    }
1898}
1899
1900// ── Public API ───────────────────────────────────────────────────────────
1901
1902/// Cut fields from a line using a delimiter. Writes to `out`.
1903#[inline]
1904pub fn cut_fields(
1905    line: &[u8],
1906    delim: u8,
1907    ranges: &[Range],
1908    complement: bool,
1909    output_delim: &[u8],
1910    suppress_no_delim: bool,
1911    out: &mut impl Write,
1912) -> io::Result<bool> {
1913    if memchr::memchr(delim, line).is_none() {
1914        if !suppress_no_delim {
1915            out.write_all(line)?;
1916            return Ok(true);
1917        }
1918        return Ok(false);
1919    }
1920
1921    let mut field_num: usize = 1;
1922    let mut field_start: usize = 0;
1923    let mut first_output = true;
1924
1925    for delim_pos in memchr_iter(delim, line) {
1926        let selected = in_ranges(ranges, field_num) != complement;
1927        if selected {
1928            if !first_output {
1929                out.write_all(output_delim)?;
1930            }
1931            out.write_all(&line[field_start..delim_pos])?;
1932            first_output = false;
1933        }
1934        field_start = delim_pos + 1;
1935        field_num += 1;
1936    }
1937
1938    let selected = in_ranges(ranges, field_num) != complement;
1939    if selected {
1940        if !first_output {
1941            out.write_all(output_delim)?;
1942        }
1943        out.write_all(&line[field_start..])?;
1944    }
1945
1946    Ok(true)
1947}
1948
1949/// Cut bytes/chars from a line. Writes selected bytes to `out`.
1950#[inline]
1951pub fn cut_bytes(
1952    line: &[u8],
1953    ranges: &[Range],
1954    complement: bool,
1955    output_delim: &[u8],
1956    out: &mut impl Write,
1957) -> io::Result<bool> {
1958    let mut first_range = true;
1959
1960    if complement {
1961        let len = line.len();
1962        let mut comp_ranges = Vec::new();
1963        let mut pos: usize = 1;
1964        for r in ranges {
1965            let rs = r.start;
1966            let re = r.end.min(len);
1967            if pos < rs {
1968                comp_ranges.push((pos, rs - 1));
1969            }
1970            pos = re + 1;
1971            if pos > len {
1972                break;
1973            }
1974        }
1975        if pos <= len {
1976            comp_ranges.push((pos, len));
1977        }
1978        for &(s, e) in &comp_ranges {
1979            if !first_range && !output_delim.is_empty() {
1980                out.write_all(output_delim)?;
1981            }
1982            out.write_all(&line[s - 1..e])?;
1983            first_range = false;
1984        }
1985    } else {
1986        for r in ranges {
1987            let start = r.start.saturating_sub(1);
1988            let end = r.end.min(line.len());
1989            if start >= line.len() {
1990                break;
1991            }
1992            if !first_range && !output_delim.is_empty() {
1993                out.write_all(output_delim)?;
1994            }
1995            out.write_all(&line[start..end])?;
1996            first_range = false;
1997        }
1998    }
1999    Ok(true)
2000}
2001
2002/// Process a full data buffer (from mmap or read) with cut operation.
2003pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
2004    match cfg.mode {
2005        CutMode::Fields => process_fields_fast(data, cfg, out),
2006        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
2007    }
2008}
2009
2010/// Process input from a reader (for stdin).
2011/// Uses batch reading: reads large chunks (4MB), then processes them in batch
2012/// using the fast mmap-based paths, avoiding per-line read_until syscall overhead.
2013pub fn process_cut_reader<R: BufRead>(
2014    mut reader: R,
2015    cfg: &CutConfig,
2016    out: &mut impl Write,
2017) -> io::Result<()> {
2018    const CHUNK_SIZE: usize = 4 * 1024 * 1024; // 4MB read chunks
2019    let line_delim = cfg.line_delim;
2020
2021    // Read large chunks and process in batch.
2022    // We keep a buffer; after processing complete lines, we shift leftover to the front.
2023    let mut buf = Vec::with_capacity(CHUNK_SIZE + 4096);
2024
2025    loop {
2026        // Read up to CHUNK_SIZE bytes
2027        buf.reserve(CHUNK_SIZE);
2028        let read_start = buf.len();
2029        unsafe { buf.set_len(read_start + CHUNK_SIZE) };
2030        let n = read_fully(&mut reader, &mut buf[read_start..])?;
2031        buf.truncate(read_start + n);
2032
2033        if buf.is_empty() {
2034            break;
2035        }
2036
2037        if n == 0 {
2038            // EOF with leftover data (last line without terminator)
2039            process_cut_data(&buf, cfg, out)?;
2040            break;
2041        }
2042
2043        // Find the last line delimiter in the buffer so we process complete lines
2044        let process_end = match memchr::memrchr(line_delim, &buf) {
2045            Some(pos) => pos + 1,
2046            None => {
2047                // No line delimiter found — keep accumulating
2048                continue;
2049            }
2050        };
2051
2052        // Process the complete lines using the fast batch path
2053        process_cut_data(&buf[..process_end], cfg, out)?;
2054
2055        // Shift leftover to the front for next iteration
2056        let leftover_len = buf.len() - process_end;
2057        if leftover_len > 0 {
2058            buf.copy_within(process_end.., 0);
2059        }
2060        buf.truncate(leftover_len);
2061    }
2062
2063    Ok(())
2064}
2065
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `read` until `buf` is full or the reader reports EOF
/// (`Ok(0)`). A read that fails with `ErrorKind::Interrupted` is retried,
/// including the very first one; any other error is returned immediately.
///
/// Returns the number of bytes placed at the start of `buf`; a value smaller
/// than `buf.len()` means EOF was reached.
#[inline]
fn read_fully<R: BufRead>(reader: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // EOF: report what has been gathered so far.
            Ok(0) => break,
            Ok(n) => total += n,
            // A signal interrupted the call before any data arrived; retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
2085
/// Cut operation mode: which unit the selected ranges refer to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Ranges are byte positions within each line.
    Bytes,
    /// Ranges are character positions within each line.
    Characters,
    /// Ranges are delimiter-separated field numbers within each line.
    Fields,
}