coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
5/// Minimum file size for parallel processing (1MB).
6/// Rayon overhead is ~5-10μs per task; at 1MB per chunk,
7/// each chunk takes ~100μs+ to process, so overhead is < 10%.
8const PARALLEL_THRESHOLD: usize = 1024 * 1024;
9
10/// Configuration for cut operations.
11pub struct CutConfig<'a> {
12    pub mode: CutMode,
13    pub ranges: &'a [Range],
14    pub complement: bool,
15    pub delim: u8,
16    pub output_delim: &'a [u8],
17    pub suppress_no_delim: bool,
18    pub line_delim: u8,
19}
20
21/// A range specification like 1, 3-5, -3, 4-
22#[derive(Debug, Clone)]
23pub struct Range {
24    pub start: usize, // 1-based, 0 means "from beginning"
25    pub end: usize,   // 1-based, usize::MAX means "to end"
26}
27
28/// Parse a LIST specification like "1,3-5,7-" into ranges.
29/// Each range is 1-based. Returns sorted, merged ranges.
30pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
31    let mut ranges = Vec::new();
32
33    for part in spec.split(',') {
34        let part = part.trim();
35        if part.is_empty() {
36            continue;
37        }
38
39        if let Some(idx) = part.find('-') {
40            let left = &part[..idx];
41            let right = &part[idx + 1..];
42
43            let start = if left.is_empty() {
44                1
45            } else {
46                left.parse::<usize>()
47                    .map_err(|_| format!("invalid range: '{}'", part))?
48            };
49
50            let end = if right.is_empty() {
51                usize::MAX
52            } else {
53                right
54                    .parse::<usize>()
55                    .map_err(|_| format!("invalid range: '{}'", part))?
56            };
57
58            if start == 0 {
59                return Err("fields and positions are numbered from 1".to_string());
60            }
61            if start > end {
62                return Err(format!("invalid decreasing range: '{}'", part));
63            }
64
65            ranges.push(Range { start, end });
66        } else {
67            let n = part
68                .parse::<usize>()
69                .map_err(|_| format!("invalid field: '{}'", part))?;
70            if n == 0 {
71                return Err("fields and positions are numbered from 1".to_string());
72            }
73            ranges.push(Range { start: n, end: n });
74        }
75    }
76
77    if ranges.is_empty() {
78        return Err("you must specify a list of bytes, characters, or fields".to_string());
79    }
80
81    // Sort and merge overlapping ranges
82    ranges.sort_by_key(|r| (r.start, r.end));
83    let mut merged = vec![ranges[0].clone()];
84    for r in &ranges[1..] {
85        let last = merged.last_mut().unwrap();
86        if r.start <= last.end.saturating_add(1) {
87            last.end = last.end.max(r.end);
88        } else {
89            merged.push(r.clone());
90        }
91    }
92
93    Ok(merged)
94}
95
96/// Check if a 1-based position is in any range.
97/// Ranges must be sorted. Uses early exit since ranges are sorted.
98#[inline(always)]
99fn in_ranges(ranges: &[Range], pos: usize) -> bool {
100    for r in ranges {
101        if pos < r.start {
102            return false; // ranges are sorted, no point checking further
103        }
104        if pos <= r.end {
105            return true;
106        }
107    }
108    false
109}
110
111/// Pre-compute a 64-bit mask for field selection.
112/// Bit i-1 is set if field i should be output.
113#[inline]
114fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
115    let mut mask: u64 = 0;
116    for i in 1..=64u32 {
117        let in_range = in_ranges(ranges, i as usize);
118        if in_range != complement {
119            mask |= 1u64 << (i - 1);
120        }
121    }
122    mask
123}
124
125/// Check if a field should be selected, using bitset for first 64 fields.
126#[inline(always)]
127fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
128    if field_num <= 64 {
129        (mask >> (field_num - 1)) & 1 == 1
130    } else {
131        in_ranges(ranges, field_num) != complement
132    }
133}
134
135// ── Chunk splitting for parallel processing ──────────────────────────────
136
137/// Split data into chunks aligned to line boundaries for parallel processing.
138/// Returns slices that each end on a line_delim boundary (except possibly the last).
139fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
140    let num_threads = rayon::current_num_threads().max(1);
141    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
142        return vec![data];
143    }
144
145    let chunk_size = data.len() / num_threads;
146    let mut chunks = Vec::with_capacity(num_threads);
147    let mut pos = 0;
148
149    for _ in 0..num_threads - 1 {
150        let target = pos + chunk_size;
151        if target >= data.len() {
152            break;
153        }
154        // Align to next line boundary
155        let boundary = memchr::memchr(line_delim, &data[target..])
156            .map(|p| target + p + 1)
157            .unwrap_or(data.len());
158        if boundary > pos {
159            chunks.push(&data[pos..boundary]);
160        }
161        pos = boundary;
162    }
163
164    // Last chunk gets the remainder
165    if pos < data.len() {
166        chunks.push(&data[pos..]);
167    }
168
169    chunks
170}
171
172// ── Fast path: field extraction with batched output ──────────────────────
173
174/// Optimized field extraction with early exit and batched output.
175/// Uses SIMD newline scanning + inline byte scan per line.
176fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
177    let delim = cfg.delim;
178    let line_delim = cfg.line_delim;
179    let ranges = cfg.ranges;
180    let complement = cfg.complement;
181    let output_delim = cfg.output_delim;
182    let suppress = cfg.suppress_no_delim;
183
184    // Ultra-fast path: single field extraction (e.g., cut -f5)
185    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
186        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
187    }
188
189    // Pre-compute for general field extraction
190    let max_field = if complement {
191        usize::MAX
192    } else {
193        ranges.last().map(|r| r.end).unwrap_or(0)
194    };
195    let field_mask = compute_field_mask(ranges, complement);
196
197    if data.len() >= PARALLEL_THRESHOLD {
198        // Parallel path
199        let chunks = split_into_chunks(data, line_delim);
200        let results: Vec<Vec<u8>> = chunks
201            .par_iter()
202            .map(|chunk| {
203                let mut buf = Vec::with_capacity(chunk.len() / 2);
204                process_fields_chunk(
205                    chunk,
206                    delim,
207                    ranges,
208                    output_delim,
209                    suppress,
210                    max_field,
211                    field_mask,
212                    line_delim,
213                    complement,
214                    &mut buf,
215                );
216                buf
217            })
218            .collect();
219        for result in &results {
220            if !result.is_empty() {
221                out.write_all(result)?;
222            }
223        }
224    } else {
225        // Sequential path
226        let mut buf = Vec::with_capacity(data.len() / 2);
227        process_fields_chunk(
228            data,
229            delim,
230            ranges,
231            output_delim,
232            suppress,
233            max_field,
234            field_mask,
235            line_delim,
236            complement,
237            &mut buf,
238        );
239        if !buf.is_empty() {
240            out.write_all(&buf)?;
241        }
242    }
243    Ok(())
244}
245
246/// Process a chunk of data for general field extraction.
247fn process_fields_chunk(
248    data: &[u8],
249    delim: u8,
250    ranges: &[Range],
251    output_delim: &[u8],
252    suppress: bool,
253    max_field: usize,
254    field_mask: u64,
255    line_delim: u8,
256    complement: bool,
257    buf: &mut Vec<u8>,
258) {
259    let mut start = 0;
260    for end_pos in memchr_iter(line_delim, data) {
261        let line = &data[start..end_pos];
262        extract_fields_to_buf(
263            line,
264            delim,
265            ranges,
266            output_delim,
267            suppress,
268            max_field,
269            field_mask,
270            line_delim,
271            buf,
272            complement,
273        );
274        start = end_pos + 1;
275    }
276    if start < data.len() {
277        extract_fields_to_buf(
278            &data[start..],
279            delim,
280            ranges,
281            output_delim,
282            suppress,
283            max_field,
284            field_mask,
285            line_delim,
286            buf,
287            complement,
288        );
289    }
290}
291
292// ── Ultra-fast single field extraction ───────────────────────────────────
293
294/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
295/// Uses SIMD memchr for newline scanning + minimal inline byte scan per line.
296/// For large files, splits work across multiple threads with rayon.
297fn process_single_field(
298    data: &[u8],
299    delim: u8,
300    line_delim: u8,
301    target: usize,
302    suppress: bool,
303    out: &mut impl Write,
304) -> io::Result<()> {
305    let target_idx = target - 1;
306
307    if data.len() >= PARALLEL_THRESHOLD {
308        // Parallel path: split into chunks aligned to line boundaries
309        let chunks = split_into_chunks(data, line_delim);
310        let results: Vec<Vec<u8>> = chunks
311            .par_iter()
312            .map(|chunk| {
313                let mut buf = Vec::with_capacity(chunk.len() / 4);
314                process_single_field_chunk(
315                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
316                );
317                buf
318            })
319            .collect();
320        for result in &results {
321            if !result.is_empty() {
322                out.write_all(result)?;
323            }
324        }
325    } else {
326        // Sequential path for small data
327        let mut buf = Vec::with_capacity(data.len() / 4);
328        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
329        if !buf.is_empty() {
330            out.write_all(&buf)?;
331        }
332    }
333    Ok(())
334}
335
336/// Process a chunk of data for single-field extraction.
337fn process_single_field_chunk(
338    data: &[u8],
339    delim: u8,
340    target_idx: usize,
341    line_delim: u8,
342    suppress: bool,
343    buf: &mut Vec<u8>,
344) {
345    let mut start = 0;
346    for end_pos in memchr_iter(line_delim, data) {
347        let line = &data[start..end_pos];
348        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
349        start = end_pos + 1;
350    }
351    // Handle final line without terminator
352    if start < data.len() {
353        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
354    }
355}
356
357/// Extract a single field from one line.
358/// For target_idx == 0 (first field), uses single memchr instead of memchr_iter.
359/// For other fields, uses SIMD memchr_iter with early exit.
360#[inline(always)]
361fn extract_single_field_line(
362    line: &[u8],
363    delim: u8,
364    target_idx: usize,
365    line_delim: u8,
366    suppress: bool,
367    buf: &mut Vec<u8>,
368) {
369    if line.is_empty() {
370        if !suppress {
371            buf.push(line_delim);
372        }
373        return;
374    }
375
376    // Ultra-fast path for first field (target_idx == 0): single memchr
377    if target_idx == 0 {
378        match memchr::memchr(delim, line) {
379            Some(pos) => {
380                buf.extend_from_slice(&line[..pos]);
381                buf.push(line_delim);
382            }
383            None => {
384                // No delimiter — output whole line or suppress
385                if !suppress {
386                    buf.extend_from_slice(line);
387                    buf.push(line_delim);
388                }
389            }
390        }
391        return;
392    }
393
394    let mut field_start = 0;
395    let mut field_idx = 0;
396    let mut has_delim = false;
397
398    for pos in memchr_iter(delim, line) {
399        has_delim = true;
400        if field_idx == target_idx {
401            // Found end of target field — output and return
402            buf.extend_from_slice(&line[field_start..pos]);
403            buf.push(line_delim);
404            return;
405        }
406        field_idx += 1;
407        field_start = pos + 1;
408    }
409
410    if !has_delim {
411        // No delimiters found — output whole line or suppress
412        if !suppress {
413            buf.extend_from_slice(line);
414            buf.push(line_delim);
415        }
416        return;
417    }
418
419    if field_idx == target_idx {
420        // Target is the last field (no trailing delimiter)
421        buf.extend_from_slice(&line[field_start..]);
422        buf.push(line_delim);
423    } else {
424        // Not enough fields — output empty line
425        buf.push(line_delim);
426    }
427}
428
429/// Extract fields from a single line into the output buffer.
430/// Uses inline byte scanning with early exit for maximum performance.
431#[inline(always)]
432fn extract_fields_to_buf(
433    line: &[u8],
434    delim: u8,
435    ranges: &[Range],
436    output_delim: &[u8],
437    suppress: bool,
438    max_field: usize,
439    field_mask: u64,
440    line_delim: u8,
441    buf: &mut Vec<u8>,
442    complement: bool,
443) {
444    let len = line.len();
445
446    // Empty line: no delimiter possible
447    if len == 0 {
448        if !suppress {
449            buf.push(line_delim);
450        }
451        return;
452    }
453
454    let mut field_num: usize = 1;
455    let mut field_start: usize = 0;
456    let mut first_output = true;
457    let mut has_delim = false;
458
459    // SIMD-accelerated delimiter scanning with early exit
460    for delim_pos in memchr_iter(delim, line) {
461        has_delim = true;
462
463        if is_selected(field_num, field_mask, ranges, complement) {
464            if !first_output {
465                buf.extend_from_slice(output_delim);
466            }
467            buf.extend_from_slice(&line[field_start..delim_pos]);
468            first_output = false;
469        }
470
471        field_num += 1;
472        field_start = delim_pos + 1;
473
474        // Early exit: past the last needed field
475        if field_num > max_field {
476            break;
477        }
478    }
479
480    // Last field (only if we didn't early-exit past it)
481    if (field_num <= max_field || complement)
482        && has_delim
483        && is_selected(field_num, field_mask, ranges, complement)
484    {
485        if !first_output {
486            buf.extend_from_slice(output_delim);
487        }
488        buf.extend_from_slice(&line[field_start..len]);
489        first_output = false;
490    }
491
492    // Output line terminator
493    if !first_output {
494        // Had output — add line delimiter
495        buf.push(line_delim);
496    } else if !has_delim {
497        // No delimiter found — output whole line or suppress
498        if !suppress {
499            buf.extend_from_slice(line);
500            buf.push(line_delim);
501        }
502    } else {
503        // Had delimiter but no selected field — output empty line (GNU compat)
504        buf.push(line_delim);
505    }
506}
507
508// ── Fast path: byte/char extraction with batched output ──────────────────
509
510/// Optimized byte/char extraction with batched output and parallel processing.
511fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
512    let line_delim = cfg.line_delim;
513    let ranges = cfg.ranges;
514    let complement = cfg.complement;
515    let output_delim = cfg.output_delim;
516
517    if data.len() >= PARALLEL_THRESHOLD {
518        let chunks = split_into_chunks(data, line_delim);
519        let results: Vec<Vec<u8>> = chunks
520            .par_iter()
521            .map(|chunk| {
522                let mut buf = Vec::with_capacity(chunk.len() / 2);
523                process_bytes_chunk(
524                    chunk,
525                    ranges,
526                    complement,
527                    output_delim,
528                    line_delim,
529                    &mut buf,
530                );
531                buf
532            })
533            .collect();
534        for result in &results {
535            if !result.is_empty() {
536                out.write_all(result)?;
537            }
538        }
539    } else {
540        let mut buf = Vec::with_capacity(data.len() / 2);
541        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
542        if !buf.is_empty() {
543            out.write_all(&buf)?;
544        }
545    }
546    Ok(())
547}
548
549/// Process a chunk of data for byte/char extraction.
550fn process_bytes_chunk(
551    data: &[u8],
552    ranges: &[Range],
553    complement: bool,
554    output_delim: &[u8],
555    line_delim: u8,
556    buf: &mut Vec<u8>,
557) {
558    let mut start = 0;
559    for end_pos in memchr_iter(line_delim, data) {
560        let line = &data[start..end_pos];
561        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
562        buf.push(line_delim);
563        start = end_pos + 1;
564    }
565    if start < data.len() {
566        cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
567        buf.push(line_delim);
568    }
569}
570
571/// Extract byte ranges from a line into the output buffer.
572/// For the common non-complement case with contiguous ranges, uses bulk copy.
573#[inline(always)]
574fn cut_bytes_to_buf(
575    line: &[u8],
576    ranges: &[Range],
577    complement: bool,
578    output_delim: &[u8],
579    buf: &mut Vec<u8>,
580) {
581    let len = line.len();
582    let mut first_range = true;
583
584    if complement {
585        let mut pos: usize = 1;
586        for r in ranges {
587            let rs = r.start;
588            let re = r.end.min(len);
589            if pos < rs {
590                if !first_range && !output_delim.is_empty() {
591                    buf.extend_from_slice(output_delim);
592                }
593                buf.extend_from_slice(&line[pos - 1..rs - 1]);
594                first_range = false;
595            }
596            pos = re + 1;
597            if pos > len {
598                break;
599            }
600        }
601        if pos <= len {
602            if !first_range && !output_delim.is_empty() {
603                buf.extend_from_slice(output_delim);
604            }
605            buf.extend_from_slice(&line[pos - 1..len]);
606        }
607    } else if output_delim.is_empty() && ranges.len() == 1 {
608        // Ultra-fast path: single range, no output delimiter
609        let start = ranges[0].start.saturating_sub(1);
610        let end = ranges[0].end.min(len);
611        if start < len {
612            buf.extend_from_slice(&line[start..end]);
613        }
614    } else {
615        for r in ranges {
616            let start = r.start.saturating_sub(1);
617            let end = r.end.min(len);
618            if start >= len {
619                break;
620            }
621            if !first_range && !output_delim.is_empty() {
622                buf.extend_from_slice(output_delim);
623            }
624            buf.extend_from_slice(&line[start..end]);
625            first_range = false;
626        }
627    }
628}
629
630// ── Public API ───────────────────────────────────────────────────────────
631
632/// Cut fields from a line using a delimiter. Writes to `out`.
633/// Returns true if any output was written, false if suppressed.
634/// Used by process_cut_reader (stdin path) and unit tests.
635#[inline]
636pub fn cut_fields(
637    line: &[u8],
638    delim: u8,
639    ranges: &[Range],
640    complement: bool,
641    output_delim: &[u8],
642    suppress_no_delim: bool,
643    out: &mut impl Write,
644) -> io::Result<bool> {
645    // Check if line contains delimiter at all
646    if memchr::memchr(delim, line).is_none() {
647        if !suppress_no_delim {
648            out.write_all(line)?;
649            return Ok(true);
650        }
651        return Ok(false); // suppressed
652    }
653
654    // Walk through fields using memchr, output selected ones
655    let mut field_num: usize = 1;
656    let mut field_start: usize = 0;
657    let mut first_output = true;
658
659    for delim_pos in memchr_iter(delim, line) {
660        let selected = in_ranges(ranges, field_num) != complement;
661        if selected {
662            if !first_output {
663                out.write_all(output_delim)?;
664            }
665            out.write_all(&line[field_start..delim_pos])?;
666            first_output = false;
667        }
668        field_start = delim_pos + 1;
669        field_num += 1;
670    }
671
672    // Last field (after last delimiter)
673    let selected = in_ranges(ranges, field_num) != complement;
674    if selected {
675        if !first_output {
676            out.write_all(output_delim)?;
677        }
678        out.write_all(&line[field_start..])?;
679    }
680
681    Ok(true)
682}
683
684/// Cut bytes/chars from a line. Writes selected bytes to `out`.
685/// Used by process_cut_reader (stdin path) and unit tests.
686#[inline]
687pub fn cut_bytes(
688    line: &[u8],
689    ranges: &[Range],
690    complement: bool,
691    output_delim: &[u8],
692    out: &mut impl Write,
693) -> io::Result<bool> {
694    let mut first_range = true;
695
696    if complement {
697        let len = line.len();
698        let mut comp_ranges = Vec::new();
699        let mut pos: usize = 1;
700        for r in ranges {
701            let rs = r.start;
702            let re = r.end.min(len);
703            if pos < rs {
704                comp_ranges.push((pos, rs - 1));
705            }
706            pos = re + 1;
707            if pos > len {
708                break;
709            }
710        }
711        if pos <= len {
712            comp_ranges.push((pos, len));
713        }
714        for &(s, e) in &comp_ranges {
715            if !first_range && !output_delim.is_empty() {
716                out.write_all(output_delim)?;
717            }
718            out.write_all(&line[s - 1..e])?;
719            first_range = false;
720        }
721    } else {
722        for r in ranges {
723            let start = r.start.saturating_sub(1);
724            let end = r.end.min(line.len());
725            if start >= line.len() {
726                break;
727            }
728            if !first_range && !output_delim.is_empty() {
729                out.write_all(output_delim)?;
730            }
731            out.write_all(&line[start..end])?;
732            first_range = false;
733        }
734    }
735    Ok(true)
736}
737
738/// Process a full data buffer (from mmap or read) with cut operation.
739/// Dispatches to optimized fast paths for field and byte modes.
740pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
741    match cfg.mode {
742        CutMode::Fields => process_fields_fast(data, cfg, out),
743        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
744    }
745}
746
747/// Process input from a reader (for stdin).
748pub fn process_cut_reader<R: BufRead>(
749    mut reader: R,
750    cfg: &CutConfig,
751    out: &mut impl Write,
752) -> io::Result<()> {
753    let mut buf = Vec::new();
754
755    loop {
756        buf.clear();
757        let n = reader.read_until(cfg.line_delim, &mut buf)?;
758        if n == 0 {
759            break;
760        }
761
762        let has_line_delim = buf.last() == Some(&cfg.line_delim);
763        let line = if has_line_delim {
764            &buf[..buf.len() - 1]
765        } else {
766            &buf[..]
767        };
768
769        let wrote = process_one_line(line, cfg, out)?;
770
771        // GNU always terminates output lines, even if input had no trailing delimiter
772        if wrote {
773            out.write_all(&[cfg.line_delim])?;
774        }
775    }
776
777    Ok(())
778}
779
780/// Process one line according to the cut config (used by stdin reader path).
781#[inline]
782fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
783    match cfg.mode {
784        CutMode::Fields => cut_fields(
785            line,
786            cfg.delim,
787            cfg.ranges,
788            cfg.complement,
789            cfg.output_delim,
790            cfg.suppress_no_delim,
791            out,
792        ),
793        CutMode::Bytes | CutMode::Characters => {
794            cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
795        }
796    }
797}
798
799/// Cut operation mode
800#[derive(Debug, Clone, Copy, PartialEq)]
801pub enum CutMode {
802    Bytes,
803    Characters,
804    Fields,
805}
coreutils_rs/cut/core.rs

coreutils_rs/cut/
core.rs