Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use rayon::prelude::*;
3use std::io::{self, BufRead, Write};
4
5/// Minimum file size for parallel processing (512KB).
6/// Tuned for optimal throughput on multi-core systems.
7const PARALLEL_THRESHOLD: usize = 512 * 1024;
8
9/// Configuration for cut operations.
10pub struct CutConfig<'a> {
11    pub mode: CutMode,
12    pub ranges: &'a [Range],
13    pub complement: bool,
14    pub delim: u8,
15    pub output_delim: &'a [u8],
16    pub suppress_no_delim: bool,
17    pub line_delim: u8,
18}
19
20/// A range specification like 1, 3-5, -3, 4-
21#[derive(Debug, Clone)]
22pub struct Range {
23    pub start: usize, // 1-based, 0 means "from beginning"
24    pub end: usize,   // 1-based, usize::MAX means "to end"
25}
26
27/// Parse a LIST specification like "1,3-5,7-" into ranges.
28/// Each range is 1-based. Returns sorted, merged ranges.
29pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
30    let mut ranges = Vec::new();
31
32    for part in spec.split(',') {
33        let part = part.trim();
34        if part.is_empty() {
35            continue;
36        }
37
38        if let Some(idx) = part.find('-') {
39            let left = &part[..idx];
40            let right = &part[idx + 1..];
41
42            let start = if left.is_empty() {
43                1
44            } else {
45                left.parse::<usize>()
46                    .map_err(|_| format!("invalid range: '{}'", part))?
47            };
48
49            let end = if right.is_empty() {
50                usize::MAX
51            } else {
52                right
53                    .parse::<usize>()
54                    .map_err(|_| format!("invalid range: '{}'", part))?
55            };
56
57            if start == 0 {
58                return Err("fields and positions are numbered from 1".to_string());
59            }
60            if start > end {
61                return Err(format!("invalid decreasing range: '{}'", part));
62            }
63
64            ranges.push(Range { start, end });
65        } else {
66            let n = part
67                .parse::<usize>()
68                .map_err(|_| format!("invalid field: '{}'", part))?;
69            if n == 0 {
70                return Err("fields and positions are numbered from 1".to_string());
71            }
72            ranges.push(Range { start: n, end: n });
73        }
74    }
75
76    if ranges.is_empty() {
77        return Err("you must specify a list of bytes, characters, or fields".to_string());
78    }
79
80    // Sort and merge overlapping ranges
81    ranges.sort_by_key(|r| (r.start, r.end));
82    let mut merged = vec![ranges[0].clone()];
83    for r in &ranges[1..] {
84        let last = merged.last_mut().unwrap();
85        if r.start <= last.end.saturating_add(1) {
86            last.end = last.end.max(r.end);
87        } else {
88            merged.push(r.clone());
89        }
90    }
91
92    Ok(merged)
93}
94
95/// Check if a 1-based position is in any range.
96/// Ranges must be sorted. Uses early exit since ranges are sorted.
97#[inline(always)]
98fn in_ranges(ranges: &[Range], pos: usize) -> bool {
99    for r in ranges {
100        if pos < r.start {
101            return false; // ranges are sorted, no point checking further
102        }
103        if pos <= r.end {
104            return true;
105        }
106    }
107    false
108}
109
110/// Pre-compute a 64-bit mask for field selection.
111/// Bit i-1 is set if field i should be output.
112#[inline]
113fn compute_field_mask(ranges: &[Range], complement: bool) -> u64 {
114    let mut mask: u64 = 0;
115    for i in 1..=64u32 {
116        let in_range = in_ranges(ranges, i as usize);
117        if in_range != complement {
118            mask |= 1u64 << (i - 1);
119        }
120    }
121    mask
122}
123
124/// Check if a field should be selected, using bitset for first 64 fields.
125#[inline(always)]
126fn is_selected(field_num: usize, mask: u64, ranges: &[Range], complement: bool) -> bool {
127    if field_num <= 64 {
128        (mask >> (field_num - 1)) & 1 == 1
129    } else {
130        in_ranges(ranges, field_num) != complement
131    }
132}
133
134// ── Chunk splitting for parallel processing ──────────────────────────────
135
136/// Split data into chunks aligned to line boundaries for parallel processing.
137/// Returns slices that each end on a line_delim boundary (except possibly the last).
138fn split_into_chunks<'a>(data: &'a [u8], line_delim: u8) -> Vec<&'a [u8]> {
139    let num_threads = rayon::current_num_threads().max(1);
140    if data.len() < PARALLEL_THRESHOLD || num_threads <= 1 {
141        return vec![data];
142    }
143
144    let chunk_size = data.len() / num_threads;
145    let mut chunks = Vec::with_capacity(num_threads);
146    let mut pos = 0;
147
148    for _ in 0..num_threads - 1 {
149        let target = pos + chunk_size;
150        if target >= data.len() {
151            break;
152        }
153        // Align to next line boundary
154        let boundary = memchr::memchr(line_delim, &data[target..])
155            .map(|p| target + p + 1)
156            .unwrap_or(data.len());
157        if boundary > pos {
158            chunks.push(&data[pos..boundary]);
159        }
160        pos = boundary;
161    }
162
163    // Last chunk gets the remainder
164    if pos < data.len() {
165        chunks.push(&data[pos..]);
166    }
167
168    chunks
169}
170
171// ── Fast path: field extraction with batched output ──────────────────────
172
173/// Optimized field extraction with early exit and batched output.
174/// Uses SIMD newline scanning + inline byte scan per line.
175fn process_fields_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
176    let delim = cfg.delim;
177    let line_delim = cfg.line_delim;
178    let ranges = cfg.ranges;
179    let complement = cfg.complement;
180    let output_delim = cfg.output_delim;
181    let suppress = cfg.suppress_no_delim;
182
183    // Ultra-fast path: single field extraction (e.g., cut -f5)
184    if !complement && ranges.len() == 1 && ranges[0].start == ranges[0].end {
185        return process_single_field(data, delim, line_delim, ranges[0].start, suppress, out);
186    }
187
188    // Pre-compute for general field extraction
189    let max_field = if complement {
190        usize::MAX
191    } else {
192        ranges.last().map(|r| r.end).unwrap_or(0)
193    };
194    let field_mask = compute_field_mask(ranges, complement);
195
196    if data.len() >= PARALLEL_THRESHOLD {
197        // Parallel path
198        let chunks = split_into_chunks(data, line_delim);
199        let results: Vec<Vec<u8>> = chunks
200            .par_iter()
201            .map(|chunk| {
202                let mut buf = Vec::with_capacity(chunk.len() / 2);
203                process_fields_chunk(
204                    chunk,
205                    delim,
206                    ranges,
207                    output_delim,
208                    suppress,
209                    max_field,
210                    field_mask,
211                    line_delim,
212                    complement,
213                    &mut buf,
214                );
215                buf
216            })
217            .collect();
218        for result in &results {
219            if !result.is_empty() {
220                out.write_all(result)?;
221            }
222        }
223    } else {
224        // Sequential path
225        let mut buf = Vec::with_capacity(data.len() / 2);
226        process_fields_chunk(
227            data,
228            delim,
229            ranges,
230            output_delim,
231            suppress,
232            max_field,
233            field_mask,
234            line_delim,
235            complement,
236            &mut buf,
237        );
238        if !buf.is_empty() {
239            out.write_all(&buf)?;
240        }
241    }
242    Ok(())
243}
244
245/// Process a chunk of data for general field extraction.
246fn process_fields_chunk(
247    data: &[u8],
248    delim: u8,
249    ranges: &[Range],
250    output_delim: &[u8],
251    suppress: bool,
252    max_field: usize,
253    field_mask: u64,
254    line_delim: u8,
255    complement: bool,
256    buf: &mut Vec<u8>,
257) {
258    let mut start = 0;
259    for end_pos in memchr_iter(line_delim, data) {
260        let line = &data[start..end_pos];
261        extract_fields_to_buf(
262            line,
263            delim,
264            ranges,
265            output_delim,
266            suppress,
267            max_field,
268            field_mask,
269            line_delim,
270            buf,
271            complement,
272        );
273        start = end_pos + 1;
274    }
275    if start < data.len() {
276        extract_fields_to_buf(
277            &data[start..],
278            delim,
279            ranges,
280            output_delim,
281            suppress,
282            max_field,
283            field_mask,
284            line_delim,
285            buf,
286            complement,
287        );
288    }
289}
290
291// ── Ultra-fast single field extraction ───────────────────────────────────
292
293/// Specialized path for extracting exactly one field (e.g., `cut -f5`).
294/// Uses SIMD memchr for newline scanning + minimal inline byte scan per line.
295/// For large files, splits work across multiple threads with rayon.
296fn process_single_field(
297    data: &[u8],
298    delim: u8,
299    line_delim: u8,
300    target: usize,
301    suppress: bool,
302    out: &mut impl Write,
303) -> io::Result<()> {
304    let target_idx = target - 1;
305
306    if data.len() >= PARALLEL_THRESHOLD {
307        // Parallel path: split into chunks aligned to line boundaries
308        let chunks = split_into_chunks(data, line_delim);
309        let results: Vec<Vec<u8>> = chunks
310            .par_iter()
311            .map(|chunk| {
312                let mut buf = Vec::with_capacity(chunk.len() / 4);
313                process_single_field_chunk(
314                    chunk, delim, target_idx, line_delim, suppress, &mut buf,
315                );
316                buf
317            })
318            .collect();
319        for result in &results {
320            if !result.is_empty() {
321                out.write_all(result)?;
322            }
323        }
324    } else {
325        // Sequential path for small data
326        let mut buf = Vec::with_capacity(data.len() / 4);
327        process_single_field_chunk(data, delim, target_idx, line_delim, suppress, &mut buf);
328        if !buf.is_empty() {
329            out.write_all(&buf)?;
330        }
331    }
332    Ok(())
333}
334
335/// Process a chunk of data for single-field extraction.
336fn process_single_field_chunk(
337    data: &[u8],
338    delim: u8,
339    target_idx: usize,
340    line_delim: u8,
341    suppress: bool,
342    buf: &mut Vec<u8>,
343) {
344    let mut start = 0;
345    for end_pos in memchr_iter(line_delim, data) {
346        let line = &data[start..end_pos];
347        extract_single_field_line(line, delim, target_idx, line_delim, suppress, buf);
348        start = end_pos + 1;
349    }
350    // Handle final line without terminator
351    if start < data.len() {
352        extract_single_field_line(&data[start..], delim, target_idx, line_delim, suppress, buf);
353    }
354}
355
356/// Extract a single field from one line using a single SIMD memchr_iter pass.
357/// Collects all delimiter positions in one scan, then indexes directly.
358#[inline(always)]
359fn extract_single_field_line(
360    line: &[u8],
361    delim: u8,
362    target_idx: usize,
363    line_delim: u8,
364    suppress: bool,
365    buf: &mut Vec<u8>,
366) {
367    if line.is_empty() {
368        if !suppress {
369            buf.push(line_delim);
370        }
371        return;
372    }
373
374    // Single SIMD pass to find ALL delimiter positions
375    // This reduces N SIMD scans to 1 for field N
376    let mut delim_positions: [usize; 64] = [0; 64];
377    let mut num_delims: usize = 0;
378
379    for pos in memchr_iter(delim, line) {
380        if num_delims < 64 {
381            delim_positions[num_delims] = pos;
382        }
383        num_delims += 1;
384        // Early exit: we have enough delimiters to extract our target field
385        if num_delims > target_idx {
386            break;
387        }
388    }
389
390    // No delimiters found — output whole line or suppress
391    if num_delims == 0 {
392        if !suppress {
393            buf.extend_from_slice(line);
394            buf.push(line_delim);
395        }
396        return;
397    }
398
399    // Fast path: target is field 1 (cut -f1)
400    if target_idx == 0 {
401        buf.extend_from_slice(&line[..delim_positions[0]]);
402        buf.push(line_delim);
403        return;
404    }
405
406    // Not enough fields for target
407    if num_delims < target_idx {
408        buf.push(line_delim);
409        return;
410    }
411
412    // Field start is right after the (target_idx-1)th delimiter
413    let field_start = delim_positions[target_idx - 1] + 1;
414
415    // Field end is the target_idx-th delimiter (or end of line)
416    if num_delims > target_idx && target_idx < 64 {
417        buf.extend_from_slice(&line[field_start..delim_positions[target_idx]]);
418    } else {
419        buf.extend_from_slice(&line[field_start..]);
420    }
421    buf.push(line_delim);
422}
423
424/// Extract fields from a single line into the output buffer.
425/// Uses inline byte scanning with early exit for maximum performance.
426#[inline(always)]
427fn extract_fields_to_buf(
428    line: &[u8],
429    delim: u8,
430    ranges: &[Range],
431    output_delim: &[u8],
432    suppress: bool,
433    max_field: usize,
434    field_mask: u64,
435    line_delim: u8,
436    buf: &mut Vec<u8>,
437    complement: bool,
438) {
439    let len = line.len();
440
441    // Empty line: no delimiter possible
442    if len == 0 {
443        if !suppress {
444            buf.push(line_delim);
445        }
446        return;
447    }
448
449    let mut field_num: usize = 1;
450    let mut field_start: usize = 0;
451    let mut first_output = true;
452    let mut has_delim = false;
453
454    // SIMD-accelerated delimiter scanning with early exit
455    for delim_pos in memchr_iter(delim, line) {
456        has_delim = true;
457
458        if is_selected(field_num, field_mask, ranges, complement) {
459            if !first_output {
460                buf.extend_from_slice(output_delim);
461            }
462            buf.extend_from_slice(&line[field_start..delim_pos]);
463            first_output = false;
464        }
465
466        field_num += 1;
467        field_start = delim_pos + 1;
468
469        // Early exit: past the last needed field
470        if field_num > max_field {
471            break;
472        }
473    }
474
475    // Last field (only if we didn't early-exit past it)
476    if (field_num <= max_field || complement)
477        && has_delim
478        && is_selected(field_num, field_mask, ranges, complement)
479    {
480        if !first_output {
481            buf.extend_from_slice(output_delim);
482        }
483        buf.extend_from_slice(&line[field_start..len]);
484        first_output = false;
485    }
486
487    // Output line terminator
488    if !first_output {
489        // Had output — add line delimiter
490        buf.push(line_delim);
491    } else if !has_delim {
492        // No delimiter found — output whole line or suppress
493        if !suppress {
494            buf.extend_from_slice(line);
495            buf.push(line_delim);
496        }
497    } else {
498        // Had delimiter but no selected field — output empty line (GNU compat)
499        buf.push(line_delim);
500    }
501}
502
503// ── Fast path: byte/char extraction with batched output ──────────────────
504
505/// Optimized byte/char extraction with batched output and parallel processing.
506fn process_bytes_fast(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
507    let line_delim = cfg.line_delim;
508    let ranges = cfg.ranges;
509    let complement = cfg.complement;
510    let output_delim = cfg.output_delim;
511
512    if data.len() >= PARALLEL_THRESHOLD {
513        let chunks = split_into_chunks(data, line_delim);
514        let results: Vec<Vec<u8>> = chunks
515            .par_iter()
516            .map(|chunk| {
517                let mut buf = Vec::with_capacity(chunk.len() / 2);
518                process_bytes_chunk(
519                    chunk,
520                    ranges,
521                    complement,
522                    output_delim,
523                    line_delim,
524                    &mut buf,
525                );
526                buf
527            })
528            .collect();
529        for result in &results {
530            if !result.is_empty() {
531                out.write_all(result)?;
532            }
533        }
534    } else {
535        let mut buf = Vec::with_capacity(data.len() / 2);
536        process_bytes_chunk(data, ranges, complement, output_delim, line_delim, &mut buf);
537        if !buf.is_empty() {
538            out.write_all(&buf)?;
539        }
540    }
541    Ok(())
542}
543
544/// Process a chunk of data for byte/char extraction.
545fn process_bytes_chunk(
546    data: &[u8],
547    ranges: &[Range],
548    complement: bool,
549    output_delim: &[u8],
550    line_delim: u8,
551    buf: &mut Vec<u8>,
552) {
553    let mut start = 0;
554    for end_pos in memchr_iter(line_delim, data) {
555        let line = &data[start..end_pos];
556        cut_bytes_to_buf(line, ranges, complement, output_delim, buf);
557        buf.push(line_delim);
558        start = end_pos + 1;
559    }
560    if start < data.len() {
561        cut_bytes_to_buf(&data[start..], ranges, complement, output_delim, buf);
562        buf.push(line_delim);
563    }
564}
565
566/// Extract byte ranges from a line into the output buffer.
567#[inline(always)]
568fn cut_bytes_to_buf(
569    line: &[u8],
570    ranges: &[Range],
571    complement: bool,
572    output_delim: &[u8],
573    buf: &mut Vec<u8>,
574) {
575    let mut first_range = true;
576
577    if complement {
578        let len = line.len();
579        let mut pos: usize = 1;
580        for r in ranges {
581            let rs = r.start;
582            let re = r.end.min(len);
583            if pos < rs {
584                if !first_range && !output_delim.is_empty() {
585                    buf.extend_from_slice(output_delim);
586                }
587                buf.extend_from_slice(&line[pos - 1..rs - 1]);
588                first_range = false;
589            }
590            pos = re + 1;
591            if pos > len {
592                break;
593            }
594        }
595        if pos <= len {
596            if !first_range && !output_delim.is_empty() {
597                buf.extend_from_slice(output_delim);
598            }
599            buf.extend_from_slice(&line[pos - 1..len]);
600        }
601    } else {
602        for r in ranges {
603            let start = r.start.saturating_sub(1);
604            let end = r.end.min(line.len());
605            if start >= line.len() {
606                break;
607            }
608            if !first_range && !output_delim.is_empty() {
609                buf.extend_from_slice(output_delim);
610            }
611            buf.extend_from_slice(&line[start..end]);
612            first_range = false;
613        }
614    }
615}
616
617// ── Public API ───────────────────────────────────────────────────────────
618
619/// Cut fields from a line using a delimiter. Writes to `out`.
620/// Returns true if any output was written, false if suppressed.
621/// Used by process_cut_reader (stdin path) and unit tests.
622#[inline]
623pub fn cut_fields(
624    line: &[u8],
625    delim: u8,
626    ranges: &[Range],
627    complement: bool,
628    output_delim: &[u8],
629    suppress_no_delim: bool,
630    out: &mut impl Write,
631) -> io::Result<bool> {
632    // Check if line contains delimiter at all
633    if memchr::memchr(delim, line).is_none() {
634        if !suppress_no_delim {
635            out.write_all(line)?;
636            return Ok(true);
637        }
638        return Ok(false); // suppressed
639    }
640
641    // Walk through fields using memchr, output selected ones
642    let mut field_num: usize = 1;
643    let mut field_start: usize = 0;
644    let mut first_output = true;
645
646    for delim_pos in memchr_iter(delim, line) {
647        let selected = in_ranges(ranges, field_num) != complement;
648        if selected {
649            if !first_output {
650                out.write_all(output_delim)?;
651            }
652            out.write_all(&line[field_start..delim_pos])?;
653            first_output = false;
654        }
655        field_start = delim_pos + 1;
656        field_num += 1;
657    }
658
659    // Last field (after last delimiter)
660    let selected = in_ranges(ranges, field_num) != complement;
661    if selected {
662        if !first_output {
663            out.write_all(output_delim)?;
664        }
665        out.write_all(&line[field_start..])?;
666    }
667
668    Ok(true)
669}
670
671/// Cut bytes/chars from a line. Writes selected bytes to `out`.
672/// Used by process_cut_reader (stdin path) and unit tests.
673#[inline]
674pub fn cut_bytes(
675    line: &[u8],
676    ranges: &[Range],
677    complement: bool,
678    output_delim: &[u8],
679    out: &mut impl Write,
680) -> io::Result<bool> {
681    let mut first_range = true;
682
683    if complement {
684        let len = line.len();
685        let mut comp_ranges = Vec::new();
686        let mut pos: usize = 1;
687        for r in ranges {
688            let rs = r.start;
689            let re = r.end.min(len);
690            if pos < rs {
691                comp_ranges.push((pos, rs - 1));
692            }
693            pos = re + 1;
694            if pos > len {
695                break;
696            }
697        }
698        if pos <= len {
699            comp_ranges.push((pos, len));
700        }
701        for &(s, e) in &comp_ranges {
702            if !first_range && !output_delim.is_empty() {
703                out.write_all(output_delim)?;
704            }
705            out.write_all(&line[s - 1..e])?;
706            first_range = false;
707        }
708    } else {
709        for r in ranges {
710            let start = r.start.saturating_sub(1);
711            let end = r.end.min(line.len());
712            if start >= line.len() {
713                break;
714            }
715            if !first_range && !output_delim.is_empty() {
716                out.write_all(output_delim)?;
717            }
718            out.write_all(&line[start..end])?;
719            first_range = false;
720        }
721    }
722    Ok(true)
723}
724
725/// Process a full data buffer (from mmap or read) with cut operation.
726/// Dispatches to optimized fast paths for field and byte modes.
727pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
728    match cfg.mode {
729        CutMode::Fields => process_fields_fast(data, cfg, out),
730        CutMode::Bytes | CutMode::Characters => process_bytes_fast(data, cfg, out),
731    }
732}
733
734/// Process input from a reader (for stdin).
735pub fn process_cut_reader<R: BufRead>(
736    mut reader: R,
737    cfg: &CutConfig,
738    out: &mut impl Write,
739) -> io::Result<()> {
740    let mut buf = Vec::new();
741
742    loop {
743        buf.clear();
744        let n = reader.read_until(cfg.line_delim, &mut buf)?;
745        if n == 0 {
746            break;
747        }
748
749        let has_line_delim = buf.last() == Some(&cfg.line_delim);
750        let line = if has_line_delim {
751            &buf[..buf.len() - 1]
752        } else {
753            &buf[..]
754        };
755
756        let wrote = process_one_line(line, cfg, out)?;
757
758        // GNU always terminates output lines, even if input had no trailing delimiter
759        if wrote {
760            out.write_all(&[cfg.line_delim])?;
761        }
762    }
763
764    Ok(())
765}
766
767/// Process one line according to the cut config (used by stdin reader path).
768#[inline]
769fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<bool> {
770    match cfg.mode {
771        CutMode::Fields => cut_fields(
772            line,
773            cfg.delim,
774            cfg.ranges,
775            cfg.complement,
776            cfg.output_delim,
777            cfg.suppress_no_delim,
778            out,
779        ),
780        CutMode::Bytes | CutMode::Characters => {
781            cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
782        }
783    }
784}
785
786/// Cut operation mode
787#[derive(Debug, Clone, Copy, PartialEq)]
788pub enum CutMode {
789    Bytes,
790    Characters,
791    Fields,
792}