Skip to main content

coreutils_rs/cut/
core.rs

1use memchr::memchr_iter;
2use std::io::{self, BufRead, Write};
3
4/// Configuration for cut operations.
5pub struct CutConfig<'a> {
6    pub mode: CutMode,
7    pub ranges: &'a [Range],
8    pub complement: bool,
9    pub delim: u8,
10    pub output_delim: &'a [u8],
11    pub suppress_no_delim: bool,
12    pub line_delim: u8,
13}
14
/// A range specification like 1, 3-5, -3, 4-
///
/// Both bounds are 1-based and inclusive.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Range {
    /// First selected position (1-based). `parse_ranges` always yields a value
    /// of at least 1 (an open start such as `-3` becomes 1); a hand-built 0 is
    /// treated by the cutting routines the same as 1.
    pub start: usize,
    /// Last selected position (1-based, inclusive); `usize::MAX` means "to end".
    pub end: usize,
}
21
22/// Parse a LIST specification like "1,3-5,7-" into ranges.
23/// Each range is 1-based. Returns sorted, merged ranges.
24pub fn parse_ranges(spec: &str) -> Result<Vec<Range>, String> {
25    let mut ranges = Vec::new();
26
27    for part in spec.split(',') {
28        let part = part.trim();
29        if part.is_empty() {
30            continue;
31        }
32
33        if let Some(idx) = part.find('-') {
34            let left = &part[..idx];
35            let right = &part[idx + 1..];
36
37            let start = if left.is_empty() {
38                1
39            } else {
40                left.parse::<usize>()
41                    .map_err(|_| format!("invalid range: '{}'", part))?
42            };
43
44            let end = if right.is_empty() {
45                usize::MAX
46            } else {
47                right
48                    .parse::<usize>()
49                    .map_err(|_| format!("invalid range: '{}'", part))?
50            };
51
52            if start == 0 {
53                return Err("fields and positions are numbered from 1".to_string());
54            }
55            if start > end {
56                return Err(format!("invalid decreasing range: '{}'", part));
57            }
58
59            ranges.push(Range { start, end });
60        } else {
61            let n = part
62                .parse::<usize>()
63                .map_err(|_| format!("invalid field: '{}'", part))?;
64            if n == 0 {
65                return Err("fields and positions are numbered from 1".to_string());
66            }
67            ranges.push(Range { start: n, end: n });
68        }
69    }
70
71    if ranges.is_empty() {
72        return Err("you must specify a list of bytes, characters, or fields".to_string());
73    }
74
75    // Sort and merge overlapping ranges
76    ranges.sort_by_key(|r| (r.start, r.end));
77    let mut merged = vec![ranges[0].clone()];
78    for r in &ranges[1..] {
79        let last = merged.last_mut().unwrap();
80        if r.start <= last.end.saturating_add(1) {
81            last.end = last.end.max(r.end);
82        } else {
83            merged.push(r.clone());
84        }
85    }
86
87    Ok(merged)
88}
89
90/// Check if a 1-based position is in any range.
91#[inline(always)]
92fn in_ranges(ranges: &[Range], pos: usize) -> bool {
93    for r in ranges {
94        if pos < r.start {
95            return false; // ranges are sorted, no point checking further
96        }
97        if pos <= r.end {
98            return true;
99        }
100    }
101    false
102}
103
104/// Cut fields from a line using a delimiter. Writes to `out`.
105/// Uses memchr for SIMD-accelerated delimiter scanning.
106#[inline]
107pub fn cut_fields(
108    line: &[u8],
109    delim: u8,
110    ranges: &[Range],
111    complement: bool,
112    output_delim: &[u8],
113    suppress_no_delim: bool,
114    out: &mut impl Write,
115) -> io::Result<()> {
116    // Check if line contains delimiter at all
117    if memchr::memchr(delim, line).is_none() {
118        if !suppress_no_delim {
119            out.write_all(line)?;
120        }
121        return Ok(());
122    }
123
124    // Walk through fields using memchr, output selected ones
125    let mut field_num: usize = 1;
126    let mut field_start: usize = 0;
127    let mut first_output = true;
128
129    for delim_pos in memchr_iter(delim, line) {
130        let selected = in_ranges(ranges, field_num) != complement;
131        if selected {
132            if !first_output {
133                out.write_all(output_delim)?;
134            }
135            out.write_all(&line[field_start..delim_pos])?;
136            first_output = false;
137        }
138        field_start = delim_pos + 1;
139        field_num += 1;
140    }
141
142    // Last field (after last delimiter)
143    let selected = in_ranges(ranges, field_num) != complement;
144    if selected {
145        if !first_output {
146            out.write_all(output_delim)?;
147        }
148        out.write_all(&line[field_start..])?;
149    }
150
151    Ok(())
152}
153
154/// Cut bytes/chars from a line. Writes selected bytes to `out`.
155#[inline]
156pub fn cut_bytes(
157    line: &[u8],
158    ranges: &[Range],
159    complement: bool,
160    output_delim: &[u8],
161    out: &mut impl Write,
162) -> io::Result<()> {
163    let mut first_range = true;
164
165    if complement {
166        // For complement, output bytes NOT in any range
167        let mut in_excluded = false;
168        for (i, &b) in line.iter().enumerate() {
169            let pos = i + 1;
170            if in_ranges(ranges, pos) {
171                if in_excluded {
172                    first_range = false;
173                }
174                in_excluded = false;
175            } else {
176                if !in_excluded && !first_range && !output_delim.is_empty() {
177                    out.write_all(output_delim)?;
178                }
179                out.write_all(&[b])?;
180                in_excluded = true;
181            }
182        }
183    } else {
184        // Output bytes in ranges. Ranges are sorted and merged.
185        for r in ranges {
186            let start = r.start.saturating_sub(1); // convert to 0-based
187            let end = r.end.min(line.len()); // clamp to line length
188            if start >= line.len() {
189                break;
190            }
191            if !first_range && !output_delim.is_empty() {
192                out.write_all(output_delim)?;
193            }
194            out.write_all(&line[start..end])?;
195            first_range = false;
196        }
197    }
198    Ok(())
199}
200
201/// Process a full data buffer (from mmap or read) with cut operation.
202/// Processes line-by-line by scanning for line_delim.
203pub fn process_cut_data(data: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
204    let mut start = 0;
205
206    for end_pos in memchr_iter(cfg.line_delim, data) {
207        let line = &data[start..end_pos];
208        process_one_line(line, cfg, out)?;
209        out.write_all(&[cfg.line_delim])?;
210        start = end_pos + 1;
211    }
212
213    // Handle last line without terminator
214    if start < data.len() {
215        let line = &data[start..];
216        process_one_line(line, cfg, out)?;
217        out.write_all(b"\n")?;
218    }
219
220    Ok(())
221}
222
223/// Process input from a reader (for stdin).
224pub fn process_cut_reader<R: BufRead>(
225    mut reader: R,
226    cfg: &CutConfig,
227    out: &mut impl Write,
228) -> io::Result<()> {
229    let mut buf = Vec::new();
230
231    loop {
232        buf.clear();
233        let n = reader.read_until(cfg.line_delim, &mut buf)?;
234        if n == 0 {
235            break;
236        }
237
238        let has_delim = buf.last() == Some(&cfg.line_delim);
239        let line = if has_delim {
240            &buf[..buf.len() - 1]
241        } else {
242            &buf[..]
243        };
244
245        process_one_line(line, cfg, out)?;
246
247        if has_delim {
248            out.write_all(&[cfg.line_delim])?;
249        } else if !line.is_empty() {
250            out.write_all(b"\n")?;
251        }
252    }
253
254    Ok(())
255}
256
257#[inline]
258fn process_one_line(line: &[u8], cfg: &CutConfig, out: &mut impl Write) -> io::Result<()> {
259    match cfg.mode {
260        CutMode::Fields => cut_fields(
261            line,
262            cfg.delim,
263            cfg.ranges,
264            cfg.complement,
265            cfg.output_delim,
266            cfg.suppress_no_delim,
267            out,
268        ),
269        CutMode::Bytes | CutMode::Characters => {
270            cut_bytes(line, cfg.ranges, cfg.complement, cfg.output_delim, out)
271        }
272    }
273}
274
/// Cut operation mode
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CutMode {
    /// Select by byte position.
    Bytes,
    /// Select by character position. This module processes it byte-wise,
    /// through the same routine as `Bytes`.
    Characters,
    /// Select delimiter-separated fields.
    Fields,
}