coreutils_rs/sort/
key.rs

1/// Key definition parsing and field extraction for `sort -k`.
2///
3/// KEYDEF format: FIELD[.CHAR][OPTS],[FIELD[.CHAR][OPTS]]
4/// Fields and characters are 1-indexed.
5
6/// Per-key ordering options that can override global options.
7#[derive(Debug, Clone, Default)]
8pub struct KeyOpts {
9    pub numeric: bool,
10    pub general_numeric: bool,
11    pub human_numeric: bool,
12    pub month: bool,
13    pub version: bool,
14    pub random: bool,
15    pub reverse: bool,
16    pub ignore_leading_blanks: bool,
17    pub dictionary_order: bool,
18    pub ignore_case: bool,
19    pub ignore_nonprinting: bool,
20}
21
22impl KeyOpts {
23    /// Returns true if any sort-type option is set.
24    pub fn has_sort_type(&self) -> bool {
25        self.numeric
26            || self.general_numeric
27            || self.human_numeric
28            || self.month
29            || self.version
30            || self.random
31    }
32
33    /// Returns true if any option at all is set (for key inheritance logic).
34    pub fn has_any_option(&self) -> bool {
35        self.has_sort_type()
36            || self.ignore_case
37            || self.dictionary_order
38            || self.ignore_nonprinting
39            || self.ignore_leading_blanks
40            || self.reverse
41    }
42
43    /// Parse single-letter option flags from a string.
44    pub fn parse_flags(&mut self, flags: &str) {
45        for c in flags.chars() {
46            match c {
47                'b' => self.ignore_leading_blanks = true,
48                'd' => self.dictionary_order = true,
49                'f' => self.ignore_case = true,
50                'g' => self.general_numeric = true,
51                'h' => self.human_numeric = true,
52                'i' => self.ignore_nonprinting = true,
53                'M' => self.month = true,
54                'n' => self.numeric = true,
55                'R' => self.random = true,
56                'r' => self.reverse = true,
57                'V' => self.version = true,
58                _ => {}
59            }
60        }
61    }
62}
63
64/// A parsed key specification from `-k START[,END]`.
65#[derive(Debug, Clone)]
66pub struct KeyDef {
67    pub start_field: usize,
68    pub start_char: usize,
69    pub end_field: usize,
70    pub end_char: usize,
71    pub opts: KeyOpts,
72}
73
74impl KeyDef {
75    /// Parse a KEYDEF string like "2,2n" or "1.3,1.5" or "3,3rn".
76    pub fn parse(spec: &str) -> Result<KeyDef, String> {
77        let parts: Vec<&str> = spec.splitn(2, ',').collect();
78
79        let (start_field, start_char, start_opts) = parse_field_spec(parts[0])?;
80
81        let (end_field, end_char, end_opts) = if parts.len() > 1 {
82            parse_field_spec(parts[1])?
83        } else {
84            (0, 0, String::new())
85        };
86
87        let mut opts = KeyOpts::default();
88        opts.parse_flags(&start_opts);
89        opts.parse_flags(&end_opts);
90
91        if start_field == 0 {
92            return Err("field number is zero: invalid field specification".to_string());
93        }
94
95        Ok(KeyDef {
96            start_field,
97            start_char,
98            end_field,
99            end_char,
100            opts,
101        })
102    }
103}
104
105/// Parse a single field spec like "2" or "1.3" or "2n" or "1.3bf".
106fn parse_field_spec(s: &str) -> Result<(usize, usize, String), String> {
107    let mut field_str = String::new();
108    let mut char_str = String::new();
109    let mut opts = String::new();
110    let mut in_char = false;
111
112    for c in s.chars() {
113        if c == '.' && !in_char && opts.is_empty() {
114            in_char = true;
115        } else if c.is_ascii_digit() && opts.is_empty() {
116            if in_char {
117                char_str.push(c);
118            } else {
119                field_str.push(c);
120            }
121        } else if c.is_ascii_alphabetic() {
122            opts.push(c);
123        } else {
124            return Err(format!("invalid character '{}' in key spec", c));
125        }
126    }
127
128    let field = if field_str.is_empty() {
129        0
130    } else {
131        field_str
132            .parse::<usize>()
133            .map_err(|_| "invalid field number".to_string())?
134    };
135
136    let char_pos = if char_str.is_empty() {
137        0
138    } else {
139        char_str
140            .parse::<usize>()
141            .map_err(|_| "invalid character position".to_string())?
142    };
143
144    Ok((field, char_pos, opts))
145}
146
147/// Find the byte range of the Nth field (0-indexed) in a line.
148/// Returns (start, end) byte offsets. Allocation-free.
149/// Uses SIMD memchr for separator-based field finding.
150/// Optimized: for small N with a separator, uses successive memchr calls
151/// instead of memchr_iter to avoid iterator setup overhead.
152#[inline]
153fn find_nth_field(line: &[u8], n: usize, separator: Option<u8>) -> (usize, usize) {
154    match separator {
155        Some(sep) => {
156            // For small field indices (N < 4), use successive memchr calls.
157            // Each memchr call is SIMD-accelerated and avoids the iterator overhead.
158            // For larger N, use memchr_iter which amortizes setup over many hits.
159            if n < 4 {
160                find_nth_field_memchr(line, n, sep)
161            } else {
162                find_nth_field_iter(line, n, sep)
163            }
164        }
165        None => {
166            let mut field = 0;
167            let mut i = 0;
168            let len = line.len();
169
170            while i < len {
171                let field_start = i;
172                // Skip blanks (part of this field)
173                while i < len && is_blank(line[i]) {
174                    i += 1;
175                }
176                // Consume non-blanks
177                while i < len && !is_blank(line[i]) {
178                    i += 1;
179                }
180                if field == n {
181                    return (field_start, i);
182                }
183                field += 1;
184            }
185
186            (line.len(), line.len())
187        }
188    }
189}
190
191/// Find the Nth field using successive memchr calls (optimal for small N).
192/// Each memchr is a single SIMD scan that stops at the first separator.
193#[inline(always)]
194fn find_nth_field_memchr(line: &[u8], n: usize, sep: u8) -> (usize, usize) {
195    let mut start = 0;
196    // Skip past N separators to reach the start of field N
197    for _ in 0..n {
198        match memchr::memchr(sep, &line[start..]) {
199            Some(pos) => start = start + pos + 1,
200            None => return (line.len(), line.len()),
201        }
202    }
203    // Find the end of field N (next separator or end of line)
204    match memchr::memchr(sep, &line[start..]) {
205        Some(pos) => (start, start + pos),
206        None => (start, line.len()),
207    }
208}
209
210/// Find the Nth field using memchr_iter (optimal for large N).
211#[inline]
212fn find_nth_field_iter(line: &[u8], n: usize, sep: u8) -> (usize, usize) {
213    let mut field = 0;
214    let mut start = 0;
215    for pos in memchr::memchr_iter(sep, line) {
216        if field == n {
217            return (start, pos);
218        }
219        field += 1;
220        start = pos + 1;
221    }
222    if field == n {
223        (start, line.len())
224    } else {
225        (line.len(), line.len())
226    }
227}
228
229#[inline]
230fn is_blank(b: u8) -> bool {
231    b == b' ' || b == b'\t'
232}
233
234/// Extract the key portion of a line based on a KeyDef.
235/// Allocation-free: uses find_nth_field instead of collecting all fields.
236pub fn extract_key<'a>(line: &'a [u8], key: &KeyDef, separator: Option<u8>) -> &'a [u8] {
237    let sf = key.start_field.saturating_sub(1);
238    let (sf_start, sf_end) = find_nth_field(line, sf, separator);
239
240    if sf_start >= line.len() {
241        return b"";
242    }
243
244    let start_byte = if key.start_char > 0 {
245        let field_len = sf_end - sf_start;
246        let char_offset = (key.start_char - 1).min(field_len);
247        sf_start + char_offset
248    } else {
249        sf_start
250    };
251
252    let end_byte = if key.end_field > 0 {
253        let ef = key.end_field.saturating_sub(1);
254        let (ef_start, ef_end) = find_nth_field(line, ef, separator);
255        if key.end_char > 0 {
256            let field_len = ef_end - ef_start;
257            let char_offset = key.end_char.min(field_len);
258            ef_start + char_offset
259        } else {
260            ef_end
261        }
262    } else {
263        line.len()
264    };
265
266    if start_byte >= end_byte || start_byte >= line.len() {
267        return b"";
268    }
269
270    &line[start_byte..end_byte.min(line.len())]
271}
coreutils_rs/sort/key.rs

coreutils_rs/sort/
key.rs