coreutils_rs/sort/
key.rs

1/// Key definition parsing and field extraction for `sort -k`.
2///
3/// KEYDEF format: FIELD[.CHAR][OPTS],[FIELD[.CHAR][OPTS]]
4/// Fields and characters are 1-indexed.
5
6/// Per-key ordering options that can override global options.
7#[derive(Debug, Clone, Default)]
8pub struct KeyOpts {
9    pub numeric: bool,
10    pub general_numeric: bool,
11    pub human_numeric: bool,
12    pub month: bool,
13    pub version: bool,
14    pub random: bool,
15    pub reverse: bool,
16    pub ignore_leading_blanks: bool,
17    pub dictionary_order: bool,
18    pub ignore_case: bool,
19    pub ignore_nonprinting: bool,
20}
21
22impl KeyOpts {
23    /// Returns true if any sort-type option is set.
24    pub fn has_sort_type(&self) -> bool {
25        self.numeric
26            || self.general_numeric
27            || self.human_numeric
28            || self.month
29            || self.version
30            || self.random
31    }
32
33    /// Returns true if any option at all is set (for key inheritance logic).
34    pub fn has_any_option(&self) -> bool {
35        self.has_sort_type()
36            || self.ignore_case
37            || self.dictionary_order
38            || self.ignore_nonprinting
39            || self.ignore_leading_blanks
40            || self.reverse
41    }
42
43    /// Parse single-letter option flags from a string.
44    pub fn parse_flags(&mut self, flags: &str) {
45        for c in flags.chars() {
46            match c {
47                'b' => self.ignore_leading_blanks = true,
48                'd' => self.dictionary_order = true,
49                'f' => self.ignore_case = true,
50                'g' => self.general_numeric = true,
51                'h' => self.human_numeric = true,
52                'i' => self.ignore_nonprinting = true,
53                'M' => self.month = true,
54                'n' => self.numeric = true,
55                'R' => self.random = true,
56                'r' => self.reverse = true,
57                'V' => self.version = true,
58                _ => {}
59            }
60        }
61    }
62
63    /// Validate that the set of options is compatible (GNU sort rules).
64    /// Returns Err with a descriptive message if incompatible options are detected.
65    ///
66    /// GNU incompatibility rules:
67    /// - At most one of {n, g, h, M} can be set (numeric sort types)
68    /// - R is incompatible with {n, g, h, M}
69    /// - V is incompatible with {n, g, h, M}
70    /// - d is incompatible with {n, g, h, M}
71    /// - i is incompatible with {n, g, h, M}
72    ///
73    /// GNU canonical order for error messages: d, g, h, i, M, n, R, V
74    pub fn validate(&self) -> Result<(), String> {
75        // Collect all active options in GNU canonical order: d, g, h, i, M, n, R, V
76        // We only need to check options that participate in incompatibility.
77        let mut active: Vec<char> = Vec::new();
78        if self.dictionary_order {
79            active.push('d');
80        }
81        if self.general_numeric {
82            active.push('g');
83        }
84        if self.human_numeric {
85            active.push('h');
86        }
87        if self.ignore_nonprinting {
88            active.push('i');
89        }
90        if self.month {
91            active.push('M');
92        }
93        if self.numeric {
94            active.push('n');
95        }
96        if self.random {
97            active.push('R');
98        }
99        if self.version {
100            active.push('V');
101        }
102
103        // Define the incompatible pairs (canonical order: lower index in active list first)
104        // Numeric-like types: g, h, M, n — at most one allowed
105        // R, V each incompatible with g, h, M, n
106        // d, i each incompatible with g, h, M, n
107        let is_numeric_type = |c: char| matches!(c, 'g' | 'h' | 'M' | 'n');
108        let incompatible_with_numeric = |c: char| matches!(c, 'd' | 'i' | 'R' | 'V');
109
110        // Check all pairs in canonical order
111        for i in 0..active.len() {
112            for j in (i + 1)..active.len() {
113                let a = active[i];
114                let b = active[j];
115                let conflict = (is_numeric_type(a) && is_numeric_type(b))
116                    || (is_numeric_type(a) && incompatible_with_numeric(b))
117                    || (incompatible_with_numeric(a) && is_numeric_type(b));
118                if conflict {
119                    return Err(format!("options '-{}{}' are incompatible", a, b));
120                }
121            }
122        }
123
124        Ok(())
125    }
126}
127
128/// A parsed key specification from `-k START[,END]`.
129#[derive(Debug, Clone)]
130pub struct KeyDef {
131    pub start_field: usize,
132    pub start_char: usize,
133    pub end_field: usize,
134    pub end_char: usize,
135    pub opts: KeyOpts,
136}
137
138impl KeyDef {
139    /// Parse a KEYDEF string like "2,2n" or "1.3,1.5" or "3,3rn".
140    pub fn parse(spec: &str) -> Result<KeyDef, String> {
141        let parts: Vec<&str> = spec.splitn(2, ',').collect();
142
143        let (start_field, start_char, start_opts) = parse_field_spec(parts[0])?;
144
145        let (end_field, end_char, end_opts) = if parts.len() > 1 {
146            parse_field_spec(parts[1])?
147        } else {
148            (0, 0, String::new())
149        };
150
151        let mut opts = KeyOpts::default();
152        opts.parse_flags(&start_opts);
153        opts.parse_flags(&end_opts);
154
155        if start_field == 0 {
156            return Err("field number is zero: invalid field specification".to_string());
157        }
158
159        // GNU sort rejects character offset 0 in the START position only.
160        // A 0 in the end position is valid (treated as end of field).
161        if start_char == 0 && parts[0].contains('.') {
162            return Err(format!(
163                "character offset is zero: invalid field specification '{}'",
164                spec
165            ));
166        }
167
168        // Validate per-key option compatibility
169        opts.validate()?;
170
171        Ok(KeyDef {
172            start_field,
173            start_char,
174            end_field,
175            end_char,
176            opts,
177        })
178    }
179}
180
181/// Parse a single field spec like "2" or "1.3" or "2n" or "1.3bf".
182fn parse_field_spec(s: &str) -> Result<(usize, usize, String), String> {
183    let mut field_str = String::new();
184    let mut char_str = String::new();
185    let mut opts = String::new();
186    let mut in_char = false;
187
188    for c in s.chars() {
189        if c == '.' && !in_char && opts.is_empty() {
190            in_char = true;
191        } else if c.is_ascii_digit() && opts.is_empty() {
192            if in_char {
193                char_str.push(c);
194            } else {
195                field_str.push(c);
196            }
197        } else if c.is_ascii_alphabetic() {
198            opts.push(c);
199        } else {
200            return Err(format!("invalid character '{}' in key spec", c));
201        }
202    }
203
204    let field = if field_str.is_empty() {
205        0
206    } else {
207        field_str
208            .parse::<usize>()
209            .map_err(|_| "invalid field number".to_string())?
210    };
211
212    let char_pos = if char_str.is_empty() {
213        0
214    } else {
215        char_str
216            .parse::<usize>()
217            .map_err(|_| "invalid character position".to_string())?
218    };
219
220    Ok((field, char_pos, opts))
221}
222
223/// Find the byte range of the Nth field (0-indexed) in a line.
224/// Returns (start, end) byte offsets. Allocation-free.
225/// Uses SIMD memchr for separator-based field finding.
226/// Optimized: for small N with a separator, uses successive memchr calls
227/// instead of memchr_iter to avoid iterator setup overhead.
228#[inline]
229fn find_nth_field(line: &[u8], n: usize, separator: Option<u8>) -> (usize, usize) {
230    match separator {
231        Some(sep) => {
232            // For small field indices (N < 4), use successive memchr calls.
233            // Each memchr call is SIMD-accelerated and avoids the iterator overhead.
234            // For larger N, use memchr_iter which amortizes setup over many hits.
235            if n < 4 {
236                find_nth_field_memchr(line, n, sep)
237            } else {
238                find_nth_field_iter(line, n, sep)
239            }
240        }
241        None => {
242            let mut field = 0;
243            let mut i = 0;
244            let len = line.len();
245
246            while i < len {
247                let field_start = i;
248                // Skip blanks (part of this field)
249                while i < len && is_blank(line[i]) {
250                    i += 1;
251                }
252                // Consume non-blanks
253                while i < len && !is_blank(line[i]) {
254                    i += 1;
255                }
256                if field == n {
257                    return (field_start, i);
258                }
259                field += 1;
260            }
261
262            (line.len(), line.len())
263        }
264    }
265}
266
267/// Find the Nth field using successive memchr calls (optimal for small N).
268/// Each memchr is a single SIMD scan that stops at the first separator.
269#[inline(always)]
270fn find_nth_field_memchr(line: &[u8], n: usize, sep: u8) -> (usize, usize) {
271    let mut start = 0;
272    // Skip past N separators to reach the start of field N
273    for _ in 0..n {
274        match memchr::memchr(sep, &line[start..]) {
275            Some(pos) => start = start + pos + 1,
276            None => return (line.len(), line.len()),
277        }
278    }
279    // Find the end of field N (next separator or end of line)
280    match memchr::memchr(sep, &line[start..]) {
281        Some(pos) => (start, start + pos),
282        None => (start, line.len()),
283    }
284}
285
286/// Find the Nth field using memchr_iter (optimal for large N).
287#[inline]
288fn find_nth_field_iter(line: &[u8], n: usize, sep: u8) -> (usize, usize) {
289    let mut field = 0;
290    let mut start = 0;
291    for pos in memchr::memchr_iter(sep, line) {
292        if field == n {
293            return (start, pos);
294        }
295        field += 1;
296        start = pos + 1;
297    }
298    if field == n {
299        (start, line.len())
300    } else {
301        (line.len(), line.len())
302    }
303}
304
305#[inline]
306fn is_blank(b: u8) -> bool {
307    b == b' ' || b == b'\t'
308}
309
310/// In -z (zero-terminated) mode, newlines are treated as blanks for field splitting.
311#[inline]
312fn is_blank_z(b: u8) -> bool {
313    b == b' ' || b == b'\t' || b == b'\n'
314}
315
316/// Skip leading blanks with a custom blank predicate.
317#[inline]
318fn skip_blanks_from_fn(line: &[u8], from: usize, end: usize, blank_fn: fn(u8) -> bool) -> usize {
319    let mut i = from;
320    while i < end && blank_fn(line[i]) {
321        i += 1;
322    }
323    i
324}
325
326/// Find the Nth field with zero-terminated mode support.
327#[inline]
328fn find_nth_field_z(
329    line: &[u8],
330    n: usize,
331    separator: Option<u8>,
332    zero_terminated: bool,
333) -> (usize, usize) {
334    if !zero_terminated || separator.is_some() {
335        return find_nth_field(line, n, separator);
336    }
337    // In -z mode without explicit separator, use is_blank_z (includes \n)
338    let mut field = 0;
339    let mut i = 0;
340    let len = line.len();
341
342    while i < len {
343        let field_start = i;
344        while i < len && is_blank_z(line[i]) {
345            i += 1;
346        }
347        while i < len && !is_blank_z(line[i]) {
348            i += 1;
349        }
350        if field == n {
351            return (field_start, i);
352        }
353        field += 1;
354    }
355    (line.len(), line.len())
356}
357
358/// Extract the key portion of a line based on a KeyDef.
359/// Allocation-free: uses find_nth_field instead of collecting all fields.
360///
361/// When `ignore_leading_blanks` is true (from the key's -b flag or global -b),
362/// leading blanks in each field are skipped before applying character position
363/// offsets. This matches GNU sort's behavior where `-b` affects where character
364/// counting starts within a field.
365pub fn extract_key<'a>(
366    line: &'a [u8],
367    key: &KeyDef,
368    separator: Option<u8>,
369    ignore_leading_blanks: bool,
370) -> &'a [u8] {
371    extract_key_z(line, key, separator, ignore_leading_blanks, false)
372}
373
374/// Extract key with zero-terminated mode support.
375/// When `zero_terminated` is true and separator is None (default blank splitting),
376/// newlines are treated as blanks, matching GNU sort -z behavior.
377pub fn extract_key_z<'a>(
378    line: &'a [u8],
379    key: &KeyDef,
380    separator: Option<u8>,
381    ignore_leading_blanks: bool,
382    zero_terminated: bool,
383) -> &'a [u8] {
384    let sf = key.start_field.saturating_sub(1);
385    let (sf_start, sf_end) = find_nth_field_z(line, sf, separator, zero_terminated);
386
387    if sf_start >= line.len() {
388        return b"";
389    }
390
391    let blank_fn: fn(u8) -> bool = if zero_terminated && separator.is_none() {
392        is_blank_z
393    } else {
394        is_blank
395    };
396
397    let start_byte = if key.start_char > 0 {
398        let effective_start = if ignore_leading_blanks {
399            skip_blanks_from_fn(line, sf_start, sf_end, blank_fn)
400        } else {
401            sf_start
402        };
403        let field_len = sf_end - effective_start;
404        let char_offset = (key.start_char - 1).min(field_len);
405        effective_start + char_offset
406    } else {
407        sf_start
408    };
409
410    let end_byte = if key.end_field > 0 {
411        let ef = key.end_field.saturating_sub(1);
412        let (ef_start, ef_end) = find_nth_field_z(line, ef, separator, zero_terminated);
413        if key.end_char > 0 {
414            let effective_start = if ignore_leading_blanks {
415                skip_blanks_from_fn(line, ef_start, ef_end, blank_fn)
416            } else {
417                ef_start
418            };
419            let field_len = ef_end - effective_start;
420            let char_offset = key.end_char.min(field_len);
421            effective_start + char_offset
422        } else {
423            ef_end
424        }
425    } else {
426        line.len()
427    };
428
429    if start_byte >= end_byte || start_byte >= line.len() {
430        return b"";
431    }
432
433    &line[start_byte..end_byte.min(line.len())]
434}
coreutils_rs/sort/key.rs

coreutils_rs/sort/
key.rs