Skip to main content

coreutils_rs/sort/
key.rs

1/// Key definition parsing and field extraction for `sort -k`.
2///
3/// KEYDEF format: FIELD[.CHAR][OPTS],[FIELD[.CHAR][OPTS]]
4/// Fields and characters are 1-indexed.
5
6/// Per-key ordering options that can override global options.
7#[derive(Debug, Clone, Default)]
8pub struct KeyOpts {
9    pub numeric: bool,
10    pub general_numeric: bool,
11    pub human_numeric: bool,
12    pub month: bool,
13    pub version: bool,
14    pub random: bool,
15    pub reverse: bool,
16    pub ignore_leading_blanks: bool,
17    pub dictionary_order: bool,
18    pub ignore_case: bool,
19    pub ignore_nonprinting: bool,
20}
21
22impl KeyOpts {
23    /// Returns true if any sort-type option is set.
24    pub fn has_sort_type(&self) -> bool {
25        self.numeric
26            || self.general_numeric
27            || self.human_numeric
28            || self.month
29            || self.version
30            || self.random
31    }
32
33    /// Returns true if any option at all is set (for key inheritance logic).
34    pub fn has_any_option(&self) -> bool {
35        self.has_sort_type()
36            || self.ignore_case
37            || self.dictionary_order
38            || self.ignore_nonprinting
39            || self.ignore_leading_blanks
40            || self.reverse
41    }
42
43    /// Parse single-letter option flags from a string.
44    pub fn parse_flags(&mut self, flags: &str) {
45        for c in flags.chars() {
46            match c {
47                'b' => self.ignore_leading_blanks = true,
48                'd' => self.dictionary_order = true,
49                'f' => self.ignore_case = true,
50                'g' => self.general_numeric = true,
51                'h' => self.human_numeric = true,
52                'i' => self.ignore_nonprinting = true,
53                'M' => self.month = true,
54                'n' => self.numeric = true,
55                'R' => self.random = true,
56                'r' => self.reverse = true,
57                'V' => self.version = true,
58                _ => {}
59            }
60        }
61    }
62}
63
64/// A parsed key specification from `-k START[,END]`.
65#[derive(Debug, Clone)]
66pub struct KeyDef {
67    pub start_field: usize,
68    pub start_char: usize,
69    pub end_field: usize,
70    pub end_char: usize,
71    pub opts: KeyOpts,
72}
73
74impl KeyDef {
75    /// Parse a KEYDEF string like "2,2n" or "1.3,1.5" or "3,3rn".
76    pub fn parse(spec: &str) -> Result<KeyDef, String> {
77        let parts: Vec<&str> = spec.splitn(2, ',').collect();
78
79        let (start_field, start_char, start_opts) = parse_field_spec(parts[0])?;
80
81        let (end_field, end_char, end_opts) = if parts.len() > 1 {
82            parse_field_spec(parts[1])?
83        } else {
84            (0, 0, String::new())
85        };
86
87        let mut opts = KeyOpts::default();
88        opts.parse_flags(&start_opts);
89        opts.parse_flags(&end_opts);
90
91        if start_field == 0 {
92            return Err("field number is zero: invalid field specification".to_string());
93        }
94
95        Ok(KeyDef {
96            start_field,
97            start_char,
98            end_field,
99            end_char,
100            opts,
101        })
102    }
103}
104
105/// Parse a single field spec like "2" or "1.3" or "2n" or "1.3bf".
106fn parse_field_spec(s: &str) -> Result<(usize, usize, String), String> {
107    let mut field_str = String::new();
108    let mut char_str = String::new();
109    let mut opts = String::new();
110    let mut in_char = false;
111
112    for c in s.chars() {
113        if c == '.' && !in_char && opts.is_empty() {
114            in_char = true;
115        } else if c.is_ascii_digit() && opts.is_empty() {
116            if in_char {
117                char_str.push(c);
118            } else {
119                field_str.push(c);
120            }
121        } else if c.is_ascii_alphabetic() {
122            opts.push(c);
123        } else {
124            return Err(format!("invalid character '{}' in key spec", c));
125        }
126    }
127
128    let field = if field_str.is_empty() {
129        0
130    } else {
131        field_str
132            .parse::<usize>()
133            .map_err(|_| "invalid field number".to_string())?
134    };
135
136    let char_pos = if char_str.is_empty() {
137        0
138    } else {
139        char_str
140            .parse::<usize>()
141            .map_err(|_| "invalid character position".to_string())?
142    };
143
144    Ok((field, char_pos, opts))
145}
146
147/// Find the byte range of the Nth field (0-indexed) in a line.
148/// Returns (start, end) byte offsets. Allocation-free.
149/// Uses SIMD memchr for separator-based field finding.
150#[inline]
151fn find_nth_field(line: &[u8], n: usize, separator: Option<u8>) -> (usize, usize) {
152    match separator {
153        Some(sep) => {
154            // Use memchr_iter for SIMD-accelerated separator scanning
155            let mut field = 0;
156            let mut start = 0;
157            for pos in memchr::memchr_iter(sep, line) {
158                if field == n {
159                    return (start, pos);
160                }
161                field += 1;
162                start = pos + 1;
163            }
164            if field == n {
165                (start, line.len())
166            } else {
167                (line.len(), line.len())
168            }
169        }
170        None => {
171            let mut field = 0;
172            let mut i = 0;
173            let len = line.len();
174
175            while i < len {
176                let field_start = i;
177                // Skip blanks (part of this field)
178                while i < len && is_blank(line[i]) {
179                    i += 1;
180                }
181                // Consume non-blanks
182                while i < len && !is_blank(line[i]) {
183                    i += 1;
184                }
185                if field == n {
186                    return (field_start, i);
187                }
188                field += 1;
189            }
190
191            (line.len(), line.len())
192        }
193    }
194}
195
196#[inline]
197fn is_blank(b: u8) -> bool {
198    b == b' ' || b == b'\t'
199}
200
201/// Extract the key portion of a line based on a KeyDef.
202/// Allocation-free: uses find_nth_field instead of collecting all fields.
203pub fn extract_key<'a>(line: &'a [u8], key: &KeyDef, separator: Option<u8>) -> &'a [u8] {
204    let sf = key.start_field.saturating_sub(1);
205    let (sf_start, sf_end) = find_nth_field(line, sf, separator);
206
207    if sf_start >= line.len() {
208        return b"";
209    }
210
211    let start_byte = if key.start_char > 0 {
212        let field_len = sf_end - sf_start;
213        let char_offset = (key.start_char - 1).min(field_len);
214        sf_start + char_offset
215    } else {
216        sf_start
217    };
218
219    let end_byte = if key.end_field > 0 {
220        let ef = key.end_field.saturating_sub(1);
221        let (ef_start, ef_end) = find_nth_field(line, ef, separator);
222        if key.end_char > 0 {
223            let field_len = ef_end - ef_start;
224            let char_offset = key.end_char.min(field_len);
225            ef_start + char_offset
226        } else {
227            ef_end
228        }
229    } else {
230        line.len()
231    };
232
233    if start_byte >= end_byte || start_byte >= line.len() {
234        return b"";
235    }
236
237    &line[start_byte..end_byte.min(line.len())]
238}