Skip to main content

coreutils_rs/sort/
compare.rs

1/// Comparison functions for different sort modes.
2/// All comparison functions are allocation-free for maximum sort performance.
3use std::cmp::Ordering;
4
5use super::key::KeyOpts;
6
7/// Strip leading blanks (space and tab).
8#[inline]
9pub fn skip_leading_blanks(s: &[u8]) -> &[u8] {
10    let mut i = 0;
11    while i < s.len() && (s[i] == b' ' || s[i] == b'\t') {
12        i += 1;
13    }
14    &s[i..]
15}
16
17/// Compare two byte slices lexicographically (default sort).
18#[inline]
19pub fn compare_lexical(a: &[u8], b: &[u8]) -> Ordering {
20    a.cmp(b)
21}
22
23/// Numeric sort (-n): compare leading numeric strings.
24/// Handles optional leading whitespace, sign, and decimal point.
25pub fn compare_numeric(a: &[u8], b: &[u8]) -> Ordering {
26    let va = parse_numeric_value(a);
27    let vb = parse_numeric_value(b);
28    va.partial_cmp(&vb).unwrap_or(Ordering::Equal)
29}
30
31/// Fast custom numeric parser: parses sign + digits + optional decimal directly from bytes.
32/// Avoids UTF-8 validation and str::parse::<f64>() overhead entirely.
33pub fn parse_numeric_value(s: &[u8]) -> f64 {
34    let s = skip_leading_blanks(s);
35    if s.is_empty() {
36        return 0.0;
37    }
38
39    let mut i = 0;
40    let negative = if s[i] == b'-' {
41        i += 1;
42        true
43    } else {
44        if s[i] == b'+' {
45            i += 1;
46        }
47        false
48    };
49
50    // Parse integer part
51    let mut integer: u64 = 0;
52    let mut has_digits = false;
53    while i < s.len() && s[i].is_ascii_digit() {
54        integer = integer.wrapping_mul(10).wrapping_add((s[i] - b'0') as u64);
55        has_digits = true;
56        i += 1;
57    }
58
59    // Parse fractional part
60    if i < s.len() && s[i] == b'.' {
61        i += 1;
62        let frac_start = i;
63        let mut frac_val: u64 = 0;
64        while i < s.len() && s[i].is_ascii_digit() {
65            frac_val = frac_val.wrapping_mul(10).wrapping_add((s[i] - b'0') as u64);
66            has_digits = true;
67            i += 1;
68        }
69        if !has_digits {
70            return 0.0;
71        }
72        let frac_digits = i - frac_start;
73        let result = if frac_digits > 0 {
74            // Use pre-computed powers of 10 for common cases
75            let divisor = POW10[frac_digits.min(POW10.len() - 1)];
76            integer as f64 + frac_val as f64 / divisor
77        } else {
78            integer as f64
79        };
80        return if negative { -result } else { result };
81    }
82
83    if !has_digits {
84        return 0.0;
85    }
86
87    let result = integer as f64;
88    if negative { -result } else { result }
89}
90
91/// Pre-computed powers of 10 for fast decimal conversion.
92const POW10: [f64; 20] = [
93    1.0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16,
94    1e17, 1e18, 1e19,
95];
96
97/// Fast integer-only parser for numeric sort (-n).
98/// Returns Some(i64) if the value is a pure integer (no decimal point, no exponent).
99/// Returns None if the value has a decimal point or is not a valid integer.
100/// This avoids the f64 conversion path for integer-only data.
101#[inline]
102pub fn try_parse_integer(s: &[u8]) -> Option<i64> {
103    let s = skip_leading_blanks(s);
104    if s.is_empty() {
105        return Some(0);
106    }
107
108    let mut i = 0;
109    let negative = if s[i] == b'-' {
110        i += 1;
111        true
112    } else {
113        if s[i] == b'+' {
114            i += 1;
115        }
116        false
117    };
118
119    // Parse digits
120    let mut value: i64 = 0;
121    let mut has_digits = false;
122    while i < s.len() && s[i].is_ascii_digit() {
123        // Check for overflow before multiplying
124        value = value.checked_mul(10)?.checked_add((s[i] - b'0') as i64)?;
125        has_digits = true;
126        i += 1;
127    }
128
129    // If there's a decimal point, this is not a pure integer
130    if i < s.len() && s[i] == b'.' {
131        return None;
132    }
133
134    if !has_digits {
135        return Some(0);
136    }
137
138    Some(if negative { -value } else { value })
139}
140
141/// Convert an i64 to a sortable u64 whose natural ordering matches signed integer ordering.
142/// Adds i64::MAX + 1 (0x8000000000000000) to shift the range to unsigned.
143#[inline]
144pub fn int_to_sortable_u64(v: i64) -> u64 {
145    (v as u64).wrapping_add(0x8000000000000000)
146}
147
148fn find_numeric_end(s: &[u8]) -> usize {
149    let mut i = 0;
150    if i < s.len() && (s[i] == b'+' || s[i] == b'-') {
151        i += 1;
152    }
153    let mut has_digits = false;
154    while i < s.len() && s[i].is_ascii_digit() {
155        i += 1;
156        has_digits = true;
157    }
158    if i < s.len() && s[i] == b'.' {
159        i += 1;
160        while i < s.len() && s[i].is_ascii_digit() {
161            i += 1;
162            has_digits = true;
163        }
164    }
165    if has_digits { i } else { 0 }
166}
167
168/// General numeric sort (-g): handles scientific notation, infinity, NaN.
169/// O(n) parser.
170pub fn compare_general_numeric(a: &[u8], b: &[u8]) -> Ordering {
171    let va = parse_general_numeric(a);
172    let vb = parse_general_numeric(b);
173    match (va.is_nan(), vb.is_nan()) {
174        (true, true) => Ordering::Equal,
175        (true, false) => Ordering::Less,
176        (false, true) => Ordering::Greater,
177        (false, false) => va.partial_cmp(&vb).unwrap_or(Ordering::Equal),
178    }
179}
180
181pub fn parse_general_numeric(s: &[u8]) -> f64 {
182    let s = skip_leading_blanks(s);
183    if s.is_empty() {
184        return f64::NAN;
185    }
186
187    // Find the longest valid float prefix
188    let mut i = 0;
189
190    // Handle "inf", "-inf", "+inf", "nan" etc.
191    let start = if i < s.len() && (s[i] == b'+' || s[i] == b'-') {
192        i += 1;
193        i - 1
194    } else {
195        i
196    };
197
198    // Check for "inf"/"infinity"/"nan" prefix (case-insensitive)
199    if i + 2 < s.len() {
200        let c0 = s[i].to_ascii_lowercase();
201        let c1 = s[i + 1].to_ascii_lowercase();
202        let c2 = s[i + 2].to_ascii_lowercase();
203        if (c0 == b'i' && c1 == b'n' && c2 == b'f') || (c0 == b'n' && c1 == b'a' && c2 == b'n') {
204            // Try parsing the prefix as a special float
205            let end = s.len().min(i + 8); // "infinity" is 8 chars
206            for e in (i + 3..=end).rev() {
207                if let Ok(text) = std::str::from_utf8(&s[start..e]) {
208                    if let Ok(v) = text.parse::<f64>() {
209                        return v;
210                    }
211                }
212            }
213            return f64::NAN;
214        }
215    }
216
217    // Reset i for numeric parsing
218    i = start;
219    if i < s.len() && (s[i] == b'+' || s[i] == b'-') {
220        i += 1;
221    }
222
223    // Digits before decimal
224    let mut has_digits = false;
225    while i < s.len() && s[i].is_ascii_digit() {
226        i += 1;
227        has_digits = true;
228    }
229    // Decimal point
230    if i < s.len() && s[i] == b'.' {
231        i += 1;
232        while i < s.len() && s[i].is_ascii_digit() {
233            i += 1;
234            has_digits = true;
235        }
236    }
237    if !has_digits {
238        return f64::NAN;
239    }
240    // Exponent
241    if i < s.len() && (s[i] == b'e' || s[i] == b'E') {
242        let save = i;
243        i += 1;
244        if i < s.len() && (s[i] == b'+' || s[i] == b'-') {
245            i += 1;
246        }
247        if i < s.len() && s[i].is_ascii_digit() {
248            while i < s.len() && s[i].is_ascii_digit() {
249                i += 1;
250            }
251        } else {
252            i = save;
253        }
254    }
255
256    // Parse the numeric prefix using standard library
257    std::str::from_utf8(&s[start..i])
258        .ok()
259        .and_then(|s| s.parse::<f64>().ok())
260        .unwrap_or(f64::NAN)
261}
262
263/// Human numeric sort (-h): handles suffixes K, M, G, T, P, E, Z, Y.
264pub fn compare_human_numeric(a: &[u8], b: &[u8]) -> Ordering {
265    let va = parse_human_numeric(a);
266    let vb = parse_human_numeric(b);
267    va.partial_cmp(&vb).unwrap_or(Ordering::Equal)
268}
269
270pub fn parse_human_numeric(s: &[u8]) -> f64 {
271    let s = skip_leading_blanks(s);
272    if s.is_empty() {
273        return 0.0;
274    }
275
276    // Use the fast custom parser for the numeric part
277    let base = parse_numeric_value(s);
278    let end = find_numeric_end(s);
279
280    if end < s.len() {
281        let multiplier = match s[end] {
282            b'K' | b'k' => 1e3,
283            b'M' => 1e6,
284            b'G' => 1e9,
285            b'T' => 1e12,
286            b'P' => 1e15,
287            b'E' => 1e18,
288            b'Z' => 1e21,
289            b'Y' => 1e24,
290            _ => 1.0,
291        };
292        base * multiplier
293    } else {
294        base
295    }
296}
297
298/// Month sort (-M).
299pub fn compare_month(a: &[u8], b: &[u8]) -> Ordering {
300    let ma = parse_month(a);
301    let mb = parse_month(b);
302    ma.cmp(&mb)
303}
304
305fn parse_month(s: &[u8]) -> u8 {
306    let s = skip_leading_blanks(s);
307    if s.len() < 3 {
308        return 0;
309    }
310    let m = [
311        s[0].to_ascii_uppercase(),
312        s[1].to_ascii_uppercase(),
313        s[2].to_ascii_uppercase(),
314    ];
315    match &m {
316        b"JAN" => 1,
317        b"FEB" => 2,
318        b"MAR" => 3,
319        b"APR" => 4,
320        b"MAY" => 5,
321        b"JUN" => 6,
322        b"JUL" => 7,
323        b"AUG" => 8,
324        b"SEP" => 9,
325        b"OCT" => 10,
326        b"NOV" => 11,
327        b"DEC" => 12,
328        _ => 0,
329    }
330}
331
332/// Version sort (-V): natural sort of version numbers.
333/// Uses byte slices directly instead of char iterators for maximum performance.
334pub fn compare_version(a: &[u8], b: &[u8]) -> Ordering {
335    let mut ai = 0usize;
336    let mut bi = 0usize;
337
338    loop {
339        if ai >= a.len() && bi >= b.len() {
340            return Ordering::Equal;
341        }
342        if ai >= a.len() {
343            return Ordering::Less;
344        }
345        if bi >= b.len() {
346            return Ordering::Greater;
347        }
348
349        let ac = a[ai];
350        let bc = b[bi];
351
352        if ac.is_ascii_digit() && bc.is_ascii_digit() {
353            let anum = consume_number_bytes(a, &mut ai);
354            let bnum = consume_number_bytes(b, &mut bi);
355            match anum.cmp(&bnum) {
356                Ordering::Equal => continue,
357                other => return other,
358            }
359        } else {
360            match ac.cmp(&bc) {
361                Ordering::Equal => {
362                    ai += 1;
363                    bi += 1;
364                }
365                other => return other,
366            }
367        }
368    }
369}
370
371#[inline]
372fn consume_number_bytes(data: &[u8], pos: &mut usize) -> u64 {
373    let mut n: u64 = 0;
374    while *pos < data.len() && data[*pos].is_ascii_digit() {
375        n = n
376            .saturating_mul(10)
377            .saturating_add((data[*pos] - b'0') as u64);
378        *pos += 1;
379    }
380    n
381}
382
383/// Random sort (-R): hash-based shuffle that groups identical keys.
384pub fn compare_random(a: &[u8], b: &[u8], seed: u64) -> Ordering {
385    let ha = fnv1a_hash(a, seed);
386    let hb = fnv1a_hash(b, seed);
387    ha.cmp(&hb)
388}
389
390/// FNV-1a hash with seed mixing.
391#[inline]
392fn fnv1a_hash(data: &[u8], seed: u64) -> u64 {
393    let mut hash = 0xcbf29ce484222325u64 ^ seed;
394    for &b in data {
395        hash ^= b as u64;
396        hash = hash.wrapping_mul(0x100000001b3);
397    }
398    hash
399}
400
401/// Compare with text filtering (-d, -i, -f flags in any combination).
402/// Allocation-free: uses iterator filtering.
403#[inline]
404fn is_dict_char(b: u8) -> bool {
405    b.is_ascii_alphanumeric() || b == b' ' || b == b'\t'
406}
407
408#[inline]
409fn is_printable(b: u8) -> bool {
410    b >= 0x20 && b < 0x7f
411}
412
413fn compare_text_filtered(
414    a: &[u8],
415    b: &[u8],
416    dict: bool,
417    no_print: bool,
418    fold_case: bool,
419) -> Ordering {
420    if !dict && !no_print && !fold_case {
421        return a.cmp(b);
422    }
423
424    let mut ai = a.iter().copied();
425    let mut bi = b.iter().copied();
426
427    loop {
428        let na = next_valid(&mut ai, dict, no_print);
429        let nb = next_valid(&mut bi, dict, no_print);
430        match (na, nb) {
431            (Some(ab), Some(bb)) => {
432                let ca = if fold_case {
433                    ab.to_ascii_uppercase()
434                } else {
435                    ab
436                };
437                let cb = if fold_case {
438                    bb.to_ascii_uppercase()
439                } else {
440                    bb
441                };
442                match ca.cmp(&cb) {
443                    Ordering::Equal => continue,
444                    other => return other,
445                }
446            }
447            (Some(_), None) => return Ordering::Greater,
448            (None, Some(_)) => return Ordering::Less,
449            (None, None) => return Ordering::Equal,
450        }
451    }
452}
453
454#[inline]
455fn next_valid(iter: &mut impl Iterator<Item = u8>, dict: bool, no_print: bool) -> Option<u8> {
456    loop {
457        match iter.next() {
458            None => return None,
459            Some(b) => {
460                if dict && !is_dict_char(b) {
461                    continue;
462                }
463                if no_print && !is_printable(b) {
464                    continue;
465                }
466                return Some(b);
467            }
468        }
469    }
470}
471
472// Public wrappers for backward compatibility with tests
473pub fn compare_ignore_case(a: &[u8], b: &[u8]) -> Ordering {
474    compare_text_filtered(a, b, false, false, true)
475}
476
477pub fn compare_dictionary(a: &[u8], b: &[u8], ignore_case: bool) -> Ordering {
478    compare_text_filtered(a, b, true, false, ignore_case)
479}
480
481pub fn compare_ignore_nonprinting(a: &[u8], b: &[u8], ignore_case: bool) -> Ordering {
482    compare_text_filtered(a, b, false, true, ignore_case)
483}
484
485/// Master comparison function that dispatches based on KeyOpts.
486pub fn compare_with_opts(a: &[u8], b: &[u8], opts: &KeyOpts, random_seed: u64) -> Ordering {
487    let a = if opts.ignore_leading_blanks {
488        skip_leading_blanks(a)
489    } else {
490        a
491    };
492    let b = if opts.ignore_leading_blanks {
493        skip_leading_blanks(b)
494    } else {
495        b
496    };
497
498    let result = if opts.numeric {
499        compare_numeric(a, b)
500    } else if opts.general_numeric {
501        compare_general_numeric(a, b)
502    } else if opts.human_numeric {
503        compare_human_numeric(a, b)
504    } else if opts.month {
505        compare_month(a, b)
506    } else if opts.version {
507        compare_version(a, b)
508    } else if opts.random {
509        compare_random(a, b, random_seed)
510    } else {
511        compare_text_filtered(
512            a,
513            b,
514            opts.dictionary_order,
515            opts.ignore_nonprinting,
516            opts.ignore_case,
517        )
518    };
519
520    if opts.reverse {
521        result.reverse()
522    } else {
523        result
524    }
525}
526
527/// Concrete comparison function type. Selected once at setup time to avoid
528/// per-comparison flag checking in hot sort loops.
529pub type CompareFn = fn(&[u8], &[u8]) -> Ordering;
530
531/// Select a concrete comparison function based on KeyOpts.
532/// Returns (compare_fn, needs_leading_blank_strip, needs_reverse).
533/// The caller applies blank-stripping and reversal outside the function pointer,
534/// eliminating all per-comparison branching.
535pub fn select_comparator(opts: &KeyOpts, random_seed: u64) -> (CompareFn, bool, bool) {
536    let needs_blank = opts.ignore_leading_blanks;
537    let needs_reverse = opts.reverse;
538
539    let cmp: CompareFn = if opts.numeric {
540        compare_numeric
541    } else if opts.general_numeric {
542        compare_general_numeric
543    } else if opts.human_numeric {
544        compare_human_numeric
545    } else if opts.month {
546        compare_month
547    } else if opts.version {
548        compare_version
549    } else if opts.random {
550        // Random needs seed — wrap in a closure-like pattern
551        // Since we need random_seed, we use a special case
552        return (
553            make_random_comparator(random_seed),
554            needs_blank,
555            needs_reverse,
556        );
557    } else if opts.dictionary_order || opts.ignore_nonprinting || opts.ignore_case {
558        // Text filtering: select specialized variant
559        match (
560            opts.dictionary_order,
561            opts.ignore_nonprinting,
562            opts.ignore_case,
563        ) {
564            (false, false, true) => compare_ignore_case,
565            (true, false, false) => |a: &[u8], b: &[u8]| compare_dictionary(a, b, false),
566            (true, false, true) => |a: &[u8], b: &[u8]| compare_dictionary(a, b, true),
567            (false, true, false) => |a: &[u8], b: &[u8]| compare_ignore_nonprinting(a, b, false),
568            (false, true, true) => |a: &[u8], b: &[u8]| compare_ignore_nonprinting(a, b, true),
569            (true, true, false) => {
570                |a: &[u8], b: &[u8]| compare_text_filtered(a, b, true, true, false)
571            }
572            (true, true, true) => {
573                |a: &[u8], b: &[u8]| compare_text_filtered(a, b, true, true, true)
574            }
575            _ => |a: &[u8], b: &[u8]| a.cmp(b),
576        }
577    } else {
578        |a: &[u8], b: &[u8]| a.cmp(b)
579    };
580
581    (cmp, needs_blank, needs_reverse)
582}
583
584fn make_random_comparator(seed: u64) -> CompareFn {
585    // We can't capture the seed in a function pointer, so we use a static.
586    // This is safe because sort is single-process and seed doesn't change during a sort.
587    RANDOM_SEED.store(seed, std::sync::atomic::Ordering::Relaxed);
588    random_compare_with_static_seed
589}
590
591static RANDOM_SEED: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
592
593fn random_compare_with_static_seed(a: &[u8], b: &[u8]) -> Ordering {
594    let seed = RANDOM_SEED.load(std::sync::atomic::Ordering::Relaxed);
595    compare_random(a, b, seed)
596}