coreutils_rs/sort/
compare.rs

1/// Comparison functions for different sort modes.
2use std::cmp::Ordering;
3
4use super::key::KeyOpts;
5
6/// Strip leading blanks (space and tab).
7#[inline(always)]
8pub fn skip_leading_blanks(s: &[u8]) -> &[u8] {
9    let mut i = 0;
10    while i < s.len()
11        && (unsafe { *s.get_unchecked(i) } == b' ' || unsafe { *s.get_unchecked(i) } == b'\t')
12    {
13        i += 1;
14    }
15    &s[i..]
16}
17
18/// Compare two byte slices using locale-aware collation (strcoll).
19/// Uses stack buffers (up to 256 bytes) to avoid heap allocation in the hot path.
20/// Falls back to heap-allocated CString for longer strings, and to byte comparison
21/// if the input contains interior null bytes.
22#[inline]
23pub fn compare_locale(a: &[u8], b: &[u8]) -> Ordering {
24    // Stack buffer size for null-terminated copies. Most sort keys are < 256 bytes.
25    const STACK_BUF: usize = 256;
26
27    // Fast path: if either contains a null byte, fall back to byte comparison
28    // (CString can't represent interior nulls)
29    if memchr::memchr(0, a).is_some() || memchr::memchr(0, b).is_some() {
30        return a.cmp(b);
31    }
32
33    if a.len() < STACK_BUF && b.len() < STACK_BUF {
34        // Stack-only path: no heap allocation.
35        // Zero-init provides the null terminator after copy_nonoverlapping.
36        let mut buf_a = [0u8; STACK_BUF];
37        let mut buf_b = [0u8; STACK_BUF];
38        // SAFETY: a.len() < STACK_BUF (checked above), no interior NULs (checked above).
39        unsafe {
40            std::ptr::copy_nonoverlapping(a.as_ptr(), buf_a.as_mut_ptr(), a.len());
41            std::ptr::copy_nonoverlapping(b.as_ptr(), buf_b.as_mut_ptr(), b.len());
42            let result = libc::strcoll(buf_a.as_ptr() as *const _, buf_b.as_ptr() as *const _);
43            return result.cmp(&0);
44        }
45    }
46
47    // Fallback for long strings: heap allocate.
48    // Null bytes were already filtered above, so CString::new always succeeds.
49    use std::ffi::CString;
50    let ca = CString::new(a).expect("null bytes already filtered above");
51    let cb = CString::new(b).expect("null bytes already filtered above");
52    let result = unsafe { libc::strcoll(ca.as_ptr(), cb.as_ptr()) };
53    result.cmp(&0)
54}
55
56/// Compare two byte slices lexicographically (default sort).
57#[inline]
58pub fn compare_lexical(a: &[u8], b: &[u8]) -> Ordering {
59    a.cmp(b)
60}
61
62/// Numeric sort (-n): compare leading numeric strings.
63/// Handles optional leading whitespace, sign, and decimal point.
64#[inline]
65pub fn compare_numeric(a: &[u8], b: &[u8]) -> Ordering {
66    let va = parse_numeric_value(a);
67    let vb = parse_numeric_value(b);
68    va.partial_cmp(&vb).unwrap_or(Ordering::Equal)
69}
70
71/// Fast custom numeric parser: parses sign + digits + optional decimal directly from bytes.
72/// Avoids UTF-8 validation and str::parse::<f64>() overhead entirely.
73/// Uses batch digit processing for the integer part (4 digits at a time) to reduce
74/// loop iterations and branch mispredictions.
75#[inline]
76pub fn parse_numeric_value(s: &[u8]) -> f64 {
77    let s = skip_leading_blanks(s);
78    if s.is_empty() {
79        return 0.0;
80    }
81
82    let mut i = 0;
83    let negative = if unsafe { *s.get_unchecked(i) } == b'-' {
84        i += 1;
85        true
86    } else {
87        if i < s.len() && unsafe { *s.get_unchecked(i) } == b'+' {
88            i += 1;
89        }
90        false
91    };
92
93    if i >= s.len() {
94        return 0.0;
95    }
96
97    // Parse integer part — batch 4 digits at a time for reduced loop overhead
98    let mut integer: u64 = 0;
99    let mut has_digits = false;
100
101    // Fast batch path: process 4 digits at a time
102    while i + 4 <= s.len() {
103        let d0 = unsafe { *s.get_unchecked(i) }.wrapping_sub(b'0');
104        if d0 > 9 {
105            break;
106        }
107        let d1 = unsafe { *s.get_unchecked(i + 1) }.wrapping_sub(b'0');
108        if d1 > 9 {
109            integer = integer.wrapping_mul(10).wrapping_add(d0 as u64);
110            i += 1;
111            has_digits = true;
112            break;
113        }
114        let d2 = unsafe { *s.get_unchecked(i + 2) }.wrapping_sub(b'0');
115        if d2 > 9 {
116            integer = integer
117                .wrapping_mul(100)
118                .wrapping_add(d0 as u64 * 10 + d1 as u64);
119            i += 2;
120            has_digits = true;
121            break;
122        }
123        let d3 = unsafe { *s.get_unchecked(i + 3) }.wrapping_sub(b'0');
124        if d3 > 9 {
125            integer = integer
126                .wrapping_mul(1000)
127                .wrapping_add(d0 as u64 * 100 + d1 as u64 * 10 + d2 as u64);
128            i += 3;
129            has_digits = true;
130            break;
131        }
132        integer = integer
133            .wrapping_mul(10000)
134            .wrapping_add(d0 as u64 * 1000 + d1 as u64 * 100 + d2 as u64 * 10 + d3 as u64);
135        i += 4;
136        has_digits = true;
137    }
138
139    // Tail: process remaining digits one at a time
140    while i < s.len() {
141        let d = unsafe { *s.get_unchecked(i) }.wrapping_sub(b'0');
142        if d > 9 {
143            break;
144        }
145        integer = integer.wrapping_mul(10).wrapping_add(d as u64);
146        has_digits = true;
147        i += 1;
148    }
149
150    // Parse fractional part
151    if i < s.len() && unsafe { *s.get_unchecked(i) } == b'.' {
152        i += 1;
153        let frac_start = i;
154        let mut frac_val: u64 = 0;
155        while i < s.len() {
156            let d = unsafe { *s.get_unchecked(i) }.wrapping_sub(b'0');
157            if d > 9 {
158                break;
159            }
160            frac_val = frac_val.wrapping_mul(10).wrapping_add(d as u64);
161            has_digits = true;
162            i += 1;
163        }
164        if !has_digits {
165            return 0.0;
166        }
167        let frac_digits = i - frac_start;
168        let result = if frac_digits > 0 {
169            // Use pre-computed powers of 10 for common cases
170            let divisor = POW10[frac_digits.min(POW10.len() - 1)];
171            integer as f64 + frac_val as f64 / divisor
172        } else {
173            integer as f64
174        };
175        return if negative { -result } else { result };
176    }
177
178    if !has_digits {
179        return 0.0;
180    }
181
182    let result = integer as f64;
183    if negative { -result } else { result }
184}
185
186/// Pre-computed powers of 10 for fast decimal conversion.
187const POW10: [f64; 20] = [
188    1.0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16,
189    1e17, 1e18, 1e19,
190];
191
192/// Fast integer-only parser for numeric sort (-n).
193/// Returns Some(i64) if the value is a pure integer (no decimal point, no exponent).
194/// Returns None if the value has a decimal point or is not a valid integer.
195/// This avoids the f64 conversion path for integer-only data.
196/// Uses wrapping arithmetic (no overflow checks) for speed — values >18 digits
197/// may wrap but sort order is still consistent since all values wrap the same way.
198#[inline]
199pub fn try_parse_integer(s: &[u8]) -> Option<i64> {
200    let s = skip_leading_blanks(s);
201    if s.is_empty() {
202        return Some(0);
203    }
204
205    let mut i = 0;
206    let negative = if unsafe { *s.get_unchecked(i) } == b'-' {
207        i += 1;
208        true
209    } else {
210        if i < s.len() && unsafe { *s.get_unchecked(i) } == b'+' {
211            i += 1;
212        }
213        false
214    };
215
216    // Parse digits — wrapping arithmetic, batch 4 digits at a time
217    let mut value: i64 = 0;
218    let mut has_digits = false;
219
220    // Fast batch: 4 digits at a time
221    while i + 4 <= s.len() {
222        let d0 = unsafe { *s.get_unchecked(i) }.wrapping_sub(b'0');
223        if d0 > 9 {
224            break;
225        }
226        let d1 = unsafe { *s.get_unchecked(i + 1) }.wrapping_sub(b'0');
227        if d1 > 9 {
228            value = value.wrapping_mul(10).wrapping_add(d0 as i64);
229            i += 1;
230            has_digits = true;
231            break;
232        }
233        let d2 = unsafe { *s.get_unchecked(i + 2) }.wrapping_sub(b'0');
234        if d2 > 9 {
235            value = value
236                .wrapping_mul(100)
237                .wrapping_add(d0 as i64 * 10 + d1 as i64);
238            i += 2;
239            has_digits = true;
240            break;
241        }
242        let d3 = unsafe { *s.get_unchecked(i + 3) }.wrapping_sub(b'0');
243        if d3 > 9 {
244            value = value
245                .wrapping_mul(1000)
246                .wrapping_add(d0 as i64 * 100 + d1 as i64 * 10 + d2 as i64);
247            i += 3;
248            has_digits = true;
249            break;
250        }
251        value = value
252            .wrapping_mul(10000)
253            .wrapping_add(d0 as i64 * 1000 + d1 as i64 * 100 + d2 as i64 * 10 + d3 as i64);
254        i += 4;
255        has_digits = true;
256    }
257
258    // Tail: remaining digits
259    while i < s.len() {
260        let d = unsafe { *s.get_unchecked(i) }.wrapping_sub(b'0');
261        if d > 9 {
262            break;
263        }
264        value = value.wrapping_mul(10).wrapping_add(d as i64);
265        has_digits = true;
266        i += 1;
267    }
268
269    // If there's a decimal point, this is not a pure integer
270    if i < s.len() && unsafe { *s.get_unchecked(i) } == b'.' {
271        return None;
272    }
273
274    if !has_digits {
275        return Some(0);
276    }
277
278    Some(if negative {
279        value.wrapping_neg()
280    } else {
281        value
282    })
283}
284
285/// Convert an i64 to a sortable u64 whose natural ordering matches signed integer ordering.
286/// Adds i64::MAX + 1 (0x8000000000000000) to shift the range to unsigned.
287#[inline]
288pub fn int_to_sortable_u64(v: i64) -> u64 {
289    (v as u64).wrapping_add(0x8000000000000000)
290}
291
292fn find_numeric_end(s: &[u8]) -> usize {
293    let mut i = 0;
294    if i < s.len() && (s[i] == b'+' || s[i] == b'-') {
295        i += 1;
296    }
297    let mut has_digits = false;
298    while i < s.len() && s[i].is_ascii_digit() {
299        i += 1;
300        has_digits = true;
301    }
302    if i < s.len() && s[i] == b'.' {
303        i += 1;
304        while i < s.len() && s[i].is_ascii_digit() {
305            i += 1;
306            has_digits = true;
307        }
308    }
309    if has_digits { i } else { 0 }
310}
311
312/// General numeric sort (-g): handles scientific notation, infinity, NaN.
313/// O(n) parser.
314pub fn compare_general_numeric(a: &[u8], b: &[u8]) -> Ordering {
315    let va = parse_general_numeric(a);
316    let vb = parse_general_numeric(b);
317    match (va.is_nan(), vb.is_nan()) {
318        (true, true) => Ordering::Equal,
319        (true, false) => Ordering::Less,
320        (false, true) => Ordering::Greater,
321        (false, false) => va.partial_cmp(&vb).unwrap_or(Ordering::Equal),
322    }
323}
324
325pub fn parse_general_numeric(s: &[u8]) -> f64 {
326    let s = skip_leading_blanks(s);
327    if s.is_empty() {
328        return f64::NAN;
329    }
330
331    // Find the longest valid float prefix
332    let mut i = 0;
333
334    // Handle "inf", "-inf", "+inf", "nan" etc.
335    let start = if i < s.len() && (s[i] == b'+' || s[i] == b'-') {
336        i += 1;
337        i - 1
338    } else {
339        i
340    };
341
342    // Check for "inf"/"infinity"/"nan" prefix (case-insensitive)
343    if i + 2 < s.len() {
344        let c0 = s[i].to_ascii_lowercase();
345        let c1 = s[i + 1].to_ascii_lowercase();
346        let c2 = s[i + 2].to_ascii_lowercase();
347        if (c0 == b'i' && c1 == b'n' && c2 == b'f') || (c0 == b'n' && c1 == b'a' && c2 == b'n') {
348            // Try parsing the prefix as a special float
349            let end = s.len().min(i + 8); // "infinity" is 8 chars
350            for e in (i + 3..=end).rev() {
351                if let Ok(text) = std::str::from_utf8(&s[start..e]) {
352                    if let Ok(v) = text.parse::<f64>() {
353                        return v;
354                    }
355                }
356            }
357            return f64::NAN;
358        }
359    }
360
361    // Reset i for numeric parsing
362    i = start;
363    if i < s.len() && (s[i] == b'+' || s[i] == b'-') {
364        i += 1;
365    }
366
367    // Digits before decimal
368    let mut has_digits = false;
369    while i < s.len() && s[i].is_ascii_digit() {
370        i += 1;
371        has_digits = true;
372    }
373    // Decimal point
374    if i < s.len() && s[i] == b'.' {
375        i += 1;
376        while i < s.len() && s[i].is_ascii_digit() {
377            i += 1;
378            has_digits = true;
379        }
380    }
381    if !has_digits {
382        return f64::NAN;
383    }
384    // Exponent
385    if i < s.len() && (s[i] == b'e' || s[i] == b'E') {
386        let save = i;
387        i += 1;
388        if i < s.len() && (s[i] == b'+' || s[i] == b'-') {
389            i += 1;
390        }
391        if i < s.len() && s[i].is_ascii_digit() {
392            while i < s.len() && s[i].is_ascii_digit() {
393                i += 1;
394            }
395        } else {
396            i = save;
397        }
398    }
399
400    // Parse the numeric prefix using standard library
401    std::str::from_utf8(&s[start..i])
402        .ok()
403        .and_then(|s| s.parse::<f64>().ok())
404        .unwrap_or(f64::NAN)
405}
406
407/// Human numeric sort (-h): handles suffixes K, M, G, T, P, E, Z, Y.
408/// GNU sort compares by suffix tier first (no-suffix < K < M < G < T < P < E < Z < Y),
409/// then by numeric value within the same tier. This means 1K > 999999 and 1G > 1023M.
410pub fn compare_human_numeric(a: &[u8], b: &[u8]) -> Ordering {
411    let (val_a, tier_a) = parse_human_numeric_tiered(a);
412    let (val_b, tier_b) = parse_human_numeric_tiered(b);
413
414    // Handle sign: negative values sort before positive, tier comparison inverts
415    let sign_a = if val_a < 0.0 {
416        -1i8
417    } else if val_a > 0.0 {
418        1
419    } else {
420        0
421    };
422    let sign_b = if val_b < 0.0 {
423        -1i8
424    } else if val_b > 0.0 {
425        1
426    } else {
427        0
428    };
429
430    // Different signs: compare directly
431    if sign_a != sign_b {
432        return sign_a.cmp(&sign_b);
433    }
434
435    // Both zero
436    if sign_a == 0 && sign_b == 0 {
437        return Ordering::Equal;
438    }
439
440    // Same sign: compare tier first, then value within tier
441    // For negative numbers, higher tier means MORE negative (reverse tier order)
442    if sign_a > 0 {
443        match tier_a.cmp(&tier_b) {
444            Ordering::Equal => val_a.partial_cmp(&val_b).unwrap_or(Ordering::Equal),
445            other => other,
446        }
447    } else {
448        // Negative: -1G < -1M (higher tier is more negative = smaller)
449        match tier_a.cmp(&tier_b) {
450            Ordering::Equal => val_a.partial_cmp(&val_b).unwrap_or(Ordering::Equal),
451            Ordering::Less => Ordering::Greater,
452            Ordering::Greater => Ordering::Less,
453        }
454    }
455}
456
457/// Parse a human-numeric value and return (numeric_value, suffix_tier).
458/// Tier: 0=no suffix, 1=K, 2=M, 3=G, 4=T, 5=P, 6=E, 7=Z, 8=Y.
459fn parse_human_numeric_tiered(s: &[u8]) -> (f64, u8) {
460    let s = skip_leading_blanks(s);
461    if s.is_empty() {
462        return (0.0, 0);
463    }
464
465    let base = parse_numeric_value(s);
466    let end = find_numeric_end(s);
467
468    if end < s.len() {
469        let tier = match s[end] {
470            b'K' | b'k' => 1,
471            b'M' => 2,
472            b'G' => 3,
473            b'T' => 4,
474            b'P' => 5,
475            b'E' => 6,
476            b'Z' => 7,
477            b'Y' => 8,
478            _ => 0,
479        };
480        (base, tier)
481    } else {
482        (base, 0)
483    }
484}
485
486/// Convert a human-numeric string directly to a sortable u64.
487/// Encodes tier in the top 4 bits and the float value in the bottom 60 bits.
488/// This avoids the f64 precision loss that would occur from encoding tier + value
489/// in a single f64 (e.g., 1e18 + 3.0 == 1e18 + 1.0 due to 52-bit mantissa).
490///
491/// Encoding:
492/// - Non-negative values: (8 + tier) << 60 | (float_sortable >> 4)
493/// - Negative values: (7 - tier) << 60 | (float_sortable >> 4)
494///
495/// This ensures: -1G < -1M < -1K < -1 < 0 < 1 < 1K < 1M < 1G
496pub fn human_numeric_to_sortable_u64(s: &[u8]) -> u64 {
497    let (val, tier) = parse_human_numeric_tiered(s);
498
499    // Convert the numeric value to a sortable u64 (preserves float ordering)
500    let sf = {
501        if val.is_nan() {
502            0u64 // NaN sorts first
503        } else {
504            let bits = val.to_bits();
505            if (bits >> 63) == 0 {
506                bits ^ 0x8000000000000000 // positive: flip sign bit
507            } else {
508                !bits // negative: flip all bits
509            }
510        }
511    };
512
513    // Encode tier in top 4 bits, float in bottom 60 bits
514    if val >= 0.0 || val == 0.0 {
515        ((8 + tier as u64) << 60) | (sf >> 4)
516    } else {
517        ((7 - tier as u64) << 60) | (sf >> 4)
518    }
519}
520
521/// Legacy parse function for backward compatibility (not used in sort hot path).
522pub fn parse_human_numeric(s: &[u8]) -> f64 {
523    let (val, _tier) = parse_human_numeric_tiered(s);
524    val
525}
526
527/// Month sort (-M).
528pub fn compare_month(a: &[u8], b: &[u8]) -> Ordering {
529    let ma = parse_month(a);
530    let mb = parse_month(b);
531    ma.cmp(&mb)
532}
533
534fn parse_month(s: &[u8]) -> u8 {
535    let s = skip_leading_blanks(s);
536    if s.len() < 3 {
537        return 0;
538    }
539    let m = [
540        s[0].to_ascii_uppercase(),
541        s[1].to_ascii_uppercase(),
542        s[2].to_ascii_uppercase(),
543    ];
544    match &m {
545        b"JAN" => 1,
546        b"FEB" => 2,
547        b"MAR" => 3,
548        b"APR" => 4,
549        b"MAY" => 5,
550        b"JUN" => 6,
551        b"JUL" => 7,
552        b"AUG" => 8,
553        b"SEP" => 9,
554        b"OCT" => 10,
555        b"NOV" => 11,
556        b"DEC" => 12,
557        _ => 0,
558    }
559}
560
561/// Compute the length of the prefix before file suffixes.
562/// Matches GNU gnulib `file_prefixlen` from `filevercmp.c`.
563/// Strips trailing suffix groups matching `(\.[A-Za-z~][A-Za-z0-9~]*)*` from the end.
564fn file_prefixlen(s: &[u8]) -> usize {
565    let n = s.len();
566    let mut prefixlen = 0;
567    let mut i = 0;
568    loop {
569        if i == n {
570            return prefixlen;
571        }
572        i += 1;
573        prefixlen = i;
574        while i + 1 < n && s[i] == b'.' && (s[i + 1].is_ascii_alphabetic() || s[i + 1] == b'~') {
575            i += 2;
576            while i < n && (s[i].is_ascii_alphanumeric() || s[i] == b'~') {
577                i += 1;
578            }
579        }
580    }
581}
582
583/// Version sort (-V): GNU filevercmp-compatible version comparison.
584/// Implements the exact same algorithm as GNU coreutils' filevercmp.
585pub fn compare_version(a: &[u8], b: &[u8]) -> Ordering {
586    // GNU filevercmp: skip hidden-file dot prefix, compare, then break tie
587    // by including the prefix.
588    let a_prefix = if a.first() == Some(&b'.') { 1 } else { 0 };
589    let b_prefix = if b.first() == Some(&b'.') { 1 } else { 0 };
590
591    // Strip file suffixes (e.g., .tar.gz) before comparing, as GNU does.
592    let a_body = &a[a_prefix..];
593    let b_body = &b[b_prefix..];
594    let a_plen = file_prefixlen(a_body);
595    let b_plen = file_prefixlen(b_body);
596
597    // First compare the prefix parts (without suffixes)
598    let result = verrevcmp(&a_body[..a_plen], &b_body[..b_plen]);
599    if result != Ordering::Equal {
600        return result;
601    }
602
603    // Tie-break: compare full body (with suffixes)
604    let result = verrevcmp(a_body, b_body);
605    if result != Ordering::Equal {
606        return result;
607    }
608
609    // Final tie-break: compare the full strings (including dot prefix)
610    verrevcmp(a, b)
611}
612
613/// The core comparison algorithm matching GNU's verrevcmp exactly.
614/// From gnulib/lib/filevercmp.c.
615fn verrevcmp(s1: &[u8], s2: &[u8]) -> Ordering {
616    let s1_len = s1.len();
617    let s2_len = s2.len();
618    let mut s1_pos = 0usize;
619    let mut s2_pos = 0usize;
620
621    while s1_pos < s1_len || s2_pos < s2_len {
622        let mut first_diff = 0i32;
623
624        // Compare non-digit characters using the special ordering
625        while (s1_pos < s1_len && !s1[s1_pos].is_ascii_digit())
626            || (s2_pos < s2_len && !s2[s2_pos].is_ascii_digit())
627        {
628            let s1_c = ver_order(s1, s1_pos, s1_len);
629            let s2_c = ver_order(s2, s2_pos, s2_len);
630            if s1_c != s2_c {
631                return if s1_c < s2_c {
632                    Ordering::Less
633                } else {
634                    Ordering::Greater
635                };
636            }
637            s1_pos += 1;
638            s2_pos += 1;
639        }
640
641        // Skip leading zeros
642        while s1_pos < s1_len && s1[s1_pos] == b'0' {
643            s1_pos += 1;
644        }
645        while s2_pos < s2_len && s2[s2_pos] == b'0' {
646            s2_pos += 1;
647        }
648
649        // Compare digit sequences of the same length
650        while s1_pos < s1_len
651            && s2_pos < s2_len
652            && s1[s1_pos].is_ascii_digit()
653            && s2[s2_pos].is_ascii_digit()
654        {
655            if first_diff == 0 {
656                first_diff = s1[s1_pos] as i32 - s2[s2_pos] as i32;
657            }
658            s1_pos += 1;
659            s2_pos += 1;
660        }
661
662        // If one string still has digits, it's the larger number
663        if s1_pos < s1_len && s1[s1_pos].is_ascii_digit() {
664            return Ordering::Greater;
665        }
666        if s2_pos < s2_len && s2[s2_pos].is_ascii_digit() {
667            return Ordering::Less;
668        }
669        if first_diff != 0 {
670            return if first_diff < 0 {
671                Ordering::Less
672            } else {
673                Ordering::Greater
674            };
675        }
676    }
677
678    Ordering::Equal
679}
680
681/// Character ordering for GNU filevercmp (matches gnulib exactly):
682/// ~(-2) < end-of-string(-1) < digits(0) < letters(char) < other(UCHAR_MAX+1+char)
683#[inline]
684fn ver_order(s: &[u8], pos: usize, len: usize) -> i32 {
685    if pos == len {
686        return -1;
687    }
688    let c = s[pos];
689    if c.is_ascii_digit() {
690        0
691    } else if c.is_ascii_alphabetic() {
692        c as i32
693    } else if c == b'~' {
694        -2
695    } else {
696        c as i32 + 256
697    }
698}
699
700/// Random sort (-R): hash-based shuffle that groups identical keys.
701pub fn compare_random(a: &[u8], b: &[u8], seed: u64) -> Ordering {
702    let ha = fnv1a_hash(a, seed);
703    let hb = fnv1a_hash(b, seed);
704    ha.cmp(&hb)
705}
706
707/// FNV-1a hash with seed mixing.
708#[inline]
709fn fnv1a_hash(data: &[u8], seed: u64) -> u64 {
710    let mut hash = 0xcbf29ce484222325u64 ^ seed;
711    for &b in data {
712        hash ^= b as u64;
713        hash = hash.wrapping_mul(0x100000001b3);
714    }
715    hash
716}
717
718/// Compare with text filtering (-d, -i, -f flags in any combination).
719/// Allocation-free: uses iterator filtering.
720#[inline]
721fn is_dict_char(b: u8) -> bool {
722    b.is_ascii_alphanumeric() || b == b' ' || b == b'\t'
723}
724
725#[inline]
726fn is_printable(b: u8) -> bool {
727    b >= 0x20 && b < 0x7f
728}
729
730fn compare_text_filtered(
731    a: &[u8],
732    b: &[u8],
733    dict: bool,
734    no_print: bool,
735    fold_case: bool,
736) -> Ordering {
737    if !dict && !no_print && !fold_case {
738        return a.cmp(b);
739    }
740
741    let mut ai = a.iter().copied();
742    let mut bi = b.iter().copied();
743
744    loop {
745        let na = next_valid(&mut ai, dict, no_print);
746        let nb = next_valid(&mut bi, dict, no_print);
747        match (na, nb) {
748            (Some(ab), Some(bb)) => {
749                let ca = if fold_case {
750                    ab.to_ascii_uppercase()
751                } else {
752                    ab
753                };
754                let cb = if fold_case {
755                    bb.to_ascii_uppercase()
756                } else {
757                    bb
758                };
759                match ca.cmp(&cb) {
760                    Ordering::Equal => continue,
761                    other => return other,
762                }
763            }
764            (Some(_), None) => return Ordering::Greater,
765            (None, Some(_)) => return Ordering::Less,
766            (None, None) => return Ordering::Equal,
767        }
768    }
769}
770
771#[inline]
772fn next_valid(iter: &mut impl Iterator<Item = u8>, dict: bool, no_print: bool) -> Option<u8> {
773    loop {
774        match iter.next() {
775            None => return None,
776            Some(b) => {
777                if dict && !is_dict_char(b) {
778                    continue;
779                }
780                if no_print && !is_printable(b) {
781                    continue;
782                }
783                return Some(b);
784            }
785        }
786    }
787}
788
789/// Case-insensitive lexicographic comparison.
790/// Tight loop specialized for fold-case only (no dict/nonprinting filtering),
791/// avoiding the overhead of the general compare_text_filtered path.
792pub fn compare_ignore_case(a: &[u8], b: &[u8]) -> Ordering {
793    let alen = a.len();
794    let blen = b.len();
795    let min_len = alen.min(blen);
796    let ap = a.as_ptr();
797    let bp = b.as_ptr();
798    // Compare uppercase bytes directly with raw pointers for zero bounds-check overhead
799    let mut i = 0usize;
800    while i < min_len {
801        let ca = unsafe { (*ap.add(i)).to_ascii_uppercase() };
802        let cb = unsafe { (*bp.add(i)).to_ascii_uppercase() };
803        if ca != cb {
804            return ca.cmp(&cb);
805        }
806        i += 1;
807    }
808    alen.cmp(&blen)
809}
810
811pub fn compare_dictionary(a: &[u8], b: &[u8], ignore_case: bool) -> Ordering {
812    compare_text_filtered(a, b, true, false, ignore_case)
813}
814
815pub fn compare_ignore_nonprinting(a: &[u8], b: &[u8], ignore_case: bool) -> Ordering {
816    compare_text_filtered(a, b, false, true, ignore_case)
817}
818
819/// Master comparison function that dispatches based on KeyOpts.
820pub fn compare_with_opts(a: &[u8], b: &[u8], opts: &KeyOpts, random_seed: u64) -> Ordering {
821    let a = if opts.ignore_leading_blanks {
822        skip_leading_blanks(a)
823    } else {
824        a
825    };
826    let b = if opts.ignore_leading_blanks {
827        skip_leading_blanks(b)
828    } else {
829        b
830    };
831
832    let result = if opts.numeric {
833        compare_numeric(a, b)
834    } else if opts.general_numeric {
835        compare_general_numeric(a, b)
836    } else if opts.human_numeric {
837        compare_human_numeric(a, b)
838    } else if opts.month {
839        compare_month(a, b)
840    } else if opts.version {
841        compare_version(a, b)
842    } else if opts.random {
843        compare_random(a, b, random_seed)
844    } else if opts.dictionary_order || opts.ignore_nonprinting || opts.ignore_case {
845        compare_text_filtered(
846            a,
847            b,
848            opts.dictionary_order,
849            opts.ignore_nonprinting,
850            opts.ignore_case,
851        )
852    } else if super::core::is_c_locale() {
853        // C/POSIX locale: strcoll == byte comparison, skip CString overhead
854        a.cmp(b)
855    } else {
856        // Default: locale-aware comparison matching GNU sort's LC_COLLATE behavior
857        compare_locale(a, b)
858    };
859
860    if opts.reverse {
861        result.reverse()
862    } else {
863        result
864    }
865}
866
867/// Concrete comparison function type. Selected once at setup time to avoid
868/// per-comparison flag checking in hot sort loops.
869pub type CompareFn = fn(&[u8], &[u8]) -> Ordering;
870
871/// Select a concrete comparison function based on KeyOpts.
872/// Returns (compare_fn, needs_leading_blank_strip, needs_reverse).
873/// The caller applies blank-stripping and reversal outside the function pointer,
874/// eliminating all per-comparison branching.
875pub fn select_comparator(opts: &KeyOpts, random_seed: u64) -> (CompareFn, bool, bool) {
876    let needs_blank = opts.ignore_leading_blanks;
877    let needs_reverse = opts.reverse;
878
879    let cmp: CompareFn = if opts.numeric {
880        compare_numeric
881    } else if opts.general_numeric {
882        compare_general_numeric
883    } else if opts.human_numeric {
884        compare_human_numeric
885    } else if opts.month {
886        compare_month
887    } else if opts.version {
888        compare_version
889    } else if opts.random {
890        // Random needs seed — wrap in a closure-like pattern
891        // Since we need random_seed, we use a special case
892        return (
893            make_random_comparator(random_seed),
894            needs_blank,
895            needs_reverse,
896        );
897    } else if opts.dictionary_order || opts.ignore_nonprinting || opts.ignore_case {
898        // Text filtering: select specialized variant
899        match (
900            opts.dictionary_order,
901            opts.ignore_nonprinting,
902            opts.ignore_case,
903        ) {
904            (false, false, true) => compare_ignore_case,
905            (true, false, false) => |a: &[u8], b: &[u8]| compare_dictionary(a, b, false),
906            (true, false, true) => |a: &[u8], b: &[u8]| compare_dictionary(a, b, true),
907            (false, true, false) => |a: &[u8], b: &[u8]| compare_ignore_nonprinting(a, b, false),
908            (false, true, true) => |a: &[u8], b: &[u8]| compare_ignore_nonprinting(a, b, true),
909            (true, true, false) => {
910                |a: &[u8], b: &[u8]| compare_text_filtered(a, b, true, true, false)
911            }
912            (true, true, true) => {
913                |a: &[u8], b: &[u8]| compare_text_filtered(a, b, true, true, true)
914            }
915            _ => |a: &[u8], b: &[u8]| a.cmp(b),
916        }
917    } else if super::core::is_c_locale() {
918        // C/POSIX locale: strcoll == byte comparison, skip CString overhead
919        compare_lexical
920    } else {
921        // Default: locale-aware comparison matching GNU sort's LC_COLLATE behavior
922        compare_locale
923    };
924
925    (cmp, needs_blank, needs_reverse)
926}
927
928fn make_random_comparator(seed: u64) -> CompareFn {
929    // We can't capture the seed in a function pointer, so we use a static.
930    // This is safe because sort is single-process and seed doesn't change during a sort.
931    RANDOM_SEED.store(seed, std::sync::atomic::Ordering::Relaxed);
932    random_compare_with_static_seed
933}
934
935static RANDOM_SEED: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
936
937fn random_compare_with_static_seed(a: &[u8], b: &[u8]) -> Ordering {
938    let seed = RANDOM_SEED.load(std::sync::atomic::Ordering::Relaxed);
939    compare_random(a, b, seed)
940}
coreutils_rs/sort/compare.rs

coreutils_rs/sort/
compare.rs