gnu_sort/
zero_copy.rs

1use crate::locale;
2use crate::simd_compare::SIMDCompare;
3use memmap2::Mmap;
4use std::cmp::Ordering;
5use std::fs::File;
6use std::io::{self, BufRead, BufReader};
7use std::path::Path;
8
9/// Zero-copy line representation that points directly into memory-mapped data
10#[derive(Debug, Clone, Copy)]
11pub struct Line {
12    /// Pointer to the start of the line in the mapped memory
13    start: *const u8,
14    /// Length of the line (excluding newline)
15    len: u32,
16}
17
18// SAFETY: Line is safe to send between threads because:
19// 1. It only contains pointers to immutable memory-mapped data
20// 2. The memory-mapped files remain valid for the entire lifetime of the sort operation
21// 3. No thread can mutate the underlying memory during sorting
22unsafe impl Send for Line {}
23unsafe impl Sync for Line {}
24
25impl Line {
26    /// Create a new Line from a slice
27    pub fn new(data: &[u8]) -> Self {
28        Self {
29            start: data.as_ptr(),
30            len: data.len() as u32,
31        }
32    }
33
34    /// Get the line data as a byte slice
35    /// # Safety
36    /// The caller must ensure that:
37    /// 1. The memory this Line points to is still valid (not freed)
38    /// 2. The memory-mapped file has not been unmapped
39    /// 3. No other code is mutating this memory region
40    pub unsafe fn as_bytes(&self) -> &[u8] {
41        // SAFETY: We create a slice from the raw pointer with the stored length.
42        // The caller guarantees the memory is still valid and immutable.
43        unsafe { std::slice::from_raw_parts(self.start, self.len as usize) }
44    }
45
46    /// Extract a field from the line based on field separator
47    /// Fields are 1-indexed (field 1 is the first field)
48    pub fn extract_field(&self, field_num: usize, separator: Option<char>) -> Option<&[u8]> {
49        if field_num == 0 {
50            return None;
51        }
52
53        let bytes = unsafe { self.as_bytes() };
54
55        // If no separator specified, use whitespace
56        if separator.is_none() {
57            return self.extract_field_by_whitespace(bytes, field_num);
58        }
59
60        let sep_byte = separator.unwrap() as u8;
61        let mut field_count = 1;
62        let mut field_start = 0;
63
64        for (i, &byte) in bytes.iter().enumerate() {
65            if byte == sep_byte {
66                if field_count == field_num {
67                    return Some(&bytes[field_start..i]);
68                }
69                field_count += 1;
70                field_start = i + 1;
71            }
72        }
73
74        // Check if we're looking for the last field
75        if field_count == field_num && field_start < bytes.len() {
76            return Some(&bytes[field_start..]);
77        }
78
79        None
80    }
81
82    /// Extract field by whitespace (default behavior when no separator is specified)
83    /// Fields include leading whitespace from previous field separator (GNU sort behavior)
84    fn extract_field_by_whitespace<'a>(
85        &self,
86        bytes: &'a [u8],
87        field_num: usize,
88    ) -> Option<&'a [u8]> {
89        if field_num == 1 {
90            // Special case: field 1 starts at beginning of line
91            // Skip leading whitespace to find start of field 1
92            let mut field_start = 0;
93            for (i, &byte) in bytes.iter().enumerate() {
94                if byte != b' ' && byte != b'\t' {
95                    field_start = i;
96                    break;
97                }
98            }
99
100            // Find the end of field 1 (first whitespace or end of line)
101            for (i, &byte) in bytes[field_start..].iter().enumerate() {
102                if byte == b' ' || byte == b'\t' {
103                    return Some(&bytes[field_start..field_start + i]);
104                }
105            }
106            return Some(&bytes[field_start..]); // Entire remaining line is field 1
107        }
108
109        // For fields > 1, use a different approach
110        // First, skip initial whitespace and find all field boundaries
111        let mut field_boundaries = Vec::new();
112        let mut in_field = false;
113        let mut field_start = 0;
114
115        for (i, &byte) in bytes.iter().enumerate() {
116            let is_whitespace = byte == b' ' || byte == b'\t';
117
118            if !is_whitespace && !in_field {
119                // Starting a new field
120                field_start = i;
121                in_field = true;
122            } else if is_whitespace && in_field {
123                // Ending a field
124                field_boundaries.push(field_start..i);
125                in_field = false;
126            }
127        }
128
129        // Handle case where line ends with a field (no trailing whitespace)
130        if in_field {
131            field_boundaries.push(field_start..bytes.len());
132        }
133
134        if field_num > field_boundaries.len() {
135            return None;
136        }
137
138        let target_field = &field_boundaries[field_num - 1];
139
140        // For field 1, return just the field content
141        if field_num == 1 {
142            return Some(&bytes[target_field.clone()]);
143        }
144
145        // For fields > 1, include the whitespace before the field
146        // Find where the previous field ended
147        let prev_field_end = if field_num > 1 {
148            field_boundaries[field_num - 2].end
149        } else {
150            0
151        };
152
153        // The field includes whitespace from previous field end to current field end
154        Some(&bytes[prev_field_end..target_field.end])
155    }
156
157    /// Extract a key region from the line based on SortKey specification
158    pub fn extract_key(
159        &self,
160        key: &crate::config::SortKey,
161        separator: Option<char>,
162    ) -> Option<&[u8]> {
163        // Extract the starting field
164        let start_field_data = self.extract_field(key.start_field, separator)?;
165
166        // If no end field specified, use just the start field
167        if key.end_field.is_none() {
168            // Apply character positions if specified
169            if let Some(start_char) = key.start_char {
170                if start_char > 0 && start_char <= start_field_data.len() {
171                    return Some(&start_field_data[start_char - 1..]);
172                }
173            }
174            return Some(start_field_data);
175        }
176
177        // Complex case: range of fields
178        // For now, just extract from start field to end field
179        // This is a simplified implementation
180        let bytes = unsafe { self.as_bytes() };
181
182        // Find start position
183        let start_pos = if let Some(field_data) = self.extract_field(key.start_field, separator) {
184            let offset = field_data.as_ptr() as usize - bytes.as_ptr() as usize;
185            if let Some(start_char) = key.start_char {
186                if start_char > 0 && start_char <= field_data.len() {
187                    offset + start_char - 1
188                } else {
189                    offset
190                }
191            } else {
192                offset
193            }
194        } else {
195            return None;
196        };
197
198        // Find end position
199        let end_pos = if let Some(end_field) = key.end_field {
200            if let Some(field_data) = self.extract_field(end_field, separator) {
201                let offset = field_data.as_ptr() as usize - bytes.as_ptr() as usize;
202                let field_end = offset + field_data.len();
203                if let Some(end_char) = key.end_char {
204                    if end_char > 0 && end_char <= field_data.len() {
205                        offset + end_char
206                    } else {
207                        field_end
208                    }
209                } else {
210                    field_end
211                }
212            } else {
213                bytes.len()
214            }
215        } else {
216            bytes.len()
217        };
218
219        if start_pos < end_pos && start_pos < bytes.len() {
220            Some(&bytes[start_pos..end_pos.min(bytes.len())])
221        } else {
222            None
223        }
224    }
225
226    /// Fast numeric parsing for simple integers (optimized path)
227    pub fn parse_int(&self) -> Option<i64> {
228        // SAFETY: as_bytes() is safe here because Line was created from valid memory
229        // that remains valid throughout the sorting operation
230        let bytes = unsafe { self.as_bytes() };
231        if bytes.is_empty() {
232            return Some(0);
233        }
234
235        let mut start = 0;
236        let negative = if bytes[0] == b'-' {
237            start = 1;
238            true
239        } else {
240            false
241        };
242
243        if start >= bytes.len() {
244            return None;
245        }
246
247        let mut result: i64 = 0;
248        for &byte in &bytes[start..] {
249            if !byte.is_ascii_digit() {
250                return None;
251            }
252            result = result.checked_mul(10)?;
253            result = result.checked_add((byte - b'0') as i64)?;
254        }
255
256        Some(if negative { -result } else { result })
257    }
258
259    /// Parse as general numeric (supports scientific notation, inf, nan)
260    pub fn parse_general_numeric(&self) -> f64 {
261        let bytes = unsafe { self.as_bytes() };
262        if let Ok(s) = std::str::from_utf8(bytes) {
263            let trimmed = s.trim();
264
265            // Handle special cases
266            if trimmed.is_empty() {
267                return 0.0;
268            }
269
270            // Parse as float (handles scientific notation automatically)
271            match trimmed.parse::<f64>() {
272                Ok(val) => val,
273                Err(_) => {
274                    // Check for special strings
275                    let lower = trimmed.to_lowercase();
276                    if lower == "inf" || lower == "+inf" || lower == "infinity" {
277                        f64::INFINITY
278                    } else if lower == "-inf" || lower == "-infinity" {
279                        f64::NEG_INFINITY
280                    } else if lower == "nan" {
281                        f64::NAN
282                    } else {
283                        // Non-numeric strings sort to beginning (like GNU sort)
284                        f64::NEG_INFINITY
285                    }
286                }
287            }
288        } else {
289            f64::NEG_INFINITY
290        }
291    }
292
293    /// Compare as general numeric values (scientific notation support)
294    pub fn compare_general_numeric(&self, other: &Line) -> Ordering {
295        let a = self.parse_general_numeric();
296        let b = other.parse_general_numeric();
297
298        // Handle NaN specially (NaN sorts last in GNU sort)
299        match (a.is_nan(), b.is_nan()) {
300            (true, true) => unsafe { self.as_bytes().cmp(other.as_bytes()) }, // Lexicographic tie-breaker
301            (true, false) => Ordering::Greater,
302            (false, true) => Ordering::Less,
303            (false, false) => {
304                // Use total_cmp for consistent ordering including -0.0 vs 0.0
305                match a.total_cmp(&b) {
306                    Ordering::Equal => {
307                        // When numeric values are equal, use lexicographic comparison as tie-breaker
308                        // This matches GNU sort behavior
309                        unsafe { self.as_bytes().cmp(other.as_bytes()) }
310                    }
311                    other => other,
312                }
313            }
314        }
315    }
316
317    /// Compare lines using field-based sorting with multiple keys
318    pub fn compare_with_keys(
319        &self,
320        other: &Line,
321        keys: &[crate::config::SortKey],
322        separator: Option<char>,
323        config: &crate::config::SortConfig,
324    ) -> Ordering {
325        if keys.is_empty() {
326            // No keys specified, compare entire lines based on global options
327            return self.compare_with_config(other, config);
328        }
329
330        // Compare using each key in order
331        for key in keys {
332            let self_field = self.extract_key(key, separator);
333            let other_field = other.extract_key(key, separator);
334
335            let cmp = match (self_field, other_field) {
336                (Some(a), Some(b)) => {
337                    // Create temporary Line structs for the extracted fields
338                    let a_line = Line::new(a);
339                    let b_line = Line::new(b);
340
341                    // Compare based on key options
342                    let result = if key.options.general_numeric {
343                        a_line.compare_general_numeric(&b_line)
344                    } else if key.options.numeric {
345                        a_line.compare_numeric(&b_line)
346                    } else if key.options.month {
347                        a_line.compare_month(&b_line)
348                    } else if key.options.version {
349                        a_line.compare_version(&b_line)
350                    } else if key.options.human_numeric {
351                        a_line.compare_human_numeric(&b_line)
352                    } else if key.options.dictionary_order && key.options.ignore_case {
353                        a_line.compare_dictionary_order_ignore_case(&b_line)
354                    } else if key.options.dictionary_order {
355                        a_line.compare_dictionary_order(&b_line)
356                    } else if key.options.ignore_case {
357                        a_line.compare_ignore_case(&b_line)
358                    } else if key.options.ignore_leading_blanks {
359                        a_line.compare_lexicographic_with_blanks(&b_line, true)
360                    } else {
361                        a_line.compare_lexicographic(&b_line)
362                    };
363
364                    // Apply reverse if specified for this key
365                    let final_result = if key.options.reverse {
366                        result.reverse()
367                    } else {
368                        result
369                    };
370
371                    // Debug output if enabled (GNU sort compatible)
372                    if config.debug {
373                        let self_bytes = unsafe { self.as_bytes() };
374                        let other_bytes = unsafe { other.as_bytes() };
375                        let self_str = String::from_utf8_lossy(self_bytes);
376                        let other_str = String::from_utf8_lossy(other_bytes);
377                        let a_str = String::from_utf8_lossy(a);
378                        let b_str = String::from_utf8_lossy(b);
379
380                        // Convert Ordering to GNU sort style number
381                        let cmp_val = match final_result {
382                            Ordering::Greater => 1,
383                            Ordering::Less => -1,
384                            Ordering::Equal => 0,
385                        };
386
387                        eprintln!("; k1=<{a_str}>; k2=<{b_str}>; s1=<{self_str}>, s2=<{other_str}>; cmp1={cmp_val}");
388                    }
389
390                    final_result
391                }
392                (None, Some(_)) => Ordering::Less,
393                (Some(_), None) => Ordering::Greater,
394                (None, None) => Ordering::Equal,
395            };
396
397            if cmp != Ordering::Equal {
398                return cmp;
399            }
400        }
401
402        // All keys compared equal, use stable sort order (original line order)
403        if config.stable {
404            Ordering::Equal
405        } else {
406            // Use entire line as tie-breaker
407            self.compare_lexicographic(other)
408        }
409    }
410
411    /// Compare lines based on global configuration (when no keys are specified)
412    pub fn compare_with_config(
413        &self,
414        other: &Line,
415        config: &crate::config::SortConfig,
416    ) -> Ordering {
417        let cmp = match config.mode {
418            crate::config::SortMode::GeneralNumeric => self.compare_general_numeric(other),
419            crate::config::SortMode::Numeric => self.compare_numeric(other),
420            crate::config::SortMode::Month => self.compare_month(other),
421            crate::config::SortMode::Version => self.compare_version(other),
422            crate::config::SortMode::HumanNumeric => self.compare_human_numeric(other),
423            crate::config::SortMode::Lexicographic => {
424                if config.dictionary_order && config.ignore_case {
425                    self.compare_dictionary_order_ignore_case(other)
426                } else if config.dictionary_order {
427                    self.compare_dictionary_order(other)
428                } else if config.ignore_case {
429                    self.compare_ignore_case(other)
430                } else if config.ignore_leading_blanks {
431                    self.compare_lexicographic_with_blanks(other, true)
432                } else {
433                    self.compare_lexicographic(other)
434                }
435            }
436            _ => {
437                // For other modes, also check dictionary_order flag
438                if config.dictionary_order {
439                    self.compare_dictionary_order(other)
440                } else if config.ignore_leading_blanks {
441                    self.compare_lexicographic_with_blanks(other, true)
442                } else {
443                    self.compare_lexicographic(other)
444                }
445            }
446        };
447
448        if config.reverse {
449            cmp.reverse()
450        } else {
451            cmp
452        }
453    }
454
455    /// Fast comparison for numeric values (GNU sort style - no string conversion)
456    pub fn compare_numeric(&self, other: &Line) -> Ordering {
457        // Try fast path for simple integers
458        if let (Some(a), Some(b)) = (self.parse_int(), other.parse_int()) {
459            return a.cmp(&b);
460        }
461
462        // GNU sort style: compare as strings with numeric logic
463        self.compare_numeric_string_style(other)
464    }
465
466    /// GNU sort-style numeric string comparison (key optimization!)
467    fn compare_numeric_string_style(&self, other: &Line) -> Ordering {
468        let a_bytes = unsafe { self.as_bytes() };
469        let b_bytes = unsafe { other.as_bytes() };
470
471        // Skip leading whitespace
472        let a_start = self.skip_leading_space(a_bytes);
473        let b_start = self.skip_leading_space(b_bytes);
474
475        if a_start >= a_bytes.len() && b_start >= b_bytes.len() {
476            return Ordering::Equal;
477        }
478        if a_start >= a_bytes.len() {
479            return Ordering::Less;
480        }
481        if b_start >= b_bytes.len() {
482            return Ordering::Greater;
483        }
484
485        let a_rest = &a_bytes[a_start..];
486        let b_rest = &b_bytes[b_start..];
487
488        // Check signs
489        let (a_negative, a_num_start) = self.parse_sign(a_rest);
490        let (b_negative, b_num_start) = self.parse_sign(b_rest);
491
492        match (a_negative, b_negative) {
493            (true, false) => Ordering::Less,
494            (false, true) => Ordering::Greater,
495            _ => {
496                // Same sign - compare magnitudes
497                let a_digits = &a_rest[a_num_start..];
498                let b_digits = &b_rest[b_num_start..];
499
500                // Skip leading zeros (GNU sort behavior)
501                let a_no_zeros = self.skip_leading_zeros(a_digits);
502                let b_no_zeros = self.skip_leading_zeros(b_digits);
503
504                // Compare by digit count first (major optimization!)
505                let a_digit_count = self.count_leading_digits(&a_digits[a_no_zeros..]);
506                let b_digit_count = self.count_leading_digits(&b_digits[b_no_zeros..]);
507
508                let magnitude_cmp = match a_digit_count.cmp(&b_digit_count) {
509                    Ordering::Equal => {
510                        // Same digit count - lexicographic comparison
511                        a_digits[a_no_zeros..a_no_zeros + a_digit_count]
512                            .cmp(&b_digits[b_no_zeros..b_no_zeros + b_digit_count])
513                    }
514                    other => other,
515                };
516
517                if a_negative {
518                    magnitude_cmp.reverse()
519                } else {
520                    magnitude_cmp
521                }
522            }
523        }
524    }
525
526    fn skip_leading_space(&self, bytes: &[u8]) -> usize {
527        bytes
528            .iter()
529            .position(|&b| b != b' ' && b != b'\t')
530            .unwrap_or(bytes.len())
531    }
532
533    fn parse_sign(&self, bytes: &[u8]) -> (bool, usize) {
534        if bytes.is_empty() {
535            return (false, 0);
536        }
537        match bytes[0] {
538            b'-' => (true, 1),
539            b'+' => (false, 1),
540            _ => (false, 0),
541        }
542    }
543
544    fn skip_leading_zeros(&self, bytes: &[u8]) -> usize {
545        bytes.iter().position(|&b| b != b'0').unwrap_or(bytes.len())
546    }
547
548    fn count_leading_digits(&self, bytes: &[u8]) -> usize {
549        bytes.iter().take_while(|&&b| b.is_ascii_digit()).count()
550    }
551
552    /// Byte-level numeric comparison for complex numbers
553    #[allow(dead_code)]
554    fn compare_numeric_bytes(&self, other: &Line) -> Ordering {
555        let a_bytes = unsafe { self.as_bytes() };
556        let b_bytes = unsafe { other.as_bytes() };
557
558        // Skip leading whitespace
559        let a_trimmed = a_bytes
560            .iter()
561            .skip_while(|&&b| b == b' ' || b == b'\t')
562            .collect::<Vec<_>>();
563        let b_trimmed = b_bytes
564            .iter()
565            .skip_while(|&&b| b == b' ' || b == b'\t')
566            .collect::<Vec<_>>();
567
568        if a_trimmed.is_empty() && b_trimmed.is_empty() {
569            return Ordering::Equal;
570        }
571        if a_trimmed.is_empty() {
572            return Ordering::Less;
573        }
574        if b_trimmed.is_empty() {
575            return Ordering::Greater;
576        }
577
578        // Compare signs
579        let a_negative = *a_trimmed[0] == b'-';
580        let b_negative = *b_trimmed[0] == b'-';
581
582        match (a_negative, b_negative) {
583            (true, false) => Ordering::Less,
584            (false, true) => Ordering::Greater,
585            _ => {
586                // Same sign, compare magnitudes
587                let a_digits: Vec<u8> = a_trimmed
588                    .iter()
589                    .skip_while(|&&&b| b == b'-' || b == b'+')
590                    .filter(|&&&b| b.is_ascii_digit())
591                    .map(|&&b| b)
592                    .collect();
593                let b_digits: Vec<u8> = b_trimmed
594                    .iter()
595                    .skip_while(|&&&b| b == b'-' || b == b'+')
596                    .filter(|&&&b| b.is_ascii_digit())
597                    .map(|&&b| b)
598                    .collect();
599
600                let magnitude_cmp = match a_digits.len().cmp(&b_digits.len()) {
601                    Ordering::Equal => a_digits.cmp(&b_digits),
602                    other => other,
603                };
604
605                if a_negative {
606                    magnitude_cmp.reverse()
607                } else {
608                    magnitude_cmp
609                }
610            }
611        }
612    }
613
614    /// Get the length of the line
615    pub fn len(&self) -> usize {
616        self.len as usize
617    }
618
619    /// Check if the line is empty
620    pub fn is_empty(&self) -> bool {
621        self.len == 0
622    }
623
624    /// Locale-aware case-insensitive comparison
625    pub fn compare_ignore_case(&self, other: &Line) -> Ordering {
626        let a_bytes = unsafe { self.as_bytes() };
627        let b_bytes = unsafe { other.as_bytes() };
628
629        // Use locale-aware comparison if enabled
630        if locale::LocaleConfig::is_enabled() {
631            locale::smart_compare(a_bytes, b_bytes, true)
632        } else {
633            // Use SIMD for performance boost when locale is not enabled
634            SIMDCompare::compare_case_insensitive_simd(a_bytes, b_bytes)
635        }
636    }
637
638    /// Locale-aware lexicographic comparison
639    pub fn compare_lexicographic(&self, other: &Line) -> Ordering {
640        let a_bytes = unsafe { self.as_bytes() };
641        let b_bytes = unsafe { other.as_bytes() };
642
643        // Use locale-aware comparison if enabled
644        if locale::LocaleConfig::is_enabled() {
645            locale::smart_compare(a_bytes, b_bytes, false)
646        } else {
647            // Use SIMD for maximum performance when locale is not enabled
648            SIMDCompare::compare_bytes_simd(a_bytes, b_bytes)
649        }
650    }
651
652    /// Lexicographic comparison with option to ignore leading blanks
653    pub fn compare_lexicographic_with_blanks(
654        &self,
655        other: &Line,
656        ignore_leading_blanks: bool,
657    ) -> Ordering {
658        let mut a_bytes = unsafe { self.as_bytes() };
659        let mut b_bytes = unsafe { other.as_bytes() };
660
661        if ignore_leading_blanks {
662            // Skip leading blanks (spaces and tabs)
663            let a_start = a_bytes
664                .iter()
665                .position(|&b| b != b' ' && b != b'\t')
666                .unwrap_or(a_bytes.len());
667            let b_start = b_bytes
668                .iter()
669                .position(|&b| b != b' ' && b != b'\t')
670                .unwrap_or(b_bytes.len());
671            a_bytes = &a_bytes[a_start..];
672            b_bytes = &b_bytes[b_start..];
673        }
674
675        // Use locale-aware comparison if enabled
676        if locale::LocaleConfig::is_enabled() {
677            locale::smart_compare(a_bytes, b_bytes, false)
678        } else {
679            // Use SIMD for maximum performance when locale is not enabled
680            SIMDCompare::compare_bytes_simd(a_bytes, b_bytes)
681        }
682    }
683
684    /// Dictionary order comparison (only alphanumeric characters and blanks)
685    pub fn compare_dictionary_order(&self, other: &Line) -> Ordering {
686        let a_bytes = unsafe { self.as_bytes() };
687        let b_bytes = unsafe { other.as_bytes() };
688
689        let a_filtered = self.filter_dictionary_order(a_bytes);
690        let b_filtered = self.filter_dictionary_order(b_bytes);
691
692        // Use locale-aware comparison if enabled
693        if locale::LocaleConfig::is_enabled() {
694            locale::smart_compare(&a_filtered, &b_filtered, false)
695        } else {
696            // Use SIMD for maximum performance when locale is not enabled
697            SIMDCompare::compare_bytes_simd(&a_filtered, &b_filtered)
698        }
699    }
700
701    /// Dictionary order with case-insensitive comparison
702    pub fn compare_dictionary_order_ignore_case(&self, other: &Line) -> Ordering {
703        let a_bytes = unsafe { self.as_bytes() };
704        let b_bytes = unsafe { other.as_bytes() };
705
706        let a_filtered = self.filter_dictionary_order(a_bytes);
707        let b_filtered = self.filter_dictionary_order(b_bytes);
708
709        // Use locale-aware comparison if enabled
710        if locale::LocaleConfig::is_enabled() {
711            locale::smart_compare(&a_filtered, &b_filtered, true)
712        } else {
713            // Use SIMD for performance boost when locale is not enabled
714            SIMDCompare::compare_case_insensitive_simd(&a_filtered, &b_filtered)
715        }
716    }
717
718    /// Filter bytes to keep only alphanumeric characters and blanks (spaces/tabs)
719    /// This implements GNU sort's dictionary order (-d flag)
720    fn filter_dictionary_order(&self, bytes: &[u8]) -> Vec<u8> {
721        // Convert to string to properly handle Unicode
722        if let Ok(s) = std::str::from_utf8(bytes) {
723            s.chars()
724                .filter(|c| c.is_alphanumeric() || *c == ' ' || *c == '\t')
725                .collect::<String>()
726                .into_bytes()
727        } else {
728            // Fallback for non-UTF8 - filter ASCII only
729            bytes
730                .iter()
731                .filter(|&&b| b.is_ascii_alphanumeric() || b == b' ' || b == b'\t')
732                .copied()
733                .collect()
734        }
735    }
736
737    /// Month-aware comparison (GNU sort compatible)
738    pub fn compare_month(&self, other: &Line) -> Ordering {
739        let a_bytes = unsafe { self.as_bytes() };
740        let b_bytes = unsafe { other.as_bytes() };
741
742        fn month_value(bytes: &[u8]) -> u8 {
743            // Convert to uppercase for case-insensitive comparison
744            let upper_bytes: Vec<u8> = bytes.iter().map(|b| b.to_ascii_uppercase()).collect();
745
746            // Try to match month abbreviations (GNU sort standard)
747            match upper_bytes.as_slice() {
748                b"JAN" | b"JANUARY" => 1,
749                b"FEB" | b"FEBRUARY" => 2,
750                b"MAR" | b"MARCH" => 3,
751                b"APR" | b"APRIL" => 4,
752                b"MAY" => 5,
753                b"JUN" | b"JUNE" => 6,
754                b"JUL" | b"JULY" => 7,
755                b"AUG" | b"AUGUST" => 8,
756                b"SEP" | b"SEPTEMBER" => 9,
757                b"OCT" | b"OCTOBER" => 10,
758                b"NOV" | b"NOVEMBER" => 11,
759                b"DEC" | b"DECEMBER" => 12,
760                _ => 0, // Unknown month, will be compared lexicographically
761            }
762        }
763
764        let a_month = month_value(a_bytes);
765        let b_month = month_value(b_bytes);
766
767        match (a_month, b_month) {
768            // Both are recognized months - compare by month order
769            (a, b) if a > 0 && b > 0 => a.cmp(&b),
770            // Only a is a month - non-months come before months (GNU sort behavior)
771            (a, 0) if a > 0 => Ordering::Greater,
772            // Only b is a month - non-months come before months (GNU sort behavior)
773            (0, b) if b > 0 => Ordering::Less,
774            // Neither is a month - fall back to lexicographic comparison
775            (0, 0) => self.compare_lexicographic(other),
776            // Catch-all for any other cases (should not occur, but satisfies compiler)
777            _ => self.compare_lexicographic(other),
778        }
779    }
780
781    /// Version-aware comparison (GNU sort -V compatible)
782    pub fn compare_version(&self, other: &Line) -> Ordering {
783        let a_bytes = unsafe { self.as_bytes() };
784        let b_bytes = unsafe { other.as_bytes() };
785
786        // Convert to strings for version parsing
787        let a_str = String::from_utf8_lossy(a_bytes);
788        let b_str = String::from_utf8_lossy(b_bytes);
789
790        Self::compare_version_strings(&a_str, &b_str)
791    }
792
793    /// Compare two version strings (like "1.2.3" vs "1.10.1")
794    fn compare_version_strings(a: &str, b: &str) -> Ordering {
795        // Split by non-alphanumeric characters and compare each component
796        let a_parts = Self::version_tokenize(a);
797        let b_parts = Self::version_tokenize(b);
798
799        for (a_part, b_part) in a_parts.iter().zip(b_parts.iter()) {
800            match Self::compare_version_component(a_part, b_part) {
801                Ordering::Equal => continue,
802                other => return other,
803            }
804        }
805
806        // If all compared parts are equal, longer version wins
807        a_parts.len().cmp(&b_parts.len())
808    }
809
810    /// Tokenize version string into alphanumeric components
811    fn version_tokenize(s: &str) -> Vec<String> {
812        let mut tokens = Vec::new();
813        let mut current = String::new();
814        let mut in_alpha = false;
815
816        for ch in s.chars() {
817            let is_alpha = ch.is_alphabetic();
818            let is_digit = ch.is_ascii_digit();
819
820            if is_alpha || is_digit {
821                if in_alpha != is_alpha && !current.is_empty() {
822                    tokens.push(current);
823                    current = String::new();
824                }
825                current.push(ch);
826                in_alpha = is_alpha;
827            } else if !current.is_empty() {
828                tokens.push(current);
829                current = String::new();
830            }
831        }
832
833        if !current.is_empty() {
834            tokens.push(current);
835        }
836
837        tokens
838    }
839
840    /// Compare individual version components (numeric or alphabetic)
841    fn compare_version_component(a: &str, b: &str) -> Ordering {
842        // Check if both are numeric
843        if let (Ok(a_num), Ok(b_num)) = (a.parse::<u64>(), b.parse::<u64>()) {
844            return a_num.cmp(&b_num);
845        }
846
847        // Check if one is numeric and other is not (numeric comes first)
848        match (
849            a.chars().all(|c| c.is_ascii_digit()),
850            b.chars().all(|c| c.is_ascii_digit()),
851        ) {
852            (true, false) => Ordering::Less,
853            (false, true) => Ordering::Greater,
854            _ => a.cmp(b), // Both non-numeric, lexicographic comparison
855        }
856    }
857
858    /// Human numeric comparison (GNU sort -h compatible)
859    pub fn compare_human_numeric(&self, other: &Line) -> Ordering {
860        let a_bytes = unsafe { self.as_bytes() };
861        let b_bytes = unsafe { other.as_bytes() };
862
863        let a_string = String::from_utf8_lossy(a_bytes);
864        let b_string = String::from_utf8_lossy(b_bytes);
865        let a_str = a_string.trim();
866        let b_str = b_string.trim();
867
868        let a_val = Self::parse_human_numeric(a_str);
869        let b_val = Self::parse_human_numeric(b_str);
870
871        match (a_val, b_val) {
872            (Some(a), Some(b)) => {
873                match a.partial_cmp(&b) {
874                    Some(ord) => ord,
875                    None => a_str.cmp(b_str), // Handle NaN case
876                }
877            }
878            (Some(_), None) => Ordering::Less, // Numbers before non-numbers
879            (None, Some(_)) => Ordering::Greater, // Numbers before non-numbers
880            (None, None) => a_str.cmp(b_str),  // Both non-numeric
881        }
882    }
883
884    /// Parse human-readable numeric value (like "1K", "2.5M", "1G")
885    fn parse_human_numeric(s: &str) -> Option<f64> {
886        if s.is_empty() {
887            return None;
888        }
889
890        let s = s.trim();
891        let last_char = s.chars().last()?;
892
893        let multiplier = match last_char.to_ascii_uppercase() {
894            'K' => 1024.0,
895            'M' => 1024.0 * 1024.0,
896            'G' => 1024.0 * 1024.0 * 1024.0,
897            'T' => 1024.0 * 1024.0 * 1024.0 * 1024.0,
898            'P' => 1024.0 * 1024.0 * 1024.0 * 1024.0 * 1024.0,
899            _ => {
900                // No suffix, parse as regular number
901                return s.parse::<f64>().ok();
902            }
903        };
904
905        // Parse the numeric part (without the suffix)
906        let numeric_part = s[..s.len() - 1].trim();
907        let value = numeric_part.parse::<f64>().ok()?;
908
909        Some(value * multiplier)
910    }
911}
912
913/// Memory-mapped file with parsed lines
914pub struct MappedFile {
915    _mmap: Mmap, // Keep mmap alive
916    lines: Vec<Line>,
917}
918
919impl MappedFile {
920    /// Create a new SimpleMappedFile from a file path
921    pub fn new(path: &Path) -> io::Result<Self> {
922        let file = File::open(path)?;
923        let mmap = unsafe { Mmap::map(&file)? };
924
925        // Parse lines while keeping references to the mmap
926        let lines = parse_lines(&mmap);
927
928        Ok(Self { _mmap: mmap, lines })
929    }
930
931    /// Get the lines in this file
932    pub fn lines(&self) -> &[Line] {
933        &self.lines
934    }
935}
936
937/// Fast line parsing that creates Line structs pointing into the mmap'd data
938fn parse_lines(data: &[u8]) -> Vec<Line> {
939    let mut lines = Vec::new();
940    let mut start = 0;
941
942    for (i, &byte) in data.iter().enumerate() {
943        if byte == b'\n' {
944            // Handle both Unix (\n) and Windows (\r\n) line endings
945            let end = if i > 0 && data[i - 1] == b'\r' {
946                i - 1
947            } else {
948                i
949            };
950            let line_data = &data[start..end];
951            lines.push(Line::new(line_data));
952            start = i + 1;
953        }
954    }
955
956    // Handle last line if it doesn't end with newline
957    if start < data.len() {
958        let mut end = data.len();
959        // Strip trailing \r if present
960        if end > start && data[end - 1] == b'\r' {
961            end -= 1;
962        }
963        let line_data = &data[start..end];
964        lines.push(Line::new(line_data));
965    }
966
967    lines
968}
969
970/// Zero-copy line reader for streaming large files
971pub struct ZeroCopyReader {
972    reader: BufReader<File>,
973    buffer: Vec<u8>,
974    lines: Vec<Line>,
975}
976
977impl ZeroCopyReader {
978    pub fn new(file: File) -> Self {
979        Self {
980            reader: BufReader::new(file),
981            buffer: Vec::with_capacity(64 * 1024), // 64KB buffer
982            lines: Vec::new(),
983        }
984    }
985
986    /// Read next chunk of lines, reusing the internal buffer
987    pub fn read_chunk(&mut self) -> io::Result<&[Line]> {
988        self.buffer.clear();
989        self.lines.clear();
990
991        let mut total_read = 0;
992        const CHUNK_SIZE: usize = 64 * 1024;
993
994        // Read up to CHUNK_SIZE bytes
995        while total_read < CHUNK_SIZE {
996            let mut line_buf = Vec::new();
997            let bytes_read = self.reader.read_until(b'\n', &mut line_buf)?;
998
999            if bytes_read == 0 {
1000                break; // EOF
1001            }
1002
1003            let start_idx = self.buffer.len();
1004            self.buffer.extend_from_slice(&line_buf);
1005
1006            // Remove trailing newline if present
1007            let end_idx = if line_buf.ends_with(b"\n") {
1008                self.buffer.len() - 1
1009            } else {
1010                self.buffer.len()
1011            };
1012
1013            let line_data = &self.buffer[start_idx..end_idx];
1014            self.lines.push(Line::new(line_data));
1015
1016            total_read += bytes_read;
1017        }
1018
1019        Ok(&self.lines)
1020    }
1021}
1022
1023/// Optimized numeric comparison for Line structs
1024pub fn compare_numeric_lines(a: &Line, b: &Line) -> Ordering {
1025    unsafe {
1026        let a_bytes = a.as_bytes();
1027        let b_bytes = b.as_bytes();
1028
1029        // Fast path for simple integer comparison
1030        if let (Some(a_num), Some(b_num)) = (parse_int(a_bytes), parse_int(b_bytes)) {
1031            return a_num.cmp(&b_num);
1032        }
1033
1034        // Fall back to lexicographic comparison for complex numbers
1035        compare_numeric_bytes(a_bytes, b_bytes)
1036    }
1037}
1038
1039/// Fast integer parsing for simple cases (digits only, no signs/decimals)
1040fn parse_int(bytes: &[u8]) -> Option<i64> {
1041    if bytes.is_empty() {
1042        return Some(0);
1043    }
1044
1045    let mut result: i64 = 0;
1046    let mut negative = false;
1047    let mut start = 0;
1048
1049    // Handle leading sign
1050    if bytes[0] == b'-' {
1051        negative = true;
1052        start = 1;
1053    } else if bytes[0] == b'+' {
1054        start = 1;
1055    }
1056
1057    // Parse digits
1058    for &byte in &bytes[start..] {
1059        if !byte.is_ascii_digit() {
1060            return None; // Not a simple integer
1061        }
1062
1063        result = result.checked_mul(10)?;
1064        result = result.checked_add((byte - b'0') as i64)?;
1065    }
1066
1067    if negative {
1068        result = -result;
1069    }
1070
1071    Some(result)
1072}
1073
1074/// Numeric comparison for complex numbers (with decimals, scientific notation, etc.)
1075fn compare_numeric_bytes(a: &[u8], b: &[u8]) -> Ordering {
1076    // Skip leading whitespace
1077    let a = skip_whitespace(a);
1078    let b = skip_whitespace(b);
1079
1080    // Handle empty strings
1081    match (a.is_empty(), b.is_empty()) {
1082        (true, true) => return Ordering::Equal,
1083        (true, false) => return Ordering::Less,
1084        (false, true) => return Ordering::Greater,
1085        (false, false) => {
1086            // Continue with comparison
1087        }
1088    }
1089
1090    // Extract signs
1091    let (a_negative, a_digits) = extract_sign(a);
1092    let (b_negative, b_digits) = extract_sign(b);
1093
1094    // Compare signs
1095    match (a_negative, b_negative) {
1096        (false, true) => return Ordering::Greater,
1097        (true, false) => return Ordering::Less,
1098        _ => {}
1099    }
1100
1101    // Both have same sign, compare magnitudes
1102    let magnitude_cmp = compare_magnitude(a_digits, b_digits);
1103
1104    if a_negative {
1105        magnitude_cmp.reverse()
1106    } else {
1107        magnitude_cmp
1108    }
1109}
1110
1111fn skip_whitespace(bytes: &[u8]) -> &[u8] {
1112    let start = bytes
1113        .iter()
1114        .position(|&b| !b.is_ascii_whitespace())
1115        .unwrap_or(bytes.len());
1116    &bytes[start..]
1117}
1118
1119fn extract_sign(bytes: &[u8]) -> (bool, &[u8]) {
1120    if bytes.starts_with(b"-") {
1121        (true, &bytes[1..])
1122    } else if bytes.starts_with(b"+") {
1123        (false, &bytes[1..])
1124    } else {
1125        (false, bytes)
1126    }
1127}
1128
1129fn compare_magnitude(a: &[u8], b: &[u8]) -> Ordering {
1130    // Find decimal points
1131    let a_dot = a.iter().position(|&b| b == b'.');
1132    let b_dot = b.iter().position(|&b| b == b'.');
1133
1134    let (a_int, a_frac) = match a_dot {
1135        Some(pos) => (&a[..pos], &a[pos + 1..]),
1136        None => (a, &[][..]),
1137    };
1138
1139    let (b_int, b_frac) = match b_dot {
1140        Some(pos) => (&b[..pos], &b[pos + 1..]),
1141        None => (b, &[][..]),
1142    };
1143
1144    // Compare integer parts
1145    let int_cmp = compare_integer_parts(a_int, b_int);
1146    if int_cmp != Ordering::Equal {
1147        return int_cmp;
1148    }
1149
1150    // Compare fractional parts
1151    compare_fractional_parts(a_frac, b_frac)
1152}
1153
1154fn compare_integer_parts(a: &[u8], b: &[u8]) -> Ordering {
1155    // Remove leading zeros
1156    let a = skip_leading_zeros(a);
1157    let b = skip_leading_zeros(b);
1158
1159    // Compare lengths first
1160    let len_cmp = a.len().cmp(&b.len());
1161    if len_cmp != Ordering::Equal {
1162        return len_cmp;
1163    }
1164
1165    // Same length, compare digit by digit
1166    a.cmp(b)
1167}
1168
1169fn compare_fractional_parts(a: &[u8], b: &[u8]) -> Ordering {
1170    let max_len = a.len().max(b.len());
1171
1172    for i in 0..max_len {
1173        let a_digit = a.get(i).copied().unwrap_or(b'0');
1174        let b_digit = b.get(i).copied().unwrap_or(b'0');
1175
1176        let cmp = a_digit.cmp(&b_digit);
1177        if cmp != Ordering::Equal {
1178            return cmp;
1179        }
1180    }
1181
1182    Ordering::Equal
1183}
1184
1185fn skip_leading_zeros(bytes: &[u8]) -> &[u8] {
1186    let start = bytes.iter().position(|&b| b != b'0').unwrap_or(bytes.len());
1187    if start == bytes.len() {
1188        b"0" // All zeros, return single zero
1189    } else {
1190        &bytes[start..]
1191    }
1192}
1193
1194/// Fast case-insensitive comparison with locale support
1195pub fn compare_case_insensitive(a: &[u8], b: &[u8]) -> Ordering {
1196    // Use locale-aware comparison if enabled
1197    if locale::LocaleConfig::is_enabled() {
1198        locale::smart_compare(a, b, true)
1199    } else {
1200        // Fast path for C/POSIX locale
1201        let min_len = a.len().min(b.len());
1202
1203        for i in 0..min_len {
1204            let a_char = a[i].to_ascii_lowercase();
1205            let b_char = b[i].to_ascii_lowercase();
1206
1207            match a_char.cmp(&b_char) {
1208                Ordering::Equal => continue,
1209                other => return other,
1210            }
1211        }
1212
1213        a.len().cmp(&b.len())
1214    }
1215}
1216
1217#[cfg(test)]
1218mod tests {
1219    use super::*;
1220
1221    #[test]
1222    fn test_simple_line_creation() {
1223        let data = b"hello world";
1224        let line = Line::new(data);
1225
1226        unsafe {
1227            assert_eq!(line.as_bytes(), b"hello world");
1228        }
1229        assert_eq!(line.len(), 11);
1230    }
1231
1232    #[test]
1233    fn test_numeric_comparison() {
1234        let a = Line::new(b"123");
1235        let b = Line::new(b"456");
1236        let c = Line::new(b"123");
1237
1238        assert_eq!(compare_numeric_lines(&a, &b), Ordering::Less);
1239        assert_eq!(compare_numeric_lines(&b, &a), Ordering::Greater);
1240        assert_eq!(compare_numeric_lines(&a, &c), Ordering::Equal);
1241    }
1242
1243    #[test]
1244    fn test_simple_int_parsing() {
1245        assert_eq!(parse_int(b"123"), Some(123));
1246        assert_eq!(parse_int(b"-456"), Some(-456));
1247        assert_eq!(parse_int(b"+789"), Some(789));
1248        assert_eq!(parse_int(b"0"), Some(0));
1249        assert_eq!(parse_int(b""), Some(0));
1250        assert_eq!(parse_int(b"12.34"), None); // Not simple
1251        assert_eq!(parse_int(b"abc"), None); // Not numeric
1252    }
1253
1254    #[test]
1255    fn test_parse_lines_with_different_endings() {
1256        // Test Unix line endings
1257        let unix_data = b"line1\nline2\nline3";
1258        let unix_lines = parse_lines(unix_data);
1259        assert_eq!(unix_lines.len(), 3);
1260        unsafe {
1261            assert_eq!(unix_lines[0].as_bytes(), b"line1");
1262            assert_eq!(unix_lines[1].as_bytes(), b"line2");
1263            assert_eq!(unix_lines[2].as_bytes(), b"line3");
1264        }
1265
1266        // Test Windows line endings
1267        let windows_data = b"line1\r\nline2\r\nline3\r\n";
1268        let windows_lines = parse_lines(windows_data);
1269        assert_eq!(windows_lines.len(), 3);
1270        unsafe {
1271            assert_eq!(windows_lines[0].as_bytes(), b"line1");
1272            assert_eq!(windows_lines[1].as_bytes(), b"line2");
1273            assert_eq!(windows_lines[2].as_bytes(), b"line3");
1274        }
1275
1276        // Test mixed line endings
1277        let mixed_data = b"line1\r\nline2\nline3\r";
1278        let mixed_lines = parse_lines(mixed_data);
1279        assert_eq!(mixed_lines.len(), 3);
1280        unsafe {
1281            assert_eq!(mixed_lines[0].as_bytes(), b"line1");
1282            assert_eq!(mixed_lines[1].as_bytes(), b"line2");
1283            assert_eq!(mixed_lines[2].as_bytes(), b"line3");
1284        }
1285
1286        // Test single line without ending
1287        let single_data = b"single_line";
1288        let single_lines = parse_lines(single_data);
1289        assert_eq!(single_lines.len(), 1);
1290        unsafe {
1291            assert_eq!(single_lines[0].as_bytes(), b"single_line");
1292        }
1293    }
1294}
gnu_sort/zero_copy.rs

gnu_sort/
zero_copy.rs